def test_batch_spectrogram(self):
    waveform, sample_rate = torchaudio.load(self.test_filepath)

    # Single then transform then batch
    expected = transforms.Spectrogram()(waveform).repeat(3, 1, 1, 1)

    # Batch then transform
    computed = transforms.Spectrogram()(waveform.repeat(3, 1, 1))

    self.assertTrue(computed.shape == expected.shape, (computed.shape, expected.shape))
    self.assertTrue(torch.allclose(computed, expected))
def __init__(self, d, src_path, batch_size, device):
    super(VAE, self).__init__()
    '''
    =========== ARGUMENTS ===========
    > d - dimensionality of latent space
    > src_path - path to source samples
    > batch_size - number of training examples in single batch
    > device - CPU or GPU in use
    =================================
    '''
    self.enc = ResNetBigger(d=d)
    self.dec = nn.Sequential(nn.Linear(d, 40), nn.ReLU(), nn.Linear(40, 50), nn.Sigmoid())

    # For computing spectrogram then normalizing
    self.spectrogram = T.Spectrogram(
        n_fft=2048,
        win_length=None,
        hop_length=512,
        power=2,
    )
    self.amp_to_db = T.AmplitudeToDB(stype='power')

    self.src = torch.from_numpy(np.load(src_path))
    self.src = self.src.unsqueeze(0).repeat(batch_size, 1, 1)
    self.src = self.src.to(device)

    self.d = d
    self.device = device
def preprocess(file_path='../DATASETS/LJSpeech-1.1/metadata.csv', root_dir='../DATASETS/LJSpeech-1.1'):
    with open(file_path, encoding='utf8') as file:
        data_ = [line.strip().split('|') for line in file]

    sample_rate = 8000
    resample = transforms.Resample(orig_freq=22050, new_freq=sample_rate)
    spectrogram = transforms.Spectrogram(n_fft=1024, hop_length=256)
    to_mel = transforms.MelScale(n_mels=80, sample_rate=sample_rate, n_stft=1024 // 2 + 1)

    mel_data = torch.zeros(len(data_), 316, 80)
    mel_len = torch.empty(len(data_), dtype=torch.int)

    for idx, data in enumerate(tqdm(data_)):
        path, text = data[0], data[1]
        path = f'{root_dir}/wavs/{path}.wav'
        data, sample_rate = torchaudio.load(path)
        data = resample(data)
        data = spectrogram(data)
        data = to_mel(data)
        data = data.transpose(1, 2).squeeze(0)
        mel_data[idx, :data.size(0)] = data
        mel_len[idx] = data.size(0)

    torch.save(mel_data, f'{root_dir}/mel_data.pt')
    torch.save(mel_len, f'{root_dir}/mel_len.pt')
def test_spectrogram(self):
    specgram = transforms.Spectrogram(center=False, pad_mode="reflect", onesided=False)
    self.assertEqual(specgram.center, False)
    self.assertEqual(specgram.pad_mode, "reflect")
    self.assertEqual(specgram.onesided, False)
def test_Spectrogram_complex(self):
    n_fft = 400
    hop_length = 200
    sample_rate = 16000
    waveform = get_whitenoise(
        sample_rate=sample_rate,
        n_channels=1,
    ).to(self.device, self.dtype)

    expected = librosa.core.spectrum._spectrogram(
        y=waveform[0].cpu().numpy(),
        n_fft=n_fft, hop_length=hop_length, power=1)[0]

    result = T.Spectrogram(
        n_fft=n_fft,
        hop_length=hop_length,
        power=None,
        return_complex=True,
    ).to(self.device, self.dtype)(waveform)[0]
    self.assertEqual(result.abs(), torch.from_numpy(expected), atol=1e-5, rtol=1e-5)
def __getitem__(self, index):
    filename = self.data_path[index]
    n_fft = 128
    # fbins = n_fft // 2 + 1
    spec_transform = transforms.Spectrogram(n_fft=n_fft, normalized=False)

    label = int(filename.split("/")[-1].split("_")[0])
    soundSource = filename.split("/")[-1].split("_")[1]
    number = filename.split("/")[-1].split("_")[2]

    wave, sample_rate = torchaudio.load_wav(filename)
    spec = spec_transform(wave)
    log_spec = (spec + 1e-9).log2()[0, :, :]

    width = 65
    height = log_spec.shape[0]
    dim = (width, height)
    log_spec = cv2.resize(log_spec.numpy(), dim, interpolation=cv2.INTER_AREA)

    plt.figure()
    plt.imshow(log_spec)
    plt.show()

    return log_spec, label, soundSource
def __init__(self, sample_rate=16000, n_fft=800, win_length=800, hop_length=200, n_mels=80,
             rescale=True, rescaling_max=0.9, max_abs_value=4., preemphasis=0.97, preemphasize=True,
             fmin=55, fmax=7600, min_level_db=-100, ref_level_db=20, symmetric_mels=True):
    super(logFbankCal, self).__init__()

    # these basic hyperparameters can be removed
    self.sample_rate = sample_rate
    self.n_fft = n_fft
    self.win_length = win_length
    self.hop_length = hop_length
    self.n_mels = n_mels
    self.fmin = fmin
    self.fmax = fmax

    self.rescale = rescale
    self.rescaling_max = torch.tensor(rescaling_max, dtype=torch.float)

    self.preemphasize = preemphasize
    self.flipped_filter = torch.FloatTensor([-preemphasis, 1.]).unsqueeze(0).unsqueeze(0)

    self.stftCal = transforms.Spectrogram(n_fft, win_length, hop_length, power=None)

    mel_basis = librosa_mel_fn(sample_rate, n_fft, n_mels, fmin, fmax)
    self.mel_basis = torch.from_numpy(mel_basis).float()

    self.symmetric_mels = symmetric_mels
    self.ref_level_db = torch.tensor(ref_level_db, dtype=torch.float)
    self.min_level_db = torch.tensor(min_level_db, dtype=torch.float)
    self.min_level = torch.tensor(np.exp(min_level_db / 20 * np.log(10)), dtype=torch.float)
    self.max_abs_value = torch.tensor(max_abs_value, dtype=torch.float)
def test_spectrogram(self, kwargs):
    # replication_pad1d_backward_cuda is not deterministic and
    # gives very small (~2.7756e-17) difference.
    #
    # See https://github.com/pytorch/pytorch/issues/54093
    transform = T.Spectrogram(**kwargs)
    waveform = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
    self.assert_grad(transform, [waveform], nondet_tol=1e-10)
def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels):
    super(PhaseFbankCal, self).__init__()
    self.complexSpec = transforms.Spectrogram(n_fft=n_fft,
                                              win_length=win_length,
                                              hop_length=hop_length,
                                              power=None)
    self.mel_scale = transforms.MelScale(n_mels=n_mels,
                                         sample_rate=sample_rate,
                                         n_stft=n_fft // 2 + 1)
def compute_stft(waveform: Tensor, n_fft: int, win_length: int, hop_length: int) -> Tuple[Tensor, Tensor]:
    device = waveform.device
    spectrogram = _transf.Spectrogram(n_fft=n_fft, win_length=win_length,
                                      hop_length=hop_length, power=None).to(device)
    amplitude, phase = _func.magphase(spectrogram(waveform))
    return amplitude, phase
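# A minimal usage sketch for compute_stft above. The 1-second 440 Hz sine and the STFT
# sizes below are assumptions chosen only for illustration; magphase splits the complex
# spectrogram into magnitude and phase, so both outputs share the same shape.
import torch

t = torch.arange(16000) / 16000.0
waveform = torch.sin(2 * torch.pi * 440.0 * t).unsqueeze(0)  # shape: (1, 16000)
amplitude, phase = compute_stft(waveform, n_fft=1024, win_length=1024, hop_length=256)
print(amplitude.shape, phase.shape)  # both (1, n_fft // 2 + 1, num_frames)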
def __init__(self, n_fft, hop_length):
    """Calculate spectrogram of a set of 1D signals.

    The first dimension is batch/channel and the second should be time.

    Arguments:
        n_fft {int} -- Size of time window over which to calculate each FFT.
        hop_length {int} -- The stride length between the start of each FFT window.
    """
    self.spec_fn = transforms.Spectrogram(n_fft=n_fft, hop_length=hop_length)
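# A hedged usage sketch for the wrapper initialized above. The class name SpectrogramCalc
# is hypothetical (the snippet only shows __init__); spec_fn is applied directly to a
# [batch, time] tensor, matching the shape contract stated in the docstring.
import torch

calc = SpectrogramCalc(n_fft=512, hop_length=128)  # hypothetical class name
signals = torch.randn(4, 16000)                    # 4 one-second signals at 16 kHz
specs = calc.spec_fn(signals)                      # shape: (4, 512 // 2 + 1, num_frames)
print(specs.shape)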
def test_roundtrip_spectrogram(self, **args):
    """Test that spectrogram followed by inverse spectrogram is an approximate identity."""
    waveform = get_whitenoise(sample_rate=8000, duration=0.5, dtype=self.dtype)

    s = T.Spectrogram(**args, power=None)
    inv_s = T.InverseSpectrogram(**args)
    transformed = s.forward(waveform)
    restored = inv_s.forward(transformed, length=waveform.shape[-1])
    self.assertEqual(waveform, restored, atol=1e-6, rtol=1e-6)
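# A standalone sketch of the same round trip outside the test harness. The n_fft value and
# the torch.randn waveform are assumptions; with power=None the spectrogram keeps the phase,
# so InverseSpectrogram can invert it up to floating-point error.
import torch
import torchaudio.transforms as T

waveform = torch.randn(1, 8000)
spec = T.Spectrogram(n_fft=400, power=None)
inv_spec = T.InverseSpectrogram(n_fft=400)
restored = inv_spec(spec(waveform), length=waveform.shape[-1])
print(torch.allclose(waveform, restored, atol=1e-6))  # typically True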
def __init__(self, d, src_path, batch_size, device, dropout_rate=0.5, encoder='ResNet'):
    super(VAE, self).__init__()
    '''
    =========== ARGUMENTS ===========
    > d - dimensionality of latent space
    > src_path - path to source samples
    > batch_size - number of training examples in single batch
    > device - CPU or GPU in use
    =================================
    '''
    if encoder == 'ResNet':
        self.enc = ResNetBigger(d=d, dropout_rate=dropout_rate)
    elif encoder == 'CNN':
        self.enc = CNN(d=d, batch_size=batch_size, dropout_rate=dropout_rate)
    elif encoder == 'linear':
        self.enc = nn.Sequential(
            nn.Linear(1025, 400),
            nn.ReLU(),
            nn.Linear(400, 100),
            nn.ReLU(),
            nn.Linear(100, 50),
            nn.ReLU(),
            nn.Linear(50, d)
        )

    self.dec = nn.Sequential(
        nn.Linear(d, 40),
        nn.ReLU(),
        nn.Linear(40, 50),
        nn.Softmax()
    )
    self.softmax = nn.Softmax(dim=0)

    # For computing spectrogram then normalizing
    self.spectrogram = T.Spectrogram(
        n_fft=2048,
        win_length=None,
        hop_length=512,
        power=2,
    )
    self.amp_to_db = T.AmplitudeToDB(stype='power')

    self.src = torch.from_numpy(np.load(src_path))
    self.src = self.src.unsqueeze(0).repeat(batch_size, 1, 1)
    self.src = self.src.to(device)

    self.d = d
    self.global_step = 0
    self.epoch = 0
def _wav_to_spec(path=None, wav=None, sr=sample_rate, engine='librosa'):
    '''STFT spectrogram with absolute values.'''
    if path is None and wav is None:
        raise ValueError
    if path is not None:
        wav, _ = librosa.core.load(path, sr=None)
    if engine == 'librosa':
        return np.abs(librosa.stft(wav, **stft_params))
    elif engine == 'torch':
        return tf.Spectrogram(**stft_params, power=power)(torch.from_numpy(wav))
    raise ValueError(engine)
def spectro_gram(aud, spectro_type='mel', n_mels=64, n_fft=1024, hop_len=None):
    sig, sr = aud
    f_min, f_max, ws, top_db, pad = 0.0, None, None, 80, 0

    # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
    if spectro_type == 'mel':
        spec = transforms.MelSpectrogram(sr, n_fft, ws, hop_len, f_min, f_max, pad, n_mels)(sig)
    elif spectro_type == 'mfcc':
        pass
    else:
        spec = transforms.Spectrogram(n_fft, ws, hop_len, pad, normalized=False)(sig)

    # Convert to decibels
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return spec
def get_train_transforms(config: object,
                         transforms_set: TformsSet = TformsSet.Audtorch) -> object:
    if config.use_mels:
        if transforms_set == TformsSet.TorchAudio:
            trans = tforms_vision.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_torch.MelSpectrogram(sample_rate=config.resampling_rate,
                                            n_fft=config.n_fft,
                                            win_length=config.hop_length,
                                            hop_length=config.hop_length,
                                            f_min=float(config.fmin),
                                            f_max=float(config.fmax),
                                            pad=0,
                                            n_mels=config.n_mels),
                tforms_torch.AmplitudeToDB(stype='power', top_db=80),
                # tforms_aud.RandomCrop(config.max_length_frames),  # Raises "Can't call numpy() on Variable that requires grad. Use var.detach().numpy() instead."
            ])
        elif transforms_set == TformsSet.MySet:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_mine.Spectrogram(config),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
    else:
        if transforms_set == TformsSet.TorchAudio:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_torch.Spectrogram(n_fft=config.n_fft,
                                         win_length=config.hop_length,
                                         hop_length=config.hop_length,
                                         pad=0,
                                         power=2,
                                         normalized=True),
                tforms_torch.AmplitudeToDB(stype='power', top_db=80),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
        elif transforms_set == TformsSet.MySet:  # this works
            trans = tforms_aud.Compose([
                tforms_torch.Resample(orig_freq=44100, new_freq=config.resampling_rate),
                tforms_mine.Spectrogram(config),
                tforms_aud.RandomCrop(config.max_length_frames)
            ])
    return trans
def get_spectrogram(
    n_fft=400,
    win_len=None,
    hop_len=None,
    power=2.0,
):
    waveform, _ = get_speech_sample()
    spectrogram = T.Spectrogram(
        n_fft=n_fft,
        win_length=win_len,
        hop_length=hop_len,
        center=True,
        pad_mode="reflect",
        power=power,
    )
    return spectrogram(waveform)
def test_ecoacoustics_dm(root: Path):
    dm = EcoacousticsDataModule(root=root,
                                segment_len=30.0,
                                target_attrs="habitat",
                                train_transforms=AT.Spectrogram())
    dm.prepare_data()
    dm.setup()

    # Test loading a sample.
    train_dl = dm.train_dataloader()
    test_sample = next(iter(train_dl))

    # Test size().
    assert test_sample.x.size()[1] == dm.dims[0]
    assert test_sample.x.size()[2] == dm.dims[1]
    assert test_sample.x.size()[3] == dm.dims[2]
def __call__(self, data):
    signal = data["signal"]
    sr = data['sample_rate']

    self.n_fft = int(np.ceil(0.025 * sr))
    self.win_length = int(np.ceil(0.025 * sr))
    self.hop_length = int(np.ceil(0.01 * sr))

    spec = nn.Sequential(
        T.Spectrogram(n_fft=self.n_fft,
                      win_length=self.win_length,
                      hop_length=self.hop_length),
        T.AmplitudeToDB())

    data['Spectrogram'] = spec(signal)
    data['input'] = spec(signal)
    return data
def get_time_frequency_transform(config):
    """
    Returns a nn.Sequential block to do a time-frequency transform, and crop to the desired size.

    The spectrogram has shape: [batch, channels, freq_bins, frames]

    :param config:
    :return:
    """
    if config.use_mels:
        transformer = nn.Sequential(
            tforms_torch.MelSpectrogram(sample_rate=config.new_fs,
                                        n_fft=config.n_fft,
                                        win_length=config.win_length,
                                        hop_length=config.hop_length,
                                        f_min=float(config.fmin),
                                        f_max=float(config.fmax),
                                        pad=0,
                                        n_mels=config.n_mels),
            # utils.make_module(tforms_mine.RandomCrop)(config.max_length_frames),
            tforms_mine.RandomCrop(
                (1, config.n_mels if config.use_mels else config.n_fft // 2 + 1, config.max_length_frames),
                value=0),
            tforms_torch.AmplitudeToDB(stype='power', top_db=80),
            tforms_mine.ReScaleSpec([-1, 1]),
        )
    else:
        transformer = nn.Sequential(
            tforms_torch.Spectrogram(n_fft=config.n_fft,
                                     win_length=config.win_length,
                                     hop_length=config.hop_length,
                                     pad=0,
                                     power=2,
                                     normalized=True),
            tforms_mine.RandomCrop(
                (1, config.n_mels if config.use_mels else config.n_fft // 2 + 1, config.max_length_frames),
                value=0),
            tforms_mine.AmplitudeToDB(stype='power', top_db=80),
            # utils.make_module(tforms_mine.RandomCrop)(config.max_length_frames),
            tforms_mine.ReScaleSpec([-1, 1]),
        )
    return transformer
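# A hedged usage sketch for get_time_frequency_transform. The config fields and the batch
# size below are assumptions, and tforms_mine.RandomCrop / ReScaleSpec are the custom
# transforms referenced above; applied to a batch of mono waveforms, the block should yield
# the [batch, channels, freq_bins, frames] shape stated in the docstring.
import torch

transformer = get_time_frequency_transform(config)
batch = torch.randn(8, 1, config.new_fs)  # one second of audio per example (assumed)
spec = transformer(batch)
print(spec.shape)  # (8, 1, freq_bins, config.max_length_frames)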
def tfm_spectro(ad, sr):
    # We must reshape signal for torchaudio to generate the spectrogram.
    ws = 512
    hop = 256
    to_db_scale = False
    n_fft = 1024
    f_min = 0.0
    f_max = -80
    pad = 0
    n_mels = 128
    # mel = transforms.MelSpectrogram(sr, n_mels=n_mels, n_fft=n_fft, hop=hop, f_min=f_min, f_max=f_max, pad=pad)(ad)
    sp = transforms.Spectrogram()(ad)
    mel = transforms.MelScale()(sp)
    # mel = mel.permute(0, 2, 1)  # swap dimension, mostly to look sane to a human.
    # if to_db_scale: mel = transforms.SpectrogramToDB(stype='magnitude', top_db=f_max)(mel)
    # mel = mel.detach().numpy()
    if to_db_scale:
        mel = 20 * torch.log10(mel)
    return mel
def __init__(self, table_path, alphabet, dataset_path='', max_len=0, preprocess='raw',
             normalize=False, pkwargs=None):
    global transform, do_normalize
    super(SpeechDataset, self).__init__()
    self.table = pd.read_csv(table_path)
    self.dataset_path = dataset_path
    self.intencode = IntegerEncode(alphabet)
    self.max_len = max_len
    if preprocess == "mfcc":
        transform = transforms.MFCC(sample_rate=pkwargs['sr'], n_mfcc=pkwargs['num_features'])
    elif preprocess == "spectrogram":
        transform = transforms.Spectrogram(
            n_fft=pkwargs['n_fft'], normalized=pkwargs['normalized'])
def test_Spectrogram(self, n_fft, hop_length, power):
    sample_rate = 16000
    waveform = get_whitenoise(
        sample_rate=sample_rate,
        n_channels=1,
    ).to(self.device, self.dtype)

    expected = librosa.core.spectrum._spectrogram(
        y=waveform[0].cpu().numpy(),
        n_fft=n_fft, hop_length=hop_length, power=power)[0]

    result = T.Spectrogram(
        n_fft=n_fft,
        hop_length=hop_length,
        power=power,
    ).to(self.device, self.dtype)(waveform)[0]
    self.assertEqual(result, torch.from_numpy(expected), atol=1e-5, rtol=1e-5)
def compareTforms(config):
    '''
    Here I compare different transformation sets for spectrograms, using torchaudio,
    audtorch, and my own custom spectrogram based on librosa. The code is applied to a
    sample audio file from the LibriSpeech dataset. It was written mostly to post as a
    minimal working example in a GitHub issue.
    '''
    config.use_mels = False
    config.win_length = 400
    config.hop_length = 400
    config.n_fft = 2048
    config.resampling_rate = 16000

    augment1 = tforms2.Compose([
        myTforms.ToTensor(),
        tforms.Spectrogram(
            n_fft=2048,
            win_length=400,  # 400 samples @ 16k = 25 ms
            hop_length=400,
            pad=0,
            power=2,
            normalized=False),
        tforms.AmplitudeToDB(stype='power', top_db=80)
    ])

    augment2 = tforms2.Compose([
        tforms2.Spectrogram(
            window_size=400,  # 400 samples @ 16k = 25 ms
            hop_size=400,
            fft_size=2048),
        myTforms.ToTensor(),
        tforms.AmplitudeToDB(stype='magnitude', top_db=80)
    ])

    augment3 = tforms2.Compose([myTforms.Spectrogram(config)])

    data1 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean',
        download=False,
        transform=augment1)
    data2 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean',
        download=False,
        transform=augment2)
    data3 = dsets.LibriSpeech(
        root='/m/cs/work/falconr1/datasets/librespeech/LibriSpeech',
        sets='dev-clean',
        download=False,
        transform=augment3)

    plt.figure(figsize=(16, 8))
    titles = ['torchaudio', 'audtorch', 'myset']

    for i, data in enumerate([data1, data2, data3]):
        spec, label = data[0]
        if isinstance(spec, torch.Tensor):
            spec = spec.numpy()

        plt.subplot(1, 3, i + 1)
        plt.imshow(spec.squeeze(), interpolation='nearest', cmap='inferno',
                   origin='lower', aspect='auto')
        plt.colorbar()
        plt.title(titles[i])

    plt.savefig(os.path.join('./results', 'Test_Output_compare_specs.png'))
    plt.show()
def test_Spectrogram_return_complex(self):
    tensor = torch.rand((1, 1000))
    self._assert_consistency(
        T.Spectrogram(power=None, return_complex=True),
        tensor)
def test_Spectrogram(self):
    tensor = torch.rand((1, 1000))
    self._assert_consistency(T.Spectrogram(), tensor)
# samples = [0, 1, 2, 2547]
# plot_samples(samples)

df_train, df_val = train_test_split(df, train_size=SPLIT_RATIO, test_size=1 - SPLIT_RATIO,
                                    random_state=RANDOM_STATE)
print(len(df_train))
print(len(df_val))
print(f'-----------------\n'
      f'{df_train.loc[0]}')

spectrogram = T.Spectrogram(n_fft=128, hop_length=64)
classes = torch.Tensor(df.target.unique())

model = resnet18(pretrained=True)
model.conv1 = nn.Conv2d(1, model.conv1.out_channels,
                        kernel_size=model.conv1.kernel_size[0],
                        stride=model.conv1.stride[0],
                        padding=model.conv1.padding[0])
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, len(classes))

train = df.sample(frac=1, random_state=RANDOM_STATE)
X = train.file_path
y = train.target

model.fit(torch.Tensor(np.load(X[0])), y[0])
#
# To get the frequency make-up of an audio signal as it varies with time,
# you can use ``Spectrogram``.
#

waveform, sample_rate = get_speech_sample()

n_fft = 1024
win_length = None
hop_length = 512

# define transformation
spectrogram = T.Spectrogram(
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    center=True,
    pad_mode="reflect",
    power=2.0,
)

# Perform transformation
spec = spectrogram(waveform)

print_stats(spec)
plot_spectrogram(spec[0], title='torchaudio')

######################################################################
# GriffinLim
# ----------
#
# To recover a waveform from a spectrogram, you can use ``GriffinLim``.
#
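# A minimal sketch of that step, reusing the ``spec`` computed above. GriffinLim's default
# ``power=2.0`` matches the power spectrogram above; the phase is only estimated, so the
# recovered waveform will not match the original sample exactly.
griffin_lim = T.GriffinLim(
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
)
reconstructed_waveform = griffin_lim(spec)
print(reconstructed_waveform.shape)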
val_path = r'X:\DS Training Data\samples\val.npy'
val_data = Dataset(val_path)
val_loader = torch.utils.data.DataLoader(val_data, drop_last=True, **params)

test_path = r'X:\DS Training Data\samples\test.npy'
test_data = Dataset(test_path)
test_loader = torch.utils.data.DataLoader(test_data, drop_last=True, **params)

spectrogram = T.Spectrogram(
    n_fft=2048,
    win_length=None,
    hop_length=512,
    power=2,
)
amp_to_db = T.AmplitudeToDB(stype='power')

src_path = r'X:\DS Training Data\samples\src.npy'

model = VAE(d=32, src_path=src_path, batch_size=params['batch_size'], device=device,
            dropout_rate=0.25, encoder='CNN').cuda()
print(model)

criterion = loss_function
def get_train_transforms(config: object, set: TformsSet = TformsSet.Audtorch) -> object:
    if config.use_mels:
        if set == TformsSet.TorchAudio:
            trans = transforms.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                tforms.MelSpectrogram(sample_rate=config.resampling_rate,
                                      n_fft=config.n_fft,
                                      win_length=config.hop_length,
                                      hop_length=config.hop_length,
                                      f_min=float(config.fmin),
                                      f_max=float(config.fmax),
                                      pad=0,
                                      n_mels=config.n_mels),
                tforms.AmplitudeToDB(stype='power', top_db=80),
                # transforms.ToPILImage(),
                # transforms.RandomCrop((96, 256), pad_if_needed=True,
                #                       padding_mode='reflect'),
                # transforms.ToTensor(),
            ])
        elif set == TformsSet.Audtorch:
            # no real mel spectrogram in audtorch
            trans = tforms2.Compose([
                myTforms.ToNumpy(),
                tforms2.Crop((441000, 441000 + 441000)),
                # tforms2.Normalize(),
                tforms2.Spectrogram(
                    window_size=config.hop_length,
                    hop_size=config.hop_length,
                    fft_size=config.n_fft,
                ),
                tforms2.Log(),
                myTforms.ToTensor(),
                tforms.AmplitudeToDB(stype='magnitude', top_db=80)
            ])
        elif set == TformsSet.MySet:
            trans = tforms2.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                myTforms.Spectrogram(config)
            ])
    else:
        if set == TformsSet.TorchAudio:
            trans = transforms.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                tforms.Spectrogram(n_fft=config.n_fft,
                                   win_length=config.hop_length,
                                   hop_length=config.hop_length,
                                   pad=0,
                                   power=2,
                                   normalized=True),
                tforms.AmplitudeToDB(stype='power', top_db=80),
                # tforms.MelSpectrogram(sample_rate=config.resampling_rate,
                #                       n_fft=config.n_fft,
                #                       win_length=config.hop_length,
                #                       hop_length=config.hop_length,
                #                       f_min=float(config.fmin),
                #                       f_max=float(config.fmax),
                #                       pad=0,
                #                       n_mels=config.n_mels),
                # transforms.ToPILImage(),
                # transforms.RandomCrop((96, 256), pad_if_needed=True,
                #                       padding_mode='reflect'),
                # transforms.ToTensor(),
            ])
        elif set == TformsSet.Audtorch:
            trans = tforms2.Compose([
                myTforms.ToNumpy(),
                tforms2.Crop((441000, 441000 + 441000)),
                # tforms2.Normalize(),
                tforms2.Spectrogram(
                    window_size=config.hop_length,
                    hop_size=config.hop_length,
                    fft_size=config.n_fft,
                ),
                myTforms.ToTensor(),
                tforms.AmplitudeToDB(stype='magnitude', top_db=80)
            ])
        elif set == TformsSet.MySet:
            trans = tforms2.Compose([
                tforms2.Crop((441000, 441000 + 441000)),
                myTforms.Spectrogram(config)
            ])
    return trans