def test_batch_Resample(self):
    """Resample must give the same result on a batch as on a single item.

    A single waveform is resampled and then tiled into a batch; separately
    the waveform is tiled first and the batch is resampled. Both results
    must agree in shape and (approximately) in value.
    """
    signal = torch.randn(2, 2786)
    tiling = (3, 1, 1)

    # Single item -> transform -> batch.
    expected = transforms.Resample()(signal).repeat(*tiling)

    # Batch -> transform.
    computed = transforms.Resample()(signal.repeat(*tiling))

    shapes = (computed.shape, expected.shape)
    self.assertTrue(computed.shape == expected.shape, shapes)
    self.assertTrue(torch.allclose(computed, expected))
def get_train_transforms(
        config: object,
        transforms_set: TformsSet = TformsSet.Audtorch) -> object:
    """Build the training-time transform pipeline.

    Every pipeline starts by resampling from 44100 Hz to
    ``config.resampling_rate`` and then computes a spectral representation
    chosen by ``config.use_mels`` and ``transforms_set``.

    Args:
        config: Object exposing ``use_mels``, ``resampling_rate`` and
            ``max_length_frames``. The TorchAudio pipelines additionally
            read ``n_fft`` and ``hop_length``, and the mel pipeline also
            reads ``fmin``, ``fmax`` and ``n_mels``.
        transforms_set: Transform family to use. Only
            ``TformsSet.TorchAudio`` and ``TformsSet.MySet`` are implemented.

    Returns:
        The composed transform.

    Raises:
        ValueError: If ``transforms_set`` is not one of the implemented
            families. Previously this case failed with an
            ``UnboundLocalError`` on ``return``.
    """
    use_mels = config.use_mels

    if transforms_set == TformsSet.MySet:
        # The MySet pipeline is the same whether or not mels are requested.
        return tforms_aud.Compose([
            tforms_torch.Resample(orig_freq=44100,
                                  new_freq=config.resampling_rate),
            tforms_mine.Spectrogram(config),
            tforms_aud.RandomCrop(config.max_length_frames),
        ])

    if transforms_set != TformsSet.TorchAudio:
        raise ValueError(
            f"Unsupported transforms_set: {transforms_set!r}; "
            "expected TformsSet.TorchAudio or TformsSet.MySet")

    if use_mels:
        return tforms_vision.Compose([
            tforms_torch.Resample(orig_freq=44100,
                                  new_freq=config.resampling_rate),
            tforms_torch.MelSpectrogram(sample_rate=config.resampling_rate,
                                        n_fft=config.n_fft,
                                        win_length=config.hop_length,
                                        hop_length=config.hop_length,
                                        f_min=float(config.fmin),
                                        f_max=float(config.fmax),
                                        pad=0,
                                        n_mels=config.n_mels),
            tforms_torch.AmplitudeToDB(stype='power', top_db=80),
            # NOTE: tforms_aud.RandomCrop(config.max_length_frames) is left
            # out here on purpose; it raised "Can't call numpy() on Variable
            # that requires grad. Use var.detach().numpy() instead."
        ])

    return tforms_aud.Compose([
        tforms_torch.Resample(orig_freq=44100,
                              new_freq=config.resampling_rate),
        tforms_torch.Spectrogram(n_fft=config.n_fft,
                                 win_length=config.hop_length,
                                 hop_length=config.hop_length,
                                 pad=0,
                                 power=2,
                                 normalized=True),
        tforms_torch.AmplitudeToDB(stype='power', top_db=80),
        tforms_aud.RandomCrop(config.max_length_frames),
    ])
def test_resample_identity(self, resampling_method, sample_rate):
    """Resampling to the same rate must hand back an identical Tensor."""
    original = get_whitenoise(sample_rate=sample_rate, duration=1)
    identity = T.Resample(sample_rate, sample_rate, resampling_method)
    self.assertEqual(original, identity(original))
def __call__(self, waveform, sample_rate):
    """Turn a waveform into fixed-size, non-overlapping feature patches.

    Args:
        waveform: torch tensor [num_audio_channels, num_time_steps]
        sample_rate: sample rate of ``waveform`` in samples per second

    Returns:
        torch tensor reshaped to
        [num_chunks, 1, window_size_in_frames, last_dim], where ``last_dim``
        is the trailing axis of the transposed ``self.mel_trans_ope`` output
        (the original comments call it ``num_freq``). Trailing frames that
        do not fill a whole window are dropped.
    """
    # Average over channels, keeping a leading channel axis of size 1.
    mono = waveform.mean(axis=0, keepdims=True)

    to_target_rate = ta_trans.Resample(sample_rate,
                                       CommonParams.TARGET_SAMPLE_RATE)
    features = self.mel_trans_ope(to_target_rate(mono))

    # Drop the leading axis and transpose so frames come first.
    frames = features.squeeze(dim=0).T

    frames_per_patch = int(
        round(CommonParams.PATCH_WINDOW_IN_SECONDS /
              CommonParams.STFT_HOP_LENGTH_SECONDS))
    patch_count = frames.shape[0] // frames_per_patch

    # Keep only as many frames as fill whole windows, then split them.
    frames = frames[:patch_count * frames_per_patch]
    return frames.reshape(patch_count, 1, frames_per_patch, frames.shape[-1])
def benchmark_resample(
    method,
    waveform,
    sample_rate,
    resample_rate,
    lowpass_filter_width=DEFAULT_LOWPASS_FILTER_WIDTH,
    rolloff=DEFAULT_ROLLOFF,
    resampling_method=DEFAULT_RESAMPLING_METHOD,
    beta=None,
    librosa_type=None,
    iters=5,
):
    """Return the average wall-clock seconds of one resampling call.

    Args:
        method: ``"functional"`` (``F.resample``), ``"transforms"``
            (``T.Resample``) or ``"librosa"`` (``librosa.resample``).
        waveform: Tensor to resample. For ``"librosa"`` it is squeezed and
            converted to a NumPy array once, outside the timed region.
        sample_rate: Source sample rate.
        resample_rate: Target sample rate.
        lowpass_filter_width: Forwarded to the torchaudio resamplers.
        rolloff: Forwarded to the torchaudio resamplers.
        resampling_method: Forwarded to the torchaudio resamplers.
        beta: Forwarded to the torchaudio resamplers (it was previously
            accepted but silently ignored).
        librosa_type: Passed to librosa as ``res_type``.
        iters: Number of timed repetitions to average over.

    Returns:
        Mean elapsed seconds per call, or ``None`` if ``method`` is not one
        of the three recognised names.
    """

    def _average_seconds(run_once):
        # perf_counter is the appropriate clock for short benchmarks.
        begin = time.perf_counter()
        for _ in range(iters):
            run_once()
        return (time.perf_counter() - begin) / iters

    if method == "functional":
        return _average_seconds(lambda: F.resample(
            waveform,
            sample_rate,
            resample_rate,
            lowpass_filter_width=lowpass_filter_width,
            rolloff=rolloff,
            resampling_method=resampling_method,
            beta=beta,
        ))

    if method == "transforms":
        # The transform is constructed outside the timed region.
        resampler = T.Resample(
            sample_rate,
            resample_rate,
            lowpass_filter_width=lowpass_filter_width,
            rolloff=rolloff,
            resampling_method=resampling_method,
            beta=beta,
            dtype=waveform.dtype,
        )
        return _average_seconds(lambda: resampler(waveform))

    if method == "librosa":
        waveform_np = waveform.squeeze().numpy()
        # Keyword arguments work with both old and new librosa signatures.
        return _average_seconds(lambda: librosa.resample(
            waveform_np,
            orig_sr=sample_rate,
            target_sr=resample_rate,
            res_type=librosa_type,
        ))

    return None
def __init__(self, fs):
    """Load the impulse responses from disk and resample them to ``fs``.

    Reads the ``allIRs`` entries of a fixed HDF5/.mat file, averages each
    one over axis 1 (labelled "mono" by the original author), makes sure the
    result has a leading channel axis, resamples it from 48000 Hz to ``fs``
    and stores the list on ``self.rirs``.

    Args:
        fs: Target sample rate handed to the resampler as ``new_freq``.
    """
    super(ApplyReverb, self).__init__()
    rir_dir = '/m/cs/scratch/sequentialml/datasets/RIRs/razr'
    rir_path = os.path.join(rir_dir, 'BRIRs_23-Nov-2019_19-35-31.mat')
    self.resampler = tforms_torch.Resample(orig_freq=48000, new_freq=fs)

    # Imported here so h5py is only needed when this class is instantiated.
    import h5py

    rirs = []
    with h5py.File(rir_path, 'r') as f:
        all_irs = f['allIRs']
        for i in range(len(all_irs)):
            # Each entry is used as a key back into the file.
            ir = np.array(f[all_irs[i][0]]).transpose()
            ir = np.mean(ir, axis=1)  # mono
            ir = torch.Tensor(ir)
            if len(ir.shape) < 2:
                ir = ir.unsqueeze(0)  # shape is (channels, timesteps)
            rirs.append(self.resampler(ir))
    self.rirs = rirs
    # The placeholder was previously printed literally because the string
    # was never formatted.
    print("Loaded {} RIRs".format(len(self.rirs)))
def get_train_transforms_audio_only(
        config: object,
        transforms_set: TformsSet = TformsSet.Audtorch) -> object:
    """Return the audio-only training transform.

    The transform is a single resampler from 44100 Hz to
    ``config.resampling_rate``. ``transforms_set`` is accepted but not
    consulted.
    """
    target_rate = config.resampling_rate
    return tforms_torch.Resample(orig_freq=44100, new_freq=target_rate)
def preprocess(file_path='../DATASETS/LJSpeech-1.1/metadata.csv',
               root_dir='../DATASETS/LJSpeech-1.1'):
    """Precompute mel features for every entry of a metadata file.

    Each metadata line is split on ``|``; the first field names a wav file
    under ``{root_dir}/wavs``. Every file is resampled (22050 -> 8000),
    turned into a spectrogram and projected onto 80 mel bins. Results are
    written to ``{root_dir}/mel_data.pt`` (zero-padded to 316 frames) and
    ``{root_dir}/mel_len.pt`` (frame count per entry).

    Args:
        file_path: Path to the ``|``-separated metadata file.
        root_dir: Dataset root; wavs are read from and outputs written to it.
    """
    with open(file_path, encoding='utf8') as file:
        entries = [line.strip().split('|') for line in file]

    sample_rate = 8000
    resample = transforms.Resample(orig_freq=22050, new_freq=sample_rate)
    spectogram = transforms.Spectrogram(n_fft=1024, hop_length=256)
    to_mel = transforms.MelScale(n_mels=80,
                                 sample_rate=sample_rate,
                                 n_stft=1024 // 2 + 1)

    mel_data = torch.zeros(len(entries), 316, 80)
    mel_len = torch.empty(len(entries), dtype=torch.int)

    for idx, entry in enumerate(tqdm(entries)):
        wav_path = f'{root_dir}/wavs/{entry[0]}.wav'
        # The file's own sample rate is ignored; 22050 Hz is assumed above.
        waveform, _ = torchaudio.load(wav_path)
        mel = to_mel(spectogram(resample(waveform)))
        # Move frames in front of mel bins and drop the leading axis.
        mel = mel.transpose(1, 2).squeeze(0)
        mel_data[idx, :mel.size(0)] = mel
        mel_len[idx] = mel.size(0)

    torch.save(mel_data, f'{root_dir}/mel_data.pt')
    torch.save(mel_len, f'{root_dir}/mel_len.pt')
def test_vctk_transform_pipeline(self):
    """Compare torchaudio resample+dither against the equivalent sox chain."""
    test_filepath_vctk = common_utils.get_asset_path(
        'VCTK-Corpus', 'wav48', 'p224', 'p224_002.wav')
    waveform, native_rate = torchaudio.load(test_filepath_vctk)

    # torchaudio side: rate conversion, then dither.
    to_16k = T.Resample(native_rate, 16000,
                        resampling_method='sinc_interpolation')
    waveform = to_16k(waveform)
    waveform = F.dither(waveform, noise_shaping=True)

    # sox side: the same steps expressed as an effects chain.
    chain = torchaudio.sox_effects.SoxEffectsChain()
    chain.set_input_file(test_filepath_vctk)
    sox_effects = (
        ("gain", ["-h"]),
        ("channels", [1]),
        ("rate", [16000]),
        ("gain", ["-rh"]),
        ("dither", ["-s"]),
    )
    for effect_name, effect_args in sox_effects:
        chain.append_effect_to_chain(effect_name, effect_args)
    sox_waveform = chain.sox_build_flow_effects()[0]

    torch.testing.assert_allclose(waveform, sox_waveform,
                                  rtol=1e-03, atol=1e-03)
def test_vctk_transform_pipeline(self):
    """Compare torchaudio resample+dither against the equivalent sox chain."""
    test_filepath_vctk = os.path.join(self.test_dirpath,
                                      "assets/VCTK-Corpus/wav48/p224/",
                                      "p224_002.wav")
    waveform, native_rate = torchaudio.load(test_filepath_vctk)

    # torchaudio side: rate conversion, then dither.
    to_16k = T.Resample(native_rate, 16000,
                        resampling_method='sinc_interpolation')
    waveform = to_16k(waveform)
    waveform = F.dither(waveform, noise_shaping=True)

    # sox side: the same steps expressed as an effects chain.
    chain = torchaudio.sox_effects.SoxEffectsChain()
    chain.set_input_file(test_filepath_vctk)
    sox_effects = (
        ("gain", ["-h"]),
        ("channels", [1]),
        ("rate", [16000]),
        ("gain", ["-rh"]),
        ("dither", ["-s"]),
    )
    for effect_name, effect_args in sox_effects:
        chain.append_effect_to_chain(effect_name, effect_args)
    sox_waveform = chain.sox_build_flow_effects()[0]

    outputs_match = torch.allclose(waveform, sox_waveform,
                                   rtol=1e-03, atol=1e-03)
    self.assertTrue(outputs_match)
def load_data(path, f, sr=22050, normalize=False, transforms=None):
    """Load audio, bring it to the requested rate, and compute features.

    Args:
        path: Audio file to load with ``torchaudio.load``.
        f: Callable applied to the final audio to produce ``x`` (e.g. an
            MFCC or spectrogram transform).
        sr: Desired sample rate; the audio is resampled when the file's
            rate differs.
        normalize: Forwarded to ``torchaudio.load``.
        transforms: Optional callable, or list/tuple of callables, each
            called as ``t(audio, sr)`` and returning ``(audio, sr)``. They
            are applied in order after resampling.

    Returns:
        Tuple ``(audio, sr, x)`` where ``sr`` is the rate reported by the
        last transform (or the requested rate when there are none).
    """
    # The whole file is loaded; partial (time-range) loading is not
    # implemented.
    audio, sr0 = torchaudio.load(path, normalize=normalize)

    # Resample to the desired sample rate.
    if sr0 != sr:
        audio = T.Resample(sr0, sr)(audio)

    if transforms:
        # Accept a single callable as well as a list or tuple of them.
        if not isinstance(transforms, (list, tuple)):
            transforms = [transforms]
        for t in transforms:
            audio, sr = t(audio, sr)

    x = f(audio)
    return audio, sr, x
def _decode_example_with_torchaudio(self, value):
    """Decode an audio file with torchaudio, resampling if requested.

    Args:
        value: Path or file-like object accepted by ``torchaudio.load``.

    Returns:
        Tuple ``(array, sampling_rate)``: the decoded audio as a NumPy
        array (averaged over axis 0 when ``self.mono`` is truthy) and its
        sampling rate after any resampling.

    Raises:
        ImportError: If torchaudio is not installed, or if selecting the
            ``sox_io`` backend fails.
    """
    try:
        import torchaudio
        import torchaudio.transforms as T
    except ImportError as err:
        raise ImportError(
            "To support decoding 'mp3' audio files, please install 'torchaudio'."
        ) from err
    try:
        torchaudio.set_audio_backend("sox_io")
    except RuntimeError as err:
        raise ImportError(
            "To support decoding 'mp3' audio files, please install 'sox'."
        ) from err

    array, sampling_rate = torchaudio.load(value)
    if self.sampling_rate and self.sampling_rate != sampling_rate:
        # Reuse the cached resampler only when it was built for this exact
        # rate pair. Files with a different source rate would otherwise be
        # resampled with the wrong ratio.
        resampler = getattr(self, "_resampler", None)
        if (resampler is None
                or resampler.orig_freq != sampling_rate
                or resampler.new_freq != self.sampling_rate):
            resampler = T.Resample(sampling_rate, self.sampling_rate)
            self._resampler = resampler
        array = resampler(array)
        sampling_rate = self.sampling_rate
    array = array.numpy()
    if self.mono:
        array = array.mean(axis=0)
    return array, sampling_rate
def get_resampling_transform(config):
    """Wrap a resampler in an ``nn.Sequential``.

    Original author's note: torchaudio has no support for batches when
    resampling.

    :param config: object exposing ``original_fs`` and ``new_fs``
    :return: ``nn.Sequential`` containing a single ``Resample`` module
    """
    resampler = tforms_torch.Resample(orig_freq=config.original_fs,
                                      new_freq=config.new_fs)
    return nn.Sequential(resampler)
def load_wav_to_torch(full_path, sr):
    """Read a wav file as a float32 tensor at sample rate ``sr``.

    The samples returned by ``read`` are cast to float32. If the file's
    rate differs from ``sr`` the tensor is resampled; otherwise it is
    returned unchanged.
    """
    sampling_rate, data = read(full_path)
    audio = torch.FloatTensor(data.astype(np.float32))
    if sampling_rate == sr:
        return audio
    # A rate mismatch is handled by resampling instead of being rejected.
    to_requested_rate = transforms.Resample(orig_freq=sampling_rate,
                                            new_freq=sr)
    return to_requested_rate(audio)
def __init__(self, orig_fs=44100, **kargs):
    """Set up the resample + mel-spectrogram transform list.

    Args:
        orig_fs: Sample rate of the incoming audio.
        **kargs: Forwarded unchanged to the parent constructor. The code
            below then reads ``self.fs``, ``self.n_fft``,
            ``self.hop_length`` and ``self.n_mels`` from the instance.
    """
    super().__init__(**kargs)
    self.orig_fs = orig_fs

    to_target_rate = atrans.Resample(self.orig_fs, self.fs)
    to_mel = atrans.MelSpectrogram(sample_rate=self.fs,
                                   n_fft=self.n_fft,
                                   hop_length=self.hop_length,
                                   n_mels=self.n_mels)
    self.trans = [to_target_rate, to_mel]
def __init__(self, file_path, root_dir, mel_scale):
    """Read the metadata file and prepare the audio transforms.

    Args:
        file_path: Path to a UTF-8 metadata file with ``|``-separated
            fields, one entry per line.
        root_dir: Stored on the instance as ``self.root_dir``.
        mel_scale: Stored on the instance as ``self.mel_scale``.
    """
    rows = []
    with open(file_path, encoding='utf8') as metadata:
        for line in metadata:
            rows.append(line.strip().split('|'))
    self.data = rows

    self.root_dir = root_dir

    # Audio is brought from 22050 Hz to 16000 Hz before the mel transform.
    self.resample = transforms.Resample(orig_freq=22050, new_freq=16000)
    self.to_mel = transforms.MelSpectrogram(n_mels=80,
                                            sample_rate=16000,
                                            n_fft=1024,
                                            hop_length=256,
                                            f_max=8000.)

    self.mel_scale = mel_scale
    self.text_pad = _symbol_to_id[' ']
def _process(self, wav_path):
    """Compute the pitch track of one audio file and save it as ``.pt``.

    The file is loaded, resampled to ``self.sr`` when needed, optionally
    multiplied in place by a mask loaded from ``<root>/VAD/<name>``, passed
    through ``self._F0`` and written to ``<savepath>/<name>``. ``name`` is
    the file's basename with ``.wav`` / ``.sph`` replaced by ``.pt``.
    """
    waveform, tmp_sr = torchaudio.load(wav_path, normalization=True)
    if tmp_sr != self.sr:
        to_target_rate = AT.Resample(orig_freq=tmp_sr, new_freq=self.sr)
        waveform = to_target_rate(waveform)

    name = basename(wav_path)
    for extension in (".wav", ".sph"):
        name = name.replace(extension, ".pt")

    if self.vad_mask:
        vad_percent = torch.load(join(self.root, "VAD", name))
        vad_samples = percent_to_onehot(vad_percent, waveform.shape[-1])
        waveform *= vad_samples

    torch.save(self._F0(waveform), join(self.savepath, name))
def wavform_to_log_mel(self, waveform, sample_rate):
    '''
    Convert a waveform into windowed feature patches.

    Args:
        waveform: torch tsr [num_audio_channels, num_time_steps]
        sample_rate: per second sample rate
    Returns:
        Tuple ``(x, spectrogram)``:
        x: patches reshaped to [num_chunks, 1, window_size_in_frames,
            last_dim]. In the overlapping branch it is rebuilt as a float32
            torch tensor from a NumPy buffer.
        spectrogram: NumPy copy of the full, un-windowed frame matrix taken
            before any trimming.
    '''
    x = waveform.mean(axis=0, keepdims=True)  # average over channels
    resampler = ta_trans.Resample(sample_rate, CommonParams.TARGET_SAMPLE_RATE)
    x = resampler(x)
    x = self.mel_trans_ope(x)
    # Drop the leading axis and transpose; per the original note this is
    # [1, C, T] -> [T, C].
    x = x.squeeze(dim=0).T
    # Snapshot of all frames, returned alongside the patches.
    spectrogram = x.cpu().numpy().copy()
    window_size_in_frames = int(
        round(CommonParams.PATCH_WINDOW_IN_SECONDS /
              CommonParams.STFT_HOP_LENGTH_SECONDS))
    # NOTE(review): the window length comes from CommonParams while the
    # hop/window comparison uses YAMNetParams - confirm they are meant to
    # agree.
    if YAMNetParams.PATCH_HOP_SECONDS == YAMNetParams.PATCH_WINDOW_SECONDS:
        num_chunks = x.shape[0] // window_size_in_frames
        # reshape into chunks of non-overlapping sliding window
        num_frames_to_use = num_chunks * window_size_in_frames
        x = x[:num_frames_to_use]
        # [num_chunks, 1, window_size, num_freq]
        x = x.reshape(num_chunks, 1, window_size_in_frames, x.shape[-1])
    else:
        # generate chunks with custom sliding window length
        # `patch_hop_seconds`
        patch_hop_in_frames = int(
            round(YAMNetParams.PATCH_HOP_SECONDS /
                  CommonParams.STFT_HOP_LENGTH_SECONDS))
        # TODO performance optimization with zero copy
        # Number of window start positions that fit in the available frames.
        # NOTE(review): with fewer frames than one window this can be zero
        # or negative - confirm callers never pass such short input.
        patch_hop_num_chunks = (
            x.shape[0] - window_size_in_frames) // patch_hop_in_frames + 1
        num_frames_to_use = window_size_in_frames + (
            patch_hop_num_chunks - 1) * patch_hop_in_frames
        x = x[:num_frames_to_use]
        x_in_frames = x.reshape(-1, x.shape[-1])
        x_output = np.empty(
            (patch_hop_num_chunks, window_size_in_frames, x.shape[-1]))
        # Copy each (possibly overlapping) window into the output buffer.
        for i in range(patch_hop_num_chunks):
            start_frame = i * patch_hop_in_frames
            x_output[i] = x_in_frames[start_frame:start_frame +
                                      window_size_in_frames]
        x = x_output.reshape(patch_hop_num_chunks, 1, window_size_in_frames,
                             x.shape[-1])
        # np.empty defaults to float64; convert back to a float32 tensor.
        x = torch.tensor(x, dtype=torch.float32)
    return x, spectrogram
def load(
        path: str,
        sample_rate: int,
        mono: bool = True,
        device: torch.device = torch.device("cpu")) -> Tensor:
    """Load an audio file onto ``device`` at the requested sample rate.

    Args:
        path: File to read with ``ta.load``.
        sample_rate: Desired rate; the audio is resampled when the file's
            rate differs.
        mono: When true, channels (axis 0) are averaged away and the
            result loses that axis.
        device: Device the waveform and the resampler are moved to.

    Returns:
        The loaded (and possibly resampled / down-mixed) waveform.
    """
    audio, native_rate = ta.load(path)
    audio = audio.to(device)

    if native_rate != sample_rate:
        to_requested_rate = _transf.Resample(native_rate,
                                             sample_rate).to(device)
        audio = to_requested_rate(audio)

    if not mono:
        return audio

    # Down-mix: sum over the channel axis and divide by the channel count.
    channel_axis = 0
    n_channels = audio.shape[channel_axis]
    return audio.sum(dim=channel_axis) / n_channels
def main():
    """Process one chunk of the dataset and write the results to disk.

    The chunk covers dataset indices
    ``[start_id * chunk_size, start_id * chunk_size + chunk_size)``. Each
    item is handed to ``worker_proccess_audio`` together with the output
    directory ``<dataset.root>/<dataset.processed_directory>``.
    """
    # ==============================================================
    # Config and parameters
    config = get_params()
    dataset = get_dataset(config)

    # NOTE(review): this resampler is built but never applied below -
    # confirm whether resampling was meant to happen in this script.
    if config.resampling_rate != -1:
        tforms = transforms.Resample(orig_freq=dataset.fs,
                                     new_freq=config.resampling_rate)
    else:
        tforms = None

    start_id = config.start_id * config.chunk_size
    stop_id = start_id + config.chunk_size

    full_output_path = os.path.join(dataset.root, dataset.processed_directory)
    # exist_ok avoids the check-then-create race between parallel workers
    # and also creates missing parent directories.
    os.makedirs(full_output_path, exist_ok=True)

    # ==============================================================
    # Start
    print("========== WORKER %04d ---> %04d" % (start_id, stop_id))
    result = []
    pbar = tqdm(range(start_id, stop_id, 1))
    for ctr, i in enumerate(pbar, start=1):
        audio, _, fname = dataset[i]
        result.append(
            worker_proccess_audio(audio, fname, full_output_path,
                                  config.filetype))
        pbar.set_description("Processing ids [{} -- {}], Step [{}/{}]".format(
            start_id, stop_id, ctr, config.chunk_size))

    print("========== WORKER %04d ---> %04d Processed %d files" %
          (start_id, stop_id, len(result)))
def collate_fn(batch, resample_rate):
    """Collate dataset items into a padded waveform batch and label batch.

    Each item has the form
    ``(waveform, sample_rate, label, speaker_id, utterance_number)``; only
    the waveform and the label are used.

    Args:
        batch: Iterable of dataset items.
        resample_rate: Target rate; waveforms are resampled from 16000 Hz.

    Returns:
        Tuple ``(tensors, targets)``: the result of ``pad_sequence`` on the
        resampled waveforms and the stacked label indices.
    """
    to_target_rate = transforms.Resample(orig_freq=16000,
                                         new_freq=resample_rate)

    waveforms = []
    label_ids = []
    # Resample every waveform and encode every label as an index.
    for waveform, _rate, label, *_rest in batch:
        waveforms.append(to_target_rate(waveform))
        label_ids.append(label_to_index(label))

    # Group the per-item lists into batched tensors.
    padded = pad_sequence(waveforms)
    stacked = torch.stack(label_ids)
    return padded, stacked
def __getitem__(self, index):
    """Return ``(mel_spectrogram, label)`` for the item at ``index``.

    The audio is averaged over axis 0 (keeping that axis), resampled to
    ``self.sr`` when needed and converted to a mel spectrogram. The
    spectrogram is then either log-scaled (``self.log_mel``) or converted
    to decibels. When ``self.augment`` is set, frequency and time masking
    are applied to the spectrogram.
    """
    audio, sr = load(self.file_paths[index])
    audio = torch.mean(audio, dim=0, keepdim=True)
    if self.sr != sr:
        audio = transforms.Resample(sr, self.sr)(audio)

    mel_spectrogram = transforms.MelSpectrogram(sample_rate=self.sr,
                                                n_fft=self.n_fft,
                                                win_length=self.win_length,
                                                hop_length=self.hop_length,
                                                n_mels=self.n_mels,
                                                f_max=self.sr / 2)(audio)
    if self.log_mel:
        offset = 1e-6  # keeps log() finite for zero-energy bins
        mel_spectrogram = torch.log(mel_spectrogram + offset)
    else:
        mel_spectrogram = transforms.AmplitudeToDB(
            stype="power", top_db=80)(mel_spectrogram)

    if self.augment:
        # Mask the spectrogram that is returned. The masks were previously
        # applied to the raw waveform and the result discarded, so
        # augmentation had no effect.
        mel_spectrogram = transforms.FrequencyMasking(
            freq_mask_param=20)(mel_spectrogram)
        mel_spectrogram = transforms.TimeMasking(
            time_mask_param=10)(mel_spectrogram)

    label = self.labels[index]
    return mel_spectrogram, label
def test_resample(self, orig_freq, new_freq):
    """Run the gradient check on ``T.Resample`` with two-channel noise."""
    resampler = T.Resample(orig_freq=orig_freq, new_freq=new_freq)
    noise = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
    self.assert_grad(resampler, [noise])
def test_Resample(self):
    """Consistency check for ``T.Resample`` built with float sample rates."""
    source_rate = 16000
    target_rate = 8000
    noise = common_utils.get_whitenoise(sample_rate=source_rate)
    resampler = T.Resample(float(source_rate), float(target_rate))
    self._assert_consistency(resampler, noise)
def __init__(self,
             root_dir=constants.DATA_BASE_DIR,
             result_mode=False,
             chunk_size=constants.CHUNK_SIZE,
             model_type='all'):
    """Index the visual/audio file pairs in ``root_dir`` as 1-second chunks.

    Args:
        root_dir: Directory holding the media files and, unless
            ``result_mode`` is set, a ``labels.json`` keyed by visual file
            name.
        result_mode: When true, no labels are read (an empty label list is
            passed for every file) and no data statistics are printed.
        chunk_size: Chunk length; only ``1`` is supported.
        model_type: ``'conv3D_MFCCs'`` keeps only the first pipeline of
            each ensemble list; any other value keeps both.
    """
    assert chunk_size == 1, 'current implementation only supports 1 second chunks'

    fs = [
        f for f in os.listdir(root_dir)
        if f.endswith(constants.VISUAL_SUFFIX)
    ]
    self.data = []
    self.meta = []

    if not result_mode:
        # Context manager so the labels file is closed deterministically.
        with open(os.path.join(root_dir, 'labels.json'), 'r') as labels_file:
            labels = json.load(labels_file)

    def _audio_pipeline():
        # Channel reduction, normalisation, resampling, then MFCCs.
        return IT.Compose([
            ReduceAudioChannels(),
            NormalizeAudio(),
            AT.Resample(constants.AUDIO_SAMPLE_RATE,
                        constants.RESAMPLED_AUDIO_SAMPLE_RATE),
            AT.MFCC(sample_rate=constants.RESAMPLED_AUDIO_SAMPLE_RATE,
                    n_mfcc=constants.N_MFCCS),
        ])

    def _video_pipeline():
        # Per-frame: to PIL, resize, to tensor, normalise.
        return IT.Compose([
            VideoTransform(IT.ToPILImage()),
            VideoTransform(
                IT.Resize((constants.INPUT_FRAME_WIDTH,
                           constants.INPUT_FRAME_WIDTH))),
            VideoTransform(IT.ToTensor()),
            VideoTransform(
                IT.Normalize(mean=constants.MEAN, std=constants.STD)),
        ])

    # Two independent, identically configured pipelines per modality.
    self.ensemble_audio_transforms = [_audio_pipeline(), _audio_pipeline()]
    self.ensemble_video_transforms = [_video_pipeline(), _video_pipeline()]
    self.ensemble_video_post_transforms = [
        lambda x: x.permute([1, 0, 2, 3]),
        lambda x: x.permute([1, 0, 2, 3]),
    ]
    self.ensemble_audio_post_transforms = [lambda x: x, lambda x: x]

    if model_type == 'conv3D_MFCCs':
        # A single model only needs the first pipeline of each list.
        self.ensemble_video_transforms = self.ensemble_video_transforms[:1]
        self.ensemble_audio_transforms = self.ensemble_audio_transforms[:1]
        self.ensemble_video_post_transforms = (
            self.ensemble_video_post_transforms[:1])
        self.ensemble_audio_post_transforms = (
            self.ensemble_audio_post_transforms[:1])

    for f in fs:
        # break in 1 sec chunks and add label
        audio_file = (f[:-len(constants.VISUAL_SUFFIX)] +
                      constants.AUDIO_SUFFIX)
        chunks, meta = self.break_in_chunks(
            os.path.join(root_dir, f),
            os.path.join(root_dir, audio_file),
            [] if result_mode else labels[f],
            chunk_size)
        self.data += chunks
        self.meta += meta

    if not result_mode:
        self.print_data_stats()
def test_Resample(self):
    """Consistency check for ``T.Resample`` halving a float sample rate."""
    signal = torch.rand((2, 1000))
    source_rate, target_rate = 100., 50.
    resampler = T.Resample(source_rate, target_rate)
    self._assert_consistency(resampler, signal)
# load dataset batch_size = 32 batchsize_for_val = 128 (train_loader, val_loader, test_loader) = fgnh.SpeechCommands_Dataloaders( resample_rate=8000, batch_size=batch_size, batchsize_for_val=batchsize_for_val, num_workers=5, pin_memory=True) val_set = val_loader.dataset # note these are not resampled # necessary re-sampling from torchaudio import transforms transform = transforms.Resample(orig_freq=16000, new_freq=8000) ### values to iterate over # max distance allowed based on epsilon # precomputed bounds min and max input values min_bound = -1.3844940662384033 max_bound = 1.3773366212844849 epsilons = [(max_bound - min_bound) * x * (1. / 256.) for x in [ 1. / 512, 3. / 1024, 1. / 256, 3. / 512, 1. / 128, 3. / 256, 1. / 64, 3. / 128,
def test_resample_cache_dtype(self, resampling_method, dtype):
    """Providing dtype changes the kernel cache dtype"""
    transform = T.Resample(16000, 44100, resampling_method, dtype=dtype)
    # Compute the expected dtype first. Written inline, the conditional
    # bound looser than ``==`` and the assert degenerated to
    # ``assert torch.float32`` (always true) whenever dtype was None.
    expected_dtype = dtype if dtype is not None else torch.float32
    assert transform.kernel.dtype == expected_dtype
#
# The spectrograms below show the frequency representation of the signal,
# where the x-axis corresponds to the frequency of the original
# waveform (in log scale), y-axis the frequency of the
# plotted waveform, and color intensity the amplitude.
#

sample_rate = 48000
resample_rate = 32000

waveform = get_sine_sweep(sample_rate)
plot_sweep(waveform, sample_rate, title="Original Waveform")
play_audio(waveform, sample_rate)

resampler = T.Resample(sample_rate, resample_rate, dtype=waveform.dtype)
resampled_waveform = resampler(waveform)
plot_sweep(resampled_waveform, resample_rate, title="Resampled Waveform")
# Play the resampled signal at its own rate (the original waveform was
# previously replayed here by mistake).
play_audio(resampled_waveform, resample_rate)

######################################################################
# Controlling resampling quality with parameters
# ----------------------------------------------
#
# Lowpass filter width
# ~~~~~~~~~~~~~~~~~~~~
#
# Because the filter used for interpolation extends infinitely, the
# ``lowpass_filter_width`` parameter is used to control for the width of
# the filter to use to window the interpolation. It is also referred to as
def test_resample_identity(self, resampling_method, sample_rate):
    """When sampling rate is not changed, the transform returns an identical Tensor"""
    waveform = get_whitenoise(sample_rate=sample_rate, duration=1)
    # Forward the parameterised method; it was previously ignored, so every
    # parameterisation exercised only the default resampling method.
    resampler = T.Resample(sample_rate, sample_rate, resampling_method)
    resampled = resampler(waveform)
    self.assertEqual(waveform, resampled)