def benchmark_resample(
    method,
    waveform,
    sample_rate,
    resample_rate,
    lowpass_filter_width=DEFAULT_LOWPASS_FILTER_WIDTH,
    rolloff=DEFAULT_ROLLOFF,
    resampling_method=DEFAULT_RESAMPLING_METHOD,
    beta=None,
    librosa_type=None,
    iters=5,
):
    """Return the average wall-clock seconds taken by one resampling call.

    Args:
        method: Backend to time: ``"functional"`` (``F.resample``),
            ``"transforms"`` (``T.Resample``) or ``"librosa"``.
        waveform: Tensor holding the audio to resample.
        sample_rate: Source sampling rate.
        resample_rate: Target sampling rate.
        lowpass_filter_width: Forwarded to the torchaudio backends.
        rolloff: Forwarded to the torchaudio backends.
        resampling_method: Forwarded to the torchaudio backends.
        beta: Forwarded to the torchaudio backends (``None`` keeps their
            default).
        librosa_type: Passed to librosa as ``res_type``.
        iters: Number of timed repetitions; must be at least 1.

    Returns:
        Elapsed time divided by ``iters``, in seconds.

    Raises:
        ValueError: If ``iters`` is smaller than 1 or ``method`` is unknown.
    """
    if iters < 1:
        raise ValueError(f"iters must be >= 1, got {iters}")

    # Set-up work (building the transform, converting to numpy) happens here,
    # outside of the timed region; only `run` is measured.
    if method == "functional":
        def run():
            F.resample(
                waveform,
                sample_rate,
                resample_rate,
                lowpass_filter_width=lowpass_filter_width,
                rolloff=rolloff,
                resampling_method=resampling_method,
                beta=beta,
            )
    elif method == "transforms":
        resampler = T.Resample(
            sample_rate,
            resample_rate,
            lowpass_filter_width=lowpass_filter_width,
            rolloff=rolloff,
            resampling_method=resampling_method,
            beta=beta,
            dtype=waveform.dtype,
        )

        def run():
            resampler(waveform)
    elif method == "librosa":
        waveform_np = waveform.squeeze().numpy()

        def run():
            # Rates are passed by keyword: recent librosa releases reject
            # positional `orig_sr` / `target_sr`.
            librosa.resample(
                waveform_np,
                orig_sr=sample_rate,
                target_sr=resample_rate,
                res_type=librosa_type,
            )
    else:
        raise ValueError(f"Unknown benchmark method: {method!r}")

    # perf_counter is monotonic and high-resolution, unlike time.time().
    begin = time.perf_counter()
    for _ in range(iters):
        run()
    return (time.perf_counter() - begin) / iters
def test_resample_no_warning(self):
    """Float rates that hold integer values must not trigger any warning."""
    sample_rate = 44100
    orig_freq = float(sample_rate)
    new_freq = sample_rate / 2.
    waveform = get_whitenoise(sample_rate=sample_rate, duration=0.1)

    with warnings.catch_warnings(record=True) as caught:
        # Record every warning instead of applying the default filters.
        warnings.simplefilter("always")
        F.resample(waveform, orig_freq, new_freq)

    assert len(caught) == 0
def test_resample_warning(self):
    """A frequency with a fractional part must make resample warn exactly once."""
    sample_rate = 44100
    fractional_freq = 5512.5
    waveform = get_whitenoise(sample_rate=sample_rate, duration=0.1)

    with warnings.catch_warnings(record=True) as caught:
        # Record every warning instead of applying the default filters.
        warnings.simplefilter("always")
        F.resample(waveform, sample_rate, fractional_freq)

    assert len(caught) == 1
def func_beta(tensor):
    """Resample ``tensor`` from 16000. to 8000. using a Kaiser window with beta=6."""
    return F.resample(
        tensor,
        16000.,
        8000.,
        resampling_method="kaiser_window",
        beta=6.,
    )
def test_resample(self):
    """Compare ``F.resample`` with librosa for 2x upsampling and 2x downsampling."""
    input_path = common_utils.get_asset_path('sinewave.wav')
    waveform, sample_rate = common_utils.load_wav(input_path)
    # librosa works on numpy arrays without the leading channel dimension.
    waveform_np = waveform.squeeze(0).numpy()

    for target_rate in (sample_rate * 2, sample_rate // 2):
        ta_resampled = F.resample(waveform, sample_rate, target_rate)
        # Rates are passed by keyword: recent librosa releases reject
        # positional `orig_sr` / `target_sr`.
        lr_resampled = librosa.resample(
            waveform_np, orig_sr=sample_rate, target_sr=target_rate)
        lr_resampled = torch.from_numpy(lr_resampled).unsqueeze(0)
        self.assertEqual(ta_resampled, lr_resampled, atol=1e-2, rtol=1e-5)
def test_resample_waveform_identity_size(self, resampling_method):
    """Resampling to the same rate must keep the length of the last dimension."""
    rate = 16000
    noise = get_whitenoise(sample_rate=rate, duration=0.5)

    output = F.resample(noise, rate, rate, resampling_method=resampling_method)

    assert output.size(-1) == noise.size(-1)
def test_resample_waveform_downsample_size(self, resampling_method):
    """Halving the rate must halve the length of the last dimension."""
    rate = 16000
    half_rate = rate // 2
    noise = get_whitenoise(sample_rate=rate, duration=0.5)

    output = F.resample(noise, rate, half_rate, resampling_method=resampling_method)

    assert output.size(-1) == noise.size(-1) // 2
def forward(self, waveform: Tensor) -> Tensor:
    r"""Resample ``waveform`` from ``self.orig_freq`` to ``self.new_freq``.

    Args:
        waveform (Tensor): Tensor of audio of dimension (..., time).

    Returns:
        Tensor: Output signal of dimension (..., time).

    Raises:
        ValueError: If ``self.resampling_method`` is not
            ``'sinc_interpolation'``.
    """
    # Reject unsupported methods up front so the happy path reads straight through.
    if self.resampling_method != 'sinc_interpolation':
        raise ValueError('Invalid resampling method: {}'.format(self.resampling_method))
    return F.resample(waveform, self.orig_freq, self.new_freq)
def _test_resample_waveform_accuracy(
        self, up_scale_factor=None, down_scale_factor=None,
        resampling_method="sinc_interpolation", atol=1e-1, rtol=1e-4):
    """Resample a cosine and compare it with the same cosine sampled at the new rate.

    The source rate is 1000; it is multiplied by ``up_scale_factor`` and then
    floor-divided by ``down_scale_factor`` (each only when given) to obtain
    the target rate.
    """
    n_to_trim = 20
    sample_rate = 1000
    duration = 5  # seconds

    def tone(timestamps):
        # Reference signal evaluated at the given sample times.
        return 123 * torch.cos(2 * math.pi * 3 * timestamps)

    new_sample_rate = sample_rate
    if up_scale_factor is not None:
        new_sample_rate *= up_scale_factor
    if down_scale_factor is not None:
        new_sample_rate //= down_scale_factor

    source_times = torch.arange(0, duration, 1.0 / sample_rate)
    sound = tone(source_times).unsqueeze(0)

    estimate = F.resample(
        sound, sample_rate, new_sample_rate,
        resampling_method=resampling_method,
    ).squeeze()

    # Evaluate the reference only at as many time steps as the output has.
    target_times = torch.arange(0, duration, 1.0 / new_sample_rate)
    target_times = target_times[:estimate.size(0)]
    ground_truth = tone(target_times)

    # trim the first/last n samples as these points have boundary effects
    interior = slice(n_to_trim, -n_to_trim)
    ground_truth = ground_truth[..., interior]
    estimate = estimate[..., interior]

    self.assertEqual(estimate, ground_truth, atol=atol, rtol=rtol)
def func(tensor):
    """Resample ``tensor`` from 16000. to 8000. with the ``kaiser_window`` method."""
    return F.resample(
        tensor,
        16000.,
        8000.,
        resampling_method="kaiser_window",
    )
def func(tensor):
    """Resample ``tensor`` from 16000. to 8000. with the ``sinc_interpolation`` method."""
    return F.resample(
        tensor,
        16000.,
        8000.,
        resampling_method="sinc_interpolation",
    )
def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
    """Convert raw audio into feature dicts for the model, optionally chunked.

    Args:
        inputs: One of: a ``str`` path (the file is read as bytes), encoded
            audio ``bytes`` (decoded with ``ffmpeg_read``), a 1-D
            ``np.ndarray``, or a ``dict`` with keys ``"raw"`` and
            ``"sampling_rate"`` plus an optional ``"stride"``. Any other
            keys of the dict are copied into the yielded item. The dict
            passed by the caller is not modified.
        chunk_length_s: Chunk length in seconds; a falsy value disables
            chunking.
        stride_length_s: Stride in seconds, either a single number used for
            both sides or a ``[left, right]`` pair. Defaults to
            ``chunk_length_s / 6`` when chunking.

    Yields:
        When chunking, the items produced by ``chunk_iter``. Otherwise a
        single dict holding ``"is_last": True``, the feature extractor
        output, an optional ``"stride"`` and the extra dict keys.

    Raises:
        ValueError: If the stride is larger than the input, the input is not
            a 1-D numpy array, chunking is requested for a non-CTC model,
            the chunk is shorter than both strides together, or a stride is
            given for a speech seq2seq model.
    """
    if isinstance(inputs, str):
        with open(inputs, "rb") as f:
            inputs = f.read()

    if isinstance(inputs, bytes):
        inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)

    stride = None
    extra = {}
    if isinstance(inputs, dict):
        # Work on a shallow copy so the pops below do not empty the
        # dictionary owned by the caller.
        inputs = dict(inputs)
        stride = inputs.pop("stride", None)
        _inputs = inputs.pop("raw")
        in_sampling_rate = inputs.pop("sampling_rate")
        # Whatever is left is passed through untouched to the output item.
        extra = inputs
        inputs = _inputs
        if in_sampling_rate != self.feature_extractor.sampling_rate:
            import torch
            from torchaudio import functional as F

            inputs = F.resample(
                torch.from_numpy(inputs),
                in_sampling_rate,
                self.feature_extractor.sampling_rate,
            ).numpy()
            ratio = self.feature_extractor.sampling_rate / in_sampling_rate
        else:
            ratio = 1
        if stride is not None:
            if stride[0] + stride[1] > inputs.shape[0]:
                raise ValueError("Stride is too large for input")

            # Stride needs to get the chunk length here, it's going to get
            # swallowed by the `feature_extractor` later, and then batching
            # can add extra data in the inputs, so we need to keep track
            # of the original length in the stride so we can cut properly.
            stride = (
                inputs.shape[0],
                int(round(stride[0] * ratio)),
                int(round(stride[1] * ratio)),
            )

    if not isinstance(inputs, np.ndarray):
        raise ValueError(
            f"We expect a numpy ndarray as input, got `{type(inputs)}`")
    if len(inputs.shape) != 1:
        raise ValueError(
            "We expect a single channel audio input for AutomaticSpeechRecognitionPipeline"
        )

    if chunk_length_s:
        # Validate the model type BEFORE touching `inputs_to_logits_ratio`:
        # that config attribute does not exist in the `seq2seq` setting, and
        # reading it first would hide this error behind an AttributeError.
        if self.type not in {"ctc", "ctc_with_lm"}:
            raise ValueError(
                "`chunk_length_s` is only valid for CTC models, use other chunking options for other models"
            )

        if stride_length_s is None:
            stride_length_s = chunk_length_s / 6
        if isinstance(stride_length_s, (int, float)):
            stride_length_s = [stride_length_s, stride_length_s]

        # Round every length to a multiple of the model's
        # inputs-to-logits ratio.
        align_to = self.model.config.inputs_to_logits_ratio
        sampling_rate = self.feature_extractor.sampling_rate
        chunk_len = int(round(chunk_length_s * sampling_rate / align_to)) * align_to
        stride_left = int(round(stride_length_s[0] * sampling_rate / align_to)) * align_to
        stride_right = int(round(stride_length_s[1] * sampling_rate / align_to)) * align_to

        if chunk_len < stride_left + stride_right:
            raise ValueError(
                "Chunk length must be superior to stride length")

        yield from chunk_iter(
            inputs, self.feature_extractor, chunk_len, stride_left, stride_right)
    else:
        processed = self.feature_extractor(
            inputs,
            sampling_rate=self.feature_extractor.sampling_rate,
            return_tensors="pt",
        )
        if stride is not None:
            if self.model.__class__ in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING.values():
                raise ValueError(
                    "Stride is only usable with CTC models, try removing it"
                )
            processed["stride"] = stride
        yield {"is_last": True, **processed, **extra}
def test_resample_identity(self, resampling_method, sample_rate):
    """Resampling to the same rate must return the waveform unchanged.

    ``resampling_method`` is forwarded to ``F.resample`` so that each
    parameterized method is actually the one being exercised.
    """
    waveform = get_whitenoise(sample_rate=sample_rate, duration=1)
    resampled = F.resample(
        waveform, sample_rate, sample_rate, resampling_method=resampling_method)
    self.assertEqual(waveform, resampled)
# # Because the filter used for interpolation extends infinitely, the # ``lowpass_filter_width`` parameter is used to control for the width of # the filter to use to window the interpolation. It is also referred to as # the number of zero crossings, since the interpolation passes through # zero at every time unit. Using a larger ``lowpass_filter_width`` # provides a sharper, more precise filter, but is more computationally # expensive. # sample_rate = 48000 resample_rate = 32000 resampled_waveform = F.resample( waveform, sample_rate, resample_rate, lowpass_filter_width=6 ) plot_sweep(resampled_waveform, resample_rate, title="lowpass_filter_width=6") resampled_waveform = F.resample( waveform, sample_rate, resample_rate, lowpass_filter_width=128 ) plot_sweep(resampled_waveform, resample_rate, title="lowpass_filter_width=128") ###################################################################### # Rolloff # ~~~~~~~ # # The ``rolloff`` parameter is represented as a fraction of the Nyquist # frequency, which is the maximal frequency representable by a given