def extend_dataset(y, sr):
    """Augment a signal with time-stretched copies trimmed to equal length.

    A 2x-speed version is concatenated with itself, a 0.5x-speed version is
    cut into two halves, and all four signals are truncated to the length
    of the shortest one.

    :param y: audio time series accepted by ``librosa.stft``
    :param sr: sampling rate; only needed by the disabled pitch-shift code
    :return: tuple ``(y, y_fast, y_slow1, y_slow2)`` of equally long signals
    """
    # return (y,)

    n_fft = 2048
    hop_length = 512
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)

    # Make 2x faster. ``rate`` is passed by keyword because it is a
    # keyword-only argument in librosa >= 0.10.
    D_fast = librosa.phase_vocoder(D, rate=2.0, hop_length=hop_length)
    y_fast = librosa.istft(D_fast, hop_length=hop_length)
    # Concatenate two 2x frames together
    y_fast = append(y_fast, y_fast)

    # Make 2x slower
    D_slow = librosa.phase_vocoder(D, rate=0.5, hop_length=hop_length)
    y_slow = librosa.istft(D_slow, hop_length=hop_length)
    # Split the 0.5x signal into two halves
    y_slow1, y_slow2 = split(y_slow, 2)

    ## Frequency scaling
    # y_pitch_up = librosa.effects.pitch_shift(y, sr, n_steps=4)
    # y_pitch_down = librosa.effects.pitch_shift(y, sr, n_steps=-4)

    # Truncate everything to the shortest signal.
    samples = min(len(y), len(y_fast), len(y_slow1), len(y_slow2))
    return (y[:samples], y_fast[:samples],
            y_slow1[:samples], y_slow2[:samples])
def test_phase_vocoder(y_multi, rate):
    """Check that a joint phase-vocoder call matches per-channel calls.

    ``y_multi`` is unpacked as ``(signal, sampling rate)``; the first two
    channels of the STFT are stretched separately and compared with the
    result of stretching the whole STFT at once.
    """
    y, _ = y_multi
    D = librosa.stft(y)

    per_channel = [librosa.phase_vocoder(D[ch], rate=rate) for ch in (0, 1)]
    joint = librosa.phase_vocoder(D, rate=rate)

    for ch, expected in enumerate(per_channel):
        assert np.allclose(joint[ch], expected)

    # The two channels must not collapse onto the same output.
    assert not np.allclose(joint[0], joint[1])
def test_phase_vocoder(self, rate, test_pseudo_complex):
    """Compare ``F.phase_vocoder`` with librosa on a seeded random input.

    When ``test_pseudo_complex`` is true the spectrogram is passed through
    ``torch.view_as_real`` and the result converted back with
    ``torch.view_as_complex`` before comparison.
    """
    hop_length = 256
    num_freq = 1025
    num_frames = 400
    torch.random.manual_seed(42)

    # Due to the cumulative sum, numerical error in torch.float32 makes the
    # bottom-right values of the stretched spectrogram differ from librosa,
    # so double precision is used throughout.
    spec = torch.randn(
        num_freq, num_frames, device=self.device, dtype=torch.complex128)
    phase_advance = torch.linspace(
        0, np.pi * hop_length, num_freq,
        device=self.device, dtype=torch.float64)[..., None]

    if test_pseudo_complex:
        vocoder_input = torch.view_as_real(spec)
    else:
        vocoder_input = spec
    stretched = F.phase_vocoder(
        vocoder_input, rate=rate, phase_advance=phase_advance)

    reference = librosa.phase_vocoder(
        spec.cpu().numpy(), rate=rate, hop_length=hop_length)

    if test_pseudo_complex:
        stretched = torch.view_as_complex(stretched)
    self.assertEqual(stretched, torch.from_numpy(reference))
def stretch_demo(input_file, output_file, speed):
    '''Phase-vocoder time stretch demo function.

    :parameters:
      - input_file : str
          path to input audio
      - output_file : str
          path to save output (wav)
      - speed : float > 0
          speed up by this factor
    '''
    N_FFT = 2048
    # Floor division keeps the hop length an integer on Python 2 and 3.
    HOP_LENGTH = N_FFT // 4

    # Progress messages are single pre-formatted strings so that ``print``
    # behaves the same as a Python 2 statement and a Python 3 function.

    # 1. Load the wav file, resample
    print('Loading  %s' % input_file)
    y, sr = librosa.load(input_file)

    # 2. generate STFT @ 2048 samples
    print('Computing short-time fourier transform... ')
    D = librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LENGTH)

    # 3. stretch in the STFT domain and invert back to audio
    print('Playing back at %3.f%% speed' % (speed * 100))
    # ``rate`` is keyword-only in librosa >= 0.10, so pass it by name.
    D_stretch = librosa.phase_vocoder(D, rate=speed, hop_length=HOP_LENGTH)
    y_stretch = librosa.istft(D_stretch, hop_length=HOP_LENGTH)

    # 4. save the result
    print('Saving stretched audio to:  %s' % output_file)
    # NOTE(review): librosa.output was removed in librosa 0.8; this call
    # needs another writer if librosa is upgraded.
    librosa.output.write_wav(output_file, y_stretch, sr)
def stretch_demo(input_file, output_file, speed):
    '''Phase-vocoder time stretch demo function.

    :parameters:
      - input_file : str
          path to input audio
      - output_file : str
          path to save output (wav)
      - speed : float > 0
          speed up by this factor
    '''
    N_FFT = 2048
    # Floor division keeps the hop length an integer on Python 2 and 3.
    HOP_LENGTH = N_FFT // 4

    # Progress messages are single pre-formatted strings so that ``print``
    # behaves the same as a Python 2 statement and a Python 3 function.

    # 1. Load the wav file, resample
    print('Loading  %s' % input_file)
    y, sr = librosa.load(input_file)

    # 2. generate STFT @ 2048 samples
    print('Computing short-time fourier transform... ')
    D = librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LENGTH)

    # 3. stretch in the STFT domain and invert back to audio
    print('Playing back at %3.f%% speed' % (speed * 100))
    # ``rate`` is keyword-only in librosa >= 0.10, so pass it by name.
    D_stretch = librosa.phase_vocoder(D, rate=speed, hop_length=HOP_LENGTH)
    y_stretch = librosa.istft(D_stretch, hop_length=HOP_LENGTH)

    # 4. save the result
    print('Saving stretched audio to:  %s' % output_file)
    # NOTE(review): librosa.output was removed in librosa 0.8; this call
    # needs another writer if librosa is upgraded.
    librosa.output.write_wav(output_file, y_stretch, sr)
def test_phase_vocoder(complex_specgrams, rate, hop_length):
    """Check ``F.phase_vocoder`` output shape and values against librosa.

    The input is indexed as ``(..., freq, time, 2)``: dimension ``-3``
    sizes the phase advance, dimension ``-2`` is the one scaled by ``rate``
    and the last dimension holds the real and imaginary parts.
    """
    # Due to the cumulative sum, numerical error in torch.float32 makes the
    # bottom-right values of the stretched spectrogram differ from librosa.
    complex_specgrams = complex_specgrams.type(torch.float64)
    phase_advance = torch.linspace(
        0, np.pi * hop_length, complex_specgrams.shape[-3],
        dtype=torch.float64)[..., None]

    complex_specgrams_stretch = F.phase_vocoder(
        complex_specgrams, rate=rate, phase_advance=phase_advance)

    # == Test shape
    expected_size = list(complex_specgrams.size())
    expected_size[-2] = int(np.ceil(expected_size[-2] / rate))

    assert complex_specgrams.dim() == complex_specgrams_stretch.dim()
    assert complex_specgrams_stretch.size() == torch.Size(expected_size)

    # == Test values
    # Select the first element of every leading dimension. The index is a
    # tuple because indexing with a list that contains slices relies on
    # legacy sequence-as-tuple behaviour.
    index = tuple(
        [0] * (complex_specgrams.dim() - 3) + [slice(None)] * 3)

    mono_complex_specgram = complex_specgrams[index].numpy()
    mono_complex_specgram = (mono_complex_specgram[..., 0]
                             + mono_complex_specgram[..., 1] * 1j)
    expected_complex_stretch = librosa.phase_vocoder(
        mono_complex_specgram, rate=rate, hop_length=hop_length)

    complex_stretch = complex_specgrams_stretch[index].numpy()
    complex_stretch = complex_stretch[..., 0] + 1j * complex_stretch[..., 1]

    assert np.allclose(complex_stretch, expected_complex_stretch, atol=1e-5)
def time_shift(wav, tg_lngth):
    """
    Shifts the audio length while not affecting pitch by using a phase vocoder
    :param wav: The audio to shift
    :param tg_lngth: The target length of the audio to be shifted, measured in terms of number of samples
    :return: A timeshifted audio signal whose length is approximately tg_lngth samples
    """
    D = stft(wav)
    # Assuming librosa's phase_vocoder semantics, a rate above 1 speeds the
    # audio up and the output is about len(wav) / rate samples long, so the
    # rate that reaches tg_lngth is len(wav) / tg_lngth. float() avoids
    # integer division on Python 2.
    shift_factor = len(wav) / float(tg_lngth)
    D_shifted = phase_vocoder(D, shift_factor)
    return istft(D_shifted)
def freq_shift(wav, fq, tg_fq, sample_rate):
    """
    Shifts the given audio from its current frequency to the target frequency
    :param wav: The audio to shift
    :param fq: The frequency of the current audio
    :param tg_fq: The target frequency to shift to (in Hz)
    :param sample_rate: The sampling rate used for the audio when imported by librosa
    :return: A shifted frequency audio sample to the given target frequency
    """
    # NOTE(review): assumes stft / phase_vocoder / istft / resample are the
    # librosa functions (resample(y, orig_sr, target_sr)) -- confirm against
    # the file's imports.
    pitch_ratio = tg_fq / float(fq)

    # Step 1: time-stretch by 1 / pitch_ratio. This makes the audio
    # pitch_ratio times longer without changing its pitch.
    D = stft(wav)
    D_shifted = phase_vocoder(D, 1.0 / pitch_ratio)
    x_shifted = istft(D_shifted)

    # Step 2: treat the stretched signal as sampled at
    # sample_rate * pitch_ratio and resample it back to sample_rate. This
    # restores the original duration and scales the pitch by pitch_ratio.
    stretched_rate = int(round(sample_rate * pitch_ratio))
    return resample(x_shifted, stretched_rate, sample_rate)
def timemap_stretch(x, sr, path, hop_length=32, n_fft=4096):
    """
    Stretch audio x so that it aligns with another audio clip, according
    to a warping path

    Parameters
    ----------
    x: ndarray(N)
        An array of audio samples
    sr: int
        Sample rate (not used by the computation)
    path: ndarray(K, 2)
        Warping path.  Indices of x are in the first column
    hop_length: int
        Hop length to use in the phase vocoder
    n_fft: int
        Number of fft samples to use in the phase vocoder

    Returns
    -------
    ndarray
        The stretched segments concatenated in order.  A trailing segment
        that cannot reach n_fft samples of x before the path ends is
        dropped, so the result is empty for short paths.
    """
    # Break down into regions of constant slope: "diff" is the sum of the
    # second differences of both path coordinates.
    xdiff = path[1::, 0] - path[0:-1, 0]
    ydiff = path[1::, 1] - path[0:-1, 1]
    xdiff = xdiff[1::] - xdiff[0:-1]
    ydiff = ydiff[1::] - ydiff[0:-1]
    diff = xdiff + ydiff

    # Collect the segments and concatenate once at the end instead of
    # re-copying the growing output on every iteration.  The empty float
    # array keeps the result dtype and the empty-output case unchanged.
    pieces = [np.array([])]
    i1 = 0
    while i1 < len(diff):
        i2 = i1 + 1
        # Extend while the summed second difference stays zero ...
        while i2 < len(diff) and diff[i2] == 0:
            i2 += 1
        # ... and until the segment spans at least n_fft samples of x.
        while i2 < len(diff) and path[i2, 0] - path[i1, 0] < n_fft:
            i2 += 1
        if i2 >= len(diff):
            break
        # float() guards against integer division on Python 2.
        fac = (float(path[i2, 1] - path[i1, 1])
               / (path[i2, 0] - path[i1, 0]))
        if fac > 0:
            fac = 1 / fac
        xi = x[path[i1, 0]:path[i2, 0] + 1]
        D = librosa.stft(xi, n_fft=n_fft, hop_length=hop_length)
        # ``rate`` is keyword-only in librosa >= 0.10, so pass it by name.
        DNew = librosa.phase_vocoder(D, rate=fac, hop_length=hop_length)
        pieces.append(librosa.istft(DNew, hop_length=hop_length))
        i1 = i2
    return np.concatenate(pieces)
def beat_stretch(D, beats_orig, beats_target):
    """Stretch each inter-beat slice of a spectrogram to a target length.

    :param D: 2-D array whose columns are indexed by beat positions
    :param beats_orig: increasing column indices of the beats in ``D``
    :param beats_target: desired column indices of the same beats
    :return: array with ``D.shape[0]`` rows and ``beats_target[-1]``
        columns; columns that no beat interval writes to are zero
    """
    # Zero-initialise: the loop fills sum(diff(beats_target)) columns, which
    # is fewer than beats_target[-1] whenever beats_target[0] > 0, and
    # np.empty would leave those trailing columns uninitialised.
    D_out = np.zeros((D.shape[0], beats_target[-1]), dtype=D.dtype)

    # Compute beat deltas
    db_orig = np.diff(beats_orig)
    db_target = np.diff(beats_target)

    t = 0
    for (i, delta) in enumerate(db_target):
        Dslice = D[:, beats_orig[i]:beats_orig[i + 1]]
        # ``rate`` is keyword-only in librosa >= 0.10, so pass it by name.
        Dnew = librosa.phase_vocoder(Dslice, rate=float(db_orig[i]) / delta)
        D_out[:, t:t + delta] = Dnew[:, :delta]
        t = t + delta
    return D_out
def beat_stretch(D, beats_orig, beats_target):
    """Stretch each inter-beat slice of a spectrogram to a target length.

    :param D: 2-D array whose columns are indexed by beat positions
    :param beats_orig: increasing column indices of the beats in ``D``
    :param beats_target: desired column indices of the same beats
    :return: array with ``D.shape[0]`` rows and ``beats_target[-1]``
        columns; columns that no beat interval writes to are zero
    """
    # Zero-initialise: the loop fills sum(diff(beats_target)) columns, which
    # is fewer than beats_target[-1] whenever beats_target[0] > 0, and
    # np.empty would leave those trailing columns uninitialised.
    D_out = np.zeros((D.shape[0], beats_target[-1]), dtype=D.dtype)

    # Compute beat deltas
    db_orig = np.diff(beats_orig)
    db_target = np.diff(beats_target)

    t = 0
    for (i, delta) in enumerate(db_target):
        Dslice = D[:, beats_orig[i]:beats_orig[i + 1]]
        # ``rate`` is keyword-only in librosa >= 0.10, so pass it by name.
        Dnew = librosa.phase_vocoder(Dslice, rate=float(db_orig[i]) / delta)
        D_out[:, t:t + delta] = Dnew[:, :delta]
        t = t + delta
    return D_out
def speedDown(y, n_step=0.5):
    """Time-stretch ``y`` with a phase vocoder (half speed by default).

    :param y: audio time series passed to ``librosa.stft``
    :param n_step: rate handed to ``librosa.phase_vocoder``
    :return: the signal reconstructed by ``librosa.istft``
    """
    y_D = librosa.stft(y)
    # ``rate`` is keyword-only in librosa >= 0.10, so pass it by name.
    y_D_slow = librosa.phase_vocoder(y_D, rate=n_step)
    y_slower = librosa.istft(y_D_slow)
    return y_slower
def speedUp(y, n_step=2):
    """Time-stretch ``y`` with a phase vocoder (double speed by default).

    :param y: audio time series passed to ``librosa.stft``
    :param n_step: rate handed to ``librosa.phase_vocoder``
    :return: the signal reconstructed by ``librosa.istft``
    """
    y_D = librosa.stft(y)
    # ``rate`` is keyword-only in librosa >= 0.10, so pass it by name.
    y_D_fast = librosa.phase_vocoder(y_D, rate=n_step)
    y_faster = librosa.istft(y_D_fast)
    return y_faster
def speed(self, sp):
    """Return ``self.y`` time-stretched by the phase-vocoder rate ``sp``.

    :param sp: rate handed to ``librosa.phase_vocoder``
    :return: the signal reconstructed by ``librosa.istft``
    """
    n_fft = 2048
    hop_length = 512
    tmp = librosa.stft(self.y, n_fft=n_fft, hop_length=hop_length)
    # ``rate`` is keyword-only in librosa >= 0.10, so pass it by name.
    tmp_speed = librosa.phase_vocoder(tmp, rate=sp, hop_length=hop_length)
    return librosa.istft(tmp_speed, hop_length=hop_length)
def vocode(x, y, rate=2.0):
    """Apply ``librosa.phase_vocoder`` to ``x`` and pass ``y`` through.

    :param x: STFT matrix to stretch
    :param y: value returned unchanged alongside the stretched ``x``
    :param rate: phase-vocoder rate (default 2.0)
    :return: tuple ``(stretched_x, y)``
    """
    # ``rate`` is keyword-only in librosa >= 0.10, so pass it by name.
    return librosa.phase_vocoder(x, rate=rate), y
def test_phase_vocoder(complex_specgrams, rate, hop_length):
    """Check ``F.phase_vocoder`` output shape and values against librosa.

    The input is indexed as ``(..., freq, time, 2)``: dimension ``-3``
    sizes the phase advance, dimension ``-2`` is the one scaled by ``rate``
    and the last dimension holds the real and imaginary parts.
    """
    # Using a decorator here causes parametrize to fail on Python 2
    if not IMPORT_LIBROSA:
        raise unittest.SkipTest('Librosa is not available')

    # Due to the cumulative sum, numerical error in torch.float32 makes the
    # bottom-right values of the stretched spectrogram differ from librosa.
    complex_specgrams = complex_specgrams.type(torch.float64)
    phase_advance = torch.linspace(
        0, np.pi * hop_length, complex_specgrams.shape[-3],
        dtype=torch.float64)[..., None]

    complex_specgrams_stretch = F.phase_vocoder(
        complex_specgrams, rate=rate, phase_advance=phase_advance)

    # == Test shape
    expected_size = list(complex_specgrams.size())
    expected_size[-2] = int(np.ceil(expected_size[-2] / rate))

    assert complex_specgrams.dim() == complex_specgrams_stretch.dim()
    assert complex_specgrams_stretch.size() == torch.Size(expected_size)

    # == Test values
    # Select the first element of every leading dimension. The index is a
    # tuple because indexing with a list that contains slices relies on
    # legacy sequence-as-tuple behaviour.
    index = tuple(
        [0] * (complex_specgrams.dim() - 3) + [slice(None)] * 3)

    mono_complex_specgram = complex_specgrams[index].numpy()
    mono_complex_specgram = (mono_complex_specgram[..., 0]
                             + mono_complex_specgram[..., 1] * 1j)
    expected_complex_stretch = librosa.phase_vocoder(
        mono_complex_specgram, rate=rate, hop_length=hop_length)

    complex_stretch = complex_specgrams_stretch[index].numpy()
    complex_stretch = complex_stretch[..., 0] + 1j * complex_stretch[..., 1]

    assert np.allclose(complex_stretch, expected_complex_stretch, atol=1e-5)


# NOTE(review): the functions below take ``self``, so they presumably belong
# to a test class whose header is not part of this section -- confirm the
# enclosing class and indentation.
def test_torchscript_create_fb_matrix(self):
    """Run ``F.create_fb_matrix`` through the TorchScript comparison helper."""
    n_stft = 100
    f_min = 0.0
    f_max = 20.0
    n_mels = 10
    sample_rate = 16000

    _test_torchscript_functional(
        F.create_fb_matrix, n_stft, f_min, f_max, n_mels, sample_rate)


def test_torchscript_amplitude_to_DB(self):
    """Run ``F.amplitude_to_DB`` through the TorchScript comparison helper."""
    spec = torch.rand((6, 201))
    multiplier = 10.0
    amin = 1e-10
    db_multiplier = 0.0
    top_db = 80.0

    _test_torchscript_functional(
        F.amplitude_to_DB, spec, multiplier, amin, db_multiplier, top_db)


def test_torchscript_create_dct(self):
    """Run ``F.create_dct`` through the TorchScript comparison helper."""
    n_mfcc = 40
    n_mels = 128
    norm = "ortho"

    _test_torchscript_functional(F.create_dct, n_mfcc, n_mels, norm)


def test_torchscript_mu_law_encoding(self):
    """Run ``F.mu_law_encoding`` through the TorchScript comparison helper."""
    tensor = torch.rand((1, 10))
    qc = 256

    _test_torchscript_functional(F.mu_law_encoding, tensor, qc)


def test_torchscript_mu_law_decoding(self):
    """Run ``F.mu_law_decoding`` through the TorchScript comparison helper."""
    tensor = torch.rand((1, 10))
    qc = 256

    _test_torchscript_functional(F.mu_law_decoding, tensor, qc)


def test_torchscript_complex_norm(self):
    """Run ``F.complex_norm`` through the TorchScript comparison helper."""
    # No trailing comma: the helper must receive a tensor, not a 1-tuple.
    complex_tensor = torch.randn(1, 2, 1025, 400, 2)
    power = 2

    _test_torchscript_functional(F.complex_norm, complex_tensor, power)


def test_mask_along_axis(self):
    """Run ``F.mask_along_axis`` through the TorchScript comparison helper."""
    # No trailing comma: the helper must receive a tensor, not a 1-tuple.
    specgram = torch.randn(2, 1025, 400)
    mask_param = 100
    mask_value = 30.
    axis = 2

    _test_torchscript_functional(
        F.mask_along_axis, specgram, mask_param, mask_value, axis)


def test_mask_along_axis_iid(self):
    """Run ``F.mask_along_axis_iid`` through the TorchScript comparison helper."""
    # No trailing comma: the helper must receive a tensor, not a 1-tuple.
    specgrams = torch.randn(4, 2, 1025, 400)
    mask_param = 100
    mask_value = 30.
    axis = 2

    _test_torchscript_functional(
        F.mask_along_axis_iid, specgrams, mask_param, mask_value, axis)


def test_torchscript_gain(self):
    """Run ``F.gain`` through the TorchScript comparison helper."""
    tensor = torch.rand((1, 1000))
    gainDB = 2.0

    _test_torchscript_functional(F.gain, tensor, gainDB)


def test_torchscript_dither(self):
    """Run ``F.dither`` with its default and two named density functions."""
    tensor = torch.rand((1, 1000))

    _test_torchscript_functional(F.dither, tensor)
    _test_torchscript_functional(F.dither, tensor, "RPDF")
    _test_torchscript_functional(F.dither, tensor, "GPDF")
import librosa
from IPython import get_ipython
from pydub import AudioSegment
from pydub.playback import play

warnings.filterwarnings('ignore')

path = r'/Users/peterzuker/Desktop/Audio Modification/10047/model_input/spells/1/exemplars/1499777912068.wav'

#reload the audio to use librosa's expected format
lr_speech_data, lr_speech_rate = librosa.load(path)
# ``rate`` is keyword-only in librosa >= 0.10, so pass it by name.
stretched = librosa.effects.time_stretch(lr_speech_data, rate=1.47)

# Slow the clip to one third of its speed in the STFT domain.
y, sr = librosa.load(path)
D = librosa.stft(y, n_fft=2048, hop_length=512)
D_slow = librosa.phase_vocoder(D, rate=1. / 3, hop_length=512)
y_slow = librosa.istft(D_slow, hop_length=512)
# Write the slowed samples at the loaded sampling rate.
# NOTE(review): assumes ``wavfile`` is scipy.io.wavfile, whose write()
# takes (filename, rate, data) -- confirm against the file's imports.
wavfile.write('test.wav', sr, y_slow)

rate, data = wavfile.read(path)

sound = AudioSegment.from_file(path, format="wav")
play(sound)


def remove_silence(audio, threshold):
    #identify all samples with an absolute value greater than the threshold
    greater_index = numpy.greater(numpy.absolute(audio), threshold)
    #filter to only include the identified samples
    above_threshold_data = audio[greater_index]