def song(self, path):
    """Separate a song into accompaniment and vocals, save both and plot them.

    Loads at most 120 seconds of audio, builds soft masks from a
    nearest-neighbour median filter of the magnitude spectrogram, inverts
    both masked spectrograms with Griffin-Lim, writes the two WAV files,
    renders the comparison plot into ``self.ui1.widget_song`` and finally
    raises the user notification through ``self.alarm``.

    NOTE(review): the audio is read from ``self.path``; the ``path``
    argument is never used -- confirm that this is intended.
    """
    samples, rate = librosa.load(self.path, duration=120)
    magnitude, _phase = librosa.magphase(librosa.stft(samples))

    # Repeating (accompaniment) estimate: median over cosine-similar frames
    # that are at least two seconds apart, never exceeding the mixture.
    window_frames = int(librosa.time_to_frames(2, sr=rate))
    repeating = librosa.decompose.nn_filter(
        magnitude,
        aggregate=np.median,
        metric='cosine',
        width=window_frames,
    )
    repeating = np.minimum(magnitude, repeating)

    background_margin, vocal_margin = 2, 10
    exponent = 2
    background_mask = librosa.util.softmask(
        repeating,
        background_margin * (magnitude - repeating),
        power=exponent,
    )
    vocal_mask = librosa.util.softmask(
        magnitude - repeating,
        vocal_margin * repeating,
        power=exponent,
    )

    foreground = vocal_mask * magnitude
    background = background_mask * magnitude

    music = librosa.griffinlim(background)
    vocal = librosa.griffinlim(foreground)

    scipy.io.wavfile.write('sound_results/song/music.wav', rate, music)
    scipy.io.wavfile.write('sound_results/song/vocal.wav', rate, vocal)

    utl.plotSounds(
        [music, vocal],
        ["music", "vocal"],
        rate,
        "plot_results/song/song_separation_plot.png",
    )

    pixmap = pg.QtGui.QPixmap('plot_results/song/song_separation_plot.png')
    plot_item = pg.QtGui.QGraphicsPixmapItem(pixmap)
    self.ui1.widget_song.addItem(plot_item)
    self.ui1.widget_song.invertY(True)
    self.alarm("Check Plot & Sound Results Files")
def spectrogram_to_audio(data, data_recon, output_dir):
    """Invert an input/reconstruction spectrogram pair and save both as WAV.

    Args:
        data: Tensor holding the input spectrogram; dimension 0 is squeezed.
        data_recon: Tensor holding the reconstructed spectrogram; dimension 0
            is squeezed.
        output_dir: String prefix for the two output file names.

    Returns:
        Tuple ``(source_aud_path, target_aud_path)`` of the written files.
    """
    cpu = torch.device("cpu")

    def _to_waveform(spec):
        # Drop the leading axis, move to host memory and run Griffin-Lim.
        as_numpy = spec.squeeze(0).to(cpu).detach().numpy()
        return librosa.griffinlim(as_numpy)

    source_wave = _to_waveform(data)
    target_wave = _to_waveform(data_recon)

    source_aud_path = output_dir + '_input_' + '.wav'
    target_aud_path = output_dir + '_output_' + '.wav'

    # Both files are written at a fixed 16 kHz sample rate.
    for wav_path, wave in ((source_aud_path, source_wave),
                           (target_aud_path, target_wave)):
        librosa.output.write_wav(wav_path, wave, 16000)

    return source_aud_path, target_aud_path
def img_to_audio(image=None, out_wav=None, sr=48000, hl=None, wl=None):
    """Treat an image as a magnitude spectrogram and write it out as audio.

    The image is looked up in the ``images`` directory next to this file,
    its colour channels are averaged, and the result is inverted with
    Griffin-Lim (64 iterations) and written to ``audio/<out_wav>``.

    Args:
        image: File name of the image; must contain ``".png"``.
        out_wav: File name of the output; must contain ``".wav"``.
        sr: Sample rate used when writing the audio.
        hl: Hop length forwarded to Griffin-Lim; the string ``"None"`` is
            treated as ``None``.
        wl: Window length forwarded to Griffin-Lim; the string ``"None"`` is
            treated as ``None``.

    Returns:
        Always ``None``. Problems are reported on stdout.
    """
    if image is None or ".png" not in image:
        print("Please Specify an image file! (e.g. my_image.png)")
        return None
    if out_wav is None or ".wav" not in out_wav:
        print("Please Specify an output file! (e.g. my_sound.wav)")
        return None

    # Callers may hand over the literal text "None" instead of the object.
    if wl == "None":
        wl = None
    if hl == "None":
        hl = None

    basepath = path.dirname(__file__)
    filepath = path.abspath(path.join(basepath, 'images', image))
    print(f"Image location: {filepath}")

    img = cv2.imread(filepath)
    if img is None:
        # cv2.imread signals a missing/unreadable file by returning None
        # rather than raising; report it the same way as the other errors.
        print(f"Could not read image: {filepath}")
        return None
    print(f"Read image of shape {img.shape}")

    avg_img = np.mean(img, axis=2)
    snd = librosa.griffinlim(avg_img, n_iter=64, hop_length=hl, win_length=wl)
    print(f"Output audio with {len(snd)} samples and a sample rate of {sr} Hz")
    librosa.output.write_wav(f"audio/{out_wav}", snd, sr)
def test_griffinlim(self):
    """Check that ``F.griffinlim`` agrees with ``librosa.griffinlim``."""
    # NOTE: This test is flaky without a fixed random seed
    # See https://github.com/pytorch/audio/issues/382
    torch.random.manual_seed(42)
    signal_in = torch.rand((1, 1000))

    # STFT configuration
    n_fft = 400
    win = 400
    hop = 100
    hann = torch.hann_window(win)
    normalize = False

    # Griffin-Lim configuration
    momentum = 0.99
    iterations = 8
    out_length = 1000
    rand_init = False
    if rand_init:
        librosa_init = 'random'
    else:
        librosa_init = None

    # Magnitude spectrogram: power-2 spectrogram followed by a square root.
    magnitude = F.spectrogram(
        signal_in, 0, hann, n_fft, hop, win, 2, normalize).sqrt()

    torchaudio_wave = F.griffinlim(
        magnitude, hann, n_fft, hop, win, 1, normalize,
        iterations, momentum, out_length, rand_init)

    librosa_wave = librosa.griffinlim(
        magnitude.squeeze(0).numpy(),
        n_iter=iterations,
        hop_length=hop,
        momentum=momentum,
        init=librosa_init,
        length=out_length,
    )
    librosa_wave = torch.from_numpy(librosa_wave).unsqueeze(0)

    self.assertTrue(torch.allclose(torchaudio_wave, librosa_wave, atol=5e-5))
def get_audio_from_stft_spectrogram_GL(self, stft_features):
    """Reconstruct audio from an STFT magnitude using Griffin-Lim.

    The window length, hop length and window come from this instance
    (``window_length``, ``overlap`` and ``window``); phase initialisation is
    disabled (``init=None``) and frames are centred.
    """
    griffin_lim_options = dict(
        init=None,
        win_length=self.window_length,
        hop_length=self.overlap,
        window=self.window,
        center=True,
    )
    return librosa.griffinlim(stft_features, **griffin_lim_options)
def test_griffinlim(self, momentum):
    """Compare ``F.griffinlim`` with librosa for the given ``momentum``."""
    # FFT params
    n_fft = 400
    win_length = n_fft
    hop_length = n_fft // 4
    window = torch.hann_window(win_length, device=self.device)
    power = 1
    # GriffinLim params
    n_iter = 8

    waveform = get_whitenoise(device=self.device, dtype=self.dtype)
    num_samples = waveform.size(1)

    specgram = get_spectrogram(
        waveform,
        n_fft=n_fft,
        hop_length=hop_length,
        power=power,
        win_length=win_length,
        window=window,
    )

    result = F.griffinlim(
        specgram,
        window=window,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        power=power,
        n_iter=n_iter,
        momentum=momentum,
        length=num_samples,
        rand_init=False,
    )

    reference = librosa.griffinlim(
        specgram[0].cpu().numpy(),
        n_iter=n_iter,
        hop_length=hop_length,
        momentum=momentum,
        init=None,
        length=num_samples,
    )
    # Re-add the leading channel axis so the shapes line up with ``result``.
    expected = reference[None, ...]

    self.assertEqual(result, torch.from_numpy(expected), atol=5e-5, rtol=1e-07)
def griffin_lim_(spc, n_fft, n_shift, win_length, window='hann', n_iters=100):
    """Convert linear spectrogram into waveform using Griffin-Lim.

    Args:
        spc (ndarray): Linear spectrogram (T, n_fft // 2 + 1).
        n_fft (int): Number of FFT points.
        n_shift (int): Shift size in points.
        win_length (int): Window length in points.
        window (str, optional): Window function type.
        n_iters (int, optional): Number of iterations of Griffin-Lim Algorithm.

    Returns:
        ndarray: Reconstructed waveform (N,).

    """
    # The second axis must hold exactly the one-sided FFT bins.
    expected_bins = n_fft // 2 + 1
    assert spc.shape[1] == expected_bins

    # librosa wants (bins, frames) and a non-negative magnitude.
    magnitude = np.abs(spc.T)
    return librosa.griffinlim(
        S=magnitude,
        n_iter=n_iters,
        hop_length=n_shift,
        win_length=win_length,
        window=window,
    )
def inverse_stft_griffin_lim(self, stft_mat, name):
    """Invert an STFT matrix with Griffin-Lim and save the audio to disk.

    The output file name is assembled from ``self.output_path``, the
    configured time stamp, ``self.output_name`` and ``name``. Only the
    magnitude of ``stft_mat`` is used.

    Returns:
        The reconstructed waveform.
    """
    params = self.h_params
    filename = "".join((
        self.output_path,
        params.time_for_output,
        "_griffin_",
        self.output_name,
        "_",
        name,
        ".wav",
    ))

    magnitude = abs(stft_mat)
    istft_mat = librosa.griffinlim(
        magnitude,
        hop_length=params.stft_hop_length,
        win_length=params.stft_window_size,
    )

    sf.write(filename, istft_mat, params.down_sample_rate)
    return istft_mat
def spec_wav(specgram, filename):
    """Invert a log-compressed spectrogram tensor and write it as a WAV file.

    Args:
        specgram: Tensor-like object supporting ``.cpu().detach().numpy()``.
            The compression is undone with ``exp(x) - 1``.
        filename: Destination path of the WAV file.

    The audio is written with a fixed sample rate of 22050 Hz. Nothing is
    returned.
    """
    # Undo the log(1 + a) compression to get back linear magnitudes.
    magnitude = np.exp(specgram.cpu().detach().numpy()) - 1

    # Griffin-Lim estimates the phase itself. The previous hand-made random
    # phase and the follow-up STFT were never used, so they are gone.
    waveform = librosa.griffinlim(magnitude)

    # NOTE(review): ``librosa.output`` was removed in librosa 0.8; this call
    # requires an older librosa (or a switch to another WAV writer).
    librosa.output.write_wav(filename, waveform, sr=22050)
def audio_reconstruction_test(src_dir, dest_dir, ext=".png", size=None):
    """
    Test different approaches to image to audio conversion

    Only the first unprocessed item is handled: the loop ends with an
    unconditional ``break``. Every intermediate array is dumped to CSV and/or
    an image file below ``dest_dir`` and the final waveform is written to
    ``dest_dir + "s.wav"``. ``dest_dir`` is used as a plain string prefix, so
    it has to end with a path separator.

    :param src_dir: image directory
    :param dest_dir: audio directory
    :param ext: image type
    :param size: desired dimension for resizing
    """
    paths = prep_utils.get_unprocessed_items(src_dir=src_dir, dest_dir=dest_dir)
    start_time = time.time()
    for path in paths:
        prep_utils.display_progress_eta(current_item=path, total_items=paths, start_time=start_time)
        # Load the stored array and show it; waitKey(0) blocks until a key
        # is pressed.
        S = np.load(path)
        cv2.imshow("image", S)
        cv2.waitKey(0)
        pd.DataFrame(S).to_csv(dest_dir + "S.csv", header=None, index=False)
        # Brightened variant, displayed and dumped the same way.
        S_scaled = prep_utils.increase_brightness(S)
        cv2.imshow("image", S_scaled)
        cv2.waitKey(0)
        pd.DataFrame(S_scaled).to_csv(dest_dir + "S_scaled.csv", header=None, index=False)
        # Round-trip through an image file: write, then read back as
        # single-channel (flag 0) and convert to float32.
        out_path = dest_dir + "gray" + ext
        cv2.imwrite(out_path, S_scaled)
        S = cv2.imread(out_path, 0)
        S = np.array(S, dtype=np.float32)
        S_recovered = S
        # NOTE(review): the extent of this ``if`` body is reconstructed from
        # a collapsed source line -- confirm that only the resize is
        # conditional.
        if size:
            S_recovered = cv2.resize(S_recovered, (size, size), interpolation=cv2.INTER_CUBIC)
        pd.DataFrame(S_recovered).to_csv(dest_dir + "S_recovered.csv", header=None, index=False)
        out_path = dest_dir + "resized" + ext
        cv2.imwrite(out_path, S_recovered)
        # Read the CSV back and stretch it to a fixed size. cv2.resize takes
        # (width, height), so the result has 1025 rows and 431 columns.
        S_audio = np.genfromtxt(dest_dir + "S_recovered.csv", delimiter=',')
        S_audio = np.array(S_audio, dtype=np.float32)
        S_audio = cv2.resize(S_audio, (431, 1025), interpolation=cv2.INTER_CUBIC)
        # Treat the array as a magnitude spectrogram and estimate the phase.
        y = librosa.griffinlim(S_audio)
        out = dest_dir + "s.wav"
        # Save reconstructed data
        scipy.io.wavfile.write(out, 22050, y)
        break
def inv_linear_spectrogram(linear_spectrogram):
    """Turn a (possibly normalized) dB-scaled linear spectrogram into audio.

    De-normalization is applied only when ``hparams.signal_normalization``
    is set. The amplitude is raised to the power 1.55 before Griffin-Lim.
    """
    D = (
        _denormalize(linear_spectrogram)
        if hparams.signal_normalization
        else linear_spectrogram
    )

    # Convert back to linear
    amplitude = _db_to_amp(D + hparams.ref_level_db)
    return librosa.griffinlim(amplitude ** 1.55)
def get_spectrogram(wav):
    """Return the STFT magnitude of librosa's bundled ``trumpet`` example.

    NOTE(review): ``wav`` is accepted for interface compatibility but is not
    used -- the function always analyses the ``trumpet`` example recording.
    Confirm whether the argument was meant to select the input.

    Returns:
        The magnitude of the STFT of the example recording.
    """
    y, _sr = librosa.load(librosa.ex('trumpet'))

    # Only the magnitude is returned. The Griffin-Lim and ISTFT inversions
    # that used to follow were computed and thrown away, so they were
    # dropped to avoid the (expensive) wasted work.
    return np.abs(librosa.stft(y))
def inv_spectrogram(spectrogram):
    '''Converts spectrogram to waveform using librosa'''
    # Convert back to linear
    decibels = _denormalize(spectrogram) + hparams.spec_ref_level_db
    amplitude = _db_to_amp(decibels)

    waveform = librosa.griffinlim(
        amplitude ** hparams.power,
        hop_length=hparams.hop_size,
        win_length=hparams.fft_wsize,
    )
    # Undo the pre-emphasis filter on the reconstructed signal.
    return inv_preemphasis(waveform)
def inv_mel_spectrogram(mel_spectrogram):
    """Turn a (possibly normalized) dB-scaled mel spectrogram into audio.

    De-normalization is applied only when ``hparams.signal_normalization``
    is set. The mel amplitudes are mapped back to the linear frequency scale
    and raised to the power 1.55 before Griffin-Lim.
    """
    D = (
        _denormalize(mel_spectrogram)
        if hparams.signal_normalization
        else mel_spectrogram
    )

    # Convert back to linear
    mel_amplitude = _db_to_amp(D + hparams.ref_level_db)
    linear_amplitude = _mel_to_linear(mel_amplitude)
    return librosa.griffinlim(linear_amplitude ** 1.55)
def inverse_spectrogram(s, mel=False):
    """Convert log-magnitude spectrogram to waveform.

    Args:
        s: Log-magnitude spectrogram (dB scale).
        mel: When true, ``s`` is a mel spectrogram and is first mapped back
            to the linear STFT scale.

    Returns:
        The reconstructed waveform, divided by its maximum sample value.
    """
    S = db_to_amplitude(s)
    wf = ms_to_frames(hp.stft_window_ms)
    hf = ms_to_frames(hp.stft_shift_ms)
    if mel:
        S = librosa.feature.inverse.mel_to_stft(
            S, power=1, sr=hp.sample_rate, n_fft=hp.num_fft)

    y = librosa.griffinlim(
        S ** hp.griffin_lim_power,
        n_iter=hp.griffin_lim_iters,
        hop_length=hf,
        win_length=wf,
    )
    if hp.use_preemphasis:
        y = deemphasis(y)

    # Use the array's own reduction: the builtin ``max`` walks the samples
    # one by one in Python, which is needlessly slow for audio.
    # NOTE(review): this scales by the largest sample, not the largest
    # magnitude -- confirm that peak normalisation by ``abs`` is not wanted.
    y /= y.max()
    return y
def griffin_lim(level, audio, args=1):
    """Degrade ``audio`` by a Griffin-Lim round trip of varying quality.

    ``level`` is mapped linearly onto an iteration count: level 0 gives 80
    iterations and level 100 gives 1. The phase of ``audio`` is discarded
    and re-estimated with that many iterations. ``args`` is not used.
    """
    first_count = 80.0
    last_count = 1.0

    # Linear interpolation over 100 steps, rounded before truncation.
    iterations = first_count + level * ((last_count - first_count) / 100)
    iterations = round(iterations, 4)

    magnitude = np.abs(librosa.stft(audio))
    return librosa.griffinlim(magnitude, n_iter=int(iterations))
def test_griffinlim_multi(y_multi):
    """Griffin-Lim on the ``y_multi`` fixture must keep the input shape."""
    audio, _rate = y_multi

    # Magnitude of the STFT is all Griffin-Lim gets to see.
    magnitude = np.abs(librosa.stft(audio))

    # Two iterations suffice: only the output shape is under test.
    rebuilt = librosa.griffinlim(magnitude, n_iter=2, length=audio.shape[-1])

    assert np.allclose(audio.shape, rebuilt.shape)
def _spec_to_wav(spec, sr=sample_rate, engine='librosa'):
    """Invert a spectrogram with the Griffin-Lim algorithm.

    ``engine`` selects the implementation: ``'librosa'`` or ``'torch'``.
    Any other value raises ``ValueError``. ``sr`` is accepted but not used.
    """
    if engine not in ('librosa', 'torch'):
        raise ValueError(engine)

    if engine == 'torch':
        transform = tf.GriffinLim(**stft_params, power=power)
        return transform(spec)

    return librosa.griffinlim(
        spec,
        hop_length=stft_params['hop_length'],
        win_length=stft_params['win_length'],
    )
def griffin_lim(
    spc: np.ndarray,
    n_fft: int,
    n_shift: int,
    win_length: Optional[int] = None,
    window: Optional[str] = "hann",
    n_iter: Optional[int] = 32,
) -> np.ndarray:
    """Convert linear spectrogram into waveform using Griffin-Lim.

    Args:
        spc: Linear spectrogram (T, n_fft // 2 + 1).
        n_fft: The number of FFT points.
        n_shift: Shift size in points.
        win_length: Window length in points.
        window: Window function type.
        n_iter: The number of iterations.

    Returns:
        Reconstructed waveform (N,).

    """
    # assert the size of input linear spectrogram
    assert spc.shape[1] == n_fft // 2 + 1

    if V(librosa.__version__) >= V("0.7.0"):
        # use librosa's fast Griffin-Lim algorithm
        spc = np.abs(spc.T)
        y = librosa.griffinlim(
            S=spc,
            n_iter=n_iter,
            hop_length=n_shift,
            win_length=win_length,
            window=window,
            # centring is switched off for single-frame input
            center=spc.shape[1] > 1,
        )
    else:
        # use slower version of Griffin-Lim algorithm
        logging.warning(
            "librosa version is old. use slow version of Grriffin-Lim algorithm."
            "if you want to use fast Griffin-Lim, please update librosa via "
            "`source ./path.sh && pip install librosa==0.7.0`."
        )
        # ``np.complex`` was only an alias of the builtin and no longer
        # exists in NumPy >= 1.24; the builtin gives the same complex128.
        cspc = np.abs(spc).astype(complex).T
        # Start from a random phase and alternate ISTFT / STFT projections.
        angles = np.exp(2j * np.pi * np.random.rand(*cspc.shape))
        y = librosa.istft(cspc * angles, n_shift, win_length, window=window)
        for _ in range(n_iter):
            angles = np.exp(
                1j
                * np.angle(librosa.stft(y, n_fft, n_shift, win_length, window=window))
            )
            y = librosa.istft(cspc * angles, n_shift, win_length, window=window)

    return y
def post_audio(self, y):
    """Turn a log-mel spectrogram back into a waveform.

    The input is exponentiated, mapped from the mel scale to the STFT scale
    using this instance's ``sr``, ``n_fft`` and ``power``, and inverted with
    Griffin-Lim using ``hop_length`` and ``win_length``.
    """
    mel = np.exp(np.array(y))
    stft_magnitude = librosa.feature.inverse.mel_to_stft(
        mel,
        sr=self.sr,
        n_fft=self.n_fft,
        power=self.power,
    )
    return librosa.griffinlim(
        stft_magnitude,
        hop_length=self.hop_length,
        win_length=self.win_length,
    )
def griffinlim_librosa(spectrogram, fs, hparams):
    """Invert ``spectrogram`` with librosa's Griffin-Lim and undo pre-emphasis.

    Hop and window lengths are configured in milliseconds on ``hparams`` and
    converted to samples with the sample rate ``fs`` (truncating).
    """
    samples_per_ms = None  # documentation aid only; the conversions stay inline
    hop_length = int(hparams.hop_length_ms / 1000 * fs)
    win_length = int(hparams.win_length_ms / 1000 * fs)

    waveform = librosa.griffinlim(
        spectrogram,
        n_iter=hparams.griffin_lim_iters,
        hop_length=hop_length,
        win_length=win_length,
    )
    return inv_preemphasis(waveform, hparams)
def extract_audio(Z, feature, params):
    """Convert a feature matrix back to audio.

    ``Z`` must already be un-normalized. ``feature`` selects the inversion:
    ``"Stft"``, ``"Mel"``, ``"Cqt"`` or ``"Mfcc"``.

    Returns:
        ``(waveform, sample_rate)`` for a known feature. For an unknown
        feature an error is printed and ``-1`` is returned.
    """
    if feature not in ("Stft", "Mel", "Cqt", "Mfcc"):
        print("Error: feature invalid")  # throw/raise something
        return -1

    if feature == "Stft":
        # undo log-magnitude scaling
        magnitude = librosa.db_to_amplitude(Z)
        # upsample
        magnitude = _upsample_fft(
            magnitude, params["fft_sample_rate"], params["stft_window_length"])
        yhat = librosa.griffinlim(magnitude, hop_length=params["stft_hop_length"])
    elif feature == "Mel":
        # undo log-power scaling
        mel_power = librosa.db_to_power(Z)
        yhat = librosa.feature.inverse.mel_to_audio(
            mel_power,
            sr=params["fft_sample_rate"],
            n_fft=params["stft_window_length"],
            hop_length=params["stft_hop_length"],
        )
    elif feature == "Cqt":
        # undo log-amplitude scaling
        cqt_amplitude = librosa.db_to_amplitude(Z)
        yhat = librosa.griffinlim_cqt(
            cqt_amplitude,
            sr=params["fft_sample_rate"],
            hop_length=params["stft_hop_length"],
            fmin=librosa.note_to_hz(params["cqt_min_frequency"]),
        )
    else:
        # Only "Mfcc" is left at this point.
        yhat = librosa.feature.inverse.mfcc_to_audio(
            Z,
            n_mels=params["frequency_bins"],
            sr=params["fft_sample_rate"],
            n_fft=params["stft_window_length"],
            hop_length=params["stft_hop_length"],
        )

    return yhat, params["fft_sample_rate"]
def mel_to_sound(dst_sound_file_path, mel_spectrogram, sample_rate, method='griffin_lim'):
    '''
    Convert a mel spectrogram to a sound file.

    The mel spectrogram is mapped back to an STFT magnitude, inverted with
    the Griffin-Lim algorithm
    (https://paperswithcode.com/method/griffin-lim-algorithm) and written
    to ``dst_sound_file_path`` with the given ``sample_rate``.

    ``method`` is accepted for interface compatibility; Griffin-Lim is the
    only implemented method and the argument is not inspected.

    NOTE(review): ``mel_to_stft`` runs with librosa's default ``sr`` and
    ``n_fft``; confirm they match the settings the mel spectrogram was
    computed with.
    '''
    inverted_features = librosa.feature.inverse.mel_to_stft(mel_spectrogram)
    audio_signal = librosa.griffinlim(inverted_features)
    # scipy's signature is write(filename, rate, data); the rate and the
    # samples used to be passed in the wrong order.
    scipy.io.wavfile.write(dst_sound_file_path, sample_rate, audio_signal)
def griffinlim_sp(spectrogram, n_fft, win_length, hop_length):
    """Run Griffin-Lim (50 iterations, momentum 0.5) and peak-normalize.

    A missing ``win_length`` falls back to ``n_fft``; a missing
    ``hop_length`` falls back to a quarter of the window length. The result
    is divided by its largest absolute sample.
    """
    win_length = n_fft if win_length is None else win_length
    hop_length = win_length // 4 if hop_length is None else hop_length

    waveform = librosa.griffinlim(
        spectrogram,
        n_iter=50,
        hop_length=hop_length,
        win_length=win_length,
        momentum=0.5,
    )
    peak = np.max(np.abs(waveform))
    return waveform / peak
def amp_sp_to_raw(amp_sp: np.array, fs: int, hop_size_ms: int = 5, preemphasis: float = 0.00):
    """
    Transform the amplitude spectrum into the waveform with Griffin-Lim.

    The amplitude spectrum has to have the pitch information. Using an
    amplitude spectrum which was extracted with pitch aligned windows (as
    WORLD does it) will not work.

    The spectrum is transposed and scaled by the square root of the size of
    its second axis before inversion; the hop size is given in milliseconds
    and converted to samples with ``fs``. The result is passed through
    ``AudioProcessing.depreemphasis``.
    """
    hop_samples = int(fs * hop_size_ms / 1000.)
    scaled_spectrum = amp_sp.T * np.sqrt(amp_sp.shape[1])
    raw = librosa.griffinlim(scaled_spectrum, hop_length=hop_samples)
    return AudioProcessing.depreemphasis(raw, preemphasis)
def synthesize(self, mel):
    """Synthesize a waveform from a mel spectrogram.

    The mel input is de-normalized with ``_denormalize_from_VC``, mapped to
    the STFT scale with the settings in ``Config`` and inverted with
    Griffin-Lim.
    """
    denormalized = self._denormalize_from_VC(mel)
    linear = librosa.feature.inverse.mel_to_stft(
        denormalized,
        sr=Config.audio_sr,
        n_fft=Config.n_fft,
        fmin=Config.fmin,
        fmax=Config.fmax,
    )
    return librosa.griffinlim(
        linear,
        win_length=Config.win_length,
        hop_length=Config.hop_length,
    )
def spec2audio(cls, spec: np.ndarray, n_iter: int = 60, enhancement_factor: float = 1.5) -> np.ndarray:
    """Converts magnitude spectrogram into a waveform using Griffin-Lim algorithm and istft"""
    # Back to linear amplitudes: de-normalize, add the reference level and
    # leave the dB domain.
    decibels = cls.denormalize(spec) + AudioProcessParam.ref_level_db
    linear = cls.db2lin(decibels)

    # enhance
    enhanced = np.power(linear, enhancement_factor)

    waveform = librosa.griffinlim(
        enhanced, n_iter=n_iter, hop_length=cls.param.hop_length)

    # TODO: de-emphasize the waveform?
    return cls.de_emphasize(waveform)
def griffinlim(*targets):
    """Coroutine stage: invert each received spectrogram and fan it out.

    Every spectrogram sent into this generator is inverted with Griffin-Lim
    and the waveform is forwarded to each of ``targets`` via ``send``. When
    the generator is closed, all targets are closed as well.
    """
    try:
        print("Griffinlim booted")
        while True:
            magnitude = yield
            # Invert using Griffin-Lim
            waveform = librosa.griffinlim(magnitude)
            for sink in targets:
                print(sink)
                sink.send(waveform)
    except GeneratorExit:
        print("Griffinlim shutdown")
        for sink in targets:
            sink.close()
def load_vocal_audio(self, y, sr):
    """Extract the vocal (foreground) part of a signal.

    A nearest-neighbour median filter of the magnitude spectrogram estimates
    the repeating background; a soft mask with margin 10 keeps what stands
    out from it, and the masked magnitude is inverted with Griffin-Lim.

    Returns:
        ``(vocal_waveform, sr)``.
    """
    magnitude, _phase = librosa.magphase(librosa.stft(y))

    # Compare frames that are at least two seconds apart.
    window_frames = int(librosa.time_to_frames(2, sr=sr))
    background = librosa.decompose.nn_filter(
        magnitude,
        aggregate=np.median,
        metric='cosine',
        width=window_frames,
    )
    # The background estimate can never exceed the mixture itself.
    background = np.minimum(magnitude, background)

    vocal_margin = 10
    exponent = 2
    vocal_mask = librosa.util.softmask(
        magnitude - background,
        vocal_margin * background,
        power=exponent,
    )

    foreground = vocal_mask * magnitude
    return librosa.griffinlim(foreground), sr
def spec_to_waveform(spec, min_level_db, ref_level_db, power, n_iter, win_length, hop_length, preemphasis):
    """Convert output linear spec to waveform using griffin-lim vocoder.

    Args:
        spec (ndarray): the output linear spectrogram, shape(C, T), where C
            means n_fft, T means frames.

    The remaining arguments configure de-normalization (``min_level_db``,
    ``ref_level_db``), the exponent applied before inversion (``power``),
    Griffin-Lim itself (``n_iter``, ``win_length``, ``hop_length``) and the
    optional de-emphasis filter (``preemphasis``; skipped unless positive).
    """
    # Map the [0, 1] range back onto [min_level_db, 0] dB.
    decibels = np.clip(spec, 0, 1) * (-min_level_db) + min_level_db
    # dB -> linear amplitude, i.e. 10 ** (dB / 20) written with exp/log.
    amplitude = np.exp((decibels + ref_level_db) / 20 * np.log(10))

    wav = librosa.griffinlim(
        amplitude ** power,
        n_iter=n_iter,
        hop_length=hop_length,
        win_length=win_length,
    )

    if not preemphasis > 0:
        return wav
    # Inverse of the pre-emphasis filter y[n] = x[n] - preemphasis * x[n-1].
    return signal.lfilter([1.], [1., -preemphasis], wav)