def get_wav_from_stft(stft_modified, rate_hz, modified_scale, fft_size, hopsamp,
                      iterations=2000, outfile=None):
    stft_modified_scaled = stft_modified / modified_scale
    stft_modified_scaled = stft_modified_scaled**0.5
    x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
        stft_modified_scaled, fft_size=fft_size, hopsamp=hopsamp, iterations=iterations)
    max_sample = np.max(abs(x_reconstruct))
    if max_sample > 1.0:
        x_reconstruct = x_reconstruct / max_sample
    audio_utilities.save_audio_to_file(x_reconstruct, rate_hz, outfile=outfile)
    return stft_modified_scaled
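# Example usage of get_wav_from_stft (a minimal sketch; it assumes `audio_utilities`
# and `numpy as np` are imported and that 'bkvhi.wav' exists, as in run_demo below).
# It builds a normalized power spectrogram the same way run_demo does, then hands it
# back to get_wav_from_stft to undo the scaling and reconstruct audio.
def _demo_get_wav_from_stft():
    fft_size = 2048
    hopsamp = fft_size // 8
    rate_hz = 44100
    signal = audio_utilities.get_signal('bkvhi.wav', expected_fs=rate_hz)
    stft_full = audio_utilities.stft_for_reconstruction(signal, fft_size, hopsamp)
    stft_mag = abs(stft_full)**2.0
    scale = 1.0 / np.amax(stft_mag)
    stft_mag *= scale
    get_wav_from_stft(stft_mag, rate_hz, scale, fft_size, hopsamp,
                      iterations=300, outfile='reconstructed_from_stft.wav')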
def run_demo():
    """Test the Griffin & Lim method for reconstructing audio from a magnitude spectrogram.

    Example of using the Griffin-Lim algorithm. The input file is loaded, the
    spectrogram is computed (note that we discard the phase information). Then,
    using only the (magnitude) spectrogram, the Griffin-Lim algorithm is run to
    reconstruct an audio signal from the spectrogram. The reconstructed audio is
    finally saved to a file.

    A plot of the spectrogram is also displayed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_file', type=str, default="bkvhi.wav",
                        help='Input WAV file')
    parser.add_argument('--sample_rate_hz', default=44100, type=int,
                        help='Sample rate in Hz')
    parser.add_argument('--fft_size', default=2048, type=int,
                        help='FFT size')
    parser.add_argument('--iterations', default=300, type=int,
                        help='Number of iterations to run')
    parser.add_argument('--enable_filter', action='store_true',
                        help='Apply a low-pass filter')
    parser.add_argument('--enable_mel_scale', action='store_true',
                        help='Convert to mel scale and back')
    parser.add_argument('--cutoff_freq', type=int, default=1000,
                        help='If the filter is enabled, the low-pass cutoff frequency in Hz')
    args = parser.parse_args()

    in_file = args.in_file

    # Load an audio file. It must be WAV format. Multi-channel files will be
    # converted to mono.
    input_signal = audio_utilities.get_signal(in_file, expected_fs=args.sample_rate_hz)

    # Hopsamp is the number of samples that the analysis window is shifted after
    # computing the FFT. For example, if the sample rate is 44100 Hz and hopsamp is
    # 256, then there will be approximately 44100/256 = 172 FFTs computed per second
    # and thus 172 spectral slices (i.e., columns) per second in the spectrogram.
    hopsamp = args.fft_size // 8

    # Compute the Short-Time Fourier Transform (STFT) from the audio file. This is a
    # 2-dim Numpy array with time_slices rows and frequency_bins columns. Thus, you
    # will need to take the transpose of this matrix to get the usual STFT, which has
    # frequency bins as rows and time slices as columns.
    stft_full = audio_utilities.stft_for_reconstruction(
        input_signal, args.fft_size, hopsamp)

    # Note that the STFT is complex-valued. Therefore, to get the (power)
    # spectrogram, we take the squared magnitude of each entry.
    print(stft_full.shape)
    stft_mag = abs(stft_full)**2.0
    # Note that `stft_mag` only contains the magnitudes and so we have lost the
    # phase information.
    scale = 1.0 / np.amax(stft_mag)
    print('Maximum value in the magnitude spectrogram: ', 1 / scale)
    # Rescale to put all values in the range [0, 1].
    stft_mag *= scale
    print(stft_mag.shape)

    # We now have a (magnitude only) spectrogram, `stft_mag`, that is normalized to
    # be within [0, 1.0]. In a practical use case, we would probably want to perform
    # some processing on `stft_mag` here which would produce a modified version that
    # we would want to reconstruct audio from.
    figure(1)
    imshow(stft_mag.T**0.125, origin='lower', cmap=cm.hot, aspect='auto',
           interpolation='nearest')
    colorbar()
    title('Unmodified spectrogram')
    xlabel('time index')
    ylabel('frequency bin index')
    savefig('unmodified_spectrogram.png', dpi=150)

    # If the mel scale option is selected, apply a perceptual frequency scale.
    if args.enable_mel_scale:
        min_freq_hz = 70
        max_freq_hz = 8000
        mel_bin_count = 200
        linear_bin_count = 1 + args.fft_size // 2
        filterbank = audio_utilities.make_mel_filterbank(
            min_freq_hz, max_freq_hz, mel_bin_count, linear_bin_count,
            args.sample_rate_hz)

        figure(2)
        imshow(filterbank, origin='lower', cmap=cm.hot, aspect='auto',
               interpolation='nearest')
        colorbar()
        title('Mel scale filter bank')
        xlabel('linear frequency index')
        ylabel('mel frequency index')
        savefig('mel_scale_filterbank.png', dpi=150)

        mel_spectrogram = np.dot(filterbank, stft_mag.T)

        clf()
        figure(3)
        imshow(mel_spectrogram**0.125, origin='lower', cmap=cm.hot, aspect='auto',
               interpolation='nearest')
        colorbar()
        title('Mel scale spectrogram')
        xlabel('time index')
        ylabel('mel frequency bin index')
        savefig('mel_scale_spectrogram.png', dpi=150)

        inverted_mel_to_linear_freq_spectrogram = np.dot(
            filterbank.T, mel_spectrogram)

        clf()
        figure(4)
        imshow(inverted_mel_to_linear_freq_spectrogram**0.125, origin='lower',
               cmap=cm.hot, aspect='auto', interpolation='nearest')
        colorbar()
        title('Linear scale spectrogram obtained from mel scale spectrogram')
        xlabel('time index')
        ylabel('frequency bin index')
        savefig('inverted_mel_to_linear_freq_spectrogram.png', dpi=150)

        stft_modified = inverted_mel_to_linear_freq_spectrogram.T
    else:
        stft_modified = stft_mag
        savefig('stft_modified.png', dpi=150)

    ###### Optional: modify the spectrogram
    # For example, we can implement a low-pass filter by simply setting all
    # frequency bins above some threshold frequency (args.cutoff_freq) to 0
    # as follows.
    if args.enable_filter:
        # Calculate corresponding bin index.
        cutoff_bin = round(args.cutoff_freq * args.fft_size / args.sample_rate_hz)
        stft_modified[:, cutoff_bin:] = 0
    ###########

    # Undo the rescaling.
    stft_modified_scaled = stft_modified / scale
    stft_modified_scaled = stft_modified_scaled**0.5

    # Use the Griffin-Lim algorithm to reconstruct an audio signal from the
    # magnitude spectrogram.
    x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
        stft_modified_scaled, args.fft_size, hopsamp, args.iterations)

    # The output signal must be in the range [-1, 1], otherwise we need to clip
    # or normalize.
    max_sample = np.max(abs(x_reconstruct))
    if max_sample > 1.0:
        x_reconstruct = x_reconstruct / max_sample

    # Save the reconstructed signal to a WAV file.
    audio_utilities.save_audio_to_file(x_reconstruct, args.sample_rate_hz)

    # Save the spectrogram image also.
    clf()
    figure(5)
    imshow(stft_modified.T**0.125, origin='lower', cmap=cm.hot, aspect='auto',
           interpolation='nearest')
    colorbar()
    title('Spectrogram used to reconstruct audio')
    xlabel('time index')
    ylabel('frequency bin index')
    savefig('reconstruction_spectrogram.png', dpi=150)
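# For reference, the core update inside audio_utilities.reconstruct_signal_griffin_lim
# looks roughly like the sketch below. This is an illustrative sketch only, not the
# library's actual code, and it assumes an `istft_for_reconstruction(X, fft_size,
# hopsamp)` helper that inverts audio_utilities.stft_for_reconstruction.
def griffin_lim_sketch(magnitude_spectrogram, fft_size, hopsamp, iterations):
    # Start from the target magnitudes with zero phase, then repeatedly re-impose
    # those magnitudes while keeping the phase implied by the current time-domain
    # estimate.
    x = audio_utilities.istft_for_reconstruction(
        magnitude_spectrogram.astype(complex), fft_size, hopsamp)
    for _ in range(iterations):
        stft = audio_utilities.stft_for_reconstruction(x, fft_size, hopsamp)
        phase = np.exp(1j * np.angle(stft))
        x = audio_utilities.istft_for_reconstruction(
            magnitude_spectrogram * phase, fft_size, hopsamp)
    return x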
def run_recon():
    """Reconstruct audio from a spectrogram image using the Griffin-Lim method.

    The input image is loaded, resized to the expected spectrogram dimensions,
    and converted to a grayscale (magnitude) spectrogram. Then, using only this
    spectrogram, the Griffin-Lim algorithm is run to reconstruct an audio
    signal, which is finally saved to a file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_file', type=str, default="bkvhi.wav",
                        help='Input spectrogram image file')
    parser.add_argument('--sample_rate_hz', default=44100, type=int,
                        help='Sample rate in Hz')
    parser.add_argument('--fft_size', default=2048, type=int,
                        help='FFT size')
    parser.add_argument('--iterations', default=300, type=int,
                        help='Number of iterations to run')
    parser.add_argument('--enable_filter', action='store_true',
                        help='Apply a low-pass filter')
    parser.add_argument('--enable_mel_scale', action='store_true',
                        help='Convert to mel scale and back')
    parser.add_argument('--cutoff_freq', type=int, default=1000,
                        help='If the filter is enabled, the low-pass cutoff frequency in Hz')
    args = parser.parse_args()

    in_file = Image.open(args.in_file)
    # print(in_file.shape)
    # Resize to (width, height) = (1 + fft_size // 2, time_slices) so the grayscale
    # array has the layout expected by the Griffin-Lim reconstruction.
    in_file = in_file.resize((1025, 640), Image.LANCZOS)
    ext = ".png"
    in_file.save("rescaledimage" + ext)
    in_file = plt.imread("rescaledimage.png")
    print(in_file.shape)
    in_file = rgb2gray(in_file)
    hopsamp = args.fft_size // 8
    print(in_file.shape)

    # Use the Griffin-Lim algorithm to reconstruct an audio signal from the
    # magnitude spectrogram.
    x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
        in_file, args.fft_size, hopsamp, args.iterations)

    # The output signal must be in the range [-1, 1], otherwise we need to clip
    # or normalize.
    max_sample = np.max(abs(x_reconstruct))
    if max_sample > 1.0:
        x_reconstruct = x_reconstruct / max_sample

    # Save the reconstructed signal to a WAV file.
    audio_utilities.save_audio_to_file(x_reconstruct, args.sample_rate_hz)
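# run_recon (and the image-based snippets below) call an `rgb2gray` helper that is
# not shown here. If it is not already defined elsewhere in the project, a minimal
# sketch using the standard ITU-R BT.601 luma weights would be:
def rgb2gray(rgb):
    # Illustrative sketch: drop an alpha channel if present, then mix R, G, B
    # into a single channel.
    rgb = np.asarray(rgb)[..., :3]
    return np.dot(rgb, [0.299, 0.587, 0.114])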
    for wav_path in wavs
]

melSpectrum = np.load('mel-batch_0_sentence_0.npy')
imshow(melSpectrum.T**0.125, origin='lower', cmap=cm.hot, aspect='auto',
       interpolation='nearest')

stft_modified = melSpectrum.T
filterbank = audio_utilities.make_mel_filterbank(70, 8000, 80, 123, 44100)
inverted_mel_to_linear_freq_spectrogram = np.dot(filterbank.T, stft_modified)
spectrogram = inverted_mel_to_linear_freq_spectrogram
print("Linear spectrograms dim: ")
print(spectrogram[0].shape)
spectrogram = spectrogram.astype(np.float32)

# --------------------------------- librosa Version ---------------------------------
# Convert back to audio. reconstruct_signal_griffin_lim expects time slices as rows,
# so transpose the (linear_bins x time) spectrogram first.
x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
    spectrogram.T, 244, 123, 300)
max_sample = np.max(abs(x_reconstruct))
if max_sample > 1.0:
    x_reconstruct = x_reconstruct / max_sample
audio_utilities.save_audio_to_file(x_reconstruct, 44100)
print("Done!")
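# The mel filterbank's transpose is only an approximate inverse. A least-squares
# mapping back to linear frequency (an alternative sketch, not what the snippet
# above uses) can be obtained with a pseudo-inverse instead:
def mel_to_linear_pinv(filterbank, mel_spectrogram):
    # filterbank: (mel_bins, linear_bins); mel_spectrogram: (mel_bins, time).
    # Returns a (linear_bins, time) spectrogram.
    return np.dot(np.linalg.pinv(filterbank), mel_spectrogram)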
if not os.path.exists(input_dir):
    print("Input directory does not exist")
    sys.exit()
if input_dir[-1] != "/":
    input_dir = input_dir + '/'

if not os.path.exists(output_dir):
    print("Output directory created")
    os.makedirs(output_dir)
if output_dir[-1] != "/":
    output_dir = output_dir + '/'

for root, subdirs, files in os.walk(input_dir):
    for name in tqdm(files):
        if name.endswith(".npy"):
            iname = "%s/%s" % (root, name)
            root_out = root.replace(input_dir, output_dir)
            if not os.path.exists(root_out):
                os.makedirs(root_out)
            oname = "%s/%s-griffinlim-reconstructed.wav" % (root_out, name[:-4])
            S = np.load(iname)
            y_inv = lr.griffinlim(S)
            audio_utilities.save_audio_to_file(y_inv, sr, outfile=oname)

print("Done!\n")
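# Note on the loop above: lr.griffinlim infers n_fft from the number of frequency
# rows in S (n_fft = 2 * (S.shape[0] - 1)) and defaults to hop_length = n_fft // 4
# with n_iter = 32. If the .npy spectrograms were generated with different analysis
# parameters, pass them explicitly, e.g. (a sketch with assumed values):
#
#     y_inv = lr.griffinlim(S, n_iter=300, hop_length=256, win_length=2048)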
stft_modified = melSpectrum.T

if args.enable_filter:
    # Calculate corresponding bin index.
    cutoff_bin = round(args.cutoff_freq * args.fft_size / args.sample_rate_hz)
    stft_modified[:, cutoff_bin:] = 0

stft_modified_scaled = stft_modified
stft_modified_scaled = stft_modified_scaled**0.5

# Use the Griffin-Lim algorithm to reconstruct an audio signal from the
# magnitude spectrogram.
hopsamp = args.fft_size // 8
x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
    stft_modified_scaled, args.fft_size, hopsamp, args.iterations)

# The output signal must be in the range [-1, 1], otherwise we need to clip or
# normalize.
max_sample = np.max(abs(x_reconstruct))
if max_sample > 1.0:
    x_reconstruct = x_reconstruct / max_sample

# Save the reconstructed signal to a WAV file.
audio_utilities.save_audio_to_file(x_reconstruct, args.sample_rate_hz)

# Save the spectrogram image also.
clf()
figure(5)
imshow(stft_modified.T**0.125, origin='lower', cmap=cm.hot, aspect='auto',
       interpolation='nearest')
colorbar()
title('Spectrogram used to reconstruct audio')
xlabel('time index')
ylabel('frequency bin index')
savefig('reconstruction_spectrogram.png', dpi=150)
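# reconstruct_signal_griffin_lim expects a spectrogram with 1 + fft_size // 2
# frequency columns (e.g. 1025 for the default fft_size of 2048), so the snippet
# above only works if melSpectrum.T already has that width. A quick guard
# (illustrative sketch):
def check_spectrogram_width(spectrogram, fft_size):
    assert spectrogram.shape[1] == 1 + fft_size // 2, \
        "spectrogram width does not match fft_size"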
if np.ndim(spectrogram) == 2:
    # gray
    mel_spectrogram = (1 - spectrogram / 65535.)
else:
    # color
    mel_spectrogram = rgb2gray(spectrogram / 255.)

mel_spectrogram = np.flip(mel_spectrogram, 0)
mel_spectrogram /= scale_mel
mel_spectrogram = np.exp(mel_spectrogram / 20. * np.log(10)) * 10e-6

inverted_mel_to_linear_freq_spectrogram = np.dot(filterbank.T, mel_spectrogram)
stft_modified = inverted_mel_to_linear_freq_spectrogram.T
stft_modified = stft_modified**0.5

# Use the Griffin-Lim algorithm to reconstruct an audio signal from the
# magnitude spectrogram.
x_reconstruct = audio_utilities.reconstruct_signal_griffin_lim(
    stft_modified, fft_size, hopsamp, args.iterations)

# The output signal must be in the range [-1, 1], otherwise we need to clip or
# normalize.
max_sample = np.max(abs(x_reconstruct))
if max_sample > 1.0:
    x_reconstruct = x_reconstruct / max_sample

# Save the reconstructed signal to a WAV file.
audio_utilities.save_audio_to_file(x_reconstruct, sample_rate_hz, outfile=args.out_file)
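# For reference, the decoding above is the inverse of an image encoding of roughly
# the following form (a sketch inferred from the arithmetic; it omits the final
# pixel quantization/inversion step, and `scale_mel` is whatever compression factor
# was used when the spectrogram image was written):
def encode_mel_to_image(mel_linear, scale_mel):
    # Convert to a dB-like scale relative to a 1e-5 floor, compress by scale_mel,
    # and flip so that low frequencies end up at the bottom of the image.
    db = 20. * np.log10(mel_linear / 10e-6)
    return np.flip(db * scale_mel, 0)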