def prediction(weights_path, name_model, audio_dir_prediction, dir_save_prediction, audio_input_prediction,
               audio_output_prediction, sample_rate, min_duration, frame_length, hop_length_frame, n_fft,
               hop_length_fft):
    """Denoise audio with a model stored on disk and write the result as a WAV file.

    The model architecture is read from ``<weights_path>/<name_model>.json`` and
    its weights from ``<weights_path>/<name_model>.h5``. The input audio is
    framed, converted to amplitude/phase spectrograms, passed through the
    network, and the rescaled network output is subtracted from the input
    amplitude before the waveform is rebuilt and saved.

    Args:
        weights_path: Directory holding the ``.json`` and ``.h5`` model files.
        name_model: Base file name (no extension) of the model files.
        audio_dir_prediction: Forwarded to ``audio_files_to_numpy`` as the
            directory of the input audio.
        dir_save_prediction: Output location prefix. It is concatenated
            directly with ``audio_output_prediction``, so it must already end
            with a path separator if it names a directory.
        audio_input_prediction: Forwarded to ``audio_files_to_numpy`` as the
            input file selection.
        audio_output_prediction: File name of the WAV file to write.
        sample_rate: Sample rate used for loading and for the written file.
        min_duration: Forwarded to ``audio_files_to_numpy``.
        frame_length: Samples per frame; used for framing, reconstruction and
            for flattening the frames into one signal.
        hop_length_frame: Forwarded to ``audio_files_to_numpy``.
        n_fft: FFT size; the spectrogram dimension is ``int(n_fft / 2) + 1``.
        hop_length_fft: Hop length used for the spectrogram and its inverse.

    Returns:
        None. The denoised audio is written to
        ``dir_save_prediction + audio_output_prediction``.
    """
    # librosa.output.write_wav is no longer available in recent librosa
    # releases, so the file is written with soundfile. Imported first so a
    # missing dependency is reported before any expensive work is done.
    import soundfile as sf

    # Load the serialized architecture; the context manager guarantees the
    # handle is closed even if reading fails.
    with open(weights_path + '/' + name_model + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # Load weights into the freshly built model
    loaded_model.load_weights(weights_path + '/' + name_model + '.h5')
    print("Loaded model from disk")

    # Load the audio to denoise and convert it to a numpy array of frames
    audio = audio_files_to_numpy(audio_dir_prediction, audio_input_prediction, sample_rate,
                                 frame_length, hop_length_frame, min_duration)

    # Dimension of the squared spectrogram
    dim_square_spec = int(n_fft / 2) + 1
    print(dim_square_spec)

    # Amplitude (dB) and phase of the input frames
    m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(
        audio, dim_square_spec, n_fft, hop_length_fft)

    # Global scaling of the network input
    X_in = scaled_in(m_amp_db_audio)
    # Append a trailing axis of size 1 for the network
    X_in = X_in.reshape(X_in.shape[0], X_in.shape[1], X_in.shape[2], 1)
    # Prediction using the loaded network
    X_pred = loaded_model.predict(X_in)
    # Undo the output scaling of the prediction
    inv_sca_X_pred = inv_scaled_ou(X_pred)
    # The prediction is treated as the noise component and removed from the
    # noisy amplitude spectrogram.
    X_denoise = m_amp_db_audio - inv_sca_X_pred[:, :, :, 0]

    # Diagnostic output kept from the original implementation
    print(X_denoise.shape)
    print(m_pha_audio.shape)
    print(frame_length)
    print(hop_length_fft)

    # Rebuild the waveform frames from the denoised amplitude and input phase
    audio_denoise_recons = matrix_spectrogram_to_numpy_audio(
        X_denoise, m_pha_audio, frame_length, hop_length_fft)
    # Number of frames
    nb_samples = audio_denoise_recons.shape[0]
    # Concatenate all frames into one signal; the factor 10 is a fixed gain
    # (NOTE(review): presumably compensates for a low output level - confirm).
    denoise_long = audio_denoise_recons.reshape(1, nb_samples * frame_length) * 10

    sf.write(dir_save_prediction + audio_output_prediction, denoise_long[0, :], sample_rate, 'PCM_16')
def prediction(weights_path, model, audio_input_prediction, sample_rate, min_duration, frame_length, hop_length_frame, n_fft, hop_length_fft):
    """Denoise audio with an already-built model and return the waveform.

    Weights are loaded into ``model`` from ``<weights_path>/model_best.h5``.
    The input is framed, turned into amplitude/phase spectrograms, run through
    the network, and the rescaled network output is subtracted from the input
    amplitude before the waveform is rebuilt. Nothing is written to disk.

    Args:
        weights_path: Directory containing ``model_best.h5``.
        model: Model object exposing ``load_weights`` and ``predict``.
        audio_input_prediction: Forwarded to ``audio_files_to_numpy`` as the
            input selection (the directory argument is the empty string).
        sample_rate: Used for loading and returned unchanged to the caller.
        min_duration: Forwarded to ``audio_files_to_numpy``.
        frame_length: Samples per frame; used for framing, reconstruction and
            flattening.
        hop_length_frame: Forwarded to ``audio_files_to_numpy``.
        n_fft: FFT size; the spectrogram dimension is ``int(n_fft / 2) + 1``.
        hop_length_fft: Hop length used for the spectrogram and its inverse.

    Returns:
        list: ``[samples, sample_rate]`` where ``samples`` is the flattened
        denoised signal multiplied by 10.
    """
    network = model
    network.load_weights(weights_path + '/model_best.h5')
    print("Loaded model from disk")

    # The input selection is used as-is: no directory prefix.
    source_dir = ""
    frames = audio_files_to_numpy(source_dir, audio_input_prediction, sample_rate,
                                  frame_length, hop_length_frame, min_duration)

    # Side length of the squared spectrogram
    spec_dim = int(n_fft / 2) + 1
    print(spec_dim)

    # Amplitude (dB) and phase of every frame
    amp_db, phase = numpy_audio_to_matrix_spectrogram(frames, spec_dim, n_fft, hop_length_fft)

    # Scale the amplitudes, then add a trailing axis of size 1 for the network
    scaled = scaled_in(amp_db)
    net_input = scaled.reshape(scaled.shape[0], scaled.shape[1], scaled.shape[2], 1)

    # Network output, mapped back through the inverse output scaling, is
    # subtracted from the noisy amplitudes.
    noise_estimate = inv_scaled_ou(network.predict(net_input))
    denoised_spec = amp_db - noise_estimate[:, :, :, 0]

    # Diagnostic output
    for debug_value in (denoised_spec.shape, phase.shape, frame_length, hop_length_fft):
        print(debug_value)

    # Back to the time domain, reusing the phase of the input
    rebuilt = matrix_spectrogram_to_numpy_audio(denoised_spec, phase, frame_length, hop_length_fft)

    # Join all frames into a single row and apply the fixed gain of 10
    frame_count = rebuilt.shape[0]
    flattened = rebuilt.reshape(1, frame_count * frame_length) * 10
    return [flattened[0, :], sample_rate]
def decode(weights_path=args.weights_folder, name_model=args.name_model,
           audio_dir_prediction=args.audio_dir_prediction,
           dir_save_prediction=args.dir_save_prediction,
           audio_input_prediction=args.audio_input_prediction,
           audio_output_prediction=args.audio_output_prediction,
           sample_rate=args.sample_rate, min_duration=args.min_duration,
           frame_length=args.frame_length, hop_length_frame=args.hop_length_frame,
           n_fft=args.n_fft, hop_length_fft=args.hop_length_fft):
    """Decode a stored compressed representation and write the denoised audio.

    A ``ConvAutoEncoder`` is built for ``weights_path`` and its weights are
    loaded. The compressed data read from ``aaa.npy`` is decoded, rescaled and
    subtracted from the amplitude spectrogram of the input audio; the result
    is converted back to a waveform and saved as a WAV file.

    All defaults are taken from the module-level ``args`` object when the
    function is defined.

    Args:
        weights_path: Passed to ``ConvAutoEncoder`` as ``weights_path``.
        name_model: Accepted for interface compatibility; not used here.
        audio_dir_prediction: Forwarded to ``audio_files_to_numpy`` as the
            directory of the input audio.
        dir_save_prediction: Output location prefix, concatenated directly
            with ``audio_output_prediction``.
        audio_input_prediction: Forwarded to ``audio_files_to_numpy`` as the
            input file selection.
        audio_output_prediction: File name of the WAV file to write.
        sample_rate: Sample rate used for loading and for the written file.
        min_duration: Forwarded to ``audio_files_to_numpy``.
        frame_length: Samples per frame; used for framing, reconstruction and
            flattening.
        hop_length_frame: Forwarded to ``audio_files_to_numpy``.
        n_fft: FFT size; the spectrogram dimension is ``int(n_fft / 2) + 1``.
        hop_length_fft: Hop length used for the spectrogram and its inverse.

    Returns:
        None. The audio is written to
        ``dir_save_prediction + audio_output_prediction``.
    """
    loaded_model = ConvAutoEncoder(weights_path=weights_path)
    loaded_model.load_weights()
    loaded_model.info()
    print("Loaded model from disk")

    audio = audio_files_to_numpy(audio_dir_prediction, audio_input_prediction, sample_rate,
                                 frame_length, hop_length_frame, min_duration)

    # Dimension of the squared spectrogram
    dim_square_spec = int(n_fft / 2) + 1

    # Amplitude (dB) and phase of the input frames
    m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(
        audio, dim_square_spec, n_fft, hop_length_fft)

    # NOTE(review): the compressed input is read from a hard-coded file in the
    # current working directory - confirm this is intended.
    data_compress = np.load('aaa.npy')
    print(data_compress.shape)
    decoded = loaded_model.decode(data_compress)

    # Undo the output scaling of the decoded data
    inv_sca_X_pred = inv_scaled_ou(decoded)
    # The decoded data is treated as the noise component and removed from the
    # noisy amplitude spectrogram.
    X_denoise = m_amp_db_audio - inv_sca_X_pred[:, :, :, 0]

    # Diagnostic output kept from the original implementation
    print(X_denoise.shape)
    print(m_pha_audio.shape)
    print(frame_length)
    print(hop_length_fft)

    # Rebuild the waveform frames from the denoised amplitude and input phase
    audio_denoise_recons = matrix_spectrogram_to_numpy_audio(
        X_denoise, m_pha_audio, frame_length, hop_length_fft)
    # Number of frames
    nb_samples = audio_denoise_recons.shape[0]
    # Concatenate all frames into one signal with a fixed gain of 10
    denoise_long = audio_denoise_recons.reshape(1, nb_samples * frame_length) * 10

    output_path = dir_save_prediction + audio_output_prediction
    # librosa.output was removed in librosa 0.8. Keep the legacy writer where
    # it still exists and fall back to soundfile on newer installations so the
    # function no longer fails with AttributeError.
    if hasattr(librosa, 'output'):
        librosa.output.write_wav(output_path, denoise_long[0, :], sample_rate)
    else:
        import soundfile as sf
        sf.write(output_path, denoise_long[0, :], sample_rate, 'PCM_16')
    print('saved audio decoded file in:', output_path)
# Module-level fixture: runs the denoising pipeline once at import time so the
# test below can inspect the resulting arrays. `n_fft`, `audio`,
# `frame_length` and `loaded_model` are expected to be defined beforehand.
hop_length_fft = 63
dim_square_spec = int(n_fft / 2) + 1

# Amplitude (dB) and phase spectrograms of the input frames
m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(
    audio,
    dim_square_spec,
    n_fft,
    hop_length_fft,
)

# Scale the amplitudes, then add a trailing axis of size 1 for the network
X_in = scaled_in(m_amp_db_audio)
X_in = X_in.reshape(X_in.shape[0], X_in.shape[1], X_in.shape[2], 1)

# Run the network and map its output back through the inverse output scaling
X_pred = loaded_model.predict(X_in)
inv_sca_X_pred = inv_scaled_ou(X_pred)

# Subtract the rescaled prediction from the noisy amplitudes
X_denoise = m_amp_db_audio - inv_sca_X_pred[:, :, :, 0]

# Rebuild the waveform frames using the phase of the input
audio_denoise_recons = matrix_spectrogram_to_numpy_audio(
    X_denoise,
    m_pha_audio,
    frame_length,
    hop_length_fft,
)

# Number of frames
nb_samples = audio_denoise_recons.shape[0]


def test_dimensions_spectrogram():
    """Check that the spectrogram arrays have the expected square size of 128."""
    assert dim_square_spec == 128
    amp_shape = m_amp_db_audio.shape
    assert amp_shape[1] == dim_square_spec
    assert amp_shape[2] == dim_square_spec
    assert X_denoise.shape[1] == dim_square_spec