import math
import os

import numpy as np

# Project-local helpers (import paths are assumed; adjust to the repo layout).
from mediaio.audio_io import AudioSignal, AudioMixer
from dsp.spectrogram import MelConverter


def reconstruct_audio_signal(audio_sample, sample_rate):
    mel_converter = MelConverter(sample_rate, n_mel_freqs=128, freq_min_hz=0, freq_max_hz=4000)

    # Each row of audio_sample is one flattened mel-spectrogram slice;
    # restore its (n_mel_freqs, n_frames) shape.
    slice_mel_spectrograms = [
        audio_sample[i, :].reshape((mel_converter.get_n_mel_freqs(), -1))
        for i in range(audio_sample.shape[0])
    ]

    # Re-join the slices along the time axis and invert back to a waveform.
    full_mel_spectrogram = np.concatenate(slice_mel_spectrograms, axis=1)
    return mel_converter.reconstruct_signal_from_mel_spectrogram(full_mel_spectrogram)
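# Illustrative sketch (not part of the original code): a synthetic input for
# the function above. Shapes and the 16 kHz rate are assumptions; each row is
# one flattened (128, n_frames) mel slice, as produced by
# preprocess_audio_sample below.
def _example_reconstruct():
    fake_sample = np.random.rand(10, 128 * 20)  # 10 slices, 20 frames each
    return reconstruct_audio_signal(fake_sample, sample_rate=16000)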
def enhance_speech(speaker_file_path, noise_file_path, speech_prediction_path, speech_profile):
    print("enhancing mix of %s, %s" % (speaker_file_path, noise_file_path))

    speaker_source_signal = AudioSignal.from_wav_file(speaker_file_path)
    noise_source_signal = AudioSignal.from_wav_file(noise_file_path)

    # Tile the noise until it covers the speech, then trim it to the same length.
    while noise_source_signal.get_number_of_samples() < speaker_source_signal.get_number_of_samples():
        noise_source_signal = AudioSignal.concat([noise_source_signal, noise_source_signal])

    noise_source_signal = noise_source_signal.slice(0, speaker_source_signal.get_number_of_samples())
    mixed_signal = AudioMixer.mix([speaker_source_signal, noise_source_signal])

    predicted_speech_signal = AudioSignal.from_wav_file(speech_prediction_path)

    # Zero-pad both signals to a common length before comparing spectrograms.
    signals = [mixed_signal, predicted_speech_signal]
    max_length = max(signal.get_number_of_samples() for signal in signals)
    for signal in signals:
        signal.pad_with_zeros(max_length)

    mel_converter = MelConverter(mixed_signal.get_sample_rate(),
                                 n_mel_freqs=128, freq_min_hz=0, freq_max_hz=4000)
    mixed_spectrogram, original_phase = mel_converter.signal_to_mel_spectrogram(
        mixed_signal, get_phase=True)
    predicted_speech_spectrogram = mel_converter.signal_to_mel_spectrogram(predicted_speech_signal)

    # Per-frequency threshold: the 85th percentile of the speaker's clean-speech profile.
    thresholds = np.percentile(speech_profile, 85, axis=1)

    # Binary mask: keep a time-frequency bin only where the predicted speech
    # exceeds that frequency's threshold.
    speech_enhancement_mask = (predicted_speech_spectrogram > thresholds[:, np.newaxis]).astype(np.float64)

    enhanced_speech_spectrogram = mixed_spectrogram * speech_enhancement_mask
    enhanced_speech_signal = mel_converter.reconstruct_signal_from_mel_spectrogram(
        enhanced_speech_spectrogram, original_phase)

    return mixed_signal, enhanced_speech_signal
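# Illustrative sketch (not part of the original code): the masking rule above,
# shown on toy arrays. A time-frequency bin is kept when the predicted speech
# magnitude exceeds that frequency's 85th-percentile level in the profile.
def _example_threshold_mask():
    profile = np.random.rand(128, 1000)   # stand-in speech profile
    predicted = np.random.rand(128, 40)   # stand-in predicted spectrogram

    thresholds = np.percentile(profile, 85, axis=1)
    mask = (predicted > thresholds[:, np.newaxis]).astype(np.float64)

    assert mask.shape == predicted.shape
    return mask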
def build_speech_profile(speaker_speech_dir, max_files=50):
    print("building speech profile...")

    speech_file_paths = [
        os.path.join(speaker_speech_dir, f) for f in os.listdir(speaker_speech_dir)
    ][:max_files]
    speech_signals = [AudioSignal.from_wav_file(f) for f in speech_file_paths]

    mel_converter = MelConverter(speech_signals[0].get_sample_rate(),
                                 n_mel_freqs=128, freq_min_hz=0, freq_max_hz=4000)
    speech_spectrograms = [
        mel_converter.signal_to_mel_spectrogram(signal) for signal in speech_signals
    ]

    # Concatenate all spectrograms along the time axis into a single
    # (n_mel_freqs, total_frames) profile of the speaker's clean speech.
    speech_profile = np.concatenate(speech_spectrograms, axis=1)
    return speech_profile
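# Hypothetical end-to-end run (paths are illustrative; the save calls assume
# AudioSignal exposes a save_to_wav_file method):
def _example_enhance():
    profile = build_speech_profile("data/speaker1/clean", max_files=50)
    mixed, enhanced = enhance_speech(
        "data/speaker1/test/utt1.wav",
        "data/noise/babble.wav",
        "predictions/utt1.wav",
        profile,
    )
    mixed.save_to_wav_file("mixed.wav")        # assumed AudioSignal method
    enhanced.save_to_wav_file("enhanced.wav")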
def preprocess_audio_sample(audio_file_path, slice_duration_ms=330):
    print("preprocessing %s" % audio_file_path)

    audio_signal = AudioSignal.from_wav_file(audio_file_path)
    mel_converter = MelConverter(audio_signal.get_sample_rate(),
                                 n_mel_freqs=128, freq_min_hz=0, freq_max_hz=4000)

    # Zero-pad the signal to a whole number of STFT hops so no frames are dropped.
    new_signal_length = int(math.ceil(
        float(audio_signal.get_number_of_samples()) / mel_converter.get_hop_length()
    )) * mel_converter.get_hop_length()
    audio_signal.pad_with_zeros(new_signal_length)

    mel_spectrogram = mel_converter.signal_to_mel_spectrogram(audio_signal)

    # Convert the slice duration from milliseconds to spectrogram frames.
    samples_per_slice = int((float(slice_duration_ms) / 1000) * audio_signal.get_sample_rate())
    spectrogram_samples_per_slice = int(samples_per_slice / mel_converter.get_hop_length())

    # Cut the spectrogram into fixed-length slices and flatten each into a row.
    n_slices = int(mel_spectrogram.shape[1] / spectrogram_samples_per_slice)
    slices = [
        mel_spectrogram[:, (i * spectrogram_samples_per_slice):((i + 1) * spectrogram_samples_per_slice)].flatten()
        for i in range(n_slices)
    ]

    return np.stack(slices)
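# Sketch of the slice arithmetic above (the hop length is whatever MelConverter
# uses internally; 160 samples is only an assumption here): at 16 kHz, a 330 ms
# slice spans 5280 samples, i.e. 5280 // 160 = 33 frames, so each returned row
# would hold 128 * 33 = 4224 values.
def _example_preprocess_shapes():
    slices = preprocess_audio_sample("data/speaker1/utt1.wav")  # illustrative path
    print(slices.shape)  # (n_slices, n_mel_freqs * frames_per_slice)
    return slices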
def separate_sources(source_file_paths, prediction_file_paths, separation_function):
    print("separating mixture of %s" % str(source_file_paths))

    source_signals = [AudioSignal.from_wav_file(f) for f in source_file_paths]
    prediction_signals = [AudioSignal.from_wav_file(f) for f in prediction_file_paths]

    # Zero-pad all signals to a common length before mixing and masking.
    signals = source_signals + prediction_signals
    max_length = max(signal.get_number_of_samples() for signal in signals)
    for signal in signals:
        signal.pad_with_zeros(max_length)

    mixed_signal = AudioMixer.mix(source_signals)

    # Note: unlike the enhancement path, the full frequency range is used here.
    mel_converter = MelConverter(mixed_signal.get_sample_rate(),
                                 n_mel_freqs=128, freq_min_hz=0, freq_max_hz=None)
    mixed_spectrogram, original_phase = mel_converter.signal_to_mel_spectrogram(
        mixed_signal, get_phase=True)
    prediction_spectrograms = [
        mel_converter.signal_to_mel_spectrogram(signal) for signal in prediction_signals
    ]

    # Mask the mixture once per predicted source, then invert each masked
    # spectrogram using the mixture's original phase.
    masks = generate_separation_masks(mixed_spectrogram, prediction_spectrograms, separation_function)
    separated_spectrograms = [mixed_spectrogram * mask for mask in masks]
    separated_signals = [
        mel_converter.reconstruct_signal_from_mel_spectrogram(s, original_phase)
        for s in separated_spectrograms
    ]

    return mixed_signal, separated_signals
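# generate_separation_masks is not defined in this section. Below is a minimal
# sketch of one plausible contract, assuming binary masking by the dominant
# predicted source per time-frequency bin; the real implementation may differ.
def _example_generate_separation_masks(mixed_spectrogram, prediction_spectrograms, separation_function):
    stacked = np.stack(prediction_spectrograms)  # (n_sources, n_mel_freqs, n_frames)
    return [separation_function(stacked, i) for i in range(stacked.shape[0])]


def _example_binary_dominance_mask(stacked_predictions, source_index):
    # 1.0 where this source has the largest predicted magnitude, else 0.0.
    return (stacked_predictions.argmax(axis=0) == source_index).astype(np.float64)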