import math
import os

import numpy as np

# AudioSignal, AudioMixer and MelConverter are project-local utilities; the
# module paths below are an assumption and may need adjusting for this repo.
from mediaio.audio_io import AudioSignal, AudioMixer
from dsp.spectrogram import MelConverter


def reconstruct_audio_signal(audio_sample, sample_rate):
    mel_converter = MelConverter(sample_rate,
                                 n_mel_freqs=128,
                                 freq_min_hz=0,
                                 freq_max_hz=4000)

    # Each row of audio_sample is a flattened spectrogram slice; restore its
    # 2D (n_mel_freqs, frames_per_slice) shape before concatenating in time.
    slice_mel_spectrograms = [
        audio_sample[i, :].reshape((mel_converter.get_n_mel_freqs(), -1))
        for i in range(audio_sample.shape[0])
    ]
    full_mel_spectrogram = np.concatenate(slice_mel_spectrograms, axis=1)

    return mel_converter.reconstruct_signal_from_mel_spectrogram(full_mel_spectrogram)


def enhance_speech(speaker_file_path, noise_file_path, speech_prediction_path,
                   speech_profile):
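    """Suppress noise in a speaker+noise mixture by keeping only the mel bins
    where the predicted speech exceeds the speaker's profile threshold.

    Returns a (mixed_signal, enhanced_speech_signal) pair.
    """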
    print("enhancing mix of %s, %s" % (speaker_file_path, noise_file_path))

    speaker_source_signal = AudioSignal.from_wav_file(speaker_file_path)
    noise_source_signal = AudioSignal.from_wav_file(noise_file_path)

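    # Loop the noise by self-concatenation until it is at least as long as the
    # speech, then trim it to the exact speech length before mixing.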
    while noise_source_signal.get_number_of_samples(
    ) < speaker_source_signal.get_number_of_samples():
        noise_source_signal = AudioSignal.concat(
            [noise_source_signal, noise_source_signal])

    noise_source_signal = noise_source_signal.slice(
        0, speaker_source_signal.get_number_of_samples())
    mixed_signal = AudioMixer.mix([speaker_source_signal, noise_source_signal])

    predicted_speech_signal = AudioSignal.from_wav_file(speech_prediction_path)

    signals = [mixed_signal, predicted_speech_signal]
    max_length = max([signal.get_number_of_samples() for signal in signals])
    for signal in signals:
        signal.pad_with_zeros(max_length)

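    # 128 mel bands over 0-4 kHz, the band carrying most of the speech energy.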
    mel_converter = MelConverter(mixed_signal.get_sample_rate(),
                                 n_mel_freqs=128,
                                 freq_min_hz=0,
                                 freq_max_hz=4000)
    mixed_spectrogram, original_phase = mel_converter.signal_to_mel_spectrogram(
        mixed_signal, get_phase=True)
    predicted_speech_spectrogram = mel_converter.signal_to_mel_spectrogram(
        predicted_speech_signal)

    # Per-frequency threshold: the 85th percentile of the speaker's profile.
    thresholds = np.percentile(speech_profile, 85, axis=1)

    # Binary mask: keep a time-frequency bin only where the predicted speech
    # spectrogram exceeds that frequency's threshold.
    speech_enhancement_mask = (predicted_speech_spectrogram >
                               thresholds[:, np.newaxis]).astype(float)

    enhanced_speech_spectrogram = mixed_spectrogram * speech_enhancement_mask
    enhanced_speech_signal = mel_converter.reconstruct_signal_from_mel_spectrogram(
        enhanced_speech_spectrogram, original_phase)

    return mixed_signal, enhanced_speech_signal


def build_speech_profile(speaker_speech_dir, max_files=50):
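    """Concatenate mel spectrograms of up to max_files clean recordings of the
    speaker into a single (n_mel_freqs, n_frames) speech profile matrix.
    """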
    print("building speech profile...")

    # Sort for a deterministic choice of the first max_files recordings.
    speech_file_paths = [
        os.path.join(speaker_speech_dir, f)
        for f in sorted(os.listdir(speaker_speech_dir))
    ][:max_files]
    speech_signals = [AudioSignal.from_wav_file(f) for f in speech_file_paths]

    mel_converter = MelConverter(speech_signals[0].get_sample_rate(),
                                 n_mel_freqs=128,
                                 freq_min_hz=0,
                                 freq_max_hz=4000)
    speech_spectrograms = [
        mel_converter.signal_to_mel_spectrogram(signal)
        for signal in speech_signals
    ]

    speech_profile = np.concatenate(speech_spectrograms, axis=1)
    return speech_profile


def preprocess_audio_sample(audio_file_path, slice_duration_ms=330):
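    """Split an audio file into fixed-duration mel-spectrogram slices.

    Returns an array of shape (n_slices, n_mel_freqs * frames_per_slice),
    one flattened slice per row.
    """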
    print("preprocessing %s" % audio_file_path)

    audio_signal = AudioSignal.from_wav_file(audio_file_path)

    mel_converter = MelConverter(audio_signal.get_sample_rate(),
                                 n_mel_freqs=128,
                                 freq_min_hz=0,
                                 freq_max_hz=4000)

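    # Round the signal length up to a whole number of STFT hops so spectrogram
    # frames tile the padded signal exactly.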
    new_signal_length = int(
        math.ceil(
            float(audio_signal.get_number_of_samples()) /
            mel_converter.get_hop_length())) * mel_converter.get_hop_length()

    audio_signal.pad_with_zeros(new_signal_length)

    mel_spectrogram = mel_converter.signal_to_mel_spectrogram(audio_signal)

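    # Convert the slice duration from milliseconds to raw samples and then to
    # spectrogram frames; any trailing partial slice is dropped.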
    samples_per_slice = int(
        (float(slice_duration_ms) / 1000) * audio_signal.get_sample_rate())
    spectrogram_samples_per_slice = int(samples_per_slice /
                                        mel_converter.get_hop_length())

    n_slices = int(mel_spectrogram.shape[1] / spectrogram_samples_per_slice)

    slices = [
        mel_spectrogram[:, (i * spectrogram_samples_per_slice):(
            (i + 1) * spectrogram_samples_per_slice)].flatten()
        for i in range(n_slices)
    ]

    return np.stack(slices)


def separate_sources(source_file_paths, prediction_file_paths,
                     separation_function):
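    """Mix the source signals, then re-separate the mixture by applying masks
    derived from the predicted per-source spectrograms.

    Returns a (mixed_signal, separated_signals) pair.
    """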
    print("separating mixture of %s" % str(source_file_paths))

    source_signals = [AudioSignal.from_wav_file(f) for f in source_file_paths]
    prediction_signals = [
        AudioSignal.from_wav_file(f) for f in prediction_file_paths
    ]

    signals = source_signals + prediction_signals
    max_length = max([signal.get_number_of_samples() for signal in signals])
    for signal in signals:
        signal.pad_with_zeros(max_length)

    mixed_signal = AudioMixer.mix(source_signals)

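    # Unlike enhance_speech, no upper frequency cap is applied here
    # (freq_max_hz=None).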
    mel_converter = MelConverter(mixed_signal.get_sample_rate(),
                                 n_mel_freqs=128,
                                 freq_min_hz=0,
                                 freq_max_hz=None)
    mixed_spectrogram, original_phase = mel_converter.signal_to_mel_spectrogram(
        mixed_signal, get_phase=True)
    prediction_spectrograms = [
        mel_converter.signal_to_mel_spectrogram(signal)
        for signal in prediction_signals
    ]

    masks = generate_separation_masks(mixed_spectrogram,
                                      prediction_spectrograms,
                                      separation_function)
    separated_spectrograms = [mixed_spectrogram * mask for mask in masks]
    separated_signals = [
        mel_converter.reconstruct_signal_from_mel_spectrogram(
            s, original_phase) for s in separated_spectrograms
    ]

    return mixed_signal, separated_signals
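

# generate_separation_masks is defined elsewhere in the project. As a rough,
# hypothetical sketch of the masking step (an assumption, not the project's
# actual implementation): assign each time-frequency bin of the mixture to the
# source whose predicted spectrogram dominates, with separation_function
# (e.g. np.max) as the reducer across sources.
#
# def generate_separation_masks(mixed_spectrogram, prediction_spectrograms,
#                               separation_function):
#     stacked = np.stack(prediction_spectrograms)  # (n_sources, n_freqs, n_frames)
#     dominant = separation_function(stacked, axis=0)
#     return [(spectrogram == dominant).astype(float)
#             for spectrogram in prediction_spectrograms]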