Example #1
def preprocess_audio_pair(speech_file_path, noise_file_path, slice_duration_ms,
                          n_video_slices, video_frame_rate):
    print("preprocessing pair: %s, %s" % (speech_file_path, noise_file_path))

    speech_signal = AudioSignal.from_wav_file(speech_file_path)
    noise_signal = AudioSignal.from_wav_file(noise_file_path)

    while (noise_signal.get_number_of_samples() <
           speech_signal.get_number_of_samples()):
        noise_signal = AudioSignal.concat([noise_signal, noise_signal])

    noise_signal.truncate(speech_signal.get_number_of_samples())

    factor = AudioMixer.snr_factor(speech_signal, noise_signal, snr_db=0)
    noise_signal.amplify_by_factor(factor)

    mixed_signal = AudioMixer.mix([speech_signal, noise_signal],
                                  mixing_weights=[1, 1])

    mixed_spectrograms = preprocess_audio_signal(mixed_signal,
                                                 slice_duration_ms,
                                                 n_video_slices,
                                                 video_frame_rate)
    speech_spectrograms = preprocess_audio_signal(speech_signal,
                                                  slice_duration_ms,
                                                  n_video_slices,
                                                  video_frame_rate)
    noise_spectrograms = preprocess_audio_signal(noise_signal,
                                                 slice_duration_ms,
                                                 n_video_slices,
                                                 video_frame_rate)

    return mixed_spectrograms, speech_spectrograms, noise_spectrograms, mixed_signal
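
AudioMixer.snr_factor and AudioMixer.mix are project helpers that are not shown in this snippet. As a rough sketch of the same idea in plain numpy (scale the noise so that the speech-to-noise variance ratio matches a target snr_db, then sum), one might write:

import numpy as np

def mix_at_snr(speech, noise, snr_db=0.0):
    # Gain chosen so that var(speech) / var(gain * noise) == 10 ** (snr_db / 10).
    gain = np.sqrt(np.var(speech) / (np.var(noise) * 10.0 ** (snr_db / 10.0)))
    return speech + gain * noise
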
def enhance_speech(speaker_file_path, noise_file_path, speech_prediction_path,
                   speech_profile):
    print("enhancing mix of %s, %s" % (speaker_file_path, noise_file_path))

    speaker_source_signal = AudioSignal.from_wav_file(speaker_file_path)
    noise_source_signal = AudioSignal.from_wav_file(noise_file_path)

    while (noise_source_signal.get_number_of_samples() <
           speaker_source_signal.get_number_of_samples()):
        noise_source_signal = AudioSignal.concat(
            [noise_source_signal, noise_source_signal])

    noise_source_signal = noise_source_signal.slice(
        0, speaker_source_signal.get_number_of_samples())
    mixed_signal = AudioMixer.mix([speaker_source_signal, noise_source_signal])

    predicted_speech_signal = AudioSignal.from_wav_file(speech_prediction_path)

    signals = [mixed_signal, predicted_speech_signal]
    max_length = max([signal.get_number_of_samples() for signal in signals])
    for signal in signals:
        signal.pad_with_zeros(max_length)

    mel_converter = MelConverter(mixed_signal.get_sample_rate(),
                                 n_mel_freqs=128,
                                 freq_min_hz=0,
                                 freq_max_hz=4000)
    mixed_spectrogram, original_phase = mel_converter.signal_to_mel_spectrogram(
        mixed_signal, get_phase=True)
    predicted_speech_spectrogram = mel_converter.signal_to_mel_spectrogram(
        predicted_speech_signal)

    speech_enhancement_mask = np.zeros(shape=mixed_spectrogram.shape)

    thresholds = np.zeros(shape=(speech_enhancement_mask.shape[0]))
    for f in range(speech_enhancement_mask.shape[0]):
        thresholds[f] = np.percentile(speech_profile[f, :], 85)

    for f in range(speech_enhancement_mask.shape[0]):
        for t in range(speech_enhancement_mask.shape[1]):
            if predicted_speech_spectrogram[f, t] > thresholds[f]:
                speech_enhancement_mask[f, t] = 1

    enhanced_speech_spectrogram = mixed_spectrogram * speech_enhancement_mask
    enhanced_speech_signal = mel_converter.reconstruct_signal_from_mel_spectrogram(
        enhanced_speech_spectrogram, original_phase)

    return mixed_signal, enhanced_speech_signal
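
The nested loops above set the mask bin by bin. An equivalent vectorized form of the same per-frequency thresholding, using only numpy, would be:

import numpy as np

def build_enhancement_mask(predicted_speech_spectrogram, speech_profile, percentile=85):
    # One threshold per mel frequency, taken from the speaker's speech profile.
    thresholds = np.percentile(speech_profile, percentile, axis=1)
    # Keep a time-frequency bin only where the prediction exceeds its row's threshold.
    return (predicted_speech_spectrogram > thresholds[:, np.newaxis]).astype(np.float64)
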
def preprocess_audio_sample(audio_file_path, slice_duration_ms=330):
    print("preprocessing %s" % audio_file_path)

    audio_signal = AudioSignal.from_wav_file(audio_file_path)

    mel_converter = MelConverter(audio_signal.get_sample_rate(),
                                 n_mel_freqs=128,
                                 freq_min_hz=0,
                                 freq_max_hz=4000)

    new_signal_length = int(
        math.ceil(
            float(audio_signal.get_number_of_samples()) /
            mel_converter.get_hop_length())) * mel_converter.get_hop_length()

    audio_signal.pad_with_zeros(new_signal_length)

    mel_spectrogram = mel_converter.signal_to_mel_spectrogram(audio_signal)

    samples_per_slice = int(
        (float(slice_duration_ms) / 1000) * audio_signal.get_sample_rate())
    spectrogram_samples_per_slice = int(samples_per_slice /
                                        mel_converter.get_hop_length())

    n_slices = int(mel_spectrogram.shape[1] / spectrogram_samples_per_slice)

    slices = [
        mel_spectrogram[:, (i * spectrogram_samples_per_slice):(
            (i + 1) * spectrogram_samples_per_slice)].flatten()
        for i in range(n_slices)
    ]

    return np.stack(slices)
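
The padding step rounds the signal length up to a multiple of the converter's hop length so the spectrogram frames tile the slices evenly. In isolation, with hop_length standing in for mel_converter.get_hop_length(), that rounding is just:

import numpy as np

def pad_to_hop_multiple(samples, hop_length):
    # Round the length up to the next multiple of hop_length and zero-pad the tail.
    padded_length = int(np.ceil(len(samples) / hop_length)) * hop_length
    return np.pad(samples, (0, padded_length - len(samples)))
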
def evaluate(source_file_paths, estimated_file_paths):
	source_signals = [AudioSignal.from_wav_file(f) for f in source_file_paths]
	estimated_signals = [AudioSignal.from_wav_file(f) for f in estimated_file_paths]

	signals = source_signals + estimated_signals
	max_length = max([signal.get_number_of_samples() for signal in signals])
	for signal in signals:
		signal.pad_with_zeros(max_length)

	source_data = [signal.get_data(channel_index=0) for signal in source_signals]
	estimated_data = [signal.get_data(channel_index=0) for signal in estimated_signals]

	source_data = np.stack(source_data)
	estimated_data = np.stack(estimated_data)

	return mir_eval.separation.bss_eval_sources(source_data, estimated_data, compute_permutation=True)
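
mir_eval.separation.bss_eval_sources returns per-source SDR, SIR and SAR arrays plus the best permutation of the estimates, so a typical way to consume the result of evaluate(...) above looks like this (the file names are placeholders):

# sdr / sir / sar have one entry per source; perm maps each estimate to a reference.
sdr, sir, sar, perm = evaluate(["speech.wav", "noise.wav"],
                               ["estimated_speech.wav", "estimated_noise.wav"])
print("SDR per source:", sdr)
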
def evaluate(enhancement_dir):
    noisy_snr_dbs = []
    snr_dbs = []

    speaker_ids = os.listdir(enhancement_dir)
    for speaker_id in speaker_ids:
        for sample_dir_name in os.listdir(
                os.path.join(enhancement_dir, speaker_id)):
            print('evaluating snr of %s' % sample_dir_name)

            source_path = os.path.join(enhancement_dir, speaker_id,
                                       sample_dir_name, 'source.wav')
            mixture_path = os.path.join(enhancement_dir, speaker_id,
                                        sample_dir_name, 'mixture.wav')
            enhanced_path = os.path.join(enhancement_dir, speaker_id,
                                         sample_dir_name, 'enhanced.wav')

            source_signal = AudioSignal.from_wav_file(source_path)
            mixture_signal = AudioSignal.from_wav_file(mixture_path)
            enhanced_signal = AudioSignal.from_wav_file(enhanced_path)

            truncate_longer_signal(mixture_signal, source_signal)

            s = source_signal.get_data()
            n = mixture_signal.get_data() - source_signal.get_data()

            noisy_snr = np.var(s) / np.var(n)
            noisy_snr_db = 10 * np.log10(noisy_snr)
            print('noisy snr db: %f' % noisy_snr_db)

            noisy_snr_dbs.append(noisy_snr_db)

            truncate_longer_signal(enhanced_signal, source_signal)

            s = source_signal.get_data()
            e = enhanced_signal.get_data()
            residual_noise = e - s

            snr = np.var(s) / np.var(residual_noise)
            snr_db = 10 * np.log10(snr)
            print('snr db: %f' % snr_db)

            snr_dbs.append(snr_db)

    print('mean noisy snr db: %f' % np.mean(noisy_snr_dbs))
    print('mean snr db: %f' % np.mean(snr_dbs))
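
Both SNR figures above are plain variance ratios expressed in decibels. Pulled out as a standalone helper over equal-length numpy arrays, the computation is:

import numpy as np

def snr_db(clean, degraded):
    # Noise is whatever remains after subtracting the clean reference.
    noise = degraded - clean
    return 10.0 * np.log10(np.var(clean) / np.var(noise))
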
def start(args):

    # Initialize Network
    assets = AssetManager(args.prediction_dir)
    storage = PredictionStorage(args.prediction_dir)
    network = SpeechEnhancementNetwork.load(
        assets.get_model_cache_path(args.model_dir))
    network.start_prediction_mode()
    network.predict(np.zeros((2, 80, 24)), np.zeros((2, 128, 128, 6)))


    predicted_speech_signal = reconstruct_speech_signal(
        AudioSignal.from_wav_file("/cs/engproj/322/real_time/raw_data/mixture.wav"),
        np.zeros((2, 80, 24)), 30)

    with open(assets.get_normalization_cache_path(args.model_dir),
              'rb') as normalization_fd:
        video_normalizer = pickle.load(normalization_fd)

    lock = Lock()
    video_dir = assets.get_video_cache_path(args.video_audio_dir)
    predict_object = RunPredict(network, video_dir, storage.storage_dir)

    # Run video, audio, preprocess and play threads
    video_queue = Queue()
    audio_queue = Queue()
    predict_queue = Queue()
    play_queue = Queue()
    video_object = VideoProcess(video_dir)
    video_thread = Process(target=video_object.capture_frames,
                           args=(video_queue, lock))
    audio_object = AudioProcess(
        assets.get_audio_cache_path(args.video_audio_dir))
    audio_thread = Process(target=audio_object.capture_frames,
                           args=(audio_queue, lock))
    preprocess_thread = Process(target=predict_object.run_pre_process,
                                args=(video_queue, audio_queue, predict_queue,
                                      video_normalizer, lock))
    play_thread = Process(target=predict_object.play, args=(play_queue, lock))

    video_thread.start()
    audio_thread.start()
    preprocess_thread.start()
    play_thread.start()

    # Run predict
    predict_object.predict(predict_queue, play_queue, lock)

    video_thread.join()
    audio_thread.join()
    preprocess_thread.join()
    play_thread.join()

    # Save files
    predict_object.save_files(storage)

    print("*Finish All*")
def build_speech_profile(speaker_speech_dir, max_files=50):
	print("building speech profile...")

	speech_file_paths = [os.path.join(speaker_speech_dir, f) for f in os.listdir(speaker_speech_dir)][:max_files]
	speech_signals = [AudioSignal.from_wav_file(f) for f in speech_file_paths]

	mel_converter = MelConverter(speech_signals[0].get_sample_rate(), n_mel_freqs=128, freq_min_hz=0, freq_max_hz=None)
	speech_spectrograms = [mel_converter.signal_to_mel_spectrogram(signal) for signal in speech_signals]

	speech_profile = np.concatenate(speech_spectrograms, axis=1)
	return speech_profile
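
build_speech_profile is meant to feed the speech_profile argument of enhance_speech above. A hypothetical end-to-end call (all paths are placeholders) would look like:

speech_profile = build_speech_profile("data/speaker1/clean_speech", max_files=50)
mixed_signal, enhanced_signal = enhance_speech("data/speaker1/test/speech.wav",
                                               "data/noise/babble.wav",
                                               "predictions/speaker1/speech.wav",
                                               speech_profile)
enhanced_signal.save_to_wav_file("enhanced.wav")
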
def separate_sources(source_file_paths, prediction_file_paths,
                     separation_function):
    print("separating mixture of %s" % str(source_file_paths))

    source_signals = [AudioSignal.from_wav_file(f) for f in source_file_paths]
    prediction_signals = [
        AudioSignal.from_wav_file(f) for f in prediction_file_paths
    ]

    signals = source_signals + prediction_signals
    max_length = max([signal.get_number_of_samples() for signal in signals])
    for signal in signals:
        signal.pad_with_zeros(max_length)

    mixed_signal = AudioMixer.mix(source_signals)

    mel_converter = MelConverter(mixed_signal.get_sample_rate(),
                                 n_mel_freqs=128,
                                 freq_min_hz=0,
                                 freq_max_hz=None)
    mixed_spectrogram, original_phase = mel_converter.signal_to_mel_spectrogram(
        mixed_signal, get_phase=True)
    prediction_spectrograms = [
        mel_converter.signal_to_mel_spectrogram(signal)
        for signal in prediction_signals
    ]

    masks = generate_separation_masks(mixed_spectrogram,
                                      prediction_spectrograms,
                                      separation_function)
    separated_spectrograms = [mixed_spectrogram * mask for mask in masks]
    separated_signals = [
        mel_converter.reconstruct_signal_from_mel_spectrogram(
            s, original_phase) for s in separated_spectrograms
    ]

    return mixed_signal, separated_signals
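
generate_separation_masks is not included in this snippet. One common choice it might implement is a soft ratio mask over linear-magnitude predicted source spectrograms, sketched here purely as an assumption about what the separation_function could do:

import numpy as np

def ratio_masks(prediction_spectrograms, eps=1e-8):
    # Each mask is the fraction of the total predicted energy belonging to one source.
    stacked = np.stack(prediction_spectrograms)
    total = stacked.sum(axis=0) + eps
    return [spectrogram / total for spectrogram in stacked]
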
Example #9
def reconstruct_signal_from_spectrogram(
    magnitude, phase, sample_rate, n_fft, hop_length, mel=True, db=True
):
    if db:
        magnitude = librosa.db_to_amplitude(magnitude)

    if mel:
        mel_filterbank = librosa.filters.mel(
            sr=sample_rate, n_fft=n_fft, n_mels=80, fmin=0, fmax=8000
        )

        magnitude = np.dot(np.linalg.pinv(mel_filterbank), magnitude)

    signal = librosa.istft(magnitude * phase, hop_length=hop_length)

    return AudioSignal(signal, sample_rate)
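
A forward pass that produces inputs this function can invert might look as follows; the wav path and the n_fft / hop_length values are illustrative, and AudioSignal is the project's own wrapper returned by the function:

import numpy as np
import librosa

# Load a mono waveform and split its STFT into magnitude and unit-phase factors.
y, sample_rate = librosa.load("speech.wav", sr=16000, mono=True)
stft = librosa.stft(y, n_fft=640, hop_length=160)
phase = np.exp(1j * np.angle(stft))

# Project the magnitude onto a mel filterbank and move to decibels.
mel_filterbank = librosa.filters.mel(sr=sample_rate, n_fft=640, n_mels=80, fmin=0, fmax=8000)
mel_db = librosa.amplitude_to_db(np.dot(mel_filterbank, np.abs(stft)))

reconstructed = reconstruct_signal_from_spectrogram(
    mel_db, phase, sample_rate, n_fft=640, hop_length=160, mel=True, db=True)
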
    def reconstruct_signal_from_mel_spectrogram(self,
                                                mel_spectrogram,
                                                log=True,
                                                phase=None):
        if log:
            mel_spectrogram = librosa.db_to_power(mel_spectrogram)

        mel_spectrogram = mel_spectrogram**0.5

        magnitude = np.dot(np.linalg.pinv(self._MEL_FILTER), mel_spectrogram)

        if phase is not None:
            inverted_signal = librosa.istft(magnitude * phase,
                                            hop_length=self._HOP_LENGTH)
        else:
            inverted_signal = griffin_lim(magnitude,
                                          self._N_FFT,
                                          self._HOP_LENGTH,
                                          n_iterations=10)

        return AudioSignal(inverted_signal, self._SAMPLE_RATE)
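
griffin_lim here is a project helper; recent librosa releases ship an equivalent routine, so a phase-less reconstruction of a linear-frequency magnitude spectrogram could also be sketched like this (the input path and STFT parameters are illustrative):

import numpy as np
import librosa

# Magnitude-only reconstruction: drop the phase and recover it iteratively.
y, sr = librosa.load("speech.wav", sr=16000, mono=True)
magnitude = np.abs(librosa.stft(y, n_fft=640, hop_length=160))
inverted_signal = librosa.griffinlim(magnitude, n_iter=10, hop_length=160)
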
    def run_pre_process(self, v_queue, a_queue, predict_queue,
                        video_normalizer, lock):

        with lock:
            print("*Start pre-process*\n")

        video_out = cv2.VideoWriter(self.path_video_writer_path,
                                    cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                                    self.fps, self.frame_size)

        while self.open:

            video_frames_list, video_slice_number = v_queue.get()
            audio_frames_list, audio_slice_number = a_queue.get()

            # Video pre-process
            for frame in video_frames_list:
                im_crop = self.face_detector.crop_mouth(
                    frame, self.bounding_box)
                video_out.write(im_crop)

                im_gray = cv2.cvtColor(im_crop, cv2.COLOR_BGR2GRAY)
                self.slice_of_frames[:, :, self.frames_counter] = im_gray
                self.frames_counter += 1

            slices = [
                self.slice_of_frames[:, :, (i * RunPredict.FRAMES_PER_SLICE):(
                    (i + 1) * RunPredict.FRAMES_PER_SLICE)]
                for i in range(RunPredict.NUMBER_OF_SLICES)
            ]

            slices = np.stack(slices)
            video_normalizer.normalize(slices)

            # Audio pre-process
            # np.fromstring is deprecated for binary input; frombuffer reads the raw 16-bit PCM.
            data = np.frombuffer(audio_frames_list, dtype=np.int16).copy()
            mixed_signal = AudioSignal(data, 16000)

            self.num_iteration += 1

            mixed_spectrograms = preprocess_audio_signal(
                mixed_signal, self.slice_duration_ms, self.n_video_slices,
                self.fps)

            # Predict
            predict_queue.put(
                (slices, mixed_signal, mixed_spectrograms,
                 int(video_slice_number), int(audio_slice_number)))

            self.slice_of_frames = \
                np.zeros(shape=(128, 128, RunPredict.NUMBER_OF_FRAMES), dtype=np.float32)
            self.frames_counter = 0

            if (v_queue.empty() and a_queue.empty()) or audio_slice_number == 0\
                    or video_slice_number == 0:

                with lock:
                    print(
                        "****************************************************************"
                    )
                    print("Video - slice number: " + str(video_slice_number))
                    print("Audio - slice number: " + str(audio_slice_number))
                    print("Predict - number of iterations: " +
                          str(self.num_iteration))
                    print(
                        "****************************************************************"
                    )

                if not v_queue.empty():
                    video_slice_number -= 1
                    v_queue.get()

                elif not a_queue.empty():
                    audio_slice_number -= 1
                    a_queue.get()

                v_queue.close()
                a_queue.close()

                with lock:
                    print("*Video queue and Audio queue are empty*\n")
                break
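
The raw microphone frames arrive as bytes and are reinterpreted as 16-bit PCM before being wrapped in AudioSignal; in isolation that conversion is simply:

import numpy as np

# 16-bit PCM bytes -> int16 samples (two bytes per sample).
raw_bytes = b"\x00\x00\xff\x7f\x00\x80"
samples = np.frombuffer(raw_bytes, dtype=np.int16)
print(samples)  # [     0  32767 -32768]
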
def preprocess_sample(speech_entry, noise_file_path, slice_duration_ms=200):
    print("preprocessing sample: %s, %s, %s..." %
          (speech_entry.video_path, speech_entry.audio_path, noise_file_path))

    mouth_height = 128
    mouth_width = 128

    print("preprocessing %s" % speech_entry.video_path)

    face_detector = FaceDetector()

    with VideoFileReader(speech_entry.video_path) as reader:

        frames = reader.read_all_frames(convert_to_gray_scale=True)

        mouth_cropped_frames = np.zeros(shape=(mouth_height, mouth_width, 75),
                                        dtype=np.float32)
        for i in range(75):
            mouth_cropped_frames[:, :, i] = face_detector.crop_mouth(
                frames[i], bounding_box_shape=(mouth_width, mouth_height))

        frames_per_slice = int(slice_duration_ms / 1000 *
                               reader.get_frame_rate())

        slices = [
            mouth_cropped_frames[:, :,
                                 (i * frames_per_slice):((i + 1) *
                                                         frames_per_slice)]
            for i in range(int(75 / frames_per_slice))
        ]

        video_samples = np.stack(slices)
        video_frame_rate = reader.get_frame_rate()

    print("preprocessing pair: %s, %s" %
          (speech_entry.audio_path, noise_file_path))

    speech_signal = AudioSignal.from_wav_file(speech_entry.audio_path)
    noise_signal = AudioSignal.from_wav_file(noise_file_path)

    while (noise_signal.get_number_of_samples() <
           speech_signal.get_number_of_samples()):
        noise_signal = AudioSignal.concat([noise_signal, noise_signal])

    noise_signal.truncate(speech_signal.get_number_of_samples())

    factor = AudioMixer.snr_factor(speech_signal, noise_signal, snr_db=0)
    noise_signal.amplify_by_factor(factor)

    mixed_signal = AudioMixer.mix([speech_signal, noise_signal],
                                  mixing_weights=[1, 1])
    mixed_signal.save_to_wav_file('./mixed.wav')
    mixed_spectrograms = preprocess_audio_signal(mixed_signal,
                                                 slice_duration_ms,
                                                 video_samples.shape[0],
                                                 video_frame_rate)
    speech_spectrograms = preprocess_audio_signal(speech_signal,
                                                  slice_duration_ms,
                                                  video_samples.shape[0],
                                                  video_frame_rate)
    noise_spectrograms = preprocess_audio_signal(noise_signal,
                                                 slice_duration_ms,
                                                 video_samples.shape[0],
                                                 video_frame_rate)

    n_slices = min(video_samples.shape[0], mixed_spectrograms.shape[0])

    return Sample(speaker_id=speech_entry.speaker_id,
                  video_file_path=speech_entry.video_path,
                  speech_file_path=speech_entry.audio_path,
                  noise_file_path=noise_file_path,
                  video_samples=video_samples[:n_slices],
                  mixed_spectrograms=mixed_spectrograms[:n_slices],
                  speech_spectrograms=speech_spectrograms[:n_slices],
                  noise_spectrograms=noise_spectrograms[:n_slices],
                  mixed_signal=mixed_signal,
                  video_frame_rate=video_frame_rate)
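
The slicing pattern used here (and in preprocess_audio_sample above) just splits an array along its last axis into equal-width chunks and stacks them; a generic numpy helper doing the same thing could be:

import numpy as np

def split_last_axis(array, chunk_width):
    # (H, W, T) -> (n_chunks, H, W, chunk_width), dropping any remainder frames.
    n_chunks = array.shape[-1] // chunk_width
    trimmed = array[..., :n_chunks * chunk_width]
    return np.stack(np.split(trimmed, n_chunks, axis=-1))
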