Example #1
def encode_worker(
    config: Config,
    wrapper: VoiceChangerStreamWrapper,
    audio_config: AudioConfig,
    queue_input: Queue,
    queue_output: Queue,
):
    wrapper.voice_changer_stream.vocoder = Vocoder(
        acoustic_param=config.dataset.acoustic_param,
        out_sampling_rate=audio_config.rate,
    )

    start_time = 0
    time_length = audio_config.convert_chunk / audio_config.rate

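    # pre-pad the stream with one chunk of silence before real input arrives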
    z = numpy.zeros(round(time_length * audio_config.rate),
                    dtype=numpy.float32)
    w = Wave(wave=z, sampling_rate=audio_config.rate)
    wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
    start_time += time_length

    while True:
        wave = queue_input.get()

        w = Wave(wave=wave, sampling_rate=audio_config.rate)
        wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
        start_time += time_length

        feature_wrapper = wrapper.pre_convert_next(time_length=time_length)
        queue_output.put(feature_wrapper)
Example #2
def convert_worker(
    config: Config,
    wrapper: VoiceChangerStreamWrapper,
    acoustic_converter: AcousticConverter,
    super_resolution: SuperResolution,
    audio_config: AudioConfig,
    queue_input: Queue,
    queue_output: Queue,
):
    wrapper.voice_changer_stream.voice_changer = VoiceChanger(
        super_resolution=super_resolution,
        acoustic_converter=acoustic_converter,
    )

    start_time = 0
    time_length = audio_config.convert_chunk / audio_config.rate
    while True:
        in_feature: AcousticFeatureWrapper = queue_input.get()
        wrapper.voice_changer_stream.add_in_feature(
            start_time=start_time,
            feature_wrapper=in_feature,
            frame_period=audio_config.frame_period,
        )
        start_time += time_length

        out_feature = wrapper.convert_next(time_length=time_length)
        queue_output.put(out_feature)
Example #3
def encode_worker(
    config: Config,
    wrapper: VoiceChangerStreamWrapper,
    audio_config: AudioConfig,
    queue_input: Queue,
    queue_output: Queue,
):
    wrapper.voice_changer_stream.vocoder = Vocoder(
        acoustic_param=config.dataset.acoustic_param,
        out_sampling_rate=audio_config.out_rate,
    )

    start_time = 0
    time_length = audio_config.in_audio_chunk / audio_config.in_rate

    # pad the stream with one chunk of silence (1 s at the settings in Example #7)
    prev_original = numpy.zeros(round(time_length * audio_config.in_rate),
                                dtype=numpy.float32)
    w = Wave(wave=prev_original, sampling_rate=audio_config.in_rate)
    wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
    start_time += time_length

    while True:
        item: Item = queue_input.get()
        # keep the previous chunk as this item's passthrough copy: the converted
        # stream runs one chunk behind (it was pre-padded with silence above)
        item.original, prev_original = prev_original, item.original
        wave = item.item

        w = Wave(wave=wave, sampling_rate=audio_config.in_rate)
        wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=w)
        start_time += time_length

        feature_wrapper = wrapper.pre_convert_next(time_length=time_length)
        item.item = feature_wrapper
        queue_output.put(item)
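
Item is passed through the queues in Examples #3, #4, and #7, but its definition is not shown. A minimal sketch of a compatible container, inferred from the only attributes these examples touch (original, item, index, conversion_flag):

from dataclasses import dataclass
from typing import Any

@dataclass
class Item:
    original: Any                 # untouched input wave, kept for passthrough
    item: Any                     # payload: wave -> feature -> wave along the pipeline
    index: int = 0                # ordering key used by the output loop in Example #7
    conversion_flag: bool = True  # when False, Example #7 plays original instead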
Example #4
def decode_worker(
    config: Config,
    wrapper: VoiceChangerStreamWrapper,
    audio_config: AudioConfig,
    queue_input: Queue,
    queue_output: Queue,
):
    wrapper.voice_changer_stream.vocoder = RealtimeVocoder(
        acoustic_param=config.dataset.acoustic_param,
        out_sampling_rate=audio_config.out_rate,
        buffer_size=audio_config.vocoder_buffer_size,
        number_of_pointers=16,
    )

    start_time = 0
    time_length = audio_config.out_audio_chunk / audio_config.out_rate
    wave_fragment = numpy.empty(0)  # samples carried over until a full output chunk accumulates
    while True:
        item: Item = queue_input.get()
        feature: AcousticFeature = item.item
        wrapper.voice_changer_stream.add_out_feature(
            start_time=start_time,
            feature=feature,
            frame_period=audio_config.frame_period,
        )
        start_time += time_length

        wave = wrapper.post_convert_next(time_length=time_length).wave

        wave_fragment = numpy.concatenate([wave_fragment, wave])
        if len(wave_fragment) >= audio_config.out_audio_chunk:
            wave = wave_fragment[:audio_config.out_audio_chunk]
            wave_fragment = wave_fragment[audio_config.out_audio_chunk:]

            power = librosa.core.power_to_db(
                numpy.abs(librosa.stft(wave)) ** 2).mean()
            if power < audio_config.silent_threshold:
                wave = None  # below the silence gate: emit nothing
        else:
            wave = None

        item.item = wave
        queue_output.put(item)
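
The silence gate in decode_worker above computes the mean log-power of an STFT and drops chunks that fall under silent_threshold (-80 dB in Examples #5 and #7). A standalone sketch of the same measurement; the 16 kHz rate and the silence/noise buffers are made up for illustration:

import numpy
import librosa

sr = 16000
silence = numpy.zeros(sr, dtype=numpy.float32)
noise = (0.1 * numpy.random.randn(sr)).astype(numpy.float32)

for name, wave in [('silence', silence), ('noise', noise)]:
    # same measurement as decode_worker: mean dB over the power spectrogram
    power = librosa.power_to_db(numpy.abs(librosa.stft(wave)) ** 2).mean()
    print(name, power)  # silence sits at the -100 dB floor, below the -80 dB gate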
Example #5
def main():
    print('model loading...', flush=True)

    queue_input_wave = Queue()
    queue_input_feature = Queue()
    queue_output_feature = Queue()
    queue_output_wave = Queue()

    input_statistics_path = Path('./trained/f0_statistics/hiho_f0stat.npy')
    target_statistics_path = Path('./trained/f0_statistics/yukari_f0stat.npy')
    f0_converter = F0Converter(input_statistics=input_statistics_path,
                               target_statistics=target_statistics_path)

    model_path = Path('./trained/pp-el8-wof0/predictor_2260000.npz')
    config_path = Path('./trained/pp-el8-wof0/config.json')
    config = create_config(config_path)
    acoustic_converter = AcousticConverter(config,
                                           model_path,
                                           gpu=0,
                                           f0_converter=f0_converter)
    print('model 1 loaded!', flush=True)

    model_path = Path('./trained/sr-noise3/predictor_180000.npz')
    config_path = Path('./trained/sr-noise3/config.json')
    sr_config = create_sr_config(config_path)
    super_resolution = SuperResolution(sr_config, model_path, gpu=0)
    print('model 2 loaded!', flush=True)

    audio_instance = pyaudio.PyAudio()
    audio_config = AudioConfig(
        rate=config.dataset.acoustic_param.sampling_rate,
        frame_period=config.dataset.acoustic_param.frame_period,
        audio_chunk=config.dataset.acoustic_param.sampling_rate,
        convert_chunk=config.dataset.acoustic_param.sampling_rate,
        vocoder_buffer_size=config.dataset.acoustic_param.sampling_rate // 16,
        in_norm=1 / 8,
        out_norm=4.0,
        silent_threshold=-80.0,
    )

    voice_changer_stream = VoiceChangerStream(
        sampling_rate=audio_config.rate,
        frame_period=config.dataset.acoustic_param.frame_period,
        order=config.dataset.acoustic_param.order,
        in_dtype=numpy.float32,
    )

    wrapper = VoiceChangerStreamWrapper(
        voice_changer_stream=voice_changer_stream,
        extra_time_pre=0.2,
        extra_time=0.5,
    )

    process_encoder = Process(target=encode_worker,
                              kwargs=dict(
                                  config=config,
                                  wrapper=wrapper,
                                  audio_config=audio_config,
                                  queue_input=queue_input_wave,
                                  queue_output=queue_input_feature,
                              ))
    process_encoder.start()

    process_converter = Process(target=convert_worker,
                                kwargs=dict(
                                    config=config,
                                    wrapper=wrapper,
                                    acoustic_converter=acoustic_converter,
                                    super_resolution=super_resolution,
                                    audio_config=audio_config,
                                    queue_input=queue_input_feature,
                                    queue_output=queue_output_feature,
                                ))
    process_converter.start()

    process_decoder = Process(target=decode_worker,
                              kwargs=dict(
                                  config=config,
                                  wrapper=wrapper,
                                  audio_config=audio_config,
                                  queue_input=queue_output_feature,
                                  queue_output=queue_output_wave,
                              ))
    process_decoder.start()

    audio_stream = audio_instance.open(
        format=pyaudio.paFloat32,
        channels=1,
        rate=audio_config.rate,
        frames_per_buffer=audio_config.audio_chunk,
        input=True,
        output=True,
    )

    while True:
        # input audio
        in_data = audio_stream.read(audio_config.audio_chunk)
        wave = numpy.frombuffer(in_data,
                                dtype=numpy.float32) * audio_config.in_norm
        queue_input_wave.put(wave)

        print('queue_input_wave', queue_input_wave.qsize(), flush=True)
        print('queue_input_feature', queue_input_feature.qsize(), flush=True)
        print('queue_output_feature', queue_output_feature.qsize(), flush=True)
        print('queue_output_wave', queue_output_wave.qsize(), flush=True)

        # output
        try:
            wave = queue_output_wave.get_nowait()
        except queue.Empty:
            wave = None

        if wave is not None:
            wave *= audio_config.out_norm
            b = wave.astype(numpy.float32).tobytes()
            audio_stream.write(b)
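
AudioConfig is also not defined in these examples. A plausible sketch matching the keyword arguments used in Example #5 (Example #7 constructs a two-rate variant with in_rate/out_rate and in_audio_chunk/out_audio_chunk in place of rate/audio_chunk/convert_chunk):

from typing import NamedTuple

class AudioConfig(NamedTuple):
    rate: int                 # sampling rate for both input and output
    frame_period: float       # vocoder frame period (milliseconds)
    audio_chunk: int          # samples read from / written to PyAudio per loop
    convert_chunk: int        # samples converted per pipeline step
    vocoder_buffer_size: int  # buffer size handed to the realtime vocoder
    in_norm: float            # input gain applied after reading the stream
    out_norm: float           # output gain applied before playback
    silent_threshold: float   # dB threshold used by the decode-side silence gate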
Example #6
voice_changer = VoiceChanger(
    super_resolution=super_resolution,
    acoustic_converter=acoustic_converter,
)

voice_changer_stream = VoiceChangerStream(
    sampling_rate=audio_config.rate,
    frame_period=acoustic_converter._param.frame_period,
    order=acoustic_converter._param.order,
    in_dtype=numpy.float32,
)

voice_changer_stream.voice_changer = voice_changer
voice_changer_stream.vocoder = vocoder

wrapper = VoiceChangerStreamWrapper(
    voice_changer_stream=voice_changer_stream,
    extra_time_pre=0.2,
    extra_time=0.1,
)

raw_wave, _ = librosa.load(str(test_data_path), sr=audio_config.rate)
wave_out_list = []

start_time = 0
for i in range(0, len(raw_wave), audio_config.chunk):
    wave_in = Wave(wave=raw_wave[i:i + audio_config.chunk], sampling_rate=audio_config.rate)
    wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave_in)
    start_time += len(wave_in.wave) / wave_in.sampling_rate

start_time = 0
for i in range(len(raw_wave) // audio_config.chunk + 1):
    feature_in = wrapper.pre_convert_next(time_length=audio_config.chunk / audio_config.rate)
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-odn', '--output_device_name')
    args = parser.parse_args()

    print('model loading...', flush=True)

    queue_input_wave = Queue()
    queue_input_feature = Queue()
    queue_output_feature = Queue()
    queue_output_wave = Queue()

    input_statistics_path = Path('./dat/out_1st_my_npy/')
    target_statistics_path = Path('./dat/out_1st_yukari_npy/')
    f0_converter = F0Converter(input_statistics=input_statistics_path,
                               target_statistics=target_statistics_path)
    # model_path = Path('./trained/f0trans-wmc-multi-ref-el8-woD/predictor_13840000.npz')
    # config_path = Path('./trained/f0trans-wmc-multi-ref-el8-woD/config.json')
    # f0_converter = AcousticConverter(create_config(config_path), model_path, gpu=0)

    model_path = Path(
        './trained/multi-16k-ref24k-el8-woD-gbc8/predictor_2910000.npz')
    config_path = Path('./trained/multi-16k-ref24k-el8-woD-gbc8/config.json')
    # model_path = Path('./trained/akane-multi-ref-el8-woD-gbc8/predictor_5130000.npz')
    # config_path = Path('./trained/akane-multi-ref-el8-woD-gbc8/config.json')
    # model_path = Path('./trained/aoi-multi-ref-el8-woD-gbc8/predictor_5720000.npz')
    # config_path = Path('./trained/aoi-multi-ref-el8-woD-gbc8/config.json')
    # model_path = Path('./trained/zunko-multi-ref-el8-woD-gbc8/predictor_5710000.npz')
    # config_path = Path('./trained/zunko-multi-ref-el8-woD-gbc8/config.json')
    config = create_config(config_path)
    acoustic_converter = AcousticConverter(
        config,
        model_path,
        gpu=0,
        f0_converter=f0_converter,
        out_sampling_rate=24000,
    )
    print('model 1 loaded!', flush=True)

    model_path = Path('./dat/model/yukari_2nd/predictor_120000.npz')
    config_path = Path('./dat/model/yukari_2nd/config.json')
    # model_path = Path('./trained/akane-super-resolution/predictor_240000.npz')
    # config_path = Path('./trained/akane-super-resolution/config.json')
    sr_config = create_sr_config(config_path)
    super_resolution = SuperResolution(sr_config, model_path, gpu=0)
    print('model 2 loaded!', flush=True)

    audio_instance = pyaudio.PyAudio()
    audio_config = AudioConfig(
        in_rate=config.dataset.acoustic_param.sampling_rate,
        out_rate=24000,
        frame_period=config.dataset.acoustic_param.frame_period,
        in_audio_chunk=config.dataset.acoustic_param.sampling_rate,
        out_audio_chunk=24000,
        vocoder_buffer_size=config.dataset.acoustic_param.sampling_rate // 16,
        in_norm=1 / 8,
        out_norm=2.0,
        silent_threshold=-80.0,
    )

    conversion_flag = True

    voice_changer_stream = VoiceChangerStream(
        in_sampling_rate=audio_config.in_rate,
        frame_period=config.dataset.acoustic_param.frame_period,
        order=config.dataset.acoustic_param.order,
        in_dtype=numpy.float32,
    )

    wrapper = VoiceChangerStreamWrapper(
        voice_changer_stream=voice_changer_stream,
        extra_time_pre=0.2,
        extra_time=0.5,
    )

    process_encoder = Process(target=encode_worker,
                              kwargs=dict(
                                  config=config,
                                  wrapper=wrapper,
                                  audio_config=audio_config,
                                  queue_input=queue_input_wave,
                                  queue_output=queue_input_feature,
                              ))
    process_encoder.start()

    process_converter = Process(target=convert_worker,
                                kwargs=dict(
                                    config=config,
                                    wrapper=wrapper,
                                    acoustic_converter=acoustic_converter,
                                    super_resolution=super_resolution,
                                    audio_config=audio_config,
                                    queue_input=queue_input_feature,
                                    queue_output=queue_output_feature,
                                ))
    process_converter.start()

    process_decoder = Process(target=decode_worker,
                              kwargs=dict(
                                  config=config,
                                  wrapper=wrapper,
                                  audio_config=audio_config,
                                  queue_input=queue_output_feature,
                                  queue_output=queue_output_wave,
                              ))
    process_decoder.start()

    # output device
    name = args.output_device_name
    if name is None:
        output_device_index = audio_instance.get_default_output_device_info()['index']
    else:
        for i in range(audio_instance.get_device_count()):
            if name in str(audio_instance.get_device_info_by_index(i)['name']):
                output_device_index = i
                break
        else:
            print('device not found')
            exit(1)

    # audio stream
    print('output_device_index', output_device_index)
    audio_input_stream = audio_instance.open(
        format=pyaudio.paFloat32,
        channels=1,
        rate=audio_config.in_rate,
        frames_per_buffer=audio_config.in_audio_chunk,
        input=True,
    )

    audio_output_stream = audio_instance.open(
        format=pyaudio.paFloat32,
        channels=1,
        rate=audio_config.out_rate,
        frames_per_buffer=audio_config.out_audio_chunk,
        output=True,
        output_device_index=output_device_index,
    )

    # signal
    def signal_handler(*args, **kwargs):
        process_encoder.terminate()
        process_converter.terminate()
        process_decoder.terminate()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    # key event
    def key_handler(key):
        nonlocal conversion_flag
        if key == pynput.keyboard.Key.space:  # switch
            conversion_flag = not conversion_flag

    key_listener = pynput.keyboard.Listener(on_press=key_handler)
    key_listener.start()

    index_input = 0
    index_output = 0
    while True:
        # input audio
        in_data = audio_input_stream.read(audio_config.in_audio_chunk)
        wave = numpy.frombuffer(in_data,
                                dtype=numpy.float32) * audio_config.in_norm

        item = Item(
            original=wave * 5,
            item=wave,
            index=index_input,
            conversion_flag=conversion_flag,
        )
        queue_input_wave.put(item)
        index_input += 1

        print('queue_input_wave', queue_input_wave.qsize(), flush=True)
        print('queue_input_feature', queue_input_feature.qsize(), flush=True)
        print('queue_output_feature', queue_output_feature.qsize(), flush=True)
        print('queue_output_wave', queue_output_wave.qsize(), flush=True)

        # output
        wave: Optional[numpy.ndarray] = None
        popped_list: List[Item] = []

        while True:
            try:
                while True:
                    item: Item = queue_output_wave.get_nowait()
                    popped_list.append(item)
            except queue.Empty:
                pass

            print('index_output', index_output)
            item = next(
                filter(lambda ii: ii.index == index_output, popped_list), None)
            if item is None:
                break

            popped_list.remove(item)

            index_output += 1
            if item.item is None:
                continue

            wave = item.item if item.conversion_flag else item.original
            break

        if wave is not None:
            wave *= audio_config.out_norm
            b = wave.astype(numpy.float32).tobytes()
            audio_output_stream.write(b)