def __init__(
        self,
        voice_changer_model: Path,
        voice_changer_config: Path,
        super_resolution_model: Path,
        super_resolution_config: Path,
        input_statistics: Path,
        target_statistics: Path,
        gpu: int,
):
    # f0 converter
    if input_statistics is not None:
        f0_converter = F0Converter(input_statistics=input_statistics, target_statistics=target_statistics)
    else:
        f0_converter = None

    # acoustic converter
    config = create_config(voice_changer_config)
    acoustic_converter = AcousticConverter(
        config,
        voice_changer_model,
        gpu=gpu,
        f0_converter=f0_converter,
    )

    # super resolution
    sr_config = create_sr_config(super_resolution_config)
    super_resolution = SuperResolution(sr_config, super_resolution_model, gpu=gpu)

    self.acoustic_converter = acoustic_converter
    self.super_resolution = super_resolution
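# A minimal construction sketch for the __init__ above. The owning class is
# not named in the excerpt, so "VoiceChangerWrapper" is a placeholder, and
# every path is a hypothetical example; only the parameter names come from
# the source.
from pathlib import Path

changer = VoiceChangerWrapper(
    voice_changer_model=Path('trained/vc/predictor.npz'),
    voice_changer_config=Path('trained/vc/config.json'),
    super_resolution_model=Path('trained/sr/predictor.npz'),
    super_resolution_config=Path('trained/sr/config.json'),
    input_statistics=Path('stats/input_f0stat.npy'),
    target_statistics=Path('stats/target_f0stat.npy'),
    gpu=0,
)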
def main():
    save_arguments(arguments, output / 'arguments.json')

    # f0 converter
    if f0_trans_model_dir is not None:
        model = _get_predictor_model_path(f0_trans_model_dir, f0_trans_model_iteration)
        f0_converter = AcousticConverter(create_config(f0_trans_config), model, gpu=gpu)
    elif input_statistics is not None:
        f0_converter = F0Converter(input_statistics=input_statistics, target_statistics=target_statistics)
    else:
        f0_converter = None

    # acoustic converter
    config = create_config(voice_changer_config)
    model = _get_predictor_model_path(voice_changer_model_dir, voice_changer_model_iteration)
    acoustic_converter = AcousticConverter(
        config,
        model,
        gpu=gpu,
        f0_converter=f0_converter,
        out_sampling_rate=arguments.out_sampling_rate,
    )
    print(f'Loaded acoustic converter model "{model}"')

    # super resolution
    sr_config = create_sr_config(super_resolution_config)
    super_resolution = SuperResolution(sr_config, super_resolution_model, gpu=gpu)
    print(f'Loaded super resolution model "{super_resolution_model}"')

    # dataset's test
    if not disable_dataset_test:
        input_paths = list(sorted([Path(p) for p in glob.glob(str(config.dataset.input_glob))]))
        numpy.random.RandomState(config.dataset.seed).shuffle(input_paths)
        paths_test = input_paths[-config.dataset.num_test:]
    else:
        paths_test = []

    # test data
    if test_wave_dir is not None:
        paths_test += list(test_wave_dir.glob('*.wav'))

    process_partial = partial(process, acoustic_converter=acoustic_converter, super_resolution=super_resolution)
    if gpu is None:
        # CPU: fan out across a process pool; with a GPU pinned, run serially
        # (CUDA contexts generally do not survive fork).
        list(multiprocessing.Pool().map(process_partial, paths_test))
    else:
        list(map(process_partial, paths_test))
def main():
    save_arguments(arguments, output_dir / 'arguments.json')

    # f0 converter
    if input_statistics is not None:
        f0_converter = F0Converter(input_statistics=input_statistics, target_statistics=target_statistics)
    else:
        f0_converter = None

    # acoustic converter
    config = create_config(config_path)
    model = _get_predictor_model_path(model_dir, model_iteration)
    acoustic_converter = AcousticConverter(
        config,
        model,
        gpu=gpu,
        f0_converter=f0_converter,
        out_sampling_rate=output_sampling_rate,
    )
    print(f'Loaded acoustic converter model "{model}"')

    # dataset test
    if not disable_dataset_test:
        input_paths = list(sorted([Path(p) for p in glob.glob(str(config.dataset.input_glob))]))
        numpy.random.RandomState(config.dataset.seed).shuffle(input_paths)
        paths_test = input_paths[-config.dataset.num_test:]
    else:
        paths_test = []

    # additional test
    if test_wave_dir is not None:
        paths_test += list(test_wave_dir.glob('*.wav'))

    process_partial = partial(process, acoustic_converter=acoustic_converter)
    if gpu is None:
        list(multiprocessing.Pool().map(process_partial, paths_test))
    else:
        list(map(process_partial, paths_test))
def models(self):
    if self._models is None:
        f0_converter = F0Converter(
            input_statistics=self.input_statistics_path,
            target_statistics=self.target_statistics_path,
        )

        ac_config = self.ac_config
        sr_config = self.sr_config

        acoustic_converter = AcousticConverter(
            ac_config,
            self.stage1_model_path,
            f0_converter=f0_converter,
            out_sampling_rate=self.out_sampling_rate,
        )
        super_resolution = SuperResolution(
            sr_config,
            self.stage2_model_path,
        )
        self._models = acoustic_converter, super_resolution
    return self._models
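# Usage sketch for the lazy accessor above (likely a property in the full
# source; the owner name `converter_set` is hypothetical): the first access
# builds both models, every later access returns the cached pair.
acoustic_converter, super_resolution = converter_set.models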
def make_yukarin_converter(
        input_statistics_path: Path,
        target_statistics_path: Path,
        stage1_model_path: Path,
        stage1_config_path: Path,
        stage2_model_path: Path,
        stage2_config_path: Path,
):
    logger = logging.getLogger('encode')
    init_logger(logger)
    logger.info('make_yukarin_converter')

    f0_converter = F0Converter(
        input_statistics=input_statistics_path,
        target_statistics=target_statistics_path,
    )

    config = create_config(stage1_config_path)
    acoustic_converter = AcousticConverter(
        config=config,
        model_path=stage1_model_path,
        gpu=0,
        f0_converter=f0_converter,
        out_sampling_rate=24000,
    )
    logger.info('model 1 loaded!')

    sr_config = create_sr_config(stage2_config_path)
    super_resolution = SuperResolution(
        config=sr_config,
        model_path=stage2_model_path,
        gpu=0,
    )
    logger.info('model 2 loaded!')

    return YukarinConverter(
        acoustic_converter=acoustic_converter,
        super_resolution=super_resolution,
    )
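# Hedged usage sketch for the factory above: build the two-stage converter
# from trained model files. All paths are hypothetical placeholders; the
# keyword names come from the signature in the source.
from pathlib import Path

converter = make_yukarin_converter(
    input_statistics_path=Path('stats/in_f0stat.npy'),
    target_statistics_path=Path('stats/out_f0stat.npy'),
    stage1_model_path=Path('stage1/predictor.npz'),
    stage1_config_path=Path('stage1/config.json'),
    stage2_model_path=Path('stage2/predictor.npz'),
    stage2_config_path=Path('stage2/config.json'),
)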
def main():
    print('model loading...', flush=True)

    queue_input_wave = Queue()
    queue_input_feature = Queue()
    queue_output_feature = Queue()
    queue_output_wave = Queue()

    input_statistics_path = Path('./trained/f0_statistics/hiho_f0stat.npy')
    target_statistics_path = Path('./trained/f0_statistics/yukari_f0stat.npy')
    f0_converter = F0Converter(input_statistics=input_statistics_path, target_statistics=target_statistics_path)

    model_path = Path('./trained/pp-el8-wof0/predictor_2260000.npz')
    config_path = Path('./trained/pp-el8-wof0/config.json')
    config = create_config(config_path)
    acoustic_converter = AcousticConverter(config, model_path, gpu=0, f0_converter=f0_converter)
    print('model 1 loaded!', flush=True)

    model_path = Path('./trained/sr-noise3/predictor_180000.npz')
    config_path = Path('./trained/sr-noise3/config.json')
    sr_config = create_sr_config(config_path)
    super_resolution = SuperResolution(sr_config, model_path, gpu=0)
    print('model 2 loaded!', flush=True)

    audio_instance = pyaudio.PyAudio()
    audio_config = AudioConfig(
        rate=config.dataset.acoustic_param.sampling_rate,
        frame_period=config.dataset.acoustic_param.frame_period,
        audio_chunk=config.dataset.acoustic_param.sampling_rate,
        convert_chunk=config.dataset.acoustic_param.sampling_rate,
        vocoder_buffer_size=config.dataset.acoustic_param.sampling_rate // 16,
        in_norm=1 / 8,
        out_norm=4.0,
        silent_threshold=-80.0,
    )

    voice_changer_stream = VoiceChangerStream(
        sampling_rate=audio_config.rate,
        frame_period=config.dataset.acoustic_param.frame_period,
        order=config.dataset.acoustic_param.order,
        in_dtype=numpy.float32,
    )
    wrapper = VoiceChangerStreamWrapper(
        voice_changer_stream=voice_changer_stream,
        extra_time_pre=0.2,
        extra_time=0.5,
    )

    process_encoder = Process(target=encode_worker, kwargs=dict(
        config=config,
        wrapper=wrapper,
        audio_config=audio_config,
        queue_input=queue_input_wave,
        queue_output=queue_input_feature,
    ))
    process_encoder.start()

    process_converter = Process(target=convert_worker, kwargs=dict(
        config=config,
        wrapper=wrapper,
        acoustic_converter=acoustic_converter,
        super_resolution=super_resolution,
        audio_config=audio_config,
        queue_input=queue_input_feature,
        queue_output=queue_output_feature,
    ))
    process_converter.start()

    process_decoder = Process(target=decode_worker, kwargs=dict(
        config=config,
        wrapper=wrapper,
        audio_config=audio_config,
        queue_input=queue_output_feature,
        queue_output=queue_output_wave,
    ))
    process_decoder.start()

    audio_stream = audio_instance.open(
        format=pyaudio.paFloat32,
        channels=1,
        rate=audio_config.rate,
        frames_per_buffer=audio_config.audio_chunk,
        input=True,
        output=True,
    )

    while True:
        # input audio
        in_data = audio_stream.read(audio_config.audio_chunk)
        # frombuffer replaces the deprecated numpy.fromstring for binary data
        wave = numpy.frombuffer(in_data, dtype=numpy.float32) * audio_config.in_norm
        queue_input_wave.put(wave)

        print('queue_input_wave', queue_input_wave.qsize(), flush=True)
        print('queue_input_feature', queue_input_feature.qsize(), flush=True)
        print('queue_output_feature', queue_output_feature.qsize(), flush=True)
        print('queue_output_wave', queue_output_wave.qsize(), flush=True)

        # output
        try:
            wave = queue_output_wave.get_nowait()
        except queue.Empty:  # catch only an empty queue, not every exception
            wave = None

        if wave is not None:
            wave *= audio_config.out_norm
            b = wave.astype(numpy.float32).tobytes()
            audio_stream.write(b)
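# Sketch of the byte-to-float conversion used in the loop above: PyAudio
# delivers paFloat32 frames as raw bytes, numpy.frombuffer views them as
# float32 samples, and in_norm scales the input level.
import numpy

raw = b'\x00\x00\x80\x3f' * 4  # four little-endian float32 samples of 1.0
wave = numpy.frombuffer(raw, dtype=numpy.float32) * (1 / 8)  # four 0.125 samples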
def check(
        input_path: Path,
        input_time_length: int,
        output_path: Path,
        input_statistics_path: Path,
        target_statistics_path: Path,
        stage1_model_path: Path,
        stage1_config_path: Path,
        stage2_model_path: Path,
        stage2_config_path: Path,
):
    ac_config = create_config(stage1_config_path)
    sr_config = create_sr_config(stage2_config_path)
    input_rate = ac_config.dataset.acoustic_param.sampling_rate
    output_rate = sr_config.dataset.param.voice_param.sample_rate

    realtime_vocoder = RealtimeVocoder(
        acoustic_param=ac_config.dataset.acoustic_param,
        out_sampling_rate=output_rate,
        extract_f0_mode=VocodeMode.WORLD,
    )
    realtime_vocoder.create_synthesizer(
        buffer_size=1024,
        number_of_pointers=16,
    )

    f0_converter = F0Converter(
        input_statistics=input_statistics_path,
        target_statistics=target_statistics_path,
    )

    acoustic_converter = AcousticConverter(
        ac_config,
        stage1_model_path,
        f0_converter=f0_converter,
        out_sampling_rate=output_rate,
    )
    super_resolution = SuperResolution(
        sr_config,
        stage2_model_path,
    )
    voice_changer = VoiceChanger(
        acoustic_converter=acoustic_converter,
        super_resolution=super_resolution,
        output_sampling_rate=output_rate,
    )

    encode_stream = EncodeStream(vocoder=realtime_vocoder)
    convert_stream = ConvertStream(voice_changer=voice_changer)
    decode_stream = DecodeStream(vocoder=realtime_vocoder)

    num_data = input_time_length
    time_length = 1

    def _load_wave_and_split(time_length: float = 1):
        length = round(time_length * input_rate)
        wave, _ = librosa.load(str(input_path), sr=input_rate)
        return [wave[i * length:(i + 1) * length] for i in range(len(wave) // length)]

    def _add(_stream: BaseStream, _datas):
        for i, data in zip(range(num_data), _datas):
            _stream.add(start_time=i * time_length, data=data)

    def _split_process(_stream: BaseStream, _extra_time: float):
        return [
            _stream.process(start_time=i * time_length, time_length=time_length, extra_time=_extra_time)
            for i in range(num_data)
        ]

    def _join_process(_stream: BaseStream, _extra_time: float):
        return _stream.process(start_time=0, time_length=time_length * num_data, extra_time=_extra_time)

    def _process_all_stream(
            _streams: Tuple[BaseStream, BaseStream, BaseStream],
            _datas,
            _split_flags: Tuple[bool, bool, bool],
            _extra_times: Tuple[float, float, float],
    ):
        # feed each stage, then process it either chunk-by-chunk (split)
        # or as one joined span, passing the result to the next stage
        for stream, split_flag, extra_time in zip(_streams, _split_flags, _extra_times):
            _add(stream, _datas)
            if split_flag:
                _datas = _split_process(stream, _extra_time=extra_time)
            else:
                _datas = [_join_process(stream, _extra_time=extra_time)]
        return _datas

    def _concat_and_save(_waves, _path: Path):
        wave = numpy.concatenate(_waves).astype(numpy.float32)
        # librosa.output was removed in librosa 0.8; this call assumes an older release
        librosa.output.write_wav(str(_path), wave, output_rate)

    def _remove(_streams: Tuple[BaseStream, BaseStream, BaseStream]):
        for stream in _streams:
            stream.remove(end_time=num_data)

    waves = _load_wave_and_split(time_length=time_length)[:num_data]

    streams = (encode_stream, convert_stream, decode_stream)
    datas = _process_all_stream(streams, waves, _split_flags=(True, True, True), _extra_times=(0, 1, 0))
    _concat_and_save(datas, output_path)
    _remove(streams)
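# Sketch: _process_all_stream can also run a stage un-split. For example,
# processing the convert stage as one long chunk while encode and decode
# stay chunked would mirror the call in check() with different flags:
#
#     datas = _process_all_stream(
#         streams, waves,
#         _split_flags=(True, False, True),
#         _extra_times=(0, 1, 0),
#     )
#
# This is a hypothetical variant for illustration, not a call from the source.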
class AudioConfig(NamedTuple):
    rate: int
    chunk: int
    vocoder_buffer_size: int
    out_norm: float


model_base_path = Path('./trained/').expanduser()
test_data_path = Path('tests/test-deep-learning-yuduki-yukari.wav')
test_output_path = Path('output.wav')
input_statistics_path = model_base_path / 'f0_statistics/hiho_f0stat.npy'
target_statistics_path = model_base_path / 'f0_statistics/yukari_f0stat.npy'

print('model loading...', flush=True)

f0_converter = F0Converter(input_statistics=input_statistics_path, target_statistics=target_statistics_path)

model_path = model_base_path / Path('pp-el8-wof0/predictor_2260000.npz')
config_path = model_base_path / Path('pp-el8-wof0/config.json')
config = create_config(config_path)
acoustic_converter = AcousticConverter(config, model_path, f0_converter=f0_converter)
print('model 1 loaded!', flush=True)

model_path = model_base_path / Path('sr-noise3/predictor_180000.npz')
config_path = model_base_path / Path('sr-noise3/config.json')
sr_config = create_sr_config(config_path)
super_resolution = SuperResolution(sr_config, model_path)
print('model 2 loaded!', flush=True)

audio_config = AudioConfig(
    rate=config.dataset.acoustic_param.sampling_rate,
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-odn', '--output_device_name')
    args = parser.parse_args()

    print('model loading...', flush=True)

    queue_input_wave = Queue()
    queue_input_feature = Queue()
    queue_output_feature = Queue()
    queue_output_wave = Queue()

    input_statistics_path = Path('./dat/out_1st_my_npy/')
    target_statistics_path = Path('./dat/out_1st_yukari_npy/')
    f0_converter = F0Converter(input_statistics=input_statistics_path, target_statistics=target_statistics_path)
    # model_path = Path('./trained/f0trans-wmc-multi-ref-el8-woD/predictor_13840000.npz')
    # config_path = Path('./trained/f0trans-wmc-multi-ref-el8-woD/config.json')
    # f0_converter = AcousticConverter(create_config(config_path), model_path, gpu=0)

    model_path = Path('./trained/multi-16k-ref24k-el8-woD-gbc8/predictor_2910000.npz')
    config_path = Path('./trained/multi-16k-ref24k-el8-woD-gbc8/config.json')
    # model_path = Path('./trained/akane-multi-ref-el8-woD-gbc8/predictor_5130000.npz')
    # config_path = Path('./trained/akane-multi-ref-el8-woD-gbc8/config.json')
    # model_path = Path('./trained/aoi-multi-ref-el8-woD-gbc8/predictor_5720000.npz')
    # config_path = Path('./trained/aoi-multi-ref-el8-woD-gbc8/config.json')
    # model_path = Path('./trained/zunko-multi-ref-el8-woD-gbc8/predictor_5710000.npz')
    # config_path = Path('./trained/zunko-multi-ref-el8-woD-gbc8/config.json')
    config = create_config(config_path)
    acoustic_converter = AcousticConverter(
        config,
        model_path,
        gpu=0,
        f0_converter=f0_converter,
        out_sampling_rate=24000,
    )
    print('model 1 loaded!', flush=True)

    model_path = Path('./dat/model/yukari_2nd/predictor_120000.npz')
    config_path = Path('./dat/model/yukari_2nd/config.json')
    # model_path = Path('./trained/akane-super-resolution/predictor_240000.npz')
    # config_path = Path('./trained/akane-super-resolution/config.json')
    sr_config = create_sr_config(config_path)
    super_resolution = SuperResolution(sr_config, model_path, gpu=0)
    print('model 2 loaded!', flush=True)

    audio_instance = pyaudio.PyAudio()
    audio_config = AudioConfig(
        in_rate=config.dataset.acoustic_param.sampling_rate,
        out_rate=24000,
        frame_period=config.dataset.acoustic_param.frame_period,
        in_audio_chunk=config.dataset.acoustic_param.sampling_rate,
        out_audio_chunk=24000,
        vocoder_buffer_size=config.dataset.acoustic_param.sampling_rate // 16,
        in_norm=1 / 8,
        out_norm=2.0,
        silent_threshold=-80.0,
    )
    conversion_flag = True

    voice_changer_stream = VoiceChangerStream(
        in_sampling_rate=audio_config.in_rate,
        frame_period=config.dataset.acoustic_param.frame_period,
        order=config.dataset.acoustic_param.order,
        in_dtype=numpy.float32,
    )
    wrapper = VoiceChangerStreamWrapper(
        voice_changer_stream=voice_changer_stream,
        extra_time_pre=0.2,
        extra_time=0.5,
    )

    process_encoder = Process(target=encode_worker, kwargs=dict(
        config=config,
        wrapper=wrapper,
        audio_config=audio_config,
        queue_input=queue_input_wave,
        queue_output=queue_input_feature,
    ))
    process_encoder.start()

    process_converter = Process(target=convert_worker, kwargs=dict(
        config=config,
        wrapper=wrapper,
        acoustic_converter=acoustic_converter,
        super_resolution=super_resolution,
        audio_config=audio_config,
        queue_input=queue_input_feature,
        queue_output=queue_output_feature,
    ))
    process_converter.start()

    process_decoder = Process(target=decode_worker, kwargs=dict(
        config=config,
        wrapper=wrapper,
        audio_config=audio_config,
        queue_input=queue_output_feature,
        queue_output=queue_output_wave,
    ))
    process_decoder.start()

    # output device
    name = args.output_device_name
    if name is None:
        output_device_index = audio_instance.get_default_output_device_info()['index']
    else:
        for i in range(audio_instance.get_device_count()):
            if name in str(audio_instance.get_device_info_by_index(i)['name']):
                output_device_index = i
                break
        else:
            print('device not found')
            exit(1)

    # audio stream
    print('output_device_index', output_device_index)
    audio_input_stream = audio_instance.open(
        format=pyaudio.paFloat32,
        channels=1,
        rate=audio_config.in_rate,
        frames_per_buffer=audio_config.in_audio_chunk,
        input=True,
    )
    audio_output_stream = audio_instance.open(
        format=pyaudio.paFloat32,
        channels=1,
        rate=audio_config.out_rate,
        frames_per_buffer=audio_config.out_audio_chunk,
        output=True,
        output_device_index=output_device_index,
    )

    # signal
    def signal_handler(*args, **kwargs):
        process_encoder.terminate()
        process_converter.terminate()
        process_decoder.terminate()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    # key event
    def key_handler(key):
        nonlocal conversion_flag
        if key == pynput.keyboard.Key.space:  # switch
            conversion_flag = not conversion_flag

    key_listener = pynput.keyboard.Listener(on_press=key_handler)
    key_listener.start()

    index_input = 0
    index_output = 0
    # buffer of out-of-order items; must persist across loop iterations,
    # otherwise drained-but-unreleased chunks would be dropped and the
    # in-order release below would stall forever
    popped_list: List[Item] = []
    while True:
        # input audio
        in_data = audio_input_stream.read(audio_config.in_audio_chunk)
        # frombuffer replaces the deprecated numpy.fromstring for binary data
        wave = numpy.frombuffer(in_data, dtype=numpy.float32) * audio_config.in_norm
        item = Item(
            original=wave * 5,
            item=wave,
            index=index_input,
            conversion_flag=conversion_flag,
        )
        queue_input_wave.put(item)
        index_input += 1

        print('queue_input_wave', queue_input_wave.qsize(), flush=True)
        print('queue_input_feature', queue_input_feature.qsize(), flush=True)
        print('queue_output_feature', queue_output_feature.qsize(), flush=True)
        print('queue_output_wave', queue_output_wave.qsize(), flush=True)

        # output
        wave = None
        while True:
            # drain everything currently waiting in the output queue
            try:
                while True:
                    item: Item = queue_output_wave.get_nowait()
                    popped_list.append(item)
            except queue.Empty:
                pass

            print('index_output', index_output)
            # release items strictly in input order
            item = next(filter(lambda ii: ii.index == index_output, popped_list), None)
            if item is None:
                break

            popped_list.remove(item)
            index_output += 1

            if item.item is None:
                continue

            wave = item.item if item.conversion_flag else item.original
            break

        if wave is not None:
            wave *= audio_config.out_norm
            b = wave.astype(numpy.float32).tobytes()
            audio_output_stream.write(b)
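# Minimal sketch of the in-order release logic in the loop above: converted
# chunks can come back from the worker processes out of order, so they are
# buffered and emitted strictly by ascending index. `pop_in_order` is a
# hypothetical helper name, not part of the source.
def pop_in_order(buffered, next_index):
    # Return and remove the item matching the next expected index, or None.
    item = next((it for it in buffered if it.index == next_index), None)
    if item is not None:
        buffered.remove(item)
    return item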