def test_extract_input_with_dataset(
    sampling_length: int,
    f0_path: Path,
    phoneme_path: Path,
    phoneme_list_path: Path,
    silence_path: Path,
    spectrogram_path: Path,
    volume_path: Path,
    f0_process_mode: F0ProcessMode,
    time_mask_max_second: float,
    time_mask_num: int,
):
    """Smoke test: extract_input must run on features loaded from disk."""
    # Load every feature file in the same order the fixture paths are given.
    loaded = dict(
        f0_data=SamplingData.load(f0_path),
        phoneme_data=SamplingData.load(phoneme_path),
        phoneme_list_data=JvsPhoneme.load_julius_list(phoneme_list_path),
        silence_data=SamplingData.load(silence_path),
        spec_data=SamplingData.load(spectrogram_path),
        volume_data=SamplingData.load(volume_path),
    )
    FeatureDataset.extract_input(
        sampling_length=sampling_length,
        f0_process_mode=f0_process_mode,
        time_mask_max_second=time_mask_max_second,
        time_mask_num=time_mask_num,
        **loaded,
    )
def generate(self):
    """Load the wave plus its aligned features and bundle them into an Input."""
    wave = Wave.load(self.path_wave)
    silence = SamplingData.load(self.path_silence)
    f0 = SamplingData.load(self.path_f0)
    phoneme = SamplingData.load(self.path_phoneme)
    return Input(wave=wave, silence=silence, f0=f0, phoneme=phoneme)
def generate(self):
    """Materialize this lazy item into an Input.

    The phoneme list and the volume are optional: when their path is None
    the corresponding field is None.
    """
    f0 = SamplingData.load(self.f0_path)
    phoneme = SamplingData.load(self.phoneme_path)
    spec = SamplingData.load(self.spec_path)
    silence = SamplingData.load(self.silence_path)

    if self.phoneme_list_path is not None:
        phoneme_list = self.phoneme_class.load_julius_list(self.phoneme_list_path)
    else:
        phoneme_list = None

    if self.volume_path is not None:
        volume = SamplingData.load(self.volume_path)
    else:
        volume = None

    return Input(
        f0=f0,
        phoneme=phoneme,
        spec=spec,
        silence=silence,
        phoneme_list=phoneme_list,
        volume=volume,
    )
def convert_f0(
    model_config: Path,
    input_glob: str,
    input_f0_statistics: Path,
    target_f0_statistics: Path,
    output_dir: Path,
):
    """Shift the F0 channel of every matched feature file by the difference
    of target and input mean F0, saving results into output_dir.

    The F0 column index is located from the array width: either the
    network-configured layout or the fixed (1 + 1 + 40) layout.
    """
    output_dir.mkdir(exist_ok=True)
    save_arguments(output_dir / "arguments.yaml", convert_f0, locals())

    config = Config.from_dict(yaml.safe_load(model_config.open()))

    # The statistics files hold a pickled dict with at least a "mean" entry.
    input_stat = numpy.load(input_f0_statistics, allow_pickle=True).item()
    target_stat = numpy.load(target_f0_statistics, allow_pickle=True).item()
    shift = target_stat["mean"] - input_stat["mean"]

    network = config.network
    wide_width = network.voiced_feature_size + 1 + network.phoneme_feature_size

    for path in tqdm([Path(p) for p in glob(input_glob)], desc="convert_f0"):
        data = SamplingData.load(path)
        width = data.array.shape[1]
        if width == wide_width:
            f0_index = network.voiced_feature_size
        elif width == (1 + 1 + 40):
            f0_index = 1
        else:
            raise ValueError(width)

        data.array[:, f0_index] += shift
        data.save(output_dir / (path.stem + ".npy"))
def process_wo_context(
    local_paths: Sequence[Path],
    speaker_nums: Optional[Sequence[int]],
    generator: Generator,
    postfix="_woc",
):
    """Generate waveforms for each local-feature file and save them as
    "<stem><postfix>.wav".

    Each local array is truncated or edge-padded to (time_length + 5)
    seconds at its own rate before generation.

    NOTE(review): `time_length`, `sampling_policy` and `output_dir` come
    from the enclosing scope — confirm against the caller.
    Errors are printed and swallowed so a batch run can continue.
    """
    try:
        local_datas = [SamplingData.load(local_path) for local_path in local_paths]

        # 5 extra seconds of margin; short arrays are edge-padded at the end.
        size = int((time_length + 5) * local_datas[0].rate)
        local_arrays = [
            local_data.array[:size]
            if len(local_data.array) >= size
            else np.pad(
                local_data.array,
                ((0, size - len(local_data.array)), (0, 0)),
                mode="edge",
            )
            for local_data in local_datas
        ]

        waves = generator.generate(
            time_length=time_length,
            sampling_policy=sampling_policy,
            num_generate=len(local_arrays),
            local_array=np.stack(local_arrays),
            speaker_nums=speaker_nums,
        )
        for wave, local_path in zip(waves, local_paths):
            wave.save(output_dir / (local_path.stem + postfix + ".wav"))
    except Exception:
        # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit; catch Exception so interrupts still abort the run.
        import traceback

        traceback.print_exc()
def __getitem__(self, i):
    """Return one randomly chosen vowel-flagged frame of the i-th item,
    paired with its speaker number (via default_convert)."""
    data = self.datas[i]
    feature = SamplingData.load(data.input_path).array
    vowel = numpy.squeeze(SamplingData.load(data.vowel_path).array)
    speaker_num = data.speaker_num

    # The vowel flags may be slightly shorter than the features; a large
    # mismatch is only warned about, not rejected.
    assert len(vowel) <= len(
        feature), f'{data.input_path.stem} cannot be processed.'
    if abs(len(vowel) - len(feature)) >= 10:
        warn(f'{data.input_path.stem} is not matched.')

    # Keep only the frames whose vowel flag is set, then pick one at random.
    selected = feature[:len(vowel)][vowel]
    choice = numpy.random.randint(len(selected))
    return default_convert(dict(
        input=selected[choice],
        target=speaker_num,
    ))
def generate(self):
    """Build the Input for this item.

    Tries to load the precomputed local feature; if that fails, computes a
    log-melspectrogram from the wave instead and caches it to a temp .npy
    file (rebinding self.path_local so later calls reuse the cache).
    """
    wave = Wave.load(self.path_wave)
    try:
        local = SamplingData.load(self.path_local)
    except Exception:
        # BUG FIX: a bare `except:` also caught KeyboardInterrupt/SystemExit;
        # only ordinary load failures should trigger the fallback.
        local_rate = 80
        local_array = to_log_melspectrogram(wave=wave, rate=local_rate)
        local = SamplingData(array=local_array, rate=local_rate)

        # delete=False on purpose: the file must outlive this call to act
        # as a cache for subsequent generate() calls.
        with NamedTemporaryFile(suffix=".npy", delete=False) as f:
            self.path_local = Path(f.name)
            local.save(self.path_local)
    return Input(
        wave=wave,
        silence=SamplingData.load(self.path_silence),
        local=local,
    )
def process_local_data(local_paths: Sequence[Path], time_length: float):
    """Load local-feature arrays and fit each to (time_length + 1) seconds.

    Arrays longer than the target are truncated; shorter ones are
    edge-padded along the time axis. The rate of the first file defines
    the target length for all of them.
    """
    local_datas = [SamplingData.load(local_path) for local_path in local_paths]
    size = int((time_length + 1) * local_datas[0].rate)

    local_arrays = []
    for local_data in local_datas:
        array = local_data.array
        if len(array) >= size:
            local_arrays.append(array[:size])
        else:
            missing = size - len(array)
            local_arrays.append(
                np.pad(array, ((0, missing), (0, 0)), mode="edge")
            )
    return local_arrays
def process(args: Tuple[int, Path], sampling_lengths: Sequence[int]):
    """For each window length, compute the maximum window-sum of the
    inverted flag array loaded from `path`; return (i_data, vector).

    NOTE(review): the array is presumably boolean (the `~` inversion), so
    each entry is the max count of inverted flags in any window — confirm
    against the data producer.
    """
    i_data, path = args
    flags = ~numpy.squeeze(SamplingData.load(path).array)

    vector = numpy.empty(len(sampling_lengths), dtype=numpy.int32)
    for index, window in enumerate(sampling_lengths):
        kernel = numpy.ones(window, dtype=numpy.int32)
        # mode='valid' keeps only fully-overlapping windows.
        vector[index] = numpy.convolve(kernel, flags, mode='valid').max()
    return i_data, vector
def generate(self):
    """Load phoneme list, accent flags, F0 and optional volume into an Input."""

    def _flags(path):
        # Accent files hold whitespace-separated 0/1 tokens.
        return numpy.array([bool(int(token)) for token in path.read_text().split()])

    return Input(
        phoneme_list=self.phoneme_class.load_julius_list(self.phoneme_list_path),
        start_accent_list=_flags(self.start_accent_list_path),
        end_accent_list=_flags(self.end_accent_list_path),
        start_accent_phrase_list=_flags(self.start_accent_phrase_list_path),
        end_accent_phrase_list=_flags(self.end_accent_phrase_list_path),
        f0=SamplingData.load(self.f0_path),
        volume=(
            SamplingData.load(self.volume_path)
            if self.volume_path is not None
            else None
        ),
    )
def process(
    generator: Generator,
    local_paths: Sequence[Path],
    local_sampling_rate: Optional[int],
    time_length: float,
    speaker_nums: Optional[Sequence[int]],
    sampling_policy: SamplingPolicy,
    output_dir: Path,
    postfix="",
):
    """Generate a waveform per local-feature file and save it as
    "<stem><postfix>.wav" in output_dir.

    Arrays are optionally resampled to local_sampling_rate, then truncated
    or edge-padded to (time_length + 5) seconds.
    """
    local_datas = [SamplingData.load(local_path) for local_path in local_paths]

    if local_sampling_rate is None:
        rate = local_datas[0].rate
        local_arrays = [l.array for l in local_datas]
    else:
        rate = local_sampling_rate
        local_arrays = [l.resample(rate) for l in local_datas]

    # BUG FIX: the target size was computed from local_datas[0].rate even
    # when the arrays had just been resampled to `rate`; after resampling
    # the arrays are at `rate`, so slicing/padding must use it too.
    # (When local_sampling_rate is None the two rates coincide.)
    size = int((time_length + 5) * rate)
    local_arrays = [
        l[:size]
        if len(l) >= size
        else numpy.pad(
            l,
            ((0, size - len(l)), (0, 0)),
            mode="edge",
        )
        for l in local_arrays
    ]

    waves = generator.generate(
        time_length=time_length,
        sampling_policy=sampling_policy,
        num_generate=len(local_arrays),
        local_array=numpy.stack(local_arrays),
        speaker_nums=speaker_nums,
    )
    for wave, local_path in zip(waves, local_paths):
        wave.save(output_dir / (local_path.stem + postfix + ".wav"))
def generate(self):
    """Load the f0, phoneme and silence features for this item."""
    f0 = SamplingData.load(self.f0_path)
    phoneme = SamplingData.load(self.phoneme_path)
    silence = SamplingData.load(self.silence_path)
    return Input(f0=f0, phoneme=phoneme, silence=silence)
def generate(self):
    """Load the wave, silence and local-conditioning features as an Input."""
    wave = Wave.load(self.path_wave)
    silence = SamplingData.load(self.path_silence)
    local = SamplingData.load(self.path_local)
    return Input(wave=wave, silence=silence, local=local)
def generate(
    model_dir: Path,
    model_iteration: Optional[int],
    model_config: Optional[Path],
    output_dir: Path,
    batch_size: int,
    num_test: int,
    from_train_data: bool,
    time_second: float,
    val_local_glob: str,
    val_speaker_id: Optional[int],
    noise_schedule_start: float,
    noise_schedule_stop: float,
    noise_schedule_num: int,
    use_gpu: bool,
):
    """Generate audio with a trained model for dataset samples and,
    optionally, for validation local-feature files matching val_local_glob.

    Output .wav files are written into output_dir, named after the source
    file stems.
    """
    output_dir.mkdir(exist_ok=True)
    # Record the invocation arguments next to the outputs for reproducibility.
    save_arguments(output_dir / "arguments.yaml", generate, locals())

    # Default to the config stored alongside the model.
    if model_config is None:
        model_config = model_dir / "config.yaml"
    config = Config.from_dict(yaml.safe_load(model_config.open()))

    model_path = _get_predictor_model_path(
        model_dir=model_dir,
        iteration=model_iteration,
    )
    print("model path: ", model_path)

    generator = Generator(
        config=config,
        noise_schedule_config=NoiseScheduleModelConfig(
            start=noise_schedule_start,
            stop=noise_schedule_stop,
            num=noise_schedule_num),
        predictor=model_path,
        sampling_rate=config.dataset.sampling_rate,
        use_gpu=use_gpu,
    )

    # One second of local-conditioning padding (in samples at the audio rate).
    local_padding_second = 1
    local_padding_length = config.dataset.sampling_rate * local_padding_second

    # Override dataset lengths so each generated sample spans time_second.
    config.dataset.sampling_length = int(config.dataset.sampling_rate *
                                         time_second)
    config.dataset.local_padding_length = local_padding_length

    dataset = create_dataset(
        config.dataset)["test" if not from_train_data else "train"]
    if isinstance(dataset, SpeakerWavesDataset):
        wave_paths = [
            input.path_wave for input in dataset.wave_dataset.inputs[:num_test]
        ]
    elif isinstance(dataset, WavesDataset):
        wave_paths = [input.path_wave for input in dataset.inputs[:num_test]]
    else:
        # Unknown dataset flavor — nothing to generate from.
        raise Exception()

    for data, wave_path in tqdm(
            zip(chunked(dataset, batch_size), chunked(wave_paths, batch_size)),
            desc="generate",
    ):
        data = concat_examples(data)
        output = generator.generate(
            local=data["local"],
            local_padding_length=local_padding_length,
            # speaker_id is only present for multi-speaker datasets.
            speaker_id=data["speaker_id"] if "speaker_id" in data else None,
        )
        for wave, p in zip(output, wave_path):
            wave.save(output_dir / (p.stem + ".wav"))

    # validation
    if val_local_glob is not None:
        local_paths = sorted([Path(p) for p in glob(val_local_glob)])
        speaker_ids = [val_speaker_id] * len(local_paths)
        for local_path, speaker_id in zip(chunked(local_paths, batch_size),
                                          chunked(speaker_ids, batch_size)):
            datas = [SamplingData.load(p) for p in local_path]
            # Each local array must span time_second plus padding on both
            # sides; short arrays are edge-padded, long ones truncated.
            size = int(
                (time_second + local_padding_second * 2) * datas[0].rate)
            # NOTE(review): arrays are transposed (.T) here but not in the
            # dataset loop above — presumably the dataset already yields
            # (channels, time); confirm against create_dataset.
            local = numpy.stack([
                (data.array[:size].T
                 if len(data.array) >= size else numpy.pad(
                     data.array,
                     ((0, size - len(data.array)), (0, 0)),
                     mode="edge",
                 ).T) for data in datas
            ])
            output = generator.generate(
                local=local,
                local_padding_length=local_padding_length,
                speaker_id=(numpy.stack(speaker_id)
                            if speaker_id[0] is not None else None),
            )
            for wave, p in zip(output, local_path):
                wave.save(output_dir / (p.stem + ".wav"))
def main():
    """Visualize model predictions: plot the predicted coarse distribution
    and coarse+fine argmax against the true waveform for one sample, and
    save the figure to 'output.eps'.

    NOTE(review): reads a module-level `arguments` object — confirm it is
    populated by the surrounding script before main() runs.
    """
    model_dir: Path = arguments.model_dir
    model_iteration: int = arguments.model_iteration
    model_config: Path = arguments.model_config
    time_length: float = arguments.time_length
    gpu: int = arguments.gpu

    config = create_config(model_config)
    model_path = _get_predictor_model_path(model_dir, model_iteration)

    sr = config.dataset.sampling_rate

    model = create_predictor(config.model)
    chainer.serializers.load_npz(str(model_path), model)
    if gpu is not None:
        model.to_gpu(gpu)
        cuda.get_device_from_id(gpu).use()

    # Inference mode: no dropout/batch-norm updates, no gradient graph.
    chainer.global_config.train = False
    chainer.global_config.enable_backprop = False

    wave_paths = sorted(
        [Path(p) for p in glob.glob(str(config.dataset.input_wave_glob))])
    local_paths = sorted(
        [Path(p) for p in glob.glob(str(config.dataset.input_local_glob))])
    assert len(wave_paths) == len(local_paths)

    # Same seed for both shuffles keeps wave/local files paired.
    np.random.RandomState(config.dataset.seed).shuffle(wave_paths)
    np.random.RandomState(config.dataset.seed).shuffle(local_paths)
    wave_path = wave_paths[0]
    local_path = local_paths[0]

    w_data = Wave.load(wave_path, sampling_rate=sr)
    l_data = SamplingData.load(local_path)

    # Trim the waveform so its length is an exact multiple of the
    # local-feature scale (samples per local frame).
    length = int(sr * time_length)
    l_scale = int(sr // l_data.rate)
    l_sl = length // l_scale
    length = l_sl * l_scale

    w = w_data.wave[:length]
    l = l_data.array[:l_sl]
    coarse, fine = encode_16bit(w)

    c, f, hc, hf = model(
        c_array=decode_single(model.xp.asarray(coarse)).astype(
            np.float32)[np.newaxis],
        # NOTE(review): the fine input is trimmed by one sample ([:-1]) —
        # presumably a teacher-forcing shift; confirm against the model.
        f_array=decode_single(model.xp.asarray(fine)).astype(
            np.float32)[:-1][np.newaxis],
        l_array=model.xp.asarray(l)[np.newaxis],
    )

    # Convert the coarse logits to probabilities and pull both outputs to CPU.
    c = chainer.functions.softmax(c)
    c = chainer.cuda.to_cpu(c[0].data)
    f = chainer.cuda.to_cpu(f[0].data)

    fig = plt.figure(figsize=[32 * time_length, 10])
    plt.imshow(c, aspect='auto', interpolation='nearest')
    plt.colorbar()
    # True waveform mapped from [-1, 1] to the coarse bin range [0, 255].
    plt.plot((w + 1) * 127.5, 'g', linewidth=0.1, label='true')
    plt.plot(np.argmax(c, axis=0) + np.argmax(f, axis=0) / 256,
             'r',
             linewidth=0.1,
             label='predicted')
    plt.legend()
    fig.savefig('output.eps')
def generate(self):
    """Load spectrogram and silence data (through the TempCache wrapper)."""
    spectrogram = SamplingData.load(str(TempCache(self.spectrogram_path)))
    silence = SamplingData.load(str(TempCache(self.silence_path)))
    return InputData(spectrogram=spectrogram, silence=silence)
def generate(self):
    """Load f0, phoneme features and the Julius phoneme list as an Input."""
    f0 = SamplingData.load(self.f0_path)
    phoneme = SamplingData.load(self.phoneme_path)
    phoneme_list = JvsPhoneme.load_julius_list(self.phoneme_list_path)
    return Input(f0=f0, phoneme=phoneme, phoneme_list=phoneme_list)
def _load_paths(directory: Path, pattern: str, data_num: Optional[int]):
    """Return sorted files under `directory` matching `pattern` (recursively),
    truncated to the first `data_num` entries when data_num is given."""
    paths = sorted(directory.rglob(pattern))
    if data_num is not None:
        paths = paths[:data_num]
    return paths


def _load_bool_list(path: Path):
    """Parse a file of whitespace-separated 0/1 tokens into a list of bools."""
    return [bool(int(s)) for s in path.read_text().split()]


def create_data(
    f0_dir: Path,
    phoneme_list_dir: Path,
    loudness_dir: Path,
    accent_start_dir: Path,
    accent_end_dir: Path,
    accent_phrase_start_dir: Path,
    accent_phrase_end_dir: Path,
    speaker_valid_filter: Optional[str],
    utterance_valid_filter: Optional[str],
    data_num: Optional[int],
):
    """Load all feature files into InputData records and split them into
    (train_datas, valid_datas).

    Every feature directory must yield exactly as many files as f0_dir.
    A record goes to the valid split when either filter string appears in
    its name (the f0 file stem); otherwise it goes to train.
    """
    # The f0 files define the dataset size; every other list must match it.
    f0_paths = _load_paths(f0_dir, "*.npy", data_num)
    assert len(f0_paths) > 0

    phoneme_list_paths = _load_paths(phoneme_list_dir, "*.lab", data_num)
    assert len(f0_paths) == len(phoneme_list_paths)

    loudness_paths = _load_paths(loudness_dir, "*.npy", data_num)
    assert len(f0_paths) == len(loudness_paths)

    accent_start_paths = _load_paths(accent_start_dir, "*.txt", data_num)
    assert len(f0_paths) == len(accent_start_paths)

    accent_end_paths = _load_paths(accent_end_dir, "*.txt", data_num)
    assert len(f0_paths) == len(accent_end_paths)

    accent_phrase_start_paths = _load_paths(
        accent_phrase_start_dir, "*.txt", data_num)
    assert len(f0_paths) == len(accent_phrase_start_paths)

    accent_phrase_end_paths = _load_paths(
        accent_phrase_end_dir, "*.txt", data_num)
    assert len(f0_paths) == len(accent_phrase_end_paths)

    datas = [
        InputData(
            name=f0_path.stem,
            f0=SamplingData.load(f0_path),
            phoneme_list=JvsPhoneme.load_julius_list(phoneme_list_path),
            loudness=SamplingData.load(loudness_path),
            accent_start=_load_bool_list(accent_start_path),
            accent_end=_load_bool_list(accent_end_path),
            accent_phrase_start=_load_bool_list(accent_phrase_start_path),
            accent_phrase_end=_load_bool_list(accent_phrase_end_path),
        )
        for (
            f0_path,
            phoneme_list_path,
            loudness_path,
            accent_start_path,
            accent_end_path,
            accent_phrase_start_path,
            accent_phrase_end_path,
        ) in zip(
            f0_paths,
            phoneme_list_paths,
            loudness_paths,
            accent_start_paths,
            accent_end_paths,
            accent_phrase_start_paths,
            accent_phrase_end_paths,
        )
    ]

    train_datas: List[InputData] = []
    valid_datas: List[InputData] = []
    for d in datas:
        is_valid = (
            speaker_valid_filter is not None and speaker_valid_filter in d.name
        ) or (
            utterance_valid_filter is not None
            and utterance_valid_filter in d.name
        )
        (valid_datas if is_valid else train_datas).append(d)
    return train_datas, valid_datas