def calc_mcd(
    path1: Optional[Path] = None,
    path2: Optional[Path] = None,
    wave1: Optional[Wave] = None,
    wave2: Optional[Wave] = None,
):
    wave1 = Wave.load(path1) if wave1 is None else wave1
    wave2 = Wave.load(path2) if wave2 is None else wave2
    assert wave1.sampling_rate == wave2.sampling_rate

    sampling_rate = wave1.sampling_rate

    # Truncate both waveforms to the shorter length so the frames align.
    min_length = min(len(wave1.wave), len(wave2.wave))
    wave1.wave = wave1.wave[:min_length]
    wave2.wave = wave2.wave[:min_length]

    mc1 = to_melcepstrum(
        x=wave1.wave,
        sampling_rate=sampling_rate,
        n_fft=2048,
        win_length=1024,
        hop_length=256,
        order=24,
    )
    mc2 = to_melcepstrum(
        x=wave2.wave,
        sampling_rate=sampling_rate,
        n_fft=2048,
        win_length=1024,
        hop_length=256,
        order=24,
    )
    return _mcd(mc1, mc2)
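# A minimal usage sketch for calc_mcd, assuming two recordings of the same
# utterance at the same sampling rate; the paths below are hypothetical.
from pathlib import Path

reference_path = Path("data/reference/utt001.wav")  # hypothetical path
generated_path = Path("data/generated/utt001.wav")  # hypothetical path

# Either file paths or pre-loaded Wave objects can be passed.
mcd = calc_mcd(path1=reference_path, path2=generated_path)
print(f"MCD: {mcd:.3f}")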
def forward(
    self,
    wave: Tensor,
    local: Tensor,
    speaker_id: Optional[Tensor] = None,
):
    batch_size = len(wave)

    local_padding_length = int(
        self.generator.sampling_rate * self.local_padding_time_second
    )

    output = self.generator.generate(
        local=local,
        local_padding_length=local_padding_length,
        speaker_id=speaker_id,
    )

    mcd_list = []
    for wi, wo in zip(wave.cpu().numpy(), output):
        wi = Wave(wave=wi, sampling_rate=wo.sampling_rate)
        try:
            mcd = calc_mcd(wave1=wi, wave2=wo)
        except Exception:
            # A degenerate waveform can make the mel-cepstrum computation fail;
            # record NaN instead of aborting the whole evaluation batch.
            mcd = numpy.nan
        mcd_list.append(mcd)

    scores = {
        "mcd": (numpy.mean(mcd_list), batch_size),
    }

    report(scores, self)
    return scores
def generate(
    self,
    local: Union[numpy.ndarray, torch.Tensor],
    local_padding_length: int = 0,
    speaker_id: Optional[Union[numpy.ndarray, torch.Tensor]] = None,
):
    if isinstance(local, numpy.ndarray):
        local = torch.from_numpy(local)
    local = local.to(self.device)

    if speaker_id is not None:
        if isinstance(speaker_id, numpy.ndarray):
            speaker_id = torch.from_numpy(speaker_id)
        speaker_id = speaker_id.to(self.device)

    with torch.no_grad():
        output = self.inference_forward(
            local=local,
            local_padding_length=local_padding_length,
            speaker_id=speaker_id,
        )

    waves = output.cpu().numpy()
    if self.mulaw:
        waves = decode_mulaw(waves)
    return [Wave(wave=wave, sampling_rate=self.sampling_rate) for wave in waves]
def generate(self):
    return Input(
        wave=Wave.load(self.path_wave),
        silence=SamplingData.load(self.path_silence),
        f0=SamplingData.load(self.path_f0),
        phoneme=SamplingData.load(self.path_phoneme),
    )
def __call__(
    self,
    wave: Tensor,
    silence: Tensor,
    local: Tensor,
    source: Tensor,
    source2: Tensor,
    signal: Tensor,
    speaker_id: Optional[Tensor] = None,
):
    batch_size = len(wave)

    local_padding_length = int(
        self.generator.sampling_rate * self.local_padding_time_length
    )

    output = self.generator.generate(
        local=local,
        source=source,
        speaker_id=speaker_id,
        local_padding_length=local_padding_length,
    )

    mcd_list = []
    for wi, wo in zip(wave.cpu().numpy(), output):
        wi = Wave(wave=wi, sampling_rate=wo.sampling_rate)
        mcd = calc_mcd(wave1=wi, wave2=wo)
        mcd_list.append(mcd)

    scores = {
        "mcd": (numpy.mean(mcd_list), batch_size),
    }

    report(scores, self)
    return scores
def __getitem__(self, i: int):
    sampling_rate = self.sampling_rate
    length = self.sampling_length
    frequency = numpy.random.uniform(self.frequency_range[0], self.frequency_range[1])
    rand = numpy.random.rand()

    # A pure sine wave with a random phase offset.
    wave = numpy.sin(
        (2 * numpy.pi)
        * (numpy.arange(length, dtype=numpy.float32) * frequency / sampling_rate + rand)
    )
    # The local conditioning feature is the (constant) log frequency.
    local = numpy.log(
        numpy.ones(shape=(length // self.local_scale, 1), dtype=numpy.float32) * frequency
    )
    # numpy.bool was removed in NumPy 1.24; use the builtin bool dtype instead.
    silence = numpy.zeros(shape=(length,), dtype=bool)

    return default_convert(
        self.make_input(
            wave_data=Wave(wave=wave, sampling_rate=sampling_rate),
            silence_data=SamplingData(array=silence, rate=sampling_rate),
            local_data=SamplingData(array=local, rate=sampling_rate // self.local_scale),
        )
    )
def __call__(
    self,
    wave: np.ndarray,
    local: Optional[np.ndarray],
    speaker_num: Optional[np.ndarray] = None,
):
    batchsize = len(wave)
    wave = chainer.cuda.to_cpu(wave)

    wave_output = self.generator.generate(
        time_length=self.time_length + self.local_padding_time_length * 2,
        sampling_policy=self.sampling_policy,
        num_generate=batchsize,
        local_array=local,
        speaker_nums=speaker_num,
    )

    mcd_list = []
    for wi, wo in zip(wave, wave_output):
        wi = Wave(wave=wi, sampling_rate=wo.sampling_rate)

        # Trim the padded region before comparing against the reference.
        if self.local_padding_time_length > 0:
            pad = int(wo.sampling_rate * self.local_padding_time_length)
            wo.wave = wo.wave[pad:-pad]

        mcd = calc_mcd(wave1=wi, wave2=wo)
        mcd_list.append(mcd)

    scores = {"mcd": (self.generator.xp.asarray(mcd_list).mean(), batchsize)}
    chainer.report(scores, self)
    return scores
def generate(
    self,
    local: Union[numpy.ndarray, torch.Tensor],
    source: Union[numpy.ndarray, torch.Tensor],
    speaker_id: Optional[Union[numpy.ndarray, torch.Tensor]] = None,
    local_padding_length: int = 0,
):
    if isinstance(local, numpy.ndarray):
        local = torch.from_numpy(local)
    local = local.to(self.device)

    if isinstance(source, numpy.ndarray):
        source = torch.from_numpy(source)
    source = source.to(self.device)

    if speaker_id is not None:
        if isinstance(speaker_id, numpy.ndarray):
            speaker_id = torch.from_numpy(speaker_id)
        speaker_id = speaker_id.to(self.device)

    # generate
    with torch.no_grad():
        output = self.predictor(
            source=source,
            local=local,
            local_padding_length=local_padding_length,
            speaker_id=speaker_id,
        )

    output = output.cpu().numpy()
    return [Wave(wave=o, sampling_rate=self.sampling_rate) for o in output]
def test_convert_to_dict(self):
    sampling_rate = 800
    local_sampling_rate = 200
    scale = sampling_rate // local_sampling_rate
    time_length = 10
    sampling_length = 16

    wave_data = Wave(
        wave=numpy.linspace(
            0,
            sampling_rate * time_length,
            sampling_rate * time_length,
            endpoint=False,
        ),
        sampling_rate=sampling_rate,
    )
    silence_data = SamplingData(
        array=numpy.zeros((sampling_rate * time_length,), dtype=bool),
        rate=sampling_rate,
    )
    local_data = SamplingData(
        array=numpy.linspace(
            0,
            sampling_rate * time_length,
            local_sampling_rate * time_length,
            endpoint=False,
        ),
        rate=local_sampling_rate,
    )

    wave, silence, local = BaseWaveDataset.extract_input(
        sampling_length,
        wave_data=wave_data,
        silence_data=silence_data,
        local_data=local_data,
        local_sampling_rate=local_sampling_rate,
        local_padding_size=0,
        local_mask_max_second=0,
        local_mask_num=0,
    )

    dataset = BaseWaveDataset(
        sampling_rate=sampling_rate,
        sampling_length=sampling_length,
        bit=10,
        mulaw=False,
        wave_random_max_second=0,
        wave_random_num=0,
        local_sampling_rate=local_sampling_rate,
        local_padding_size=0,
        local_mask_max_second=0,
        local_mask_num=0,
    )
    d = dataset.convert_input(wave, silence, local)
    self.assertEqual(len(d["coarse"]), sampling_length)
    self.assertEqual(len(d["encoded_coarse"]), sampling_length)
    self.assertEqual(len(d["silence"]), sampling_length - 1)
    self.assertEqual(len(d["local"]), sampling_length // scale)
def test_extract_input(self):
    for sampling_rate, local_sampling_rate, sampling_length, time_length in [
        [800, 200, 16, 10],
        [24000, 24000 / 256, 1024, 100],
    ]:
        with self.subTest(
            sampling_rate=sampling_rate,
            local_sampling_rate=local_sampling_rate,
            sampling_length=sampling_length,
            time_length=time_length,
        ):
            scale = sampling_rate // local_sampling_rate

            wave_data = Wave(
                wave=numpy.linspace(
                    0,
                    int(sampling_rate * time_length),
                    int(sampling_rate * time_length),
                    endpoint=False,
                ),
                sampling_rate=sampling_rate,
            )
            silence_data = SamplingData(
                array=numpy.zeros((sampling_rate * time_length,), dtype=bool),
                rate=sampling_rate,
            )
            local_data = SamplingData(
                array=numpy.linspace(
                    0,
                    int(sampling_rate * time_length),
                    int(local_sampling_rate * time_length),
                    endpoint=False,
                ),
                rate=local_sampling_rate,
            )

            for _ in range(10):
                wave, silence, local = BaseWaveDataset.extract_input(
                    sampling_length,
                    wave_data=wave_data,
                    silence_data=silence_data,
                    local_data=local_data,
                    local_sampling_rate=local_sampling_rate,
                    local_padding_size=0,
                    local_mask_max_second=0,
                    local_mask_num=0,
                )

                self.assertEqual(len(wave), sampling_length)
                self.assertEqual(len(silence), sampling_length)
                self.assertEqual(len(local), sampling_length // scale)

                # The wave is a ramp, so the minimum of each local-scale block
                # must equal the corresponding local feature value.
                wave_as_local = wave.reshape(int(sampling_length // scale), -1).min(axis=1)
                self.assertTrue(numpy.all(wave_as_local == local))
def _process(path: Path, bit: int, gaussian_noise_sigma: float):
    wave = Wave.load(path).wave
    if gaussian_noise_sigma > 0:
        wave += numpy.random.randn(*wave.shape) * gaussian_noise_sigma

    encoded = encode_single(encode_mulaw(wave, mu=2 ** bit), bit=bit)
    return numpy.histogram(encoded, bins=2 ** bit, range=(0, 2 ** bit))[0].astype(
        numpy.uint64
    )
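# Illustrative sketch: _process yields one histogram per file, so the
# corpus-wide distribution of mu-law-encoded sample values is the element-wise
# sum of the per-file histograms. The directory below is a hypothetical
# placeholder.
from functools import partial
from multiprocessing import Pool
from pathlib import Path

paths = sorted(Path("data/wave").glob("*.wav"))  # hypothetical corpus
with Pool() as pool:
    histograms = pool.map(partial(_process, bit=10, gaussian_noise_sigma=0.0), paths)

total = numpy.sum(histograms, axis=0)  # shape: (2 ** bit,)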
def calc_silence_rate(
    path1: Optional[Path] = None,
    path2: Optional[Path] = None,
    wave1: Optional[Wave] = None,
    wave2: Optional[Wave] = None,
):
    wave1 = Wave.load(path1) if wave1 is None else wave1
    wave2 = Wave.load(path2) if wave2 is None else wave2
    assert wave1.sampling_rate == wave2.sampling_rate

    # NOTE: _signal_to_frame_nonsilent is a private librosa helper (used by
    # librosa.effects.trim/split) and may change between librosa versions.
    silence1 = ~librosa.effects._signal_to_frame_nonsilent(wave1.wave)
    silence2 = ~librosa.effects._signal_to_frame_nonsilent(wave2.wave)

    # Confusion-matrix terms, with "silent" as the positive class.
    tp = numpy.logical_and(silence1, silence2).sum(dtype=float)
    tn = numpy.logical_and(~silence1, ~silence2).sum(dtype=float)
    fn = numpy.logical_and(silence1, ~silence2).sum(dtype=float)
    fp = numpy.logical_and(~silence1, silence2).sum(dtype=float)

    accuracy = (tp + tn) / (tp + tn + fn + fp)
    return accuracy
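# Worked toy example of the confusion-matrix arithmetic above, with "silent"
# as the positive class: tp = 2, tn = 1, fn = 1, fp = 0, so accuracy is
# (2 + 1) / 4 = 0.75.
silence1 = numpy.array([True, True, False, True])
silence2 = numpy.array([True, True, False, False])

tp = numpy.logical_and(silence1, silence2).sum(dtype=float)
tn = numpy.logical_and(~silence1, ~silence2).sum(dtype=float)
fn = numpy.logical_and(silence1, ~silence2).sum(dtype=float)
fp = numpy.logical_and(~silence1, silence2).sum(dtype=float)
assert (tp + tn) / (tp + tn + fn + fp) == 0.75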
def setUp(self):
    waves = [
        np.ones(self.num // 2) * -1,
        np.ones(self.num // 2),
    ]
    self.inputs = [
        Input(
            wave=Wave(wave=w, sampling_rate=self.sampling_rate),
            local=SamplingData(array=np.empty((len(w), 0)), rate=self.sampling_rate),
            silence=SamplingData(array=np.zeros((len(w),), dtype=bool), rate=self.sampling_rate),
        )
        for w in waves
    ]
def generate(self):
    wave = Wave.load(self.path_wave)

    try:
        local = SamplingData.load(self.path_local)
    except Exception:
        # Fall back to computing the log mel-spectrogram on the fly, and cache
        # it to a temporary file so later calls can load it directly.
        local_rate = 80
        local_array = to_log_melspectrogram(wave=wave, rate=local_rate)
        local = SamplingData(array=local_array, rate=local_rate)

        with NamedTemporaryFile(suffix=".npy", delete=False) as f:
            self.path_local = Path(f.name)
            local.save(self.path_local)

    return Input(
        wave=wave,
        silence=SamplingData.load(self.path_silence),
        local=local,
    )
def __call__(
    self,
    wave: Tensor,
    local: Optional[Tensor],
    speaker_num: Optional[Tensor] = None,
):
    batchsize = len(wave)

    wave_output = self.generator.generate(
        time_length=self.time_length + self.local_padding_time_length * 2,
        sampling_policy=self.sampling_policy,
        num_generate=batchsize,
        local_array=local,
        speaker_nums=speaker_num,
    )

    mcd_list = []
    sil_acc_list = []
    for wi, wo in zip(wave.cpu().numpy(), wave_output):
        wi = Wave(wave=wi, sampling_rate=wo.sampling_rate)

        if self.local_padding_time_length > 0:
            pad = int(wo.sampling_rate * self.local_padding_time_length)
            wo.wave = wo.wave[pad:-pad]

        mcd = calc_mcd(wave1=wi, wave2=wo)
        mcd_list.append(mcd)

        accuracy = calc_silence_rate(wave1=wi, wave2=wo)
        sil_acc_list.append(accuracy)

    scores = {
        "mcd": (numpy.mean(mcd_list), batchsize),
        "sil_acc": (numpy.mean(sil_acc_list), batchsize),
    }

    report(scores, self)
    return scores
def process(
    input_paths: Tuple[Path, Path],
    output_dir: Path,
):
    input_wave, input_f0 = input_paths
    wave_data = Wave.load(input_wave)
    f0_data = F0.load(input_f0)

    y = wave_data.wave.astype(np.float64)
    sr = wave_data.sampling_rate

    # F0 is stored as log F0; the optional second column is the V/UV flag,
    # which must be cast to bool before it can be used as a mask.
    f0 = np.exp(f0_data.array[:, 0].astype(np.float64))
    if f0_data.with_vuv:
        f0[~f0_data.array[:, 1].astype(bool)] = 0

    t = np.arange(0, len(f0), dtype=np.float64) / f0_data.rate
    sp = pyworld.cheaptrick(y, f0, t, sr)
    ap = pyworld.d4c(y, f0, t, sr)
    y = pyworld.synthesize(f0, sp, ap, sr)

    out = output_dir / f"{input_f0.stem}.wav"
    # librosa.output.write_wav was removed in librosa 0.8; write via soundfile.
    soundfile.write(out, y.astype(np.float32), sr)
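# Standalone sketch of the same WORLD analysis/synthesis round trip on a
# plain wav file, without the project's Wave/F0 wrappers. "input.wav" is a
# hypothetical placeholder; pyworld.wav2world bundles the F0 estimation,
# cheaptrick, and d4c steps used above.
import numpy as np
import pyworld
import soundfile

y, sr = soundfile.read("input.wav")
y = y.astype(np.float64)  # WORLD expects float64 mono audio

f0, sp, ap = pyworld.wav2world(y, sr)
y_hat = pyworld.synthesize(f0, sp, ap, sr)
soundfile.write("resynthesized.wav", y_hat.astype(np.float32), sr)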
def test_extract_input_with_local_padding(self):
    for (
        sampling_rate,
        local_sampling_rate,
        sampling_length,
        time_length,
        local_padding_size,
    ) in [
        [800, 200, 16, 1, 100],
        [24000, 24000 / 256, 1024, 4, 1024],
    ]:
        with self.subTest(
            sampling_rate=sampling_rate,
            local_sampling_rate=local_sampling_rate,
            sampling_length=sampling_length,
            time_length=time_length,
            local_padding_size=local_padding_size,
        ):
            scale = sampling_rate // local_sampling_rate

            wave_data = Wave(
                wave=np.linspace(
                    0,
                    int(sampling_rate * time_length),
                    int(sampling_rate * time_length),
                    endpoint=False,
                ),
                sampling_rate=sampling_rate,
            )
            silence_data = SamplingData(
                array=np.zeros((sampling_rate * time_length,), dtype=bool),
                rate=sampling_rate,
            )
            local_data = SamplingData(
                array=np.linspace(
                    0,
                    int(sampling_rate * time_length),
                    int(local_sampling_rate * time_length),
                    endpoint=False,
                ),
                rate=local_sampling_rate,
            )

            for _ in range(10000):
                wave, silence, local = BaseWaveDataset.extract_input(
                    sampling_length,
                    wave_data=wave_data,
                    silence_data=silence_data,
                    local_data=local_data,
                    local_padding_size=local_padding_size,
                    padding_value=np.nan,
                )

                self.assertEqual(len(wave), sampling_length)
                self.assertEqual(len(silence), sampling_length)
                self.assertEqual(
                    len(local), (sampling_length + local_padding_size * 2) // scale
                )

                num_pad = np.isnan(local).sum()
                self.assertLessEqual(num_pad, local_padding_size)
                self.assertTrue(not np.isnan(local[0]) or not np.isnan(local[-1]))

                wave_as_local = wave.reshape(int(sampling_length // scale), -1).min(axis=1)
                pad = int(local_padding_size // scale)
                local_wo_pad = local[pad:-pad]
                self.assertTrue(np.all(wave_as_local == local_wo_pad))
def main_forward(
    self,
    length: int,
    sampling_policy: SamplingPolicy,
    num_generate: int,
    local_array: np.ndarray = None,
    s_one: np.ndarray = None,
):
    if self.model.with_local:
        with chainer.using_config("train", False), chainer.using_config(
            "enable_backprop", False
        ):
            local_array = self.model.forward_encode(l_array=local_array, s_one=s_one).data

    c = self.xp.zeros([num_generate], dtype=np.float32)
    c = encode_single(c, bit=self.single_bit)

    hidden_coarse = self.model.gru.init_hx(local_array)[0].data

    if self.use_cpp_inference and sampling_policy == SamplingPolicy.random:
        import yukarin_autoreg_cpp

        wave = np.zeros((length, num_generate), dtype=np.int32)
        yukarin_autoreg_cpp.inference(
            batch_size=num_generate,
            length=length,
            output=wave,
            x=to_numpy(c),
            l_array=to_numpy(self.xp.transpose(local_array, (1, 0, 2))),
            hidden=to_numpy(hidden_coarse),
        )
    else:
        if sampling_policy == SamplingPolicy.random:
            fast_forward_params = get_fast_forward_params(self.model)
            w_list = fast_generate(
                length=length,
                x=c,
                l_array=local_array,
                h=hidden_coarse,
                **fast_forward_params,
            )
        else:
            w_list = []
            hc = hidden_coarse
            for i in tqdm(range(length), desc="generate"):
                with chainer.using_config("train", False), chainer.using_config(
                    "enable_backprop", False
                ):
                    c, hc = self.model.forward_one(
                        prev_x=c,
                        prev_l=local_array[:, i],
                        hidden=hc,
                    )

                if sampling_policy == SamplingPolicy.random:
                    is_random = True
                elif sampling_policy == SamplingPolicy.maximum:
                    is_random = False
                else:
                    raise ValueError(sampling_policy)

                c = self.model.sampling(c, maximum=not is_random)
                w_list.append(c)

        wave = self.xp.stack(w_list)
        wave = cuda.to_cpu(wave)

    wave = wave.T
    wave = decode_single(wave, bit=self.single_bit)
    if self.mulaw:
        wave = decode_mulaw(wave, mu=2**self.single_bit)

    return [Wave(wave=w_one, sampling_rate=self.sampling_rate) for w_one in wave]
def test_convert_to_dict(self):
    sampling_rate = 800
    local_sampling_rate = 200
    scale = sampling_rate // local_sampling_rate
    time_length = 10
    sampling_length = 16

    wave_data = Wave(
        wave=np.linspace(
            0,
            sampling_rate * time_length,
            sampling_rate * time_length,
            endpoint=False,
        ),
        sampling_rate=sampling_rate,
    )
    silence_data = SamplingData(
        array=np.zeros((sampling_rate * time_length,), dtype=bool),
        rate=sampling_rate,
    )
    local_data = SamplingData(
        array=np.linspace(
            0,
            sampling_rate * time_length,
            local_sampling_rate * time_length,
            endpoint=False,
        ),
        rate=local_sampling_rate,
    )

    wave, silence, local = BaseWaveDataset.extract_input(
        sampling_length,
        wave_data=wave_data,
        silence_data=silence_data,
        local_data=local_data,
        local_padding_size=0,
    )

    # 16-bit mode: coarse/fine pairs are produced.
    dataset = BaseWaveDataset(
        sampling_length=sampling_length,
        to_double=True,
        bit=16,
        mulaw=False,
        local_padding_size=0,
    )
    d = dataset.convert_to_dict(wave, silence, local)
    self.assertEqual(len(d['coarse']), sampling_length)
    self.assertEqual(len(d['fine']), sampling_length - 1)
    self.assertEqual(len(d['encoded_coarse']), sampling_length)
    self.assertEqual(len(d['encoded_fine']), sampling_length)
    self.assertEqual(len(d['silence']), sampling_length - 1)
    self.assertEqual(len(d['local']), sampling_length // scale)

    # Single 10-bit mode: no fine component.
    dataset = BaseWaveDataset(
        sampling_length=sampling_length,
        to_double=False,
        bit=10,
        mulaw=False,
        local_padding_size=0,
    )
    d = dataset.convert_to_dict(wave, silence, local)
    self.assertEqual(len(d['coarse']), sampling_length)
    self.assertIsNone(d['fine'])
    self.assertEqual(len(d['encoded_coarse']), sampling_length)
    self.assertIsNone(d['encoded_fine'])
    self.assertEqual(len(d['silence']), sampling_length - 1)
    self.assertEqual(len(d['local']), sampling_length // scale)
def generate_dataset(
    dataset_directory: Path,
    data_num: int,
    sampling_rate: int,
    local_rate: int,
    phoneme_size: int,
    speaker_size: int,
):
    # Clear any existing files, or create the directory tree.
    if dataset_directory.exists():
        for p in dataset_directory.rglob("*"):
            if not p.is_dir():
                p.unlink()
    else:
        dataset_directory.mkdir()

    f0_dir = dataset_directory.joinpath("f0")
    phoneme_dir = dataset_directory.joinpath("phoneme")
    wave_dir = dataset_directory.joinpath("wave")
    silence_dir = dataset_directory.joinpath("silence")

    f0_dir.mkdir(exist_ok=True)
    phoneme_dir.mkdir(exist_ok=True)
    wave_dir.mkdir(exist_ok=True)
    silence_dir.mkdir(exist_ok=True)

    for i_data in range(data_num):
        local_length = int(numpy.random.randint(low=100, high=200))
        sampling_length = int(local_length / local_rate * sampling_rate)

        # Random log F0 with roughly 20% unvoiced (zero) frames.
        f0 = numpy.random.rand(local_length, 1).astype(numpy.float32)
        f0[f0 < 0.2] = 0
        f0 *= 7
        SamplingData(array=f0, rate=local_rate).save(f0_dir.joinpath(f"{i_data}.npy"))

        # Random one-hot phoneme sequence.
        phoneme = numpy.random.randint(0, phoneme_size, size=local_length).astype(
            numpy.int32
        )
        phoneme = numpy.identity(phoneme_size)[phoneme].astype(numpy.int32)
        SamplingData(array=phoneme, rate=local_rate).save(
            phoneme_dir.joinpath(f"{i_data}.npy")
        )

        # A sine wave whose frequency follows exp(F0), with a random phase.
        rand = numpy.random.rand()
        wave = numpy.concatenate(
            [
                numpy.sin(
                    (2 * numpy.pi)
                    * (
                        numpy.arange(sampling_length // len(f0), dtype=numpy.float32)
                        * numpy.exp(one_f0)
                        / sampling_rate
                        + rand
                    )
                )
                for one_f0 in f0.tolist()
            ]
        )
        Wave(wave=wave, sampling_rate=sampling_rate).save(
            wave_dir.joinpath(f"{i_data}.wav")
        )

        silence = numpy.zeros_like(wave).astype(bool)
        SamplingData(array=silence, rate=sampling_rate).save(
            silence_dir.joinpath(f"{i_data}.npy")
        )

    # Assign data ids to speakers round-robin.
    speaker_dict = defaultdict(list)
    for i_data in range(data_num):
        speaker_dict[str(i_data % speaker_size)].append(str(i_data))
    json.dump(speaker_dict, dataset_directory.joinpath("speaker_dict.json").open("w"))
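# Example invocation for a small smoke-test dataset; all values are arbitrary.
generate_dataset(
    dataset_directory=Path("test_dataset"),
    data_num=10,
    sampling_rate=24000,
    local_rate=200,
    phoneme_size=40,
    speaker_size=2,
)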
async def to_feature(text: str = Form(...), wave: UploadFile = File(...)):
    with TemporaryDirectory() as d:
        tmp_dir = Path(d)
        input_audio_path = tmp_dir.joinpath("input.wav")
        input_audio_path.write_bytes(await wave.read())

        # openjtalk
        phonemes = [
            p.label
            for p in openjtalk_label_getter(
                text,
                openjtalk_command="open_jtalk",
                dict_path=Path("/var/lib/mecab/dic/open-jtalk/naist-jdic"),
                htsvoice_path=Path(
                    "/usr/share/hts-voice/nitech-jp-atr503-m001/nitech_jp_atr503_m001.htsvoice"
                ),
                output_wave_path=tmp_dir.joinpath("wave.wav"),
                output_log_path=tmp_dir.joinpath("log.txt"),
                output_type=OutputType.phoneme,
                without_span=False,
            )
        ]

        # julius
        julius_audio_path = tmp_dir.joinpath("julius.wav")
        subprocess.check_call(
            f"sox {input_audio_path} -r 16000 -b 16 {julius_audio_path}".split()
        )

        julius_phonemes = [
            p if p not in _jvs_to_julius else _jvs_to_julius[p]
            for p in phonemes
            if p != "sil"
        ]

        julius_dict_path = tmp_dir.joinpath("2nd.dict")
        julius_dict = sp_inserter.gen_julius_dict_2nd(
            " ".join(julius_phonemes), model_type=sp_inserter.ModelType.gmm
        )
        julius_dict_path.write_text(julius_dict)

        julius_dfa_path = tmp_dir.joinpath("2nd.dfa")
        julius_dfa = sp_inserter.gen_julius_aliment_dfa(julius_dict.count("\n"))
        julius_dfa_path.write_text(julius_dfa)

        julius_output = sp_inserter.julius_phone_alignment(
            str(julius_audio_path),
            str(tmp_dir.joinpath("2nd")),
            _hmm_model,
            model_type=sp_inserter.ModelType.gmm,
            options=None,
        )

        time_alignment_list = sp_inserter.frame_to_second(
            sp_inserter.get_time_alimented_list(julius_output)
        )

        # Drop "pau" phonemes that Julius did not align as short pauses.
        i_phoneme = 0
        new_phonemes = []
        for p in phonemes:
            if p == "pau" and time_alignment_list[i_phoneme][2] != "sp":
                continue
            i_phoneme += 1
            new_phonemes.append(p)

        aligned = JvsPhoneme.convert(
            [
                JvsPhoneme(start=float(o[0]), end=float(o[1]), phoneme=p)
                for p, o in zip(new_phonemes, time_alignment_list)
            ]
        )
        for p in aligned:
            p.verify()

        # world
        f0 = F0.from_wave(
            Wave.load(input_audio_path, sampling_rate=24000, dtype=numpy.float64),
            frame_period=5.0,
            f0_floor=71.0,
            f0_ceil=800,
            with_vuv=False,
            f0_type=F0Type.world,
        )
        converted_f0 = f0.convert(
            input_mean=f0.valid_f0_log.mean(),
            input_var=f0.valid_f0_log.var(),
            target_mean=_voiro_mean,
            target_var=f0.valid_f0_log.var(),
        )
        converted_f0.array = converted_f0.array.astype(numpy.float32).reshape(-1, 1)

        # feature
        phoneme_array = LinguisticFeature(
            phonemes=aligned,
            phoneme_class=JvsPhoneme,
            rate=_feature_rate,
            feature_types=[LinguisticFeature.FeatureType.PHONEME],
        ).make_array()
        phoneme = SamplingData(array=phoneme_array, rate=_feature_rate)

        feature = SamplingData.collect(
            [converted_f0, phoneme],
            rate=_feature_rate,
            mode="min",
            error_time_length=0.015,
        )

        return StreamingResponse(BytesIO(feature.astype(numpy.float32).tobytes()))
def generate(
    self,
    time_length: float,
    sampling_policy: SamplingPolicy,
    num_generate: int,
    local_array: Optional[Union[numpy.ndarray, Tensor]] = None,
    speaker_nums: Optional[Union[Sequence[int], Tensor]] = None,
):
    assert num_generate <= self.max_batch_size
    assert local_array is None or len(local_array) == num_generate
    assert speaker_nums is None or len(speaker_nums) == num_generate

    length = int(self.sampling_rate * time_length)

    if local_array is None:
        local_array = torch.empty((num_generate, length, 0)).float()
    local_array = to_tensor(local_array).to(self.device)

    if speaker_nums is not None:
        speaker_nums = to_tensor(speaker_nums).reshape((-1,)).to(self.device)
        with torch.no_grad():
            s_one = self.predictor.forward_speaker(speaker_nums)
    else:
        s_one = None

    if self.predictor.with_local:
        with torch.no_grad():
            local_array = self.predictor.forward_encode(l_array=local_array, s_one=s_one)

    x = numpy.zeros(num_generate, dtype=numpy.float32)
    x = encode_single(x, bit=self.bit_size)

    hidden = numpy.zeros(
        (num_generate, self.predictor.gru.hidden_size),
        dtype=numpy.float32,
    )

    if sampling_policy == SamplingPolicy.corrected_random:
        low_probability_threshold = -18
    else:
        low_probability_threshold = -999

    if self.use_fast_inference and self.use_gpu:
        assert sampling_policy in [SamplingPolicy.random]

        import yukarin_autoreg_cpp

        wave = numpy.zeros((length, num_generate), dtype=numpy.int32)
        yukarin_autoreg_cpp.inference(
            batch_size=num_generate,
            length=length,
            output=wave,
            x=x.astype(numpy.int32),
            l_array=to_numpy(local_array.transpose(0, 1)),
            hidden=to_numpy(hidden),
        )
    elif self.use_fast_inference and not self.use_gpu:
        assert sampling_policy == SamplingPolicy.random

        params = get_fast_forward_params(self.predictor)
        x_list = fast_generate(
            length=length,
            x=x,
            l_array=local_array.numpy(),
            h=hidden,
            **params,
        )
        wave = numpy.stack(x_list)
    else:
        with torch.no_grad():
            x = to_tensor(x).to(self.device)
            x_max = x
            hidden = to_tensor(hidden).to(self.device)

            x_list = []
            for i in tqdm(range(length), desc="generate"):
                d_max, _ = self.predictor.forward_one(
                    prev_x=x_max, prev_l=local_array[:, i], hidden=hidden
                )
                d, hidden = self.predictor.forward_one(
                    prev_x=x, prev_l=local_array[:, i], hidden=hidden
                )

                if sampling_policy == SamplingPolicy.maximum:
                    is_random = False
                else:
                    is_random = True

                # Suppress samples that the greedy path considers very unlikely.
                d[
                    F.log_softmax(d_max.double(), dim=1) < low_probability_threshold
                ] -= 200

                x = self.predictor.sampling(d, maximum=not is_random)
                x_max = self.predictor.sampling(d, maximum=True)
                x_list.append(x)

            wave = torch.stack(x_list).cpu().numpy()

    wave = wave.T
    wave = decode_single(wave, bit=self.bit_size)
    if self.mulaw:
        wave = decode_mulaw(wave, mu=2**self.bit_size)

    return [Wave(wave=w_one, sampling_rate=self.sampling_rate) for w_one in wave]
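# Hypothetical usage of generate; the generator construction is omitted
# because its arguments are project-specific. When local conditioning is
# used, local_array must cover the requested time span.
waves = generator.generate(
    time_length=1.0,
    sampling_policy=SamplingPolicy.random,
    num_generate=4,
    local_array=local_array,  # shape (4, frames, feature_dim), or None
    speaker_nums=[0, 1, 0, 1],
)
for i, w in enumerate(waves):
    print(i, len(w.wave), w.sampling_rate)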
def collect_to_tfevents(
    input_dir: Path,
    output_dir: Optional[Path],
    filename_suffix: str,
    audio_tag_format: str,
    diff_tag: str,
    iteration_format: str,
    remove_exist: bool,
    expected_wave_dir: Optional[Path],
):
    if output_dir is None:
        output_dir = input_dir

    if remove_exist:
        for p in output_dir.glob(f"*tfevents*{filename_suffix}"):
            p.unlink()

    flag_calc_diff = expected_wave_dir is not None

    summary_writer = SummaryWriter(logdir=str(output_dir), filename_suffix=filename_suffix)

    diffs: DefaultDict[int, List[float]] = defaultdict(list)
    for p in tqdm(sorted(input_dir.rglob("*"), key=_to_nums), desc=input_dir.stem):
        if p.is_dir():
            continue
        if "tfevents" in p.name:
            continue

        rp = p.relative_to(input_dir)
        iteration = int(iteration_format.format(p=p, rp=rp))

        # audio
        if p.suffix in [".wav"]:
            wave, sr = librosa.load(str(p), sr=None)
            summary_writer.add_audio(
                tag=audio_tag_format.format(p=p, rp=rp),
                snd_tensor=wave,
                sample_rate=sr,
                global_step=iteration,
            )

        # diff
        if flag_calc_diff and p.name.endswith("_woc.wav"):
            wave_id = p.name[:-8]
            expected = expected_wave_dir.joinpath(f"{wave_id}.wav")

            wo = Wave.load(p)
            wi = Wave.load(expected, sampling_rate=wo.sampling_rate)
            diff = calc_mcd(wave1=wi, wave2=wo)
            diffs[iteration].append(diff)

    if flag_calc_diff:
        for iteration, values in sorted(diffs.items()):
            summary_writer.add_scalar(
                tag=diff_tag,
                scalar_value=numpy.mean(values),
                global_step=iteration,
            )

    summary_writer.close()
def main():
    model_dir: Path = arguments.model_dir
    model_iteration: int = arguments.model_iteration
    model_config: Path = arguments.model_config
    time_length: float = arguments.time_length
    gpu: int = arguments.gpu

    config = create_config(model_config)
    model_path = _get_predictor_model_path(model_dir, model_iteration)

    sr = config.dataset.sampling_rate

    model = create_predictor(config.model)
    chainer.serializers.load_npz(str(model_path), model)
    if gpu is not None:
        model.to_gpu(gpu)
        cuda.get_device_from_id(gpu).use()

    chainer.global_config.train = False
    chainer.global_config.enable_backprop = False

    wave_paths = sorted([Path(p) for p in glob.glob(str(config.dataset.input_wave_glob))])
    local_paths = sorted([Path(p) for p in glob.glob(str(config.dataset.input_local_glob))])
    assert len(wave_paths) == len(local_paths)

    # Shuffle both lists with the same seed so the wave/local pairing survives.
    np.random.RandomState(config.dataset.seed).shuffle(wave_paths)
    np.random.RandomState(config.dataset.seed).shuffle(local_paths)

    wave_path = wave_paths[0]
    local_path = local_paths[0]

    w_data = Wave.load(wave_path, sampling_rate=sr)
    l_data = SamplingData.load(local_path)

    length = int(sr * time_length)
    l_scale = int(sr // l_data.rate)
    l_sl = length // l_scale
    length = l_sl * l_scale

    w = w_data.wave[:length]
    l = l_data.array[:l_sl]

    coarse, fine = encode_16bit(w)
    c, f, hc, hf = model(
        c_array=decode_single(model.xp.asarray(coarse)).astype(np.float32)[np.newaxis],
        f_array=decode_single(model.xp.asarray(fine)).astype(np.float32)[:-1][np.newaxis],
        l_array=model.xp.asarray(l)[np.newaxis],
    )

    c = chainer.functions.softmax(c)
    c = chainer.cuda.to_cpu(c[0].data)
    f = chainer.cuda.to_cpu(f[0].data)

    fig = plt.figure(figsize=[32 * time_length, 10])
    plt.imshow(c, aspect='auto', interpolation='nearest')
    plt.colorbar()
    plt.plot((w + 1) * 127.5, 'g', linewidth=0.1, label='true')
    plt.plot(np.argmax(c, axis=0) + np.argmax(f, axis=0) / 256, 'r', linewidth=0.1, label='predicted')
    plt.legend()
    fig.savefig('output.eps')
def generate(
    self,
    time_length: Optional[float],
    sampling_policy: SamplingPolicy,
    num_generate: int,
    coarse=None,
    local_array: np.ndarray = None,
    speaker_nums: List[int] = None,
    hidden_coarse=None,
):
    assert num_generate <= self.max_batch_size
    assert coarse is None or len(coarse) == num_generate
    assert local_array is None or len(local_array) == num_generate
    assert speaker_nums is None or len(speaker_nums) == num_generate
    assert hidden_coarse is None or len(hidden_coarse) == num_generate
    assert sampling_policy == SamplingPolicy.random

    length = int(self.sampling_rate * time_length)

    if local_array is None:
        local_array = self.xp.empty((num_generate, length, 0), dtype=np.float32)
    else:
        local_array = self.xp.asarray(local_array)

    if speaker_nums is not None:
        speaker_nums = self.xp.asarray(speaker_nums).reshape((-1,))
        with chainer.using_config("train", False), chainer.using_config(
            "enable_backprop", False
        ):
            s_one = self.model.forward_speaker(speaker_nums).data
    else:
        s_one = None

    if self.model.with_local:
        with chainer.using_config("train", False), chainer.using_config(
            "enable_backprop", False
        ):
            local_array = self.model.forward_encode(l_array=local_array, s_one=s_one).data

    if coarse is None:
        c = self.xp.zeros([num_generate], dtype=np.float32)
        c = encode_single(c, bit=self.single_bit)
    else:
        c = coarse

    if hidden_coarse is None:
        hidden_coarse = self.model.gru.init_hx(local_array)[0].data

    wave = np.zeros((length, num_generate), dtype=np.int32)
    yukarin_autoreg_cpp.inference(
        batch_size=num_generate,
        length=length,
        output=wave,
        x=to_numpy(c),
        l_array=to_numpy(self.xp.transpose(local_array, (1, 0, 2))),
        hidden=to_numpy(hidden_coarse),
    )

    wave = wave.T
    wave = decode_single(wave, bit=self.single_bit)
    if self.mulaw:
        wave = decode_mulaw(wave, mu=2**self.single_bit)

    return [Wave(wave=w_one, sampling_rate=self.sampling_rate) for w_one in wave]
def generate(self):
    return Input(
        wave=Wave.load(self.path_wave),
        silence=SamplingData.load(self.path_silence),
        local=SamplingData.load(self.path_local),
    )
def generate(
    model_dir: Path,
    model_iteration: Optional[int],
    model_config: Optional[Path],
    output_dir: Path,
    to_voiced_scaler: bool,
    to_f0_scaler: bool,
    to_phoneme_onehot: bool,
    batch_size: Optional[int],
    num_test: int,
    target_glob: Optional[str],
    use_gpu: bool,
):
    if model_config is None:
        model_config = model_dir / "config.yaml"

    output_dir.mkdir(exist_ok=True)
    save_arguments(output_dir / "arguments.yaml", generate, locals())

    config = Config.from_dict(yaml.safe_load(model_config.open()))

    generator = Generator(
        config=config,
        predictor=_get_model_path(
            model_dir=model_dir,
            iteration=model_iteration,
            prefix="predictor_",
        ),
        voiced_network=(
            None
            if not to_voiced_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="voiced_network_",
            )
        ),
        f0_network=(
            None
            if not to_f0_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="f0_network_",
            )
        ),
        phoneme_network=(
            None
            if not to_phoneme_onehot
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="phoneme_network_",
            )
        ),
        use_gpu=use_gpu,
    )

    dataset = create_dataset(config.dataset)["test"]
    scale = numpy.prod(config.network.scale_list)

    if batch_size is None:
        batch_size = config.train.batch_size

    if isinstance(dataset, SpeakerWavesDataset):
        wave_paths = [data.path_wave for data in dataset.wave_dataset.inputs[:num_test]]
    elif isinstance(dataset, WavesDataset):
        wave_paths = [data.path_wave for data in dataset.inputs[:num_test]]
    else:
        raise Exception()

    if target_glob is not None:
        wave_paths += list(map(Path, glob(target_glob)))

    for wps in tqdm(chunked(wave_paths, batch_size), desc="generate"):
        waves = [Wave.load(p) for p in wps]
        arrays = [w.wave for w in waves]

        # Zero-pad every waveform in the batch to the longest padded length,
        # where each padded length is a multiple of the network scale.
        pad_lengths = [int(numpy.ceil(len(w) / scale) * scale) for w in arrays]
        arrays = [numpy.r_[w, numpy.zeros(max(pad_lengths) - len(w))] for w in arrays]

        tensors = [torch.from_numpy(array.astype(numpy.float32)) for array in arrays]
        output = generator.generate(
            wave=concat_examples(tensors),
            to_voiced_scaler=to_voiced_scaler,
            to_f0_scaler=to_f0_scaler,
            to_phoneme_onehot=to_phoneme_onehot,
        )

        for feature, p, w, l in zip(output, wps, waves, pad_lengths):
            feature = feature.T[: l // scale]
            data = SamplingData(array=feature, rate=w.sampling_rate // scale)
            data.save(output_dir / (p.stem + ".npy"))