def test_voice_conversion():
    """Run the `tts` command line in voice-conversion mode with the YourTTS model.

    The test only builds the command and hands it to ``run_cli``; it makes no
    assertion of its own about the produced audio file.
    """
    print(" > Run voice conversion inference using YourTTS model.")

    # Both the speaker sample and the reference sample come from the bundled
    # LJSpeech test wavs.
    wavs_dir = os.path.join(get_tests_data_path(), "ljspeech", "wavs")
    speaker_wav = os.path.join(wavs_dir, "LJ001-0001.wav")
    reference_wav = os.path.join(wavs_dir, "LJ001-0032.wav")
    output_path = os.path.join(get_tests_output_path(), "output.wav")

    model_name = "tts_models/multilingual/multi-dataset/your_tts"
    language_id = "en"

    # NOTE: the trailing space after the language index is part of the command
    # string and is kept on purpose.
    command = (
        f"tts --model_name {model_name} --out_path {output_path}"
        f" --speaker_wav {speaker_wav} --reference_wav {reference_wav}"
        f" --language_idx {language_id} "
    )
    run_cli(command)
def test_d_vector_inference(self):
    """Check Vits inference when speaker identity is supplied as d-vectors.

    A model configured with ``use_d_vector_file=True`` is run once on a single
    random token sequence and once on the inputs returned by
    ``self._create_inputs``; both results go through
    ``self._check_inference_outputs``.
    """
    model_args = VitsArgs(
        spec_segment_size=10,
        num_chars=32,
        use_d_vector_file=True,
        d_vector_dim=256,
        d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
    )
    config = VitsConfig(model_args=model_args)
    vits = Vits.init_from_config(config, verbose=False).to(device)
    vits.eval()

    # Case 1: batch size = 1, random token ids and a single random d-vector.
    tokens = torch.randint(0, 24, (1, 128)).long().to(device)
    speaker_vectors = torch.randn(1, 256).to(device)
    result = vits.inference(tokens, aux_input={"d_vectors": speaker_vectors})
    self._check_inference_outputs(config, result, tokens)

    # Case 2: batch size = 2, token ids and lengths come from the shared helper;
    # any further values it returns are not needed here.
    tokens, token_lengths, *_ = self._create_inputs(config)
    speaker_vectors = torch.randn(2, 256).to(device)
    conditioning = {"x_lengths": token_lengths, "d_vectors": speaker_vectors}
    result = vits.inference(tokens, aux_input=conditioning)
    self._check_inference_outputs(config, result, tokens, batch_size=2)
def _test_forward_with_d_vector(self, batch_size):
    """Check the output shapes of a GlowTTS training forward pass with d-vectors.

    Args:
        batch_size: Number of samples requested from ``self._create_inputs``
            and used for the random d-vector batch.
    """
    # The last value returned by the helper (speaker ids) is not used when
    # conditioning on d-vectors.
    input_dummy, input_lengths, mel_spec, mel_lengths, _ = self._create_inputs(batch_size)
    d_vector = torch.rand(batch_size, 256).to(device)

    # Build a model that takes external speaker embeddings from a file.
    config = GlowTTSConfig(
        num_chars=32,
        use_d_vector_file=True,
        d_vector_dim=256,
        d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
    )
    glow_tts = GlowTTS.init_from_config(config, verbose=False).to(device)
    glow_tts.train()
    print(" > Num parameters for GlowTTS model:%s" % (count_parameters(glow_tts)))

    # Forward pass in training mode (the original comment notes this runs the
    # encoder and decoder with MAS).
    y = glow_tts.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"d_vectors": d_vector})

    # Outputs that must match the mel spectrogram shape exactly.
    for key in ("z",):
        self.assertEqual(y[key].shape, mel_spec.shape)
    self.assertEqual(y["logdet"].shape, torch.Size([batch_size]))
    for key in ("y_mean", "y_log_scale"):
        self.assertEqual(y[key].shape, mel_spec.shape)

    # Alignments: first two mel dims followed by the text length.
    alignment_shape = mel_spec.shape[:2] + (input_dummy.shape[1], )
    self.assertEqual(y["alignments"].shape, alignment_shape)

    # Durations: text shape with one trailing singleton dimension.
    duration_shape = input_dummy.shape + (1, )
    self.assertEqual(y["durations_log"].shape, duration_shape)
    self.assertEqual(y["total_durations_log"].shape, input_dummy.shape + (1, ))
def test_init_multispeaker(self):
    """Exercise ``GlowTTS.init_multispeaker`` under four speaker-conditioning setups.

    Each scenario builds a fresh config, then asserts on ``c_in_channels``
    (and, where a speaker manager is attached, on ``num_speakers``).
    """
    # Scenario 1: learned speaker embedding with the default embedding size.
    # The model is built first and the config is mutated afterwards.
    config = GlowTTSConfig(num_chars=32)
    glow_tts = GlowTTS(config)
    config.use_speaker_embedding = True
    config.num_speakers = 5
    config.d_vector_dim = None
    glow_tts.init_multispeaker(config)
    self.assertEqual(glow_tts.c_in_channels, glow_tts.hidden_channels_enc)

    # Scenario 2: external speaker embeddings with d_vector_dim = 301.
    config = GlowTTSConfig(num_chars=32)
    config.use_d_vector_file = True
    config.d_vector_dim = 301
    glow_tts = GlowTTS(config)
    glow_tts.init_multispeaker(config)
    self.assertEqual(glow_tts.c_in_channels, 301)

    # Scenario 3: learned speaker embeddings, speakers supplied by a speaker manager.
    config = GlowTTSConfig(num_chars=32)
    config.use_speaker_embedding = True
    config.speakers_file = os.path.join(get_tests_data_path(), "ljspeech", "speakers.json")
    manager = SpeakerManager.init_from_config(config)
    glow_tts = GlowTTS(config)
    glow_tts.speaker_manager = manager
    glow_tts.init_multispeaker(config)
    self.assertEqual(glow_tts.c_in_channels, glow_tts.hidden_channels_enc)
    self.assertEqual(glow_tts.num_speakers, manager.num_speakers)

    # Scenario 4: external speaker embeddings supplied by a speaker manager.
    config = GlowTTSConfig(num_chars=32)
    config.use_d_vector_file = True
    config.d_vector_dim = 256
    config.d_vector_file = os.path.join(get_tests_data_path(), "dummy_speakers.json")
    manager = SpeakerManager.init_from_config(config)
    glow_tts = GlowTTS(config)
    glow_tts.speaker_manager = manager
    glow_tts.init_multispeaker(config)
    self.assertEqual(glow_tts.c_in_channels, manager.embedding_dim)
    self.assertEqual(glow_tts.num_speakers, manager.num_speakers)
def test_init_from_config(self):
    """Check how ``GlowTTS.init_from_config`` sets up speaker conditioning.

    Walks through five configurations and asserts on ``num_speakers``, the
    presence of the ``emb_g`` attribute and, for d-vectors, ``c_in_channels``.
    Assertions use ``assertEqual`` / ``assertFalse`` so that a failure reports
    the actual values instead of a bare ``False is not true``.
    """
    # Plain single-speaker config: only checks that construction succeeds.
    config = GlowTTSConfig(num_chars=32)
    model = GlowTTS.init_from_config(config, verbose=False).to(device)

    # num_speakers set but speaker embedding not enabled: no ``emb_g`` expected.
    config = GlowTTSConfig(num_chars=32, num_speakers=2)
    model = GlowTTS.init_from_config(config, verbose=False).to(device)
    self.assertEqual(model.num_speakers, 2)
    self.assertFalse(hasattr(model, "emb_g"))

    # Speaker embedding enabled: ``emb_g`` expected.
    config = GlowTTSConfig(num_chars=32, num_speakers=2, use_speaker_embedding=True)
    model = GlowTTS.init_from_config(config, verbose=False).to(device)
    self.assertEqual(model.num_speakers, 2)
    self.assertTrue(hasattr(model, "emb_g"))

    # Speakers file given: the test expects 10 speakers rather than the
    # configured 2 (presumably the number of entries in speakers.json).
    config = GlowTTSConfig(
        num_chars=32,
        num_speakers=2,
        use_speaker_embedding=True,
        speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json"),
    )
    model = GlowTTS.init_from_config(config, verbose=False).to(device)
    self.assertEqual(model.num_speakers, 10)
    self.assertTrue(hasattr(model, "emb_g"))

    # External d-vectors: no ``emb_g``, and the conditioning channel count
    # must equal the configured d-vector dimension.
    config = GlowTTSConfig(
        num_chars=32,
        use_d_vector_file=True,
        d_vector_dim=256,
        d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
    )
    model = GlowTTS.init_from_config(config, verbose=False).to(device)
    self.assertEqual(model.num_speakers, 1)
    self.assertFalse(hasattr(model, "emb_g"))
    self.assertEqual(model.c_in_channels, config.d_vector_dim)
def test_init_from_config(self):
    """Check how ``Vits.init_from_config`` sets up speaker conditioning.

    Walks through five configurations and asserts on ``num_speakers``, the
    presence of the ``emb_g`` attribute and, for d-vectors,
    ``embedded_speaker_dim``. All comparisons use ``assertEqual`` /
    ``assertFalse`` consistently so failures report the actual values.
    """
    # Plain single-speaker config: only checks that construction succeeds.
    config = VitsConfig(model_args=VitsArgs(num_chars=32))
    model = Vits.init_from_config(config, verbose=False).to(device)

    # num_speakers set but speaker embedding not enabled: no ``emb_g`` expected.
    config = VitsConfig(model_args=VitsArgs(num_chars=32, num_speakers=2))
    model = Vits.init_from_config(config, verbose=False).to(device)
    self.assertFalse(hasattr(model, "emb_g"))

    # Speaker embedding enabled: ``emb_g`` expected.
    config = VitsConfig(
        model_args=VitsArgs(num_chars=32, num_speakers=2, use_speaker_embedding=True)
    )
    model = Vits.init_from_config(config, verbose=False).to(device)
    self.assertEqual(model.num_speakers, 2)
    self.assertTrue(hasattr(model, "emb_g"))

    # Speakers file given: the test expects 10 speakers rather than the
    # configured 2 (presumably the number of entries in speakers.json).
    config = VitsConfig(
        model_args=VitsArgs(
            num_chars=32,
            num_speakers=2,
            use_speaker_embedding=True,
            speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json"),
        )
    )
    model = Vits.init_from_config(config, verbose=False).to(device)
    self.assertEqual(model.num_speakers, 10)
    self.assertTrue(hasattr(model, "emb_g"))

    # External d-vectors: no ``emb_g``, and the embedded speaker dimension
    # must equal the d-vector dimension read from the config.
    config = VitsConfig(
        model_args=VitsArgs(
            num_chars=32,
            use_d_vector_file=True,
            d_vector_dim=256,
            d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
        )
    )
    model = Vits.init_from_config(config, verbose=False).to(device)
    self.assertEqual(model.num_speakers, 1)
    self.assertFalse(hasattr(model, "emb_g"))
    self.assertEqual(model.embedded_speaker_dim, config.d_vector_dim)
def _test_inference_with_d_vector(self, batch_size):
    """Run GlowTTS inference conditioned on random d-vectors and validate the outputs.

    Args:
        batch_size: Number of samples requested from ``self._create_inputs``
            and used for the random d-vector batch.
    """
    # Mel lengths and speaker ids from the helper are not needed for inference.
    input_dummy, input_lengths, mel_spec, _, _ = self._create_inputs(batch_size)
    d_vector = torch.rand(batch_size, 256).to(device)

    # Model that reads external speaker embeddings from a file.
    config = GlowTTSConfig(
        num_chars=32,
        use_d_vector_file=True,
        d_vector_dim=256,
        d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
    )
    glow_tts = GlowTTS.init_from_config(config, verbose=False).to(device)
    glow_tts.eval()

    aux_input = {"x_lengths": input_lengths, "d_vectors": d_vector}
    outputs = glow_tts.inference(input_dummy, aux_input)
    self._assert_inference_outputs(outputs, input_dummy, mel_spec)
def test_d_vector_forward(self):
    """Run a Vits training forward pass conditioned on random d-vectors.

    Uses a batch of two samples from ``self._create_inputs`` and passes the
    resulting output dict to ``self._check_forward_outputs``.
    """
    batch_size = 2
    model_args = VitsArgs(
        spec_segment_size=10,
        num_chars=32,
        use_d_vector_file=True,
        d_vector_dim=256,
        d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
    )
    config = VitsConfig(model_args=model_args)
    vits = Vits.init_from_config(config, verbose=False).to(device)
    vits.train()

    # The third value returned by the helper is not needed for this test.
    tokens, token_lengths, _, spec, spec_lengths, waveform = self._create_inputs(
        config, batch_size=batch_size
    )
    speaker_vectors = torch.randn(batch_size, 256).to(device)

    output_dict = vits.forward(
        tokens,
        token_lengths,
        spec,
        spec_lengths,
        waveform,
        aux_input={"d_vectors": speaker_vectors},
    )
    self._check_forward_outputs(config, output_dict)
from TTS.tts.datasets import TTSDataset, load_tts_samples
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

# pylint: disable=unused-variable

# Scratch directory for the loader tests; it is created at import time.
OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
os.makedirs(OUTPATH, exist_ok=True)

# Dummy config shared by the data loader tests.
c = BaseTTSConfig(
    text_cleaner="english_cleaners",
    num_loader_workers=0,
    batch_size=2,
    use_noise_augment=False,
)
c.r = 5
c.data_path = os.path.join(get_tests_data_path(), "ljspeech/")
ok_ljspeech = os.path.exists(c.data_path)

dataset_config = BaseDatasetConfig(
    name="ljspeech_test",  # ljspeech_test to multi-speaker
    meta_file_train="metadata.csv",
    meta_file_val=None,
    path=c.data_path,
    language="en",
)

# True only when the LJSpeech test data directory is present on disk.
DATA_EXIST = os.path.exists(c.data_path)

print(" > Dynamic data loader test: {}".format(DATA_EXIST))