def _create_random_model(self):  # pylint: disable=global-statement
    config = load_config(os.path.join(get_tests_output_path(), "dummy_model_config.json"))
    model = setup_model(config)
    output_path = os.path.join(get_tests_output_path())
    save_checkpoint(config, model, None, None, 10, 1, output_path)

def _create_random_model(self):  # pylint: disable=global-statement
    global symbols, phonemes
    config = load_config(os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
    if 'characters' in config.keys():
        symbols, phonemes = make_symbols(**config.characters)

    num_chars = len(phonemes) if config.use_phonemes else len(symbols)
    model = setup_model(num_chars, 0, config)
    output_path = os.path.join(get_tests_output_path())
    save_checkpoint(model, None, 10, 10, 1, output_path)

def test_run_all_models():
    """Check if all the models are downloadable and tts models run correctly."""
    print(" > Run synthesizer with all the models.")
    download_dir = get_user_data_dir("tts")
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    manager = ModelManager(output_prefix=get_tests_output_path())
    model_names = manager.list_models()
    for model_name in model_names:
        print(f"\n > Run - {model_name}")
        model_path, _, _ = manager.download_model(model_name)
        if "tts_models" in model_name:
            local_download_dir = os.path.dirname(model_path)
            # download and run the model
            speaker_files = glob.glob(local_download_dir + "/speaker*")
            language_files = glob.glob(local_download_dir + "/language*")
            language_id = ""
            if len(speaker_files) > 0:
                # multi-speaker model
                if "speaker_ids" in speaker_files[0]:
                    speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0])
                elif "speakers" in speaker_files[0]:
                    speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0])

                # multi-lingual model - assuming multi-lingual models are also multi-speaker
                if len(language_files) > 0 and "language_ids" in language_files[0]:
                    language_manager = LanguageManager(language_ids_file_path=language_files[0])
                    language_id = language_manager.language_names[0]

                speaker_id = list(speaker_manager.ids.keys())[0]
                run_cli(
                    f"tts --model_name {model_name} "
                    f'--text "This is an example." --out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" '
                )
            else:
                # single-speaker model
                run_cli(
                    f"tts --model_name {model_name} "
                    f'--text "This is an example." --out_path "{output_path}"'
                )
            # remove downloaded models
            shutil.rmtree(download_dir)
        else:
            # only download the model
            manager.download_model(model_name)
        print(f" | > OK: {model_name}")

    folders = glob.glob(os.path.join(manager.output_prefix, "*"))
    assert len(folders) == len(model_names)
    shutil.rmtree(manager.output_prefix)

def test_in_out(self):
    self._create_random_model()
    tts_root_path = get_tests_output_path()
    tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth.tar")
    tts_config = os.path.join(tts_root_path, "dummy_model_config.json")
    synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None)
    synthesizer.tts("Better this test works!!")

def test_synthesize():
    """Test synthesize.py with different arguments."""
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    run_cli("tts --list_models")

    # single speaker model
    run_cli(f'tts --text "This is an example." --out_path "{output_path}"')

def test_Tacotron():
    # set paths
    config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json")
    checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar")
    output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
    # load config
    c = load_config(config_path)
    # create model
    num_chars = len(phonemes if c.use_phonemes else symbols)
    model = setup_model(num_chars, 1, c, speaker_embedding_dim=None)
    # save model
    torch.save({"model": model.state_dict()}, checkpoint_path)
    # run test
    run_cli(
        f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"'
    )
    run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"')

def test_in_out(self):
    self._create_random_model()
    config = load_config(os.path.join(get_tests_input_path(), 'server_config.json'))
    tts_root_path = get_tests_output_path()
    config['tts_checkpoint'] = os.path.join(tts_root_path, config['tts_checkpoint'])
    config['tts_config'] = os.path.join(tts_root_path, config['tts_config'])
    synthesizer = Synthesizer(config['tts_checkpoint'], config['tts_config'], None, None)
    synthesizer.tts("Better this test works!!")

def test_Tacotron():
    # set paths
    config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json")
    checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
    output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
    # load config
    c = load_config(config_path)
    # create model
    model = setup_model(c)
    # save model
    torch.save({"model": model.state_dict()}, checkpoint_path)
    # run test
    run_cli(
        f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"'
    )
    run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"')

def test():
    # set paths
    wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs")
    output_path = os.path.join(get_tests_output_path(), "output_wavs_removed_silence/")
    output_resample_path = os.path.join(get_tests_output_path(), "output_ljspeech_16khz/")
    # resample audios
    run_cli(
        f'CUDA_VISIBLE_DEVICES="" python TTS/bin/resample.py --input_dir "{wav_path}" --output_dir "{output_resample_path}" --output_sr 16000'
    )
    # run test
    run_cli(
        f'CUDA_VISIBLE_DEVICES="" python TTS/bin/remove_silence_using_vad.py --input_dir "{output_resample_path}" --output_dir "{output_path}"'
    )
    run_cli(f'rm -rf "{output_resample_path}"')
    run_cli(f'rm -rf "{output_path}"')

def test_load_checkpoint(self):
    chkp_path = os.path.join(get_tests_output_path(), "dummy_glow_tts_checkpoint.pth")
    # pass VitsArgs by keyword; positionally it would land on the config's first field
    config = VitsConfig(model_args=VitsArgs(num_chars=32))
    model = Vits.init_from_config(config, verbose=False).to(device)
    chkp = {}
    chkp["model"] = model.state_dict()
    torch.save(chkp, chkp_path)
    model.load_checkpoint(config, chkp_path)
    self.assertTrue(model.training)
    model.load_checkpoint(config, chkp_path, eval=True)
    self.assertFalse(model.training)

def test_synthesize():
    """Test synthesize.py with different arguments."""
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    run_cli("tts --list_models")

    # single speaker model
    run_cli(f'tts --text "This is an example." --out_path "{output_path}"')
    run_cli(
        "tts --model_name tts_models/en/ljspeech/glow-tts "
        f'--text "This is an example." --out_path "{output_path}"'
    )
    run_cli(
        "tts --model_name tts_models/en/ljspeech/glow-tts "
        "--vocoder_name vocoder_models/en/ljspeech/multiband-melgan "
        f'--text "This is an example." --out_path "{output_path}"'
    )

def test_voice_conversion():
    print(" > Run voice conversion inference using YourTTS model.")
    model_name = "tts_models/multilingual/multi-dataset/your_tts"
    language_id = "en"
    speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
    reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav")
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    run_cli(
        f"tts --model_name {model_name}"
        f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} "
    )

def test_pqmf():
    w, sr = load(WAV_FILE)

    layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)
    w2 = torch.from_numpy(w[None, None, :])
    b2 = layer.analysis(w2)
    w2_ = layer.synthesis(b2)

    print(w2_.max())
    print(w2_.min())
    print(w2_.mean())
    sf.write(os.path.join(get_tests_output_path(), "pqmf_output.wav"), w2_.flatten().detach(), sr)

def test_if_all_models_available():
    """Check if all the models are downloadable."""
    print(" > Checking the availability of all the models under the ModelManager.")
    manager = ModelManager(output_prefix=get_tests_output_path())
    model_names = manager.list_models()
    for model_name in model_names:
        manager.download_model(model_name)
        print(f" | > OK: {model_name}")

    folders = glob.glob(os.path.join(manager.output_prefix, "*"))
    assert len(folders) == len(model_names)
    shutil.rmtree(manager.output_prefix)

def test_pqmf():
    w, sr = load(WAV_FILE)

    layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)
    w2 = tf.convert_to_tensor(w[None, None, :])
    b2 = layer.analysis(w2)
    w2_ = layer.synthesis(b2)
    # convert the synthesized output (not the input tensor) back to numpy
    w2_ = w2_.numpy()

    print(w2_.max())
    print(w2_.min())
    print(w2_.mean())
    sf.write(os.path.join(get_tests_output_path(), 'tf_pqmf_output.wav'), w2_.flatten(), sr)

def test_train_eval_log(self):
    batch_size = 2
    config = VitsConfig(model_args=VitsArgs(num_chars=32, spec_segment_size=10))
    model = Vits.init_from_config(config, verbose=False).to(device)
    model.run_data_dep_init = False
    model.train()
    batch = self._create_batch(config, batch_size)
    logger = TensorboardLogger(
        log_dir=os.path.join(get_tests_output_path(), "dummy_vits_logs"),
        model_name="vits_test_train_log",
    )
    criterion = model.get_criterion()
    criterion = [criterion[0].to(device), criterion[1].to(device)]
    outputs = [None] * 2
    outputs[0], _ = model.train_step(batch, criterion, 0)
    outputs[1], _ = model.train_step(batch, criterion, 1)
    model.train_log(batch, outputs, logger, None, 1)
    model.eval_log(batch, outputs, logger, None, 1)
    logger.finish()

def test_train_eval_log(self):
    batch_size = BATCH_SIZE
    input_dummy, input_lengths, mel_spec, mel_lengths, _ = self._create_inputs(batch_size)
    batch = {}
    batch["text_input"] = input_dummy
    batch["text_lengths"] = input_lengths
    batch["mel_lengths"] = mel_lengths
    batch["mel_input"] = mel_spec
    batch["d_vectors"] = None
    batch["speaker_ids"] = None
    config = GlowTTSConfig(num_chars=32)
    model = GlowTTS.init_from_config(config, verbose=False).to(device)
    model.run_data_dep_init = False
    model.train()
    logger = TensorboardLogger(
        log_dir=os.path.join(get_tests_output_path(), "dummy_glow_tts_logs"),
        model_name="glow_tts_test_train_log",
    )
    criterion = model.get_criterion()
    outputs, _ = model.train_step(batch, criterion)
    model.train_log(batch, outputs, logger, None, 1)
    model.eval_log(batch, outputs, logger, None, 1)
    logger.finish()

import glob
import os
import shutil

from tests import get_device_id, get_tests_output_path, run_cli
from TTS.vocoder.configs import WavegradConfig

config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")

config = WavegradConfig(
    batch_size=8,
    eval_batch_size=8,
    num_loader_workers=0,
    num_val_loader_workers=0,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1,
    seq_len=8192,
    eval_split_size=1,
    print_step=1,
    print_eval=True,
    data_path="tests/data/ljspeech",
    output_path=output_path,
    test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2},
)
config.audio.do_trim_silence = True
config.audio.trim_db = 60
config.save_json(config_path)

# train the model for one epoch
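# The snippet above stops at the comment announcing the training run. A minimal
# sketch of how that one-epoch run could be launched, assuming the
# TTS/bin/train_vocoder.py entry point and the coqpit-style CLI overrides that
# the encoder test in this suite already uses:
command_train = (
    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
    f"--coqpit.output_path {output_path} "
)
run_cli(command_train)
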
def _create_random_model(self):
    config = load_config(os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
    num_chars = len(phonemes) if config.use_phonemes else len(symbols)
    model = setup_model(num_chars, 0, config)
    output_path = os.path.join(get_tests_output_path())
    save_checkpoint(model, None, None, None, output_path, 10, 10)

import glob
import os
import shutil

from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs import SpeedySpeechConfig

config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")

config = SpeedySpeechConfig(
    batch_size=8,
    eval_batch_size=8,
    num_loader_workers=0,
    num_val_loader_workers=0,
    text_cleaner="english_cleaners",
    use_phonemes=True,
    phoneme_language="zh-CN",
    phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1,
    print_step=1,
    print_eval=True,
)
config.audio.do_trim_silence = True
config.audio.trim_db = 60
config.save_json(config_path)

# train the model for one epoch
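# A minimal sketch of the one-epoch training call the comment above announces,
# assuming the TTS/bin/train_tts.py entry point and the coqpit dataset
# overrides used by the encoder test in this suite:
command_train = (
    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} "
    f"--coqpit.output_path {output_path} "
    "--coqpit.datasets.0.name ljspeech "
    "--coqpit.datasets.0.meta_file_train metadata.csv "
    "--coqpit.datasets.0.meta_file_val metadata.csv "
    "--coqpit.datasets.0.path tests/data/ljspeech "
)
run_cli(command_train)
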
import os

from tests import get_device_id, get_tests_output_path, run_cli
from TTS.config.shared_configs import BaseAudioConfig
from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig


def run_test_train():
    command = (
        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} "
        f"--coqpit.output_path {output_path} "
        "--coqpit.datasets.0.name ljspeech "
        "--coqpit.datasets.0.meta_file_train metadata.csv "
        "--coqpit.datasets.0.meta_file_val metadata.csv "
        "--coqpit.datasets.0.path tests/data/ljspeech "
    )
    run_cli(command)


config_path = os.path.join(get_tests_output_path(), "test_speaker_encoder_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")

config = SpeakerEncoderConfig(
    batch_size=4,
    num_speakers_in_batch=1,
    num_utters_per_speaker=10,
    num_loader_workers=0,
    max_train_step=2,
    print_step=1,
    save_step=1,
    print_eval=True,
    audio=BaseAudioConfig(num_mels=80),
)
config.audio.do_trim_silence = True
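# The snippet ends before the config is persisted or the run is started; as
# written, run_test_train() is defined but never called. A minimal sketch of
# the closing steps the script would still need:
config.save_json(config_path)

# kick off the short training run defined above
run_test_train()
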
import glob
import json
import os
import shutil

from trainer import get_last_checkpoint

from tests import get_device_id, get_tests_output_path, run_cli
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.fast_pitch_config import FastPitchConfig

config_path = os.path.join(get_tests_output_path(), "fast_pitch_speaker_emb_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")

audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=True,
    trim_db=60.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

config = FastPitchConfig(
    audio=audio_config,
    batch_size=8,
import glob
import json
import os
import shutil

from trainer import get_last_checkpoint

from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs.tacotron2_config import Tacotron2Config

config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")

config = Tacotron2Config(
    r=5,
    batch_size=8,
    eval_batch_size=8,
    num_loader_workers=0,
    num_eval_loader_workers=0,
    text_cleaner="english_cleaners",
    use_phonemes=False,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"),
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1,
    print_step=1,
    print_eval=True,
    test_sentences=[
        "Be a voice, not an echo.",
    ],
import os
import shutil
import unittest

import numpy as np
import torch
from torch.utils.data import DataLoader

from tests import get_tests_output_path
from TTS.tts.configs import BaseTTSConfig
from TTS.tts.datasets import TTSDataset
from TTS.tts.datasets.preprocess import ljspeech
from TTS.utils.audio import AudioProcessor

# pylint: disable=unused-variable

OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
os.makedirs(OUTPATH, exist_ok=True)

# create a dummy config for testing data loaders.
c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2)
c.r = 5
c.data_path = "tests/data/ljspeech/"
ok_ljspeech = os.path.exists(c.data_path)

DATA_EXIST = True
if not os.path.exists(c.data_path):
    DATA_EXIST = False

print(" > Dynamic data loader test: {}".format(DATA_EXIST))

import os
import unittest

import torch

from tests import get_tests_output_path, run_cli
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig

torch.manual_seed(1)

config_path = os.path.join(get_tests_output_path(), "test_model_config.json")

dataset_config_en = BaseDatasetConfig(
    name="ljspeech",
    meta_file_train="metadata.csv",
    meta_file_val="metadata.csv",
    path="tests/data/ljspeech",
    language="en",
)

dataset_config_pt = BaseDatasetConfig(
    name="ljspeech",
    meta_file_train="metadata.csv",
    meta_file_val="metadata.csv",
    path="tests/data/ljspeech",
    language="pt-br",
)

# pylint: disable=protected-access

import os
import unittest

from tests import get_tests_input_path, get_tests_output_path, get_tests_path
from TTS.config import BaseAudioConfig
from TTS.utils.audio import AudioProcessor

TESTS_PATH = get_tests_path()
OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")

os.makedirs(OUT_PATH, exist_ok=True)
conf = BaseAudioConfig(mel_fmax=8000)


# pylint: disable=protected-access
class TestAudio(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.ap = AudioProcessor(**conf)

    def test_audio_synthesis(self):
        """1. load wav
        2. set normalization parameters
        3. extract mel-spec
        4. invert to wav and save the output
        """
        print(" > Sanity check for the process wav -> mel -> wav")

        def _test(max_norm, signal_norm, symmetric_norm, clip_norm):
            self.ap.max_norm = max_norm

import glob
import json
import os
import shutil

from trainer import get_last_checkpoint

from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs.glow_tts_config import GlowTTSConfig

config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")

config = GlowTTSConfig(
    batch_size=2,
    eval_batch_size=8,
    num_loader_workers=0,
    num_eval_loader_workers=0,
    text_cleaner="english_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1,
    print_step=1,
    print_eval=True,
    test_sentences=[
        "Be a voice, not an echo.",
    ],
    data_dep_init_steps=1.0,