def __init__(self, torch_device=None):
    # Pick a device automatically when none is given
    if torch_device is None:
        torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

    self.tacotron_file_config = path.join(TTS_WORK_DIR, TTS_CONFIG_FILE)
    self.tacotron_file_checkpoints = path.join(TTS_WORK_DIR, TTS_MODEL_WEIGHTS)
    self.vocoder_file_config = path.join(TTS_WORK_DIR, TTS_CONFIG_FILE)
    self.vocoder_file_checkpoints = path.join(VOCODER_WORK_DIR, VOCODER_MODEL_WEIGHTS)

    # Tacotron2 loading
    self.tacotron_instance = Text2Speech(
        self.tacotron_file_config,
        self.tacotron_file_checkpoints,
        device=torch_device,
        threshold=0.5,
        minlenratio=0.0,
        maxlenratio=10.0,
        use_att_constraint=False,
        backward_window=1,
        forward_window=3,
    )
    # Disable Griffin-Lim; the neural vocoder below converts mel spectrograms to waveforms
    self.tacotron_instance.spc2wav = None

    # Vocoder loading
    self.vocoder = load_model(self.vocoder_file_checkpoints) \
        .to(torch_device) \
        .eval()
    self.vocoder.remove_weight_norm()
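# A possible companion method (assumption): it reuses the objects created in
# __init__ above and the tuple-style Text2Speech output seen in the other snippets
# in this collection; the name `synthesize` is hypothetical.
def synthesize(self, text):
    with torch.no_grad():
        _, mel, *_ = self.tacotron_instance(text)
        wav = self.vocoder.inference(mel)
    return wav.view(-1).cpu().numpy()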
def __init__(self, worker_id=1, audio_dest='audio/', audio_format='.wav'):
    self.id = worker_id

    # Model selection
    self.fs = 22050
    self.lang = "English"
    self.tag = "kan-bayashi/ljspeech_tacotron2"
    self.vocoder_tag = "ljspeech_parallel_wavegan.v1"

    # Model setup
    self.d = ModelDownloader()
    self.text2speech = Text2Speech(
        **self.d.download_and_unpack(self.tag),
        device="cpu",
        # Only for Tacotron 2
        threshold=0.5,
        minlenratio=0.0,
        maxlenratio=10.0,
        use_att_constraint=False,
        backward_window=1,
        forward_window=3,
    )
    self.vocoder = load_model(
        download_pretrained_model(self.vocoder_tag)).to("cpu").eval()
    self.text2speech.spc2wav = None
    self.vocoder.remove_weight_norm()

    self.audio_d = audio_dest
    self.audio_f = audio_format
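# A possible worker method (assumption): it synthesizes `text` and writes a file
# under self.audio_d with the self.audio_f extension; the name `render`, the use
# of soundfile, and the tuple-style Text2Speech output are hypothetical choices.
def render(self, text, name):
    import os
    import soundfile as sf
    with torch.no_grad():
        _, mel, *_ = self.text2speech(text)
        wav = self.vocoder.inference(mel)
    os.makedirs(self.audio_d, exist_ok=True)
    out_path = os.path.join(self.audio_d, name + self.audio_f)
    sf.write(out_path, wav.view(-1).cpu().numpy(), self.fs, "PCM_16")
    return out_path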
def get_text2speech(
    model_name='kan-bayashi/jsut_tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_train.loss.best'
):
    d = ModelDownloader()
    text2speech = Text2Speech(**d.download_and_unpack(model_name), device='cuda')
    return text2speech
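# Hedged usage sketch: call the returned model on Japanese text; the tuple-style
# output and the vocoder step mirror the other snippets in this collection, and
# `torch` plus a separately loaded `vocoder` are assumed to be available.
text2speech = get_text2speech()
with torch.no_grad():
    _, mel, *_ = text2speech("こんにちは、世界。")
    wav = vocoder.inference(mel)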
def __init__(self, model_id: str):
    self.model = Text2Speech.from_pretrained(model_id, device="cpu")
    if hasattr(self.model, "fs"):
        self.sampling_rate = self.model.fs
    else:
        # 16000 by default if not specified
        self.sampling_rate = 16000
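# A possible synthesis method (assumption): with Text2Speech.from_pretrained the
# call returns a dict, so the waveform is read from its "wav" entry; the method
# name is hypothetical.
def synthesize(self, text: str):
    output = self.model(text)
    wav = output["wav"]  # 1-D torch tensor at self.sampling_rate
    return wav.cpu().numpy(), self.sampling_rate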
def _tts(model_name):
    d = ModelDownloader()
    text2speech = Text2Speech(**d.download_and_unpack(model_name))
    speech = np.zeros((10000,), dtype=np.float32)
    if text2speech.use_speech:
        text2speech("foo", speech=speech)
    else:
        text2speech("foo")
def _tts(model_name): d = ModelDownloader("downloads") text2speech = Text2Speech(**d.download_and_unpack(model_name, quiet=True)) inputs = {"text": "foo"} if text2speech.use_speech: inputs["speech"] = np.zeros((10000,), dtype=np.float32) if text2speech.tts.spk_embed_dim is not None: inputs["spembs"] = np.zeros((text2speech.tts.spk_embed_dim,), dtype=np.float32) text2speech(**inputs)
def setup_model(self):
    try:
        self.model_reload_needed = False

        self.output_status("Loading nltk...")
        # Set up nltk
        import nltk
        nltk.data.path.append(MODEL_DIR + '/nltk_models')
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt', download_dir=MODEL_DIR + "/nltk_models")

        self.output_status("Loading torch...", end=" ")
        # Set up the model
        import torch
        from espnet_model_zoo.downloader import ModelDownloader
        from espnet2.bin.tts_inference import Text2Speech
        from parallel_wavegan.utils import download_pretrained_model
        from parallel_wavegan.utils import load_model

        self.mlDevice = "cuda" if torch.cuda.is_available() else "cpu"
        self.output_status("Running on " + self.mlDevice)

        self.output_status("Loading espnet...")
        d = ModelDownloader(MODEL_DIR + "/espnet_models")
        self.text2speech = Text2Speech(
            **d.download_and_unpack(self.tag),
            device=self.mlDevice,
            # Only for Tacotron 2
            threshold=0.5,
            minlenratio=0.0,
            maxlenratio=10.0,
            use_att_constraint=False,
            backward_window=1,
            forward_window=3,
            # Only for FastSpeech & FastSpeech2
            speed_control_alpha=1.0,
        )
        self.text2speech.spc2wav = None  # Disable Griffin-Lim

        # NOTE: The download sometimes fails with "Permission denied". That is a
        # limitation of Google Drive; please retry after several hours.
        self.output_status("Loading vocoder models...")
        self.vocoder = load_model(
            download_pretrained_model(
                self.vocoder_tag,
                download_dir=MODEL_DIR + "/vocoder_models")).to(self.mlDevice).eval()
        self.vocoder.remove_weight_norm()

        self.output_status("Model setup completed.")
    except Exception as e:
        self.output_err("Model error", e)
        raise HandledException()
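# A possible companion method (assumption): it uses the objects prepared in
# setup_model() above and reports the real-time factor; the method name and the
# tuple-style Text2Speech output follow the other snippets in this collection.
def synthesize(self, text):
    import time
    import torch
    with torch.no_grad():
        start = time.time()
        _, mel, *_ = self.text2speech(text)
        wav = self.vocoder.inference(mel)
    rtf = (time.time() - start) / (len(wav) / self.text2speech.fs)
    self.output_status("RTF = {:.3f}".format(rtf))
    return wav.view(-1).cpu().numpy()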
def get_acoustic_model(self):
    acoustic_model = Text2Speech(
        self.acoustic_model_config_path,
        self.acoustic_model_path,
        device=self.device,
        threshold=0.5,
        minlenratio=0.0,
        maxlenratio=10.0,
        use_att_constraint=False,
        backward_window=1,
        forward_window=3,
    )
    acoustic_model.spc2wav = None
    return acoustic_model
tag = "kan-bayashi/ljspeech_conformer_fastspeech2" vocoder_tag = "ljspeech_full_band_melgan.v2" import time import os import torch import soundfile as sf from espnet_model_zoo.downloader import ModelDownloader from espnet2.bin.tts_inference import Text2Speech from parallel_wavegan.utils import download_pretrained_model from parallel_wavegan.utils import load_model d = ModelDownloader() text2speech = Text2Speech( **d.download_and_unpack(tag), device="cuda", speed_control_alpha=1.0, ) text2speech.spc2wav = None # Disable griffin-lim vocoder = load_model(download_pretrained_model(vocoder_tag)).to("cuda").eval() vocoder.remove_weight_norm() while True: conn, addr = s.accept() data = conn.recv(1024) encoding = 'utf-8' data = str(data, encoding) conn.close() # synthesis with torch.no_grad(): start = time.time()
def test_Text2Speech(config_file):
    text2speech = Text2Speech(train_config=config_file)
    text = "aiueo"
    text2speech(text)
import torch
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.tts_inference import Text2Speech
from parallel_wavegan.utils import download_pretrained_model
from parallel_wavegan.utils import load_model

fs, lang = 22050, "English"
tag = "kan-bayashi/ljspeech_tacotron2"
vocoder_tag = "ljspeech_multi_band_melgan.v2"

d = ModelDownloader()
text2speech = Text2Speech(
    **d.download_and_unpack(tag),
    device="cpu",
    # Only for Tacotron 2
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim
# NOTE: The download sometimes fails with "Permission denied". That is a
# limitation of Google Drive; please retry after several hours.
vocoder = load_model(download_pretrained_model(vocoder_tag)).to("cpu").eval()
vocoder.remove_weight_norm()

def pronounce(x):
    with torch.no_grad():
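        # Possible completion (assumption), following the tuple-style interface used
        # in the other snippets here: synthesize a mel spectrogram, then vocode it.
        _, c, *_ = text2speech(x)
        wav = vocoder.inference(c)
    return wav.view(-1).cpu().numpy(), fs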
## specify the path to the vocoder's checkpoint
vocoder_checkpoint = "exp/vocoder/checkpoint-400000steps.pkl"
vocoder = load_model(vocoder_checkpoint).to("cuda").eval()
vocoder.remove_weight_norm()

## specify the path to the main model (transformer/tacotron2/fastspeech) and its config file
config_file = "exp/tts_train_raw_char/config.yaml"
model_path = "exp/tts_train_raw_char/train.loss.ave_5best.pth"
text2speech = Text2Speech(
    config_file,
    model_path,
    device="cuda",
    # Only for Tacotron 2
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=True,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim

args = get_args()
sample_text = args.text

with torch.no_grad():
    _, c_mel, *_ = text2speech(sample_text.lower())
    wav = vocoder.inference(c_mel)

## here all of your synthesized audios will be saved
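## Hedged sketch of the saving step (assumption): one way to write the waveform to
## disk with soundfile; the output directory, file name, and the use of
## text2speech.fs for the sampling rate are hypothetical choices.
import os
import soundfile as sf

out_dir = "synthesized_wavs"  # hypothetical output directory
os.makedirs(out_dir, exist_ok=True)
sf.write(os.path.join(out_dir, "sample.wav"),
         wav.view(-1).cpu().numpy(), text2speech.fs, "PCM_16")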
vocoder_tag = "ljspeech_multi_band_melgan.v2" #@param ["ljspeech_parallel_wavegan.v1", "ljspeech_full_band_melgan.v2", "ljspeech_multi_band_melgan.v2"] {type:"string"} vocoder = load_model( download_pretrained_model(vocoder_tag)).to("cuda").eval() vocoder.remove_weight_norm() models = [ # "transformer", # "tacotron2", # "fastspeech", "fastspeech2", # "conformer_fastspeech2", ] for model in models: text2speech = Text2Speech( "./egs2/ljspeech/tts1/exp/tts_train_{}_raw_phn_tacotron_g2p_en_no_space/config.yaml" .format(model), "./egs2/ljspeech/tts1/exp/tts_train_{}_raw_phn_tacotron_g2p_en_no_space/valid.loss.best.pth" .format(model), device="cuda") fs = text2speech.fs text2speech.spc2wav = None texts = [ # "they state that they are compelled by an imperative sense of duty to advert in terms of decided condemnation to the lamentable condition of the prisons of the city of London,", # "they state that they are compelled by an imperative sense of duty to advert in terms of decided condemnation to the lamentable condition of the prisons of the city of London, The prison officials appear to be on the side of the inspectors, to the great dissatisfaction of the corporation, who claimed the full allegiance and support of its servants. " # "The Court in addition to the proper use of its judicial functions has improperly set itself up as a third house of the Congress. If, for instance, any one of the six justices of the Supreme Court now over the age of seventy should retire as provided under the plan, " # "If such a plan is good for the lower courts it certainly ought to be equally good for the highest court from which there is no appeal. Is it a dangerous precedent for the Congress to change the number of the justices? The Congress has always had, and will have, that power.", # "The prison officials appear to be on the side of the inspectors, to the great dissatisfaction of the corporation, who claimed the full allegiance and support of its servants.", # "in some yards", # "We have, therefore,", "The feature matching loss is a learned similarity metric measured by the difference in features of the discriminator between a ground truth sample and a generated sample", ] with torch.no_grad():