def from_pretrained(
    model_tag: Optional[str] = None,
    **kwargs: Optional[Any],
):
    """Create a SeparateSpeech instance from a pretrained model tag.

    Args:
        model_tag (Optional[str]): espnet_model_zoo tag of the pretrained
            model to download and unpack. When None, ``kwargs`` must already
            contain everything SeparateSpeech needs.

    Returns:
        SeparateSpeech: SeparateSpeech instance.

    """
    if model_tag is not None:
        # espnet_model_zoo is an optional dependency; fail loudly with a hint.
        try:
            from espnet_model_zoo.downloader import ModelDownloader
        except ImportError:
            logging.error(
                "`espnet_model_zoo` is not installed. "
                "Please install via `pip install -U espnet_model_zoo`.")
            raise
        downloader = ModelDownloader()
        kwargs.update(**downloader.download_and_unpack(model_tag))
    return SeparateSpeech(**kwargs)
def from_pretrained(
    model_tag: Optional[str] = None,
    **kwargs: Optional[Any],
) -> Speech2Text:
    """Create a Speech2Text instance from a pretrained model tag.

    Args:
        model_tag: espnet_model_zoo tag of the pretrained model. When None,
            ``kwargs`` must already contain everything Speech2Text needs.

    Return:
        : Speech2Text instance.

    """
    if model_tag is not None:
        # espnet_model_zoo is an optional dependency; fail loudly with a hint.
        try:
            from espnet_model_zoo.downloader import ModelDownloader
        except ImportError:
            logging.error(
                "`espnet_model_zoo` is not installed. "
                "Please install via `pip install -U espnet_model_zoo`.")
            raise
        downloader = ModelDownloader()
        kwargs.update(**downloader.download_and_unpack(model_tag))
    return Speech2Text(**kwargs)
def __init__(self, model_name, trans_df):
    """Download the ASR model named ``model_name`` and set up WER bookkeeping.

    Args:
        model_name: espnet_model_zoo tag of the ASR model to evaluate with.
        trans_df: Transcription table; converted to a dict via _df_to_dict.
    """
    from espnet2.bin.asr_inference import Speech2Text
    from espnet_model_zoo.downloader import ModelDownloader
    import jiwer

    self.model_name = model_name
    downloader = ModelDownloader()
    self.asr_model = Speech2Text(**downloader.download_and_unpack(model_name))
    # Sampling rate comes from the model-zoo metadata row for this model.
    self.sample_rate = int(
        downloader.data_frame[downloader.data_frame["name"] == model_name]["fs"])

    # Accumulators filled during evaluation.
    self.input_txt_list = []
    self.clean_txt_list = []
    self.output_txt_list = []
    self.transcriptions = []
    self.true_txt_list = []

    self.trans_df = trans_df
    self.trans_dic = self._df_to_dict(trans_df)

    self.mix_counter = Counter()
    self.clean_counter = Counter()
    self.est_counter = Counter()

    # Text normalization pipeline applied before WER computation.
    self.transformation = jiwer.Compose([
        jiwer.ToLowerCase(),
        jiwer.RemovePunctuation(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
        jiwer.SentencesToListOfWords(),
        jiwer.RemoveEmptyStrings(),
    ])
def _asr(model_name):
    """Smoke-test ASR inference: the model must transcribe silence to a str."""
    downloader = ModelDownloader("downloads")
    speech2text = Speech2Text(
        **downloader.download_and_unpack(model_name, quiet=True))
    silence = np.zeros(10000, dtype=np.float32)
    best_hypothesis = speech2text(silence)[0]
    text = best_hypothesis[0]
    assert isinstance(text, str)
def get_text2speech(
    model_name='kan-bayashi/jsut_tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_train.loss.best'
):
    """Download the given TTS model tag and return a CUDA Text2Speech instance."""
    return Text2Speech(
        **ModelDownloader().download_and_unpack(model_name), device='cuda')
def _tts(model_name):
    """Smoke-test TTS inference on a short text (with dummy speech if required)."""
    downloader = ModelDownloader()
    text2speech = Text2Speech(**downloader.download_and_unpack(model_name))
    dummy_speech = np.zeros(10000, dtype=np.float32)
    if not text2speech.use_speech:
        text2speech("foo")
    else:
        # Some models (e.g. GST-based) additionally require reference speech.
        text2speech("foo", speech=dummy_speech)
def from_pretrained(
    model_tag: Optional[str] = None,
    vocoder_tag: Optional[str] = None,
    **kwargs: Optional[Any],
):
    """Create a Text2Speech instance from pretrained model/vocoder tags.

    Args:
        model_tag (Optional[str]): espnet_model_zoo tag of the pretrained
            TTS model to download and unpack.
        vocoder_tag (Optional[str]): Tag of a pretrained vocoder. Only
            parallel_wavegan vocoders are supported; the tag must start
            with the prefix "parallel_wavegan/".

    Returns:
        Text2Speech: Text2Speech instance.

    """
    if model_tag is not None:
        # espnet_model_zoo is an optional dependency; fail loudly with a hint.
        try:
            from espnet_model_zoo.downloader import ModelDownloader
        except ImportError:
            logging.error(
                "`espnet_model_zoo` is not installed. "
                "Please install via `pip install -U espnet_model_zoo`.")
            raise
        downloader = ModelDownloader()
        kwargs.update(**downloader.download_and_unpack(model_tag))

    if vocoder_tag is not None:
        # Guard clause: anything but a parallel_wavegan tag is unsupported.
        if not vocoder_tag.startswith("parallel_wavegan/"):
            raise ValueError(f"{vocoder_tag} is unsupported format.")
        try:
            from parallel_wavegan.utils import download_pretrained_model
        except ImportError:
            logging.error(
                "`parallel_wavegan` is not installed. "
                "Please install via `pip install -U parallel_wavegan`."
            )
            raise
        from parallel_wavegan import __version__

        # NOTE(kan-bayashi): Filelock download is supported from 0.5.2
        assert LooseVersion(__version__) > LooseVersion("0.5.1"), (
            "Please install the latest parallel_wavegan "
            "via `pip install -U parallel_wavegan`.")
        pwg_tag = vocoder_tag.replace("parallel_wavegan/", "")
        vocoder_file = download_pretrained_model(pwg_tag)
        # The vocoder config lives next to the downloaded checkpoint.
        vocoder_config = Path(vocoder_file).parent / "config.yml"
        kwargs.update(vocoder_config=vocoder_config, vocoder_file=vocoder_file)

    return Text2Speech(**kwargs)
def setup_model(self):
    """Download and initialize nltk, the espnet TTS model, and the vocoder.

    Reads self.tag / self.vocoder_tag, writes self.mlDevice, self.text2speech
    and self.vocoder. On any failure, reports via self.output_err and raises
    HandledException.
    """
    try:
        self.model_reload_needed = False
        self.output_status("Loading nltk...")
        # setup nltk: tokenizer data is cached under MODEL_DIR/nltk_models
        import nltk
        nltk.data.path.append(MODEL_DIR + '/nltk_models')
        # Download punkt only if it is not already cached.
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt', download_dir=MODEL_DIR + "/nltk_models")
        self.output_status("Loading torch...", end=" ")
        # setup model: heavy imports are done lazily inside this method
        import torch
        from espnet_model_zoo.downloader import ModelDownloader
        from espnet2.bin.tts_inference import Text2Speech
        from parallel_wavegan.utils import download_pretrained_model
        from parallel_wavegan.utils import load_model
        # Prefer GPU when available.
        self.mlDevice = "cuda" if torch.cuda.is_available() else "cpu"
        self.output_status("Running on " + self.mlDevice)
        self.output_status("Loading espnet...")
        d = ModelDownloader(MODEL_DIR + "/espnet_models")
        self.text2speech = Text2Speech(
            **d.download_and_unpack(self.tag),
            device=self.mlDevice,
            # Only for Tacotron 2
            threshold=0.5,
            minlenratio=0.0,
            maxlenratio=10.0,
            use_att_constraint=False,
            backward_window=1,
            forward_window=3,
            # Only for FastSpeech & FastSpeech2
            speed_control_alpha=1.0,
        )
        self.text2speech.spc2wav = None  # Disable griffin-lim
        # NOTE: Sometimes download is failed due to "Permission denied". That is
        # the limitation of google drive. Please retry after serveral hours.
        self.output_status("Loading vocoder models...")
        self.vocoder = load_model(
            download_pretrained_model(
                self.vocoder_tag,
                download_dir=MODEL_DIR + "/vocoder_models")).to(
                    self.mlDevice).eval()
        self.vocoder.remove_weight_norm()
        self.output_status("Model setup completed.")
    except Exception as e:
        # Surface the error to the UI, then re-raise as the app's own marker
        # exception so callers know it was already reported.
        self.output_err("Model error", e)
        raise HandledException()
def _tts(model_name):
    """Smoke-test TTS inference, supplying dummy speech/spembs when required."""
    downloader = ModelDownloader("downloads")
    text2speech = Text2Speech(
        **downloader.download_and_unpack(model_name, quiet=True))
    call_args = {"text": "foo"}
    # GST-style models require a reference waveform.
    if text2speech.use_speech:
        call_args["speech"] = np.zeros((10000,), dtype=np.float32)
    # Multi-speaker models require a speaker embedding of the right size.
    spk_embed_dim = text2speech.tts.spk_embed_dim
    if spk_embed_dim is not None:
        call_args["spembs"] = np.zeros((spk_embed_dim,), dtype=np.float32)
    text2speech(**call_args)
def get_speech2text():
    """Download the LaboroTV ASR conformer model and return it on GPU."""
    downloader = ModelDownloader()
    # A task/corpus pair also works, e.g.
    # downloader.download_and_unpack(task="asr", corpus="librispeech")
    model_kwargs = downloader.download_and_unpack(
        "Shinji Watanabe/laborotv_asr_train_asr_conformer2_latest33_raw_char_sp_valid.acc.ave"
    )
    return Speech2Text(**model_kwargs, device='cuda')
def main(cmd=None):
    """CLI entry point: download the model, then run inference on a wav.scp."""
    print(get_commandline_args(), file=sys.stderr)
    args = get_parser().parse_args(cmd)

    downloader = ModelDownloader(".cache/espnet")
    unpacked = downloader.download_and_unpack(args.mdl_file)

    # vars() aliases the namespace __dict__, so mutating args mutates kwargs.
    kwargs = vars(args)
    kwargs.update(unpacked)
    kwargs["data_path_and_name_and_type"] = [(args.wav_scp, "speech", "sound")]
    # Strip CLI-only options that inference() does not accept.
    del args.mdl_file
    del args.wav_scp
    kwargs.pop("config", None)
    inference(**kwargs)
def __init__(self, model_name, trans_df):
    """Download the ASR model named ``model_name`` and set up bookkeeping.

    Args:
        model_name: espnet_model_zoo tag of the ASR model to evaluate with.
        trans_df: Transcription table; converted to a dict via _df_to_dict.
    """
    from espnet2.bin.asr_inference import Speech2Text
    from espnet_model_zoo.downloader import ModelDownloader

    self.model_name = model_name
    downloader = ModelDownloader()
    self.asr_model = Speech2Text(**downloader.download_and_unpack(model_name))
    # Sampling rate comes from the model-zoo metadata row for this model.
    self.sample_rate = int(
        downloader.data_frame[downloader.data_frame["name"] == model_name]["fs"])

    # Accumulators filled during evaluation.
    self.input_txt_list = []
    self.clean_txt_list = []
    self.output_txt_list = []

    self.trans_df = trans_df
    self.trans_dic = self._df_to_dict(trans_df)

    self.mix_counter = Counter()
    self.clean_counter = Counter()
    self.est_counter = Counter()
class TTS_Worker:
    """Worker that synthesizes text to wav files with a fixed LJSpeech model.

    Loads a Tacotron 2 acoustic model plus a ParallelWaveGAN vocoder on CPU
    at construction time, then renders text via process_text().
    """

    def __init__(self, worker_id=1, audio_dest='audio/', audio_format='.wav'):
        # Identifier used only for log messages.
        self.id = worker_id
        #Model selection
        self.fs = 22050  # output sampling rate written to the wav file
        self.lang = "English"
        self.tag = "kan-bayashi/ljspeech_tacotron2"
        self.vocoder_tag = "ljspeech_parallel_wavegan.v1"
        #Model setup (downloads on first use)
        self.d = ModelDownloader()
        self.text2speech = Text2Speech(
            **self.d.download_and_unpack(self.tag),
            device="cpu",
            # Only for Tacotron 2
            threshold=0.5,
            minlenratio=0.0,
            maxlenratio=10.0,
            use_att_constraint=False,
            backward_window=1,
            forward_window=3,
        )
        self.vocoder = load_model(download_pretrained_model(
            self.vocoder_tag)).to("cpu").eval()
        # Disable griffin-lim; the neural vocoder produces the waveform.
        self.text2speech.spc2wav = None
        self.vocoder.remove_weight_norm()
        # Output location: audio_dest + <dest> + audio_format.
        self.audio_d = audio_dest
        self.audio_f = audio_format

    def process_text(self, text, dest):
        """Synthesize ``text`` and write it to ``audio_dest + dest + audio_format``."""
        print(f'Worker {self.id} attempting : {text}')
        with torch.no_grad():
            #start = time.time()
            # Acoustic model yields a mel spectrogram c; vocoder turns it to audio.
            wav, c, *_ = self.text2speech(text)
            wav = self.vocoder.inference(c)
            #rtf = (time.time() - start) / (len(wav) / self.fs)
            #print(f"RTF = {rtf:5f}")
            #Output generation
            wav = wav.view(-1).cpu().numpy()
            sfwrite(self.audio_d + dest + self.audio_f, wav, self.fs)
        print(f'Worker {self.id} finished : {text}')
def test_download_and_clean_cache():
    """clean_cache() must remove the previously downloaded archive."""
    downloader = ModelDownloader()
    downloader.download_and_unpack("test")
    cached = downloader.download("test")
    downloader.clean_cache("test")
    assert not Path(cached).exists()
def test_download_and_unpack_local_file():
    """download_and_unpack() must also accept a local archive path."""
    downloader = ModelDownloader()
    local_path = downloader.download("test")
    downloader.download_and_unpack(local_path)
def test_download_and_unpack_non_matching():
    """A query matching no model must raise RuntimeError."""
    downloader = ModelDownloader()
    with pytest.raises(RuntimeError):
        downloader.download_and_unpack(task="dummy")
def test_download_and_unpack_no_inputting():
    """Calling without any selector must raise TypeError."""
    downloader = ModelDownloader()
    with pytest.raises(TypeError):
        downloader.download_and_unpack()
def test_download_and_unpack_with_name():
    """download_and_unpack() must accept a model name string."""
    ModelDownloader().download_and_unpack("test")
def test_download_and_unpack_with_url():
    """download_and_unpack() must accept a direct download URL."""
    url = "https://zenodo.org/record/3951842/files/test.zip?download=1"
    ModelDownloader().download_and_unpack(url)
fs, lang = 22050, "English" tag = "kan-bayashi/ljspeech_conformer_fastspeech2" vocoder_tag = "ljspeech_full_band_melgan.v2" import time import os import torch import soundfile as sf from espnet_model_zoo.downloader import ModelDownloader from espnet2.bin.tts_inference import Text2Speech from parallel_wavegan.utils import download_pretrained_model from parallel_wavegan.utils import load_model d = ModelDownloader() text2speech = Text2Speech( **d.download_and_unpack(tag), device="cuda", speed_control_alpha=1.0, ) text2speech.spc2wav = None # Disable griffin-lim vocoder = load_model(download_pretrained_model(vocoder_tag)).to("cuda").eval() vocoder.remove_weight_norm() while True: conn, addr = s.accept() data = conn.recv(1024) encoding = 'utf-8' data = str(data, encoding) conn.close() # synthesis with torch.no_grad():
import Levenshtein import json import warnings from pathlib import Path import hydra.utils as utils import matplotlib.pyplot as plt import pickle import seaborn as sns from utils import normalize sns.set() warnings.simplefilter('ignore') d = ModelDownloader() speech2text_en = Speech2Text( **d.download_and_unpack(task="asr", corpus="librispeech") ) def log_spec_dB_dist(x, y): log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0) diff = x - y return log_spec_dB_const * math.sqrt(np.inner(diff, diff)) def average_mcd(mc_ref, mc_cv): _, path = fastdtw(mc_cv, mc_ref, dist=euclidean) twf = np.array(path).T cvt_mcc_dtw = mc_cv[twf[0]] trg_mcc_dtw = mc_ref[twf[1]] # MCD diff2sum = np.sum((cvt_mcc_dtw - trg_mcc_dtw)**2, 1) mcd_value = np.mean(10.0 / np.log(10.0) * np.sqrt(2 * diff2sum), 0)