Пример #1
0
    def from_pretrained(
        model_tag: Optional[str] = None,
        **kwargs: Optional[Any],
    ):
        """Build a SeparateSpeech instance from a pretrained model.

        Args:
            model_tag (Optional[str]): Model tag of the pretrained models.
                Currently, the tags of espnet_model_zoo are supported.

        Returns:
            SeparateSpeech: SeparateSpeech instance.

        """
        if model_tag is None:
            # Caller supplied all constructor arguments explicitly.
            return SeparateSpeech(**kwargs)

        # Lazy import so espnet_model_zoo stays an optional dependency.
        try:
            from espnet_model_zoo.downloader import ModelDownloader
        except ImportError:
            logging.error(
                "`espnet_model_zoo` is not installed. "
                "Please install via `pip install -U espnet_model_zoo`.")
            raise

        downloaded = ModelDownloader().download_and_unpack(model_tag)
        kwargs.update(**downloaded)
        return SeparateSpeech(**kwargs)
Пример #2
0
    def from_pretrained(
        model_tag: Optional[str] = None,
        **kwargs: Optional[Any],
    ) -> Speech2Text:
        """Build a Speech2Text instance from a pretrained model.

        Args:
            model_tag: Model tag of the pretrained models.

        Return:
            : Speech2Text instance.

        """
        if model_tag is None:
            # Nothing to download; construct straight from kwargs.
            return Speech2Text(**kwargs)

        # Lazy import keeps espnet_model_zoo optional at import time.
        try:
            from espnet_model_zoo.downloader import ModelDownloader
        except ImportError:
            logging.error(
                "`espnet_model_zoo` is not installed. "
                "Please install via `pip install -U espnet_model_zoo`.")
            raise

        kwargs.update(**ModelDownloader().download_and_unpack(model_tag))
        return Speech2Text(**kwargs)
Пример #3
0
def _asr(model_name):
    """Smoke-test ASR inference for one pretrained model from the zoo."""
    downloader = ModelDownloader("downloads")
    model = Speech2Text(**downloader.download_and_unpack(model_name, quiet=True))
    # 10k samples of silence are enough to exercise the whole pipeline.
    silence = np.zeros(10000, dtype=np.float32)
    top_hypothesis = model(silence)[0]
    text = top_hypothesis[0]
    assert isinstance(text, str)
Пример #4
0
    def __init__(self, model_name, trans_df):
        """Set up an ASR model, WER bookkeeping, and jiwer normalization.

        Args:
            model_name: espnet_model_zoo model name to download and load.
            trans_df: Transcription table, converted to a dict via
                ``self._df_to_dict`` (defined elsewhere in the class).
        """
        # Lazy imports: only needed when this evaluator is instantiated.
        from espnet2.bin.asr_inference import Speech2Text
        from espnet_model_zoo.downloader import ModelDownloader
        import jiwer

        self.model_name = model_name
        d = ModelDownloader()
        self.asr_model = Speech2Text(**d.download_and_unpack(model_name))
        # Accumulators for per-utterance transcripts.
        self.input_txt_list = []
        self.clean_txt_list = []
        self.output_txt_list = []
        self.transcriptions = []
        self.true_txt_list = []
        # Sampling rate comes from the zoo metadata table ("fs" column).
        self.sample_rate = int(
            d.data_frame[d.data_frame["name"] == model_name]["fs"])
        self.trans_df = trans_df
        self.trans_dic = self._df_to_dict(trans_df)
        # Word-level error counters for mixture / clean / estimate signals.
        self.mix_counter = Counter()
        self.clean_counter = Counter()
        self.est_counter = Counter()
        # Text normalization applied before WER computation.
        # NOTE(review): jiwer.SentencesToListOfWords was removed in newer
        # jiwer releases — confirm the pinned jiwer version provides it.
        self.transformation = jiwer.Compose([
            jiwer.ToLowerCase(),
            jiwer.RemovePunctuation(),
            jiwer.RemoveMultipleSpaces(),
            jiwer.Strip(),
            jiwer.SentencesToListOfWords(),
            jiwer.RemoveEmptyStrings(),
        ])
Пример #5
0
    def __init__(self, worker_id=1, audio_dest='audio/', audio_format='.wav'):
        """Download and set up a CPU Tacotron 2 TTS model and vocoder.

        Args:
            worker_id: Identifier used in log output.
            audio_dest: Directory prefix for generated audio files.
            audio_format: Extension appended to output file names.
        """
        self.id = worker_id
        # Model selection
        self.fs = 22050  # output sampling rate used when writing audio
        self.lang = "English"
        self.tag = "kan-bayashi/ljspeech_tacotron2"
        self.vocoder_tag = "ljspeech_parallel_wavegan.v1"

        # Model setup
        self.d = ModelDownloader()
        self.text2speech = Text2Speech(
            **self.d.download_and_unpack(self.tag),
            device="cpu",
            # Only for Tacotron 2
            threshold=0.5,
            minlenratio=0.0,
            maxlenratio=10.0,
            use_att_constraint=False,
            backward_window=1,
            forward_window=3,
        )
        self.vocoder = load_model(download_pretrained_model(
            self.vocoder_tag)).to("cpu").eval()

        # Disable griffin-lim; the neural vocoder produces the waveform.
        self.text2speech.spc2wav = None
        self.vocoder.remove_weight_norm()

        self.audio_d = audio_dest
        self.audio_f = audio_format
def get_text2speech(
    model_name='kan-bayashi/jsut_tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_train.loss.best'
):
    """Download the given TTS model and build a CUDA Text2Speech engine."""
    downloader = ModelDownloader()
    model_files = downloader.download_and_unpack(model_name)
    return Text2Speech(device='cuda', **model_files)
Пример #7
0
def _tts(model_name):
    """Smoke-test TTS inference for one pretrained model from the zoo."""
    model_files = ModelDownloader().download_and_unpack(model_name)
    synthesizer = Text2Speech(**model_files)
    dummy_speech = np.zeros(10000, dtype=np.float32)
    # Some models condition on a reference speech signal.
    if synthesizer.use_speech:
        synthesizer("foo", speech=dummy_speech)
    else:
        synthesizer("foo")
Пример #8
0
    def from_pretrained(
        model_tag: Optional[str] = None,
        vocoder_tag: Optional[str] = None,
        **kwargs: Optional[Any],
    ):
        """Build Text2Speech instance from the pretrained model.

        Args:
            model_tag (Optional[str]): Model tag of the pretrained models.
                Currently, the tags of espnet_model_zoo are supported.
            vocoder_tag (Optional[str]): Vocoder tag of the pretrained vocoders.
                Currently, the tags of parallel_wavegan are supported, which should
                start with the prefix "parallel_wavegan/".

        Returns:
            Text2Speech: Text2Speech instance.

        Raises:
            ValueError: If ``vocoder_tag`` does not start with
                "parallel_wavegan/".
            ImportError: If a required optional dependency is missing.

        """
        if model_tag is not None:
            # Lazy import: espnet_model_zoo is only needed when a tag is given.
            try:
                from espnet_model_zoo.downloader import ModelDownloader

            except ImportError:
                logging.error(
                    "`espnet_model_zoo` is not installed. "
                    "Please install via `pip install -U espnet_model_zoo`.")
                raise
            d = ModelDownloader()
            # download_and_unpack yields constructor kwargs (config/model
            # file paths) for Text2Speech.
            kwargs.update(**d.download_and_unpack(model_tag))

        if vocoder_tag is not None:
            if vocoder_tag.startswith("parallel_wavegan/"):
                try:
                    from parallel_wavegan.utils import download_pretrained_model

                except ImportError:
                    logging.error(
                        "`parallel_wavegan` is not installed. "
                        "Please install via `pip install -U parallel_wavegan`."
                    )
                    raise

                from parallel_wavegan import __version__

                # NOTE(kan-bayashi): Filelock download is supported from 0.5.2
                assert LooseVersion(__version__) > LooseVersion("0.5.1"), (
                    "Please install the latest parallel_wavegan "
                    "via `pip install -U parallel_wavegan`.")
                # Strip the namespace prefix; the rest is the raw vocoder name.
                vocoder_tag = vocoder_tag.replace("parallel_wavegan/", "")
                vocoder_file = download_pretrained_model(vocoder_tag)
                # The vocoder config sits next to the downloaded checkpoint.
                vocoder_config = Path(vocoder_file).parent / "config.yml"
                kwargs.update(vocoder_config=vocoder_config,
                              vocoder_file=vocoder_file)

            else:
                raise ValueError(f"{vocoder_tag} is unsupported format.")

        return Text2Speech(**kwargs)
Пример #9
0
def _tts(model_name):
    """Smoke-test TTS inference, feeding optional speech/spembs inputs."""
    downloader = ModelDownloader("downloads")
    model = Text2Speech(**downloader.download_and_unpack(model_name, quiet=True))
    call_kwargs = {"text": "foo"}
    # Reference speech is only required by some model types.
    if model.use_speech:
        call_kwargs["speech"] = np.zeros(10000, dtype=np.float32)
    # Multi-speaker models additionally take a speaker embedding.
    spk_embed_dim = model.tts.spk_embed_dim
    if spk_embed_dim is not None:
        call_kwargs["spembs"] = np.zeros(spk_embed_dim, dtype=np.float32)
    model(**call_kwargs)
Пример #10
0
    def setup_model(self):
        """Load NLTK data, the espnet TTS model, and the vocoder.

        Reads ``self.tag`` and ``self.vocoder_tag`` (set elsewhere); sets
        ``self.text2speech``, ``self.vocoder``, ``self.mlDevice``, and
        ``self.model_reload_needed``. Progress is reported through
        ``self.output_status``.

        Raises:
            HandledException: On any failure, after reporting the original
                error via ``self.output_err``.
        """
        try:
            self.model_reload_needed = False
            self.output_status("Loading nltk...")

            # setup nltk
            import nltk
            nltk.data.path.append(MODEL_DIR + '/nltk_models')
            try:
                nltk.data.find('tokenizers/punkt')
            except LookupError:
                # Tokenizer data is missing; fetch it into our model dir.
                nltk.download('punkt', download_dir=MODEL_DIR + "/nltk_models")

            self.output_status("Loading torch...", end=" ")

            # setup model — imports are lazy so failures land in this try.
            import torch
            from espnet_model_zoo.downloader import ModelDownloader
            from espnet2.bin.tts_inference import Text2Speech
            from parallel_wavegan.utils import download_pretrained_model
            from parallel_wavegan.utils import load_model

            self.mlDevice = "cuda" if torch.cuda.is_available() else "cpu"
            self.output_status("Running on " + self.mlDevice)

            self.output_status("Loading espnet...")

            d = ModelDownloader(MODEL_DIR + "/espnet_models")
            self.text2speech = Text2Speech(
                **d.download_and_unpack(self.tag),
                device=self.mlDevice,
                # Only for Tacotron 2
                threshold=0.5,
                minlenratio=0.0,
                maxlenratio=10.0,
                use_att_constraint=False,
                backward_window=1,
                forward_window=3,
                # Only for FastSpeech & FastSpeech2
                speed_control_alpha=1.0,
            )
            self.text2speech.spc2wav = None  # Disable griffin-lim
            # NOTE: Sometimes the download fails with "Permission denied". That
            #   is a limitation of Google Drive. Please retry after several hours.

            self.output_status("Loading vocoder models...")

            self.vocoder = load_model(
                download_pretrained_model(self.vocoder_tag,
                                          download_dir=MODEL_DIR +
                                          "/vocoder_models")).to(
                                              self.mlDevice).eval()
            self.vocoder.remove_weight_norm()
            self.output_status("Model setup completed.")
        except Exception as e:
            # Boundary handler: report, then convert to the app's own type.
            self.output_err("Model error", e)
            raise HandledException()
def get_speech2text():
    """Build a CUDA Speech2Text engine from the LaboroTV conformer model."""
    downloader = ModelDownloader()
    # A task/corpus query would also work, e.g.:
    #   downloader.download_and_unpack(task="asr", corpus="librispeech")
    model_files = downloader.download_and_unpack(
        "Shinji Watanabe/laborotv_asr_train_asr_conformer2_latest33_raw_char_sp_valid.acc.ave"
    )
    return Speech2Text(device='cuda', **model_files)
Пример #12
0
def test_model():
    """Run the smoke tests over every valid ASR and TTS model in the zoo."""
    downloader = ModelDownloader()
    # Dispatch table replaces the if/elif chain; order matches ["asr", "tts"].
    runners = {"asr": _asr, "tts": _tts}

    for task, runner in runners.items():
        for model_name in downloader.query(task=task):
            # Skip models the zoo has flagged as invalid.
            if downloader.query("valid", name=model_name)[0] == "false":
                continue
            print(f"#### Test {model_name} ####")
            runner(model_name)
Пример #13
0
def main(cmd=None):
    """Download the model named on the command line and run inference.

    Args:
        cmd: Optional argument list; ``None`` means use ``sys.argv``.
    """
    print(get_commandline_args(), file=sys.stderr)
    parser = get_parser()
    args = parser.parse_args(cmd)

    d = ModelDownloader(".cache/espnet")
    o = d.download_and_unpack(args.mdl_file)
    # vars(args) aliases args.__dict__, so the `del args.*` statements
    # below also remove those keys from kwargs before inference().
    kwargs = vars(args)
    kwargs.update(o)
    kwargs.update(
        {'data_path_and_name_and_type': [(args.wav_scp, 'speech', 'sound')]})
    del args.mdl_file
    del args.wav_scp

    # NOTE(review): presumably drops a key that inference() does not
    # accept as a keyword argument — confirm against inference's signature.
    kwargs.pop("config", None)
    inference(**kwargs)
Пример #14
0
    def __init__(self, model_name, trans_df):
        """Set up an ASR model and WER bookkeeping for evaluation.

        Args:
            model_name: espnet_model_zoo model name to download and load.
            trans_df: Transcription table, converted to a dict via
                ``self._df_to_dict`` (defined elsewhere in the class).
        """
        # Lazy imports: only needed when this evaluator is instantiated.
        from espnet2.bin.asr_inference import Speech2Text
        from espnet_model_zoo.downloader import ModelDownloader

        self.model_name = model_name
        d = ModelDownloader()
        self.asr_model = Speech2Text(**d.download_and_unpack(model_name))
        # Accumulators for per-utterance transcripts.
        self.input_txt_list = []
        self.clean_txt_list = []
        self.output_txt_list = []
        # Sampling rate comes from the zoo metadata table ("fs" column).
        self.sample_rate = int(
            d.data_frame[d.data_frame["name"] == model_name]["fs"])
        self.trans_df = trans_df
        self.trans_dic = self._df_to_dict(trans_df)
        # Word-level error counters for mixture / clean / estimate signals.
        self.mix_counter = Counter()
        self.clean_counter = Counter()
        self.est_counter = Counter()
Пример #15
0
def test_model():
    """Smoke-test every valid model, corpus by corpus, pruning the cache."""
    downloader = ModelDownloader()

    for task in ("asr", "tts"):
        for corpus in set(downloader.query("corpus", task=task)):
            for model_name in downloader.query(task=task, corpus=corpus):
                # Skip models the zoo has flagged as invalid.
                if downloader.query("valid", name=model_name)[0] == "false":
                    continue
                print(f"#### Test {model_name} ####")

                if task == "asr":
                    _asr(model_name)
                elif task == "tts":
                    _tts(model_name)
                else:
                    raise NotImplementedError(f"task={task}")

            # NOTE(kan-bayashi): remove and recreate cache dir to reduce the disk usage.
            shutil.rmtree("downloads")
            os.makedirs("downloads")
Пример #16
0
class TTS_Worker:
    """CPU text-to-speech worker combining a Tacotron 2 acoustic model with
    a Parallel WaveGAN vocoder, writing generated audio to disk."""

    def __init__(self, worker_id=1, audio_dest='audio/', audio_format='.wav'):
        """Download and set up the TTS model and vocoder.

        Args:
            worker_id: Identifier used in log output.
            audio_dest: Directory prefix for generated audio files.
            audio_format: Extension appended to output file names.
        """
        self.id = worker_id
        # Model selection
        self.fs = 22050  # output sampling rate used when writing audio
        self.lang = "English"
        self.tag = "kan-bayashi/ljspeech_tacotron2"
        self.vocoder_tag = "ljspeech_parallel_wavegan.v1"

        # Model setup
        self.d = ModelDownloader()
        self.text2speech = Text2Speech(
            **self.d.download_and_unpack(self.tag),
            device="cpu",
            # Only for Tacotron 2
            threshold=0.5,
            minlenratio=0.0,
            maxlenratio=10.0,
            use_att_constraint=False,
            backward_window=1,
            forward_window=3,
        )
        self.vocoder = load_model(download_pretrained_model(
            self.vocoder_tag)).to("cpu").eval()

        # Disable griffin-lim; the neural vocoder produces the waveform.
        self.text2speech.spc2wav = None
        self.vocoder.remove_weight_norm()

        self.audio_d = audio_dest
        self.audio_f = audio_format

    def process_text(self, text, dest):
        """Synthesize ``text`` and write the wav to audio_dest + dest + ext.

        Args:
            text: Input text to synthesize.
            dest: Output file stem (no directory prefix or extension).
        """
        print(f'Worker {self.id} attempting : {text}')
        with torch.no_grad():
            #start = time.time()
            wav, c, *_ = self.text2speech(text)  # c: features fed to the vocoder
            wav = self.vocoder.inference(c)
        #rtf = (time.time() - start) / (len(wav) / self.fs)
        #print(f"RTF = {rtf:5f}")

        # Output generation
        wav = wav.view(-1).cpu().numpy()
        sfwrite(self.audio_d + dest + self.audio_f, wav, self.fs)
        print(f'Worker {self.id} finished : {text}')
Пример #17
0
def create_Readme_file(repo_name, model_name):
    """Generate ``<repo_name>/README.md`` from ``TEMPLATE_Readme.md``.

    Fills the template placeholders (task tags, language, model name, URL,
    user, corpus, recipe task) using metadata queried from the espnet
    model zoo.

    Args:
        repo_name: Destination repository directory (must already exist).
        model_name: Zoo model name, e.g. ``"user/corpus_task_..."``.
    """
    d = ModelDownloader()
    corpus_name = d.query("corpus", name=model_name)[0]
    task_name = d.query("task", name=model_name)[0]
    url_name = d.query("url", name=model_name)[0].split("files/")[0]
    user_name = model_name.split("/")[0]
    # The zoo uses "jp" for Japanese; model cards expect "ja".
    lang_name = d.query("lang", name=model_name)[0].replace("jp", "ja")

    # Context managers close both files even if a replacement fails
    # (the original version leaked both handles).
    with open("TEMPLATE_Readme.md") as template_Readme, \
            open(repo_name + "/README.md", "w") as new_Readme:
        for line in template_Readme:
            if "<add_more_tags>" in line:
                if task_name == "asr":
                    line = line.replace("<add_more_tags>",
                                        "automatic-speech-recognition")
                elif task_name == "tts":
                    line = line.replace("<add_more_tags>", "text-to-speech")
                elif task_name == "enh":
                    line = line.replace("<add_more_tags>",
                                        "speech-enhancement\n- audio-to-audio")
            if "<add_lang>" in line:
                if lang_name == "multilingual":
                    line = line.replace("<add_lang>",
                                        "en\n- zh\n- ja\n- multilingual")
                else:
                    line = line.replace("<add_lang>", lang_name)
            line = line.replace("<add_model_name>", model_name)
            line = line.replace("<add_url>", url_name)
            line = line.replace("<add_name>", user_name)
            line = line.replace("<add_corpus>", corpus_name)
            line = line.replace("<add_task_name>", task_name.upper())
            line = line.replace("<add_recipe_task_name>", task_name.lower() + "1")
            if "<add_tts_reference>" in line:
                if task_name == "tts":
                    # `tts_reference` is a module-level constant defined elsewhere.
                    line = line.replace("<add_tts_reference>", tts_reference)
                else:
                    line = line.replace("<add_tts_reference>", "")
            new_Readme.write(line)
Пример #18
0
    kwargs = vars(args)
    kwargs.pop("config", None)
    inference(**kwargs)


if __name__ == "__main__":
    from parallel_wavegan.utils import download_pretrained_model
    from parallel_wavegan.utils import load_model
    from espnet_model_zoo.downloader import ModelDownloader
    import soundfile as sf
    import librosa
    import numpy as np
    import os
    import kaldiio

    # NOTE(review): this demo script appears truncated here — `spembs` is
    # initialized below but the code that fills and uses it is not visible.
    d = ModelDownloader()
    # tag = 'kan-bayashi/libritts_gst+xvector_transformer'
    # Local checkpoint paths are used directly instead of a zoo tag.
    text2speech = Text2Speech(
        "/nolan/test/espnet/egs2/vctk/tts1/exp/tts_train_gst_fastspeech2_raw_phn_tacotron_g2p_en_no_space/config.yaml",
        "/nolan/test/espnet/egs2/vctk/tts1/exp/tts_train_gst_fastspeech2_raw_phn_tacotron_g2p_en_no_space/train.loss.best.pth",
        device="cuda")
    # text2speech.spc2wav = None
    vocoder = load_model(
        download_pretrained_model("libritts_parallel_wavegan.v1.long")).to(
            "cuda").eval()

    vocoder.remove_weight_norm()
    spembs = None
    # GST models need a reference waveform for the style embedding.
    if text2speech.use_speech:
        speech, fs = sf.read("/nolan/VCTK-Corpus/wav48/p226/p226_001.wav")
        # speech, _ = librosa.load("/nolan/VCTK-Corpus/wav48/p225/p225_001.wav", text2speech.fs)
Пример #19
0
def test_download_and_clean_cache():
    """clean_cache should remove the downloaded package from disk."""
    downloader = ModelDownloader()
    downloader.download_and_unpack("test")
    package_path = downloader.download("test")
    downloader.clean_cache("test")
    assert not Path(package_path).exists()
Пример #20
0
def test_download_and_unpack_local_file():
    """download_and_unpack should accept a local archive path."""
    downloader = ModelDownloader()
    local_path = downloader.download("test")
    downloader.download_and_unpack(local_path)
Пример #21
0
def test_download_and_unpack_non_matching():
    """A query matching no model should raise RuntimeError."""
    downloader = ModelDownloader()
    with pytest.raises(RuntimeError):
        downloader.download_and_unpack(task="dummy")
Пример #22
0
def test_download_and_unpack_no_inputting():
    """Calling download_and_unpack with no arguments should raise TypeError."""
    downloader = ModelDownloader()
    with pytest.raises(TypeError):
        downloader.download_and_unpack()
Пример #23
0
def test_download_and_unpack_with_name():
    """download_and_unpack should resolve a model by name."""
    ModelDownloader().download_and_unpack("test")
Пример #24
0
def test_download_and_unpack_with_url():
    """download_and_unpack should accept a direct download URL."""
    downloader = ModelDownloader()
    downloader.download_and_unpack("https://zenodo.org/record/3951842/files/test.zip?download=1")
Пример #25
0
def test_get_model_names_non_matching():
    """Querying names for an unknown task should yield an empty list."""
    assert ModelDownloader().query("name", task="dummy") == []
Пример #26
0
def test_get_model_names_and_urls():
    """query should accept a list of keys."""
    downloader = ModelDownloader()
    downloader.query(["name", "url"], task="asr")
Пример #27
0
def test_download_and_unpack_names_with_condition():
    """query should accept a single key filtered by task."""
    ModelDownloader().query("name", task="asr")
Пример #28
0
def test_new_cachedir(tmp_path):
    """A fresh directory should be accepted as the cache dir."""
    ModelDownloader(tmp_path)
Пример #29
0
def test_update_model_table(tmp_path):
    """update_model_table should run against a fresh cache dir."""
    downloader = ModelDownloader(tmp_path)
    downloader.update_model_table()
Пример #30
0
def test_get_data_frame():
    """get_data_frame should return without error."""
    ModelDownloader().get_data_frame()