Example #1
def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA',
        vocoder_name=None,
        use_cuda=False):
    """TTS entry point for PyTorch Hub that provides a Synthesizer object to synthesize speech from a give text.

    Example:
        >>> synthesizer = torch.hub.load('coqui-ai/TTS', 'tts', source='github')
        >>> wavs = synthesizer.tts("This is a test! This is also a test!!")
            wavs is a list of amplitude values of the synthesized speech.

    Args:
        model_name (str, optional): One of the model names from .model.json. Defaults to 'tts_models/en/ljspeech/tacotron2-DCA'.
        vocoder_name (str, optional): One of the model names from .model.json. Defaults to 'vocoder_models/en/ljspeech/multiband-melgan'.
        use_cuda (bool, optional): Run the model on CUDA if available. Defaults to False.

    Returns:
        TTS.utils.synthesizer.Synthesizer: Synthesizer object wrapping both vocoder and tts models.
    """
    manager = ModelManager()

    model_path, config_path, model_item = manager.download_model(model_name)
    vocoder_name = (model_item['default_vocoder']
                    if vocoder_name is None else vocoder_name)
    vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)

    # create synthesizer
    synt = Synthesizer(tts_checkpoint=model_path,
                       tts_config_path=config_path,
                       vocoder_checkpoint=vocoder_path,
                       vocoder_config=vocoder_config_path,
                       use_cuda=use_cuda)
    return synt
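
A minimal usage sketch for the hub entry point above (assumptions: the same TTS imports are in scope, the model name is the documented default, and save_wav is the standard Synthesizer method):

# Sketch only: build a synthesizer through the tts() entry point above.
synthesizer = tts(model_name='tts_models/en/ljspeech/tacotron2-DCA',
                  use_cuda=False)
wavs = synthesizer.tts("This is a test!")   # list of amplitude values
synthesizer.save_wav(wavs, "output.wav")    # write the waveform to disk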
Example #2
    def __init__(self):

        manager = ModelManager()
        model_path, config_path, model_item = manager.download_model(MODEL)

        vocoder_path, vocoder_config_path, _ = manager.download_model(
            model_item['default_vocoder'])

        # last arg is use_cuda
        self.synth = Synthesizer(model_path, config_path, vocoder_path,
                                 vocoder_config_path, False)
 def __init__(self, lang="en-us", config=None):
     config = config or get_neon_tts_config().get("mozilla_local", {})
     super(MozillaLocalTTS, self).__init__(lang,
                                           config,
                                           MozillaTTSValidator(self),
                                           audio_ext="mp3",
                                           ssml_tags=["speak"])
     self.manager = ModelManager()
     self.models = self.manager.list_models()
     self.preferred_model = config.get("preferred_model", "tacotron2-DDC")
     self._get_synthesizer(
         lang)  # Make sure we have a model available in init
Example #4
def test_if_all_models_available():
    """Check if all the models are downloadable."""
    print(
        " > Checking the availability of all the models under the ModelManager."
    )
    manager = ModelManager(output_prefix=get_tests_output_path())
    model_names = manager.list_models()
    for model_name in model_names:
        manager.download_model(model_name)
        print(f" | > OK: {model_name}")

    folders = glob.glob(os.path.join(manager.output_prefix, "*"))
    assert len(folders) == len(model_names)
    shutil.rmtree(manager.output_prefix)
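
The same download loop can be reused outside the test harness; a hedged standalone sketch (the output_prefix path is illustrative, not from the original test):

# Hypothetical standalone variant of the loop above; output_prefix is
# an illustrative directory, and download_model returns a 3-tuple here.
manager = ModelManager(output_prefix="/tmp/tts_models_check")
for model_name in manager.list_models():
    model_path, config_path, model_item = manager.download_model(model_name)
    print(f" | > downloaded {model_name} -> {model_path}")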
Example #5
    def __init__(self, auto_start: bool = True):
        super().__init__(auto_start)

        path = Path(__file__).parent / "../../.venv/lib/python3.8/site-packages/TTS/.models.json"
        manager = ModelManager(path)

        model_path, config_path = manager.download_model(self.model_name)
        vocoder_path, vocoder_config_path = manager.download_model(
            self.vocoder_name)

        self.synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                       vocoder_config_path, self.use_cuda)
Example #6
    def __init__(self):
        path = Path(synthesize.__file__).parent / "../.models.json"
        logger.info(path)
        logger.info("Creating ModelManager")
        self.manager = ModelManager(path)
        logger.info("Downloading model")
        model_path, config_path, _ = self.manager.download_model(
            self.MODEL_NAME)
        logger.info("Downloading vocoder")
        vocoder_path, vocoder_config_path, _ = self.manager.download_model(
            self.VOCODER_NAME)
        logger.info("Finished downloading TTS model & vocoder")
        self.synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                       vocoder_config_path, False)
        self.tts_lock = threading.Lock()
Example #7
def generate():
    if inputbox.get("1.0", "end-1c") == "":
        messagebox.showerror(
            message=
            "TTS will give a division by zero error if the text field is blank."
        )
    else:
        if not os.path.exists('mozilla-tts-output'):
            try:
                os.makedirs('mozilla-tts-output')
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
        generatebutton.config(state="disabled")
        exportbutton.config(state="disabled")
        model_path = None
        config_path = None
        vocoder_path = None
        vocoder_config_path = None
        path = Path(__file__).parent / "TTS/.models.json"
        manager = ModelManager(path)
        model_name = 'tts_models/' + ttsmodelbox.get()
        print(f'model_name is {model_name}')
        # for dev
        #model_path, config_path, model_item = manager.download_model(model_name)
        # for master
        model_path, config_path = manager.download_model(model_name)
        vocoder_name = 'vocoder_models/' + vocodermodelbox.get()
        print(f'vocoder_name is {vocoder_name}')
        # for dev
        #vocoder_path, vocoder_config_path, model_item = manager.download_model(vocoder_name)
        # for master
        vocoder_path, vocoder_config_path = manager.download_model(
            vocoder_name)
        synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                  vocoder_config_path,
                                  cudacheckbutton.instate(['selected']))
        wav = synthesizer.tts(inputbox.get("1.0", "end-1c"))
        synthesizer.save_wav(wav, "mozilla-tts-output/generated.wav")
        playsound("mozilla-tts-output/generated.wav")
        generatebutton.config(state="enabled")
        exportbutton.config(state="enabled")
        print("All done!")
Example #8
class Tts:

    MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DCA"
    VOCODER_NAME = "vocoder_models/en/ljspeech/multiband-melgan"

    def __init__(self):
        path = Path(synthesize.__file__).parent / "../.models.json"
        logger.info("path")
        logger.info("Creating ModelManager")
        self.manager = ModelManager(path)
        logger.info("Downloading model")
        model_path, config_path, _ = self.manager.download_model(
            self.MODEL_NAME)
        logger.info("Downloading vcoder")
        vocoder_path, vocoder_config_path, _ = self.manager.download_model(
            self.VOCODER_NAME)
        logger.info("Finished downloading TTS model & vcoder")
        self.synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                       vocoder_config_path, False)
        self.tts_lock = threading.Lock()

    def synthesize_speech(self, tts: str):
        """
        This is largely copy-pasted from the TTS library (TTS.utils.synthesizer.Synthesizer.save_wav), slightly modified
        to write into a NamedTemporaryFile instead of a regular file on disk.

        :param tts: Text to synthesize
        :return: Speech in a NamedTemporaryFile (wav)
        """
        if not self.tts_lock.acquire(blocking=True, timeout=0.1):
            raise TTSAlreadyProcessingException
        try:
            wav = self.synthesizer.tts(tts)
            wav = np.array(wav)
            wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
            temp_file = NamedTemporaryFile(suffix=".wav")
            scipy.io.wavfile.write(temp_file,
                                   self.synthesizer.output_sample_rate,
                                   wav_norm.astype(np.int16))
            return temp_file
        finally:
            self.tts_lock.release()
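
A hedged usage sketch for the Tts class above: synthesize_speech returns an open NamedTemporaryFile whose backing wav file is deleted once it is closed (the example text is illustrative):

# Illustrative only: read the synthesized wav bytes out of the temp file.
engine = Tts()
wav_file = engine.synthesize_speech("Hello from the temp-file API.")
with open(wav_file.name, "rb") as f:  # read the bytes before closing
    audio_bytes = f.read()
wav_file.close()  # the temporary wav file is removed here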
Example #9
def exportaudio():
    if inputbox.get("1.0", "end-1c") == "":
        messagebox.showerror(
            message=
            "TTS will give a division by zero error if the text field is blank."
        )
        return  # bail out: the code below needs text and an output file
    f = filedialog.asksaveasfile(mode='a',
                                 defaultextension=".wav",
                                 filetypes=[("Wave files", ".wav")])
    if f is None:  # asksaveasfile returns `None` if the dialog is closed with "cancel".
        return
    generatebutton.config(state="disabled")
    exportbutton.config(state="disabled")
    model_path = None
    config_path = None
    vocoder_path = None
    vocoder_config_path = None
    path = Path(__file__).parent / "TTS/.models.json"
    manager = ModelManager(path)
    model_name = 'tts_models/' + ttsmodelbox.get()
    print(f'model_name is {model_name}')
    # for dev
    #model_path, config_path, model_item = manager.download_model(model_name)
    # for master
    model_path, config_path = manager.download_model(model_name)
    vocoder_name = 'vocoder_models/' + vocodermodelbox.get()
    print(f'vocoder_name is {vocoder_name}')
    # for dev
    #vocoder_path, vocoder_config_path, model_item = manager.download_model(vocoder_name)
    # for master
    vocoder_path, vocoder_config_path = manager.download_model(vocoder_name)
    synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                              vocoder_config_path,
                              cudacheckbutton.instate(['selected']))
    wav = synthesizer.tts(inputbox.get("1.0", "end-1c"))
    synthesizer.save_wav(wav, str(f.name))
    generatebutton.config(state="enabled")
    exportbutton.config(state="enabled")
    print("All done!")
Example #10
def download_tts_model_and_vocoder():
    path = Path(synthesize.__file__).parent / "../.models.json"
    manager = ModelManager(path)
    logger.info("Downloading model")
    manager.download_model(Tts.MODEL_NAME)
    logger.info("Downloading vcoder")
    manager.download_model(Tts.VOCODER_NAME)
    logger.info("Finished downloading TTS model & vocoder")
Example #11
def make_synthesizer(model_name, use_cuda):
    # load model manager
    path = Path(TTS.__file__).parent / ".models.json"
    manager = ModelManager(path)

    model_path, config_path, model_item = manager.download_model(model_name)
    vocoder_name = model_item["default_vocoder"]
    vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)

    speakers_file_path = None
    encoder_path = None
    encoder_config_path = None

    return Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        use_cuda,
    )
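
A short usage sketch for make_synthesizer (the model name is one of the catalog entries used elsewhere on this page; the output path is illustrative):

# Sketch: build a synthesizer for a catalog model and write one utterance.
synth = make_synthesizer("tts_models/en/ljspeech/tacotron2-DDC", use_cuda=False)
wav = synth.tts("This is an example.")
synth.save_wav(wav, "output.wav")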
Example #12
def test_run_all_models():
    """Check if all the models are downloadable and tts models run correctly."""
    print(" > Run synthesizer with all the models.")
    download_dir = get_user_data_dir("tts")
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    manager = ModelManager(output_prefix=get_tests_output_path())
    model_names = manager.list_models()
    for model_name in model_names:
        print(f"\n > Run - {model_name}")
        model_path, _, _ = manager.download_model(model_name)
        if "tts_models" in model_name:
            local_download_dir = os.path.dirname(model_path)
            # download and run the model
            speaker_files = glob.glob(local_download_dir + "/speaker*")
            language_files = glob.glob(local_download_dir + "/language*")
            language_id = ""
            if len(speaker_files) > 0:
                # multi-speaker model
                if "speaker_ids" in speaker_files[0]:
                    speaker_manager = SpeakerManager(
                        speaker_id_file_path=speaker_files[0])
                elif "speakers" in speaker_files[0]:
                    speaker_manager = SpeakerManager(
                        d_vectors_file_path=speaker_files[0])

                # multi-lingual model - Assuming multi-lingual models are also multi-speaker
                if len(language_files) > 0 and "language_ids" in language_files[0]:
                    language_manager = LanguageManager(
                        language_ids_file_path=language_files[0])
                    language_id = language_manager.language_names[0]

                speaker_id = list(speaker_manager.ids.keys())[0]
                run_cli(
                    f"tts --model_name  {model_name} "
                    f'--text "This is an example." --out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" '
                )
            else:
                # single-speaker model
                run_cli(
                    f"tts --model_name  {model_name} "
                    f'--text "This is an example." --out_path "{output_path}"')
            # remove downloaded models
            shutil.rmtree(download_dir)
        else:
            # only download the model
            manager.download_model(model_name)
        print(f" | > OK: {model_name}")

    folders = glob.glob(os.path.join(manager.output_prefix, "*"))
    assert len(folders) == len(model_names)
    shutil.rmtree(manager.output_prefix)
Example #13
def main():
    description = """Synthesize speech on command line.

You can either use your trained model or choose a model from the provided list.

If you don't specify any models, then it uses an LJSpeech-based English model.

## Example Runs

### Single Speaker Models

- List provided models:

    ```
    $ tts --list_models
    ```

- Query model info by idx:

    ```
    $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
    ```

- Query model info by full name:

    ```
    $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
    ```

- Run TTS with default models:

    ```
    $ tts --text "Text for TTS"
    ```

- Run a TTS model with its default vocoder model:

    ```
    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
    ```

- Run with specific TTS and vocoder models from the list:

    ```
    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
    ```

- Run your own TTS model (Using Griffin-Lim Vocoder):

    ```
    $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
    ```

- Run your own TTS and Vocoder models:
    ```
    $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
    ```

### Multi-speaker Models

- List the available speakers and choose as <speaker_id> among them:

    ```
    $ tts --model_name "<language>/<dataset>/<model_name>"  --list_speaker_idxs
    ```

- Run the multi-speaker TTS model with the target speaker ID:

    ```
    $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>"  --speaker_idx <speaker_id>
    ```

- Run your own multi-speaker TTS model:

    ```
    $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
    ```
    """
    # We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
    # documentation in sync more easily.
    parser = argparse.ArgumentParser(
        description=description.replace("    ```\n", ""),
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained TTS and vocoder models.",
    )

    parser.add_argument(
        "--model_info_by_idx",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<model_query_idx>",
    )

    parser.add_argument(
        "--model_info_by_name",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
    )

    parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help="Name of one of the pre-trained  vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--language_idx",
        type=str,
        help="Target language ID for a multi-lingual TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
    parser.add_argument(
        "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
    )
    parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    parser.add_argument(
        "--list_language_idxs",
        help="List available language ids for the defined multi-lingual model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    # aux args
    parser.add_argument(
        "--save_spectogram",
        type=bool,
        help="If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False,
    )
    parser.add_argument(
        "--reference_wav",
        type=str,
        help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
        default=None,
    )
    parser.add_argument(
        "--reference_speaker_idx",
        type=str,
        help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
        default=None,
    )
    args = parser.parse_args()

    # print the description if either text or list_models is not set
    check_args = [
        args.text,
        args.list_models,
        args.list_speaker_idxs,
        args.list_language_idxs,
        args.reference_wav,
        args.model_info_by_idx,
        args.model_info_by_name,
    ]
    if not any(check_args):
        parser.parse_args(["-h"])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    speakers_file_path = None
    language_ids_file_path = None
    vocoder_path = None
    vocoder_config_path = None
    encoder_path = None
    encoder_config_path = None

    # CASE1 #list : list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2 #info : model info of pre-trained TTS models
    if args.model_info_by_idx:
        model_query = args.model_info_by_idx
        manager.model_info_by_idx(model_query)
        sys.exit()

    if args.model_info_by_name:
        model_query_full_name = args.model_info_by_name
        manager.model_info_by_full_name(model_query_full_name)
        sys.exit()

    # CASE3: load pre-trained model paths
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(args.model_name)
        args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

    # CASE4: set custom model paths
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path
        speakers_file_path = args.speakers_file_path
        language_ids_file_path = args.language_ids_file_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    if args.encoder_path is not None:
        encoder_path = args.encoder_path
        encoder_config_path = args.encoder_config_path

    # load models
    synthesizer = Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        language_ids_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        args.use_cuda,
    )

    # query speaker ids of a multi-speaker model.
    if args.list_speaker_idxs:
        print(
            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
        )
        print(synthesizer.tts_model.speaker_manager.ids)
        return

    # query language ids of a multi-lingual model.
    if args.list_language_idxs:
        print(
            " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
        )
        print(synthesizer.tts_model.language_manager.ids)
        return

    # check the arguments against a multi-speaker model.
    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
        print(
            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
        )
        return

    # RUN THE SYNTHESIS
    if args.text:
        print(" > Text: {}".format(args.text))

    # kick it
    wav = synthesizer.tts(
        args.text,
        args.speaker_idx,
        args.language_idx,
        args.speaker_wav,
        reference_wav=args.reference_wav,
        style_wav=args.capacitron_style_wav,
        style_text=args.capacitron_style_text,
        reference_speaker_name=args.reference_speaker_idx,
    )

    # save the results
    print(" > Saving output to {}".format(args.out_path))
    synthesizer.save_wav(wav, args.out_path)
Example #14
from text import TextProcessor
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
from pydub import AudioSegment
import os
from pathlib import Path
import json
import justpy as jp

path = Path().cwd() / "TTS/TTS/.models.json"
manager = ModelManager(path)
print("> TTS Module Manager Loaded")
with open(path,"rt") as f:
    modelsdict = json.load(f)
tts_models = [f"tts_models/{lang}/{dataset}" for lang in modelsdict["tts_models"] for dataset in modelsdict["tts_models"][lang]]
vocoder_models = [f"vocoder_models/{lang}/{dataset}" for lang in modelsdict["vocoder_models"] for dataset in modelsdict["vocoder_models"][lang]]

tp = TextProcessor()
print("> Text Processor Loaded")

def initsynthesizer(model_name, vocoder_name, use_cuda):
    model_path, config_path = manager.download_model(model_name)
    vocoder_path, vocoder_config_path = manager.download_model(vocoder_name)
    return Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, use_cuda)

def tts(synth, text, out_name):
    wav = synth.tts(text)
    synth.save_wav(wav, out_name + '.wav')
    AudioSegment.from_wav(out_name + '.wav').export(out_name + '.mp3', format="mp3")
    os.remove(out_name + '.wav')
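
A hedged sketch of how the two helpers above compose (the model and vocoder names are simply the first catalog entries parsed from .models.json, and pydub needs ffmpeg available):

# Illustrative composition of the helpers above; requires ffmpeg for pydub.
synth = initsynthesizer(tts_models[0], vocoder_models[0], use_cuda=False)
tts(synth, "Hello world.", "hello")  # writes hello.mp3, removes hello.wav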
Example #15
    epochs=1000,
    text_cleaner="english_cleaners",
    use_phonemes=True,
    use_espeak_phonemes=False,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=50,
    print_eval=False,
    mixed_precision=False,
    sort_by_audio_len=True,
    max_seq_len=500000,
    output_path=output_path,
    datasets=[dataset_config],
)

# compute alignments
if not config.model_args.use_aligner:
    manager = ModelManager()
    model_path, config_path, _ = manager.download_model(
        "tts_models/en/ljspeech/tacotron2-DCA")
    # TODO: make compute_attention python callable
    os.system(
        f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/  --use_cuda true"
    )

# train the model
args, config, output_path, _, c_logger, tb_logger = init_training(
    TrainingArgs(), config)
trainer = Trainer(args, config, output_path, c_logger, tb_logger)
trainer.fit()
Example #16
def main():
    # pylint: disable=bad-continuation
    parser = argparse.ArgumentParser(description='''Synthesize speech on command line.\n\n'''

    '''You can either use your trained model or choose a model from the provided list.\n\n'''\

    '''If you don't specify any models, then it uses an LJSpeech-based English model.\n\n'''\

    '''
    Example runs:

    # list provided models
    ./TTS/bin/synthesize.py --list_models

    # run tts with default models.
    ./TTS/bin synthesize.py --text "Text for TTS"

    # run a tts model with its default vocoder model.
     ./TTS/bin synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"

    # run with specific tts and vocoder models from the list
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path

    # run your own TTS model (Using Griffin-Lim Vocoder)
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav

    # run your own TTS and Vocoder models
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json

    ''',
        formatter_class=RawTextHelpFormatter)

    parser.add_argument(
        '--list_models',
        type=str2bool,
        nargs='?',
        const=True,
        default=False,
        help='list available pre-trained tts and vocoder models.')
    parser.add_argument('--text',
                        type=str,
                        default=None,
                        help='Text to generate speech.')

    # Args for running pre-trained TTS models.
    parser.add_argument(
        '--model_name',
        type=str,
        default="tts_models/en/ljspeech/speedy-speech-wn",
        help=
        'Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>'
    )
    parser.add_argument(
        '--vocoder_name',
        type=str,
        default=None,
        help=
        'Name of one of the pre-trained  vocoder models in format <language>/<dataset>/<model_name>'
    )

    # Args for running custom models
    parser.add_argument('--config_path',
                        default=None,
                        type=str,
                        help='Path to model config file.')
    parser.add_argument(
        '--model_path',
        type=str,
        default=None,
        help='Path to model file.',
    )
    parser.add_argument(
        '--out_path',
        type=str,
        default=Path(__file__).resolve().parent,
        help=
        'Path to save the final wav file. The wav file is named after the given text.',
    )
    parser.add_argument('--use_cuda',
                        type=bool,
                        help='Run model on CUDA.',
                        default=False)
    parser.add_argument(
        '--vocoder_path',
        type=str,
        help=
        'Path to vocoder model file. If it is not defined, the model uses GL as the vocoder. Make sure the vocoder library is installed beforehand (WaveRNN).',
        default=None,
    )
    parser.add_argument('--vocoder_config_path',
                        type=str,
                        help='Path to vocoder model config file.',
                        default=None)

    # args for multi-speaker synthesis
    parser.add_argument('--speakers_json',
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default=None)
    parser.add_argument(
        '--speaker_idx',
        type=str,
        help=
        "if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.",
        default=None)
    parser.add_argument('--gst_style',
                        help="Wav path file for GST stylereference.",
                        default=None)

    # aux args
    parser.add_argument(
        '--save_spectogram',
        type=bool,
        help=
        "If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False)

    args = parser.parse_args()

    # print the description if either text or list_models is not set
    if args.text is None and not args.list_models:
        parser.parse_args(['-h'])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    vocoder_path = None
    vocoder_config_path = None

    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2: load pre-trained models
    if args.model_name is not None:
        model_path, config_path, model_item = manager.download_model(
            args.model_name)
        args.vocoder_name = model_item[
            'default_vocoder'] if args.vocoder_name is None else args.vocoder_name

    if args.vocoder_name is not None:
        vocoder_path, vocoder_config_path, _ = manager.download_model(
            args.vocoder_name)

    # CASE3: load custom models
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    # RUN THE SYNTHESIS
    # load models
    synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                              vocoder_config_path, args.use_cuda)

    print(" > Text: {}".format(args.text))

    # # handle multi-speaker setting
    # if not model_config.use_external_speaker_embedding_file and args.speaker_idx is not None:
    #     if args.speaker_idx.isdigit():
    #         args.speaker_idx = int(args.speaker_idx)
    #     else:
    #         args.speaker_idx = None
    # else:
    #     args.speaker_idx = None

    # if args.gst_style is None:
    #     if 'gst' in model_config.keys() and model_config.gst['gst_style_input'] is not None:
    #         gst_style = model_config.gst['gst_style_input']
    #     else:
    #         gst_style = None
    # else:
    #     # check if gst_style string is a dict, if is dict convert  else use string
    #     try:
    #         gst_style = json.loads(args.gst_style)
    #         if max(map(int, gst_style.keys())) >= model_config.gst['gst_style_tokens']:
    #             raise RuntimeError("The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}".format(max(map(int, gst_style.keys())), model_config.gst['gst_style_tokens']))
    #     except ValueError:
    #         gst_style = args.gst_style

    # kick it
    wav = synthesizer.tts(args.text)

    # save the results
    file_name = args.text.replace(" ", "_")[0:20]
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    print(" > Saving output to {}".format(out_path))
    synthesizer.save_wav(
        wav,
        out_path,
    )
Example #17
def main():
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Synthesize speech on command line.\n\n"""
        """You can either use your trained model or choose a model from the provided list.\n\n"""
        """If you don't specify any models, then it uses LJSpeech based English model.\n\n"""
        """
    # Example Runs:

    ## Single Speaker Models

    - list provided models

    ```
    $ ./TTS/bin/synthesize.py --list_models
    ```

    - run tts with default models.

    ```
    $ ./TTS/bin synthesize.py --text "Text for TTS"
    ```

    - run a tts model with its default vocoder model.

    ```
    $ ./TTS/bin synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>
    ```

    - run with specific tts and vocoder models from the list

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
    ```

    - run your own TTS model (Using Griffin-Lim Vocoder)

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
    ```

    - run your own TTS and Vocoder models
    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
    ```

    ## Multi-speaker Models

    - list the available speakers and choose as <speaker_id> among them.

    ```
    $ ./TTS/bin/synthesize.py --model_name "<language>/<dataset>/<model_name>"  --list_speaker_idxs
    ```

    - run the multi-speaker TTS model with the target speaker ID.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>"  --speaker_idx <speaker_id>
    ```

    - run your own multi-speaker TTS model.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
    ```
    """,
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained tts and vocoder models.",
    )
    parser.add_argument("--text",
                        type=str,
                        default=None,
                        help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help=
        "Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help=
        "Name of one of the pre-trained  vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path",
                        default=None,
                        type=str,
                        help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    parser.add_argument("--use_cuda",
                        type=bool,
                        help="Run model on CUDA.",
                        default=False)
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help=
        "Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path",
                        type=str,
                        help="Path to vocoder model config file.",
                        default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path",
                        type=str,
                        help="Path to speaker encoder config file.",
                        default=None)

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path",
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help=
        "wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The x_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style",
                        help="Wav path file for GST stylereference.",
                        default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    # aux args
    parser.add_argument(
        "--save_spectogram",
        type=bool,
        help=
        "If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False,
    )

    args = parser.parse_args()

    # print the description if either text or list_models is not set
    if args.text is None and not args.list_models and not args.list_speaker_idxs:
        parser.parse_args(["-h"])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    speakers_file_path = None
    vocoder_path = None
    vocoder_config_path = None
    encoder_path = None
    encoder_config_path = None

    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2: load pre-trained model paths
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(
            args.model_name)
        args.vocoder_name = model_item[
            "default_vocoder"] if args.vocoder_name is None else args.vocoder_name

    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(
            args.vocoder_name)

    # CASE3: set custom model paths
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path
        speakers_file_path = args.speakers_file_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    if args.encoder_path is not None:
        encoder_path = args.encoder_path
        encoder_config_path = args.encoder_config_path

    # load models
    synthesizer = Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        args.use_cuda,
    )

    # query speaker ids of a multi-speaker model.
    if args.list_speaker_idxs:
        print(
            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
        )
        print(synthesizer.speaker_manager.speaker_ids)
        return

    # check the arguments against a multi-speaker model.
    if synthesizer.tts_speakers_file and (not args.speaker_idx
                                          and not args.speaker_wav):
        print(
            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
        )
        return

    # RUN THE SYNTHESIS
    print(" > Text: {}".format(args.text))

    # kick it
    wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav)

    # save the results
    print(" > Saving output to {}".format(args.out_path))
    synthesizer.save_wav(wav, args.out_path)
Example #18
        help="path to JSON file containing speaker ids, if speaker ids are used in the model",
    )
    parser.add_argument("--vocoder_config", type=str, default=None, help="path to vocoder config file.")
    parser.add_argument("--vocoder_checkpoint", type=str, default=None, help="path to vocoder checkpoint file.")
    parser.add_argument("--port", type=int, default=5002, help="port to listen on.")
    parser.add_argument("--use_cuda", type=convert_boolean, default=False, help="true to use CUDA.")
    parser.add_argument("--debug", type=convert_boolean, default=False, help="true to enable Flask debug mode.")
    parser.add_argument("--show_details", type=convert_boolean, default=False, help="Generate model detail page.")
    return parser


# parse the args
args = create_argparser().parse_args()

path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path)

if args.list_models:
    manager.list_models()
    sys.exit()

# update in-use models to the specified released models.
if args.model_name is not None:
    tts_checkpoint_file, tts_config_file, tts_json_dict = manager.download_model(args.model_name)
    args.vocoder_name = tts_json_dict["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

if args.vocoder_name is not None:
    vocoder_checkpoint_file, vocoder_config_file, vocoder_json_dict = manager.download_model(args.vocoder_name)

# If these were not specified in the CLI args, use default values with embedded model files
if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file):
Example #19
    parser.add_argument("--debug",
                        type=convert_boolean,
                        default=False,
                        help="true to enable Flask debug mode.")
    parser.add_argument("--show_details",
                        type=convert_boolean,
                        default=False,
                        help="Generate model detail page.")
    return parser


# parse the args
args = create_argparser().parse_args()

path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path)

if args.list_models:
    manager.list_models()
    sys.exit()

# update in-use models to the specified released models.
model_path = None
config_path = None
speakers_file_path = None
vocoder_path = None
vocoder_config_path = None

class MozillaLocalTTS(TTS):
    def __init__(self, lang="en-us", config=None):
        config = config or get_neon_tts_config().get("mozilla_local", {})
        super(MozillaLocalTTS, self).__init__(lang,
                                              config,
                                              MozillaTTSValidator(self),
                                              audio_ext="mp3",
                                              ssml_tags=["speak"])
        self.manager = ModelManager()
        self.models = self.manager.list_models()
        self.preferred_model = config.get("preferred_model", "tacotron2-DDC")
        self._get_synthesizer(
            lang)  # Make sure we have a model available in init

    def get_tts(self, sentence, wav_file, speaker=None):
        stopwatch = Stopwatch()
        speaker = speaker or dict()
        # Read utterance data from passed configuration
        request_lang = speaker.get("language", self.lang)

        to_speak = format_speak_tags(sentence)
        LOG.debug(to_speak)
        if to_speak:
            synthesizer = self._get_synthesizer(request_lang)
            with stopwatch:
                wav_data = synthesizer.tts(sentence)
            LOG.debug(f"Synthesis time={stopwatch.time}")

            with stopwatch:
                synthesizer.save_wav(wav_data, wav_file)
            LOG.debug(f"File access time={stopwatch.time}")
        return wav_file, None

    def _get_synthesizer(self, language) -> Synthesizer:
        if '-' in language:
            language = language.split('-')[0]
        stopwatch = Stopwatch()
        with stopwatch:
            model_name = None

            for model in self.models:
                _, lang, dataset, name = model.split('/')
                print(f"{lang}|{name}")
                if language in lang:
                    model_name = model
                    if name == self.preferred_model:
                        break

            model_path, config_path, model_item = self.manager.download_model(
                model_name)
            vocoder_name = model_item.get(
                "default_vocoder",
                "vocoder_models/universal/libri-tts/fullband-melgan")
            vocoder_path, vocoder_config_path, _ = self.manager.download_model(
                vocoder_name)
            speakers_file_path = ''
            encoder_path = ''
            encoder_config_path = ''
            use_cuda = False

            synthesizer = Synthesizer(
                model_path,
                config_path,
                speakers_file_path,
                vocoder_path,
                vocoder_config_path,
                encoder_path,
                encoder_config_path,
                use_cuda,
            )
        LOG.debug(f"Get synthesizer time={stopwatch.time}")
        return synthesizer
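
A hedged usage sketch for the plugin class above (assumes a Neon/Mycroft environment that provides the TTS base class and configuration imported in __init__; the file name is illustrative):

# Illustrative only: exercising the plugin outside its host framework.
tts_engine = MozillaLocalTTS(lang="en-us")
audio_path, phonemes = tts_engine.get_tts("Hello there.", "hello.wav")
print(audio_path)  # "hello.wav"; phonemes is None for this engine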