Example #1
def test_in_out(self):
    self._create_random_model()
    tts_root_path = get_tests_output_path()
    tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth.tar")
    tts_config = os.path.join(tts_root_path, "dummy_model_config.json")
    synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None)
    synthesizer.tts("Better this test works!!")
Example #2
def test_in_out(self):
    self._create_random_model()
    config = load_config(os.path.join(get_tests_input_path(), 'server_config.json'))
    tts_root_path = get_tests_output_path()
    config['tts_checkpoint'] = os.path.join(tts_root_path, config['tts_checkpoint'])
    config['tts_config'] = os.path.join(tts_root_path, config['tts_config'])
    synthesizer = Synthesizer(config['tts_checkpoint'], config['tts_config'], None, None)
    synthesizer.tts("Better this test works!!")
Example #3
def tts(text):
    # pass use_cuda by keyword: the third positional parameter of Synthesizer is vocoder_checkpoint
    synthesizer = Synthesizer(MODEL_PATH, CONFIG_PATH, use_cuda=use_cuda)
    wav = synthesizer.tts(text)
    # save the results
    file_name = text.replace(" ", "_")[0:20]
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = OUT_FILE
    print(" > Saving output to {}".format(out_path))
    synthesizer.save_wav(wav, out_path)
    playsound(out_path)
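The function above depends on module-level globals that the excerpt does not show. A minimal sketch of the assumed setup (all paths and values here are placeholders, not from the original source):

import string
from playsound import playsound
from TTS.utils.synthesizer import Synthesizer

MODEL_PATH = "path/to/model.pth"      # placeholder TTS checkpoint
CONFIG_PATH = "path/to/config.json"   # placeholder TTS config
OUT_FILE = "tts_output.wav"           # where the synthesized wav is written
use_cuda = False

tts("Hello there")  # synthesizes, saves OUT_FILE and plays it back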
Example #4
class MozillaTTS():
    def __init__(self):

        manager = ModelManager()
        model_path, config_path, model_item = manager.download_model(MODEL)

        vocoder_path, vocoder_config_path, _ = manager.download_model(
            model_item['default_vocoder'])

        # last argument is use_cuda
        self.synth = Synthesizer(model_path, config_path, vocoder_path,
                                 vocoder_config_path, False)

    def say(self, text):

        # generate wav
        wav = self.synth.tts(text)
        # output TODO would be nice to play this without having to do file I/O...
        out_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                'output.wav')
        self.synth.save_wav(
            wav,
            out_file,
        )

        # Open the sound file
        chunk = 1024
        wf = wave.open(out_file, 'rb')

        # play wav
        p = pyaudio.PyAudio()

        # Open a .Stream object to write the WAV file to
        # 'output = True' indicates that the sound will be played rather than recorded
        stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                        channels=wf.getnchannels(),
                        rate=wf.getframerate(),
                        output=True)

        # Play the sound by writing the audio data to the stream
        data = wf.readframes(chunk)
        while data:  # wf.readframes() returns b"" at EOF, so test truthiness, not ''
            stream.write(data)
            data = wf.readframes(chunk)

        # Close and terminate the stream
        stream.close()
        p.terminate()
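A minimal usage sketch for the class above; MODEL is a module-level constant the class assumes, and the value shown is only an illustrative model name:

MODEL = "tts_models/en/ljspeech/tacotron2-DDC"  # assumed constant, illustrative value
tts = MozillaTTS()        # downloads the model and its default vocoder
tts.say("Hello there")    # synthesizes to output.wav and plays it via PyAudio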
Example #5
def generate():
    if inputbox.get("1.0", "end-1c") == "":
        messagebox.showerror(
            message=
            "TTS will give a division by zero error if the text field is blank."
        )
    else:
        if not os.path.exists('mozilla-tts-output'):
            try:
                os.makedirs('mozilla-tts-output')
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
        generatebutton.config(state="disabled")
        exportbutton.config(state="disabled")
        model_path = None
        config_path = None
        vocoder_path = None
        vocoder_config_path = None
        path = Path(__file__).parent / "TTS/.models.json"
        manager = ModelManager(path)
        model_name = 'tts_models/' + ttsmodelbox.get()
        print(f'model_name is {model_name}')
        # for dev
        #model_path, config_path, model_item = manager.download_model(model_name)
        # for master
        model_path, config_path = manager.download_model(model_name)
        vocoder_name = 'vocoder_models/' + vocodermodelbox.get()
        print(f'vocoder_name is {vocoder_name}')
        # for dev
        #vocoder_path, vocoder_config_path, model_item = manager.download_model(vocoder_name)
        # for master
        vocoder_path, vocoder_config_path = manager.download_model(
            vocoder_name)
        synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                  vocoder_config_path,
                                  cudacheckbutton.instate(['selected']))
        wav = synthesizer.tts(inputbox.get("1.0", "end-1c"))
        synthesizer.save_wav(wav, "mozilla-tts-output/generated.wav")
        playsound("mozilla-tts-output/generated.wav")
        generatebutton.config(state="enabled")
        exportbutton.config(state="enabled")
        print("All done!")
Example #6
class Tts:

    MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DCA"
    VOCODER_NAME = "vocoder_models/en/ljspeech/multiband-melgan"

    def __init__(self):
        path = Path(synthesize.__file__).parent / "../.models.json"
        logger.info("path")
        logger.info("Creating ModelManager")
        self.manager = ModelManager(path)
        logger.info("Downloading model")
        model_path, config_path, _ = self.manager.download_model(
            self.MODEL_NAME)
        logger.info("Downloading vcoder")
        vocoder_path, vocoder_config_path, _ = self.manager.download_model(
            self.VOCODER_NAME)
        logger.info("Finished downloading TTS model & vcoder")
        self.synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                       vocoder_config_path, False)
        self.tts_lock = threading.Lock()

    def synthesize_speech(self, tts: str):
        """
        This is largely copied from the TTS library (TTS.utils.synthesizer.Synthesizer.save_wav), slightly
        modified to write into a NamedTemporaryFile instead of a file on disk

        :param tts: Text to speech
        :return: Speech in NamedTemporaryFile (wav)
        """
        if not self.tts_lock.acquire(blocking=True, timeout=0.1):
            raise TTSAlreadyProcessingException
        try:
            wav = self.synthesizer.tts(tts)
            wav = np.array(wav)
            wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
            temp_file = NamedTemporaryFile(suffix=".wav")
            scipy.io.wavfile.write(temp_file,
                                   self.synthesizer.output_sample_rate,
                                   wav_norm.astype(np.int16))
            return temp_file
        finally:
            self.tts_lock.release()
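A short usage sketch for the Tts class above; keep a reference to the returned NamedTemporaryFile while reading it, since the file is removed once the object is garbage collected:

tts = Tts()
try:
    wav_file = tts.synthesize_speech("Hello there")
    audio_bytes = open(wav_file.name, "rb").read()  # raw wav payload
except TTSAlreadyProcessingException:
    pass  # another request already holds the synthesis lock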
Example #7
def exportaudio():
    if inputbox.get("1.0", "end-1c") == "":
        messagebox.showerror(
            message=
            "TTS will give a division by zero error if the text field is blank."
        )
        return  # bail out early; without this, `f` below is never defined
    f = filedialog.asksaveasfile(mode='a',
                                 defaultextension=".wav",
                                 filetypes=[("Wave files", ".wav")])
    if f is None:  # asksaveasfile returns `None` if the dialog is closed with "cancel".
        return
    generatebutton.config(state="disabled")
    exportbutton.config(state="disabled")
    model_path = None
    config_path = None
    vocoder_path = None
    vocoder_config_path = None
    path = Path(__file__).parent / "TTS/.models.json"
    manager = ModelManager(path)
    model_name = 'tts_models/' + ttsmodelbox.get()
    print(f'model_name is {model_name}')
    # for dev
    #model_path, config_path, model_item = manager.download_model(model_name)
    # for master
    model_path, config_path = manager.download_model(model_name)
    vocoder_name = 'vocoder_models/' + vocodermodelbox.get()
    print(f'vocoder_name is {vocoder_name}')
    # for dev
    #vocoder_path, vocoder_config_path, model_item = manager.download_model(vocoder_name)
    # for master
    vocoder_path, vocoder_config_path = manager.download_model(vocoder_name)
    synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                              vocoder_config_path,
                              cudacheckbutton.instate(['selected']))
    wav = synthesizer.tts(inputbox.get("1.0", "end-1c"))
    synthesizer.save_wav(wav, str(f.name))
    generatebutton.config(state="enabled")
    exportbutton.config(state="enabled")
    print("All done!")
Example #8
def text_to_wav(text, lang):
    #class Synthesizer(object):
    #def __init__(self, tts_checkpoint, tts_config, vocoder_checkpoint=None, vocoder_config=None, use_cuda=False):
    tts_checkpoint = "/home/hector/.local/share/tts/tts_models--en--ljspeech--speedy-speech-wn/model_file.pth.tar"
    tts_config = "/home/hector/.local/share/tts/tts_models--en--ljspeech--speedy-speech-wn/config.json"
    #tts_checkpoint = "/home/hector/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar"
    #tts_config = "/home/hector/.local/share/tts/tts_models--en--ljspeech--glow-tts/config.json"
    if lang in ["Es"]:
        tts_checkpoint = "/home/hector/.local/share/tts/tts_models--es--mai--tacotron2-DDC/model_file.pth.tar"
        tts_config = "/home/hector/.local/share/tts/tts_models--es--mai--tacotron2-DDC/config.json"

    vocoder_checkpoint = "/home/hector/.local/share/tts/vocoder_models--universal--libri-tts--fullband-melgan/model_file.pth.tar"
    vocoder_config = "/home/hector/.local/share/tts/vocoder_models--universal--libri-tts--fullband-melgan/config.json"

    synthesizer = Synthesizer(tts_checkpoint, tts_config, vocoder_checkpoint,
                              vocoder_config)

    # kick it
    wav = synthesizer.tts(text)

    # save the results
    file_name = 'audio.wav'
    #print(" > Saving output to {}".format(file_name))
    synthesizer.save_wav(wav, file_name)
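For example, calling the function above with lang set to "Es" selects the Spanish Tacotron2 checkpoint, while any other value falls back to the English speedy-speech model; both calls write audio.wav into the working directory:

text_to_wav("Hello there", "En")
text_to_wav("Hola, buenos dias", "Es")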
Example #9
def main():
    description = """Synthesize speech on command line.

You can either use your trained model or choose a model from the provided list.

If you don't specify any models, then it uses the LJSpeech based English model.

## Example Runs

### Single Speaker Models

- List provided models:

    ```
    $ tts --list_models
    ```

- Query model info by idx:

    ```
    $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
    ```

- Query model info by full name:

    ```
    $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
    ```

- Run TTS with default models:

    ```
    $ tts --text "Text for TTS"
    ```

- Run a TTS model with its default vocoder model:

    ```
    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
    ```

- Run with specific TTS and vocoder models from the list:

    ```
    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
    ```

- Run your own TTS model (Using Griffin-Lim Vocoder):

    ```
    $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
    ```

- Run your own TTS and Vocoder models:
    ```
    $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
    ```

### Multi-speaker Models

- List the available speakers and choose a <speaker_id> among them:

    ```
    $ tts --model_name "<language>/<dataset>/<model_name>"  --list_speaker_idxs
    ```

- Run the multi-speaker TTS model with the target speaker ID:

    ```
    $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>"  --speaker_idx <speaker_id>
    ```

- Run your own multi-speaker TTS model:

    ```
    $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
    ```
    """
    # We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
    # documentation in sync more easily.
    parser = argparse.ArgumentParser(
        description=description.replace("    ```\n", ""),
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained TTS and vocoder models.",
    )

    parser.add_argument(
        "--model_info_by_idx",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<model_query_idx>",
    )

    parser.add_argument(
        "--model_info_by_name",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
    )

    parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help="Name of one of the pre-trained  vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--language_idx",
        type=str,
        help="Target language ID for a multi-lingual TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
    parser.add_argument(
        "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
    )
    parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    parser.add_argument(
        "--list_language_idxs",
        help="List available language ids for the defined multi-lingual model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    # aux args
    parser.add_argument(
        "--save_spectogram",
        type=bool,
        help="If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False,
    )
    parser.add_argument(
        "--reference_wav",
        type=str,
        help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
        default=None,
    )
    parser.add_argument(
        "--reference_speaker_idx",
        type=str,
        help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
        default=None,
    )
    args = parser.parse_args()

    # print the help message if no action argument is set
    check_args = [
        args.text,
        args.list_models,
        args.list_speaker_idxs,
        args.list_language_idxs,
        args.reference_wav,
        args.model_info_by_idx,
        args.model_info_by_name,
    ]
    if not any(check_args):
        parser.parse_args(["-h"])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    speakers_file_path = None
    language_ids_file_path = None
    vocoder_path = None
    vocoder_config_path = None
    encoder_path = None
    encoder_config_path = None

    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2: model info of pre-trained TTS models
    if args.model_info_by_idx:
        model_query = args.model_info_by_idx
        manager.model_info_by_idx(model_query)
        sys.exit()

    if args.model_info_by_name:
        model_query_full_name = args.model_info_by_name
        manager.model_info_by_full_name(model_query_full_name)
        sys.exit()

    # CASE3: load pre-trained model paths
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(args.model_name)
        args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

    # CASE4: set custom model paths
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path
        speakers_file_path = args.speakers_file_path
        language_ids_file_path = args.language_ids_file_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    if args.encoder_path is not None:
        encoder_path = args.encoder_path
        encoder_config_path = args.encoder_config_path

    # load models
    synthesizer = Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        language_ids_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        args.use_cuda,
    )

    # query speaker ids of a multi-speaker model.
    if args.list_speaker_idxs:
        print(
            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
        )
        print(synthesizer.tts_model.speaker_manager.ids)
        return

    # query language ids of a multi-lingual model.
    if args.list_language_idxs:
        print(
            " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
        )
        print(synthesizer.tts_model.language_manager.ids)
        return

    # check the arguments against a multi-speaker model.
    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
        print(
            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
        )
        return

    # RUN THE SYNTHESIS
    if args.text:
        print(" > Text: {}".format(args.text))

    # kick it
    wav = synthesizer.tts(
        args.text,
        args.speaker_idx,
        args.language_idx,
        args.speaker_wav,
        reference_wav=args.reference_wav,
        style_wav=args.capacitron_style_wav,
        style_text=args.capacitron_style_text,
        reference_speaker_name=args.reference_speaker_idx,
    )

    # save the results
    print(" > Saving output to {}".format(args.out_path))
    synthesizer.save_wav(wav, args.out_path)
Example #10
def main():
    # pylint: disable=bad-continuation
    parser = argparse.ArgumentParser(description='''Synthesize speech on command line.\n\n'''
        '''You can either use your trained model or choose a model from the provided list.\n\n'''
        '''If you don't specify any models, then it uses the LJSpeech based English model.\n\n'''
        '''
    Example runs:

    # list provided models
    ./TTS/bin/synthesize.py --list_models

    # run tts with default models.
    ./TTS/bin/synthesize.py --text "Text for TTS"

    # run a tts model with its default vocoder model.
     ./TTS/bin synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"

    # run with specific tts and vocoder models from the list
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path

    # run your own TTS model (Using Griffin-Lim Vocoder)
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav

    # run your own TTS and Vocoder models
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json

    ''',
        formatter_class=RawTextHelpFormatter)

    parser.add_argument(
        '--list_models',
        type=str2bool,
        nargs='?',
        const=True,
        default=False,
        help='list available pre-trained tts and vocoder models.')
    parser.add_argument('--text',
                        type=str,
                        default=None,
                        help='Text to generate speech.')

    # Args for running pre-trained TTS models.
    parser.add_argument(
        '--model_name',
        type=str,
        default="tts_models/en/ljspeech/speedy-speech-wn",
        help=
        'Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>'
    )
    parser.add_argument(
        '--vocoder_name',
        type=str,
        default=None,
        help=
        'Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>'
    )

    # Args for running custom models
    parser.add_argument('--config_path',
                        default=None,
                        type=str,
                        help='Path to model config file.')
    parser.add_argument(
        '--model_path',
        type=str,
        default=None,
        help='Path to model file.',
    )
    parser.add_argument(
        '--out_path',
        type=str,
        default=Path(__file__).resolve().parent,
        help=
        'Path to save final wav file. The wav file will be named after the given text.',
    )
    parser.add_argument('--use_cuda',
                        type=bool,
                        help='Run model on CUDA.',
                        default=False)
    parser.add_argument(
        '--vocoder_path',
        type=str,
        help=
        'Path to vocoder model file. If it is not defined, the model uses GL (Griffin-Lim) as the vocoder. Please make sure that you have installed the vocoder library (e.g. WaveRNN) beforehand.',
        default=None,
    )
    parser.add_argument('--vocoder_config_path',
                        type=str,
                        help='Path to vocoder model config file.',
                        default=None)

    # args for multi-speaker synthesis
    parser.add_argument('--speakers_json',
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default=None)
    parser.add_argument(
        '--speaker_idx',
        type=str,
        help=
        "if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.",
        default=None)
    parser.add_argument('--gst_style',
                        help="Wav path file for GST stylereference.",
                        default=None)

    # aux args
    parser.add_argument(
        '--save_spectogram',
        type=bool,
        help=
        "If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False)

    args = parser.parse_args()

    # print the help message if neither text nor list_models is set
    if args.text is None and not args.list_models:
        parser.parse_args(['-h'])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    vocoder_path = None
    vocoder_config_path = None

    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2: load pre-trained models
    if args.model_name is not None:
        model_path, config_path, model_item = manager.download_model(
            args.model_name)
        args.vocoder_name = model_item[
            'default_vocoder'] if args.vocoder_name is None else args.vocoder_name

    if args.vocoder_name is not None:
        vocoder_path, vocoder_config_path, _ = manager.download_model(
            args.vocoder_name)

    # CASE3: load custom models
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    # RUN THE SYNTHESIS
    # load models
    synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                              vocoder_config_path, args.use_cuda)

    print(" > Text: {}".format(args.text))

    # # handle multi-speaker setting
    # if not model_config.use_external_speaker_embedding_file and args.speaker_idx is not None:
    #     if args.speaker_idx.isdigit():
    #         args.speaker_idx = int(args.speaker_idx)
    #     else:
    #         args.speaker_idx = None
    # else:
    #     args.speaker_idx = None

    # if args.gst_style is None:
    #     if 'gst' in model_config.keys() and model_config.gst['gst_style_input'] is not None:
    #         gst_style = model_config.gst['gst_style_input']
    #     else:
    #         gst_style = None
    # else:
    #     # check if gst_style string is a dict, if is dict convert  else use string
    #     try:
    #         gst_style = json.loads(args.gst_style)
    #         if max(map(int, gst_style.keys())) >= model_config.gst['gst_style_tokens']:
    #             raise RuntimeError("The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}".format(max(map(int, gst_style.keys())), model_config.gst['gst_style_tokens']))
    #     except ValueError:
    #         gst_style = args.gst_style

    # kick it
    wav = synthesizer.tts(args.text)

    # save the results
    file_name = args.text.replace(" ", "_")[0:20]
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    print(" > Saving output to {}".format(out_path))
    synthesizer.save_wav(
        wav,
        out_path,
    )
Example #11
class DetectAndDeter:
    CLASSIFICATION_COUNT = 5
    TELEMARKETER_THRESH = 0.3
    VALID_CALLER_THRESH = 0.1
    IN_AUDIO_RATE = 8000
    DS_AUDIO_RATE = 16000
    MOZILLA_TTS_AUDIO_RATE = 22050
    QUIET_THRESH = 150
    QUIET_LENGTH = 3000

    def __init__(self, name):
        self.name = name  # user's name  e.g. "Bob Ross"
        self.valid_caller_event = Event()
        self.caller_audio_chunk = np.array([], dtype='int16')

        self.audio_in_queue = Queue()
        self.stt_to_classification_queue = Queue()
        self.stt_to_chatbot_queue = Queue()
        self.chatbot_to_tts_queue = Queue()
        self.audio_out_queue = Queue()

        self.manager = Manager()
        self.transcript = self.manager.list()
        self.is_telemarketer = self.manager.Value("is_telemarketer", None)

        self.deep_speech = None
        self.mozilla_tts = None

        self.final_transcript = None
        self.final_predictions = None

        self.speech_to_text_thread = Process(target=self.speech_to_text)
        self.classify_text_thread = Process(target=self.classify_text)
        self.generate_response_thread = Process(target=self.generate_responses)
        self.text_to_speech_thread = Process(target=self.text_to_speech)

        self.log = {
            "start": None,
            "end": None,
            "version": CONFIG['version'],
            "transcript": [],
            "is_telemarketer": None,
            "caller": None
        }

    @property
    def queues(self):
        return self.audio_in_queue, self.audio_out_queue

    def start(self):
        self.speech_to_text_thread.start()
        self.classify_text_thread.start()
        self.generate_response_thread.start()
        self.text_to_speech_thread.start()

        self.log["start"] = dt.datetime.now().isoformat()

    def close(self):
        self.log["transcript"] = [value for value in self.transcript]
        self.log["is_telemarketer"] = self.is_telemarketer.value
        self.log["end"] = dt.datetime.now().isoformat()

        self.speech_to_text_thread.terminate()
        self.speech_to_text_thread.join()
        self.speech_to_text_thread.close()

        self.classify_text_thread.terminate()
        self.classify_text_thread.join()
        self.classify_text_thread.close()

        self.generate_response_thread.terminate()
        self.generate_response_thread.join()
        self.generate_response_thread.close()

        self.text_to_speech_thread.terminate()
        self.text_to_speech_thread.join()
        self.text_to_speech_thread.close()

    def fill_log_info(self, caller_number):
        self.log['caller'] = caller_number
        return self.log

    def classify_text(self):
        predictions = []
        while self.is_telemarketer.value is None:
            idx = self.stt_to_classification_queue.get()
            text = self.transcript[idx]['text']

            preds = model.predict(text)
            transcript_line = self.transcript[idx]
            transcript_line['analysis'] = {
                "prediction": str(preds[0]).lower(),
                "confidence": float(max(preds[2]))
            }
            self.transcript[idx] = transcript_line
            predictions.append(str(preds[0]).lower())

            maybe_telemarketer = predictions.count("persuasion") / len(
                predictions)

            if len(predictions) > self.CLASSIFICATION_COUNT:
                print("CLASS")
                print(maybe_telemarketer, self.TELEMARKETER_THRESH,
                      self.VALID_CALLER_THRESH)
                if maybe_telemarketer > self.TELEMARKETER_THRESH:
                    self.is_telemarketer.value = True
                    break
                elif maybe_telemarketer < self.VALID_CALLER_THRESH:
                    self.is_telemarketer.value = False
                    # self.is_telemarketer.set()
                    break

        if not self.is_telemarketer.value:
            self.valid_caller_event.set()

    def generate_responses(self):
        while True:
            text = self.stt_to_chatbot_queue.get()
            print("Generate Response:", text)
            response = str(chatbot.get_response(text))

            self.chatbot_to_tts_queue.put(response)

    def text_to_speech(self):
        tts_config = CONFIG['tts_config']
        models_folder = Path(tts_config['folder'])

        model_path = str(models_folder / tts_config['model'])
        model_config_path = str(models_folder / tts_config['model_config'])
        vocoder_path = str(models_folder / tts_config['vocoder'])
        vocoder_config_path = str(models_folder / tts_config['vocoder_config'])

        self.mozilla_tts = Synthesizer(model_path, model_config_path,
                                       vocoder_path, vocoder_config_path)

        while True:
            response = self.chatbot_to_tts_queue.get()
            print("TTS:", response)

            sound_arr = np.array(self.mozilla_tts.tts(response))

            sound_arr *= 2**15
            sound_arr = sound_arr.astype('int16')

            sound = bytes(sound_arr)
            sound, _ = audioop.ratecv(sound, 2, 1, self.MOZILLA_TTS_AUDIO_RATE,
                                      self.IN_AUDIO_RATE, None)

            ulaw_sound = audioop.lin2ulaw(sound, 2)

            chunk_len = 540
            chunks = len(ulaw_sound) // chunk_len
            extra = len(ulaw_sound) - (chunks * chunk_len)

            for c in range(chunks):
                chunk = ulaw_sound[c * chunk_len:c * chunk_len + chunk_len]
                self.audio_out_queue.put(
                    base64.b64encode(chunk).decode('utf-8'))

            if extra != 0:
                chunk = ulaw_sound[-extra:]
                self.audio_out_queue.put(
                    base64.b64encode(chunk).decode('utf-8'))

            self.transcript.append({
                "speaker": "self",
                "text": response,
                "datetime": dt.datetime.now().isoformat()
            })

    def speech_to_text(self):
        stt_config = CONFIG['stt_config']
        models_folder = Path(stt_config['folder'])
        model_path = str(models_folder / stt_config['model'])
        scorer_path = str(models_folder / stt_config['scorer'])

        self.deep_speech = Model(model_path)
        self.deep_speech.enableExternalScorer(scorer_path)

        stream = self.deep_speech.createStream()

        while True:
            speech = self.audio_in_queue.get()

            while not self.audio_in_queue.empty():
                speech += self.audio_in_queue.get()

            lin_speech = audioop.ulaw2lin(speech, 2)
            ds_speech, _ = audioop.ratecv(lin_speech, 2, 1, self.IN_AUDIO_RATE,
                                          self.DS_AUDIO_RATE, None)

            lin_speech_arr = np.frombuffer(lin_speech, np.int16)
            ds_speech_arr = np.frombuffer(ds_speech, np.int16)

            stream.feedAudioContent(ds_speech_arr)

            self.caller_audio_chunk = np.concatenate(
                (self.caller_audio_chunk, lin_speech_arr))

            chunk_idx = max(0,
                            len(self.caller_audio_chunk) - self.QUIET_LENGTH)
            quiet_chunk = self.caller_audio_chunk[chunk_idx:]
            if (quiet_chunk < self.QUIET_THRESH).all() and (
                    self.caller_audio_chunk > self.QUIET_THRESH).any():
                text = stream.intermediateDecode()

                if text.strip():
                    self.stt_to_chatbot_queue.put(text)

                    # insert (rather than append) to avoid race conditions with indexes
                    idx = len(self.transcript)
                    self.transcript.insert(
                        idx, {
                            "speaker": "caller",
                            "text": text,
                            "datetime": dt.datetime.now().isoformat()
                        })
                    self.stt_to_classification_queue.put(idx)

                    stream.finishStream()
                    stream = self.deep_speech.createStream()

                self.caller_audio_chunk = np.array([], dtype='int16')

    def make_greeting(self, one_party_consent):
        self.chatbot_to_tts_queue.put(
            f"Hi. This is {self.name} how may I help you?")

        if not one_party_consent:
            self.chatbot_to_tts_queue.put("Keep in mind, I record all calls")
Example #12
def main():
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Synthesize speech on command line.\n\n"""
        """You can either use your trained model or choose a model from the provided list.\n\n"""
        """If you don't specify any models, then it uses LJSpeech based English model.\n\n"""
        """
    # Example Runs:

    ## Single Speaker Models

    - list provided models

    ```
    $ ./TTS/bin/synthesize.py --list_models
    ```

    - run tts with default models.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS"
    ```

    - run a tts model with its default vocoder model.

    ```
    $ ./TTS/bin synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>
    ```

    - run with specific tts and vocoder models from the list

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
    ```

    - run your own TTS model (Using Griffin-Lim Vocoder)

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
    ```

    - run your own TTS and Vocoder models
    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
    ```

    ## MULTI-SPEAKER MODELS

    - list the available speakers and choose a <speaker_id> among them.

    ```
    $ ./TTS/bin/synthesize.py --model_name "<language>/<dataset>/<model_name>"  --list_speaker_idxs
    ```

    - run the multi-speaker TTS model with the target speaker ID.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>"  --speaker_idx <speaker_id>
    ```

    - run your own multi-speaker TTS model.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
    ```
    """,
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained tts and vocoder models.",
    )
    parser.add_argument("--text",
                        type=str,
                        default=None,
                        help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help=
        "Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help=
        "Name of one of the pre-trained  vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path",
                        default=None,
                        type=str,
                        help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    parser.add_argument("--use_cuda",
                        type=bool,
                        help="Run model on CUDA.",
                        default=False)
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help=
        "Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path",
                        type=str,
                        help="Path to vocoder model config file.",
                        default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path",
                        type=str,
                        help="Path to speaker encoder config file.",
                        default=None)

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path",
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help=
        "wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The x_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style",
                        help="Wav path file for GST stylereference.",
                        default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    # aux args
    parser.add_argument(
        "--save_spectogram",
        type=bool,
        help=
        "If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False,
    )

    args = parser.parse_args()

    # print the help message if none of text, list_models, or list_speaker_idxs is set
    if args.text is None and not args.list_models and not args.list_speaker_idxs:
        parser.parse_args(["-h"])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    speakers_file_path = None
    vocoder_path = None
    vocoder_config_path = None
    encoder_path = None
    encoder_config_path = None

    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2: load pre-trained model paths
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(
            args.model_name)
        args.vocoder_name = model_item[
            "default_vocoder"] if args.vocoder_name is None else args.vocoder_name

    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(
            args.vocoder_name)

    # CASE3: set custom model paths
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path
        speakers_file_path = args.speakers_file_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    if args.encoder_path is not None:
        encoder_path = args.encoder_path
        encoder_config_path = args.encoder_config_path

    # load models
    synthesizer = Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        args.use_cuda,
    )

    # query speaker ids of a multi-speaker model.
    if args.list_speaker_idxs:
        print(
            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
        )
        print(synthesizer.speaker_manager.speaker_ids)
        return

    # check the arguments against a multi-speaker model.
    if synthesizer.tts_speakers_file and (not args.speaker_idx
                                          and not args.speaker_wav):
        print(
            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
        )
        return

    # RUN THE SYNTHESIS
    print(" > Text: {}".format(args.text))

    # kick it
    wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav)

    # save the results
    print(" > Saving output to {}".format(args.out_path))
    synthesizer.save_wav(wav, args.out_path)
Example #13
class TextToSpeechListenerTask(Listener, Task):

    queue: Queue = Queue(maxsize=20)

    model_name = "tts_models/en/ljspeech/tacotron2-DCA"

    vocoder_name = "vocoder_models/universal/libri-tts/fullband-melgan"

    use_cuda = False

    synthesizer: Synthesizer

    tts_sample_rate = 22050

    output_sample_rate = 16000

    # TODO move to local folder instead of system folder
    def __init__(self, auto_start: bool = True):
        super().__init__(auto_start)

        path = Path(__file__).parent / "../../.venv/lib/python3.8/site-packages/TTS/.models.json"
        manager = ModelManager(path)

        model_path, config_path = manager.download_model(self.model_name)
        vocoder_path, vocoder_config_path = manager.download_model(
            self.vocoder_name)

        self.synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                       vocoder_config_path, self.use_cuda)

    def run(self):
        while self.running:
            try:
                text = self.queue.get(timeout=2)
                if isinstance(text, Text):
                    self.speak(text)
                self.queue.task_done()
            except EmptyQueueError:
                continue

    def speak(self, text: Text) -> None:
        if not text.value:
            logger.warning("Cannot synthesize empty text")
            return  # nothing to synthesize
        data = self.synthesize(text.value)
        publish_message(self, AudioFrame(data), text.src)

    def synthesize(self, text: str) -> bytes:
        """ TTS outputs a sample rate of 22050, so we must desample it to be able to consume it again """
        audio_data = np.array(self.synthesizer.tts(text))
        audio_data_normalized = audio_data * (
            32767 / max(0.01, np.max(np.abs(audio_data))))
        audio_data_bytes = audio_data_normalized.astype(np.int16).tobytes()

        audio_data_bytes = self.desample_audio_data(audio_data_bytes)

        self.store_audio_data(audio_data_bytes)

        return audio_data_bytes

    def desample_audio_data(self, data: bytes) -> bytes:
        converted = audioop.ratecv(data, 2, 1, self.tts_sample_rate,
                                   self.output_sample_rate, None)
        return converted[0]

    def store_audio_data(
        self,
        data: bytes,
        path: str = "/home/jonas/Projects/robot/mind/assets-old/output/tts/output.raw"
    ) -> None:
        with open(path, "wb") as f:
            f.write(data)
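For reference, a standalone sketch of the resampling step performed by desample_audio_data above: audioop.ratecv converts 16-bit mono PCM from the synthesizer's 22050 Hz down to 16000 Hz and returns a (fragment, state) tuple, where the state only matters for chunked streaming:

import audioop

def downsample(pcm_22050: bytes) -> bytes:
    # width=2 (16-bit samples), nchannels=1 (mono), state=None for a one-shot conversion
    converted, _state = audioop.ratecv(pcm_22050, 2, 1, 22050, 16000, None)
    return converted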