def test_in_out(self):
    """Load the dummy checkpoint into a ``Synthesizer`` and synthesize one sentence.

    ``self._create_random_model()`` is called first; the checkpoint and config
    are then looked up in the directory returned by ``get_tests_output_path()``.
    The test passes as long as nothing raises.
    """
    self._create_random_model()
    output_dir = get_tests_output_path()
    checkpoint_file = os.path.join(output_dir, "checkpoint_10.pth.tar")
    config_file = os.path.join(output_dir, "dummy_model_config.json")
    # No vocoder checkpoint/config is supplied (both None).
    Synthesizer(checkpoint_file, config_file, None, None).tts(
        "Better this test works!!")
def test_in_out(self):
    """Synthesize one sentence using the paths listed in ``server_config.json``.

    The config is read from the tests input directory; its ``tts_checkpoint``
    and ``tts_config`` entries are rewritten in place so that they point into
    the tests output directory. The test passes as long as nothing raises.
    """
    self._create_random_model()
    config = load_config(
        os.path.join(get_tests_input_path(), 'server_config.json'))
    output_dir = get_tests_output_path()
    # Prefix both entries with the output directory.
    for key in ('tts_checkpoint', 'tts_config'):
        config[key] = os.path.join(output_dir, config[key])
    engine = Synthesizer(config['tts_checkpoint'], config['tts_config'],
                         None, None)
    engine.tts("Better this test works!!")
def __init__(self):
    """Download ``MODEL`` plus its default vocoder and build ``self.synth``."""
    downloader = ModelManager()
    tts_path, tts_config_path, tts_item = downloader.download_model(MODEL)
    # The vocoder is the one the model's metadata names as its default.
    default_vocoder = tts_item['default_vocoder']
    voc_path, voc_config_path, _ = downloader.download_model(default_vocoder)
    # Per the original author's note, the last positional argument is the
    # use_cuda switch; it is hard-coded to False here.
    self.synth = Synthesizer(tts_path, tts_config_path, voc_path,
                             voc_config_path, False)
def tts(text):
    """Synthesize *text*, save the audio to ``OUT_FILE`` and play it back.

    Args:
        text: Sentence(s) handed to ``Synthesizer.tts``.

    The model is loaded from the module-level ``MODEL_PATH`` / ``CONFIG_PATH``
    and the module-level ``use_cuda`` flag.
    """
    # use_cuda is passed by keyword: as a third positional argument it would
    # land in the vocoder slot of Synthesizer signatures that accept vocoder
    # arguments before use_cuda.
    # NOTE(review): assumes the parameter is named `use_cuda` - confirm
    # against the installed TTS version.
    synthesizer = Synthesizer(MODEL_PATH, CONFIG_PATH, use_cuda=use_cuda)
    wav = synthesizer.tts(text)
    # The output always goes to OUT_FILE; the text-derived file name that used
    # to be computed here was never used and has been removed.
    out_path = OUT_FILE
    print(" > Saving output to {}".format(out_path))
    synthesizer.save_wav(wav, out_path)
    playsound(out_path)
def __init__(self, auto_start: bool = True):
    """Initialise the base class, then download the models and build the synthesizer.

    Args:
        auto_start: Forwarded unchanged to the parent ``__init__``.

    The model list is read from a ``.models.json`` inside a Python 3.8 virtual
    environment located relative to this file. ``download_model`` is expected
    to return exactly two values here (model path, config path).
    """
    super().__init__(auto_start)
    models_json = Path(__file__).parent.joinpath(
        "../../.venv/lib/python3.8/site-packages/TTS/.models.json")
    downloader = ModelManager(models_json)
    tts_path, tts_config_path = downloader.download_model(self.model_name)
    voc_path, voc_config_path = downloader.download_model(self.vocoder_name)
    self.synthesizer = Synthesizer(tts_path, tts_config_path, voc_path,
                                   voc_config_path, self.use_cuda)
def __init__(self):
    """Download the configured TTS model and vocoder and create the synthesizer.

    Sets ``self.manager``, ``self.synthesizer`` (built with the final
    positional argument False) and ``self.tts_lock``.
    """
    # .models.json is looked up one directory above the `synthesize` module.
    models_json = Path(synthesize.__file__).parent.joinpath("../.models.json")
    logger.info("path")
    logger.info("Creating ModelManager")
    self.manager = ModelManager(models_json)

    logger.info("Downloading model")
    tts_path, tts_config_path, _ = self.manager.download_model(self.MODEL_NAME)

    logger.info("Downloading vcoder")
    voc_path, voc_config_path, _ = self.manager.download_model(
        self.VOCODER_NAME)

    logger.info("Finished downloading TTS model & vcoder")
    self.synthesizer = Synthesizer(tts_path, tts_config_path, voc_path,
                                   voc_config_path, False)
    self.tts_lock = threading.Lock()
class MozillaTTS():
    """Speak text aloud: synthesize with ``Synthesizer``, then play it via PyAudio."""

    def __init__(self):
        """Download ``MODEL`` and its default vocoder and build ``self.synth``."""
        manager = ModelManager()
        model_path, config_path, model_item = manager.download_model(MODEL)
        vocoder_path, vocoder_config_path, _ = manager.download_model(
            model_item['default_vocoder'])
        # Per the original author's note the last argument is use_cuda.
        self.synth = Synthesizer(model_path, config_path, vocoder_path,
                                 vocoder_config_path, False)

    def say(self, text):
        """Synthesize *text*, write it to ``output.wav`` next to this file and play it.

        Args:
            text: Text to speak.
        """
        # generate wav
        wav = self.synth.tts(text)
        # TODO: would be nice to play this without having to do file I/O...
        out_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                'output.wav')
        self.synth.save_wav(wav, out_file)

        chunk = 1024  # frames read from the file per write to the stream
        with wave.open(out_file, 'rb') as wf:
            p = pyaudio.PyAudio()
            try:
                # 'output=True' means the stream plays sound rather than
                # recording it.
                stream = p.open(
                    format=p.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True)
                try:
                    # readframes() returns bytes, so end-of-file is an empty
                    # bytes object. Comparing it with the str '' is always
                    # unequal on Python 3 and made this loop spin forever.
                    data = wf.readframes(chunk)
                    while data:
                        stream.write(data)
                        data = wf.readframes(chunk)
                finally:
                    stream.close()
            finally:
                # Release PortAudio even if opening or writing the stream fails.
                p.terminate()
def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA',
        vocoder_name=None,
        use_cuda=False):
    """PyTorch Hub entry point returning a ``Synthesizer`` for the given models.

    Example:
        >>> synthesizer = torch.hub.load('coqui-ai/TTS', 'tts', source='github')
        >>> wavs = synthesizer.tts("This is a test! This is also a test!!")
        wavs - is a list of values of the synthesized speech.

    Args:
        model_name (str, optional): Name of the TTS model to download.
            Defaults to 'tts_models/en/ljspeech/tacotron2-DCA'.
        vocoder_name (str, optional): Name of the vocoder model to download.
            When None, the ``default_vocoder`` entry of the downloaded TTS
            model's metadata is used.
        use_cuda (bool, optional): Forwarded to ``Synthesizer``. Defaults to
            False.

    Returns:
        Synthesizer: object built from the downloaded TTS and vocoder files.
    """
    manager = ModelManager()
    tts_path, tts_config_path, tts_item = manager.download_model(model_name)

    # Fall back to the vocoder recommended by the model's own metadata.
    if vocoder_name is None:
        vocoder_name = tts_item['default_vocoder']
    voc_path, voc_config_path, _ = manager.download_model(vocoder_name)

    return Synthesizer(tts_checkpoint=tts_path,
                       tts_config_path=tts_config_path,
                       vocoder_checkpoint=voc_path,
                       vocoder_config=voc_config_path,
                       use_cuda=use_cuda)
def text_to_speech(self):
    """Worker loop: turn chatbot responses into base64 mu-law audio chunks.

    Loads the synthesizer from the paths in ``CONFIG['tts_config']``, then
    loops forever. For each response taken from ``self.chatbot_to_tts_queue``
    it synthesizes audio, converts it to 16-bit PCM, resamples it from
    ``MOZILLA_TTS_AUDIO_RATE`` to ``IN_AUDIO_RATE``, mu-law encodes it and
    puts it on ``self.audio_out_queue`` in pieces of at most 540 bytes, each
    base64-encoded. The response is also appended to ``self.transcript``.
    This method never returns.
    """
    tts_config = CONFIG['tts_config']
    models_folder = Path(tts_config['folder'])
    model_path = str(models_folder / tts_config['model'])
    model_config_path = str(models_folder / tts_config['model_config'])
    vocoder_path = str(models_folder / tts_config['vocoder'])
    vocoder_config_path = str(models_folder / tts_config['vocoder_config'])
    self.mozilla_tts = Synthesizer(model_path, model_config_path,
                                   vocoder_path, vocoder_config_path)

    chunk_len = 540  # bytes of mu-law audio per outgoing message

    while True:
        response = self.chatbot_to_tts_queue.get()
        print("TTS:", response)

        sound_arr = np.array(self.mozilla_tts.tts(response))
        sound_arr *= 2**15
        # Clip before the cast: a sample at +1.0 scales to 32768, which is
        # outside the int16 range and wrapped around to -32768 before.
        sound_arr = np.clip(sound_arr, -2**15, 2**15 - 1).astype('int16')
        sound = sound_arr.tobytes()

        # 2 bytes per sample, 1 channel; no resampler state is carried over
        # between responses.
        sound, _ = audioop.ratecv(sound, 2, 1, self.MOZILLA_TTS_AUDIO_RATE,
                                  self.IN_AUDIO_RATE, None)
        ulaw_sound = audioop.lin2ulaw(sound, 2)

        # Full chunks first; a shorter final slice carries any remainder.
        for start in range(0, len(ulaw_sound), chunk_len):
            chunk = ulaw_sound[start:start + chunk_len]
            self.audio_out_queue.put(base64.b64encode(chunk).decode('utf-8'))

        self.transcript.append({
            "speaker": "self",
            "text": response,
            "datetime": dt.datetime.now().isoformat()
        })
def generate():
    """Synthesize the text box contents to ``mozilla-tts-output/generated.wav`` and play it.

    Shows an error dialog and does nothing else when the text box is empty.
    Both buttons are disabled while the work runs and are re-enabled
    afterwards, including when synthesis fails.
    """
    if inputbox.get("1.0", "end-1c") == "":
        messagebox.showerror(
            message=
            "TTS will give a division by zero error if the text field is blank."
        )
        return

    os.makedirs('mozilla-tts-output', exist_ok=True)

    generatebutton.config(state="disabled")
    exportbutton.config(state="disabled")
    try:
        path = Path(__file__).parent / "TTS/.models.json"
        manager = ModelManager(path)

        model_name = 'tts_models/' + ttsmodelbox.get()
        print(f'model_name is {model_name}')
        # On the dev branch download_model returns a third value
        # (model_item); on master it returns two, as unpacked here.
        model_path, config_path = manager.download_model(model_name)

        vocoder_name = 'vocoder_models/' + vocodermodelbox.get()
        print(f'vocoder_name is {vocoder_name}')
        vocoder_path, vocoder_config_path = manager.download_model(
            vocoder_name)

        synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                  vocoder_config_path,
                                  cudacheckbutton.instate(['selected']))
        wav = synthesizer.tts(inputbox.get("1.0", "end-1c"))
        synthesizer.save_wav(wav, "mozilla-tts-output/generated.wav")
        playsound("mozilla-tts-output/generated.wav")
    finally:
        # Re-enable in `finally` so a failed download or synthesis does not
        # leave the buttons disabled.
        generatebutton.config(state="enabled")
        exportbutton.config(state="enabled")
    print("All done!")
def exportaudio():
    """Synthesize the text box contents and save them to a user-chosen wav file.

    Shows an error dialog when the text box is empty, and returns silently
    when the save dialog is cancelled. Both buttons are disabled while the
    work runs and are re-enabled afterwards, including when synthesis fails.
    """
    if inputbox.get("1.0", "end-1c") == "":
        messagebox.showerror(
            message=
            "TTS will give a division by zero error if the text field is blank."
        )
        return

    f = filedialog.asksaveasfile(mode='a',
                                 defaultextension=".wav",
                                 filetypes=[("Wave files", ".wav")])
    if f is None:  # asksaveasfile returns None if the dialog was cancelled.
        return
    # Only the chosen path is needed. Close the handle the dialog opened so
    # it is not leaked and is not open while save_wav writes to the same path.
    export_path = str(f.name)
    f.close()

    generatebutton.config(state="disabled")
    exportbutton.config(state="disabled")
    try:
        path = Path(__file__).parent / "TTS/.models.json"
        manager = ModelManager(path)

        model_name = 'tts_models/' + ttsmodelbox.get()
        print(f'model_name is {model_name}')
        # On the dev branch download_model returns a third value
        # (model_item); on master it returns two, as unpacked here.
        model_path, config_path = manager.download_model(model_name)

        vocoder_name = 'vocoder_models/' + vocodermodelbox.get()
        print(f'vocoder_name is {vocoder_name}')
        vocoder_path, vocoder_config_path = manager.download_model(
            vocoder_name)

        synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                  vocoder_config_path,
                                  cudacheckbutton.instate(['selected']))
        wav = synthesizer.tts(inputbox.get("1.0", "end-1c"))
        synthesizer.save_wav(wav, export_path)
    finally:
        # Re-enable in `finally` so a failure does not leave the buttons
        # disabled.
        generatebutton.config(state="enabled")
        exportbutton.config(state="enabled")
    print("All done!")
def text_to_wav(text, lang):
    """Synthesize *text* and write it to ``audio.wav`` in the working directory.

    Args:
        text: Text to synthesize.
        lang: Language tag; "Es" selects the Spanish tacotron2-DDC model,
            anything else the English speedy-speech-wn model.

    All model files are read from hard-coded paths under
    ``/home/hector/.local/share/tts``. The same fullband-melgan vocoder is
    used for both languages.
    """
    # Synthesizer signature noted by the original author:
    #   __init__(self, tts_checkpoint, tts_config, vocoder_checkpoint=None,
    #            vocoder_config=None, use_cuda=False)
    if lang in ["Es"]:
        tts_checkpoint = "/home/hector/.local/share/tts/tts_models--es--mai--tacotron2-DDC/model_file.pth.tar"
        tts_config = "/home/hector/.local/share/tts/tts_models--es--mai--tacotron2-DDC/config.json"
    else:
        # An English glow-tts model (tts_models--en--ljspeech--glow-tts) was
        # previously tried here as an alternative.
        tts_checkpoint = "/home/hector/.local/share/tts/tts_models--en--ljspeech--speedy-speech-wn/model_file.pth.tar"
        tts_config = "/home/hector/.local/share/tts/tts_models--en--ljspeech--speedy-speech-wn/config.json"

    vocoder_checkpoint = "/home/hector/.local/share/tts/vocoder_models--universal--libri-tts--fullband-melgan/model_file.pth.tar"
    vocoder_config = "/home/hector/.local/share/tts/vocoder_models--universal--libri-tts--fullband-melgan/config.json"

    engine = Synthesizer(tts_checkpoint, tts_config, vocoder_checkpoint,
                         vocoder_config)
    audio = engine.tts(text)
    engine.save_wav(audio, 'audio.wav')
class Tts:
    """Text-to-speech wrapper that renders speech into a temporary wav file."""

    MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DCA"
    VOCODER_NAME = "vocoder_models/en/ljspeech/multiband-melgan"

    def __init__(self):
        """Download ``MODEL_NAME`` / ``VOCODER_NAME`` and create the synthesizer and its lock."""
        # .models.json is looked up one directory above the `synthesize` module.
        path = Path(synthesize.__file__).parent / "../.models.json"
        logger.info("path")
        logger.info("Creating ModelManager")
        self.manager = ModelManager(path)
        logger.info("Downloading model")
        model_path, config_path, _ = self.manager.download_model(
            self.MODEL_NAME)
        logger.info("Downloading vcoder")
        vocoder_path, vocoder_config_path, _ = self.manager.download_model(
            self.VOCODER_NAME)
        logger.info("Finished downloading TTS model & vcoder")
        self.synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                       vocoder_config_path, False)
        self.tts_lock = threading.Lock()

    def synthesize_speech(self, tts: str):
        """Synthesize *tts* and return the audio as an open ``NamedTemporaryFile``.

        This is largely copied from the TTS library
        (TTS.utils.synthesizer.Synthesizer.save_wav), modified to write into a
        NamedTemporaryFile instead of a path.

        :param tts: Text to speak
        :return: Open NamedTemporaryFile (suffix ``.wav``) holding 16-bit PCM.
            The caller owns it and is responsible for closing it.
        :raises TTSAlreadyProcessingException: if the lock cannot be acquired
            within 0.1 seconds.
        """
        if not self.tts_lock.acquire(blocking=True, timeout=0.1):
            raise TTSAlreadyProcessingException
        try:
            wav = np.array(self.synthesizer.tts(tts))
            # Scale so the loudest sample maps to 32767. The 0.01 floor avoids
            # dividing by (almost) zero for near-silent audio.
            wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
            temp_file = NamedTemporaryFile(suffix=".wav")
            try:
                scipy.io.wavfile.write(temp_file,
                                       self.synthesizer.output_sample_rate,
                                       wav_norm.astype(np.int16))
            except Exception:
                # Do not leak the temporary file (and its handle) when the
                # write fails; nobody else holds a reference to it yet.
                temp_file.close()
                raise
            return temp_file
        finally:
            self.tts_lock.release()
def _get_synthesizer(self, language) -> Synthesizer:
    """Choose a model for *language*, download it and its vocoder, and build a ``Synthesizer``.

    Only the part of *language* before the first '-' is used. The models in
    ``self.models`` are scanned in order: every model whose language segment
    contains that prefix becomes the current choice, and scanning stops early
    once a matching model is named ``self.preferred_model``. The vocoder is
    the model's ``default_vocoder``, or the universal fullband-melgan when
    the metadata has none. The elapsed time is logged at debug level.
    """
    # "xx-yy" -> "xx"; a tag without '-' is left as it is.
    language = language.split('-')[0]

    timer = Stopwatch()
    with timer:
        chosen_model = None
        for candidate in self.models:
            _, lang, dataset, name = candidate.split('/')
            print(f"{lang}|{name}")
            if language not in lang:
                continue
            chosen_model = candidate
            if name == self.preferred_model:
                break

        model_path, config_path, model_item = self.manager.download_model(
            chosen_model)
        vocoder_name = model_item.get(
            "default_vocoder",
            "vocoder_models/universal/libri-tts/fullband-melgan")
        vocoder_path, vocoder_config_path, _ = self.manager.download_model(
            vocoder_name)

        # Positional order: model, config, speakers file, vocoder,
        # vocoder config, encoder, encoder config, use_cuda.
        synthesizer = Synthesizer(
            model_path,
            config_path,
            '',
            vocoder_path,
            vocoder_config_path,
            '',
            '',
            False,
        )

    LOG.debug(f"Get synthesizer time={timer.time}")
    return synthesizer
def test_split_into_sentences(self):
    """Check demo server sentences split as expected"""
    print("\n > Testing demo server sentence splitting")
    # pylint: disable=attribute-defined-outside-init
    self.seg = Synthesizer.get_segmenter("en")
    sis = Synthesizer.split_into_sentences

    # (input text, expected sentences), checked in this order.
    # NOTE(review): '[email protected]' looks like a scrubbed e-mail address;
    # it is reproduced as found - confirm against the upstream test.
    cases = [
        ('Hello. Two sentences',
         ['Hello.', 'Two sentences']),
        ('He went to meet the adviser from Scott, Waltman & Co. next morning.',
         ['He went to meet the adviser from Scott, Waltman & Co. next morning.']),
        ('Let\'s run it past Sarah and co. They\'ll want to see this.',
         ['Let\'s run it past Sarah and co.', 'They\'ll want to see this.']),
        ('Where is Bobby Jr.\'s rabbit?',
         ['Where is Bobby Jr.\'s rabbit?']),
        ('Please inform the U.K. authorities right away.',
         ['Please inform the U.K. authorities right away.']),
        ('Were David and co. at the event?',
         ['Were David and co. at the event?']),
        ('paging dr. green, please come to theatre four immediately.',
         ['paging dr. green, please come to theatre four immediately.']),
        ('The email format is [email protected]. I think you reversed them.',
         ['The email format is [email protected].', 'I think you reversed them.']),
        ('The demo site is: https://top100.example.com/subsection/latestnews.html. Please send us your feedback.',
         ['The demo site is: https://top100.example.com/subsection/latestnews.html.',
          'Please send us your feedback.']),
        # with the final lowercase "she" we see it's all one sentence
        ('Scowling at him, \'You are not done yet!\' she yelled.',
         ['Scowling at him, \'You are not done yet!\' she yelled.']),
        ('Hey!! So good to see you.',
         ['Hey!!', 'So good to see you.']),
        ('He went to Yahoo! but I don\'t know the division.',
         ['He went to Yahoo! but I don\'t know the division.']),
        ('If you can\'t remember a quote, “at least make up a memorable one that\'s plausible..."',
         ['If you can\'t remember a quote, “at least make up a memorable one that\'s plausible..."']),
        ('The address is not google.com.',
         ['The address is not google.com.']),
        ('1.) The first item 2.) The second item',
         ['1.) The first item', '2.) The second item']),
        ('1) The first item 2) The second item',
         ['1) The first item', '2) The second item']),
        ('a. The first item b. The second item c. The third list item',
         ['a. The first item', 'b. The second item', 'c. The third list item']),
    ]
    for text, expected in cases:
        assert sis(self, text) == expected
def make_synthesizer(model_name, use_cuda):
    """Download *model_name* and its default vocoder and return a ``Synthesizer``.

    Args:
        model_name: Model name passed to ``ModelManager.download_model``.
        use_cuda: Forwarded as the last positional ``Synthesizer`` argument.

    Returns:
        Synthesizer built without a speakers file or speaker encoder (those
        three arguments are passed as None).
    """
    # The model list ships inside the installed TTS package.
    models_json = Path(TTS.__file__).parent / ".models.json"
    downloader = ModelManager(models_json)

    tts_path, tts_config_path, tts_item = downloader.download_model(model_name)
    voc_path, voc_config_path, _ = downloader.download_model(
        tts_item["default_vocoder"])

    # Positional order: model, config, speakers file, vocoder,
    # vocoder config, encoder, encoder config, use_cuda.
    return Synthesizer(
        tts_path,
        tts_config_path,
        None,
        voc_path,
        voc_config_path,
        None,
        None,
        use_cuda,
    )
# Paths given explicitly on the command line replace the current values.
if args.model_path is not None:
    model_path, config_path, speakers_file_path = (
        args.model_path,
        args.config_path,
        args.speakers_file_path,
    )

if args.vocoder_path is not None:
    vocoder_path, vocoder_config_path = (
        args.vocoder_path,
        args.vocoder_config_path,
    )

# Load the models. No languages file is given, and the speaker encoder
# arguments are passed as empty strings.
synthesizer = Synthesizer(
    tts_checkpoint=model_path,
    tts_config_path=config_path,
    tts_speakers_file=speakers_file_path,
    tts_languages_file=None,
    vocoder_checkpoint=vocoder_path,
    vocoder_config=vocoder_config_path,
    encoder_checkpoint="",
    encoder_config="",
    use_cuda=args.use_cuda,
)

# Multi-speaker only if the model exposes `num_speakers` and either has more
# than one speaker or a speakers file was loaded.
use_multi_speaker = False
if hasattr(synthesizer.tts_model, "num_speakers"):
    use_multi_speaker = (synthesizer.tts_model.num_speakers > 1
                         or synthesizer.tts_speakers_file is not None)

speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
# TODO: set this from SpeakerManager
use_gst = synthesizer.tts_config.get("use_gst", False)
app = Flask(__name__)
def main():
    # pylint: disable=bad-continuation
    """Command-line entry point: synthesize ``--text`` and save it as a wav file.

    Model selection, in order:
      1. ``--list_models`` prints the available pre-trained models and exits.
      2. Unless ``--model_path`` is given, the pre-trained ``--model_name`` is
         downloaded; its default vocoder is used when ``--vocoder_name`` is
         not set.
      3. ``--model_path`` / ``--vocoder_path`` (with their config paths)
         select custom model files instead.

    The wav file is written into ``--out_path``. Its name is the first 20
    characters of the text, with spaces turned into underscores and other
    punctuation removed.
    """
    parser = argparse.ArgumentParser(
        description='''Synthesize speech on command line.\n\n'''
        '''You can either use your trained model or choose a model from the provided list.\n\n'''
        '''If you don't specify any models, then it uses LJSpeech based English models\n\n'''
        '''
    Example runs:

    # list provided models
    ./TTS/bin/synthesize.py --list_models

    # run tts with default models.
    ./TTS/bin/synthesize.py --text "Text for TTS"

    # run a tts model with its default vocoder model.
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"

    # run with specific tts and vocoder models from the list
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --out_path output/path

    # run your own TTS model (Using Griffin-Lim Vocoder)
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav

    # run your own TTS and Vocoder models
    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json

    ''',
        formatter_class=RawTextHelpFormatter)

    parser.add_argument(
        '--list_models',
        type=str2bool,
        nargs='?',
        const=True,
        default=False,
        help='list available pre-trained tts and vocoder models.')
    parser.add_argument('--text',
                        type=str,
                        default=None,
                        help='Text to generate speech.')

    # Args for running pre-trained TTS models.
    parser.add_argument(
        '--model_name',
        type=str,
        default="tts_models/en/ljspeech/speedy-speech-wn",
        help=
        'Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>'
    )
    parser.add_argument(
        '--vocoder_name',
        type=str,
        default=None,
        help=
        'Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>'
    )

    # Args for running custom models
    parser.add_argument('--config_path',
                        default=None,
                        type=str,
                        help='Path to model config file.')
    parser.add_argument(
        '--model_path',
        type=str,
        default=None,
        help='Path to model file.',
    )
    parser.add_argument(
        '--out_path',
        type=str,
        default=Path(__file__).resolve().parent,
        help=
        'Path to save final wav file. Wav file will be named as the given text.',
    )
    # type=bool would turn every non-empty string (including "False") into
    # True, so use str2bool exactly like --list_models does.
    parser.add_argument('--use_cuda',
                        type=str2bool,
                        nargs='?',
                        const=True,
                        help='Run model on CUDA.',
                        default=False)
    parser.add_argument(
        '--vocoder_path',
        type=str,
        help=
        'Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).',
        default=None,
    )
    parser.add_argument('--vocoder_config_path',
                        type=str,
                        help='Path to vocoder model config file.',
                        default=None)

    # args for multi-speaker synthesis
    parser.add_argument('--speakers_json',
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default=None)
    parser.add_argument(
        '--speaker_idx',
        type=str,
        help=
        "if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.",
        default=None)
    parser.add_argument('--gst_style',
                        help="Wav path file for GST stylereference.",
                        default=None)

    # aux args
    parser.add_argument(
        '--save_spectogram',
        type=str2bool,  # same type=bool pitfall as --use_cuda
        nargs='?',
        const=True,
        help=
        "If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False)

    args = parser.parse_args()

    # print the description if neither text nor list_models is set
    if args.text is None and not args.list_models:
        parser.parse_args(['-h'])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    vocoder_path = None
    vocoder_config_path = None

    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2: load pre-trained models.
    # Skipped when custom paths are given. --model_name always has a default,
    # so without this guard the default model was downloaded, and its vocoder
    # applied, even for a user-supplied --model_path.
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(
            args.model_name)
        if args.vocoder_name is None:
            args.vocoder_name = model_item['default_vocoder']

    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(
            args.vocoder_name)

    # CASE3: load custom models
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    # RUN THE SYNTHESIS
    # load models
    synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                              vocoder_config_path, args.use_cuda)

    print(" > Text: {}".format(args.text))

    # NOTE: --speakers_json, --speaker_idx, --gst_style and --save_spectogram
    # are parsed above but are not forwarded to the synthesizer here.

    # kick it
    wav = synthesizer.tts(args.text)

    # save the results: file name is derived from the first 20 characters of
    # the text, keeping underscores and dropping all other punctuation.
    file_name = args.text.replace(" ", "_")[0:20]
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    print(" > Saving output to {}".format(out_path))
    synthesizer.save_wav(
        wav,
        out_path,
    )
def initsynthesizer(model_name, vocoder_name, use_cuda):
    """Download the named TTS model and vocoder and return a ``Synthesizer``.

    Args:
        model_name: TTS model name passed to ``manager.download_model``.
        vocoder_name: Vocoder model name passed to ``manager.download_model``.
        use_cuda: Forwarded as the last positional ``Synthesizer`` argument.

    Uses the module-level ``manager``. ``download_model`` is expected to
    return exactly two values here (model path, config path).
    """
    model_path, config_path = manager.download_model(model_name)
    # The vocoder is fetched with download_model() as well; the previous
    # manager.download() call did not match the method used for the TTS model.
    vocoder_path, vocoder_config_path = manager.download_model(vocoder_name)
    return Synthesizer(model_path, config_path, vocoder_path,
                       vocoder_config_path, use_cuda)
def main():
    # pylint: disable=bad-option-value
    """Command-line entry point: synthesize ``--text`` and write it to ``--out_path``.

    Model selection, in order:
      1. ``--list_models`` prints the available pre-trained models and exits.
      2. Unless ``--model_path`` is given, the pre-trained ``--model_name`` is
         downloaded; its default vocoder is used when neither
         ``--vocoder_name`` nor ``--vocoder_path`` is set.
      3. ``--model_path`` / ``--vocoder_path`` / ``--encoder_path`` (with
         their config paths) select custom model files instead.

    ``--list_speaker_idxs`` prints the speaker ids of the loaded model and
    returns. If the synthesizer reports a speakers file and neither
    ``--speaker_idx`` nor ``--speaker_wav`` is given, a hint is printed and
    nothing is synthesized.
    """
    parser = argparse.ArgumentParser(
        description="""Synthesize speech on command line.\n\n"""
        """You can either use your trained model or choose a model from the provided list.\n\n"""
        """If you don't specify any models, then it uses LJSpeech based English model.\n\n"""
        """
    # Example Runs:

    ## Single Speaker Models

    - list provided models

    ```
    $ ./TTS/bin/synthesize.py --list_models
    ```

    - run tts with default models.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS"
    ```

    - run a tts model with its default vocoder model.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
    ```

    - run with specific tts and vocoder models from the list

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
    ```

    - run your own TTS model (Using Griffin-Lim Vocoder)

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
    ```

    - run your own TTS and Vocoder models

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
    ```

    ## MULTI-SPEAKER MODELS

    - list the available speakers and choose as <speaker_id> among them.

    ```
    $ ./TTS/bin/synthesize.py --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
    ```

    - run the multi-speaker TTS model with the target speaker ID.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
    ```

    - run your own multi-speaker TTS model.

    ```
    $ ./TTS/bin/synthesize.py --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth.tar --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
    ```
    """,
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained tts and vocoder models.",
    )
    parser.add_argument("--text",
                        type=str,
                        default=None,
                        help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help=
        "Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help=
        "Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path",
                        default=None,
                        type=str,
                        help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    # type=bool would turn every non-empty string (including "False") into
    # True, so use str2bool exactly like --list_models does.
    parser.add_argument("--use_cuda",
                        type=str2bool,
                        nargs="?",
                        const=True,
                        help="Run model on CUDA.",
                        default=False)
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help=
        "Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path",
                        type=str,
                        help="Path to vocoder model config file.",
                        default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path",
                        type=str,
                        help="Path to speaker encoder config file.",
                        default=None)

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path",
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help=
        "wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The x_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style",
                        help="Wav path file for GST stylereference.",
                        default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )

    # aux args
    parser.add_argument(
        "--save_spectogram",
        type=str2bool,  # same type=bool pitfall as --use_cuda
        nargs="?",
        const=True,
        help=
        "If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False,
    )

    args = parser.parse_args()

    # print the description if none of text, list_models, list_speaker_idxs
    # is set
    if args.text is None and not args.list_models and not args.list_speaker_idxs:
        parser.parse_args(["-h"])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    speakers_file_path = None
    vocoder_path = None
    vocoder_config_path = None
    encoder_path = None
    encoder_config_path = None

    # CASE1: list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2: load pre-trained model paths (skipped when custom paths are given)
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(
            args.model_name)
        if args.vocoder_name is None:
            args.vocoder_name = model_item["default_vocoder"]

    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(
            args.vocoder_name)

    # CASE3: set custom model paths
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path
        speakers_file_path = args.speakers_file_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    if args.encoder_path is not None:
        encoder_path = args.encoder_path
        encoder_config_path = args.encoder_config_path

    # load models
    synthesizer = Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        args.use_cuda,
    )

    # query speaker ids of a multi-speaker model.
    if args.list_speaker_idxs:
        print(
            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
        )
        print(synthesizer.speaker_manager.speaker_ids)
        return

    # check the arguments against a multi-speaker model.
    if synthesizer.tts_speakers_file and (not args.speaker_idx
                                          and not args.speaker_wav):
        print(
            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
        )
        return

    # RUN THE SYNTHESIS
    print(" > Text: {}".format(args.text))

    # kick it
    wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav)

    # save the results
    print(" > Saving output to {}".format(args.out_path))
    synthesizer.save_wav(wav, args.out_path)
class DetectAndDeter:
    """Call-screening pipeline: transcribe the caller, classify them, and answer.

    Four worker processes are connected by queues:

    * ``speech_to_text``: ``audio_in_queue`` -> transcript entries, plus the
      classification and chatbot queues.
    * ``classify_text``: labels transcript lines and decides ``is_telemarketer``.
    * ``generate_responses``: ``stt_to_chatbot_queue`` -> ``chatbot_to_tts_queue``.
    * ``text_to_speech``: ``chatbot_to_tts_queue`` -> ``audio_out_queue``.
    """

    # A verdict is only attempted once more than this many lines are classified.
    CLASSIFICATION_COUNT = 5
    # Fraction of "persuasion" predictions above which the caller is flagged.
    TELEMARKETER_THRESH = 0.3
    # Fraction of "persuasion" predictions below which the caller is cleared.
    VALID_CALLER_THRESH = 0.1

    # Sample rates (Hz) used for the rate conversions below.
    IN_AUDIO_RATE = 8000
    DS_AUDIO_RATE = 16000
    MOZILLA_TTS_AUDIO_RATE = 22050

    # Sample value under which audio counts as quiet, and how many trailing
    # samples must all be quiet to end an utterance.
    QUIET_THRESH = 150
    QUIET_LENGTH = 3000

    def __init__(self, name):
        """Create the queues, shared state and (not yet started) workers.

        Args:
            name: The user's name, spoken in the greeting (e.g. "Bob Ross").
        """
        self.name = name  # user's name e.g. "Bob Ross"
        self.valid_caller_event = Event()
        self.caller_audio_chunk = np.array([], dtype='int16')

        self.audio_in_queue = Queue()
        self.stt_to_classification_queue = Queue()
        self.stt_to_chatbot_queue = Queue()
        self.chatbot_to_tts_queue = Queue()
        self.audio_out_queue = Queue()

        # Manager-backed objects so the worker processes share them.
        self.manager = Manager()
        self.transcript = self.manager.list()
        # Stays None until classify_text reaches a verdict.
        self.is_telemarketer = self.manager.Value("is_telemarketer", None)

        self.deep_speech = None
        self.mozilla_tts = None

        self.final_transcript = None
        self.final_predictions = None

        # These are processes, despite the "_thread" suffix.
        self.speech_to_text_thread = Process(target=self.speech_to_text)
        self.classify_text_thread = Process(target=self.classify_text)
        self.generate_response_thread = Process(target=self.generate_responses)
        self.text_to_speech_thread = Process(target=self.text_to_speech)

        self.log = {
            "start": None,
            "end": None,
            "version": CONFIG['version'],
            "transcript": [],
            "is_telemarketer": None,
            "caller": None
        }

    @property
    def queues(self):
        """Return the ``(audio_in_queue, audio_out_queue)`` pair."""
        return self.audio_in_queue, self.audio_out_queue

    def start(self):
        """Start all four workers and record the start time in the log."""
        self.speech_to_text_thread.start()
        self.classify_text_thread.start()
        self.generate_response_thread.start()
        self.text_to_speech_thread.start()

        self.log["start"] = dt.datetime.now().isoformat()

    def close(self):
        """Snapshot the results into the log, then stop every worker."""
        self.log["transcript"] = list(self.transcript)
        self.log["is_telemarketer"] = self.is_telemarketer.value
        self.log["end"] = dt.datetime.now().isoformat()

        workers = (
            self.speech_to_text_thread,
            self.classify_text_thread,
            self.generate_response_thread,
            self.text_to_speech_thread,
        )
        for worker in workers:
            worker.terminate()
            worker.join()
            worker.close()

    def fill_log_info(self, caller_number):
        """Store the caller's number in the log and return the log dict."""
        self.log['caller'] = caller_number
        return self.log

    def classify_text(self):
        """Worker loop: classify transcript lines until a verdict is reached.

        Each index read from ``stt_to_classification_queue`` has its transcript
        entry annotated with the prediction and confidence. Once more than
        ``CLASSIFICATION_COUNT`` lines are classified, the share of
        "persuasion" predictions is compared against the two thresholds.
        """
        predictions = []
        while self.is_telemarketer.value is None:
            idx = self.stt_to_classification_queue.get()
            text = self.transcript[idx]['text']

            preds = model.predict(text)
            # Items of the shared list are copies: edit one, then write it back.
            transcript_line = self.transcript[idx]
            transcript_line['analysis'] = {
                "prediction": str(preds[0]).lower(),
                "confidence": float(max(preds[2]))
            }
            self.transcript[idx] = transcript_line

            predictions.append(str(preds[0]).lower())
            maybe_telemarketer = predictions.count("persuasion") / len(predictions)

            if len(predictions) > self.CLASSIFICATION_COUNT:
                print("CLASS")
                print(maybe_telemarketer, self.TELEMARKETER_THRESH,
                      self.VALID_CALLER_THRESH)
                if maybe_telemarketer > self.TELEMARKETER_THRESH:
                    self.is_telemarketer.value = True
                    break
                elif maybe_telemarketer < self.VALID_CALLER_THRESH:
                    self.is_telemarketer.value = False
                    break

        if not self.is_telemarketer.value:
            self.valid_caller_event.set()

    def generate_responses(self):
        """Worker loop: turn each transcribed utterance into a chatbot reply."""
        while True:
            text = self.stt_to_chatbot_queue.get()
            print("Generate Response:", text)
            response = str(chatbot.get_response(text))

            self.chatbot_to_tts_queue.put(response)

    @staticmethod
    def _float_to_pcm16(samples):
        """Scale float samples by 2**15 and clip them into the int16 range.

        Without clipping, a sample at +1.0 (or anything outside [-1, 1))
        falls outside int16 when cast.
        """
        scaled = np.asarray(samples, dtype=np.float64) * 2**15
        return np.clip(scaled, -2**15, 2**15 - 1).astype('int16')

    @staticmethod
    def _split_chunks(data, size):
        """Split ``data`` into consecutive pieces of ``size`` (last may be shorter)."""
        return [data[start:start + size] for start in range(0, len(data), size)]

    def text_to_speech(self):
        """Worker loop: synthesize queued replies and emit encoded audio chunks.

        Each reply is synthesized, converted to 16-bit PCM, resampled from
        ``MOZILLA_TTS_AUDIO_RATE`` to ``IN_AUDIO_RATE``, mu-law encoded and put
        on ``audio_out_queue`` as base64 strings of at most 540 bytes each.
        The reply is then appended to the transcript.
        """
        tts_config = CONFIG['tts_config']
        models_folder = Path(tts_config['folder'])

        model_path = str(models_folder / tts_config['model'])
        model_config_path = str(models_folder / tts_config['model_config'])
        vocoder_path = str(models_folder / tts_config['vocoder'])
        vocoder_config_path = str(models_folder / tts_config['vocoder_config'])

        self.mozilla_tts = Synthesizer(model_path, model_config_path,
                                       vocoder_path, vocoder_config_path)

        chunk_len = 540
        while True:
            response = self.chatbot_to_tts_queue.get()
            print("TTS:", response)

            sound = self._float_to_pcm16(self.mozilla_tts.tts(response)).tobytes()
            sound, _ = audioop.ratecv(sound, 2, 1, self.MOZILLA_TTS_AUDIO_RATE,
                                      self.IN_AUDIO_RATE, None)
            ulaw_sound = audioop.lin2ulaw(sound, 2)

            for chunk in self._split_chunks(ulaw_sound, chunk_len):
                self.audio_out_queue.put(base64.b64encode(chunk).decode('utf-8'))

            self.transcript.append({
                "speaker": "self",
                "text": response,
                "datetime": dt.datetime.now().isoformat()
            })

    def speech_to_text(self):
        """Worker loop: transcribe caller audio one utterance at a time.

        Incoming mu-law audio is decoded, resampled from ``IN_AUDIO_RATE`` to
        ``DS_AUDIO_RATE`` and fed to the speech model's stream. When the last
        ``QUIET_LENGTH`` samples are all under ``QUIET_THRESH`` and some earlier
        sample exceeded it, the utterance is decoded; non-empty text goes to the
        chatbot queue, the transcript and the classification queue.
        """
        stt_config = CONFIG['stt_config']
        models_folder = Path(stt_config['folder'])

        model_path = str(models_folder / stt_config['model'])
        scorer_path = str(models_folder / stt_config['scorer'])

        self.deep_speech = Model(model_path)
        self.deep_speech.enableExternalScorer(scorer_path)

        stream = self.deep_speech.createStream()

        while True:
            speech = self.audio_in_queue.get()

            # Drain whatever else is already queued into the same buffer.
            while not self.audio_in_queue.empty():
                speech += self.audio_in_queue.get()

            lin_speech = audioop.ulaw2lin(speech, 2)
            ds_speech, _ = audioop.ratecv(lin_speech, 2, 1, self.IN_AUDIO_RATE,
                                          self.DS_AUDIO_RATE, None)

            lin_speech_arr = np.frombuffer(lin_speech, np.int16)
            ds_speech_arr = np.frombuffer(ds_speech, np.int16)

            stream.feedAudioContent(ds_speech_arr)

            self.caller_audio_chunk = np.concatenate(
                (self.caller_audio_chunk, lin_speech_arr))

            chunk_idx = max(0, len(self.caller_audio_chunk) - self.QUIET_LENGTH)
            quiet_chunk = self.caller_audio_chunk[chunk_idx:]
            # NOTE(review): samples are signed, so large negative samples also
            # satisfy "< QUIET_THRESH"; confirm whether np.abs() was intended.
            tail_is_quiet = (quiet_chunk < self.QUIET_THRESH).all()
            heard_something = (self.caller_audio_chunk > self.QUIET_THRESH).any()
            if tail_is_quiet and heard_something:
                text = stream.intermediateDecode()

                if text.strip():
                    self.stt_to_chatbot_queue.put(text)

                    idx = len(self.transcript)  # insert to avoid race conditions with indexes
                    self.transcript.insert(
                        idx, {
                            "speaker": "caller",
                            "text": text,
                            "datetime": dt.datetime.now().isoformat()
                        })
                    self.stt_to_classification_queue.put(idx)

                # Start a fresh stream and buffer for the next utterance.
                stream.finishStream()
                stream = self.deep_speech.createStream()

                self.caller_audio_chunk = np.array([], dtype='int16')

    def make_greeting(self, one_party_consent):
        """Queue the spoken greeting, plus a recording notice when required.

        Args:
            one_party_consent: When falsy, a second message announcing that
                calls are recorded is queued after the greeting.
        """
        self.chatbot_to_tts_queue.put(
            f"Hi. This is {self.name} how may I help you?")

        if not one_party_consent:
            self.chatbot_to_tts_queue.put("Keep in mind, I record all calls")
def load_synthesizer():
    """Create a CPU ``Synthesizer`` for the bundled Glow-TTS checkpoint.

    Returns:
        The ``Synthesizer`` built from the checkpoint and config under
        ``./voice/models/glow-tts``.
    """
    model_path = './voice/models/glow-tts/best_model.pth.tar'
    config_path = './voice/models/glow-tts/config.json'
    # Passed by keyword so the flag cannot land on a vocoder-path parameter in
    # Synthesizer signatures that take vocoder arguments after the config.
    # NOTE(review): assumes the installed Synthesizer accepts `use_cuda` as a
    # keyword -- confirm against the pinned TTS version.
    return Synthesizer(model_path, config_path, use_cuda=False)
def main():
    """Command-line entry point: list models, show model info, or synthesize speech.

    Parses the CLI arguments, resolves model/vocoder/encoder paths (either by
    downloading a named pre-trained model or from explicit paths), builds a
    ``Synthesizer`` and then either prints the available speaker/language ids
    or synthesizes ``--text`` and saves the wav to ``--out_path``.
    """
    description = """Synthesize speech on command line.

You can either use your trained model or choose a model from the provided list.

If you don't specify any models, then it uses LJSpeech based English model.

## Example Runs

### Single Speaker Models

- List provided models:

    ```
    $ tts --list_models
    ```

- Query info for model info by idx:

    ```
    $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
    ```

- Query info for model info by full name:

    ```
    $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
    ```

- Run TTS with default models:

    ```
    $ tts --text "Text for TTS"
    ```

- Run a TTS model with its default vocoder model:

    ```
    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
    ```

- Run with specific TTS and vocoder models from the list:

    ```
    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
    ```

- Run your own TTS model (Using Griffin-Lim Vocoder):

    ```
    $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
    ```

- Run your own TTS and Vocoder models:

    ```
    $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
        --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
    ```

### Multi-speaker Models

- List the available speakers and choose as <speaker_id> among them:

    ```
    $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
    ```

- Run the multi-speaker TTS model with the target speaker ID:

    ```
    $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
    ```

- Run your own multi-speaker TTS model:

    ```
    $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
    ```
    """
    # We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
    # documentation in sync more easily.
    parser = argparse.ArgumentParser(
        description=description.replace("    ```\n", ""),
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained TTS and vocoder models.",
    )

    parser.add_argument(
        "--model_info_by_idx",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<model_query_idx>",
    )

    parser.add_argument(
        "--model_info_by_name",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
    )

    parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    # argparse's type=bool treats every non-empty string (even "False") as True,
    # so boolean flags are parsed with str2bool like the --list_* flags.
    parser.add_argument(
        "--use_cuda",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="Run model on CUDA.",
    )
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--language_idx",
        type=str,
        help="Target language ID for a multi-lingual TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
    parser.add_argument(
        "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
    )
    parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    parser.add_argument(
        "--list_language_idxs",
        help="List available language ids for the defined multi-lingual model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    # aux args
    parser.add_argument(
        "--save_spectogram",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="If true save raw spectogram for further (vocoder) processing in out_path.",
    )
    parser.add_argument(
        "--reference_wav",
        type=str,
        help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
        default=None,
    )
    parser.add_argument(
        "--reference_speaker_idx",
        type=str,
        help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
        default=None,
    )

    args = parser.parse_args()

    # print the description if none of the action-selecting arguments is set
    check_args = [
        args.text,
        args.list_models,
        args.list_speaker_idxs,
        args.list_language_idxs,
        args.reference_wav,
        args.model_info_by_idx,
        args.model_info_by_name,
    ]
    if not any(check_args):
        parser.parse_args(["-h"])

    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path)

    model_path = None
    config_path = None
    speakers_file_path = None
    language_ids_file_path = None
    vocoder_path = None
    vocoder_config_path = None
    encoder_path = None
    encoder_config_path = None

    # CASE1 #list : list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

    # CASE2 #info : model info of pre-trained TTS models
    if args.model_info_by_idx:
        model_query = args.model_info_by_idx
        manager.model_info_by_idx(model_query)
        sys.exit()

    if args.model_info_by_name:
        model_query_full_name = args.model_info_by_name
        manager.model_info_by_full_name(model_query_full_name)
        sys.exit()

    # CASE3: load pre-trained model paths
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(args.model_name)
        # Fall back to the model's default vocoder unless one was requested.
        args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

    # CASE4: set custom model paths
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path
        speakers_file_path = args.speakers_file_path
        language_ids_file_path = args.language_ids_file_path

    if args.vocoder_path is not None:
        vocoder_path = args.vocoder_path
        vocoder_config_path = args.vocoder_config_path

    if args.encoder_path is not None:
        encoder_path = args.encoder_path
        encoder_config_path = args.encoder_config_path

    # load models
    synthesizer = Synthesizer(
        model_path,
        config_path,
        speakers_file_path,
        language_ids_file_path,
        vocoder_path,
        vocoder_config_path,
        encoder_path,
        encoder_config_path,
        args.use_cuda,
    )

    # query speaker ids of a multi-speaker model.
    if args.list_speaker_idxs:
        print(
            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
        )
        print(synthesizer.tts_model.speaker_manager.ids)
        return

    # query language ids of a multi-lingual model.
    if args.list_language_idxs:
        print(
            " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
        )
        print(synthesizer.tts_model.language_manager.ids)
        return

    # check the arguments against a multi-speaker model.
    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
        print(
            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
            "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
        )
        return

    # RUN THE SYNTHESIS
    if args.text:
        print(f" > Text: {args.text}")

    # kick it
    wav = synthesizer.tts(
        args.text,
        args.speaker_idx,
        args.language_idx,
        args.speaker_wav,
        reference_wav=args.reference_wav,
        style_wav=args.capacitron_style_wav,
        style_text=args.capacitron_style_text,
        reference_speaker_name=args.reference_speaker_idx,
    )

    # save the results
    print(f" > Saving output to {args.out_path}")
    synthesizer.save_wav(wav, args.out_path)
if args.vocoder_name is not None: vocoder_checkpoint_file, vocoder_config_file, vocoder_json_dict = manager.download_model(args.vocoder_name) # If these were not specified in the CLI args, use default values with embedded model files if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file): args.tts_checkpoint = tts_checkpoint_file if not args.tts_config and os.path.isfile(tts_config_file): args.tts_config = tts_config_file if not args.vocoder_checkpoint and os.path.isfile(vocoder_checkpoint_file): args.vocoder_checkpoint = vocoder_checkpoint_file if not args.vocoder_config and os.path.isfile(vocoder_config_file): args.vocoder_config = vocoder_config_file synthesizer = Synthesizer( args.tts_checkpoint, args.tts_config, args.vocoder_checkpoint, args.vocoder_config, args.use_cuda ) app = Flask(__name__) @app.route("/") def index(): return render_template("index.html", show_details=args.show_details) @app.route("/details") def details(): model_config = load_config(args.tts_config) if args.vocoder_config is not None and os.path.isfile(args.vocoder_config): vocoder_config = load_config(args.vocoder_config)
def test_split_into_sentences(self):
    """Verify the synthesizer's sentence splitter on a table of tricky inputs."""
    print("\n > Testing demo server sentence splitting")
    # pylint: disable=attribute-defined-outside-init, protected-access
    self.seg = Synthesizer._get_segmenter("en")
    split = Synthesizer.split_into_sentences

    # (input text, expected list of sentences), checked in order.
    cases = [
        ("Hello. Two sentences", ["Hello.", "Two sentences"]),
        (
            "He went to meet the adviser from Scott, Waltman & Co. next morning.",
            ["He went to meet the adviser from Scott, Waltman & Co. next morning."],
        ),
        (
            "Let's run it past Sarah and co. They'll want to see this.",
            ["Let's run it past Sarah and co.", "They'll want to see this."],
        ),
        ("Where is Bobby Jr.'s rabbit?", ["Where is Bobby Jr.'s rabbit?"]),
        (
            "Please inform the U.K. authorities right away.",
            ["Please inform the U.K. authorities right away."],
        ),
        ("Were David and co. at the event?", ["Were David and co. at the event?"]),
        (
            "paging dr. green, please come to theatre four immediately.",
            ["paging dr. green, please come to theatre four immediately."],
        ),
        (
            "The email format is [email protected]. I think you reversed them.",
            ["The email format is [email protected].", "I think you reversed them."],
        ),
        (
            "The demo site is: https://top100.example.com/subsection/latestnews.html. Please send us your feedback.",
            [
                "The demo site is: https://top100.example.com/subsection/latestnews.html.",
                "Please send us your feedback.",
            ],
        ),
        # with the final lowercase "she" we see it's all one sentence
        (
            "Scowling at him, 'You are not done yet!' she yelled.",
            ["Scowling at him, 'You are not done yet!' she yelled."],
        ),
        ("Hey!! So good to see you.", ["Hey!!", "So good to see you."]),
        (
            "He went to Yahoo! but I don't know the division.",
            ["He went to Yahoo! but I don't know the division."],
        ),
        (
            "If you can't remember a quote, “at least make up a memorable one that's plausible...\"",
            ["If you can't remember a quote, “at least make up a memorable one that's plausible...\""],
        ),
        ("The address is not google.com.", ["The address is not google.com."]),
        (
            "1.) The first item 2.) The second item",
            ["1.) The first item", "2.) The second item"],
        ),
        (
            "1) The first item 2) The second item",
            ["1) The first item", "2) The second item"],
        ),
        (
            "a. The first item b. The second item c. The third list item",
            ["a. The first item", "b. The second item", "c. The third list item"],
        ),
    ]

    for text, expected in cases:
        assert split(self, text) == expected
print("Link: " + v.text) links.append([v.text.strip(), v.attrs['href']]) return (txt, links) ar = len(sys.argv) if (ar == 1): print("Please use a URL as the first argument") quit() url = sys.argv[1] #txt = cleantext(txt) #txt2 = summ(txt) #print(txt2) hashindex = HashIndex(url) print(hashindex.index) path = '/home/osiris/.local/share/tts/' model_path = path + 'tts_models--en--ljspeech--tacotron2-DCA/model_file.pth.tar' config_path = path + 'tts_models--en--ljspeech--tacotron2-DCA/config.json' vocoder_path = path + 'vocoder_models--en--ljspeech--mulitband-melgan/model_file.pth.tar' vocoder_config_path = path + 'vocoder_models--en--ljspeech--mulitband-melgan/config.json' synthesizer = Synthesizer(model_path, config_path, vocoder_path, vocoder_config_path, False) #playall(txt2) (txt, links) = getPageData(url) go(txt)
args.vocoder_name) # CASE3: set custome model paths if args.model_path is not None: model_path = args.model_path config_path = args.config_path speakers_file_path = args.speakers_file_path if args.vocoder_path is not None: vocoder_path = args.vocoder_path vocoder_config_path = args.vocoder_config_path # load models synthesizer = Synthesizer(model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda) use_multi_speaker = synthesizer.speaker_manager is not None # TODO: set this from SpeakerManager use_gst = synthesizer.tts_config.get("use_gst", False) app = Flask(__name__) def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]: """Transform an uri style_wav, in either a string (path to wav file to be use for style transfer) or a dict (gst tokens/values to be use for styling) Args: style_wav (str): uri
class TextToSpeechListenerTask(Listener, Task):
    """Task that synthesizes queued ``Text`` messages and publishes the audio.

    Text items are read from the class-level ``queue``; each one is
    synthesized, converted to 16-bit PCM at ``output_sample_rate`` and
    published as an ``AudioFrame``.
    """

    queue: Queue = Queue(maxsize=20)
    model_name = "tts_models/en/ljspeech/tacotron2-DCA"
    vocoder_name = "vocoder_models/universal/libri-tts/fullband-melgan"
    use_cuda = False
    synthesizer: Synthesizer
    # Rate (Hz) assumed for the synthesizer output, and the rate it is
    # converted to before publishing.
    tts_sample_rate = 22050
    output_sample_rate = 16000

    # TODO move to local folder instead of system folder
    def __init__(self, auto_start: bool = True):
        """Download the model and vocoder and build the synthesizer.

        Args:
            auto_start: Forwarded unchanged to the parent constructor.
        """
        super().__init__(auto_start)
        path = Path(
            __file__
        ).parent / "../../.venv/lib/python3.8/site-packages/TTS/.models.json"
        manager = ModelManager(path)
        # Only the first two returned values (model path, config path) are
        # used; any further elements are ignored so that both two-element and
        # longer return tuples unpack cleanly.
        model_path, config_path, *_ = manager.download_model(self.model_name)
        vocoder_path, vocoder_config_path, *_ = manager.download_model(
            self.vocoder_name)
        self.synthesizer = Synthesizer(model_path, config_path, vocoder_path,
                                       vocoder_config_path, self.use_cuda)

    def run(self):
        """Consume the queue while ``self.running`` is true, speaking ``Text`` items."""
        while self.running:
            try:
                # The timeout lets the loop re-check self.running regularly.
                text = self.queue.get(timeout=2)
                if isinstance(text, Text):
                    self.speak(text)
                self.queue.task_done()
            except EmptyQueueError:
                continue

    def speak(self, text: Text) -> None:
        """Synthesize ``text.value`` and publish it as an ``AudioFrame``.

        Empty text is logged and skipped instead of being sent to the
        synthesizer.
        """
        if not text.value:
            logger.warning("Cannot synthesize empty text")
            return
        data = self.synthesize(text.value)
        publish_message(self, AudioFrame(data), text.src)

    def synthesize(self, text: str) -> bytes:
        """Return ``text`` as 16-bit PCM bytes at ``output_sample_rate``.

        The synthesizer output is peak-normalized to the int16 range,
        converted from ``tts_sample_rate`` down to ``output_sample_rate`` so it
        can be consumed again, and also written to disk via
        ``store_audio_data``.
        """
        audio_data = np.array(self.synthesizer.tts(text))
        # The 0.01 floor on the peak avoids dividing by (near) zero on silence.
        peak = max(0.01, np.max(np.abs(audio_data)))
        audio_data_normalized = audio_data * (32767 / peak)
        audio_data_bytes = audio_data_normalized.astype(np.int16).tobytes()
        audio_data_bytes = self.desample_audio_data(audio_data_bytes)
        self.store_audio_data(audio_data_bytes)
        return audio_data_bytes

    def desample_audio_data(self, data: bytes) -> bytes:
        """Convert mono 16-bit PCM from ``tts_sample_rate`` to ``output_sample_rate``."""
        converted = audioop.ratecv(data, 2, 1, self.tts_sample_rate,
                                   self.output_sample_rate, None)
        # ratecv returns (converted_bytes, state); only the bytes are needed.
        return converted[0]

    def store_audio_data(
        self,
        data: bytes,
        path: str = "/home/jonas/Projects/robot/mind/assets-old/output/tts/output.raw"
    ) -> None:
        """Write ``data`` to ``path``, replacing any existing file."""
        with open(path, "wb") as f:
            f.write(data)