def __init__(self):
    QMainWindow.__init__(self)
    self.setMinimumSize(QSize(880, 640))
    self.setWindowTitle("苏维埃社会主义语言学习机")  # "Soviet Socialist Language Learning Machine"
    # Add text field
    self.txtEdit = QPlainTextEdit(self)
    self.txtEdit.insertPlainText(sampleText)
    self.txtEdit.move(40, 10)
    self.txtEdit.resize(800, 300)
    # Add button
    self.btn1 = QPushButton("学习一个 (F1)", self)  # "Learn one (F1)"
    self.btn1.move(40, 310)
    self.btn1.clicked.connect(self.button1Clicked)
    # Add button
    self.btn2 = QPushButton("念洋文 (F2)", self)  # "Read the foreign text aloud (F2)"
    self.btn2.move(160, 310)
    self.btn2.clicked.connect(self.button2Clicked)
    # Add read-only output label
    self.lbl = QTextEdit(self)
    self.lbl.setReadOnly(True)
    self.lbl.move(40, 340)
    self.lbl.resize(800, 200)
    self.tts = TTS("jane", "mp3", "ae918646-fa47-4e66-96b6-6ce44d6d3146")
    self.player = QtMultimedia.QMediaPlayer(self)
    self.show()

def test_init(self):
    tts = TTS(SPEAKERS[0], AUDIO_FORMATS[0], KEY)
    # _TTS__params is the name-mangled access to the private __params dict
    self.assertEqual(tts._TTS__params["speaker"], SPEAKERS[0])
    self.assertEqual(tts._TTS__params["format"], AUDIO_FORMATS[0])
    self.assertEqual(tts._TTS__params["key"], KEY)
    self.assertEqual(tts._TTS__params["lang"], "ru-RU")

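# The fixtures used throughout these tests (SPEAKERS, AUDIO_FORMATS, KEY, TEXT,
# and later SPEEDS, LANGUAGES, EMOTIONS) are defined outside the snippets shown
# here. A minimal sketch, assuming illustrative SpeechKit values; only KEY would
# need to be a real API key:
SPEAKERS = ["jane", "oksana", "alyss", "omazh", "zahar", "ermil"]
AUDIO_FORMATS = ["mp3", "wav", "opus"]
LANGUAGES = ["ru-RU", "en-US", "uk-UK", "tr-TR"]
EMOTIONS = ["good", "neutral", "evil"]
SPEEDS = [0.5, 1.0, 1.5]
KEY = "your-speechkit-api-key"
TEXT = "Привет, мир"
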
def __init__(self, **kwargs):
    self.logger = kwargs.get('logger', logging.getLogger(__name__))
    self.yandex_tts = TTS(speaker=kwargs.get('speaker', 'jane'),
                          audio_format='wav',
                          emotion=kwargs.get('emotion'),
                          key=kwargs['key'])

def setUp(self):
    self.tts = TTS(SPEAKERS[0], AUDIO_FORMATS[0], KEY)
    self.tts.generate(TEXT)
    # create temp dir for tests
    self.tmp_dir = "tmp"
    os.makedirs(self.tmp_dir)
    os.chdir(self.tmp_dir)

def text_to_speech(text, file, key=YANDEX_API_KEY, speaker=speaker):
    tts = TTS(speaker, "wav", str(key), emotion='good', speed='0.9', quality='lo')
    # some environments expect pre-encoded text; fall back to the plain string
    try:
        tts.generate(text.encode('utf-8'))
    except Exception:
        tts.generate(text)
    tts.save(file)

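# Usage sketch for text_to_speech; YANDEX_API_KEY and the module-level
# `speaker` default are assumptions that must already exist when the def
# above executes, since Python evaluates default arguments at definition
# time (e.g. YANDEX_API_KEY = "your-speechkit-api-key" and speaker = "jane"):
text_to_speech("Привет, мир", "hello.wav")
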
def speak(audioString):
    tts = TTS("ermil", "mp3", "25d87483-720a-46ea-82bd-7f89d4c95bbd",
              lang='en-US', emotion="good")
    tts.generate(audioString + " ")
    tts.save()  # default save path, played back below as speech.mp3
    os.system("mpg321 --stereo speech.mp3 ")

def test_data(self):
    self.tts.generate(TEXT)
    # data received
    self.assertIsNotNone(self.tts._data)
    # more words should yield more data
    self.other_tts = TTS(SPEAKERS[0], AUDIO_FORMATS[0], KEY)
    self.other_tts.generate(TEXT * 2)
    self.assertLess(len(list(self.tts._data)), len(list(self.other_tts._data)))

def test_save_without_data(self):
    tmp_dir = "tmp"
    os.makedirs(tmp_dir)
    os.chdir(tmp_dir)
    tts = TTS(SPEAKERS[0], AUDIO_FORMATS[0], KEY)
    # saving without calling generate first should raise
    self.assertRaises(Exception, tts.save, "empty_data")
    self.assertFalse(os.path.isfile("empty_data"))
    os.chdir("..")
    rmtree(tmp_dir, ignore_errors=True)

def test_init_with_kwargs(self):
    speed = random.choice(SPEEDS)
    lang = random.choice(LANGUAGES)
    emotion = random.choice(EMOTIONS)
    tts = TTS(SPEAKERS[0], AUDIO_FORMATS[0], KEY,
              speed=speed, lang=lang, emotion=emotion)
    self.assertEqual(tts._TTS__params["speaker"], SPEAKERS[0])
    self.assertEqual(tts._TTS__params["format"], AUDIO_FORMATS[0])
    self.assertEqual(tts._TTS__params["key"], KEY)
    self.assertEqual(tts._TTS__params["lang"], lang)
    self.assertEqual(tts._TTS__params["speed"], speed)
    self.assertEqual(tts._TTS__params["emotion"], emotion)

def generate_tts(tts_text):
    tts_voice = tts_voices[randint(0, len(tts_voices) - 1)]
    tts_md5 = md5(tts_text.encode('utf-8')).hexdigest()
    tts_path = os.path.join(os.getcwd(), 'media', 'cache',
                            'tts.' + tts_md5 + '.' + tts_voice + '.opus')
    if os.path.exists(tts_path):
        return tts_path
    try:
        key = config.get('tts', 'api key')
        tts = TTS(tts_voice, 'opus', key, lang='ru_RU', emotion='neutral')
        tts.generate(tts_text)
        return tts.save(tts_path)
    except (NoSectionError, NoOptionError):
        return None

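# generate_tts caches each phrase under media/cache, keyed by the MD5 of the
# text plus the randomly chosen voice, so repeated phrases skip the synthesis
# call. A minimal sketch of the module-level setup it assumes (the config file
# name is hypothetical):
from configparser import ConfigParser, NoOptionError, NoSectionError
from hashlib import md5
from random import randint

tts_voices = ['jane', 'oksana', 'zahar']
config = ConfigParser()
config.read('bot.cfg')  # expects a [tts] section with an "api key" option
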
def _generate_audio_file(self):
    """
    Generic method used as a callback in TTSModule - must provide the audio
    file and write it to disk

    .. raises:: FailToLoadSoundFile
    """
    # Since the gTTS lib disabled SSL verification, silence the insecure request warning
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    tts = TTS(self.speaker, "mp3", self.key, self.language)
    tts.generate(text=self.words)
    # OK, we have the audio, so we can write the sound file
    tts.save(self.file_path)
    # Re-enable the warnings to avoid affecting the whole kalliope process
    warnings.resetwarnings()

def just_yandex_tts(text):
    # speech synthesis via Yandex SpeechCloud (Yandex technologies)
    try:
        # if this stops working, obtain and supply your own Yandex SpeechCloud key;
        # the speed parameter can be adjusted
        tts = TTS("oksana", "mp3", "60a2b005-738e-42b6-8b78-9ee9b7d57031", speed=1.2)
        tts.generate(text)
        tts.save('speechY.mp3')
    except Exception as e:
        print("[YandexTTS] Failed to synthesize speech: {0}".format(e))
        return
    mixer.init()
    mixer.music.load('speechY.mp3')
    mixer.music.play()
    while mixer.music.get_busy():
        time.sleep(0.1)
    mixer.music.stop()
    # a second audio file is needed, otherwise the mixer crashes with a permission error!
    mixer.music.load('new.waw')

def say(words):
    # words = translator.translate(words, dest=language)
    # words = words.text
    # words = words.replace("Text, ", '', 1)
    # words = words.strip()
    print(words)
    getConfig(path)
    # note: despite the original variable name, this is a SHA-1 digest
    digest = hashlib.sha1(words.encode('utf-8')).hexdigest()
    filemp3 = ""
    for file in os.listdir("/tmp/"):
        if file.endswith(digest + ".wav"):
            filemp3 = file
    if filemp3 == digest + ".wav":
        print("File already cached")
        os.system("aplay -q /tmp/" + filemp3)
    elif filemp3 == digest + ".mp3":
        print("File already cached")
        os.system("mpg123 -q /tmp/" + filemp3)
    elif PROVIDERTTS == "Yandex":
        print("Generating file")
        # tts = gTTS(text=words, lang=languageG)
        tts = TTS("alyss", "wav", APIKEYTTS, lang=language, emotion="good")
        tts.generate(words)
        words = hashlib.sha1(words.encode('utf-8')).hexdigest()
        ttsfilename = "/tmp/" + words + ".wav"
        tts.save(ttsfilename)
        os.system("aplay -q " + ttsfilename)
        # os.remove(ttsfilename)
    elif PROVIDERTTS == "Google":
        print("Generating file")
        tts = gTTS(text=words, lang=languageG)
        words = hashlib.sha1(words.encode('utf-8')).hexdigest()
        ttsfilename = "/tmp/" + words + ".mp3"
        tts.save(ttsfilename)
        os.system("mpg123 -q " + ttsfilename)

import speech_recognition as sr
from yandex_speech import TTS
from pygame import mixer
from pygame.time import delay
from os import remove
from platform import system

r = sr.Recognizer()
tts = TTS("oksana", "mp3", "60556d09-0e84-42b7-8974-9d0b01cfee33")
mixer.init(frequency=48000)


def listen():
    '''
    Listen for audio input from the microphone and return the recognized
    text using Google's open speech-to-text API via the speech_recognition
    library
    '''
    with sr.Microphone() as source:
        audio = r.listen(source)
    try:
        return r.recognize_google(audio, language="ru-RU")
    except sr.UnknownValueError:
        return 1
    except sr.RequestError:
        return 2


def say(text, savepath='phrases/livespeech.mp3'):
    '''
    Instantly synthesize the text and speak it through the speakers
    '''
    # the original body is truncated here; a minimal sketch, assuming the
    # synthesized mp3 is saved to savepath and played through pygame's mixer
    tts.generate(text)
    tts.save(savepath)
    mixer.music.load(savepath)
    mixer.music.play()
    while mixer.music.get_busy():
        delay(100)

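# A hypothetical interactive loop wiring listen() and say() together; the
# integer return codes follow the convention in listen() above:
if __name__ == '__main__':
    heard = listen()
    if heard == 1:
        say("Не расслышала, повторите")          # speech was not recognized
    elif heard == 2:
        say("Сервис распознавания недоступен")   # recognition service unreachable
    else:
        say(heard)
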
def t_to_s(text):
    if len(text) > 500:
        text = text[0:490]
    tts = TTS("oksana", "opus", "b04291f2-5e31-4c8e-af57-1695b7bd5f16",
              lang="ru_RU", emotion="good")
    tts.generate(text)
    tts.save("ramazan")
    return "ramazan"

def t_to_s(text):
    tts = TTS("oksana", "opus", "b04291f2-5e31-4c8e-af57-1695b7bd5f16")
    tts.generate(text)
    tts.save("ramazan")
    return "ramazan"

def test():
    os.environ["CUDA_VISIBLE_DEVICES"] = config.device_ids
    result_dir = 'temp/' + config.in_file
    motion_dir = result_dir + '/motion/'
    os.mkdir(result_dir)
    os.mkdir(motion_dir)
    pca = torch.FloatTensor(np.load('basics/pca.npy')[:, :6])
    mean = torch.FloatTensor(np.load('basics/mean.npy'))
    decoder = VG_net()
    encoder = AT_net()
    state_dict2 = multi2single(config.vg_model, 1)
    decoder.load_state_dict(state_dict2)
    state_dict = multi2single(config.at_model, 1)
    encoder.load_state_dict(state_dict)
    encoder.eval()
    decoder.eval()
    test_file = result_dir + "/" + config.in_file + ".wav"
    test_file_old = result_dir + "/old_" + config.in_file + ".wav"
    if config.text_tts == "" and config.news_url != "":
        parse_news_content = get_info(config.news_url)['news_content']
    else:
        parse_news_content = config.text_tts
    tts = TTS(config.name_tts, "wav", "000000-0000-0000-0000-00000000",
              config.lang_tts, emotion="neutral", speed=1)
    # synthesize the test content (truncated to stay under the request limit)
    tts.generate(parse_news_content[:1999])
    if config.shift == 1:
        tts.save(test_file_old)
        audio_shift(test_file_old, test_file)
    else:
        tts.save(test_file)
    example_image, example_landmark = generator_demo_example_lips(config.person)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])
    example_image = cv2.cvtColor(example_image, cv2.COLOR_BGR2RGB)
    example_image = transform(example_image)
    example_landmark = example_landmark.reshape(
        (1, example_landmark.shape[0] * example_landmark.shape[1]))
    if config.cuda:
        example_image = Variable(example_image.view(1, 3, 128, 128)).cuda()
        example_landmark = Variable(
            torch.FloatTensor(example_landmark.astype(float))).cuda()
    else:
        example_image = Variable(example_image.view(1, 3, 128, 128))
        example_landmark = Variable(
            torch.FloatTensor(example_landmark.astype(float)))
    example_landmark = example_landmark * 5.0
    example_landmark = example_landmark - mean.expand_as(example_landmark)
    example_landmark = torch.mm(example_landmark, pca)
    speech, sr = librosa.load(test_file, sr=16000)
    # pad the speech with silence before computing the MFCC features
    speech = np.insert(speech, 0, np.zeros(1920))
    speech = np.append(speech, np.zeros(1920))
    mfcc = python_speech_features.mfcc(speech, 16000, winstep=0.01)
    sound, _ = librosa.load(test_file, sr=44100)
    print('=======================================')
    print('Generate images')
    t = time.time()
    ind = 3
    with torch.no_grad():
        fake_lmark = []
        input_mfcc = []
        while ind <= int(mfcc.shape[0] / 4) - 4:
            t_mfcc = mfcc[(ind - 3) * 4:(ind + 4) * 4, 1:]
            t_mfcc = torch.FloatTensor(t_mfcc)
            input_mfcc.append(t_mfcc)
            ind += 1
        input_mfcc = torch.stack(input_mfcc, dim=0)
        input_mfcc = input_mfcc.unsqueeze(0)
        fake_lmark = encoder(example_landmark, input_mfcc)
        fake_lmark = fake_lmark.view(fake_lmark.size(0) * fake_lmark.size(1), 6)
        example_landmark = torch.mm(example_landmark, pca.t())
        example_landmark = example_landmark + mean.expand_as(example_landmark)
        fake_lmark[:, 1:6] *= 2 * torch.FloatTensor(
            np.array([1.1, 1.2, 1.3, 1.4, 1.5]))
        fake_lmark = torch.mm(fake_lmark, pca.t())
        fake_lmark = fake_lmark + mean.expand_as(fake_lmark)
        fake_lmark = fake_lmark.unsqueeze(0)
        fake_lmark = fake_lmark.data.cpu().numpy()
        file_mark = result_dir + "/" + config.in_file + ".npy"
        file_mp4 = result_dir + "/" + config.in_file  # + ".mp4"
        np.save(file_mark, fake_lmark)
        mark_paint.mark_video(fake_lmark, motion_dir)
        # render the landmark frames into a video
        cmd = ('ffmpeg -framerate 25 -i ' + motion_dir + '%d.png '
               '-filter:v scale=512:-1 -c:v libx264 -pix_fmt yuv420p ' +
               file_mp4 + '.mp4')
        subprocess.call(cmd, shell=True)
        print('video done')
        # mux the synthesized audio into the rendered video
        cmd = ('ffmpeg -i ' + file_mp4 + '.mp4 -i ' + test_file +
               ' -c:v copy -c:a aac -strict experimental ' +
               file_mp4 + '_result.mp4')
        subprocess.call(cmd, shell=True)
        print('video+audio done')
        return file_mark

# If the script was launched in test mode from the console with the "test" argument
if len(sys.argv) > 1 and str(sys.argv[1]) == "test":
    # Print the information to the screen
    print("to_speaker = " + to_speaker)
    print("count_rings = " + str(count_rings))
# If the script is running in production mode
else:
    # Write the information to the logs
    write_log("to_speaker = " + to_speaker)
    write_log("count_rings = " + str(count_rings))

# If this is the first ring (the phone rang for the first time on an incoming call)
if count_rings == 0:
    # Pass our text to Yandex.Speech to be spoken and save the resulting file as mp3
    file_mp3 = "/home/asterisk/to_speaker/name_or_number"
    tts = TTS("zahar", "mp3", "*****-****-****-****-***********")
    tts.generate(str(to_speaker))
    tts.save(file_mp3)
    file_mp3 = file_mp3 + ".mp3"
    # If this is an internal call, pause so that the phone rings first and the robot
    # speaks after it; otherwise, on internal calls, the robot speaks before the phone
    # rings, which scares us a little :)
    if mc.get("is_internal") is not None:
        mc.delete("is_internal")
        # No pause needed; "do not skip the first ring" had to be enabled on the handsets
        # time.sleep(2)

# If fewer than five rings have happened
if count_rings < 5:
    to_speaker_all = '/usr/bin/mplayer -ao alsa -really-quiet -noconsolecontrols ' + file_mp3
    os.system(to_speaker_all)

def setUp(self):
    self.tts = TTS(SPEAKERS[0], AUDIO_FORMATS[0], KEY)