def PredictForLanguage(executor, language):
    language_path = os.path.join(path, language)
    if not app_utils.CheckIfPathExists(language_path):
        return
    files = app_utils.listdir_fullpath(language_path)
    with open(os.path.join(out_path, "vortex_" + language + ".csv"), "w") as f:
        for file in files:
            # Convert speech to text, then use the result to predict
            print("Converting Speech to Text for ", file)
            text = SpeechToText(executor, file, language)
            app_utils.DebugCommand("Text Recognised is ", text)
            translated = TranslateToEnglish(text, language, translator="Seq2Seq")
            app_utils.DebugCommand("Translated to English Is ", translated)
            # Wait for the model warm-up thread to finish before predicting
            # (Thread.isAlive() was renamed is_alive() in Python 3)
            if ModelColdStarter.is_alive():
                ModelColdStarter.join()
            prediction = PredictResults(translated)
            print(prediction)
            f.write(file + ", " + prediction[0] + "\n")
def __init__(self, detect_model="data/andrew2.net",
             lyrics_model="data/keras_model_1200.h5",
             lyrics_chars="data/chars.pkl"):
    # Microphone
    self.mic = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)
    # Wake word detector
    self.detector = TriggerDetector(detect_model)
    # Speech and language services
    self.speech_client = SpeechToText()
    self.luis = LangUnderstand()
    self.tts = TextToSpeech()
    # Lyrics generator model
    self.lyrics_gen = LyricsGenerator(lyrics_model, lyrics_chars)
    self.pred_queue = DetectQueue(maxlen=5)
    self.is_wakeup = False
    # PyTFT display, updated from a daemon thread
    self.tft = TFTDisplay()
    self.tft_queue = queue.Queue()
    self.tft_thread = threading.Thread(target=self.tft_manage, args=())
    self.tft_thread.daemon = True
    self.tft_thread.start()
    self.notify("hi_there")
def send_voice(self):
    try:
        # run() returns the recognised text plus a refreshed auth token
        result = SpeechToText().run(self.auth_token)
        output = result[0]
        self.auth_token = result[1]
        print(self.auth_token)
        self.browser.append("You: " + output)
        print(output)
        self.receive_message(output)
    except Exception as e:
        print(e)
def get_text_from_audio():
    try:
        file = request.files.get('file')
        # Only accept FLAC or WAV uploads (guard against names without a dot)
        if (file and '.' in file.filename
                and file.filename.rsplit('.', 1)[1].lower() in ('flac', 'wav')):
            speech_to_text = SpeechToText()
            filename = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            text = speech_to_text.recognize(
                audio_file=os.path.join(app.config['UPLOAD_FOLDER'], filename))
            return jsonify({'text': text})
        return jsonify(
            {'message': 'File format is not supported. Please use flac, wav'}), 400
    except Exception:
        return jsonify('Failed to upload file'), 400
def main():
    (stt_json, adj_json, output_json) = sys.argv[1:4]

    # Turn the adjustment data into a list of kept segments
    with open(adj_json, 'r') as file:
        adj_data = json.load(file)

    # Turn the STT json into objects
    with open(stt_json, 'r') as file:
        stt = SpeechToText().from_json(json.load(file))

    # List of adjustments (start, end, adjustment)
    offset_adj = []
    # Last ending position for iterating through kept segments
    last_end = 0.00
    # Running tally of removed segment lengths
    current_adj = 0.00

    # For each segment that was kept, keep track of the gaps to know how much to adjust
    for kept_segment in adj_data:
        print(kept_segment + ":" + str(adj_data[kept_segment]))
        start = float(kept_segment)
        end = adj_data[kept_segment]
        # If the start of this segment is after the last end, we have a gap
        if start >= last_end:
            # Keep track of the gap in segments
            current_adj = current_adj + (start - last_end)
            # Add it to the list of adjustments
            offset_adj.append(
                Adjustment(start - current_adj, end - current_adj, current_adj))
        # Keep track of the last segment end
        last_end = end

    # For each word, find the corresponding adjustment
    for word in stt.results.words:
        adjust_word(word, offset_adj)

    # Write the resulting json
    mgm_utils.write_json_file(stt, output_json)
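# Note: the snippet above depends on an Adjustment record and an adjust_word
# helper defined elsewhere. A minimal hypothetical sketch of both, assuming
# adjust_word shifts any word whose start falls inside an adjustment's
# (start, end) range by that adjustment's accumulated offset:
from dataclasses import dataclass

@dataclass
class Adjustment:
    # A kept segment (in adjusted time) plus the total length removed before it
    start: float
    end: float
    adjustment: float

def adjust_word(word, offset_adj):
    # Hypothetical reconstruction: map the word back to the original timeline
    # by adding the offset of the adjustment range that contains it
    for adj in offset_adj:
        if adj.start <= word.start <= adj.end:
            word.start += adj.adjustment
            word.end += adj.adjustment
            break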
def convert(media_file, kaldi_file, kaldi_transcript_file, output_json_file):
    mgm_utils.exception_if_file_not_exist(kaldi_file)
    if not os.path.exists(kaldi_transcript_file):
        raise Exception("Exception: File " + kaldi_transcript_file +
                        " doesn't exist, the previous command generating it must have failed.")
    results = SpeechToTextResult()

    # Open the kaldi json
    with open(kaldi_file) as json_file:
        data = json.load(json_file)

    # Get the kaldi transcript (and close the file when done)
    with open(kaldi_transcript_file, "r") as transcript:
        results.transcript = transcript.read()

    # Get the list of words
    words = data["words"]
    duration = 0.00

    # For each word, add a word to our results
    for w in words:
        time = float(w["time"])
        end = time + float(w["duration"])
        # Keep track of the latest end time and use it as the duration
        if end > duration:
            duration = end
        results.addWord("", time, end, w["word"], None, None)

    # Create the media object
    media = SpeechToTextMedia(duration, media_file)

    # Create the final object
    outputFile = SpeechToText(media, results)

    # Write the output
    mgm_utils.write_json_file(outputFile, output_json_file)
def __init__(self, config):
    """
    Initialize.

    :param config: configuration
    :type config: Config
    """
    self.command_processor = CommandProcessor(
        self._command_handlers(config.command_handlers))
    self.robot = Robot(config.apiai.client_access_token, config.apiai.language,
                       self.command_processor.commands)
    self.speech_to_text = SpeechToText(
        config.speechkit.key, "", config.speechkit.recognition.language)
    self.text_to_speech = TextToSpeech(
        config.speechkit.synthesis.cache_size, config.speechkit.key,
        config.speechkit.synthesis.language, config.speechkit.synthesis.speaker,
        config.speechkit.synthesis.emotion, config.speechkit.synthesis.speed)
    self.record = SpeechCapture(config.record.silence_calculation_chunks,
                                config.record.speech_level_coefficient,
                                config.record.start_wait_chunks,
                                config.record.finish_wait_chunks)
def predict(speech_recognition=False, speech_synthesis=False):
    '''
    Interact with the trained seq2seq model.
    1. speech_recognition - enable speech recognition from the microphone via PocketSphinx
    2. speech_synthesis - enable spoken answers via RHVoice
    '''
    name_dataset = configure_file_names()
    ttt = TextToText(f_name_w2v_model=f_name_w2v_model, f_name_model=f_name_model,
                     f_name_model_weights=f_name_model_weights)

    if speech_recognition:
        print('[i] Loading the language model for speech recognition...')
        stt = SpeechToText('from_microphone', name_dataset)

    if speech_synthesis:
        print('[i] Loading the speech synthesizer...')
        tts = TextToSpeech('anna')

    print()
    question = ''
    while True:
        if speech_recognition:
            print('Listening...')
            question = stt.get()
            # Move the cursor up one line to overwrite the "Listening..." prompt
            os.write(sys.stdout.fileno(), curses.tigetstr('cuu1'))
            print('You: ' + question)
        else:
            question = input('You: ')
        answer, lost_words = ttt.predict(question, True)
        print('\t=> %s' % answer)
        if len(lost_words) > 0:
            print('[w] Lost words: ' + ', '.join(lost_words) + '\n')
        else:
            print()
        if speech_synthesis:
            tts.get(answer)
def main():
    (media_file, transcribe_file, output_stt_json_file,
     output_seg_json_file) = sys.argv[1:5]
    mgm_utils.exception_if_file_not_exist(transcribe_file)

    # Open the transcribe output
    with open(transcribe_file) as json_file:
        data = json.load(json_file)

    amp_results = SpeechToTextResult()

    # Fail if we don't have results
    if "results" not in data.keys():
        exit(1)

    aws_results = data["results"]
    if "transcripts" not in aws_results.keys():
        exit(1)

    # Parse the transcripts
    transcripts = aws_results["transcripts"]
    for t in transcripts:
        amp_results.transcript = amp_results.transcript + t["transcript"]

    # Fail if we don't have any items
    if "items" not in aws_results.keys():
        exit(1)

    # Parse the items (words)
    items = aws_results["items"]
    duration = 0.00

    # For each item, get the necessary parts and store them as a word
    for i in items:
        alternatives = i["alternatives"]
        # Each word is stored as an "alternative"; choose the one with the
        # maximum confidence
        max_confidence = 0.00
        text = ""
        for a in alternatives:
            if float(a["confidence"]) >= max_confidence:
                max_confidence = float(a["confidence"])
                text = a["content"]

        end_time = -1
        start_time = -1
        # Two types (punctuation, pronunciation); only keep times for pronunciation
        if i["type"] == "pronunciation":
            end_time = float(i["end_time"])
            start_time = float(i["start_time"])
            # If this is the greatest end time, store it as the duration
            if end_time > duration:
                duration = end_time

        # Add the word to the results
        amp_results.addWord(i["type"], start_time, end_time, text,
                            "confidence", max_confidence)

    # Create the media object
    media = SpeechToTextMedia(duration, media_file)

    # Create the final object
    outputFile = SpeechToText(media, amp_results)

    # Write the output
    mgm_utils.write_json_file(outputFile, output_stt_json_file)

    # Start the segmentation schema with diarization data
    # Create a segmentation object to serialize
    seg_schema = Segmentation()

    # Create the media object
    segMedia = SegmentationMedia(duration, media_file)
    seg_schema.media = segMedia

    if "speaker_labels" in aws_results.keys():
        speakerLabels = aws_results["speaker_labels"]
        seg_schema.numSpeakers = speakerLabels["speakers"]

        # For each segment, get the start time, end time and speaker label
        segments = speakerLabels["segments"]
        for segment in segments:
            seg_schema.addDiarizationSegment(float(segment["start_time"]),
                                             float(segment["end_time"]),
                                             segment["speaker_label"])

    # Write the output
    mgm_utils.write_json_file(seg_schema, output_seg_json_file)
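# For reference, the parser above reads the standard Amazon Transcribe output
# shape. A minimal hand-written example of just the fields it touches (not
# actual tool output):
transcribe_output_example = {
    "results": {
        "transcripts": [{"transcript": "Hello world."}],
        "items": [
            {"type": "pronunciation", "start_time": "0.04", "end_time": "0.52",
             "alternatives": [{"confidence": "0.99", "content": "Hello"}]},
            {"type": "pronunciation", "start_time": "0.55", "end_time": "0.98",
             "alternatives": [{"confidence": "0.97", "content": "world"}]},
            # Punctuation items carry no time offsets
            {"type": "punctuation",
             "alternatives": [{"confidence": "0.0", "content": "."}]},
        ],
        "speaker_labels": {
            "speakers": 1,
            "segments": [{"start_time": "0.04", "end_time": "0.98",
                          "speaker_label": "spk_0"}],
        },
    }
}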
def main():
    (input_file, json_file, bucketName, dataAccessRoleArn) = sys.argv[1:5]

    # Read a list of categories to ignore when outputting the entity list
    ignore_cats_list = list()
    if len(sys.argv) > 5:
        print("ignore cats:" + sys.argv[5])
        ignore_cats_list = split_ignore_list(sys.argv[5])

    # Variable declaration
    outputS3Uri = 's3://' + bucketName + '/'
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    jobName = 'AwsComprehend-' + timestamp + ".json"
    inputS3Uri = outputS3Uri + jobName

    # Get the transcript text from the input file
    with open(input_file, 'r') as file:
        stt = SpeechToText().from_json(json.load(file))

    # Create the ner object
    ner = EntityExtraction()

    # Add the media information
    if stt is None or stt.results is None:
        mediaLength = 0
    else:
        mediaLength = len(stt.results.transcript)

    # If we have a blank file, don't error; create another blank json file to
    # pass to the next process
    if mediaLength == 0:
        ner.media = EntityExtractionMedia(mediaLength, input_file)
        mgm_utils.write_json_file(ner, json_file)
        exit(0)

    # Create a temp file to upload to S3
    tmpfile = create_temp_transcript_file(jobName, stt.results.transcript)

    # Copy the temporary text file to S3
    copy_to_s3(tmpfile.name, bucketName, jobName)

    # Make the call to AWS Comprehend
    output_uri = run_comprehend_job(jobName, inputS3Uri, outputS3Uri,
                                    dataAccessRoleArn)

    uncompressed_file = download_from_s3(output_uri, outputS3Uri, bucketName)
    if uncompressed_file is None:
        exit(1)

    comprehend_data = read_comprehend_response(uncompressed_file)
    ner.media = EntityExtractionMedia(mediaLength, input_file)

    # Variables for filling time offsets based on speech to text
    lastPos = 0  # Iterator to keep track of location in the STT words
    sttWords = len(stt.results.words)  # Number of STT words

    if 'Entities' in comprehend_data.keys():
        for entity in comprehend_data["Entities"]:
            entity_type = entity["Type"]
            # Start and end time offsets
            start = None
            end = None
            text = entity["Text"]
            # Split the entity into an array of words based on whitespace
            entityParts = text.split()
            # For each word in the entity, find the corresponding word in the
            # STT word list
            foundWordPos = None
            for entityPart in entityParts:
                for wordPos in range(lastPos, sttWords):
                    word = stt.results.words[wordPos]
                    # If it matches, set the time offset
                    if clean_entity_word(word.text) == clean_entity_word(entityPart):
                        # Keep track of the last position to save iterations
                        foundWordPos = wordPos
                        # Set start if we haven't set it yet
                        if start is None:
                            start = word.start
                        end = word.end
                        break
                else:
                    # No STT word matched this entity part (the loop did not
                    # break): give up on time offsets for this entity
                    start = None
                    end = None
                    foundWordPos = None
            if start is not None:
                lastPos = foundWordPos
            else:
                print("Could not find word")
                print(text)
                print(entityParts)
                print(lastPos)
            if clean_text(entity_type) not in ignore_cats_list and start is not None:
                # AMP-636 removed startOffset=endOffset=end=None
                ner.addEntity(entity_type, text, None, None, "relevance",
                              float(entity["Score"]), start, None)

    # Write the json file
    mgm_utils.write_json_file(ner, json_file)

    # Clean up temp files
    safe_delete(uncompressed_file)
    safe_delete(tmpfile.name)
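# clean_entity_word and clean_text above are helpers defined elsewhere in the
# file. A plausible sketch, assuming they only normalize case and strip
# punctuation so that, e.g., "Bloomington," matches "bloomington":
import string

def clean_entity_word(word):
    # Hypothetical: strip surrounding punctuation and lowercase for comparison
    return word.strip(string.punctuation).lower()

def clean_text(text):
    # Hypothetical: normalize a category name for the ignore-list check
    return text.strip().lower()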
def main():
    (root_dir, from_draftjs, original_transcript, to_transcript) = sys.argv[1:5]

    # Use the output instead of the input filename for the log, as the latter
    # is unique while the former could be used by multiple jobs
    logger = MgmLogger(root_dir, "hmgm_transcript", to_transcript)
    sys.stdout = logger
    sys.stderr = logger

    try:
        # If from_draftjs is in error, raise an exception to notify the HMGM
        # job runner to fail the job; otherwise, if from_draftjs doesn't exist
        # yet, exit 1 to keep waiting
        mgm_utils.exit_if_file_not_ready(from_draftjs)
        print("Converting DraftJs " + from_draftjs + " to Transcript " + to_transcript)

        with open(from_draftjs) as json_file:
            data = json.load(json_file)
        # print("the data in editor output is:", data)

        # Read the original file to extract only the confidence score of each word
        with open(original_transcript) as original_input:
            original_json = json.loads(original_input.read())
        original_items = original_json["results"]["words"]

        results = SpeechToTextResult()
        word_type = text = ''
        confidence = start_time = end_time = -1
        duration = 0.0

        # The draftJS input file here always came from a converted and
        # corrected AMP Transcript, so it should always contain 'entityMap',
        # otherwise an error should occur
        # Standardising draft js format
        # if "entityMap" in data.keys():
        transcript = ''
        entityMap = data["entityMap"]
        for i in range(0, len(entityMap.keys())):
            punctuation = ''
            if str(i) not in entityMap.keys():
                continue
            entity = entityMap[str(i)]
            if "data" in entity:
                if "text" in entity["data"].keys():
                    text = entity["data"]["text"]
                    transcript += entity["data"]["text"] + " "
                    # Split a trailing punctuation mark off the word
                    if text[-1] in string.punctuation:
                        punctuation = text[-1]
                        text = text[0:-1]
                if "type" in entity:
                    entity_type = entity["type"]
                    if entity_type == "WORD":
                        word_type = "pronunciation"
                        if "start" in entity["data"]:
                            start_time = float(entity["data"]["start"])
                        if "end" in entity["data"]:
                            end_time = float(entity["data"]["end"])
                            if end_time > duration:
                                duration = end_time
                    else:
                        word_type = entity_type
            results.addWord(word_type, start_time, end_time, text,
                            "confidence", confidence)
            if len(punctuation) > 0:
                results.addWord('punctuation', None, None, punctuation,
                                "confidence", 0.0)
        results.transcript = transcript
        words = results.words

        # Now retrieve the confidence values from the original input file and
        # assign them to 'results'
        list_items = []
        list_result = []
        for i in range(0, len(original_items)):
            list_items.append(original_items[i]["text"])
        for j in range(0, len(words)):
            list_result.append(words[j].text)
        d = difflib.Differ()
        res = list(d.compare(list_items, list_result))
        i = j = 0
        word_count = len(words)
        original_item_count = len(original_items)
        print("original item count: " + str(original_item_count))
        print("word count: " + str(word_count))
        for ele in res:
            if j >= word_count or i >= original_item_count:
                break
            elif ele.startswith("- "):
                i += 1
            elif len(ele) > 2 and ele[0:2] == "+ ":
                words[j].score.scoreValue = 1.0
                j += 1
            elif ele[0:1] == " " and words[j].text == original_items[i]["text"]:
                if "score" in original_items[i]:
                    words[j].score.scoreValue = float(
                        original_items[i]["score"]["scoreValue"])
                else:
                    # Default the score to 1.0 if it didn't exist originally
                    words[j].score.scoreValue = 1.0
                i += 1
                j += 1
        print("i: " + str(i) + " j:" + str(j))

        # Create the media object
        media = SpeechToTextMedia(duration, original_transcript)

        # Create the final object
        stt = SpeechToText(media, results)

        # Write the output
        mgm_utils.write_json_file(stt, to_transcript)
        print("Successfully converted from DraftJs " + from_draftjs +
              " to Transcript " + to_transcript)
        # As the last command in HMGM, implicitly exit 0 here to let the whole
        # job complete in success
    except Exception as e:
        # As the last command in HMGM, exit -1 to let the whole job fail
        print("Failed to convert from DraftJs " + from_draftjs +
              " to Transcript " + to_transcript, e)
        traceback.print_exc()
        sys.stdout.flush()
        exit(-1)
def reply(text_message, client, thread_id):
    # Don't reply to our own messages
    if client.uid == thread_id:
        return
    mess = Message(text=text_message)
    client.send(mess, thread_id=thread_id, thread_type=ThreadType.USER)
    client.delay()


client = Bot('', '', max_tries=1, user_agent=cfg.user_agent,
             session_cookies=cfg.session_cookies)
client.startListening()

acc_demo = '100041985261746'
acc_trDuong = '100014187060145'

# Speech recognition triggered by the 'right shift' key; results go to reply()
recg = SpeechToText('right shift', reply, (client, acc_trDuong,))

while 1:
    client.doOneListen()
    recg.recognize_once()

while 1:
    pass  # speech_to_text.recognize('space', reply, (client, '100014187060145',))
def run(host, port, wsgi=False, https_mode=False):
    '''
    Auto-select an available port (if port 0 is given), load the language
    model and the neural network, and start the server.
    1. wsgi - True: start a WSGI server, False: start the test Flask server
    2. https_mode - True: run over https (the certificate and key must be in
    cert.pem and key.pem), False: run over http

    A self-signed certificate can be generated with:
    openssl req -x509 -newkey rsa:4096 -nodes -out temp/cert.pem -keyout temp/key.pem -days 365
    '''
    if port == 0:  # If port 0 was given, auto-select any available port
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.bind((host, 0))
            port = sock.getsockname()[1]
            log('selected port ' + str(port))
            sock.close()
        except socket.gaierror:
            log('address ' + host + ':' + str(port) + ' is invalid', level='error')
            sock.close()
            return
        except OSError:
            log('address ' + host + ':' + str(port) + ' is unavailable', level='error')
            sock.close()
            return

    log('Flask v.' + flask_version + ', WSGIServer v.' + wsgi_version)
    log('maximum accepted request size set to {:.2f} KB'.format(
        max_content_length / 1024))

    name_dataset = f_name_w2v_model_plays[
        f_name_w2v_model_plays.rfind('w2v_model_') + len('w2v_model_'):
        f_name_w2v_model_plays.rfind('.bin')]

    log('loading the seq2seq model trained on the ' + name_dataset + ' dataset...')
    global ttt
    print()
    ttt = TextToText(f_name_w2v_model=f_name_w2v_model_plays,
                     f_name_model=f_name_model_plays,
                     f_name_model_weights=f_name_model_weights_plays)
    print()

    log('loading the language model for speech recognition...')
    global stt
    stt = SpeechToText('from_file', name_dataset)

    log('loading the speech synthesizer...')
    global tts
    tts = TextToSpeech('anna')

    if wsgi:
        global http_server
        if https_mode:
            log('WSGI server started at https://' + host + ':' + str(port) +
                ' (press Ctrl+C or Ctrl+Z to exit)')
        else:
            log('WSGI server started at http://' + host + ':' + str(port) +
                ' (press Ctrl+C or Ctrl+Z to exit)')
        try:
            if https_mode:
                http_server = WSGIServer((host, port), app, log=app.logger,
                                         error_log=app.logger,
                                         keyfile='temp/key.pem',
                                         certfile='temp/cert.pem')
            else:
                http_server = WSGIServer((host, port), app, log=app.logger,
                                         error_log=app.logger)
            http_server.serve_forever()
        except OSError:
            print()
            log('address ' + host + ':' + str(port) + ' is unavailable', level='error')
    else:
        log('starting the test Flask server...')
        try:
            if https_mode:
                app.run(host=host, port=port,
                        ssl_context=('temp/cert.pem', 'temp/key.pem'),
                        threaded=True, debug=False)
            else:
                app.run(host=host, port=port, threaded=True, debug=False)
        except OSError:
            print()
            log('address ' + host + ':' + str(port) + ' is unavailable', level='error')
def main():
    fail_safe = FailSafe()
    coref_solver = CorefSolver()
    print("Press H for help.")

    verbose = False
    if "--verbose" in sys.argv:
        verbose = True
        u = UtteranceBranching(coref_solver, verbose=True)
    else:
        u = UtteranceBranching(coref_solver)

    kb_file_name = None
    if "--kb" in sys.argv:
        pos = sys.argv.index("--kb")
        kb_file_name = sys.argv[pos + 1]

    q_file_name = None
    if "--q" in sys.argv:
        pos = sys.argv.index("--q")
        q_file_name = sys.argv[pos + 1]

    if kb_file_name:
        with open(kb_file_name, "r") as f:
            utterances = list(f)
        for utterance in utterances:
            print("-------------------------------------------------------")
            print(utterance)
            response = u.process(utterance[:-1])
            print(response)

    if q_file_name:
        with open(q_file_name, "r") as f:
            questions = list(f)
        for q in questions:
            print("-------------------------------------------------------")
            print(q)
            response = u.process(q[:-1])
            if response:
                # A trailing "+" marks a low-confidence response; fall back to
                # the FailSafe QA module when it is sufficiently similar
                if response[-1] == "+":
                    question, fail_response, similarity = fail_safe.answer_questions(q[:-1])
                    coref_solver.prev.pop()
                    # print("*********", similarity)
                    if similarity > 0.7:
                        response = fail_response
                    else:
                        response = response[:-1]
            else:
                question, response, similarity = fail_safe.answer_questions(q[:-1])
                # print(similarity)
                coref_solver.prev.pop()
            print("Bot: ", response)
        return

    speech_to_text = SpeechToText()
    while True:
        # Type or say an utterance
        print("You: ", end='', flush=True)
        utterance = str(sys.stdin.readline())
        if utterance[:-1].lower() == "h":
            print("Press H for help.")
            print("Press S to view assistant's internal state.")
            print("Press V in order to interact with the assistant with your voice.")
            continue
        if utterance[:-1].lower() == "s":
            u.internal_state()
            continue
        if utterance[:-1].lower() == "v":
            utterance = speech_to_text.process()
            print(utterance)
        else:
            utterance = utterance[:-1]

        response = u.process(utterance)
        if response:
            if response[-1] == "+":
                question, fail_response, similarity = fail_safe.answer_questions(utterance)
                coref_solver.prev.pop()
                if similarity > 0.7:
                    response = fail_response
                else:
                    response = response[:-1]
            tts = gTTS(text=response, lang='en')
            tts.save("response.mp3")
            os.system("mpg123 response.mp3 2> /dev/null")
            print("Bot: ", response)
            if response in ["Glad we talked!", "Happy to help!", "Gooodbye!"]:
                break
        else:
            question, response, similarity = fail_safe.answer_questions(utterance)
            # print(similarity)
            coref_solver.prev.pop()
            print("Bot: ", response)
            tts = gTTS(text=response, lang='en')
            tts.save("response.mp3")
            os.system("mpg123 response.mp3 2> /dev/null")
        print()