def process(self):
    """Audio stream recognition and result parsing."""
    # You can add speech contexts for better recognition
    cap_speech_context = types.SpeechContext(
        phrases=["Add your phrases here"])
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=self.encoding,
        sample_rate_hertz=self.rate,
        language_code=self.language,
        speech_contexts=[cap_speech_context],
        model='command_and_search')
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=False,
        single_utterance=False)
    audio_generator = self.stream_generator()
    requests = (types.StreamingRecognizeRequest(audio_content=content)
                for content in audio_generator)
    responses = client.streaming_recognize(streaming_config, requests)
    try:
        self.response_loop(responses)
    except Exception:
        # Restart recognition on any streaming error (e.g. stream timeout).
        self.start()
def run_loop(phrases):
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag
    client = speech.SpeechClient()
    speech_context = types.SpeechContext(phrases=phrases)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        speech_contexts=[speech_context])
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        while True:
            try:
                print("running a recognition...")
                audio_generator = stream.generator()
                requests = (types.StreamingRecognizeRequest(audio_content=content)
                            for content in audio_generator)
                responses = client.streaming_recognize(streaming_config, requests)
                # Now, put the transcription responses to use.
                listen_print_loop(responses)
            except grpc.RpcError:
                # Streaming sessions are time-limited; restart on timeout.
                print("timeout, restarting")
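# listen_print_loop is called by several snippets in this collection but not
# defined in them. A minimal sketch is below -- an assumption, not the
# original helper; some variants also receive the stream as a second argument.
def listen_print_loop(responses, stream=None):
    """Print the top transcript of each final streaming result."""
    for response in responses:
        for result in response.results:
            if result.is_final and result.alternatives:
                print(result.alternatives[0].transcript)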
def decode_file(file_path, client, speech_context, sample_rate,
                max_alternatives, enable_word_time_offsets):

    def recognize(chunk, file_path):
        """Subfunction that loops over audio segments to recognize speech."""
        # export as flac
        chunk.export(file_path + ".flac", format="flac", bitrate="44.1k")
        # open flac file
        with open(file_path + ".flac", 'rb') as sc:
            speech_content = sc.read()
        # initialize speech sample
        sample = types.RecognitionAudio(content=speech_content)
        # run speech decoding
        try:
            result = client.recognize(opts, sample)
        except ValueError as e:
            print(e)
            result = None
        return result

    opts = {}
    opts['encoding'] = enums.RecognitionConfig.AudioEncoding.FLAC
    opts['language_code'] = language_code  # module-level global
    opts['sample_rate_hertz'] = sample_rate
    opts['max_alternatives'] = max_alternatives
    opts['enable_word_time_offsets'] = enable_word_time_offsets
    if speech_context:
        opts['speech_contexts'] = [types.SpeechContext(phrases=speech_context)]

    # read in wav
    audio = AudioSegment.from_wav(file_path)

    # segment into 1 minute chunks
    if len(audio) > 60000:
        segments = list(range(0, len(audio), 60000))
        if segments[-1] < len(audio):
            # include the final partial segment up to the end of the clip
            segments.append(len(audio))
        print('Audio clip is longer than 1 minute. '
              'Splitting into %d one minute segments...' % (len(segments) - 1))
        audio_chunks = []
        for i in range(len(segments) - 1):
            audio_chunks.append(audio[segments[i]:segments[i + 1]])
    else:
        audio_chunks = [audio]

    # loop over audio segments
    results = []
    for idx, chunk in enumerate(audio_chunks):
        results.append(recognize(chunk, file_path + str(idx)))

    # return list of results
    return results
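# A hedged usage sketch for decode_file; the file name and phrase list are
# placeholders, and language_code is the module-level global the function
# reads.
if __name__ == '__main__':
    language_code = 'en-US'
    client = speech.SpeechClient()
    results = decode_file('meeting.wav', client,
                          speech_context=['agenda', 'action items'],
                          sample_rate=44100, max_alternatives=1,
                          enable_word_time_offsets=True)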
def transcribe_streaming(stream_file):
    client = speech.SpeechClient()

    with open('data/short_glossary.txt', 'r') as short_glossary_file:
        short_glossary = [line.strip() for line in short_glossary_file]
    speech_context = types.SpeechContext(phrases=short_glossary)

    stream = AudioIterable(stream_file, 32 * 1024, 16000)
    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in stream)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-GB',
        speech_contexts=[speech_context])
    streaming_config = types.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    # [START migration_streaming_response]
    responses = client.streaming_recognize(streaming_config, requests)

    for response in responses:
        for result in response.results:
            print('Finished: {}'.format(result.is_final))
            print('Stability: {}'.format(result.stability))
            for alternative in result.alternatives:
                print('Confidence: {}'.format(alternative.confidence))
                print('Transcript: {}'.format(alternative.transcript))
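# AudioIterable is used above but not defined in the snippet. A minimal
# sketch with the same constructor shape (path, chunk size, sample rate)
# follows; the real class may differ.
class AudioIterable(object):
    def __init__(self, path, chunk_size, sample_rate):
        self.path = path
        self.chunk_size = chunk_size
        self.sample_rate = sample_rate

    def __iter__(self):
        # Yield fixed-size raw byte chunks until the file is exhausted.
        with open(self.path, 'rb') as f:
            while True:
                chunk = f.read(self.chunk_size)
                if not chunk:
                    return
                yield chunk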
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    import io
    import os

    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_automatic_punctuation=True,
        use_enhanced=True,
        model='phone_call',
        speech_contexts=[types.SpeechContext(phrases=['we are releasing'])])

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=900)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    output_file_name = os.path.basename(gcs_uri) + '.txt'
    with io.open(output_file_name, 'w', encoding='utf8') as output_file:
        for result in response.results:
            # The first alternative is the most likely one for this portion.
            alternative = result.alternatives[0]
            output_file.write(alternative.transcript.strip())
            output_file.write(' Confidence: {}\n'.format(alternative.confidence))
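# Hedged usage sketch: transcribe_gcs takes a Cloud Storage URI; the bucket
# and object below are placeholders.
if __name__ == '__main__':
    transcribe_gcs('gs://my-bucket/recordings/call.wav')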
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        enable_word_time_offsets=True,
        language_code=language_code,
        speech_contexts=[types.SpeechContext(phrases=[
            'order probes', 'syntagmatic probes', 'sp probes',
            "activations during", "jamie don't show", 'jamie show',
            'order echoes', 'syntagmatic echoes'])])
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses, stream)
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        speech_contexts=[types.SpeechContext(phrases=ava)])
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)
        while True:
            try:
                # Now, put the transcription responses to use.
                listen_print_loop(responses)
            except Exception as e:
                print("Restarting")
                print(e)
                main()
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=44100,
        language_code='en-US',
        # enable_automatic_punctuation=True,
        # enable_speaker_diarization=True,
        # diarization_speaker_count=2,
        speech_contexts=[types.SpeechContext(phrases=phrase_list)])

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=10000)

    # Print the most likely transcript for each portion of the audio.
    for result in response.results:
        print(f"{result.alternatives[0].transcript}")
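# phrase_list above is a module-level global not shown in the snippet; a
# hypothetical example of its shape:
# phrase_list = ['quarterly results', 'cash flow', 'guidance']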
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-GB'  # a BCP-47 language tag

    key = os.path.join(os.path.dirname(__file__), 'creds.json')
    credentials = service_account.Credentials.from_service_account_file(key)
    client = speech.SpeechClient(credentials=credentials)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        # max_alternatives=5,
        speech_contexts=[
            types.SpeechContext(phrases=[
                "poo", "f**k", "f*****g", "arse", "bollocks", "s***e",
                "innovation"
            ])
        ])
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        try:
            listen_print_loop(responses)
        except exceptions.OutOfRange:
            # The stream hit its duration limit; start a fresh session.
            main()
def voice_stream_to_text():
    language_code = 'en-US'  # a BCP-47 language tag
    contexts = types.SpeechContext(phrases=terminologies)

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        speech_contexts=[contexts])
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        # listen_print_loop(responses)
        with TextStream() as ts:
            text_generator = ts.generator(responses)
            for text in text_generator:
                yield text
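# Hedged usage sketch: voice_stream_to_text is a generator, so transcripts can
# be consumed as they arrive (terminologies, RATE, CHUNK, MicrophoneStream and
# TextStream come from the surrounding module).
if __name__ == '__main__':
    for text in voice_stream_to_text():
        print(text)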
def _start_google_stream(self):
    self._logger.info("[gstar] Start streaming to Google")
    # Configure Google speech recognition
    self._google_client = speech.SpeechClient()
    self._logger.info("[gstar] Got Google client")
    contexts = [types.SpeechContext(phrases=[])]
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=self._google_rate,
        language_code="en-US",
        max_alternatives=1,
        profanity_filter=False,
        speech_contexts=contexts,
        enable_word_time_offsets=False)
    self._google_recognition_config = types.StreamingRecognitionConfig(
        config=config, single_utterance=False, interim_results=False)
    self._logger.info("[gstar] Google configuration ready")
    # Wrap each raw audio buffer in a StreamingRecognizeRequest, as the
    # streaming API expects.
    source_audio = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in self._generate_next_buffer())
    self._logger.info("[gstar] source list ready")
    self._google_response_iterator = self._google_client.streaming_recognize(
        self._google_recognition_config, source_audio)
    self._logger.info("[gstar] Streaming started!")
    # Begin consuming recognition responses.
    self._process_next_response()
def run(self):
    # main 'listen and recognition' function
    try:
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=self.RATE,
            language_code=self.lang.lang,
            speech_contexts=[types.SpeechContext(phrases=self.lang.phrases)])
        streaming_config = types.StreamingRecognitionConfig(
            config=config, interim_results=True)
        with MicrophoneStream(self.RATE, self.CHUNK, self.device) as self.stream:
            audio_generator = self.stream.generator()
            requests = (types.StreamingRecognizeRequest(audio_content=content)
                        for content in audio_generator)
            responses = self.client.streaming_recognize(
                streaming_config, requests)
            # Now, put the transcription responses to use.
            self.tts.speak(self.lang.start_phrase)
            self.listen_loop(responses)
    except OutOfRange:
        print("Stream restart")
        self.stream.stop()
def transcribe_file_ret(speech_file):
    """Transcribe the given audio file."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    # [START migration_sync_request]
    # [START migration_audio_config_file]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code='en-US',
        # use_enhanced=True,
        # model='phone_call',
        speech_contexts=[types.SpeechContext(phrases=phrases)])
    # [END migration_audio_config_file]

    # [START migration_sync_response]
    response = client.recognize(config, audio)
    # [END migration_sync_request]

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    return response
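# Hedged sketch of consuming the response returned by transcribe_file_ret;
# each result covers a consecutive portion of the audio.
def print_transcripts(response):
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(result.alternatives[0].transcript)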
def callback_recognize(self, req):
    # init google speech client
    self.client = speech.SpeechClient()
    self.stream.start_stream()

    print("options:", len(req.options), req.options)
    print("language:", req.language)
    print("timeout:", str(req.timeout))

    speech_context = None
    answer_context = []
    for option in req.options:
        if option.strip():
            answer_context.append(option.lower().strip()
                                  if '$' not in option else option.strip())

    if answer_context:
        speech_context = types.SpeechContext(phrases=answer_context)
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code=(str(req.language.strip())
                           if req.language.strip() else "en-US"),
            speech_contexts=[speech_context])
    else:
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code=(str(req.language.strip())
                           if req.language.strip() else "en-US"))

    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True, single_utterance=True)

    with MicrophoneStream(self.stream_buff) as mic:
        audio_generator = mic.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        try:
            # print(requests)
            responses = self.client.streaming_recognize(
                streaming_config, requests,
                timeout=(req.timeout if (req.timeout != 0) else 30))
            # print('responses', responses)
            output = self.validate_response(responses, answer_context)
        except gexcp.DeadlineExceeded:
            output = "#TIMEOUT#"
            print("#TIMEOUT#")

    self.stream.stop_stream()
    print("Detected [%s]" % output)
    return QTrobotGspeechResponse(output)
def __init__(self, loop, credential='google.json', rate=RATE,
             language_code='en-US', name='Zhaoyuan'):
    self.name = name
    self.rate = rate
    self.credential = credential
    self.loop = loop

    self.keywords = []
    for p in self.patterns:
        self.keywords.extend(p["input"])
    print("keywords are {}".format(self.keywords))

    self.audio_queue = asyncio.Queue(maxsize=10)
    audio = pyaudio.PyAudio()
    self.record_stream = audio.open(
        format=pyaudio.paInt16, channels=1, rate=rate, input=True,
        start=False, stream_callback=self.record_callback)
    self.play_stream = audio.open(
        format=pyaudio.paInt16, channels=1, rate=rate, start=False,
        output=True)

    os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', credential)
    self.speech_client = speech.SpeechClient()
    self.speech_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=rate,
        language_code=language_code,
        speech_contexts=[types.SpeechContext(phrases=self.keywords)])

    self.tts_client = texttospeech.TextToSpeechClient()
    self.voice = texttospeech.types.VoiceSelectionParams(
        language_code='en-US',
        ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE)
    self.audio_config = texttospeech.types.AudioConfig(
        audio_encoding=texttospeech.enums.AudioEncoding.LINEAR16,
        sample_rate_hertz=rate)
    print('assistant ok')
def get_transcripts():
    # Imports the Google Cloud client library
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    # Instantiates a client
    client = speech.SpeechClient()

    # Transcribe audio files
    responses = dict()
    for root, subFolders, files in os.walk("../audio_input/"):
        for file_name in files:
            file_path = root + "/" + file_name
            print(file_path)

            # Loads the audio into memory
            with io.open(file_path, 'rb') as audio_file:
                content = audio_file.read()
                audio = types.RecognitionAudio(content=content)

            config = types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
                sample_rate_hertz=16000,
                language_code='en-US',
                enable_word_time_offsets=True,
                speech_contexts=[
                    types.SpeechContext(phrases=[
                        "paul", "estuardo", "piyush", "madison", "mostafa",
                        "momotaz", "katie", "goodbye", "hello", "say",
                        "great", "job", "good", "bye", "say hello",
                        "great job", "great job good bye", "great job goodbye"
                    ])
                ])

            # Detects speech in the audio file
            responses[file_path] = client.long_running_recognize(config, audio)

    for file_name, future in responses.items():
        out_name = file_name.replace("input", "output")
        response = future.result(timeout=300)
        with open(out_name, "w") as out_file:
            out_file.write(file_name + "\n" + str(response))
        print(out_name)
def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    client = speech.SpeechClient()

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    datalist = []
    with io.open('./invoice/data.csv') as data_file:
        reader = csv.reader(data_file)
        for row in reader:
            datalist.append(row[0])

    audio = types.RecognitionAudio(content=content)
    print('Recognizing...')
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='ja-JP',
        speech_contexts=[types.SpeechContext(phrases=datalist)])

    response = client.recognize(config, audio)
    print('Finished Recognizing')

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    results = {}
    n = 0
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        number = 'k' + str(n)
        # "認識結果" means "recognition result".
        results[number] = u'認識結果: {}'.format(result.alternatives[0].transcript)
        n += 1
    return results
import logging
import time

import rospy
from std_msgs.msg import Bool, Empty, Float32

# Speech and Qt imports (assumed; not shown in the original snippet).
from google.cloud import speech
from google.cloud.speech import enums, types
from PyQt5.QtCore import QObject, pyqtSignal
# [END import_libraries]

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms
DEADLINE_SECS = 60
WRAP_IT_UP_SECS = 15
SECS_OVERLAP = 1

# list of commands for dVRK
SPEECH_CONTEXT = types.SpeechContext(
    phrases=["home", "teleop", "scale", "enable", "set"])


class VoiceRecognizer(QObject):
    # Qt signals must be declared as class attributes, not created in __init__.
    confirm_signal = pyqtSignal()

    def __init__(self, parent=None):
        super(VoiceRecognizer, self).__init__(parent)
        language_code = 'en-US'  # a BCP-47 language tag
        self.client = speech.SpeechClient()
        self.config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=RATE,
            language_code=language_code,
            speech_contexts=[SPEECH_CONTEXT])
        self.streaming_config = types.StreamingRecognitionConfig(
            config=self.config, interim_results=True)  # completion assumed
def main():
    mqtt_client.on_connect = on_connect
    mqtt_client.on_message = on_message
    mqtt_client.connect(MQTT_BROKER_IP, MQTT_BROKER_PORT)
    mqtt_client.loop_start()
    logger.info("Starting Speech-to-Text")

    language_code = 'en-GB'  # a BCP-47 language tag
    client = speech.SpeechClient()
    logger.info("Google Speech-to-Text client setup")
    mic_manager = ResumableMicrophoneStream(RATE, int(RATE / 10))
    logger.info("Mic manager setup")

    with mic_manager as stream:
        resume = False
        global PHRASES
        global RECOGNIZE
        while True:
            if not RECOGNIZE:
                time.sleep(1)
            else:
                audio_generator = stream.generator(resume=resume)
                requests = (types.StreamingRecognizeRequest(audio_content=content)
                            for content in audio_generator)
                config = types.RecognitionConfig(
                    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                    sample_rate_hertz=RATE,
                    language_code=language_code,
                    max_alternatives=1,
                    enable_word_time_offsets=True,
                    speech_contexts=[types.SpeechContext(phrases=PHRASES)])
                streaming_config = types.StreamingRecognitionConfig(
                    config=config, interim_results=True)
                responses = client.streaming_recognize(streaming_config,
                                                       requests)
                try:
                    listen_print_loop(responses, stream)
                    break
                except grpc.RpcError as e:
                    if e.code() not in (grpc.StatusCode.INVALID_ARGUMENT,
                                        grpc.StatusCode.OUT_OF_RANGE):
                        raise
                    details = e.details()
                    if e.code() == grpc.StatusCode.INVALID_ARGUMENT:
                        if 'deadline too short' not in details:
                            logger.error(details)
                            raise
                    else:
                        if 'maximum allowed stream duration' not in details:
                            logger.error(details)
                            raise
                    logger.info('Resuming...')
                    resume = True
                except Exception as e:
                    logger.info(e)
                    mqtt_client.publish('system/error')
def _get_speech_context(self):
    """Return a SpeechContext instance to bias recognition towards
    certain phrases.
    """
    return types.SpeechContext(phrases=self._phrases)
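# Hypothetical sketch of where the returned SpeechContext is consumed; the
# method name and parameters below are illustrative, not from the original.
def _build_config(self, rate=16000, language_code='en-US'):
    return types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=rate,
        language_code=language_code,
        speech_contexts=[self._get_speech_context()])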
def transcribe_file(filepathURI, frame_id, user_id):
    speech_file = filepathURI
    client = speech.SpeechClient()
    # [START migration_async_request]
    resultsJSONList = []
    (soundFiles, needsSlice, sample_rate) = convertToL16(speech_file, frame_id)
    mark_time_offset_counter = 0
    for convertedFilePath in soundFiles:
        try:
            with io.open(convertedFilePath, 'rb') as audio_file:
                content = audio_file.read()
            print(":::::::::::::: AUDIO SLICE " +
                  str(mark_time_offset_counter / (len(content) / sample_rate)) +
                  ": " + str(len(content)) + " @ " + str(sample_rate) + " sp/s")
            audio = types.RecognitionAudio(content=content)
            config = types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=sample_rate,
                language_code='en-US',
                enable_word_time_offsets=True,
                speech_contexts=[
                    types.SpeechContext(phrases=[
                        "token", "hype", "coin", "hype coin", "wallet",
                        "crypto"
                    ])
                ])
            # [START migration_async_response]
            operation = client.long_running_recognize(config, audio)
            # [END migration_async_request]
            print('Waiting for operation to complete...')
            result = operation.result(timeout=1000)

            transcript_cat = ""
            transcript_arr = []
            word_mark_cat = []
            for res in result.results:
                for alternative in res.alternatives:
                    for word_info in alternative.words:
                        if word_info.word.lower() in kSTOPWORDS_LIST:
                            continue
                        start_time_DB_format = "{}".format(
                            mark_time_offset_counter +
                            word_info.start_time.seconds)
                        word_mark = {
                            "word": word_info.word,
                            "start": start_time_DB_format,
                            "frameid": frame_id,
                            "userid": user_id
                        }
                        word_mark_cat.append(word_mark)
                        storeToTermMapSQL(word_info.word, start_time_DB_format,
                                          frame_id, user_id)
                    transcript_cat = transcript_cat + '{}'.format(
                        alternative.transcript) + "\n"
                    transcript_arr.append(alternative.transcript)

            scopeJSON = {
                "transcript": transcript_cat,
                "transcript_arr": transcript_arr,
                "confidence": "100",
                "words": word_mark_cat
            }
            resultsJSONList.append(scopeJSON)
            mark_time_offset_counter = (mark_time_offset_counter +
                                        kAUDIO_TIME_SLICE_WINDOW)
        except Exception:
            print("Google Speech recognition request failed")

    audioJSON = {"audio": resultsJSONList}
    spitJSONAPIResulttoMDB(audioJSON, "audio_speech_google", frame_id, user_id)