def run(self): """Called from [start]. Connects to service and begins streaming.""" # Exit if stop event occurred. if self._stop_event.is_set(): return # Create SSL channel. channel = self._create_channel() self.is_started = True # Open stream service = cloud_speech.SpeechClient(channel) streaming_config = types.StreamingRecognitionConfig( config=types.RecognitionConfig( enable_automatic_punctuation=self.punctuation, encoding=self.encoding, sample_rate_hertz=self.rate, language_code=self.language,), interim_results=self.interim_results) try: request_stream = self._request_stream() resp_stream = service.streaming_recognize( streaming_config, request_stream) self._handle_results(resp_stream) finally: self.stop()
def get_client(lang='en-US', sample_rate=16000, interim_results=False,
               single_utterance=True, phrase_key=""):
    """Helper to return the client and streaming config."""
    client = SpeechClient()
    config = types.StreamingRecognitionConfig(
        config=types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sample_rate,
            language_code=lang,
            # Enhanced models are only available to projects that
            # opt in for audio data collection.
            use_enhanced=True,
            # A model must be specified to use an enhanced model.
            model="command_and_search",
            speech_contexts=[
                types.SpeechContext(phrases=PhraseGenerator.get_phrases(
                    "app/config.json", phrase_key))
            ]),
        interim_results=interim_results,
        single_utterance=single_utterance)
    print(str(config))
    return client, config
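# get_client() depends on PhraseGenerator.get_phrases(path, key), which is
# not shown. A minimal sketch, assuming the JSON config maps each phrase key
# to a list of hint phrases (the file layout is an assumption):
import json

class PhraseGenerator:
    @staticmethod
    def get_phrases(config_path, phrase_key):
        """Return the list of phrase hints stored under phrase_key."""
        with open(config_path) as f:
            data = json.load(f)
        return data.get(phrase_key, [])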
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag

    # If enabled, each word in the first alternative of each result will be
    # tagged with a speaker tag to identify the speaker.
    enable_speaker_diarization = True

    # Optional. Specifies the estimated number of speakers in the conversation.
    # diarization_speaker_count = 2

    client = speech_v1p1beta1.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_speaker_diarization=enable_speaker_diarization)
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)
        # Now, put the transcription responses to use.
        listen_print_loop(responses)
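# main() hands the response iterator to listen_print_loop(), which comes from
# Google's streaming microphone sample. A simplified sketch that prints only
# final transcripts (the full sample also rewrites interim results in place,
# and the variant used by audio_main() below takes the stream as a second
# argument):
def listen_print_loop(responses):
    for response in responses:
        if not response.results:
            continue
        result = response.results[0]
        if not result.alternatives:
            continue
        transcript = result.alternatives[0].transcript
        if result.is_final:
            print(transcript)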
def _STT_stream(audio_file, **kwargs):
    print("_STT_stream: Executing streaming_recognize API on audio_file {}".
          format(audio_file))
    client = speech_v1p1beta1.SpeechClient()
    # with io.open(audio_file, 'rb') as f:
    #     content = f.read()

    # config is taken directly from kwargs, which must contain valid
    # RecognitionConfig fields.
    config = kwargs
    streaming_config = types.StreamingRecognitionConfig(config=config)
    transcript = ''

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = stream_feed(audio_file)
    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in stream)

    # streaming_recognize returns a generator.
    # [START speech_python_migration_streaming_response]
    responses = client.streaming_recognize(streaming_config, requests)
    # [END speech_python_migration_streaming_request]

    for response in responses:
        # Once the transcription has settled, the first result will contain
        # the is_final result. The other results will be for subsequent
        # portions of the audio.
        for result in response.results:
            alternatives = result.alternatives
            for alternative in alternatives:
                transcript += alternative.transcript
    # [END speech_python_migration_streaming_response]
    # [END speech_transcribe_streaming]
    return transcript
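# _STT_stream() relies on a stream_feed(audio_file) generator that is not
# shown. A minimal sketch that yields fixed-size chunks from the file (the
# 32 KB chunk size is an assumption):
def stream_feed(audio_file, chunk_size=32 * 1024):
    """Yield raw audio chunks suitable for StreamingRecognizeRequest."""
    with io.open(audio_file, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                return
            yield chunk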
def microphone_streaming_start(wf, output_stream):
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'ko-KR'

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True)
        # enable_speaker_diarization=True,
        # diarization_speaker_count=3)
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK, wf, output_stream) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)
        # Now, put the transcription responses to use.
        listen_print_loop(responses)
def audio_main():
    f = open(u"Nao_log.txt", u"a")
    f.write(
        u'##**************************** Audio Log File (Group 1) *********************************##'
    )
    f.close()

    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = u'en-US'  # a BCP-47 language tag

    # If enabled, each word in the first alternative of each result will be
    # tagged with a speaker tag to identify the speaker.
    enable_speaker_diarization = True

    # Optional. Specifies the estimated number of speakers in the conversation.
    # diarization_speaker_count = 2

    client = speech_v1p1beta1.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_speaker_diarization=enable_speaker_diarization)
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        while not stream.closed:
            sys.stdout.write(YELLOW)
            sys.stdout.write(u'\n' +
                             unicode(STREAMING_LIMIT * stream.restart_counter) +
                             u': NEW REQUEST\n')
            stream.audio_input = []
            audio_generator = stream.generator()
            requests = (types.StreamingRecognizeRequest(audio_content=content)
                        for content in audio_generator)
            responses = client.streaming_recognize(streaming_config, requests)
            # Now, put the transcription responses to use.
            listen_print_loop(responses, stream)

            if stream.result_end_time > 0:
                stream.final_request_end_time = stream.is_final_end_time
            stream.result_end_time = 0
            stream.last_audio_input = stream.audio_input
            stream.audio_input = []
            stream.restart_counter = stream.restart_counter + 1

            if not stream.last_transcript_was_final:
                sys.stdout.write(u'\n')
            stream.new_stream = True
def sub_main(profanityFilterBool):
    """
    *** Code taken from the Google Cloud Speech-to-Text documentation ***
    Turns on the profanity filter so bad words are censored and not printed.
    """
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag

    sp_c_cico = {  # speech_contexts_cico
        "phrases": ["Hey cico", "Hey Kiko"],
        "boost": 30.0
    }
    sp_c_kiko = {
        "phrases": ["cico", "Cico", "kiko", "Kiko", "kygo", "Kitty, girl"],
        "boost": 0
    }
    movement_words = {
        "phrases": ["move", "feet", "forward", "right", "left", "backward",
                    "degrees", "radians", "to the left", "to the right"],
        "boost": 20.0
    }
    numbers = {
        "phrases": ["one", "two", "three", "four", "five", "six", "seven",
                    "eight", "nine", "ten"],
        "boost": 5.0
    }
    relevant_words = {
        "phrases": ["cornell cup robotics", "and", "pick up", "grab"],
        "boost": 10.0
    }
    speech_contexts = [sp_c_cico, sp_c_kiko, movement_words, relevant_words]

    client = speech_v1p1beta1.SpeechClient()
    # print(help(types.RecognitionConfig))
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        profanity_filter=profanityFilterBool,
        enable_automatic_punctuation=True,
        speech_contexts=speech_contexts)
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)
        # Now, put the transcription responses to use.
        solution = returnResponseString(responses)  # solution is the result
        append_to_file("log.txt", str(solution))
        return solution
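# sub_main() forwards the responses to returnResponseString(), which is not
# shown. A minimal sketch, assuming it returns the first final transcript
# (the exact behavior is an assumption):
def returnResponseString(responses):
    for response in responses:
        for result in response.results:
            if result.is_final and result.alternatives:
                return result.alternatives[0].transcript
    return ""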
def transcribe_streaming(stream_file, encoding="LINEAR16", sample_rate=16000):
    client = speech.SpeechClient()

    with io.open(stream_file, 'rb') as audio_file:
        content = audio_file.read()

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = [content]
    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in stream)
    config = types.RecognitionConfig(
        encoding=ENCODINGS[encoding],
        sample_rate_hertz=sample_rate,
        language_code='ko-KR',
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        # Diarization is not supported for Korean (every speaker_tag is
        # classified as the same speaker).
        enable_speaker_diarization=True,
        diarization_speaker_count=3)
    streaming_config = types.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    responses = client.streaming_recognize(streaming_config, requests)

    words_with_tags = []
    transcripts = []
    print("Waiting for transcribe...")
    for response in responses:
        for result in response.results:
            alternatives = result.alternatives
            for alternative in alternatives:
                print(u'Transcript: {}'.format(alternative.transcript))
                # Saved so the sentence with punctuation can be used later.
                transcripts.append(alternative.transcript)
                for words in alternative.words:
                    word = words.word
                    start_time = round(
                        words.start_time.seconds + words.start_time.nanos * 1e-9, 3)
                    end_time = round(
                        words.end_time.seconds + words.end_time.nanos * 1e-9, 3)
                    speaker_tag = words.speaker_tag
                    # [word, start_time, end_time, speaker_tag]
                    words_with_tags.append(
                        [word, start_time, end_time, speaker_tag])
    print()  # newline
    return words_with_tags, transcripts
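# transcribe_streaming() looks up its encoding in an ENCODINGS mapping defined
# elsewhere. A plausible sketch, assuming it maps string names to AudioEncoding
# enum values (note that MP3 is only available in the v1p1beta1 enums):
ENCODINGS = {
    "LINEAR16": enums.RecognitionConfig.AudioEncoding.LINEAR16,
    "FLAC": enums.RecognitionConfig.AudioEncoding.FLAC,
    "MP3": enums.RecognitionConfig.AudioEncoding.MP3,
    "OGG_OPUS": enums.RecognitionConfig.AudioEncoding.OGG_OPUS,
}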
def process(self, loop):
    """Audio stream recognition and result parsing."""
    # You can add speech contexts for better recognition.
    cap_speech_context = types.SpeechContext(**self.context)
    metadata = types.RecognitionMetadata(**self.metadata)
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=self.encoding,
        sample_rate_hertz=self.rate,
        language_code=self.language,
        speech_contexts=[cap_speech_context],
        enable_automatic_punctuation=True,
        model=self.model,
        metadata=metadata)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=self.interim_results,
        single_utterance=self.single_utterance)
    audio_generator = self.stream_generator()
    requests = iter(
        types.StreamingRecognizeRequest(audio_content=content)
        for content in audio_generator)
    responses = client.streaming_recognize(streaming_config, requests)
    try:
        for response in responses:
            if self.terminated:
                break
            if not response.results:
                continue
            result = response.results[0]
            if not result.alternatives:
                continue
            speechData = MessageToDict(response)
            global_async_worker.add_task(self.async_callback(speechData))
            # debug
            transcript = result.alternatives[0].transcript
            print('>>', transcript, "(OK)" if result.is_final else "")
    except Exception as e:
        print('process excepted', e)
        self.start()
def gspeech_client(self):
    """Creates the Google Speech API client, configures it, and
    sends/gets audio/text data for parsing.
    """
    language_code = 'en-US'
    # Hints for the API.
    context = types.SpeechContext(phrases=self.context)
    client = speech.SpeechClient()
    # Create a metadata object; it helps processing.
    metadata = types.RecognitionMetadata()
    # Interaction Type:
    #     VOICE_SEARCH: Transcribe spoken questions and queries into text.
    #     VOICE_COMMAND: Transcribe voice commands, such as for controlling a device.
    metadata.interaction_type = (
        enums.RecognitionMetadata.InteractionType.VOICE_COMMAND)
    # Microphone Distance:
    #     NEARFIELD: The audio was captured from a closely placed microphone.
    #     MIDFIELD: The speaker is within 3 meters of the microphone.
    #     FARFIELD: The speaker is more than 3 meters away from the microphone.
    metadata.microphone_distance = (
        enums.RecognitionMetadata.MicrophoneDistance.MIDFIELD)
    # Device Type:
    #     PC: Speech was recorded using a personal computer or tablet.
    #     VEHICLE: Speech was recorded in a vehicle.
    #     OTHER_OUTDOOR_DEVICE: Speech was recorded outdoors.
    #     OTHER_INDOOR_DEVICE: Speech was recorded indoors.
    metadata.recording_device_type = (
        enums.RecognitionMetadata.RecordingDeviceType.PC)
    # Media Type:
    #     AUDIO: The speech data is an audio recording.
    #     VIDEO: The speech data was originally recorded on a video.
    metadata.original_media_type = (
        enums.RecognitionMetadata.OriginalMediaType.AUDIO)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code=language_code,
        speech_contexts=[context],
        use_enhanced=True,
        model='command_and_search',
        metadata=metadata)
    streaming_config = types.StreamingRecognitionConfig(
        config=config, single_utterance=False, interim_results=False)
    # Request generator pattern from the Google Speech Python docs.
    requests = (types.StreamingRecognizeRequest(audio_content=content)
                for content in self.generator())
    responses = client.streaming_recognize(streaming_config, requests)
    self._listen_print_loop(responses)
def __init__(self):
    self.client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_time_offsets=True,
        model='video',
        diarization_speaker_count=2,
        enable_automatic_punctuation=True,
        use_enhanced=True,
        enable_speaker_diarization=True,
        speech_contexts=[speech.types.SpeechContext(phrases=[])])
    self.streaming_config = types.StreamingRecognitionConfig(config=config)
def listen(self, language_code='ja-JP'):
    """Listen."""
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=self.rate,
        model=None,
        speech_contexts=[types.SpeechContext()],
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        single_utterance=True,
        interim_results=True)
    self.callbacks.get("ready", lambda: True)()
    with MicrophoneStream(self.rate, int(self.rate / 10)) as stream:
        self.callbacks.get("start", lambda: True)()
        while True:
            try:
                audio_generator = stream.generator()
                requests = (types.StreamingRecognizeRequest(audio_content=content)
                            for content in audio_generator)
                responses = client.streaming_recognize(streaming_config, requests)
                self.listen_print_loop(responses)
            except exceptions.OutOfRange:
                print("Time exceeded. (OutOfRange)")
            except exceptions.ServiceUnavailable:
                print("Connection closed. (ServiceUnavailable)")
            except KeyboardInterrupt:
                print("KeyboardInterrupt.")
                break
            except:
                print("Unexpected error:", sys.exc_info()[0])
                raise
        self.callbacks.get("end", lambda: True)()
def __init__(self, speakers, speaker_count, sample_rate, chunk,
             language_code, exit_command):
    self.speakers = speakers
    self.speaker_count = speaker_count
    self.sample_rate = sample_rate
    self.chunk = chunk
    self.language_code = language_code
    self.exit_command = exit_command
    self.client = speech.SpeechClient()
    self.recognition_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=self.sample_rate,
        language_code=self.language_code,
        enable_speaker_diarization=True,
        diarization_speaker_count=self.speaker_count)
    self.streaming_config = types.StreamingRecognitionConfig(
        config=self.recognition_config, interim_results=True)
def request_command():
    language_code = 'en-US'
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        speech_contexts=[{
            "phrases": recommeneded_pharses,
            "boost": boost,
        }],
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)
        for response in responses:
            if not response.results:
                continue
            result = response.results[0]
            if not result.alternatives:
                continue
            # Inspect the transcription of the top alternative.
            transcript = result.alternatives[0].transcript
            if result.is_final:
                print(transcript)
                match = re.search(command_regex, transcript)
                if match:
                    player_command = PlayerCommand(
                        match.group(1), match.group(3), match.group(4))
                    return player_command
    return None
def transcribe_streaming(stream_file):
    """Streams transcription of the given audio file."""
    import io
    from google.cloud import speech_v1p1beta1
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech_v1p1beta1.SpeechClient()

    # [START speech_python_migration_streaming_request]
    with io.open(stream_file, 'rb') as audio_file:
        content = audio_file.read()

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = [content]
    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in stream)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=16000,
        language_code='en-US')
    streaming_config = types.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    # [START speech_python_migration_streaming_response]
    responses = client.streaming_recognize(streaming_config, requests)
    # [END speech_python_migration_streaming_request]

    for response in responses:
        # Once the transcription has settled, the first result will contain
        # the is_final result. The other results will be for subsequent
        # portions of the audio.
        for result in response.results:
            print('Finished: {}'.format(result.is_final))
            print('Stability: {}'.format(result.stability))
            alternatives = result.alternatives
            # The alternatives are ordered from most likely to least.
            for alternative in alternatives:
                print('Confidence: {}'.format(alternative.confidence))
                print(u'Transcript: {}'.format(alternative.transcript))
def main():
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=LANGUAGE_CODE,
        enable_speaker_diarization=True,
        diarization_speaker_count=SPEAKER_COUNT)
    # interim_results=True indicates that this stream request should return
    # temporary results that may be refined at a later time (after processing
    # more audio). Interim results are flagged in responses by setting
    # is_final to False.
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)
        # Now, put the transcription responses to use.
        listen_print_loop(responses)
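# Every microphone snippet above enters a MicrophoneStream context manager
# taken from Google's streaming sample. A condensed sketch built on PyAudio,
# buffering microphone audio through a thread-safe queue (the extra
# constructor arguments some snippets pass, such as wf/output_stream or the
# restart bookkeeping fields, are omitted here):
import queue

import pyaudio

class MicrophoneStream:
    """Opens a recording stream as a generator yielding audio chunks."""

    def __init__(self, rate, chunk):
        self._rate = rate
        self._chunk = chunk
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16, channels=1, rate=self._rate,
            input=True, frames_per_buffer=self._chunk,
            stream_callback=self._fill_buffer)
        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        self._buff.put(None)  # Unblock the generator so it can terminate.
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        """PyAudio callback: push captured audio into the buffer."""
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self):
        while not self.closed:
            chunk = self._buff.get()
            if chunk is None:
                return
            yield chunk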