def recognize_stream(bytestream: Generator[ByteString, None, None],
                     client: speech.SpeechClient,
                     recognition_config: types.RecognitionConfig,
                     q: Queue):
    """Stream audio chunks to Google Speech and push final results onto *q*.

    Builds one StreamingRecognizeRequest per audio chunk, consumes the
    streaming responses, and enqueues the top alternative of the first
    final result found in each response.  Returns when the response
    stream is exhausted.

    :param bytestream: generator yielding raw audio chunks
    :param client: Google Cloud Speech client
    :param recognition_config: base recognition config, wrapped by
        get_streaming_recognition_config()
    :param q: queue receiving final recognition alternatives
    """
    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in bytestream)
    responses = client.streaming_recognize(
        get_streaming_recognition_config(recognition_config), requests)

    # Iterate the response stream directly instead of the original manual
    # next()/except-StopIteration loop -- the observable behavior (per-response
    # debug log, enqueueing, exhaustion log) is identical.
    for resp in responses:
        logger.debug("reading next response; resp.results is {}".format(
            resp.results))
        if not resp.results:
            continue
        final = [result for result in resp.results if result.is_final]
        if final:
            # Only the first final result's best alternative is forwarded.
            q.put(final[0].alternatives[0])

    logger.info("no more responses!")
    logger.info("exit from recognize_stream!")
    return
class GoogleAsr(Asr):
    """Google Cloud Speech streaming ASR backend."""

    NAME = 'Google Asr'
    DEPENDENCIES = {'system': [], 'pip': {'google-cloud-speech==1.3.1'}}


    def __init__(self):
        super().__init__()
        self._capableOfArbitraryCapture = True
        self._isOnlineASR = True
        self._client: Optional[SpeechClient] = None
        self._streamingConfig: Optional[
            types.StreamingRecognitionConfig] = None
        self._previousCapture = ''  # last partial transcript, used to suppress duplicate partial events


    def onStart(self):
        """Point the Google SDK at the credentials file and build the streaming config."""
        super().onStart()
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(
            Path(self.Commons.rootDir(),
                 'credentials/googlecredentials.json'))
        self._client = SpeechClient()
        # noinspection PyUnresolvedReferences
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=self.AudioServer.SAMPLERATE,
            language_code=self.LanguageManager.getLanguageAndCountryCode())
        self._streamingConfig = types.StreamingRecognitionConfig(
            config=config, interim_results=True)


    def decodeStream(self, session: DialogSession) -> Optional[ASRResult]:
        """Record session audio, stream it to Google, return the decoded result or None.

        :param session: the dialog session being decoded
        :return: ASRResult on success, None when nothing was decoded or the
            request failed
        """
        super().decodeStream(session)

        recorder = Recorder(self._timeout, session.user, session.siteId)
        self.ASRManager.addRecorder(session.siteId, recorder)
        self._recorder = recorder

        # BUG FIX: initialize so a failed request cannot leave `result`
        # unbound in the return expression below (previously a NameError).
        result = None
        with Stopwatch() as processingTime:
            with recorder as stream:
                audioStream = stream.audioStream()
                # noinspection PyUnresolvedReferences
                try:
                    requests = (types.StreamingRecognizeRequest(
                        audio_content=content) for content in audioStream)
                    responses = self._client.streaming_recognize(
                        self._streamingConfig, requests)
                    result = self._checkResponses(session, responses)
                except Exception:
                    # Narrowed from a bare `except:` so KeyboardInterrupt and
                    # SystemExit are no longer swallowed.
                    self.logWarning('Failed ASR request')

            self.end()

        return ASRResult(
            text=result[0],
            session=session,
            likelihood=result[1],
            processingTime=processingTime.time) if result else None


    def _checkResponses(self, session: DialogSession,
                        responses: Generator) -> Optional[tuple]:
        """Walk the streaming responses; fire partial-capture events, return the final transcript.

        :return: (transcript, confidence) for the first final result, else None
        """
        if responses is None:
            return None

        for response in responses:
            if not response.results:
                continue

            result = response.results[0]
            if not result.alternatives:
                continue

            if result.is_final:
                return result.alternatives[0].transcript, result.alternatives[
                    0].confidence
            elif result.alternatives[0].transcript != self._previousCapture:
                # New intermediate text: notify listeners once per change.
                self.partialTextCaptured(
                    session=session,
                    text=result.alternatives[0].transcript,
                    likelihood=result.alternatives[0].confidence,
                    seconds=0)
                self._previousCapture = result.alternatives[0].transcript

        return None
    # NOTE(review): this fragment starts inside an if/else whose condition is
    # outside the visible chunk -- presumably "if <socket output requested>:".
    # TODO confirm the enclosing condition and indentation level.
    sock.connect(address)
else:
    # No socket target configured: results are only printed locally.
    sock = None

# Build the Google Speech client and a streaming config with interim
# (partial) results enabled.
client = SpeechClient()
config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=RATE,
    language_code=args.lang_code)
streaming_config = types.StreamingRecognitionConfig(config=config,
                                                    interim_results=True)
print("%s recognition started!" % args.lang_code)

# Loop forever, restarting the stream each pass: Google hard-limits a single
# streaming request, so the generic except below swallows the limit error and
# re-opens the microphone stream.
while True:
    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        try:
            responses = client.streaming_recognize(streaming_config, requests)
            listen_print_loop(responses, print_locally=args.debug, sock=sock)
        except KeyboardInterrupt:
            # Ctrl-C is the only way out of the recognition loop.
            break
        except:
            # ignore "400 Exceeded maximum allowed stream duration of 305 seconds."
            # NOTE(review): bare except also hides unrelated failures.
            continue

if sock is not None:
    sock.close()
class GoogleAsr(Asr):
    """Google Cloud Speech streaming ASR backend with internet-loss handling."""

    NAME = 'Google Asr'
    DEPENDENCIES = {
        'system': [],
        'pip'   : {
            'google-cloud-speech==1.3.1'
        }
    }


    def __init__(self):
        super().__init__()
        self._credentialsFile = Path(self.Commons.rootDir(), 'credentials/googlecredentials.json')
        self._capableOfArbitraryCapture = True
        self._isOnlineASR = True
        self._client: Optional[SpeechClient] = None
        self._streamingConfig: Optional[types.StreamingRecognitionConfig] = None

        # Mirror the on-disk credentials into Alice's config store on first run.
        if self._credentialsFile.exists() and not self.ConfigManager.getAliceConfigByName('googleASRCredentials'):
            self.ConfigManager.updateAliceConfiguration(key='googleASRCredentials', value=self._credentialsFile.read_text(), doPreAndPostProcessing=False)

        self._internetLostFlag = Event()  # Set if internet goes down, cut the decoding
        self._lastResultCheck = 0  # The time the intermediate results were last checked. If actual time is greater than this value + 3, stop processing, internet issues
        self._previousCapture = ''  # The text that was last captured in the iteration


    def onStart(self):
        """Point the Google SDK at the credentials file and build the streaming config."""
        super().onStart()
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(self._credentialsFile)
        self._client = SpeechClient()
        # noinspection PyUnresolvedReferences
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=self.AudioServer.SAMPLERATE,
            language_code=self.LanguageManager.getLanguageAndCountryCode()
        )
        self._streamingConfig = types.StreamingRecognitionConfig(config=config, interim_results=True)


    def decodeStream(self, session: DialogSession) -> Optional[ASRResult]:
        """Record session audio, stream it to Google, return the decoded result or None.

        :param session: the dialog session being decoded
        :return: ASRResult on success, None when nothing was decoded or the
            request failed
        """
        super().decodeStream(session)

        recorder = Recorder(self._timeout, session.user, session.deviceUid)
        self.ASRManager.addRecorder(session.deviceUid, recorder)
        self._recorder = recorder

        result = None
        with Stopwatch() as processingTime:
            with recorder as stream:
                audioStream = stream.audioStream()
                # noinspection PyUnresolvedReferences
                try:
                    requests = (types.StreamingRecognizeRequest(audio_content=content) for content in audioStream)
                    responses = self._client.streaming_recognize(self._streamingConfig, requests)
                    result = self._checkResponses(session, responses)
                except Exception as e:
                    # Clear the flag so a past outage does not poison the next decode.
                    self._internetLostFlag.clear()
                    self.logWarning(f'Failed ASR request: {e}')

            self.end()

        return ASRResult(
            text=result[0],
            session=session,
            likelihood=result[1],
            processingTime=processingTime.time
        ) if result else None


    def onInternetLost(self):
        # Raised by the core when connectivity drops; checked per response below.
        self._internetLostFlag.set()


    def _checkResponses(self, session: DialogSession, responses: Generator) -> Optional[tuple]:
        """Walk the streaming responses; fire partial-capture events, return the final transcript.

        Bails out early with the last partial transcript when connectivity is
        lost or when Google stalls for more than 3 seconds on the same text.

        :return: (transcript, confidence) or None
        """
        if responses is None:
            return None

        for response in responses:
            if self._internetLostFlag.is_set():
                self.logDebug('Internet connectivity lost during ASR decoding')

                if not response.results:
                    raise Exception('Internet connectivity lost during decoding')

                # Salvage whatever partial text we already have.
                result = response.results[0]
                return result.alternatives[0].transcript, result.alternatives[0].confidence

            if not response.results:
                continue

            result = response.results[0]
            if not result.alternatives:
                continue

            if result.is_final:
                self._lastResultCheck = 0  # reset the stall timer for the next decode
                return result.alternatives[0].transcript, result.alternatives[0].confidence
            elif result.alternatives[0].transcript != self._previousCapture:
                self.partialTextCaptured(session=session, text=result.alternatives[0].transcript, likelihood=result.alternatives[0].confidence, seconds=0)
                self._previousCapture = result.alternatives[0].transcript
            else:
                # Same partial text again: Google hasn't confirmed yet.
                now = int(time())
                if self._lastResultCheck == 0:
                    # BUG FIX: arm the stall timer with *now* (was reset to 0,
                    # so the 3-second timeout below could never trigger).
                    self._lastResultCheck = now
                    continue

                if now > self._lastResultCheck + 3:
                    self.logDebug(f'Stopping process as there seems to be connectivity issues')
                    self._lastResultCheck = 0
                    return result.alternatives[0].transcript, result.alternatives[0].confidence
                # BUG FIX: do NOT refresh _lastResultCheck here -- refreshing it
                # on every identical partial kept the timeout permanently out
                # of reach.

        return None
class GoogleAsr(Asr):
    """Google Cloud Speech streaming ASR backend with slow-confirmation fallback."""

    NAME = 'Google Asr'
    DEPENDENCIES = {'system': [], 'pip': {'google-cloud-speech==1.3.1'}}


    def __init__(self):
        super().__init__()
        self._capableOfArbitraryCapture = True
        self._isOnlineASR = True
        self._client: Optional[SpeechClient] = None
        self._streamingConfig: Optional[
            types.StreamingRecognitionConfig] = None
        self._internetLostFlag = Event(
        )  # Set if internet goes down, cut the decoding
        self._lastResultCheck = 0  # The time the intermediate results were last checked. If actual time is greater than this value + 3, stop processing, internet issues
        self._previousCapture = ''  # The text that was last captured in the iteration
        self._delayedGoogleConfirmation = False  # set whether slow internet is detected or not


    def onStart(self):
        """Point the Google SDK at the credentials file and build the streaming config."""
        super().onStart()
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(
            Path(self.Commons.rootDir(),
                 'credentials/googlecredentials.json'))
        self._client = SpeechClient()
        # noinspection PyUnresolvedReferences
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=self.AudioServer.SAMPLERATE,
            language_code=self.LanguageManager.getLanguageAndCountryCode())
        self._streamingConfig = types.StreamingRecognitionConfig(
            config=config, interim_results=True)


    def decodeStream(self, session: DialogSession) -> Optional[ASRResult]:
        """Record session audio, stream it to Google, return the decoded result or None.

        :param session: the dialog session being decoded
        :return: ASRResult on success, None when nothing was decoded or the
            request failed
        """
        super().decodeStream(session)

        recorder = Recorder(self._timeout, session.user, session.siteId)
        self.ASRManager.addRecorder(session.siteId, recorder)
        self._recorder = recorder

        result = None
        with Stopwatch() as processingTime:
            with recorder as stream:
                audioStream = stream.audioStream()
                # noinspection PyUnresolvedReferences
                try:
                    requests = (types.StreamingRecognizeRequest(
                        audio_content=content) for content in audioStream)
                    responses = self._client.streaming_recognize(
                        self._streamingConfig, requests)
                    result = self._checkResponses(session, responses)
                except Exception:
                    # Narrowed from a bare `except:` so KeyboardInterrupt and
                    # SystemExit are no longer swallowed.
                    self._internetLostFlag.clear()
                    self.logWarning('Failed ASR request')

            self.end()

        return ASRResult(
            text=result[0],
            session=session,
            likelihood=result[1],
            processingTime=processingTime.time) if result else None


    def onInternetLost(self):
        # Raised by the core when connectivity drops; checked per response below.
        self._internetLostFlag.set()


    def _checkResponses(self, session: DialogSession,
                        responses: Generator) -> Optional[tuple]:
        """Walk the streaming responses; fire partial-capture events, return the final transcript.

        Bails out early with the last partial transcript when connectivity is
        lost, or when Google fails to confirm the same partial text within a
        3-second grace period.

        :return: (transcript, confidence) or None
        """
        if responses is None:
            return None

        for response in responses:
            if self._internetLostFlag.is_set():
                self.logDebug('Internet connectivity lost during ASR decoding')

                if not response.results:
                    raise Exception(
                        'Internet connectivity lost during decoding')

                # Salvage whatever partial text we already have.
                result = response.results[0]
                return result.alternatives[0].transcript, result.alternatives[
                    0].confidence

            if not response.results:
                continue

            result = response.results[0]
            if not result.alternatives:
                continue

            if result.is_final:
                self._lastResultCheck = 0
                self._delayedGoogleConfirmation = False
                # Text confirmed by Google.
                return result.alternatives[0].transcript, result.alternatives[
                    0].confidence
            elif result.alternatives[0].transcript != self._previousCapture:
                self.partialTextCaptured(
                    session=session,
                    text=result.alternatives[0].transcript,
                    likelihood=result.alternatives[0].confidence,
                    seconds=0)
                # Keep the longest partial seen so far: it approximates the
                # full utterance rather than a single word of it.
                if len(self._previousCapture) <= len(
                        result.alternatives[0].transcript):
                    self._previousCapture = result.alternatives[0].transcript
            else:
                # Same partial text again: Google hasn't confirmed yet.
                if not self._delayedGoogleConfirmation:
                    # BUG FIX: the original busy-waited here
                    # (`while now <= deadline: now = int(time())`), pegging a
                    # CPU core for 3 seconds.  sleep() yields the same grace
                    # period without the spin.
                    from time import sleep
                    sleep(3)
                    self._delayedGoogleConfirmation = True
                    # Give Google one more iteration to confirm the utterance.
                    continue

                # Still unconfirmed after the grace period: assume the captured
                # text is correct and stop.
                self.logDebug(
                    f'Stopping process as there seems to be connectivity issues'
                )
                self._lastResultCheck = 0
                self._delayedGoogleConfirmation = False
                return result.alternatives[
                    0].transcript, result.alternatives[0].confidence

        return None