from typing import Iterable, Tuple

# Assumes the pre-2.0 google-cloud-speech API surface (enums/types modules).
from google.cloud.speech_v1 import SpeechClient, enums, types


def transcribe_with_word_time_offsets(
    speech_content: bytes,
) -> Iterable[Tuple[str, float, float]]:
    """Recognize words with time offsets from a speech.

    Args:
        speech_content: Binary data of the speech.

    Yields:
        The word with start time and end time that the API recognized, e.g.:
        [
            ('여기요', 0.0, 2.0), ('저기요', 3.6, 5.4), ('저', 5.4, 9.2),
            ('밖에서', 9.2, 9.6), ('장애인', 9.6, 10.0), ('주차', 10.0, 10.3),
            ('가능', 10.3, 10.5), ('까만색', 10.5, 11.3), ('소나타', 11.3, 11.7),
            ('글', 11.7, 11.8), ('찾아요', 11.8, 12.2), ('근데요', 12.2, 13.2)
        ]

    See:
        https://cloud.google.com/speech-to-text/docs/sync-recognize
    """
    client = SpeechClient()
    audio = types.RecognitionAudio(content=speech_content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code="ko-KR",
        enable_word_time_offsets=True,
    )
    response = client.recognize(config, audio)

    for result in response.results:
        alternative = result.alternatives[0]
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            yield (
                word,
                start_time.seconds + start_time.nanos * 1e-9,
                end_time.seconds + end_time.nanos * 1e-9,
            )
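# Usage sketch for transcribe_with_word_time_offsets, assuming a local
# 44.1 kHz LINEAR16 recording; "sample.wav" is a placeholder file name.
with open("sample.wav", "rb") as audio_file:
    content = audio_file.read()

for word, start, end in transcribe_with_word_time_offsets(content):
    print(f"{word}: {start:.2f}s - {end:.2f}s")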
def get_conn(self):
    """
    Retrieves connection to Cloud Speech.

    :return: Google Cloud Speech client object.
    :rtype: google.cloud.speech_v1.SpeechClient
    """
    if not self._client:
        self._client = SpeechClient(credentials=self._get_credentials())
    return self._client
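# A hedged sketch of how get_conn is typically consumed from the same hook
# class; recognize_speech is a hypothetical wrapper, not the hook's actual API.
def recognize_speech(self, config, audio):
    # get_conn() caches the SpeechClient, so repeated calls reuse one client.
    client = self.get_conn()
    return client.recognize(config, audio)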
from unittest import mock


def _make_one():
    import google.auth.credentials
    from google.cloud.speech_v1 import SpeechClient

    credentials = mock.Mock(spec=google.auth.credentials.Credentials)
    return SpeechClient(credentials=credentials)
from unittest import mock

import google.auth.credentials
from google.cloud.speech_v1 import SpeechClient


def make_speech_client():
    credentials = mock.Mock(spec=google.auth.credentials.Credentials)
    return SpeechClient(credentials=credentials)
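# Illustrative test built on make_speech_client: the RPC method is stubbed
# out so no network call is made. The test name and stubbed return value
# are assumptions, not part of the original suite.
from google.cloud.speech_v1 import types


def test_recognize_returns_empty_response():
    client = make_speech_client()
    client.recognize = mock.Mock(return_value=types.RecognizeResponse())

    response = client.recognize(
        types.RecognitionConfig(language_code="en-US"),
        types.RecognitionAudio(content=b""),
    )

    assert list(response.results) == []
    client.recognize.assert_called_once()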
class SpeechToTextClient:
    """A speech to text client that parses microphone input into text."""

    def __init__(self, credentials_path, language_code, phrase_hints=None):
        """
        Args:
            credentials_path (str): The path to the service account private
                key json file.
            language_code (str): The language of the supplied audio as a
                BCP-47 language tag. Example: "en-US".
            phrase_hints (str[]):
                https://cloud.google.com/speech-to-text/docs/basics#phrase-hints
        """
        self.language_code = language_code
        # from_service_account_json is a classmethod, so call it on the class.
        self.client = SpeechClient.from_service_account_json(credentials_path)
        self.speech_context = [types.SpeechContext(phrases=phrase_hints or [])]
        self._mic: MicrophoneInput = None

    def start(self, callback):
        """
        Args:
            callback (function): Function that is called when text is
                transcribed from speech.
        """
        try:
            with MicrophoneInput() as mic:
                print("Starting SpeechToTextClient")
                self._mic = mic
                audio_generator = self._mic.generator()
                config = types.RecognitionConfig(
                    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                    sample_rate_hertz=self._mic.RATE,
                    language_code=self.language_code,
                    use_enhanced=True,
                    speech_contexts=self.speech_context,
                )
                streaming_config = types.StreamingRecognitionConfig(
                    config=config, interim_results=True
                )
                requests = (
                    types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator
                )
                responses = self.client.streaming_recognize(
                    streaming_config, requests
                )

                for response in responses:
                    if not response.results:
                        # No results in this response yet.
                        continue

                    # The first result is the best result.
                    result = response.results[0]
                    if not result.alternatives:
                        continue

                    transcript = result.alternatives[0].transcript.strip().lower()
                    callback((transcript, result.is_final))
        except OutOfRange:
            # The streaming API caps the length of a single stream; reopen
            # the microphone and start a new stream when the cap is hit.
            self.restart(callback)

    def stop(self):
        print("Stopping SpeechToTextClient")
        if self._mic is None or self._mic.closed:
            return
        self._mic.close()

    def restart(self, callback):
        self.stop()
        self.start(callback)

    def update_phrase_hints(self, phrase_hints):
        # Replace the speech context that start() passes to the API.
        self.speech_context = [types.SpeechContext(phrases=phrase_hints)]
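# Illustrative driver for SpeechToTextClient; the credentials path, language
# code, and phrase hints below are placeholder values, not project settings.
def print_transcript(result):
    transcript, is_final = result
    if is_final:
        print("Final:", transcript)
    else:
        print("Interim:", transcript)


if __name__ == "__main__":
    stt_client = SpeechToTextClient(
        credentials_path="service-account.json",  # placeholder path
        language_code="en-US",
        phrase_hints=["sonata", "parking"],
    )
    try:
        stt_client.start(print_transcript)  # blocks while streaming from the mic
    except KeyboardInterrupt:
        stt_client.stop()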