from typing import Iterable, Tuple

from google.cloud.speech_v1 import SpeechClient, enums, types


def transcribe_with_word_time_offsets(
    speech_content: bytes,
) -> Iterable[Tuple[str, float, float]]:
    """Recognize words with time offsets from a speech.

    Args:
        speech_content: Binary data of the speech.

    Yields:
        Tuples of (word, start_time, end_time) recognized by the API,
        with times in seconds, e.g.:

            [
                ('여기요', 0.0, 2.0),
                ('저기요', 3.6, 5.4),
                ('저', 5.4, 9.2),
                ('밖에서', 9.2, 9.6),
                ('장애인', 9.6, 10.0),
                ('주차', 10.0, 10.3),
                ('가능', 10.3, 10.5),
                ('까만색', 10.5, 11.3),
                ('소나타', 11.3, 11.7),
                ('글', 11.7, 11.8),
                ('찾아요', 11.8, 12.2),
                ('근데요', 12.2, 13.2)
            ]

    See:
        https://cloud.google.com/speech-to-text/docs/sync-recognize

    """
    client = SpeechClient()

    audio = types.RecognitionAudio(content=speech_content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code="ko-KR",
        enable_word_time_offsets=True,
    )

    response = client.recognize(config, audio)

    for result in response.results:
        # The first alternative is the most likely transcription.
        alternative = result.alternatives[0]

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            # start_time/end_time are protobuf Durations; fold
            # seconds + nanos into a single float of seconds.
            yield (
                word,
                start_time.seconds + start_time.nanos * 1e-9,
                end_time.seconds + end_time.nanos * 1e-9,
            )
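A quick usage sketch for this function (the WAV file name is a placeholder; the audio must match the config above, i.e. 44.1 kHz LINEAR16, and synchronous recognition only accepts about a minute of audio):

with open("sample_ko.wav", "rb") as f:  # placeholder file name
    content = f.read()

for word, start, end in transcribe_with_word_time_offsets(content):
    print(f"{word}: {start:.1f}s - {end:.1f}s")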
Example #2
    def __init__(self, credentials_path, language_code, phrase_hints=None):
        """
        Args:
            credentials_path (str): Path to the service account's private key JSON file.
            language_code (str): Language of the supplied audio as a BCP-47 language tag, e.g. "en-US".
            phrase_hints (list[str]): https://cloud.google.com/speech-to-text/docs/basics#phrase-hints
        """
        self.language_code = language_code
        # from_service_account_json is a classmethod; no need to instantiate first.
        self.client = SpeechClient.from_service_account_json(credentials_path)
        self.speech_context = [types.SpeechContext(phrases=phrase_hints or [])]
        self._mic: Optional[MicrophoneInput] = None
Example #3
    def get_conn(self):
        """
        Retrieves the connection to Cloud Speech, creating and caching
        the client on first use.

        :return: Google Cloud Speech client object.
        :rtype: google.cloud.speech_v1.SpeechClient
        """
        if not self._client:
            self._client = SpeechClient(credentials=self._get_credentials())
        return self._client
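The point of caching the client in self._client is that repeated calls are cheap. A minimal illustration, assuming a hypothetical hook subclass that supplies _get_credentials():

hook = SomeSpeechHook()                    # hypothetical hook providing _get_credentials()
assert hook.get_conn() is hook.get_conn()  # built once, then reused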
Example #4
    def _make_one():
        from unittest import mock

        import google.auth.credentials
        from google.cloud.speech_v1 import SpeechClient

        # Mocked credentials let the client be constructed without real auth.
        credentials = mock.Mock(spec=google.auth.credentials.Credentials)
        return SpeechClient(credentials=credentials)
Example #5
from unittest import mock
import google.auth.credentials
from google.cloud.speech_v1 import SpeechClient

def make_speech_client():
    credentials = mock.Mock(spec=google.auth.credentials.Credentials)
    return SpeechClient(credentials=credentials)
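A factory like this is typically used in unit tests; a minimal sketch, assuming pytest-style assertions:

def test_make_speech_client():
    # The mocked credentials satisfy the constructor without real auth.
    client = make_speech_client()
    assert isinstance(client, SpeechClient)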
Example #6
from typing import Optional

from google.api_core.exceptions import OutOfRange
from google.cloud.speech_v1 import SpeechClient, enums, types

# MicrophoneInput is a project-specific microphone stream wrapper and is
# assumed to be importable from the surrounding project.


class SpeechToTextClient:
    """
    A speech-to-text client that transcribes microphone input into text.
    """

    def __init__(self, credentials_path, language_code, phrase_hints=None):
        """
        Args:
            credentials_path (str): Path to the service account's private key JSON file.
            language_code (str): Language of the supplied audio as a BCP-47 language tag, e.g. "en-US".
            phrase_hints (list[str]): https://cloud.google.com/speech-to-text/docs/basics#phrase-hints
        """
        self.language_code = language_code
        # from_service_account_json is a classmethod; no need to instantiate first.
        self.client = SpeechClient.from_service_account_json(credentials_path)
        self.speech_context = [types.SpeechContext(phrases=phrase_hints or [])]
        self._mic: Optional[MicrophoneInput] = None

    def start(self, callback):
        """
        Args:
            callback (function): Called with a (transcript, is_final) tuple
                whenever text is transcribed from speech.
        """
        try:
            with MicrophoneInput() as mic:
                print("Starting SpeechToTextClient")
                self._mic = mic
                audio_generator = self._mic.generator()
                config = types.RecognitionConfig(
                    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                    sample_rate_hertz=self._mic.RATE,
                    language_code=self.language_code,
                    use_enhanced=True,
                    speech_contexts=self.speech_context)
                streaming_config = types.StreamingRecognitionConfig(
                    config=config, interim_results=True)
                requests = (types.StreamingRecognizeRequest(
                    audio_content=content) for content in audio_generator)
                responses = self.client.streaming_recognize(
                    streaming_config, requests)
                for response in responses:
                    if not response.results:  # no results
                        continue
                    # first result is best result
                    result = response.results[0]
                    if not result.alternatives:
                        continue
                    transcript = result.alternatives[0].transcript.strip().lower()
                    callback((transcript, result.is_final))
        except OutOfRange:
            # The stream hit the API's duration limit; reopen it.
            self.restart(callback)

    def stop(self):
        print("Stopping SpeechToTextClient")
        if self._mic is None or self._mic.closed:
            return
        self._mic.close()

    def restart(self, callback):
        self.stop()
        self.start(callback)

    def update_phrase_hints(self, phrase_hints):
        # Rebuild the speech context so the next (re)started stream uses the
        # new hints; __init__ and start() both read self.speech_context.
        self.speech_context = [types.SpeechContext(phrases=phrase_hints or [])]
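To tie the class together, a hedged usage sketch (the credential path and phrase hints are placeholders, and a working MicrophoneInput implementation is assumed):

def on_transcript(result):
    transcript, is_final = result
    print(("FINAL: " if is_final else "... ") + transcript)

stt = SpeechToTextClient(
    credentials_path="service-account.json",  # placeholder path
    language_code="en-US",
    phrase_hints=["sonata", "parking"],       # placeholder hints
)
stt.start(on_transcript)  # blocks until the stream ends or stop() is called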