Example #1
  def run(self):
    """Called from [start]. Connects to service and begins streaming."""

    # Exit if stop event occurred.
    if self._stop_event.is_set():
      return

    # Create SSL channel.
    channel = self._create_channel()
    self.is_started = True

    # Open stream
    service = cloud_speech.SpeechClient(channel)
    streaming_config = types.StreamingRecognitionConfig(
        config=types.RecognitionConfig(
            enable_automatic_punctuation=self.punctuation,
            encoding=self.encoding,
            sample_rate_hertz=self.rate,
            language_code=self.language,),
        interim_results=self.interim_results)

    try:
      request_stream = self._request_stream()
      resp_stream = service.streaming_recognize(
          streaming_config, request_stream)
      self._handle_results(resp_stream)
    finally:
      self.stop()
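
run() relies on a _request_stream() helper and a _handle_results() callback that are not shown. A minimal sketch of the request generator, assuming the class buffers raw audio chunks in a queue.Queue named self._buff (an assumption, not part of the original class):

  def _request_stream(self):
    """Yield StreamingRecognizeRequest messages from buffered audio.

    Sketch only: assumes raw audio chunks are queued in self._buff and
    that None marks the end of the stream.
    """
    while not self._stop_event.is_set():
      chunk = self._buff.get()
      if chunk is None:
        return
      yield types.StreamingRecognizeRequest(audio_content=chunk)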
Example #2
def get_client(lang='en-US',
               sample_rate=16000,
               interim_results=False,
               single_utterance=True,
               phrase_key=""):
    """
    Helper to return client and config
    """
    client = SpeechClient()
    config = types.StreamingRecognitionConfig(
        config=types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sample_rate,
            language_code=lang,
            # Enhanced models are only available to projects that
            # opt in for audio data collection.
            use_enhanced=True,
            # A model must be specified to use enhanced model.
            model="command_and_search",
            speech_contexts=[
                types.SpeechContext(phrases=PhraseGenerator.get_phrases(
                    "app/config.json", phrase_key), )
            ]),
        interim_results=interim_results,
        single_utterance=single_utterance)
    print(str(config))
    return client, config
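
get_client() only builds the client and config; the caller still has to supply audio and drive the stream. A hedged usage sketch (the file path is a placeholder, and in practice the audio would be fed in smaller chunks):

client, streaming_config = get_client(lang='en-US', sample_rate=16000)
with open('audio.raw', 'rb') as f:  # illustrative path: raw LINEAR16 audio
    audio_chunks = [f.read()]       # a real caller would stream smaller chunks
requests = (types.StreamingRecognizeRequest(audio_content=chunk)
            for chunk in audio_chunks)
for response in client.streaming_recognize(streaming_config, requests):
    for result in response.results:
        if result.is_final and result.alternatives:
            print(result.alternatives[0].transcript)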
Example #3
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag

    # If enabled, each word in the first alternative of each result will be
    # tagged with a speaker tag to identify the speaker.
    enable_speaker_diarization = True

    # Optional. Specifies the estimated number of speakers in the conversation.
    # diarization_speaker_count = 2

    client = speech_v1p1beta1.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_speaker_diarization=enable_speaker_diarization)

    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
Example #4
def _STT_stream(audio_file, **kwargs):

    print("_STT_stream: Exeucting streaming_recognize API on audio_file {}".
          format(audio_file))

    client = speech_v1p1beta1.SpeechClient()
    # with io.open(audio_file, 'rb') as f:
    # content = f.read()

    # The caller's keyword arguments are used directly as the
    # RecognitionConfig fields of the streaming config.
    config = kwargs
    streaming_config = types.StreamingRecognitionConfig(config=config)

    transcript = ''

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = stream_feed(audio_file)
    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in stream)

    # streaming_recognize returns a generator.
    # [START speech_python_migration_streaming_response]
    responses = client.streaming_recognize(streaming_config, requests)
    # [END speech_python_migration_streaming_request]

    for response in responses:
        # Once the transcription has settled, the first result will contain the
        # is_final result. The other results will be for subsequent portions of
        # the audio.
        for result in response.results:
            alternatives = result.alternatives
            for alternative in alternatives:
                transcript += alternative.transcript
    # [END speech_python_migration_streaming_response]
    # [END speech_transcribe_streaming]
    return transcript
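
_STT_stream() depends on a stream_feed() generator that is not included here. A minimal file-backed sketch, assuming a fixed chunk size:

import io

def stream_feed(audio_file, chunk_size=4096):
    """Yield raw audio bytes from a file in fixed-size chunks.

    Sketch only; the real stream_feed used by _STT_stream is defined
    elsewhere and may read from a live source instead of a file.
    """
    with io.open(audio_file, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                return
            yield chunk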
Example #5
def microphone_streaming_start(wf, output_stream):
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'ko-KR'

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True)
    #     enable_speaker_diarization=True,
    #     diarization_speaker_count=3)

    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK, wf, output_stream) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
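
Most of these examples hand the response iterator to a listen_print_loop() helper that is not reproduced here. A simplified sketch of what such a loop does (the version in Google's sample code also overwrites interim results on the same console line):

def listen_print_loop(responses):
    """Print the top transcript of each final result (simplified sketch)."""
    for response in responses:
        if not response.results:
            continue
        result = response.results[0]
        if result.is_final and result.alternatives:
            print(result.alternatives[0].transcript)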
Example #6
def audio_main():
    f = open(u"Nao_log.txt", u"a")
    f.write(
        u'##**************************** Audio Log File (Group 1) *********************************##'
    )
    f.close()

    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = u'en-US'  # a BCP-47 language tag

    # If enabled, each word in the first alternative of each result will be
    # tagged with a speaker tag to identify the speaker.
    enable_speaker_diarization = True

    # Optional. Specifies the estimated number of speakers in the conversation.
    #diarization_speaker_count = 2

    client = speech_v1p1beta1.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_speaker_diarization=enable_speaker_diarization)

    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:

        while not stream.closed:
            sys.stdout.write(YELLOW)
            sys.stdout.write(u'\n' + unicode(STREAMING_LIMIT *
                                             stream.restart_counter) +
                             u': NEW REQUEST\n')

            stream.audio_input = []
            audio_generator = stream.generator()

            requests = (types.StreamingRecognizeRequest(audio_content=content)
                        for content in audio_generator)

            responses = client.streaming_recognize(streaming_config, requests)

            # Now, put the transcription responses to use.

            listen_print_loop(responses, stream)

            if stream.result_end_time > 0:
                stream.final_request_end_time = stream.is_final_end_time
            stream.result_end_time = 0
            stream.last_audio_input = []
            stream.last_audio_input = stream.audio_input
            stream.audio_input = []
            stream.restart_counter = stream.restart_counter + 1

            if not stream.last_transcript_was_final:
                sys.stdout.write(u'\n')
            stream.new_stream = True
Example #7
def sub_main(profanityFilterBool):
    """
    *** Code taken from Google Cloud Speech to text documentation ***
    Turns on the profanity filter so bad words are censored and not printed
    """
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag
    sp_c_cico = {
        "phrases": ["Hey cico", "Hey Kiko"],
        "boost": 30.0
    }  # speech_contexts_cico
    sp_c_kiko = {
        "phrases": ["cico", "Cico", "kiko", "Kiko", "kygo", "Kitty, girl"],
        "boost": 0
    }
    movement_words = {
        "phrases" : ["move", "feet", "forward", "right", "left", "backward", "degrees", "radians", "to the left", "to the right"],
        "boost": 20.0
    }
    numbers = {
        "phrases": ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"],
        "boost": 5.0
    }
    relevant_words = {
        "phrases": ["cornell cup robotics", "and", "pick up", "grab"],
        "boost": 10.0
    }
    speech_contexts = [sp_c_cico, sp_c_kiko, movement_words, relevant_words]
    client = speech_v1p1beta1.SpeechClient()
    # print(help(types.RecognitionConfig))
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_automatic_punctuation=True,
        # The original code accepted profanityFilterBool but never used it;
        # pass it through so the filter described in the docstring is applied.
        profanity_filter=profanityFilterBool,
        speech_contexts=speech_contexts)

    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:

        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)
        # Now, put the transcription responses to use.
        solution = returnResponseString(responses)  # solution is the result

        append_to_file("log.txt", str(solution))

    return solution
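
returnResponseString() and append_to_file() are project helpers that are not shown. A minimal sketch of what the logging helper is assumed to do:

def append_to_file(path, text):
    """Append one line of text to a log file (sketch of the assumed helper)."""
    with open(path, 'a') as log_file:
        log_file.write(text + '\n')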
Example #8
def transcribe_streaming(stream_file, encoding="LINEAR16", sample_rate=16000):
    client = speech.SpeechClient()

    with io.open(stream_file, 'rb') as audio_file:
        content = audio_file.read()

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = [content]

    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in stream)

    config = types.RecognitionConfig(
        encoding=ENCODINGS[encoding],
        sample_rate_hertz=sample_rate,
        language_code='ko-KR',
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        enable_speaker_diarization=True,  # not supported for Korean: every speaker_tag comes back as the same speaker
        diarization_speaker_count=3)
    streaming_config = types.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    responses = client.streaming_recognize(streaming_config, requests)

    words_with_tags = []
    transcripts = []

    print("Waiting for transcribe...")
    for response in responses:
        for result in response.results:
            alternatives = result.alternatives
            for alternative in alternatives:
                print(u'Transcript: {}'.format(alternative.transcript))
                transcripts.append(
                    alternative.transcript)  # keep the punctuated sentence for later use
                for words in alternative.words:
                    word = words.word
                    start_time = round(
                        words.start_time.seconds +
                        words.start_time.nanos * 1e-9, 3)
                    end_time = round(
                        words.end_time.seconds + words.end_time.nanos * 1e-9,
                        3)
                    speaker_tag = words.speaker_tag
                    words_with_tags.append([
                        word, start_time, end_time, speaker_tag
                    ])  # [word, start_time, end_time, speaker_tag]
            print()  # newline

    return words_with_tags, transcripts
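
The words_with_tags rows pair each word with its speaker_tag. As an illustrative post-processing step (not part of the original function), consecutive words could be grouped into per-speaker utterances:

from itertools import groupby

def group_by_speaker(words_with_tags):
    """Collapse consecutive [word, start, end, speaker_tag] rows into
    (speaker_tag, utterance, start, end) tuples. Illustrative sketch only."""
    grouped = []
    for tag, rows in groupby(words_with_tags, key=lambda row: row[3]):
        rows = list(rows)
        utterance = ' '.join(row[0] for row in rows)
        grouped.append((tag, utterance, rows[0][1], rows[-1][2]))
    return grouped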
Example #9
    def process(self, loop):
        """
        Audio stream recognition and result parsing
        """
        # You can add speech contexts for better recognition.
        cap_speech_context = types.SpeechContext(**self.context)
        metadata = types.RecognitionMetadata(**self.metadata)
        client = speech.SpeechClient()
        config = types.RecognitionConfig(encoding=self.encoding,
                                         sample_rate_hertz=self.rate,
                                         language_code=self.language,
                                         speech_contexts=[
                                             cap_speech_context,
                                         ],
                                         enable_automatic_punctuation=True,
                                         model=self.model,
                                         metadata=metadata)

        streaming_config = types.StreamingRecognitionConfig(
            config=config,
            interim_results=self.interim_results,
            single_utterance=self.single_utterance)
        audio_generator = self.stream_generator()
        requests = iter(
            types.StreamingRecognizeRequest(audio_content=content)
            for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)
        #print('process',type(responses))
        try:
            #print('process')
            for response in responses:
                #print('process received')
                if self.terminated:
                    break
                if not response.results:
                    continue
                result = response.results[0]
                if not result.alternatives:
                    continue
                speechData = MessageToDict(response)  # MessageToDict comes from google.protobuf.json_format
                global_async_worker.add_task(self.async_callback(speechData))

                # debug
                transcript = result.alternatives[0].transcript

                print('>>', transcript, "(OK)" if result.is_final else "")
        except Exception as e:
            print('process excepted', e)
            self.start()
Example #10
 def gspeech_client(self):
     """Creates the Google Speech API client, configures it, and sends/gets
     audio/text data for parsing.
     """
     language_code = 'en-US'
     # Hints for the API
     context = types.SpeechContext(phrases=self.context)
     client = speech.SpeechClient()
     # Create metadata object, helps processing
     metadata = types.RecognitionMetadata()
     # Interaction Type:
     # VOICE_SEARCH: Transcribe spoken questions and queries into text.
     # VOICE_COMMAND: Transcribe voice commands, such as for controlling a device.
     metadata.interaction_type = (
         enums.RecognitionMetadata.InteractionType.VOICE_COMMAND)
     # Microphone Distance:
     # NEARFIELD: The audio was captured from a closely placed microphone.
     # MIDFIELD: The speaker is within 3 meters of the microphone.
     # FARFIELD: The speaker is more than 3 meters away from the microphone.
     metadata.microphone_distance = (
         enums.RecognitionMetadata.MicrophoneDistance.MIDFIELD)
     # Device Type:
     # PC: Speech was recorded using a personal computer or tablet.
     # VEHICLE: Speech was recorded in a vehicle.
     # OTHER_OUTDOOR_DEVICE: Speech was recorded outdoors.
     # OTHER_INDOOR_DEVICE: Speech was recorded indoors.
     metadata.recording_device_type = (
         enums.RecognitionMetadata.RecordingDeviceType.PC)
     # Media Type:
     # AUDIO: The speech data is an audio recording.
     # VIDEO: The speech data originally recorded on a video.
     metadata.original_media_type = (
         enums.RecognitionMetadata.OriginalMediaType.AUDIO)
     config = types.RecognitionConfig(
         encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
         sample_rate_hertz=16000,
         language_code=language_code,
         speech_contexts=[context],
         use_enhanced=True,
         model='command_and_search',
         metadata=metadata)
     streaming_config = types.StreamingRecognitionConfig(
         config=config, single_utterance=False, interim_results=False)
     # Hack from Google Speech Python docs, very pythonic c:
     requests = (types.StreamingRecognizeRequest(audio_content=content)
                 for content in self.generator())
     responses = client.streaming_recognize(streaming_config, requests)
     self._listen_print_loop(responses)
Example #11
    def __init__(self):
        self.client = speech.SpeechClient()
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code='en-US',
            enable_word_time_offsets=True,
            model='video',
            diarization_speaker_count=2,
            enable_automatic_punctuation=True,
            use_enhanced=True,
            enable_speaker_diarization=True,
            speech_contexts=[speech.types.SpeechContext(phrases=[])]
        )

        self.streaming_config = types.StreamingRecognitionConfig(config=config)
Example #12
    def listen(self, language_code='ja-JP'):
        """Listen."""
        # See http://g.co/cloud/speech/docs/languages
        # for a list of supported languages.

        client = speech.SpeechClient()
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=self.rate,
            model=None,
            speech_contexts=[types.SpeechContext()],
            language_code=language_code)
        streaming_config = types.StreamingRecognitionConfig(
            config=config,
            single_utterance=True,
            interim_results=True
        )

        self.callbacks.get("ready", lambda: True)()

        with MicrophoneStream(self.rate, int(self.rate/10)) as stream:

            self.callbacks.get("start", lambda: True)()

            while True:
                try:
                    audio_generator = stream.generator()
                    requests = (types.StreamingRecognizeRequest(audio_content=content)
                                for content in audio_generator)
                    responses = client.streaming_recognize(streaming_config, requests)

                    self.listen_print_loop(responses)

                except exceptions.OutOfRange:
                    print("Time exceeded.(OutOfRange)")
                except exceptions.ServiceUnavailable:
                    print("Connection closed.(ServiceUnavailable)")
                except KeyboardInterrupt:
                    print("KeyboardInterrupt.")
                    break
                except:
                    print("Unexpected error:", sys.exc_info()[0])
                    raise

            self.callbacks.get("end", lambda: True)()
Example #13
    def __init__(self, speakers, speaker_count, sample_rate, chunk, language_code, exit_command):
        self.speakers = speakers
        self.speaker_count = speaker_count
        self.sample_rate = sample_rate
        self.chunk = chunk
        self.language_code = language_code
        self.exit_command = exit_command

        self.client = speech.SpeechClient()
        self.recognition_config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=self.sample_rate,
            language_code=self.language_code,
            enable_speaker_diarization=True,
            diarization_speaker_count=self.speaker_count)
        self.streaming_config = types.StreamingRecognitionConfig(
            config=self.recognition_config,
            interim_results=True)
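
The constructor only prepares the client and configs. A hypothetical method on the same class that actually opens the stream could look like this (MicrophoneStream and listen_print_loop are assumed helpers, as in the other examples):

    def transcribe(self):
        """Open the microphone and stream audio to the API (sketch only)."""
        with MicrophoneStream(self.sample_rate, self.chunk) as stream:
            requests = (types.StreamingRecognizeRequest(audio_content=content)
                        for content in stream.generator())
            responses = self.client.streaming_recognize(
                self.streaming_config, requests)
            listen_print_loop(responses)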
Example #14
def request_command():
    language_code = 'en-US'

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        speech_contexts=[{
            "phrases": recommeneded_pharses,
            "boost": boost,
        }],
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)
        print("here")

        for response in responses:
            if not response.results:
                continue

            result = response.results[0]
            if not result.alternatives:
                continue

            # Display the transcription of the top alternative.
            transcript = result.alternatives[0].transcript

            if result.is_final:
                print(transcript)

                match = re.search(command_regex, transcript)
                if match:
                    player_command = PlayerCommand(match.group(1),
                                                   match.group(3),
                                                   match.group(4))
                    return player_command
                return None
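
command_regex and PlayerCommand are defined outside this snippet. Purely to illustrate how the capture groups above might line up (verb, amount, unit), a hypothetical pattern could be:

# Hypothetical pattern only; groups 1, 3 and 4 match the indices passed to
# PlayerCommand above. The real command_regex is defined elsewhere and may differ.
command_regex = r'\b(move|turn)\s+((\d+)\s+(feet|degrees|radians))\b'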
Example #15
def transcribe_streaming(stream_file):
    """Streams transcription of the given audio file."""
    import io
    from google.cloud import speech_v1p1beta1
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech_v1p1beta1.SpeechClient()

    # [START speech_python_migration_streaming_request]
    with io.open(stream_file, 'rb') as audio_file:
        content = audio_file.read()

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = [content]
    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in stream)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=16000,
        language_code='en-US')
    streaming_config = types.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    # [START speech_python_migration_streaming_response]
    responses = client.streaming_recognize(streaming_config, requests)
    # [END speech_python_migration_streaming_request]

    for response in responses:
        # Once the transcription has settled, the first result will contain the
        # is_final result. The other results will be for subsequent portions of
        # the audio.
        for result in response.results:
            print('Finished: {}'.format(result.is_final))
            print('Stability: {}'.format(result.stability))
            alternatives = result.alternatives
            # The alternatives are ordered from most likely to least.
            for alternative in alternatives:
                print('Confidence: {}'.format(alternative.confidence))
                print(u'Transcript: {}'.format(alternative.transcript))
Example #16
File: main.py  Project: eureyuri/dolly
def main():
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=LANGUAGE_CODE,
        enable_speaker_diarization=True,
        diarization_speaker_count=SPEAKER_COUNT)
    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)
    # interim_results=True asks the stream to return temporary results that
    # may be refined later (after more audio has been processed). Interim
    # results are marked in the responses by is_final being False.

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)