def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    from google.cloud import speech_v1
    from google.cloud.speech_v1 import enums
    from google.cloud.speech_v1 import types
    import io

    client = speech_v1.SpeechClient()

    # [START speech_python_migration_sync_request]
    # [START speech_python_migration_config]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='ja-JP')
    # [END speech_python_migration_config]

    # [START speech_python_migration_sync_response]
    response = client.recognize(config, audio)
    # [END speech_python_migration_sync_request]

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
async def speech_to_text(e):
    opts = e.pattern_match.group(1) or ""
    args, _ = parse_arguments(opts, ['lang'])
    lang = args.get('lang', DEFAULT_LANG)

    await e.edit("**Transcribing...**")
    message = await e.get_reply_message()
    file = message.audio or message.voice
    if not file:
        await e.edit("**No audio file specified**", delete_in=3)
        return

    file = await bot.download_file(file)
    audio = types.RecognitionAudio(content=file)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.OGG_OPUS,
        sample_rate_hertz=16000,
        language_code=lang)

    response = STTClient.long_running_recognize(config, audio)
    op_result = response.result()
    result = op_result.results[0].alternatives[0]
    output = (f"**Transcript:** {result.transcript}\n\n"
              f"**Confidence:** __{round(result.confidence, 5)}__")
    await e.edit(output)
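# The handler above references a module-level client and a default language
# that are not defined in this file. The lines below are a minimal, assumed
# initialization: DEFAULT_LANG and STTClient are names taken from the snippet,
# and the 'en-US' default is a placeholder, not from the original source.
from google.cloud import speech_v1
from google.cloud.speech_v1 import enums, types

DEFAULT_LANG = 'en-US'  # assumed default language tag
STTClient = speech_v1.SpeechClient()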
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag

    client = speech.SpeechClient.from_service_account_json(
        "./MyProject-90749589d270.json")
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use. Capture the recognized
        # phrase so it can be returned below.
        user_phrase_result = listen_print_loop(responses)

    return user_phrase_result
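# Several streaming snippets in this file use a `MicrophoneStream` helper that
# is never defined here. The sketch below is a minimal, assumed reconstruction
# based on the Google Cloud Speech sample class of the same name; it requires
# the third-party `pyaudio` package.
import queue

import pyaudio


class MicrophoneStream(object):
    """Opens a recording stream as a generator yielding audio chunks."""

    def __init__(self, rate, chunk):
        self._rate = rate
        self._chunk = chunk
        self._buff = queue.Queue()  # thread-safe buffer of audio chunks
        self.closed = True

    def __enter__(self):
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self._rate,
            input=True,
            frames_per_buffer=self._chunk,
            # Audio is collected on a background thread so the generator can
            # be consumed while streaming requests to the API.
            stream_callback=self._fill_buffer,
        )
        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        # Signal the generator to terminate.
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        """Continuously collect data from the audio stream into the buffer."""
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self):
        while not self.closed:
            # Block until there is at least one chunk of data.
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]
            # Consume whatever else is buffered, without blocking.
            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break
            yield b''.join(data)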
def speech2text(self, file_path):
    # Instantiates a client
    client = speech.SpeechClient()

    # The name of the audio file to transcribe
    file_name = file_path

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=self.sample_rate,
        language_code=self.lang_code)

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    result_li = []
    for result in response.results:
        result_li.append(format(result.alternatives[0].transcript))
    return result_li
def start(self, callback):
    """
    Args:
        callback (function): Function that is called when text is
            transcribed from speech
    """
    try:
        with MicrophoneInput() as mic:
            print("Starting SpeechToTextClient")
            self._mic = mic
            audio_generator = self._mic.generator()
            config = types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=self._mic.RATE,
                language_code=self.language_code,
                use_enhanced=True,
                speech_contexts=self.speech_context)
            streaming_config = types.StreamingRecognitionConfig(
                config=config, interim_results=True)
            requests = (types.StreamingRecognizeRequest(audio_content=content)
                        for content in audio_generator)
            responses = self.client.streaming_recognize(
                streaming_config, requests)
            for response in responses:
                if not response.results:
                    # no results
                    continue
                # first result is best result
                result = response.results[0]
                if not result.alternatives:
                    continue
                transcript = result.alternatives[0].transcript.strip().lower()
                callback((transcript, result.is_final))
    except OutOfRange:
        self.restart(callback)
def get_config(self):
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US')
    return config
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    # The language code you speak.
    language_code = 'th-TH'  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    # Initial loop value
    rounds = 1
    while True:
        try:
            print('streaming loop :' + str(rounds))
            with MicrophoneStream(RATE, CHUNK) as stream:
                audio_generator = stream.generator()
                # Create request data
                requests = (types.StreamingRecognizeRequest(audio_content=content)
                            for content in audio_generator)
                # POST data to google cloud speech
                responses = client.streaming_recognize(streaming_config, requests)
                # Now, put the transcription responses to use.
                listen_print_loop(responses)
        except Exception as err:
            print(err)
        rounds += 1
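# Several snippets above call `listen_print_loop(responses)` without defining
# it. This is a minimal sketch assuming the behavior of the canonical Google
# Cloud Speech sample of the same name: print interim results in place, then
# return the final transcript.
import re
import sys


def listen_print_loop(responses):
    """Iterates through server responses, printing interim results in place."""
    num_chars_printed = 0
    for response in responses:
        if not response.results:
            continue

        # Only the first result matters for streaming: once it is final,
        # the API moves on to the next utterance.
        result = response.results[0]
        if not result.alternatives:
            continue

        transcript = result.alternatives[0].transcript

        # Pad with spaces so a shorter transcript overwrites a longer one.
        overwrite_chars = ' ' * (num_chars_printed - len(transcript))

        if not result.is_final:
            sys.stdout.write(transcript + overwrite_chars + '\r')
            sys.stdout.flush()
            num_chars_printed = len(transcript)
        else:
            print(transcript + overwrite_chars)
            # Exit recognition if any of the transcribed phrases could be
            # one of our keywords.
            if re.search(r'\b(exit|quit)\b', transcript, re.I):
                print('Exiting..')
                return transcript
            num_chars_printed = 0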
def return_recognized(PATH, words):
    SCOPES = ['https://www.googleapis.com/auth/cloud-platform']
    SERVICE_ACCOUNT_FILE = 'C:/Users/Janek/PycharmProjects/test/klucz.json'

    credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    client = speech.SpeechClient(credentials=credentials)

    file_name = os.path.join(os.path.dirname(__file__), 'resources', PATH)

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='pl-PL',
        speech_contexts=[speech.types.SpeechContext(phrases=words)])

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    transcribed = {}
    for result in response.results:
        # print('Transcript: {}, {}'.format(result.alternatives[0].transcript,
        #                                   result.alternatives[0].confidence))
        transcribed[result.alternatives[0].transcript] = \
            result.alternatives[0].confidence
    return transcribed
def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    path_ = pathlib.Path.cwd()
    path_ = path_ / 'teste de fluencia-2b49c4cc975c.json'

    # Create the Google API client from a service account key.
    # client = speech_v1.SpeechClient()
    client = speech_v1.SpeechClient.from_service_account_json(path_)

    with io.open(speech_file, 'rb') as audio_file:  # open the audio file
        content = audio_file.read()  # read its contents

    audio = types.RecognitionAudio(content=content)  # define the audio type
    config = types.RecognitionConfig(  # configuration used for recognition
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code='pt-BR',
        speech_contexts=[{"phrases": utils.list_animals}]  # recognition hints
    )

    response = client.recognize(config, audio)  # run recognition on the audio

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    # for result in response.results:
    #     # The first alternative is the most likely one for this portion.
    #     print(u'Transcript: {}'.format(result.alternatives[0].transcript))
    return response
def transcribe_with_word_time_offsets(
    speech_content: bytes,
) -> Iterable[Tuple[str, float, float]]:
    """Recognize words with time offsets from a speech.

    Args:
        speech_content: Binary data of the speech.

    Yields:
        The word with start time and end time that api recognized.
        [
            ('여기요', 0.0, 2.0), ('저기요', 3.6, 5.4), ('저', 5.4, 9.2),
            ('밖에서', 9.2, 9.6), ('장애인', 9.6, 10.0), ('주차', 10.0, 10.3),
            ('가능', 10.3, 10.5), ('까만색', 10.5, 11.3), ('소나타', 11.3, 11.7),
            ('글', 11.7, 11.8), ('찾아요', 11.8, 12.2), ('근데요', 12.2, 13.2)
        ]

    See: https://cloud.google.com/speech-to-text/docs/sync-recognize
    """
    client = SpeechClient()
    audio = types.RecognitionAudio(content=speech_content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code="ko-KR",
        enable_word_time_offsets=True,
    )
    response = client.recognize(config, audio)
    for result in response.results:
        alternative = result.alternatives[0]
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            yield (
                word,
                start_time.seconds + start_time.nanos * 1e-9,
                end_time.seconds + end_time.nanos * 1e-9,
            )
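# A minimal usage sketch for transcribe_with_word_time_offsets above.
# "speech.wav" is a hypothetical 44.1 kHz LINEAR16 Korean recording, not a
# file from the original source.
if __name__ == "__main__":
    with open("speech.wav", "rb") as f:
        for word, start, end in transcribe_with_word_time_offsets(f.read()):
            print(f"{word}: {start:.1f}s - {end:.1f}s")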
def work1():
    global var, tflag
    var.set("ここに音声認識結果が表示されます")  # "Speech recognition results will be shown here"
    language_code = 'ja-JP'  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        for response in responses:
            if not response.results:
                continue

            # The `results` list is consecutive. For streaming, we only care about
            # the first result being considered, since once it's `is_final`, it
            # moves on to considering the next utterance.
            result = response.results[0]
            if not result.alternatives:
                continue

            # Display the transcription of the top alternative.
            transcript = result.alternatives[0].transcript

            if not result.is_final:
                # Wrap the interim transcript to the label width and keep only
                # the last `num_comment` lines.
                txtlist = textwrap.wrap(transcript, int(ww / w))
                print(txtlist)
                setxt = ""
                if len(txtlist) <= num_comment:
                    for i in range(len(txtlist)):
                        setxt += txtlist[i]
                    var.set(setxt)
                else:
                    for i in range(num_comment):
                        setxt += txtlist[len(txtlist) - num_comment + i]
                    var.set(setxt)
            else:
                # Exit recognition if any of the transcribed phrases could be
                # one of our keywords.
                if re.search(r'\b(exit|quit)\b', transcript, re.I):
                    on_closing()
def test_inherited_method(self):
    from google.cloud.speech_v1 import types

    client = self._make_one()
    config = types.RecognitionConfig(encoding='FLAC')
    audio = types.RecognitionAudio(uri='http://foo.com/bar.wav')
    with mock.patch.object(client, '_recognize') as recognize:
        client.recognize(config, audio)

        # Assert that the underlying GAPIC method was called as expected.
        recognize.assert_called_once_with(
            types.RecognizeRequest(
                config=config,
                audio=audio,
            ), None)
def transcribe_streaming(stream_file):
    """Streams transcription of the given audio file."""
    client = speech.SpeechClient()

    with io.open(stream_file, 'rb') as audio_file:
        content = audio_file.read()

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = [content]
    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in stream)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code='en-US',
        model="command_and_search")
    streaming_config = types.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    responses = client.streaming_recognize(streaming_config, requests)

    return_text = []
    confidence = []
    for response in responses:
        # Once the transcription has settled, the first result will contain the
        # is_final result. The other results will be for subsequent portions of
        # the audio.
        for result in response.results:
            # print('Finished: {}'.format(result.is_final))
            # print('Stability: {}'.format(result.stability))
            alternatives = result.alternatives
            # The alternatives are ordered from most likely to least.
            for alternative in alternatives:
                # print('Confidence: {}'.format(alternative.confidence))
                # print(u'Transcript: {}'.format(alternative.transcript))
                return_text.append(alternative.transcript)
                confidence.append(alternative.confidence)

    confidence = np.mean(confidence)
    return return_text, confidence
def test_inherited_method(self):
    from google.cloud.speech_v1 import types

    client = self._make_one()
    config = types.RecognitionConfig(encoding='FLAC')
    audio = types.RecognitionAudio(uri='http://foo.com/bar.wav')
    patch = mock.patch.object(client, '_recognize', autospec=True)
    with patch as recognize:
        client.recognize(config, audio)

    # Assert that the underlying GAPIC method was called as expected.
    assert recognize.call_count == 1
    _, args, _ = recognize.mock_calls[0]
    assert args[0] == types.RecognizeRequest(
        config=config,
        audio=audio,
    )
def transcribe_gcs(gcs_uri):
    """Transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()

    # [START speech_python_migration_config_gcs]
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='ja-JP')
    # [END speech_python_migration_config_gcs]

    response = client.recognize(config, audio)
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
def upload():
    # return render_template('post.html')

    # Read the uploaded audio from the request.
    uri = request.files['audio'].stream.read()

    client = speech_v1.SpeechClient()

    encoding = enums.RecognitionConfig.AudioEncoding.FLAC
    sample_rate_hertz = 48000
    language_code = 'en-US'
    config = types.RecognitionConfig(
        encoding=encoding,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code)

    # uri = 'gs://bucket_name/file_name.flac'
    audio = types.RecognitionAudio(content=uri)

    response = client.recognize(config, audio)
    # return response.results[0].alternatives[0].transcript

    sample_txt = ""
    with io.open('sample.txt', mode='r', encoding='utf-8',
                 errors='ignore') as x:
        for line in x:
            sample_txt += line
    print(sample_txt)
    return sample_txt
def run_quickstart():
    client = speech_v1.SpeechClient()

    # The name of the audio file to transcribe
    file_name = '../../sound/sample.wav'

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=speech_v1.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='ko-KR',
        audio_channel_count=2)

    # Detects speech in the audio file
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result(timeout=90)

    def closecallback():
        window.destroy()

    window = tkinter.Tk()
    window.title("AI Speaker Test")
    window.geometry("640x400+100+100")
    window.resizable(False, False)

    text = tkinter.Text(window)
    for result in response.results:
        text.insert(tkinter.CURRENT, '음성출력\n')  # "Voice output"
        if '메시지' in result.alternatives[0].transcript:  # "message"
            text.insert(tkinter.CURRENT, result.alternatives[0].transcript)
    text.pack()

    button = tkinter.Button(window, text='Close', command=closecallback)
    button.place(x=0, y=350, relx=0.5)

    window.mainloop()
    playsound(file_name)
def listen(self, single_utterance=True):
    speech_contexts = types.SpeechContext(phrases=['male'])
    language_code = 'en-US'
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        speech_contexts=[speech_contexts],
        language_code=language_code)
    # If single_utterance=True, the go command uttered by the subject after a
    # long pause for self-admin is not registered.
    streaming_config = types.StreamingRecognitionConfig(
        config=config, single_utterance=single_utterance)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        return self.listen_print_loop(responses)
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-GB'  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        try:
            responses = client.streaming_recognize(
                streaming_config, requests, timeout=21)
        except Exception:
            return "no result"

        num_chars_printed = 0
        for response in responses:
            try:
                if not response.results:
                    print("no result")
                    continue

                # The `results` list is consecutive. For streaming, we only care about
                # the first result being considered, since once it's `is_final`, it
                # moves on to considering the next utterance.
                result = response.results[0]
                if not result.alternatives:
                    print("no alternatives")
                    continue

                # Display the transcription of the top alternative.
                transcript = result.alternatives[0].transcript

                # Display interim results, but with a carriage return at the end of the
                # line, so subsequent lines will overwrite them.
                #
                # If the previous result was longer than this one, we need to print
                # some extra spaces to overwrite the previous result.
                overwrite_chars = ' ' * (num_chars_printed - len(transcript))

                if not result.is_final:
                    sys.stdout.write(transcript + overwrite_chars + '\r')
                    sys.stdout.flush()
                    print("loop no result")
                    return transcript + overwrite_chars

                num_chars_printed = 0
            except Exception:
                return ''
import sys
import time

from google.cloud import speech
from google.cloud.speech import types
from google.oauth2 import service_account

import strem_recognition_module
from stream_recognition_class import StreamRecognition

start = time.time()

lan_code = sys.argv[1]
translator_code = sys.argv[2]

RATE = 16000
CHUNK = int(RATE / 10)

credentials = service_account.Credentials.from_service_account_file(
    'Location of API Key File')

data = []
client = speech.SpeechClient(credentials=credentials)
config = types.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=RATE,
    language_code=lan_code)
streaming_config = types.StreamingRecognitionConfig(
    config=config, interim_results=True)

with StreamRecognition(RATE, CHUNK) as stream:
    audio_generator = stream.speech_generator()
    requests = (types.StreamingRecognizeRequest(audio_content=content)
                for content in audio_generator)
    responses = client.streaming_recognize(streaming_config, requests)

    end = (time.time() - start) - 10
    formatted_time = "{:.2f}".format(end)

    while True:
        fetched_text = strem_recognition_module.print_speech_loop(
import io

from google.cloud import speech_v1
from google.cloud.speech_v1 import enums, types

client = speech_v1.SpeechClient()

# Note: LINEAR16 expects raw/WAV PCM audio; an MP3 file will not decode
# correctly under this encoding.
with io.open("output.mp3", 'rb') as audio_file:
    content = audio_file.read()

audio = types.RecognitionAudio(content=content)
config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code='en-US')  # BCP-47 tags use a hyphen, not 'en_US'

response = client.recognize(config, audio)
print(response)