def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag

    # If enabled, each word in the first alternative of each result will be
    # tagged with a speaker tag to identify the speaker.
    enable_speaker_diarization = True

    # Optional. Specifies the estimated number of speakers in the conversation.
    # diarization_speaker_count = 2

    client = speech_v1p1beta1.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_speaker_diarization=enable_speaker_diarization)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
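# The streaming functions in this file (main, microphone_streaming_start,
# audio_main, sub_main, listen) rely on RATE, CHUNK, MicrophoneStream, and
# listen_print_loop, none of which are defined here. A minimal sketch modeled
# on Google's published microphone-streaming quickstart, assuming PyAudio is
# installed; some variants in this file pass extra constructor arguments
# (wf, output_stream), which this two-argument sketch omits.
import queue

import pyaudio

RATE = 16000
CHUNK = int(RATE / 10)  # 100 ms of audio per buffer


class MicrophoneStream(object):
    """Opens a microphone stream and yields raw audio chunks."""

    def __init__(self, rate, chunk):
        self._rate = rate
        self._chunk = chunk
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16, channels=1, rate=self._rate,
            input=True, frames_per_buffer=self._chunk,
            stream_callback=self._fill_buffer)
        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        self._buff.put(None)  # Unblock the generator.
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        # PyAudio invokes this callback from a background thread.
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self):
        while not self.closed:
            chunk = self._buff.get()
            if chunk is None:
                return
            yield chunk


def listen_print_loop(responses):
    """Prints the top transcript of each final streaming result."""
    for response in responses:
        if not response.results:
            continue
        result = response.results[0]
        if result.is_final and result.alternatives:
            print(result.alternatives[0].transcript)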
def google_transcribe_file(fp, bucket_name='prof-resp-trans'):
    storage_client = google.cloud.storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    client = google.cloud.speech_v1p1beta1.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        language_code='en-US',
        enable_word_time_offsets=True,
        enable_automatic_punctuation=True)

    # Upload the file to Cloud Storage unless it is already there.
    blob = bucket.blob(fp.name)
    if not blob.exists():
        new_print('Uploading File: {}'.format(fp.name))
        blob.upload_from_filename(str(fp.resolve()))
        new_print('Finished Uploading: {}'.format(fp.name))
    else:
        new_print('File already uploaded: {}'.format(fp.name))

    new_print('Starting transcription...')
    audio = types.RecognitionAudio(
        uri='gs://{}/{}'.format(bucket_name, fp.name))
    operation = client.long_running_recognize(config, audio)
    results = operation.result()
    new_print('Transcription finished')
    return results
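# Hypothetical usage of google_transcribe_file. Assumes new_print is a
# project-local logging helper, credentials are configured, and the default
# bucket exists; the file name here is illustrative.
from pathlib import Path

results = google_transcribe_file(Path('interview.wav'))
for result in results.results:
    print(result.alternatives[0].transcript)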
def read(file_name, result_file, time=50):
    # Instantiates a client
    client = speech.SpeechClient()

    # The name of the audio file to transcribe
    file_name = os.path.join(os.path.dirname(__file__), file_name)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        audio_channel_count=2,
        sample_rate_hertz=44100,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        language_code='ko-KR')

    # Loads the audio into memory and transcribes it in chunks of `time`
    # seconds (FILE_BYTE is the per-second byte rate of 16-bit 44.1 kHz audio).
    with io.open(file_name, 'rb') as audio_file:
        with io.open(result_file, 'w') as f:
            FILE_BYTE = 44100 * 2
            read_byte = FILE_BYTE * time
            while True:
                content = audio_file.read(read_byte)
                if not content:
                    break
                audio = types.RecognitionAudio(content=content)
                response = client.recognize(config, audio)
                for result in response.results:
                    f.write(result.alternatives[0].transcript)
                    print(result.alternatives[0])
                    print('Transcript: {}'.format(
                        result.alternatives[0].transcript))
def configureAPI(self):
    # Use local file content for .flac files; otherwise reference the copy
    # stored in Cloud Storage.
    if self.title.find('.flac') != -1:
        with open(os.path.join(self.path, self.title), 'rb') as audio_file:
            content = audio_file.read()
            self.audio = types.RecognitionAudio(content=content)
    else:
        self.audio = types.RecognitionAudio(
            uri="gs://twiml-mp3/" + self.title + ".flac")
    self.config = types.RecognitionConfig(
        encoding=self.encoding,
        sample_rate_hertz=self.sample_rate,
        language_code=self.language_code,
        enable_automatic_punctuation=self.punctuation,
        enable_speaker_diarization=self.diarization,
        diarization_speaker_count=self.num_speakers,
        audio_channel_count=1,
        use_enhanced=self.enhanced,
        model=self.model,
        enable_word_time_offsets=self.time_offsets,
        enable_word_confidence=self.word_confidence,
        max_alternatives=self.max_alternatives,
        metadata=self.metadata,
        speech_contexts=[types.SpeechContext(phrases=self.phrases)])
def transcribe_gcs(gcs_uri, hint_phrases, set_config):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    client = speech_v1p1beta1.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    print(set_config.get('enable_automatic_punctuation'))

    # Set default values, check the dict for each key, and cast each string
    # value to the expected type. The defaults passed to eval() must be
    # strings, since eval() rejects non-string arguments.
    config = types.RecognitionConfig(
        encoding=eval(set_config.get(
            'encoding', 'enums.RecognitionConfig.AudioEncoding.FLAC')),
        sample_rate_hertz=int(set_config.get('sample_rate_hertz', 16000)),
        language_code=set_config.get('language_code', 'en-US'),
        enable_automatic_punctuation=eval(
            set_config.get('enable_automatic_punctuation', 'True')),
        enable_speaker_diarization=eval(
            set_config.get('enable_speaker_diarization', 'False')),
        diarization_speaker_count=int(
            set_config.get('diarization_speaker_count', 1)),
        speech_contexts=[
            speech_v1p1beta1.types.SpeechContext(phrases=hint_phrases)])

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result(timeout=900)
    return response
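# eval() on externally supplied config strings is risky. A minimal sketch of
# a stricter alternative covering the same keys; it assumes every value in
# set_config is a string, and the encoding table lists only a few names.
def parse_bool(value, default):
    """Map 'True'/'False' strings to booleans, rejecting anything else."""
    if value is None:
        return default
    if value in ('True', 'true', '1'):
        return True
    if value in ('False', 'false', '0'):
        return False
    raise ValueError('expected a boolean string, got {!r}'.format(value))


ENCODING_NAMES = {
    'FLAC': enums.RecognitionConfig.AudioEncoding.FLAC,
    'LINEAR16': enums.RecognitionConfig.AudioEncoding.LINEAR16,
    'MP3': enums.RecognitionConfig.AudioEncoding.MP3,
}


def parse_encoding(value, default='FLAC'):
    """Look the encoding up by name instead of eval()ing arbitrary text."""
    return ENCODING_NAMES[value if value is not None else default]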
def transcribe_audio(self, audio_blob):
    phrases = [c.command_variant for c in self.commands]
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='en-US',
        audio_channel_count=audio_blob.pop('n_channels'),
        enable_word_time_offsets=True,
        model='video',
        speech_contexts=[
            dict(phrases=phrases, boost=self.commandword_bias)
        ])
    try:
        operation = self.client.long_running_recognize(config, audio_blob)
    except ResourceExhausted:
        err_msg = (f"The project has run out of its quota for today. "
                   f"Try again tomorrow or set up your own Google Cloud "
                   f"project, see '{meta_utils.install_url()}'")
        print(err_msg)
        sys.exit(1)

    print(u"Analyzing speech...")
    response = operation.result()

    words = []
    for result in response.results:
        for word in result.alternatives[0].words:
            words.append(word)
    return words
def voice_recognize(storage_uri):
    """
    Performs synchronous speech recognition on an audio file

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """
    client = speech_v1p1beta1.SpeechClient()
    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.mp3'

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=44100,
        language_code='en-US',
        # Enable automatic punctuation
        enable_automatic_punctuation=True)
    audio = {"uri": storage_uri}

    response = client.recognize(config, audio)
    result = [r.alternatives[0].transcript for r in response.results]
    return ' '.join(result)
def microphone_streaming_start(wf, output_stream):
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'ko-KR'

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True)
        # enable_speaker_diarization=True,
        # diarization_speaker_count=3)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    with MicrophoneStream(RATE, CHUNK, wf, output_stream) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
def transcribe_file_with_diarization():
    audio = types.RecognitionAudio(uri=args['input'])
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        # encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=22050,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        model='phone_call')

    operation = client.long_running_recognize(config, audio)
    print("Waiting on response from google cloud...")
    response = operation.result(timeout=720)  # 360 for call 01

    for result in response.results:
        print("\n\n::BEGIN TRANSCRIPT::\n")
        print("{}".format(result.alternatives[0].transcript))
        print("\n::END TRANSCRIPT::\n\n")
        print("\t\tCONFIDENCE: {} \n\n".format(
            result.alternatives[0].confidence))
        print("::BEGIN SPEAKER DIARIZATION::\n")
        words_info = result.alternatives[0].words
        for word_info in words_info:
            print("{}: '{}'".format(word_info.speaker_tag, word_info.word))
        print("\n::END SPEAKER DIARIZATION")
def convertWAVToTranscript(fileFullPathname, split_length_inSeconds):
    pool = Pool(8)  # Number of concurrent threads

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        # sample_rate_hertz=512000,
        language_code='gu-IN',
        speech_contexts=[types.SpeechContext(phrases=[
            'લગભગ', 'માત્ર', 'શ્રી ત્રંબકભાઈ', 'સોભાગભાઈ', 'દેહ વિલય',
            'જ્ઞાની પુરુષ', 'દશા', 'રુવાડા', 'ઐશ્વર્ય', 'એ', 'જ્ઞાન',
            'વિકલ્પ', 'ત્યારે', 'હમણાં'])])

    files = [f for f in listdir(fileFullPathname)
             if isfile(join(fileFullPathname, f))]

    def transcribe(data):
        idx, file = data
        num, _ = file.split('.')
        text_script = ""

        # Loads the audio into memory
        with io.open(join(fileFullPathname, file), 'rb') as audio_file:
            content = audio_file.read()
            audio = types.RecognitionAudio(content=content)

        # Detects speech in the audio file
        client = speech.SpeechClient()
        response = client.recognize(config, audio)
        for result in response.results:
            text_script += result.alternatives[0].transcript
        return {"idx": num, "text": text_script}

    all_text = pool.map(transcribe, enumerate(files))
    pool.close()
    pool.join()

    transcript = ""
    total_seconds = 0
    for t in sorted_nicely(all_text):
        wav_path = join(fileFullPathname, t['idx'] + '.wav')
        print("Duration of file {} is {}".format(
            wav_path, math.ceil(get_duration(wav_path))))
        total_seconds += math.ceil(get_duration(wav_path))
        # Cool shortcut from:
        # https://stackoverflow.com/questions/775049/python-time-seconds-to-hms
        # to get hours, minutes and seconds
        m, s = divmod(total_seconds, 60)
        h, m = divmod(m, 60)
        # Format time as h:m:s followed by that segment's text
        transcript += "{:0>2d}:{:0>2d}:{:0>2d} {}\n".format(h, m, s, t['text'])

    with open("transcript.txt", "w", encoding='utf-8') as f:
        f.write(transcript)
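# convertWAVToTranscript above and convertFLACToTranscript below depend on
# sorted_nicely and get_duration, which are not defined in this file. A
# sketch, assuming sorted_nicely is the usual natural-sort recipe over the
# 'idx' field and get_duration returns an audio file's length in seconds
# (here via pydub, which needs ffmpeg for non-WAV input):
import re

from pydub import AudioSegment


def sorted_nicely(chunks):
    """Sort transcript chunks by their numeric 'idx' (1, 2, ..., 10, 11)."""
    def natural_key(chunk):
        return [int(part) if part.isdigit() else part
                for part in re.split(r'(\d+)', chunk['idx'])]
    return sorted(chunks, key=natural_key)


def get_duration(audio_path):
    """Return the duration of an audio file in seconds."""
    return len(AudioSegment.from_file(audio_path)) / 1000.0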
def transcribe_gcs(gcs_uri, hertz, channel):
    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,  # FLAC input
        sample_rate_hertz=int(hertz),       # match the audio file's sample rate
        audio_channel_count=int(channel),
        language_code='ja-JP',              # for Japanese audio
        enable_speaker_diarization=True,    # separate different speakers
        enable_automatic_punctuation=True,  # punctuation
        speech_contexts=SELECTED_PHRASES    # speech adaptation boost
    )

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    operationResult = operation.result()

    filename = gcs_uri.rsplit('/', 1)[1].split('.')[0] + ".txt"
    outputfilepath = os.path.join(OUTPUT_FOLDER, filename)
    fout = codecs.open(outputfilepath, 'a', 'utf-8')
    for result in operationResult.results:
        for alternative in result.alternatives:
            fout.write(u'{}\n'.format(alternative.transcript))
    fout.close()
def diarized_transcribe(gcred, gcs_uri, speakercount):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = gcred

    client = speech_v1p1beta1.SpeechClient()
    audio = beta_types.RecognitionAudio(uri=gcs_uri)
    config = beta_types.RecognitionConfig(
        encoding=beta_enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=speakercount,
        enable_word_time_offsets=True,
        model='video',
        enable_automatic_punctuation=True)

    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=3600)

    # The last result holds the speaker-tagged words for the whole file;
    # extract its word list as plain dicts.
    transcript = MessageToDict(response)
    transcript = transcript.get('results')
    transcript = transcript.pop()
    transcript = transcript.get('alternatives')
    transcript = transcript.pop()
    transcript = transcript.get('words')
    return transcript
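# Hypothetical usage of diarized_transcribe. The returned list holds plain
# dicts produced by MessageToDict, so keys are camelCase strings; the
# credential path and URI below are illustrative.
words = diarized_transcribe('key.json', 'gs://my-bucket/meeting.flac', 2)
for w in words:
    print(w['speakerTag'], w['word'])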
def run(self):
    """Called from [start]. Connects to service and begins streaming."""
    # Exit if stop event occurred.
    if self._stop_event.is_set():
        return

    # Create SSL channel.
    channel = self._create_channel()
    self.is_started = True

    # Open stream
    service = cloud_speech.SpeechClient(channel)
    streaming_config = types.StreamingRecognitionConfig(
        config=types.RecognitionConfig(
            enable_automatic_punctuation=self.punctuation,
            encoding=self.encoding,
            sample_rate_hertz=self.rate,
            language_code=self.language),
        interim_results=self.interim_results)
    try:
        request_stream = self._request_stream()
        resp_stream = service.streaming_recognize(
            streaming_config, request_stream)
        self._handle_results(resp_stream)
    finally:
        self.stop()
def get_client(lang='en-US', sample_rate=16000, interim_results=False,
               single_utterance=True, phrase_key=""):
    """ Helper to return client and config """
    client = SpeechClient()
    config = types.StreamingRecognitionConfig(
        config=types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sample_rate,
            language_code=lang,
            # Enhanced models are only available to projects that
            # opt in for audio data collection.
            use_enhanced=True,
            # A model must be specified to use enhanced model.
            model="command_and_search",
            speech_contexts=[
                types.SpeechContext(phrases=PhraseGenerator.get_phrases(
                    "app/config.json", phrase_key))
            ]),
        interim_results=interim_results,
        single_utterance=single_utterance)
    print(str(config))
    return client, config
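# get_client above assumes a PhraseGenerator helper that loads hint phrases
# from a JSON config. A minimal sketch of what such a helper might look like,
# under the assumption that the config maps phrase keys to lists of strings:
import json


class PhraseGenerator:
    @staticmethod
    def get_phrases(config_path, phrase_key):
        """Return the list of hint phrases stored under phrase_key."""
        with open(config_path) as f:
            config = json.load(f)
        return config.get(phrase_key, [])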
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code='en-US',
        profanity_filter=True,
        speech_contexts=[
            speech.types.SpeechContext(
                phrases=['Andy', 'Wisy', 'EEP', 'Project', 'Tom', 'Jeff'],
            )
        ],
    )

    operation = client.long_running_recognize(config, audio)
    print('GCS -- Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print('Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))
def audio_main():
    f = open(u"Nao_log.txt", u"a")
    f.write(u'##**************************** Audio Log File (Group 1) *********************************##')
    f.close()

    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = u'en-US'  # a BCP-47 language tag

    # If enabled, each word in the first alternative of each result will be
    # tagged with a speaker tag to identify the speaker.
    enable_speaker_diarization = True

    # Optional. Specifies the estimated number of speakers in the conversation.
    # diarization_speaker_count = 2

    client = speech_v1p1beta1.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_speaker_diarization=enable_speaker_diarization)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        while not stream.closed:
            sys.stdout.write(YELLOW)
            sys.stdout.write(u'\n' + str(STREAMING_LIMIT * stream.restart_counter) +
                             u': NEW REQUEST\n')

            stream.audio_input = []
            audio_generator = stream.generator()
            requests = (types.StreamingRecognizeRequest(audio_content=content)
                        for content in audio_generator)
            responses = client.streaming_recognize(streaming_config, requests)

            # Now, put the transcription responses to use.
            listen_print_loop(responses, stream)

            if stream.result_end_time > 0:
                stream.final_request_end_time = stream.is_final_end_time
            stream.result_end_time = 0
            stream.last_audio_input = stream.audio_input
            stream.audio_input = []
            stream.restart_counter = stream.restart_counter + 1

            if not stream.last_transcript_was_final:
                sys.stdout.write(u'\n')
            stream.new_stream = True
def google_transcribe(audio_file_name):
    file_name = filepath + audio_file_name
    frame_rate, channels = frame_rate_channel(file_name)
    if channels > 1:
        stereo_to_mono(file_name)

    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name
    upload_blob(bucket_name, source_file_name, destination_blob_name)

    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)

    print("Setting up configurations")
    speech_context = speech.types.SpeechContext(phrases=[
        "$OOV_CLASS_DIGIT_SEQUENCE", "$YEAR", "$PERCENT", "$MONEY", "$MONTH"
    ])
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        speech_contexts=[speech_context],
        use_enhanced=True,
        model="phone_call")

    # Detects speech in the audio file
    print("detecting speech")
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)

    # The last result carries the speaker-tagged words for the whole file.
    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    speaker = ""
    print("Assembling words")
    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:
            transcript += "speaker {}: {}".format(tag, speaker) + '\n'
            tag = word_info.speaker_tag
            speaker = "" + word_info.word
    transcript += "speaker {}: {}".format(tag, speaker)

    delete_blob(bucket_name, destination_blob_name)
    return transcript
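# The google_transcribe variants in this file lean on several helpers that
# are not shown: frame_rate_channel, stereo_to_mono, upload_blob, and
# delete_blob. A sketch under the assumption that the audio is WAV and that
# google-cloud-storage and pydub are installed:
import wave

from google.cloud import storage
from pydub import AudioSegment


def frame_rate_channel(audio_file_path):
    """Return (sample rate, channel count) of a WAV file."""
    with wave.open(audio_file_path, 'rb') as wav_file:
        return wav_file.getframerate(), wav_file.getnchannels()


def stereo_to_mono(audio_file_path):
    """Downmix a stereo WAV file to mono in place."""
    sound = AudioSegment.from_wav(audio_file_path)
    sound.set_channels(1).export(audio_file_path, format='wav')


def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to Cloud Storage."""
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    bucket.blob(destination_blob_name).upload_from_filename(source_file_name)


def delete_blob(bucket_name, blob_name):
    """Delete a blob from Cloud Storage."""
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    bucket.blob(blob_name).delete()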
def sub_main(profanityFilterBool):
    """
    *** Code taken from Google Cloud Speech to text documentation ***
    Turns on the profanity filter so bad words are censored and not printed
    """
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag

    sp_c_cico = {
        "phrases": ["Hey cico", "Hey Kiko"],
        "boost": 30.0
    }  # speech_contexts_cico
    sp_c_kiko = {
        "phrases": ["cico", "Cico", "kiko", "Kiko", "kygo", "Kitty, girl"],
        "boost": 0
    }
    movement_words = {
        "phrases": ["move", "feet", "forward", "right", "left", "backward",
                    "degrees", "radians", "to the left", "to the right"],
        "boost": 20.0
    }
    numbers = {
        "phrases": ["one", "two", "three", "four", "five", "six", "seven",
                    "eight", "nine", "ten"],
        "boost": 5.0
    }
    relevant_words = {
        "phrases": ["cornell cup robotics", "and", "pick up", "grab"],
        "boost": 10.0
    }
    speech_contexts = [sp_c_cico, sp_c_kiko, movement_words, relevant_words]

    client = speech_v1p1beta1.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_automatic_punctuation=True,
        speech_contexts=speech_contexts)
    streaming_config = types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        solution = returnResponseString(responses)  # solution is the result
        append_to_file("log.txt", str(solution))
        return solution
def convertFLACToTranscript(fileFullPathname, split_length_inSeconds):
    pool = Pool(16)  # Number of concurrent threads

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        # sample_rate_hertz=512000,
        language_code='gu-IN')

    files = [f for f in listdir(fileFullPathname)
             if isfile(join(fileFullPathname, f))]

    def transcribe(data):
        idx, file = data
        num, _ = file.split('.')
        text_script = ""
        print(file + " - started")

        # Loads the audio into memory
        with io.open(join(fileFullPathname, file), 'rb') as audio_file:
            content = audio_file.read()
            audio = types.RecognitionAudio(content=content)

        # Detects speech in the audio file
        client = speech.SpeechClient()
        response = client.recognize(config, audio)
        for result in response.results:
            text_script += result.alternatives[0].transcript
        print(file + " - done")
        return {"idx": num, "text": text_script}

    all_text = pool.map(transcribe, enumerate(files))
    pool.close()
    pool.join()

    transcript = ""
    total_seconds = 0
    for t in sorted_nicely(all_text):
        flac_path = join(fileFullPathname, t['idx'] + '.flac')
        total_seconds += math.ceil(get_duration(flac_path))
        # Cool shortcut from:
        # https://stackoverflow.com/questions/775049/python-time-seconds-to-hms
        # to get hours, minutes and seconds
        m, s = divmod(total_seconds, 60)
        h, m = divmod(m, 60)
        # Format time as h:m:s followed by that segment's text
        transcript += "{:0>2d}:{:0>2d}:{:0>2d} {}\n".format(h, m, s, t['text'])

    with open("transcript.txt", "w", encoding='utf-8') as f:
        f.write(transcript)
def transcribe_gcs(gcs_uri):
    from google.cloud import speech_v1p1beta1 as speech
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        sample_rate_hertz=32000,
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        language_code='ja-JP')

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    operationResult = operation.result()

    d = datetime.datetime.today()
    today = d.strftime("%Y%m%d-%H%M%S")
    fout = codecs.open('output{}.txt'.format(today), 'a', 'shift_jis')

    speaker_1_words = ""
    speaker_1_s = float(0)
    speaker_2_words = ""
    speaker_2_s = float(0)

    # The last result carries speaker-tagged words for the entire audio.
    for word in operationResult.results[-1].alternatives[0].words:
        tmp_word = u'{}'.format(word.word.split("|")[0])
        start_time = float(word.start_time.seconds) + \
            float(word.start_time.nanos) / 1000 / 1000 / 1000
        end_time = float(word.end_time.seconds) + \
            float(word.end_time.nanos) / 1000 / 1000 / 1000
        s = end_time - start_time
        if word.speaker_tag == 1:
            speaker_1_s += s
            speaker_1_words += tmp_word
        else:
            speaker_2_s += s
            speaker_2_words += tmp_word

    fout.write('speaker_1: \n{}\n'.format(speaker_1_words))
    fout.write('s: {}\n'.format(speaker_1_s))
    fout.write('speaker_2: \n{}\n'.format(speaker_2_words))
    fout.write('s: {}\n'.format(speaker_2_s))
    fout.close()
def google_transcribe(audio_file_name):
    file_name = filepath + audio_file_name
    m4a_to_wav(file_name)

    # The name of the audio file to transcribe
    frame_rate, channels = frame_rate_channel(file_name)
    if channels > 1:
        stereo_to_mono(file_name)

    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name
    upload_blob(bucket_name, source_file_name, destination_blob_name)

    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-IN',
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    # Detects speech in the audio file
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)

    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    speaker = ""
    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:
            transcript += f"speaker {tag}: {speaker}\n"
            tag = word_info.speaker_tag
            speaker = "" + word_info.word
    transcript += f"speaker {tag}: {speaker}"

    delete_blob(bucket_name, destination_blob_name)
    return transcript
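# The functions above and below also call format converters (m4a_to_wav,
# mp3_to_wav) that are not defined here. A sketch using pydub (requires
# ffmpeg); the convention of writing the .wav next to the source file is an
# assumption.
from pydub import AudioSegment


def m4a_to_wav(audio_file_path):
    """Convert an .m4a file to a .wav file alongside it."""
    sound = AudioSegment.from_file(audio_file_path, format='m4a')
    wav_path = audio_file_path.rsplit('.', 1)[0] + '.wav'
    sound.export(wav_path, format='wav')
    return wav_path


def mp3_to_wav(audio_file_path):
    """Convert an .mp3 file to a .wav file alongside it."""
    sound = AudioSegment.from_mp3(audio_file_path)
    wav_path = audio_file_path.rsplit('.', 1)[0] + '.wav'
    sound.export(wav_path, format='wav')
    return wav_path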
def transcribe_file(speech_file):
    """Transcribe the given audio file asynchronously."""
    from google.cloud import speech_v1p1beta1
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech_v1p1beta1.SpeechClient()

    # [START migration_async_request]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    print("Using {}, with the below config:".format(speech_file))
    print("")
    print("importing speech_v1p1beta1")
    print("language_code='en-US'")
    print("use_enhanced=True")
    print("enable_automatic_punctuation=False")
    print("enable_word_time_offsets=False")
    print("profanity_filter=True")
    print("sample_rate=44100hz")
    print("")
    print("Transcript is as follows")

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US',
        use_enhanced=True,
        # A model must be specified to use enhanced model.
        model='phone_call',
        enable_automatic_punctuation=False,
        enable_word_time_offsets=False,
        profanity_filter=True,
        # speech_contexts=[speech.types.SpeechContext(
        #     phrases=['Andy', 'Wisy', 'EEP', 'Project', 'Tom', 'Jeff'],
        # )],
    )

    # [START migration_async_response]
    operation = client.long_running_recognize(config, audio)
    # [END migration_async_request]

    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))
def transcribe_streaming(stream_file, encoding="LINEAR16", sample_rate=16000):
    client = speech.SpeechClient()

    with io.open(stream_file, 'rb') as audio_file:
        content = audio_file.read()

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = [content]
    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in stream)

    config = types.RecognitionConfig(
        encoding=ENCODINGS[encoding],
        sample_rate_hertz=sample_rate,
        language_code='ko-KR',
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        # Diarization is not supported for Korean (every speaker_tag comes
        # back tagged as the same speaker).
        enable_speaker_diarization=True,
        diarization_speaker_count=3)
    streaming_config = types.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    responses = client.streaming_recognize(streaming_config, requests)

    words_with_tags = []
    transcripts = []
    print("Waiting for transcribe...")
    for response in responses:
        for result in response.results:
            for alternative in result.alternatives:
                print(u'Transcript: {}'.format(alternative.transcript))
                # Keep the punctuated sentence for later use.
                transcripts.append(alternative.transcript)
                for words in alternative.words:
                    word = words.word
                    start_time = round(
                        words.start_time.seconds + words.start_time.nanos * 1e-9, 3)
                    end_time = round(
                        words.end_time.seconds + words.end_time.nanos * 1e-9, 3)
                    speaker_tag = words.speaker_tag
                    # [word, start_time, end_time, speaker_tag]
                    words_with_tags.append(
                        [word, start_time, end_time, speaker_tag])
    print()  # newline
    return words_with_tags, transcripts
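# Hypothetical usage of transcribe_streaming. ENCODINGS is referenced but not
# defined in this file; it is presumably a mapping from encoding names to
# enums.RecognitionConfig.AudioEncoding values, e.g.:
#
#   ENCODINGS = {'LINEAR16': enums.RecognitionConfig.AudioEncoding.LINEAR16,
#                'FLAC': enums.RecognitionConfig.AudioEncoding.FLAC}
#
words, sentences = transcribe_streaming('meeting.wav')
for word, start, end, speaker in words:
    print('{:>8.3f}-{:>8.3f} [{}] {}'.format(start, end, speaker, word))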
def process(self, loop):
    """
    Audio stream recognition and result parsing
    """
    # You can add speech contexts for better recognition
    cap_speech_context = types.SpeechContext(**self.context)
    metadata = types.RecognitionMetadata(**self.metadata)
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=self.encoding,
        sample_rate_hertz=self.rate,
        language_code=self.language,
        speech_contexts=[cap_speech_context],
        enable_automatic_punctuation=True,
        model=self.model,
        metadata=metadata)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=self.interim_results,
        single_utterance=self.single_utterance)

    audio_generator = self.stream_generator()
    requests = iter(
        types.StreamingRecognizeRequest(audio_content=content)
        for content in audio_generator)
    responses = client.streaming_recognize(streaming_config, requests)

    try:
        for response in responses:
            if self.terminated:
                break
            if not response.results:
                continue
            result = response.results[0]
            if not result.alternatives:
                continue

            speechData = MessageToDict(response)
            global_async_worker.add_task(self.async_callback(speechData))

            # debug
            transcript = result.alternatives[0].transcript
            print('>>', transcript, "(OK)" if result.is_final else "")
    except Exception as e:
        print('process excepted', e)
        self.start()
def sample_long_running_recognize(local_file_path):
    """
    Separates different speakers in a recorded audio file and prints the
    speaker tag for each word in the transcription.

    Args:
      local_file_path Path to local audio file, e.g. /path/audio.wav
    """
    client = speech_v1p1beta1.SpeechClient()
    # local_file_path = 'audio_files/2speaker.m4a'

    # If enabled, each word in the first alternative of each result will be
    # tagged with a speaker tag to identify the speaker.
    enable_speaker_diarization = True

    # Optional. Specifies the estimated number of speakers in the conversation.
    diarization_speaker_count = 2

    # The language of the supplied audio
    language_code = "en-US"
    config = types.RecognitionConfig(
        enable_speaker_diarization=enable_speaker_diarization,
        enable_automatic_punctuation=True,
        # diarization_speaker_count=diarization_speaker_count,
        # model='phone_call',
        language_code=language_code)

    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = types.RecognitionAudio(content=content)

    operation = client.long_running_recognize(config, audio)
    print(u"Waiting for operation to complete...")
    response = operation.result()
    print(response.results)

    for result in response.results:
        # First alternative has words tagged with speakers
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
        # Print the speaker_tag of each word
        for word in alternative.words:
            print(u"Word: {}".format(word.word))
            print(u"Speaker tag: {}".format(word.speaker_tag))
def gspeech_client(self):
    """Creates the Google Speech API client, configures it, and sends/gets
    audio/text data for parsing.
    """
    language_code = 'en-US'
    # Hints for the API
    context = types.SpeechContext(phrases=self.context)
    client = speech.SpeechClient()

    # Create metadata object, helps processing
    metadata = types.RecognitionMetadata()
    # Interaction Type:
    #     VOICE_SEARCH: Transcribe spoken questions and queries into text.
    #     VOICE_COMMAND: Transcribe voice commands, such as for controlling a device.
    metadata.interaction_type = (
        enums.RecognitionMetadata.InteractionType.VOICE_COMMAND)
    # Microphone Distance:
    #     NEARFIELD: The audio was captured from a closely placed microphone.
    #     MIDFIELD: The speaker is within 3 meters of the microphone.
    #     FARFIELD: The speaker is more than 3 meters away from the microphone.
    metadata.microphone_distance = (
        enums.RecognitionMetadata.MicrophoneDistance.MIDFIELD)
    # Device Type:
    #     PC: Speech was recorded using a personal computer or tablet.
    #     VEHICLE: Speech was recorded in a vehicle.
    #     OTHER_OUTDOOR_DEVICE: Speech was recorded outdoors.
    #     OTHER_INDOOR_DEVICE: Speech was recorded indoors.
    metadata.recording_device_type = (
        enums.RecognitionMetadata.RecordingDeviceType.PC)
    # Media Type:
    #     AUDIO: The speech data is an audio recording.
    #     VIDEO: The speech data originally recorded on a video.
    metadata.original_media_type = (
        enums.RecognitionMetadata.OriginalMediaType.AUDIO)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code=language_code,
        speech_contexts=[context],
        use_enhanced=True,
        model='command_and_search',
        metadata=metadata)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        single_utterance=False,
        interim_results=False)

    # Hack from Google Speech Python docs, very pythonic c:
    requests = (types.StreamingRecognizeRequest(audio_content=content)
                for content in self.generator())
    responses = client.streaming_recognize(streaming_config, requests)
    self._listen_print_loop(responses)
def __init__(self):
    self.client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_time_offsets=True,
        model='video',
        diarization_speaker_count=2,
        enable_automatic_punctuation=True,
        use_enhanced=True,
        enable_speaker_diarization=True,
        speech_contexts=[speech.types.SpeechContext(phrases=[])])
    self.streaming_config = types.StreamingRecognitionConfig(config=config)
def google_transcribe(uploaded_file_path):
    print("Converting: \t" + uploaded_file_path.split("/")[-1])
    wav_file_path = mp3_to_wav(uploaded_file_path)
    print("Converted: \t" + wav_file_path.split("/")[-1])

    print("Checking frame rate: \t", wav_file_path.split("/")[-1])
    frame_rate, channels = frame_rate_channel(wav_file_path)

    wav_name = wav_file_path.split("/")[-1]
    print("Uploading blob: \t", wav_name)
    upload_blob(bucket_name, wav_file_path, wav_name)

    print("Starting transcription: \t", wav_name)
    gcs_uri = 'gs://' + bucket_name + '/' + wav_name
    transcript = ''

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code=Language_code,
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    # Detects speech in the audio file
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)

    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    speaker = ""
    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:
            transcript += "speaker {}: {}".format(tag, speaker) + '\n'
            tag = word_info.speaker_tag
            speaker = "" + word_info.word
    transcript += "speaker {}: {}".format(tag, speaker)

    print("Deleting blob: \t", wav_name)
    delete_blob(bucket_name, wav_name)
    return transcript
def listen(self, language_code='ja-JP'):
    """Listen."""
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=self.rate,
        model=None,
        speech_contexts=[types.SpeechContext()],
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        single_utterance=True,
        interim_results=True)

    self.callbacks.get("ready", lambda: True)()
    with MicrophoneStream(self.rate, int(self.rate / 10)) as stream:
        self.callbacks.get("start", lambda: True)()
        while True:
            try:
                audio_generator = stream.generator()
                requests = (types.StreamingRecognizeRequest(audio_content=content)
                            for content in audio_generator)
                responses = client.streaming_recognize(streaming_config, requests)
                self.listen_print_loop(responses)
            except exceptions.OutOfRange:
                print("Time exceeded. (OutOfRange)")
            except exceptions.ServiceUnavailable:
                print("Connection closed. (ServiceUnavailable)")
            except KeyboardInterrupt:
                print("KeyboardInterrupt.")
                break
            except:
                print("Unexpected error:", sys.exc_info()[0])
                raise
        self.callbacks.get("end", lambda: True)()
def __init__(self, speakers, speaker_count, sample_rate, chunk,
             language_code, exit_command):
    self.speakers = speakers
    self.speaker_count = speaker_count
    self.sample_rate = sample_rate
    self.chunk = chunk
    self.language_code = language_code
    self.exit_command = exit_command
    self.client = speech.SpeechClient()
    self.recognition_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=self.sample_rate,
        language_code=self.language_code,
        enable_speaker_diarization=True,
        diarization_speaker_count=self.speaker_count)
    self.streaming_config = types.StreamingRecognitionConfig(
        config=self.recognition_config,
        interim_results=True)
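# The constructor above belongs to a class whose name is not shown in this
# file. A hypothetical instantiation; the class name and argument values are
# illustrative only.
transcriber = StreamingTranscriber(
    speakers=['Alice', 'Bob'],
    speaker_count=2,
    sample_rate=16000,
    chunk=1600,  # 100 ms of audio at 16 kHz
    language_code='en-US',
    exit_command='goodbye')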