def configureAPI(self):
    if self.title.find('.flac') != -1:
        with open(os.path.join(self.path, self.title), 'rb') as audio_file:
            content = audio_file.read()
        self.audio = types.RecognitionAudio(content=content)
    else:
        self.audio = types.RecognitionAudio(uri="gs://twiml-mp3/" + self.title + ".flac")
    self.config = types.RecognitionConfig(
        encoding=self.encoding,
        sample_rate_hertz=self.sample_rate,
        language_code=self.language_code,
        enable_automatic_punctuation=self.punctuation,
        enable_speaker_diarization=self.diarization,
        diarization_speaker_count=self.num_speakers,
        audio_channel_count=1,
        use_enhanced=self.enhanced,
        model=self.model,
        enable_word_time_offsets=self.time_offsets,
        enable_word_confidence=self.word_confidence,
        max_alternatives=self.max_alternatives,
        metadata=self.metadata,
        speech_contexts=[types.SpeechContext(phrases=self.phrases)])

def transcribe_gcs(gcs_uri, hertz, channel):
    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,  # FLAC input
        sample_rate_hertz=int(hertz),  # must match the audio file's sample rate
        audio_channel_count=int(channel),
        language_code='ja-JP',  # for Japanese audio
        enable_speaker_diarization=True,  # separate distinct speakers
        enable_automatic_punctuation=True,  # add punctuation
        speech_contexts=SELECTED_PHRASES)  # speech adaptation boost
    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    operationResult = operation.result()

    filename = gcs_uri.rsplit('/', 1)[1].split('.')[0] + ".txt"
    outputfilepath = os.path.join(OUTPUT_FOLDER, filename)
    fout = codecs.open(outputfilepath, 'a', 'utf-8')
    for result in operationResult.results:
        for alternative in result.alternatives:
            fout.write(u'{}\n'.format(alternative.transcript))
    fout.close()

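# Hypothetical usage of transcribe_gcs above. SELECTED_PHRASES and
# OUTPUT_FOLDER are the module-level globals the function relies on; the
# import, bucket path, and phrase values here are assumptions for
# illustration, not from the original source.
from google.cloud.speech import types  # one plausible import for this snippet

SELECTED_PHRASES = [types.SpeechContext(phrases=['Google Cloud', 'Speech-to-Text'])]
OUTPUT_FOLDER = 'transcripts'
transcribe_gcs('gs://my-bucket/meeting.flac', hertz=44100, channel=1)
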
def google_transcribe_file(fp, bucket_name='prof-resp-trans'):
    storage_client = google.cloud.storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    client = google.cloud.speech_v1p1beta1.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        language_code='en-US',
        enable_word_time_offsets=True,
        enable_automatic_punctuation=True)

    blob = bucket.blob(fp.name)
    if not blob.exists():
        new_print('Uploading File: {}'.format(fp.name))
        blob.upload_from_filename(str(fp.resolve()))
        new_print('Finished Uploading: {}'.format(fp.name))
    else:
        new_print('File already uploaded: {}'.format(fp.name))

    new_print('Starting transcription...')
    audio = types.RecognitionAudio(
        uri='gs://{}/{}'.format(bucket_name, fp.name))
    response = client.long_running_recognize(config, audio)
    results = response.result()
    new_print('Transcription finished')
    return results

def transcribe_file_with_diarization():
    audio = types.RecognitionAudio(uri=args['input'])
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        # encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=22050,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        model='phone_call')
    operation = client.long_running_recognize(config, audio)
    print("Waiting on response from google cloud...")
    response = operation.result(timeout=720)  # 360 for call 01

    for result in response.results:
        print("\n\n::BEGIN TRANSCRIPT::\n")
        print("{}".format(result.alternatives[0].transcript))
        print("\n::END TRANSCRIPT::\n\n")
        print("\t\tCONFIDENCE: {} \n\n".format(result.alternatives[0].confidence))
        print("::BEGIN SPEAKER DIARIZATION::\n")
        words_info = result.alternatives[0].words
        for word_info in words_info:
            print("{}: '{}'".format(word_info.speaker_tag, word_info.word))
        print("\n::END SPEAKER DIARIZATION::")

def read_audio(self, filepath):
    with io.open(filepath, "rb") as f:
        content = f.read()
    audio = types.RecognitionAudio(content=content)
    # audio = {"content": content}
    return audio

def diarized_transcribe(gcred, gcs_uri, speakercount):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = gcred
    client = speech_v1p1beta1.SpeechClient()
    audio = beta_types.RecognitionAudio(uri=gcs_uri)
    config = beta_types.RecognitionConfig(
        encoding=beta_enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=speakercount,
        enable_word_time_offsets=True,
        model='video',
        enable_automatic_punctuation=True)
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=3600)

    # The last result carries the full speaker-tagged word list; drill down
    # through results -> alternatives -> words.
    transcript = MessageToDict(response)
    transcript = transcript.get('results')
    transcript = transcript.pop()
    transcript = transcript.get('alternatives')
    transcript = transcript.pop()
    transcript = transcript.get('words')
    return transcript

def read(file_name, result_file, time=50):
    # Instantiates a client
    client = speech.SpeechClient()

    # The name of the audio file to transcribe
    file_name = os.path.join(os.path.dirname(__file__), file_name)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        audio_channel_count=2,
        sample_rate_hertz=44100,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        language_code='ko-KR')

    # Loads the audio into memory and transcribes it in fixed-size chunks so
    # each synchronous recognize() call stays small
    with io.open(file_name, 'rb') as audio_file:
        with io.open(result_file, 'w') as f:
            FILE_BYTE = 44100 * 2  # 44,100 samples/sec x 2 bytes per 16-bit sample
            read_byte = FILE_BYTE * time
            while True:
                content = audio_file.read(read_byte)
                if not content:
                    break
                audio = types.RecognitionAudio(content=content)
                response = client.recognize(config, audio)
                for result in response.results:
                    f.write(result.alternatives[0].transcript)
                    print(result.alternatives[0])
                    print('Transcript: {}'.format(result.alternatives[0].transcript))

def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code='en-US',
        profanity_filter=True,
        speech_contexts=[
            speech.types.SpeechContext(
                phrases=['Andy', 'Wisy', 'EEP', 'Project', 'Tom', 'Jeff'],
            )
        ],
    )

    operation = client.long_running_recognize(config, audio)
    print('GCS -- Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print('Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))

def transcribe_gcs(gcs_uri, hint_phrases, set_config):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    client = speech_v1p1beta1.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    # hint_phrase = []
    # set_config['enable_speaker_diarization'] = 'False'
    print(set_config.get('enable_automatic_punctuation'))

    # Set default values, check the dict for each key, and cast from str to
    # the proper type. The boolean defaults must be the strings 'True'/'False',
    # not bools, because eval() only accepts strings.
    config = types.RecognitionConfig(
        encoding=eval(set_config.get('encoding', 'enums.RecognitionConfig.AudioEncoding.FLAC')),
        sample_rate_hertz=int(set_config.get('sample_rate_hertz', 16000)),
        language_code=set_config.get('language_code', 'en-US'),
        enable_automatic_punctuation=eval(set_config.get('enable_automatic_punctuation', 'True')),
        enable_speaker_diarization=eval(set_config.get('enable_speaker_diarization', 'False')),
        diarization_speaker_count=int(set_config.get('diarization_speaker_count', 1)),
        speech_contexts=[speech_v1p1beta1.types.SpeechContext(phrases=hint_phrases)])

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result(timeout=900)
    return response

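# Example call for the eval()-based config loader above (the URI, hints, and
# values are hypothetical). Every entry in set_config must be a string, to
# match the str-to-type casts the function performs.
response = transcribe_gcs(
    'gs://my-bucket/interview.flac',
    hint_phrases=['diarization', 'Speech-to-Text'],
    set_config={
        'encoding': 'enums.RecognitionConfig.AudioEncoding.FLAC',
        'sample_rate_hertz': '44100',
        'language_code': 'en-US',
        'enable_automatic_punctuation': 'True',
        'enable_speaker_diarization': 'True',
        'diarization_speaker_count': '2',
    })
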
def google_transcribe(audio_file_name):
    file_name = filepath + audio_file_name
    frame_rate, channels = frame_rate_channel(file_name)
    if channels > 1:
        stereo_to_mono(file_name)

    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name
    upload_blob(bucket_name, source_file_name, destination_blob_name)

    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)

    print("Setting up configurations")
    speech_context = speech.types.SpeechContext(phrases=[
        "$OOV_CLASS_DIGIT_SEQUENCE", "$YEAR", "$PERCENT", "$MONEY", "$MONTH"
    ])
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        speech_contexts=[speech_context],
        use_enhanced=True,
        model="phone_call")

    # Detects speech in the audio file
    print("detecting speech")
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)
    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    speaker = ""
    print("Assembling words")
    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:
            # Speaker changed: flush the words collected for the previous tag,
            # then start accumulating for the new one.
            transcript += "speaker {}: {}".format(tag, speaker) + '\n'
            tag = word_info.speaker_tag
            speaker = "" + word_info.word
    transcript += "speaker {}: {}".format(tag, speaker)

    delete_blob(bucket_name, destination_blob_name)
    return transcript

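# The transcription functions above and below call helpers defined elsewhere
# (frame_rate_channel, stereo_to_mono, upload_blob, delete_blob). A minimal
# sketch of plausible implementations, assuming WAV input and the pydub and
# google-cloud-storage packages; these are assumptions, not the original code.
import wave
from pydub import AudioSegment
from google.cloud import storage

def frame_rate_channel(audio_file_name):
    # Read the sample rate and channel count from the WAV header.
    with wave.open(audio_file_name, 'rb') as wave_file:
        return wave_file.getframerate(), wave_file.getnchannels()

def stereo_to_mono(audio_file_name):
    # Downmix to mono in place; the diarization configs above send one channel.
    sound = AudioSegment.from_wav(audio_file_name)
    sound.set_channels(1).export(audio_file_name, format='wav')

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    # Upload a local file to the GCS bucket the transcription reads from.
    bucket = storage.Client().get_bucket(bucket_name)
    bucket.blob(destination_blob_name).upload_from_filename(source_file_name)

def delete_blob(bucket_name, blob_name):
    # Remove the uploaded audio once transcription is done.
    storage.Client().get_bucket(bucket_name).blob(blob_name).delete()
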
def transcribe_file(content):
    audio = types.RecognitionAudio(content=content)
    response = client.recognize(config, audio)
    try:
        text = response.results[0].alternatives[0].transcript
        return text[0].upper() + text[1:] + "?"
    except IndexError:
        return ""

def transcribe_gcs(gcs_uri):
    from google.cloud import speech_v1p1beta1 as speech
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        sample_rate_hertz=32000,
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        language_code='ja-JP')

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    operationResult = operation.result()

    d = datetime.datetime.today()
    today = d.strftime("%Y%m%d-%H%M%S")
    fout = codecs.open('output{}.txt'.format(today), 'a', 'shift_jis')

    speaker_1_words = ""
    speaker_1_s = float(0)
    speaker_2_words = ""
    speaker_2_s = float(0)
    for word in operationResult.results[-1].alternatives[0].words:
        tmp_word = u'{}'.format(word.word.split("|")[0])
        start_time = float(word.start_time.seconds) + float(word.start_time.nanos) / 1000 / 1000 / 1000
        end_time = float(word.end_time.seconds) + float(word.end_time.nanos) / 1000 / 1000 / 1000
        s = end_time - start_time
        if word.speaker_tag == 1:
            speaker_1_s += s
            speaker_1_words += tmp_word
        else:
            speaker_2_s += s
            speaker_2_words += tmp_word

    fout.write('speaker_1: \n{}\n'.format(speaker_1_words))
    fout.write('s: {}\n'.format(speaker_1_s))
    fout.write('speaker_2: \n{}\n'.format(speaker_2_words))
    fout.write('s: {}\n'.format(speaker_2_s))
    # for result in operationResult.results:
    #     for alternative in result.alternatives:
    #         fout.write(u'{}\n'.format(alternative.transcript))
    fout.close()

def transcribe_file(speech_file):
    """Transcribe the given audio file asynchronously."""
    from google.cloud import speech_v1p1beta1
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech_v1p1beta1.SpeechClient()

    # [START migration_async_request]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    print("Using", speech_file, ", with the below config:")
    print("")
    print("importing speech_v1p1beta1")
    print("language_code='en-US'")
    print("use_enhanced=True")
    print("enable_automatic_punctuation=False")
    print("enable_word_time_offsets=False")
    print("profanity_filter=True")
    print("sample_rate=44100hz")
    print("")
    print("Transcript is as follows")

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US',
        use_enhanced=True,
        # A model must be specified to use enhanced model.
        model='phone_call',
        enable_automatic_punctuation=False,
        enable_word_time_offsets=False,
        profanity_filter=True,
        # speech_contexts=[speech.types.SpeechContext(
        #     phrases=['Andy', 'Wisy', 'EEP', 'Project', 'Tom', 'Jeff'],
        # )],
    )

    # [START migration_async_response]
    operation = client.long_running_recognize(config, audio)
    # [END migration_async_request]

    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))

def google_transcribe(audio_file_name):
    file_name = filepath + audio_file_name
    m4a_to_wav(file_name)

    # The name of the audio file to transcribe
    frame_rate, channels = frame_rate_channel(file_name)
    if channels > 1:
        stereo_to_mono(file_name)

    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name
    upload_blob(bucket_name, source_file_name, destination_blob_name)

    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-IN',
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    # Detects speech in the audio file
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)
    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    speaker = ""
    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:
            transcript += f"speaker {tag}: {speaker}\n"
            tag = word_info.speaker_tag
            speaker = "" + word_info.word
    transcript += f"speaker {tag}: {speaker}"

    delete_blob(bucket_name, destination_blob_name)
    return transcript

def sample_long_running_recognize(local_file_path):
    """
    Print the confidence level for individual words in a transcription of a
    short audio file, separating the different speakers in the recording.

    Args:
      local_file_path: Path to local audio file, e.g. /path/audio.wav
    """
    client = speech_v1p1beta1.SpeechClient()
    # local_file_path = 'audio_files/2speaker.m4a'

    # If enabled, each word in the first alternative of each result will be
    # tagged with a speaker tag to identify the speaker.
    enable_speaker_diarization = True

    # Optional. Specifies the estimated number of speakers in the conversation.
    diarization_speaker_count = 2

    # The language of the supplied audio
    language_code = "en-US"
    config = types.RecognitionConfig(
        enable_speaker_diarization=enable_speaker_diarization,
        enable_automatic_punctuation=True,
        # diarization_speaker_count=diarization_speaker_count,
        # model='phone_call',
        language_code='en-US')

    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = types.RecognitionAudio(content=content)

    operation = client.long_running_recognize(config, audio)
    print(u"Waiting for operation to complete...")
    response = operation.result()
    print(response.results)

    for result in response.results:
        # First alternative has words tagged with speakers
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
        # Print the speaker_tag of each word
        for word in alternative.words:
            print(u"Word: {}".format(word.word))
            print(u"Speaker tag: {}".format(word.speaker_tag))

def transcribe_gcs(gcs_uri: str) -> Dict[str, List[Any]]:
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    client = speech_v1p1beta1.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    operation = client.long_running_recognize(RECOGNITION_CONFIG, audio)
    print(dt.now(), "Waiting for operation to complete...")
    print(dt.now(), "Operation", operation.operation)
    start = time()
    response = operation.result(timeout=90000)
    print(dt.now(), "Got response in ", time() - start)
    return MessageToDict(response)

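# RECOGNITION_CONFIG above is a module-level constant not shown in this
# collection; one plausible sketch of it (an assumption, not the original):
from google.cloud.speech_v1p1beta1 import enums, types

RECOGNITION_CONFIG = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
    language_code='en-US',
    enable_speaker_diarization=True,
    enable_automatic_punctuation=True)
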
def google_transcribe(uploaded_file_path):
    print("Converting: \t" + uploaded_file_path.split("/")[-1])
    wav_file_path = mp3_to_wav(uploaded_file_path)
    print("Converted: \t" + wav_file_path.split("/")[-1])

    print("Checking frame rate: \t", wav_file_path.split("/")[-1])
    frame_rate, channels = frame_rate_channel(wav_file_path)

    wav_name = wav_file_path.split("/")[-1]
    print("Uploading blob: \t", wav_name)
    upload_blob(bucket_name, wav_file_path, wav_name)

    print("Starting transcription: \t", wav_name)
    gcs_uri = 'gs://' + bucket_name + '/' + wav_name
    transcript = ''

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code=Language_code,
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    # Detects speech in the audio file
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)
    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    speaker = ""
    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:
            transcript += "speaker {}: {}".format(tag, speaker) + '\n'
            tag = word_info.speaker_tag
            speaker = "" + word_info.word
    transcript += "speaker {}: {}".format(tag, speaker)

    print("Deleting blob: \t", wav_name)
    delete_blob(bucket_name, wav_name)
    return transcript

def transcribe_interviews(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech_v1p1beta1 as speech
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,  # this is for WAV files, you can use multiple types
        sample_rate_hertz=44100,
        language_code='en-US',
        # use_enhanced=True,  # can only use if your ethics allows sending data offsite
        enable_word_time_offsets=True,
        enable_automatic_punctuation=True,
        model='video')  # lowercase 'video' is what the API expects; change this if you're doing focus groups

    operation = client.long_running_recognize(config, audio)
    print('A little man is now listening and transcribing...')
    response = operation.result(timeout=90000000)

    f = open("Interview 1.txt", "w")  # can change the file name that the text gets written to
    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        f.write(str(i) + '\n')
        f.write('{}'.format(
            time.strftime('%H:%M:%S',
                          time.gmtime(int(alternative.words[0].start_time.seconds)))))
        f.write(' --> ')
        f.write('{}'.format(
            time.strftime('%H:%M:%S',
                          time.gmtime(int(alternative.words[-1].end_time.seconds)))) + '\n')
        # f.write('speaker {} :'.format(alternative.words[0].speaker_tag))
        f.write(u'{}'.format(alternative.transcript) + '\n\n')
    f.close()

def transcribe_gcs(gcs_uri, encoding="LINEAR16", sample_rate=16000):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=ENCODINGS[encoding],
        sample_rate_hertz=sample_rate,
        language_code='ko-KR',
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        enable_speaker_diarization=True,  # not supported for Korean (every speaker_tag comes back as the same speaker)
        diarization_speaker_count=3)

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result(timeout=300)

    words_with_tags = []
    transcripts = []
    # Each result is for a consecutive portion of the audio.
    # Iterate through them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        # Keep the punctuated sentence for later use.
        transcripts.append(result.alternatives[0].transcript)

        for words in result.alternatives[0].words:
            word = words.word
            start_time = round(words.start_time.seconds + words.start_time.nanos * 1e-9, 3)
            end_time = round(words.end_time.seconds + words.end_time.nanos * 1e-9, 3)
            speaker_tag = words.speaker_tag
            words_with_tags.append([word, start_time, end_time, speaker_tag])
        print()

    return words_with_tags, transcripts

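# A small follow-up sketch (not in the original): collapse the
# [word, start_time, end_time, speaker_tag] list returned above into
# per-speaker utterances.
def group_by_speaker(words_with_tags):
    groups, current_tag, current_words = [], None, []
    for word, start_time, end_time, speaker_tag in words_with_tags:
        if speaker_tag != current_tag and current_words:
            # Speaker changed: flush the accumulated utterance.
            groups.append((current_tag, ' '.join(current_words)))
            current_words = []
        current_tag = speaker_tag
        current_words.append(word)
    if current_words:
        groups.append((current_tag, ' '.join(current_words)))
    return groups
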
def run_speaker_diarization(self, audio_uri, audio_ch, audio_sr, max_speakers):
    logger.info('Performing Speaker Diarization for {}'.format(audio_uri))
    drzr_config = types.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        max_speaker_count=max_speakers)
    config = speech.types.RecognitionConfig(
        language_code="en-US",
        sample_rate_hertz=int(audio_sr),
        encoding=enums.RecognitionConfig.AudioEncoding.MP3,
        audio_channel_count=int(audio_ch),
        enable_word_time_offsets=True,
        model="video",
        enable_automatic_punctuation=False,
        diarization_config=drzr_config)
    audio_file = types.RecognitionAudio(uri=audio_uri)
    operation = self.client.long_running_recognize(config=config, audio=audio_file)
    res = operation.result()
    return res

def long_transcribe_gcs(self, gcs_uri):
    print("Transcribing... (This may take a while)")
    audio = types.RecognitionAudio(uri=gcs_uri)
    operation = self.config.client.long_running_recognize(self.config.recognition_config, audio)
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    text = ""
    for result in response.results:
        speaker_num = result.alternatives[0].words[0].speaker_tag
        speaker = self.config.speakers[speaker_num - 1]
        text += speaker + result.alternatives[0].transcript + '\n'
        # The first alternative is the most likely one for this portion.
        # print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        # print('Confidence: {}'.format(result.alternatives[0].confidence))
    return text

def speech_to_text(gcs_uri):
    # ltt_context = open('context.txt', 'r').read().split('\n')
    client = speech_v1p1beta1.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    # speech_contexts_element = {"phrases": ltt_context, "boost": 11}
    # speech_contexts = [speech_contexts_element]
    config = {
        "encoding": enums.RecognitionConfig.AudioEncoding.MP3,
        "sample_rate_hertz": 48000,
        "language_code": 'en-US',
        # "speech_contexts": speech_contexts,
        "max_alternatives": 11,
        "model": "video",
        "enable_word_confidence": True,
        "enable_word_time_offsets": True,
        "enable_automatic_punctuation": True
    }
    operation = client.long_running_recognize(config, audio)
    print('Speech-to-Text running.')
    response = operation.result()
    return response

def transcribe_audio():
    I = 4
    try:
        alter_db(I)
    except Exception as e:
        print(e)
    conn, c = get_db(databases[I])
    conn.execute("SELECT %s FROM %s LIMIT 1 " % (database_keys[I], database_names[I]))
    rows = conn.fetchall()
    for row in rows:
        # print(row)
        audio_file = "%s%s/%s.wav" % (DIR, FILE_NAMES[I], row[0])
        audio_file = copy_files(audio_file, row[0])
        print(audio_file)
        if os.path.isfile(audio_file):
            with io.open(audio_file, 'rb') as audio_file2:
                content = audio_file2.read()
                audio_data = types.RecognitionAudio(content=content)
                # try:
                response = client.recognize(config, audio_data)
                transcript = "-1"
                confidence = 0
                if len(response.results):
                    result = response.results[0]
                    if result.alternatives:
                        transcript = result.alternatives[0].transcript
                        confidence = result.alternatives[0].confidence
                print(response)
                # score = get_jaccard_sim()
                sql = "UPDATE " + database_names[I] + " SET transcript=?,confidence=? WHERE Filename=? "
                print(sql)
                conn.execute(sql, (transcript, confidence, row[0]))
                c.commit()

def wordTimeOffsets(filename, phrases, flag):
    client = speech.SpeechClient()
    if flag == 0:
        speech_file = "/home/malkaiv/project/final/recordings/wav/" + str(filename) + ".wav"
    else:
        print("in flag = 1")
        speech_file = "/home/malkaiv/project/final/recordings/wordsWav/" + str(filename) + ".wav"

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)
    with wave.open(speech_file, 'rb') as wave_file:
        frame_rate = wave_file.getframerate()

    # speechText = "please call Stella ask her to bring these things with her from the store \
    # six spoons of fresh snow peas five thick slabs of blue cheese and maybe a snack for her \
    # brother Bob we also need a small plastic snake and a big toy frog for the kids she can \
    # scoop these things into three red bags and we will go meet her Wednesday at the train station"
    # phrases = speechText.lower().split()
    boost = 20.0
    speech_contexts_element = {"phrases": phrases, "boost": boost}
    speech_contexts = [speech_contexts_element]
    config = types.RecognitionConfig(
        speech_contexts=speech_contexts,
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code="en-US",
        enable_word_time_offsets=True,
        enable_word_confidence=True,
    )
    # print(client.recognize(config=config, audio=audio))
    # first = response.results[0].alternatives[0]
    # print("{}\n{}".format(first.transcript, first.confidence))
    return client.recognize(config=config, audio=audio)

def transcribe(data):
    idx, file = data
    num, _ = file.split('.')
    text_script = ""
    name = file
    print(file + " - started")

    # Loads the audio into memory
    with io.open(fileFullPathname + '\\' + file, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    # Transcribe audio file: detects speech in the audio file
    client = speech.SpeechClient()
    response = client.recognize(config, audio)
    for result in response.results:
        text_script += result.alternatives[0].transcript

    print(name + " - done")
    return {"idx": num, "text": text_script}

def transcribe_gcs(gcs_uri, phrase_hints=[], language_code="en-US"):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    client = speech.SpeechClient()
    phrases = phrase_hints

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code=language_code,
        enable_word_time_offsets=True,
        model='video',
        diarization_speaker_count=2,
        enable_automatic_punctuation=True,
        use_enhanced=True,
        enable_speaker_diarization=True,
        speech_contexts=[speech.types.SpeechContext(phrases=phrases)])

    operation = client.long_running_recognize(config, audio)
    transcription_response = operation.result(timeout=90000)
    return transcription_response.results

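# Hypothetical call of the enhanced-model variant above (the bucket, file,
# and hints are made-up values, not from the original source):
results = transcribe_gcs('gs://my-bucket/panel_discussion.wav',
                         phrase_hints=['Kubernetes', 'TensorFlow'],
                         language_code='en-US')
for result in results:
    print(result.alternatives[0].transcript)
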
def transcribe_streaming_from_file(speech_file):
    """Transcribe the given audio file."""
    from google.cloud import speech_v1p1beta1 as speech
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech.SpeechClient()

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='en-US')
    response = client.recognize(config, audio)

    # Each result is for a consecutive portion of the audio; this returns the
    # most likely alternative of the first portion only.
    for result in response.results:
        return result.alternatives[0].transcript

def translate_with_timestamps(self, gs_uri, encoding, mode, hint):
    audio = types.RecognitionAudio(uri=gs_uri)
    config = types.RecognitionConfig(
        encoding=encoding,
        language_code=mode,
        enable_word_time_offsets=True,
        speech_contexts=[types.SpeechContext(phrases=hint)],
        enable_word_confidence=True)
    operation = self.client.long_running_recognize(config=config, audio=audio)

    results = []
    for result in operation.result().results:
        alternatives = result.alternatives
        if not alternatives:
            continue
        alternative = alternatives[0]
        results.append([alternative.transcript, alternative.confidence])
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time.seconds + word_info.start_time.nanos * 1e-9
            end_time = word_info.end_time.seconds + word_info.end_time.nanos * 1e-9
            confidence = word_info.confidence
            results.append([word, start_time, end_time, confidence])
    return results

def transcribe_file(speech_file):
    """Transcribe the given audio file asynchronously."""
    from google.cloud import speech_v1p1beta1
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech_v1p1beta1.SpeechClient()

    # [START migration_async_request]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    print('Using ', speech_file, ', with the below config:')
    print("")
    print("importing speech_v1p1beta1")
    print("language_code='en-US'")
    print("use_enhanced=True")
    print("enable_automatic_punctuation=False")
    print("enable_word_time_offsets=False")
    print("profanity_filter=True")
    print("sample_rate=44100hz")
    print("")
    print("Transcript is as follows")
    print("Trans_Output_" + foldernametime)

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US',
        use_enhanced=True,
        # A model must be specified to use enhanced model.
        model='phone_call',
        enable_automatic_punctuation=False,
        enable_word_time_offsets=False,
        profanity_filter=True,
        # speech_contexts=[speech.types.SpeechContext(
        #     phrases=['Andy', 'Wisy', 'EEP', 'Project', 'Tom', 'Jeff'],
        # )],
    )

    # [START migration_async_response]
    operation = client.long_running_recognize(config, audio)
    # [END migration_async_request]

    os.chdir("Trans_Output_" + foldernametime)
    with open("output_transcription.txt", "a") as myfile:
        # print('File -- before write file')
        # myfile.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
        print('File -- Waiting for operation to complete...')
        response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print('Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))
        with open("output_transcription.txt", "a") as myfile:
            myfile.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S -"))
            myfile.write(' {}'.format(result.alternatives[0].transcript) + "\n")
            # myfile.write('Confidence: {}'.format(result.alternatives[0].confidence) + "\n")

    with open("output_transcription.txt", "a") as myfile:
        myfile.write('')
    # [END migration_async_response]
    exit()

async def speech_to_text(queue):
    """
    This function implements the translation from speech to text with online
    and offline services, and computes the emotion related to the speech.
    :param queue: process shared queue
    """
    kb_client = kb.KnowledgeBaseClient(False)
    kb_ID = (kb_client.register())['details']
    kb_client.registerTags(kb_ID, {
        'AV_IN_TRANSC_EMOTION': {
            'desc': 'text from audio',
            'doc': """```json\n{\n\t"tag": 'AV_IN_TRANSC_EMOTION',\n\t"timestamp": int,\n\t"ID": int,\n\t"text": string,\n\t"language": string,\n\t"valence": float,\n\t"arousal": float\n}```"""
        }
    })

    # Create new recognizers for all the services used
    r = sr.Recognizer()
    google_client = None
    try:
        google_client = speech.SpeechClient()
    except exceptions.DefaultCredentialsError as e:
        log.error("Failed to authenticate with Google Cloud Speech recognition" + str(e))
    except Exception:
        log.error("Unexpected error. Failed to authenticate with Google Cloud Speech recognition:" + str(sys.exc_info()[0]))

    with ThreadPoolExecutor() as executor:
        while True:
            # Data stored in the queue contain all the information needed to
            # create an AudioData object
            timestamp, channels, sampleRate, bitPerSample, data = await queue.get()
            audio = sr.AudioData(data, sampleRate, bitPerSample // 8)  # sample width must be an int
            audio_gc = types.RecognitionAudio(content=data)

            # Compute the transcription of the audio
            google_cloud = executor.submit(recognize, "google-cloud", audio_gc, google_client)
            google = executor.submit(recognize, "google", audio, r)
            sphinx = executor.submit(recognize, "sphinx", audio, r)
            # Compute the emotion related to the audio
            # emotion = executor.submit(sentimental_analizer.emotion_from_speech, sampleRate, audio, log)

            res = google_cloud.result()
            if res["error"] is None:
                # Add to KB the Google Cloud speech recognition result with timestamp and ID
                log.info("Insert into KB --> Google cloud speech recognition result: " + str(res["text"]))
            else:
                log.error("Google cloud speech recognition returned an error: " + str(res["error"]))

            res = google.result()
            if res["error"] is None:
                # Add to KB the Google result with timestamp and ID
                log.info("Insert into KB --> Google result: " + str(res["text"]))
            else:
                log.error("Google returned an error: " + str(res["error"]))

            res = sphinx.result()
            if res["error"] is None:
                # Add to KB the Sphinx result with timestamp and ID
                log.info("Insert into KB --> Sphinx result: " + str(res["text"]))
            else:
                log.error("Sphinx returned an error: " + str(res["error"]))

            emotion = {"valence": 1, "arousal": 1}  # emotion.result()
            myID = 'stt'
            if res["error"] is None:
                # Add the transcription of the audio to the KB
                kb_client.addFact(kb_ID, 'AV_IN_TRANSC_EMOTION', 1, 100,
                                  {"tag": 'AV_IN_TRANSC_EMOTION',
                                   "timestamp": timestamp,
                                   "ID": timestamp,
                                   "text": res["text"],
                                   "language": res["lang"],
                                   "valence": emotion["valence"],
                                   "arousal": emotion["arousal"]})
                # TODO adjust "text_f_audio", 2, 50, 'false'
            else:
                # Record in the KB that neither Google nor Sphinx returned a result
                log.critical("Insert into KB that no Google or Sphinx result")
                kb_client.addFact(kb_ID, 'AV_IN_TRANSC_EMOTION', 1, 100,
                                  {"tag": 'AV_IN_TRANSC_EMOTION',
                                   "timestamp": timestamp,
                                   "ID": timestamp,
                                   "text": "",
                                   "language": res["lang"],
                                   "valence": emotion["valence"],
                                   "arousal": emotion["arousal"]})