def transcribe_file_with_auto_punctuation():
    """Transcribe the given audio file with auto punctuation enabled."""
    # [START speech_transcribe_auto_punctuation_beta]
    import io

    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        # Enable automatic punctuation
        enable_automatic_punctuation=True,
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print(u"First alternative of result {}".format(i))
        print(u"Transcript: {}".format(alternative.transcript))
def transcribe_file_with_multichannel():
    """Transcribe the given audio file synchronously with multi channel."""
    # [START speech_transcribe_multichannel_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/Google_Gnome.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        audio_channel_count=1,
        enable_separate_recognition_per_channel=True,
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print("First alternative of result {}".format(i))
        print(u"Transcript: {}".format(alternative.transcript))
        print(u"Channel Tag: {}".format(result.channel_tag))
def google_transcribe(audio_file_name):
    file_name = filepath + audio_file_name
    # mp3_to_wav(file_name)

    # The name of the audio file to transcribe
    frame_rate, channels = frame_rate_channel(file_name)

    if channels > 1:
        stereo_to_mono(file_name)

    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name

    upload_blob(bucket_name, source_file_name, destination_blob_name)

    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''

    credential_path = "/home/asheeshg01/Speech-f22e193c0063.json"
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    # Detects speech in the audio file
    # operation = client.long_running_recognize(config, audio)
    operation = client.long_running_recognize(request={"config": config, "audio": audio})
    response = operation.result(timeout=10000)

    # The last result's words list carries speaker tags for all words; walk it
    # and group consecutive words by speaker tag.
    result = response.results[-1]
    words_info = result.alternatives[0].words
    tag = 1
    speaker = ""

    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:
            transcript += "speaker {}: {}".format(tag, speaker) + '\n'
            tag = word_info.speaker_tag
            speaker = "" + word_info.word

    transcript += "speaker {}: {}".format(tag, speaker)

    # for result in response.results:
    #     transcript += result.alternatives[0].transcript

    delete_blob(bucket_name, destination_blob_name)
    return transcript
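# google_transcribe and the google_word_details variants below rely on helper
# functions (frame_rate_channel, stereo_to_mono, upload_blob, delete_blob) and
# module globals (filepath, bucketname) that are not shown. The following is a
# minimal sketch of those helpers, assuming the stdlib `wave` module plus the
# `pydub` and `google-cloud-storage` packages; the names and signatures are
# reconstructions, not the original code.
import wave

from google.cloud import storage
from pydub import AudioSegment


def frame_rate_channel(audio_file_name):
    # Read the sample rate and channel count from the WAV header.
    with wave.open(audio_file_name, "rb") as wave_file:
        frame_rate = wave_file.getframerate()
        channels = wave_file.getnchannels()
    return frame_rate, channels


def stereo_to_mono(audio_file_name):
    # Downmix a stereo file to mono in place.
    sound = AudioSegment.from_wav(audio_file_name)
    sound = sound.set_channels(1)
    sound.export(audio_file_name, format="wav")


def upload_blob(bucket_name, source_file_name, destination_blob_name):
    # Upload a local file to the given GCS bucket.
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)


def delete_blob(bucket_name, blob_name):
    # Remove the temporary object once transcription is done.
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.delete()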
def google_word_details(audio_file_name):
    file_name = filepath + audio_file_name
    second_lang = "hi-IN"

    frame_rate, channels = frame_rate_channel(file_name)

    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name

    upload_blob(bucket_name, source_file_name, destination_blob_name)

    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    word_details = ''

    credential_path = s.get("credential_path")
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        alternative_language_codes=[second_lang],
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        enable_word_time_offsets=True)

    # Detects speech in the audio file
    # operation = client.long_running_recognize(config, audio)
    operation = client.long_running_recognize(request={
        "config": config,
        "audio": audio
    })
    response = operation.result(timeout=10000)

    # The last result's words list has timing and speaker info for every word.
    result = response.results[-1]
    words_info = result.alternatives[0].words

    for word_info in words_info:
        word = word_info.word
        start_time = word_info.start_time
        end_time = word_info.end_time
        speaker = word_info.speaker_tag
        word_details += " Word: {} : start_time: {}: end_time: {}: speaker {}".format(
            word, start_time.total_seconds(), end_time.total_seconds(), speaker)

    # Write the word details next to the audio object in the same bucket.
    # Use a separate variable for the Bucket object so bucket_name keeps
    # holding the bucket's name (the original shadowed it).
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    word_details_filename = audio_file_name.split('.')[0] + '_word_details' + '.txt'
    blob_word_details_file = bucket.blob(word_details_filename)
    blob_word_details_file.upload_from_string(word_details)

    # delete_blob(bucket_name, destination_blob_name)
    return word_details
def get_transcripts_json(gcstorage_path,
                         lang,
                         phrase_hints=[],
                         speaker_count=1,
                         enhanced_model=None):
    # transcribes audio files

    def _jsonify(res):
        # helper func for simplifying gcp speech client response
        json = []
        for section in res.results:
            data = {
                'transcript': section.alternatives[0].transcript,
                'words': []
            }
            # fixed: the response field is `alternatives`, not `alternative`
            for word in section.alternatives[0].words:
                data['words'].append({
                    'word': word.word,
                    'start_time': word.start_time.total_seconds(),
                    'end_time': word.end_time.total_seconds(),
                    'speaker_tag': word.speaker_tag
                })
            json.append(data)
        return json

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcstorage_path)

    diarize = speaker_count if speaker_count > 1 else False
    print(f"Diarizing: {diarize}")
    diarizationConfig = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=speaker_count if speaker_count > 1 else False,
    )

    # if eng only, can use the optimized video model
    if lang == 'en':
        enhanced_model = 'video'

    config = speech.RecognitionConfig(
        # fixed: the RecognitionConfig field is `language_code`, not `lang_code`
        language_code='en-US' if lang == 'en' else lang,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        speech_contexts=[{
            'phrases': phrase_hints,
            'boost': 15
        }],
        diarization_config=diarizationConfig,
        profanity_filter=True,
        use_enhanced=True if enhanced_model else False,
        model='video' if enhanced_model else None)

    res = client.long_running_recognize(config=config, audio=audio).result()
    return _jsonify(res)
def transcribe_file_with_multilanguage(files_path=r'D:/dirname'):
    client = speech.SpeechClient()

    first_lang = "fr-FR"
    # second_lang = "cmn-Hans-CN"

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        model='command_and_search',
        enable_automatic_punctuation=True,
        sample_rate_hertz=16000,
        # audio_channel_count=2,
        # enable_speaker_diarization=True,
        language_code=first_lang,
        # alternative_language_codes=[second_lang],
        # model="video",
    )

    for file_name in os.listdir(files_path):
        speech_file = os.path.join(files_path, file_name)
        outputfile = os.path.splitext(file_name)[0] + '.txt'
        outputfile = os.path.join(files_path, outputfile)
        if os.path.splitext(speech_file)[-1] != '.mp3':
            continue
        if os.path.exists(outputfile):
            print(speech_file + ' already transcribed in ' + outputfile)
            continue
        print(speech_file)

        with open(speech_file, "rb") as audio_file:
            content = audio_file.read()
        audio = speech.RecognitionAudio(content=content)
        # gcs_uri = "gs://pathname.mp3"
        # audio = speech.RecognitionAudio(uri=gcs_uri)

        # recognize() is synchronous; for long audio use the async API instead:
        # operation = client.long_running_recognize(config=config, audio=audio)
        # response = operation.result(timeout=30)
        response = client.recognize(config=config, audio=audio)
        # print(response.results)

        print('saving to ' + outputfile)
        with open(outputfile, 'w', encoding='utf-8') as out:
            for i, result in enumerate(response.results):
                alternative = result.alternatives[0]
                print("-" * 20)
                print(u"First alternative of result {}: {}".format(i, alternative))
                print(u"Transcript: {}".format(alternative.transcript))
                out.write(alternative.transcript)
                out.write('\n')
def transcribe_file(speech_file, num_speakers):
    """Transcribe the given audio file asynchronously."""
    # Imports the Google Cloud client library
    # from google.cloud import speech
    from google.cloud import speech_v1p1beta1 as speech

    # Instantiates a client
    client = speech.SpeechClient()

    # The name of the audio file to transcribe (unused; speech_file is used instead)
    # file_name = os.path.join(os.path.dirname(__file__), "resources", "audio.raw")

    # Loads the audio into memory
    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    # Construct a recognition metadata object
    # (note: it is built here but never attached to the config below)
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.OTHER_INDOOR_DEVICE
    )
    metadata.audio_topic = "court trial hearing"
    metadata.original_mime_type = "audio/mp3"

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=num_speakers,
        # Enhanced models cost more than standard models.
        use_enhanced=True,
        model="video",
        enable_word_time_offsets=True,
    )

    # Detects speech in the audio file -- short audio file
    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    result = response.results[-1]
    words_info = result.alternatives[0].words

    # Printing out the output:
    for word_info in words_info:
        print(
            u"word: '{}', speaker_tag: {}, start_time:{}, end_time:{}".format(
                word_info.word, word_info.speaker_tag,
                word_info.start_time.total_seconds(),
                word_info.end_time.total_seconds())
        )
def speech_to_text(gcs_URI, keypath):
    # Reference: https://cloud.google.com/speech-to-text/docs/async-recognize
    # Example episode: https://www.listennotes.com/e/p/ea09b575d07341599d8d5b71f205517b/

    # Set up credentials from local keypath
    credentials = service_account.Credentials.from_service_account_file(keypath)

    audio = speech.RecognitionAudio(uri=gcs_URI)
    config = speech.RecognitionConfig(
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=16000,
    )

    client = speech.SpeechClient(credentials=credentials)
    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result()

    sentence = ''
    sentence_start_time = 0.0
    transcript_all = ''
    start_time_offset = []

    # Building a python dict (contains start time and words) from the response:
    for i, result in enumerate(response.results):
        best_alternative = result.alternatives[0]
        transcript = best_alternative.transcript
        if i == 0:
            transcript_all = transcript
        else:
            transcript_all += " " + transcript

        # Getting timestamps: accumulate words into a sentence and flush it
        # whenever a word contains a period.
        for word_info in best_alternative.words:
            start_s = word_info.start_time.total_seconds()
            word = word_info.word
            if sentence == '':
                sentence = word
                sentence_start_time = start_s
            else:
                sentence += ' ' + word
            if '.' in word:
                start_time_offset.append({
                    'time': sentence_start_time,
                    'sentence': sentence
                })
                sentence = ''

    speech_to_text_data = {
        'transcript': transcript_all,
        'timestamps': start_time_offset
    }
    print('Finish transcription.')
    return speech_to_text_data
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    print('Process', gcs_uri)
    from google.cloud import speech_v1p1beta1 as speech

    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = osp.abspath(
        configs['google_ca_dir'])

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=44100,
        language_code="ja-JP",
        enable_word_time_offsets=True,
    )

    operation = client.long_running_recognize(config=config, audio=audio)
    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    res = []
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
        print("Confidence: {}".format(alternative.confidence))

        words = []
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            words.append({
                'word': word,
                'start_time': start_time,
                'end_time': end_time
            })
            print(
                f"Word: {word}, start_time: {start_time.total_seconds()}, end_time: {end_time.total_seconds()}"
            )

        res.append({
            "Transcript": alternative.transcript,
            "Confidence": alternative.confidence,
            'word': words
        })

    with open(osp.join('res', gcs_uri[-7:-4]), 'wb') as f:
        pickle.dump(res, f)
def my_transcribe():
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = 'resources/voice_tom2.wav'
    # speech_file = 'resources/voice_tom_southern.wav'

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        # sample_rate_hertz=44100,
        language_code="th-TH",
        audio_channel_count=2,  # 2 (stereo), 1 (mono)
        enable_word_confidence=True,
        enable_word_time_offsets=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        model="default",
    )

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 30)
        # print(u"Transcript: {}".format(alternative.transcript))
        print("Confidence: {}".format(alternative.confidence))
        print(u"Channel Tag: {}".format(result.channel_tag))

    # Score the last alternative against the ground-truth text.
    ground_truth = get_ground_truth_text()
    hypothesis = str(alternative.transcript)
    print("Ground Truth: ", ground_truth)
    print("Hypothesis: ", hypothesis)

    # Thai has no spaces between words, so tokenize before computing WER.
    atta = Tokenizer(model="attacut-sc")
    gt_word_tokenize = atta.tokenize(ground_truth)
    hp_word_tokenize = atta.tokenize(hypothesis)
    # gt_word_tokenize = word_tokenize(ground_truth, engine="newmm")  # default=newmm, longest
    # hp_word_tokenize = word_tokenize(hypothesis, engine="newmm")
    print("Ground Truth Word Tokenize:", gt_word_tokenize)
    print("Hypothesis Word Tokenize:", hp_word_tokenize)

    error = evaluation.util.word_error_rate(hp_word_tokenize, gt_word_tokenize)
    print("WER: ", error)
def get_speaker_diarization_results(source_file_name, speaker_count):
    client = speech.SpeechClient()

    gcs_uri = "gs://ami_corpus/meeting_files/" + source_file_name
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code="en-US",
        enable_speaker_diarization=True,
        diarization_speaker_count=speaker_count,
    )

    response = client.long_running_recognize(config=config, audio=audio)
    # The last result's words list carries speaker tags for the whole file.
    result = response.result().results[-1]
    return result.alternatives[0].words
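# A quick usage sketch for get_speaker_diarization_results: group the returned
# word list into speaker turns. The helper name, file name, and speaker count
# below are hypothetical; only the AMI-corpus bucket path comes from the
# function above.
def words_to_turns(words):
    # Collapse consecutive words with the same speaker_tag into one turn.
    turns = []
    current_tag, current_words = None, []
    for w in words:
        if w.speaker_tag != current_tag and current_words:
            turns.append((current_tag, " ".join(current_words)))
            current_words = []
        current_tag = w.speaker_tag
        current_words.append(w.word)
    if current_words:
        turns.append((current_tag, " ".join(current_words)))
    return turns


# Example (hypothetical file and speaker count):
# words = get_speaker_diarization_results("ES2002a.wav", speaker_count=4)
# for tag, text in words_to_turns(words):
#     print(f"Speaker {tag}: {text}")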
def google():
    if request.method == 'POST':
        if os.path.exists("speechtotext.wav"):
            os.remove("speechtotext.wav")
        if os.path.exists("monosound.wav"):
            os.remove("monosound.wav")

        f = request.files['file']
        content = f.read()
        with open('speechtotext.wav', mode='bx') as file:
            file.write(content)

        client = speech.SpeechClient()
        speech_file = "speechtotext.wav"

        # Keep only the first channel so the audio is mono.
        rate, data = wf.read(speech_file)
        data0 = data[:, 0]
        wf.write("monosound.wav", 48000, data0)

        with io.open("monosound.wav", "rb") as audio_file:
            content = audio_file.read()
        audio = speech.RecognitionAudio(content=content)

        ob = sf.SoundFile(speech_file)

        first_lang = "en-US"
        second_lang = "es-US"
        third_lang = "zh-cmn-Hans-CN"
        fourth_lang = "hi-IN"

        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=ob.samplerate,
            language_code=first_lang,
            alternative_language_codes=[second_lang, third_lang, fourth_lang])

        response = client.recognize(config=config, audio=audio)

        text = ""
        for i, result in enumerate(response.results):
            alternative = result.alternatives[0]
            text = text + alternative.transcript + "\n"
        return jsonify({'text': text})
def transcribe_file_with_metadata():
    """Send a request that includes recognition metadata."""
    # [START speech_transcribe_recognition_metadata_beta]
    import io

    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    # Here we construct a recognition metadata object.
    # Most metadata fields are specified as enums that can be found
    # in speech.enums.RecognitionMetadata
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.microphone_distance = (
        speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD
    )
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.SMARTPHONE
    )

    # Some metadata fields are free form strings
    metadata.recording_device_name = "Pixel 2 XL"
    # And some are integers, for instance the 6 digit NAICS code
    # https://www.naics.com/search/
    metadata.industry_naics_code_of_audio = 519190

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        # Add this in the request to send metadata.
        metadata=metadata,
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print(u"First alternative of result {}".format(i))
        print(u"Transcript: {}".format(alternative.transcript))
def google_word_details(audio_file_name):
    file_name = filepath + audio_file_name
    frame_rate, channels = frame_rate_channel(file_name)

    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name

    upload_blob(bucket_name, source_file_name, destination_blob_name)

    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    word_details = ''

    credential_path = "/home/asheeshg01/Speech-f22e193c0063.json"
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        enable_word_time_offsets=True)

    # Detects speech in the audio file
    # operation = client.long_running_recognize(config, audio)
    operation = client.long_running_recognize(request={"config": config, "audio": audio})
    response = operation.result(timeout=10000)

    # The last result's words list has timing and speaker info for every word.
    result = response.results[-1]
    words_info = result.alternatives[0].words

    for word_info in words_info:
        word = word_info.word
        start_time = word_info.start_time
        end_time = word_info.end_time
        speaker1 = word_info.speaker_tag
        word_details += " Word: {} : start_time: {}: end_time: {}: speaker {}".format(
            word, start_time.total_seconds(), end_time.total_seconds(), speaker1)

    delete_blob(bucket_name, destination_blob_name)
    return word_details
def run_stt(lang):
    try:
        client = speech.SpeechClient()
        audio = speech.RecognitionAudio(uri='gs://kuza_audio/audio_file')
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.MP3,
            sample_rate_hertz=16000,
            language_code=lang,
        )
        operation = client.long_running_recognize(config=config, audio=audio)
    except Exception:
        return 'Fatal: failed to connect to the STT service.'

    try:
        response = operation.result(timeout=6000)
        text = ''
        for result in response.results:
            text += result.alternatives[0].transcript + ' '
    except Exception:
        return 'Fatal: subtitle generation failed (timed out).'

    return text
def transcribe_gcs(gcs_uri, speakers_num, encoding):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech_v1p1beta1 as speech
    import spacy
    # import paralleldots
    import operator

    output = []
    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=encoding,  # e.g. speech.RecognitionConfig.AudioEncoding.FLAC
        sample_rate_hertz=48000,
        language_code="en-US",
        audio_channel_count=1,
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=speakers_num,
    )

    operation = client.long_running_recognize(config=config, audio=audio)
    # print("Waiting for operation to complete...")
    response = operation.result(timeout=6000)

    # The last result carries the speaker-tagged words for the whole file.
    speaker_tagged_result_1 = response.results[len(response.results) - 1]
    print(speaker_tagged_result_1.alternatives[0].words)

    for wordObj in speaker_tagged_result_1.alternatives[0].words:
        speaker_tag = str(wordObj.speaker_tag)
        output_item = {
            "word": wordObj.word,
            "start_time": wordObj.start_time.total_seconds(),
            "end_time": wordObj.end_time.total_seconds(),
            "speaker_tag": speaker_tag
        }
        output.append(output_item)

    output = sorted(output, key=lambda x: x['start_time'], reverse=False)
    return output
def export_transcript_to_storage_beta(input_storage_uri, output_storage_uri,
                                      encoding, sample_rate_hertz,
                                      language_code):
    # [START speech_transcribe_with_speech_to_storage_beta]
    # input_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    audio = speech.RecognitionAudio(uri=input_storage_uri)

    # Pass in the URI of the Cloud Storage bucket to hold the transcription
    output_config = speech.TranscriptOutputConfig(gcs_uri=output_storage_uri)

    # Speech configuration object
    config = speech.RecognitionConfig(
        encoding=encoding,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
    )

    # Compose the long-running request
    request = speech.LongRunningRecognizeRequest(
        audio=audio, config=config, output_config=output_config)

    # Create the speech client
    speech_client = speech.SpeechClient()

    operation = speech_client.long_running_recognize(request=request)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print("Transcript: {}".format(result.alternatives[0].transcript))
        print("Confidence: {}".format(result.alternatives[0].confidence))
    # [END speech_transcribe_with_speech_to_storage_beta]

    return response.results[0].alternatives[0].transcript
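# A hedged usage sketch for export_transcript_to_storage_beta; the bucket
# names, object paths, and audio parameters below are placeholders for
# illustration, not values from the original script.
if __name__ == "__main__":
    transcript = export_transcript_to_storage_beta(
        input_storage_uri="gs://my-audio-bucket/meeting.flac",
        output_storage_uri="gs://my-transcript-bucket/meeting_transcript",
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
    )
    print(transcript)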
def transcribe_file_with_auto_punctuation():
    from google.cloud import speech_v1p1beta1 as speech
    import io

    client = speech.SpeechClient()
    speech_file = 'resources/Google_Gnome.wav'

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='en-US',
        enable_automatic_punctuation=True)

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print('-' * 20)
        print(u'First alternative of result {}'.format(i))
        print(u'Transcript: {}'.format(alternative.transcript))
def transcribe_file_with_diarization():
    """Transcribe the given audio file synchronously with diarization."""
    # [START speech_transcribe_diarization_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
    )

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    # The transcript within each result is separate and sequential per result.
    # However, the words list within an alternative includes all the words
    # from all the results thus far. Thus, to get all the words with speaker
    # tags, you only have to take the words list from the last result:
    result = response.results[-1]
    words_info = result.alternatives[0].words

    # Printing out the output:
    for word_info in words_info:
        print(
            u"word: '{}', speaker_tag: {}".format(word_info.word,
                                                  word_info.speaker_tag)
        )
def main():
    client = speech_v1p1beta1.SpeechClient.from_service_account_json('key.json')

    # gcs_uri = "gs://edward-raw/audio/teste_speech_to_texto.mp3"
    gcs_uri = "gs://edward-raw/audio/test.mp3"

    audio = speech_v1p1beta1.RecognitionAudio(uri=gcs_uri)
    config = speech_v1p1beta1.RecognitionConfig(
        encoding=speech_v1p1beta1.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=16000,
        language_code="pt-BR",
    )

    print("Waiting for operation to complete...")

    # Long audio
    # operation = client.long_running_recognize(config=config, audio=audio)
    # response = operation.result(timeout=100000)

    # Short audio
    response = client.recognize(config=config, audio=audio)

    for result in response.results:
        print(u"Transcript: {}".format(result.alternatives[0].transcript))
def toText(file):
    speech_file = file

    first_lang = "he"       # Hebrew
    second_lang = "en-US"   # English US
    third_lang = "ru-RU"    # Russian (fixed: language codes use hyphens, not underscores)
    fourth_lang = "ar"      # Arabic

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=48000,
        language_code=first_lang,
        alternative_language_codes=[second_lang, third_lang, fourth_lang],
    )

    # client is expected to be a module-level speech.SpeechClient()
    response = client.recognize(config=config, audio=audio)

    # Returns the first result; the loop exits on the first iteration.
    for i, result in enumerate(response.results):
        # alternative = result.alternatives[0]
        return result
def transcribe(
    self,
    file_uri: Union[str, Path],
    phrases: Optional[List[str]] = None,
    **kwargs: Any,
) -> transcript_model.Transcript:
    """
    Transcribe audio from GCS file and return a Transcript model.

    Parameters
    ----------
    file_uri: Union[str, Path]
        The GCS file uri to the audio file or caption file to transcribe.
        It should be in format 'gs://...'.
    phrases: Optional[List[str]] = None
        A list of strings to feed as targets to the model.

    Returns
    -------
    outputs: transcript_model.Transcript
        The transcript model for the supplied media file.
    """
    # Create client
    client = speech.SpeechClient.from_service_account_file(
        filename=str(self.credentials_file))

    # Create basic metadata
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = (
        speech.RecognitionMetadata.InteractionType.PHONE_CALL)
    metadata.original_media_type = (
        speech.RecognitionMetadata.OriginalMediaType.VIDEO)

    # Add phrases
    event_metadata_speech_context = speech.SpeechContext(
        phrases=self._clean_phrases(phrases))

    # Prepare for transcription
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        enable_spoken_punctuation=True,
        speech_contexts=[
            GOOGLE_SPEECH_ADAPTION_CLASSES,
            event_metadata_speech_context,
        ],
        metadata=metadata,
        model="video",
        use_enhanced=True,
    )
    audio = speech.RecognitionAudio(uri=file_uri)

    # Begin transcription
    log.debug(f"Beginning transcription for: {file_uri}")
    operation = client.long_running_recognize(request={
        "config": config,
        "audio": audio
    })

    # Wait for complete
    response = operation.result(timeout=10800)

    # Select highest confidence transcripts
    confidence_sum = 0
    segments = 0

    # Create timestamped sentences
    timestamped_sentences: List[transcript_model.Sentence] = []
    transcript_sentence_index = 0

    # Create sentence boundary pipeline
    nlp = English()
    nlp.add_pipe("sentencizer")

    for result in response.results:
        # Some portions of audio may not have text
        if len(result.alternatives) > 0:
            # Split transcript into sentences
            doc = nlp(result.alternatives[0].transcript)

            # Convert generator to list
            sentences = [str(sent) for sent in doc.sents]

            # Index holder for word results of response
            w_marker = 0
            for s_ind, _ in enumerate(sentences):
                # Sentence text
                s_text = sentences[s_ind]
                num_words = len(s_text.split())

                # Initialize sentence model
                timestamped_sentence = transcript_model.Sentence(
                    index=transcript_sentence_index,
                    confidence=result.alternatives[0].confidence,
                    # Start and end time are placeholder values
                    start_time=0.0,
                    end_time=0.0,
                    words=[],
                    text=s_text,
                )

                for w_ind in range(w_marker, w_marker + num_words):
                    # Extract word from response
                    word = result.alternatives[0].words[w_ind]

                    # Nanos no longer supported, use microseconds instead
                    # https://github.com/googleapis/python-speech/issues/71
                    start_time = (word.start_time.seconds +
                                  word.start_time.microseconds * 1e-6)
                    end_time = (word.end_time.seconds +
                                word.end_time.microseconds * 1e-6)

                    # Add start_time to Sentence if first word
                    if w_ind - w_marker == 0:
                        timestamped_sentence.start_time = start_time

                    # Add end_time to Sentence if last word
                    if (w_ind - w_marker) == (num_words - 1):
                        timestamped_sentence.end_time = end_time

                    # Create Word model
                    timestamped_word = transcript_model.Word(
                        index=w_ind - w_marker,
                        start_time=start_time,
                        end_time=end_time,
                        text=self._clean_word(word.word),
                    )

                    timestamped_sentence.words.append(timestamped_word)

                # Increment word marker
                w_marker += num_words

                # Add Sentence to sentence list
                timestamped_sentences.append(timestamped_sentence)

                # Increment transcript sentence index
                transcript_sentence_index += 1

            # Update confidence stats
            confidence_sum += result.alternatives[0].confidence
            segments += 1

    # Compute mean confidence
    if segments > 0:
        confidence = confidence_sum / segments
    else:
        confidence = 0.0
    log.info(
        f"Completed transcription for: {file_uri}. Confidence: {confidence}"
    )

    # Create transcript model
    transcript = transcript_model.Transcript(
        generator=f"Google Speech-to-Text -- CDP v{__version__}",
        confidence=confidence,
        session_datetime=None,
        created_datetime=datetime.utcnow().isoformat(),
        sentences=timestamped_sentences,
    )
    return transcript
def get_transcripts_json(gcsPath,
                         langCode,
                         phraseHints=[],
                         speakerCount=1,
                         enhancedModel=None):
    """Transcribes audio files.

    Args:
        gcsPath (String): path to file in cloud storage (i.e. "gs://audio/clip.mp4")
        langCode (String): language code (i.e. "en-US", see
            https://cloud.google.com/speech-to-text/docs/languages)
        phraseHints (String[]): list of words that are unusual but likely to
            appear in the audio file.
        speakerCount (int, optional): Number of speakers in the audio.
            Only works on English. Defaults to None.
        enhancedModel (String, optional): Option to use an enhanced speech
            model, i.e. "video"

    Returns:
        list | Operation.error
    """

    # Helper function for simplifying Google speech client response
    def _jsonify(result):
        json = []
        for section in result.results:
            data = {
                "transcript": section.alternatives[0].transcript,
                "words": []
            }
            for word in section.alternatives[0].words:
                data["words"].append({
                    "word": word.word,
                    "start_time": word.start_time.total_seconds(),
                    "end_time": word.end_time.total_seconds(),
                    "speaker_tag": word.speaker_tag
                })
            json.append(data)
        return json

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcsPath)

    diarize = speakerCount if speakerCount > 1 else False
    print(f"Diarizing: {diarize}")
    diarizationConfig = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=speakerCount if speakerCount > 1 else False,
    )

    # In English only, we can use the optimized video model
    if langCode == "en":
        enhancedModel = "video"

    config = speech.RecognitionConfig(
        language_code="en-US" if langCode == "en" else langCode,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        speech_contexts=[{
            "phrases": phraseHints,
            "boost": 15
        }],
        diarization_config=diarizationConfig,
        profanity_filter=True,
        use_enhanced=True if enhancedModel else False,
        model="video" if enhancedModel else None)

    res = client.long_running_recognize(config=config, audio=audio).result()

    return _jsonify(res)
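# A hedged usage sketch for get_transcripts_json; the bucket path and phrase
# hints are placeholders, and GOOGLE_APPLICATION_CREDENTIALS must point at a
# valid service-account key for the call to succeed.
sections = get_transcripts_json(
    "gs://my-audio-bucket/clip.mp4",
    "en",
    phraseHints=["Kubernetes", "Bigtable"],
    speakerCount=2)
for section in sections:
    print(section["transcript"])
    for word in section["words"]:
        print("  {word} [{start_time:.2f}-{end_time:.2f}] speaker {speaker_tag}".format(**word))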
def transcribe_gcs(gcs_uri, num_speakers):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    # Imports the Google Cloud client library
    # from google.cloud import speech
    from google.cloud import speech_v1p1beta1 as speech

    # Instantiates a client
    client = speech.SpeechClient()

    # Construct a recognition metadata object
    # (note: it is built here but never attached to the config below)
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.OTHER_INDOOR_DEVICE
    )
    metadata.audio_topic = "court trial hearing"
    metadata.original_mime_type = "audio/mp3"

    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=num_speakers,
        # Enhanced models cost more than standard models.
        use_enhanced=True,
        model="video",
        enable_word_time_offsets=True,
    )

    # Detects speech in the audio file -- long audio file
    operation = client.long_running_recognize(config=config, audio=audio)
    print("Waiting for operation to complete...")
    response = operation.result(timeout=300)

    # Writing results to json
    result_counter = 0
    word_counter = 0
    output_json = {}
    for result in response.results:
        alternative = result.alternatives[0]
        output_json[f"{result_counter}_Transcript"] = alternative.transcript
        output_json[f"{result_counter}_Confidence"] = alternative.confidence
        result_counter += 1

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            speaker_tag = word_info.speaker_tag
            output_json[f"{word_counter}_Word"] = word
            output_json[f"{word_counter}_start_time"] = start_time.total_seconds()
            output_json[f"{word_counter}_end_time"] = end_time.total_seconds()
            output_json[f"{word_counter}_speaker_tag"] = speaker_tag
            word_counter += 1

    with open("{}.json".format(gcs_uri.split('/')[-1][:-5]), "w+") as file:
        json.dump(output_json, file)
    print("Diarized and transcribed {}".format(gcs_uri.split('/')[-1]))
def transcribe_with_model_adaptation(
    project_id, location, storage_uri, custom_class_id, phrase_set_id
):
    """
    Create `PhraseSet` and `CustomClass` resources to build custom lists of
    similar items that are likely to occur in your input data.
    """
    # [START speech_transcribe_with_model_adaptation]
    # Create the adaptation client
    adaptation_client = speech.AdaptationClient()

    # The parent resource where the custom class and phrase set will be created.
    parent = f"projects/{project_id}/locations/{location}"

    # Create the custom class
    custom_class_response = adaptation_client.create_custom_class(
        {
            "parent": parent,
            "custom_class_id": custom_class_id,
            "custom_class": {
                "items": [
                    {"value": "sushido"},
                    {"value": "altura"},
                    {"value": "taneda"},
                ]
            },
        }
    )

    # Create the phrase set
    phrase_set_response = adaptation_client.create_phrase_set(
        {
            "parent": parent,
            "phrase_set_id": phrase_set_id,
            "phrase_set": {
                "boost": 10,
                "phrases": [{"value": f"Visit restaurants like ${custom_class_id}"}],
            },
        }
    )

    # The next section shows how to use the newly created custom
    # class and phrase set to send a transcription request with speech adaptation

    # Speech adaptation configuration
    speech_adaptation = speech.SpeechAdaptation(
        phrase_sets=[phrase_set_response], custom_classes=[custom_class_response]
    )

    # speech configuration object
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=24000,
        language_code="en-US",
        adaptation=speech_adaptation,
    )

    # The name of the audio file to transcribe
    # storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    audio = speech.RecognitionAudio(uri=storage_uri)

    # Create the speech client
    speech_client = speech.SpeechClient()

    response = speech_client.recognize(config=config, audio=audio)

    for result in response.results:
        print("Transcript: {}".format(result.alternatives[0].transcript))
    # [END speech_transcribe_with_model_adaptation]

    return response.results[0].alternatives[0].transcript
# Auth
credentials = service_account.Credentials.from_service_account_file(
    PurePath(Path(__file__).resolve().parent).joinpath(
        Path(str(script_config["OPTS"]["Credentials"]))))

# Instantiate GC Speech client
client = speech.SpeechClient(credentials=credentials)

if str(script_config["OPTS"]["Mode"]) == "local":
    # Read-in audio from local file (60s limit, gs is recommended Mode)
    with io.open(
            PurePath(Path(__file__).resolve().parent).joinpath(
                str(script_config["OPTS"]["Path"])), "rb") as audio_file:
        content = audio_file.read()
        audio = speech.RecognitionAudio(content=content)
else:
    # Read-in audio from GS
    print(str(script_config["OPTS"]["Path"]))
    audio = speech.RecognitionAudio(uri=str(script_config["OPTS"]["Path"]))

# Config request
req_config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
    language_code=str(script_config["OPTS"]["Language"]),
    enable_speaker_diarization=True,
    diarization_speaker_count=int(script_config["OPTS"]["Speakers"]),
    enable_automatic_punctuation=True,
)

# Set GC Operation
def async_transcribe(audio_file_paths,
                     bucket_name,
                     output_tsv_path,
                     sample_rate,
                     language_code,
                     speaker_count=0,
                     begin_sec=0.0):
    """Transcribe a given audio file using the async GCloud Speech-to-Text API.

    The async API has the advantage of being able to handle longer audio
    without state reset. Empirically, we've observed that the async calls lead
    to slightly better accuracy than streaming calls.

    Args:
        audio_file_paths: Paths to the audio files as a list of strings in the
            correct order.
        bucket_name: Name of GCS bucket used for holding objects temporarily.
        output_tsv_path: Path to the output TSV file.
        sample_rate: Audio sample rate.
        language_code: Language code for recognition.
        speaker_count: Number of speakers. If 0, speaker diarization will be
            disabled.
        begin_sec: Transcript begin timestamp in seconds.
    """
    tmp_audio_file = tempfile.mktemp(suffix=".flac")
    print("Temporary audio file: %s" % tmp_audio_file)
    audio_duration_s = concatenate_audio_files(audio_file_paths, tmp_audio_file)

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    destination_blob_name = os.path.basename(tmp_audio_file)
    blob = bucket.blob(destination_blob_name)
    print("Uploading %s to GCS bucket %s" % (tmp_audio_file, bucket_name))
    blob.upload_from_filename(tmp_audio_file)
    gcs_uri = "gs://%s/%s" % (bucket_name, destination_blob_name)
    print("Uploaded to GCS URI: %s" % gcs_uri)

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    enable_speaker_diarization = speaker_count > 0
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=sample_rate,
        language_code=language_code,
        enable_speaker_diarization=enable_speaker_diarization,
        diarization_speaker_count=speaker_count)

    operation = client.long_running_recognize(config=config, audio=audio)
    timeout_s = int(audio_duration_s * 0.25)
    print("Waiting for async ASR operation to complete "
          "(audio duration: %.3f s; ASR timeout: %d s)..." %
          (audio_duration_s, timeout_s))
    response = operation.result(timeout=timeout_s)
    blob.delete()
    os.remove(tmp_audio_file)

    utterances = []
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        alt = result.alternatives[0]
        utterances.append(alt.transcript)
        print(u"Transcript: {}".format(alt.transcript))
        diarized_words = [(word.word, word.speaker_tag,
                           word.start_time.total_seconds(),
                           word.end_time.total_seconds())
                          for word in alt.words]
        # print("Confidence: {}".format(result.alternatives[0].confidence))

    regrouped_utterances = regroup_utterances(utterances, diarized_words)

    with open(output_tsv_path, "w" if not begin_sec else "a") as f:
        if not begin_sec:
            # Write the TSV header.
            f.write(tsv_data.HEADER + "\n")
        utterance_counter = 0
        for (regrouped_utterance, speaker_index, start_time_sec,
             end_time_sec) in regrouped_utterances:
            utterance_counter += 1
            line = "%.3f\t%.3f\t%s\t%s [U%d] [Speaker #%d]" % (
                start_time_sec + begin_sec, end_time_sec + begin_sec,
                tsv_data.SPEECH_TRANSCRIPT_TIER, regrouped_utterance,
                utterance_counter, speaker_index)
            print(line)
            f.write(line + "\n")
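# async_transcribe depends on a concatenate_audio_files helper that is not
# shown above. A minimal sketch assuming pydub; the function name and return
# contract (total duration in seconds) are reconstructions from how the
# caller uses it, not the original code.
from pydub import AudioSegment


def concatenate_audio_files(audio_file_paths, output_path):
    # Join the clips in the given order, export as FLAC, and return the
    # total duration in seconds.
    combined = AudioSegment.empty()
    for path in audio_file_paths:
        combined += AudioSegment.from_file(path)
    combined.export(output_path, format="flac")
    return combined.duration_seconds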
from google.cloud import speech_v1p1beta1 as speech

client = speech.SpeechClient()

speech_file = "resources/commercial_mono.wav"

with open(speech_file, "rb") as audio_file:
    content = audio_file.read()

audio = speech.RecognitionAudio(content=content)

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    enable_speaker_diarization=True,
    diarization_speaker_count=2,
)

print("Waiting for operation to complete...")
response = client.recognize(config=config, audio=audio)

# The transcript within each result is separate and sequential per result.
# However, the words list within an alternative includes all the words
# from all the results thus far. Thus, to get all the words with speaker
# tags, you only have to take the words list from the last result:
result = response.results[-1]
words_info = result.alternatives[0].words

# Printing out the output (completing the truncated loop, mirroring the
# diarization sample above):
for word_info in words_info:
    print(u"word: '{}', speaker_tag: {}".format(word_info.word,
                                                word_info.speaker_tag))
def get_transcript(speech_file, content_type):
    # google authentication
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/andrewfung/Programming/Multiple Speaker Detection/multiple-speaker-detection-3ed65d50eff1.json'
    # wget -nc https://realenglishconversations.com/...

    # instantiate a speech client and declare an audio file
    client = speech.SpeechClient()
    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)

    # Pick the encoding that matches the uploaded content type.
    if 'wav' in content_type:
        encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16
    elif 'mpeg' in content_type:
        encoding = speech.RecognitionConfig.AudioEncoding.MP3
    elif 'flac' in content_type:
        encoding = speech.RecognitionConfig.AudioEncoding.FLAC
    else:
        # Avoid an unbound `config` when an unknown type is passed.
        raise ValueError("Unsupported content type: " + content_type)

    config = speech.RecognitionConfig(
        encoding=encoding,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
    )

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    result = response.results[-1]
    words_info = result.alternatives[0].words
    words_list = []

    for word_info in words_info:
        words_list.append({
            'word': word_info.word,
            'speaker_tag': word_info.speaker_tag,
            'start_time': word_info.start_time,
            'end_time': word_info.end_time,
        })
    # print(words_list)

    # create a script based on the words_list
    current_speaker = words_list[0]['speaker_tag']
    current_line = []
    script = []
    for item in words_list:
        if item['speaker_tag'] != current_speaker:
            # speaker changed, end of line; start the new line with this word
            # (the original dropped the word at each speaker boundary)
            script.append({'speaker': current_speaker, 'line': current_line})
            current_line = [item['word']]
            current_speaker = item['speaker_tag']
        else:
            # same speaker, add to the current line
            current_line.append(item['word'])
    script.append({'speaker': current_speaker, 'line': current_line})

    script = [(f"Speaker {line['speaker']}: " + " ".join(line['line']) + "\n")
              for line in script]
    return script
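# A hypothetical driver for get_transcript; the file name and content type are
# placeholders for illustration only.
if __name__ == "__main__":
    script = get_transcript("conversation.wav", content_type="audio/wav")
    for line in script:
        print(line, end="")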
def google_api(id):
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(
        app.config['API_KEYS'], 'Google_Api_Key.json')

    id = str(id)
    audio_file_name = r"interaction" + id + ".wav"
    audio_file_path = os.path.join(app.config['AUDIO_FILES'], audio_file_name)
    true_label_file_name = r'speaker_id_' + id + '.txt'
    true_label_path = os.path.join(app.config['TRUE_LABEL'], true_label_file_name)

    # Instantiates a client
    client = speech.SpeechClient()

    # Loads the audio into memory
    with io.open(audio_file_path, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_speaker_diarization=True,
        # diarization_speaker_count=3
    )

    # print("Waiting for operation to complete...\n")
    response = client.recognize(request={"config": config, "audio": audio})

    # The transcript within each result is separate and sequential per result.
    # However, the words list within an alternative includes all the words
    # from all the results thus far. Thus, to get all the words with speaker
    # tags, you only have to take the words list from the last result:
    result = response.results[-1]
    words_info = result.alternatives[0].words

    # Filling list of transcribed words
    list_of_words = []
    for word_info in words_info:
        list_of_words.append(word_info.word)

    # Join the words into a single text output
    text = " ".join(list_of_words)

    # Creating list of labels
    speaker_tags = []
    for word_info in words_info:
        speaker_tags.append(word_info.speaker_tag)

    # Create new-labels dictionary for speaker tags
    speaker_tags_dict = {}
    counter = 0
    for tag in speaker_tags:
        if tag in speaker_tags_dict:
            continue
        else:
            speaker_tags_dict[tag] = counter
            counter += 1

    # Normalize speaker tags
    speaker_tags_normalized = [speaker_tags_dict[tag] for tag in speaker_tags]

    # True Labels
    speaker_id_file = open(true_label_path, 'r')
    true_label_id = speaker_id_file.read()
    true_label_speaker_id = []
    for c in true_label_id.split(','):
        true_label_speaker_id.append(int(c))

    # Setting length of label lists equal
    if len(speaker_tags_normalized) < len(true_label_speaker_id):
        length = len(speaker_tags_normalized)
        true_label_speaker_id = true_label_speaker_id[:length]
    else:
        length = len(true_label_speaker_id)
        speaker_tags_normalized = speaker_tags_normalized[:length]

    return text, speaker_tags_normalized, adjusted_rand_score(
        speaker_tags_normalized, true_label_speaker_id)