def diarization(self, wavfile, bucket_name, project_name): if not os.path.exists(project_name): os.mkdir(project_name) print("Uploading {} to google cloud storage bucket".format(wavfile)) set_value("label_wav_file_transcribe", "Uploading file to cloud storage bucket...") self.upload_blob(bucket_name, wavfile, "temp_audio.wav") gcs_uri = "gs://{}/temp_audio.wav".format(bucket_name) set_value("label_wav_file_transcribe", "Finished uploading.") client = speech.SpeechClient() audio = speech.RecognitionAudio(uri=gcs_uri) info = mediainfo(wavfile) sample_rate = info['sample_rate'] print("Transcribing {} with audio rate {}".format( wavfile, sample_rate)) config = speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=int(sample_rate), language_code="en-US", enable_automatic_punctuation=True, enable_word_time_offsets=True, enable_speaker_diarization=True, diarization_speaker_count=int(get_value("input_diarization_num")), ) operation = client.long_running_recognize(config=config, audio=audio) print( "Waiting for operation to complete, this may take several minutes..." ) set_value( "label_wav_file_transcribe", "Waiting for operation to complete, this may take several minutes..." ) response = operation.result(timeout=28800) result = response.results[-1] words = result.alternatives[0].words active_speaker = 1 transcript = [] current_cut = 0 previous_cut = 0 speaker_wavs = [] for x in range(int(get_value("input_diarization_num"))): speaker_wavs.append(AudioSegment.empty()) transcript.append("") w = AudioSegment.from_wav(wavfile) for word in words: if word.speaker_tag == active_speaker: end_time = word.end_time current_cut = end_time.total_seconds() * 1e3 #print(current_cut) transcript[active_speaker - 1] += word.word + ' ' else: #speaker has changed transcript[active_speaker - 1] += word.word + ' ' w_cut = w[(previous_cut):current_cut] previous_cut = current_cut speaker_wavs[active_speaker - 1] = speaker_wavs[active_speaker - 1] + w_cut active_speaker = word.speaker_tag #finish last wav cut w_cut = w[previous_cut:current_cut] speaker_wavs[active_speaker - 1] = speaker_wavs[active_speaker - 1] + w_cut for i, wave in enumerate(speaker_wavs): speaker_wavs[i].export("{}/speaker_{}.wav".format( project_name, i + 1), format="wav") for i, text in enumerate(transcript): f = open("{}/speaker_{}.txt".format(project_name, i + 1), 'w') f.write(transcript[i]) f.close() set_value("label_wav_file_transcribe", "Done!") print("Done with diarization!") print('\a') #system beep
def transcribe_audio_to_tsv_with_diarization(input_audio_paths, output_tsv_path, sample_rate, language_code, speaker_count, begin_sec=0.0): """Transcribe speech in input audio files and write results to .tsv file. This method differs from transcribe_audio_to_tsv() in that it performs speaker diarization and uses the word-level speaker indices to regroup the transcripts. """ client = speech.SpeechClient() enable_speaker_diarization = speaker_count > 0 config = speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=sample_rate, audio_channel_count=1, enable_separate_recognition_per_channel=False, language_code=language_code, enable_speaker_diarization=enable_speaker_diarization, diarization_speaker_count=speaker_count) streaming_config = speech.StreamingRecognitionConfig(config=config, interim_results=False) requests = audio_data_generator(input_audio_paths, config) responses = client.streaming_recognize(streaming_config, requests) with open(output_tsv_path, "w" if not begin_sec else "a") as f: if not begin_sec: # Write the TSV header. f.write(tsv_data.HEADER + "\n") utterances = [] for response in responses: if not response.results: continue results = [ result for result in response.results if result.is_final ] max_confidence = -1 best_transcript = None result_end_time = None for result in results: for alt in result.alternatives: if alt.confidence > max_confidence: max_confidence = alt.confidence best_transcript = alt.transcript.strip() diarized_words = [(word.word, word.speaker_tag, word.start_time.total_seconds(), word.end_time.total_seconds()) for word in alt.words] result_end_time = result.result_end_time if not best_transcript: continue end_time_sec = result_end_time.total_seconds() utterances.append(best_transcript) regrouped_utterances = regroup_utterances(utterances, diarized_words) utterance_counter = 0 for (regrouped_utterance, speaker_index, start_time_sec, end_time_sec) in regrouped_utterances: utterance_counter += 1 line = "%.3f\t%.3f\t%s\t%s [U%d] [Speaker #%d]" % ( start_time_sec + begin_sec, end_time_sec + begin_sec, tsv_data.SPEECH_TRANSCRIPT_TIER, regrouped_utterance, utterance_counter, speaker_index) print(line) f.write(line + "\n")
def get_transcript(speech_file, content_type):
    # google authentication
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/andrewfung/Programming/Multiple Speaker Detection/multiple-speaker-detection-3ed65d50eff1.json'
    # wget -nc https://realenglishconversations.com/...

    # instantiate a speech client and declare an audio file
    client = speech.SpeechClient()
    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)

    if 'wav' in content_type:
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
            enable_speaker_diarization=True,
            diarization_speaker_count=2,
        )
    elif 'mpeg' in content_type:
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.MP3,
            sample_rate_hertz=16000,
            language_code="en-US",
            enable_speaker_diarization=True,
            diarization_speaker_count=2,
        )
    elif 'flac' in content_type:
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
            sample_rate_hertz=16000,
            language_code="en-US",
            enable_speaker_diarization=True,
            diarization_speaker_count=2,
        )

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    # The last result holds the word-level speaker tags for the whole file.
    result = response.results[-1]
    words_info = result.alternatives[0].words

    words_list = []
    for word_info in words_info:
        words_list.append({
            'word': word_info.word,
            'speaker_tag': word_info.speaker_tag,
            'start_time': word_info.start_time,
            'end_time': word_info.end_time,
        })
    # print(words_list)

    # create a script based on the words_list
    current_speaker = words_list[0]['speaker_tag']
    current_line = []
    script = []
    for item in words_list:
        if item['speaker_tag'] != current_speaker:
            # speaker changed, end of line; start the new line with this word
            script.append({'speaker': current_speaker, 'line': current_line})
            current_line = [item['word']]
            current_speaker = item['speaker_tag']
        else:
            # same speaker, add to the current line
            current_line.append(item['word'])
    script.append({'speaker': current_speaker, 'line': current_line})

    script = [(f"Speaker {line['speaker']}: " + " ".join(line['line']) + "\n")
              for line in script]
    return script
def get_transcripts_json(gcsPath, langCode, phraseHints=[], speakerCount=1, enhancedModel=None):
    """Transcribes audio files.

    Args:
        gcsPath (String): path to file in cloud storage (i.e. "gs://audio/clip.mp4")
        langCode (String): language code (i.e. "en-US", see https://cloud.google.com/speech-to-text/docs/languages)
        phraseHints (String[]): list of words that are unusual but likely to appear in the audio file.
        speakerCount (int, optional): Number of speakers in the audio. Only works on English. Defaults to None.
        enhancedModel (String, optional): Option to use an enhanced speech model, i.e. "video"

    Returns:
        list | Operation.error
    """

    # Helper function for simplifying Google speech client response
    def _jsonify(result):
        json = []
        for section in result.results:
            data = {
                "transcript": section.alternatives[0].transcript,
                "words": []
            }
            for word in section.alternatives[0].words:
                data["words"].append({
                    "word": word.word,
                    "start_time": word.start_time.total_seconds(),
                    "end_time": word.end_time.total_seconds(),
                    "speaker_tag": word.speaker_tag
                })
            json.append(data)
        return json

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcsPath)

    diarize = speakerCount if speakerCount > 1 else False
    print(f"Diarizing: {diarize}")
    diarizationConfig = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=speakerCount if speakerCount > 1 else False,
    )

    # In English only, we can use the optimized video model
    if langCode == "en":
        enhancedModel = "video"

    config = speech.RecognitionConfig(
        language_code="en-US" if langCode == "en" else langCode,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        speech_contexts=[{
            "phrases": phraseHints,
            "boost": 15
        }],
        diarization_config=diarizationConfig,
        profanity_filter=True,
        use_enhanced=True if enhancedModel else False,
        model="video" if enhancedModel else None)

    res = client.long_running_recognize(config=config, audio=audio).result()

    return _jsonify(res)
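# A minimal usage sketch for get_transcripts_json() above. The GCS path,
# phrase hints, and speaker count are illustrative assumptions, not values
# taken from the original project.
if __name__ == "__main__":
    sections = get_transcripts_json(
        "gs://my-audio-bucket/interview.flac",      # hypothetical GCS object
        "en",
        phraseHints=["Kubernetes", "Terraform"],    # unusual words expected in the audio
        speakerCount=2)
    for section in sections:
        print(section["transcript"])
        for word in section["words"]:
            print(word["word"], word["speaker_tag"], word["start_time"])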
def transcribe_file(speech_file): """Transcribe the given audio file asynchronously.""" from google.cloud import speech_v1p1beta1 from google.cloud.speech_v1p1beta1 import enums from google.cloud.speech_v1p1beta1 import types client = speech_v1p1beta1.SpeechClient() # [START migration_async_request] with io.open(speech_file, 'rb') as audio_file: content = audio_file.read() #print('Using ', speech_file, ', with the below config:') #print("") #print("importing speech_v1p1beta1") #print("language_code='en-US'") #print("use_enhanced=True") #print("enable_automatic_punctuation=False") #print("enable_word_time_offsets=False") #print("profanity_filter=True") #print("sample_rate=48000hz") #print("") #print("Transcript is as follows") audio = types.RecognitionAudio(content=content) config = types.RecognitionConfig( encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=int(args.samplerate), language_code='en-US', # alternative_language_codes='yue-Hant-HK', use_enhanced=True, # A model must be specified to use enhanced model. model='phone_call', enable_automatic_punctuation=False, enable_word_time_offsets=False, profanity_filter=True, enable_speaker_diarization=True, speech_contexts=[ speech_v1p1beta1.types.SpeechContext(phrases=[ keyword1, keyword2, keyword3, keyword4, keyword5, keyword6, keyword7, keyword8, keyword9, keyword10 ], ) ], ) # [START migration_async_response] operation = client.long_running_recognize(config, audio) # [END migration_async_request] os.chdir("..") os.chdir("Trans_Output_" + foldernametime) with open("output_transcription.txt", "a") as myfile: myfile.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n") #myfile.write(' - Starting a new transcription.......\n') #print('Waiting for operation to complete...') response = operation.result(timeout=90) # Each result is for a consecutive portion of the audio. Iterate through # them to get the transcripts for the entire audio file. for result in response.results: # The first alternative is the most likely one for this portion. print(('Transcript: {}'.format(result.alternatives[0].transcript))) print(('Confidence: {}'.format(result.alternatives[0].confidence))) with open("output_transcription.txt", "a") as myfile: myfile.write(('Transcript: {}'.format( result.alternatives[0].transcript)) + "\n") myfile.write(('Confidence: {}'.format( result.alternatives[0].confidence)) + "\n") with open("output_transcription.txt", "a") as myfile: myfile.write('')
script_config = configparser.ConfigParser()
try:
    script_config.read("config.ini")
except Exception as e:
    print("Error reading config file. Exiting.")
    print(e)
    exit()

# Auth
credentials = service_account.Credentials.from_service_account_file(
    PurePath(Path(__file__).resolve().parent).joinpath(
        Path(str(script_config["OPTS"]["Credentials"]))))

# Instantiate GC Speech client
client = speech.SpeechClient(credentials=credentials)

if str(script_config["OPTS"]["Mode"]) == "local":
    # Read-in audio from local file (60s limit, gs is recommended Mode)
    with io.open(
            PurePath(Path(__file__).resolve().parent).joinpath(
                str(script_config["OPTS"]["Path"])), "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)
else:
    # Read-in audio from GS
    print(str(script_config["OPTS"]["Path"]))
    audio = speech.RecognitionAudio(uri=str(script_config["OPTS"]["Path"]))

# Config request
req_config = speech.RecognitionConfig(
def transcribe_streaming(self, stream_file, configuration):
    """Streams transcription of the given audio file."""
    import io
    client = speech.SpeechClient()
    output = ''
    with io.open(stream_file, 'rb') as audio_file:
        audio_content = audio_file.read()

    config = {
        "model": configuration.get_model(),
        "use_enhanced": configuration.get_use_enhanced(),
        "encoding": configuration.get_encoding(),
        "sample_rate_hertz": configuration.get_sample_rate_hertz(),
        "language_code": configuration.get_language_code(),
        "alternative_language_codes": configuration.get_alternative_language_codes(),
        "audio_channel_count": configuration.get_audio_channel_count(),
        "enable_separate_recognition_per_channel": configuration.get_enable_separate_recognition_per_channel(),
        "enable_speaker_diarization": configuration.get_enableSpeakerDiarization(),
        "diarization_speaker_count": configuration.get_diarizationSpeakerCount(),
        "enable_automatic_punctuation": configuration.get_enableAutomaticPunctuation(),
        "speech_contexts": configuration.get_speech_context()
    }

    streaming_config = speech.types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    # BUG IS HERE
    # requests = speech.types.StreamingRecognizeRequest(
    #     audio_content=audio_content)
    stream = [audio_content]
    requests = (speech.types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in stream)

    responses = client.streaming_recognize(streaming_config, requests)
    # import pdb; pdb.set_trace()
    for response in responses:
        # Once the transcription has settled, the first result will contain the
        # is_final result. The other results will be for subsequent portions of
        # the audio.
        for result in response.results:
            alternatives = result.alternatives
            # The alternatives are ordered from most likely to least.
            for alternative in alternatives:
                output = ''.join(alternative.transcript)
    return output
def analyze_audio(ogg_file_path, speaker_count=3): """Takes an audio file and outputs meeting statistics as a dictionary. Args: ogg_file_path (str): Path to the audio file in ogg-format. speaker_count (int): Number of people participating in the meeting. Returns: Dict[str, Any] """ # Convert audio files to flac if ogg_file_path.split(".")[-1] != "flac": ogg_file_path = convert_ogg2flac(ogg_file_path) speech_client = speech_v1p1beta1.SpeechClient(credentials=CREDENTIALS) config = { "enable_speaker_diarization": True, "diarization_speaker_count": speaker_count, "language_code": "en-US", "encoding": speech_v1p1beta1.enums.RecognitionConfig.AudioEncoding.FLAC, "max_alternatives": 1, "use_enhanced": True, "sample_rate_hertz": 48000, } # Upload file to GCS Storage bucket client = storage.Client(credentials=CREDENTIALS) bucket = client.get_bucket(GCS_BUCKET_NAME) blob = bucket.blob(ogg_file_path) blob.upload_from_filename(ogg_file_path) audio = {"uri": f"gs://{GCS_BUCKET_NAME}/{ogg_file_path}"} operation = speech_client.long_running_recognize(config, audio) response = operation.result() # Empty response when speech to text failed if not response.results: json_out = { "google_transcript": "", "raw_transcript": "", "transcript": [], "speakers": [], "topics": [], "sentiment": { "score": 0, "magnitude": 0 }, } return json_out result = response.results[-1] alternative = result.alternatives[0] json_out = { "google_transcript": alternative.transcript, "raw_transcript": ' '.join([word.word for word in alternative.words]) } # Get transcript distributed by speakers transcript = [] sentence = [] last_speaker = alternative.words[0].speaker_tag for word in alternative.words: current_speaker = word.speaker_tag if current_speaker == last_speaker: sentence.append(word.word) else: transcript.append({ "speaker_id": last_speaker, "line": ' '.join(sentence) }) sentence = [word.word] last_speaker = current_speaker transcript.append({"speaker_id": last_speaker, "line": ' '.join(sentence)}) json_out["transcript"] = transcript # Analyze speakers speaker_tags = [word.speaker_tag for word in alternative.words] unique_speakers = set(speaker_tags) speaker_ratios = [] for speaker in unique_speakers: speaker_ratios.append({ "speaker_id": speaker, "ratio": round(speaker_tags.count(speaker) / len(speaker_tags), 2) }) json_out["speakers"] = speaker_ratios # Analyze sentiment and topics sentiment, topics = analyze_text(json_out["raw_transcript"]) json_out["sentiment"] = sentiment json_out["topics"] = topics # Include speaker sentiment speaker_sentiment = analyze_speaker_sentiment(json_out['transcript']) for line in json_out['speakers']: line.update({'sentiment_score': speaker_sentiment[line['speaker_id']]}) return json_out
def get_transcript_audio_file(audio_path, langs):
    if 'en-US' in langs:
        main_lang = 'en-US'
    else:
        main_lang = langs[0]

    parent_dir = '/'.join(audio_path.split('/')[:-1])
    temp_dir = parent_dir + '/temp'
    fname = audio_path.split('/')[-1]
    try:
        os.mkdir(temp_dir)
    except FileExistsError:
        pass

    full_audio = pydub.AudioSegment.from_wav(audio_path)
    transcript = {}
    for t in range(0, len(full_audio), SIZE):
        try:
            chunk = full_audio[t:t + SIZE]
        except IndexError:
            chunk = full_audio[t:]
        chunk = chunk.set_sample_width(2)
        chunk.export(temp_dir + '/' + str(int(t / SIZE)) + '_' + fname,
                     format='wav', bitrate='16k')
        chunk_info = mediainfo(temp_dir + '/' + str(int(t / SIZE)) + '_' + fname)
        config = {
            "language_code": main_lang,
            "sample_rate_hertz": int(chunk_info['sample_rate']),
            "encoding": enums.RecognitionConfig.AudioEncoding.LINEAR16,
            "profanity_filter": False,
            "audio_channel_count": int(chunk_info['channels'])
        }
        if len(langs) > 1:
            config["alternative_language_codes"] = langs[1:]
        with io.open(temp_dir + '/' + str(int(t / SIZE)) + '_' + fname, 'rb') as f:
            content = f.read()
        audio = {"content": content}
        try:
            client = speech_v1p1beta1.SpeechClient.from_service_account_json(
                'api_keys/google_cloud.json')
        except:
            client = speech_v1p1beta1.SpeechClient()
        response = client.recognize(config, audio)
        for result in response.results:
            if int(t / 1000) not in transcript.keys():
                transcript[int(t / 1000)] = []
            alternative = result.alternatives[0]
            transcript[int(t / 1000)].append(alternative.transcript)
        os.remove(temp_dir + '/' + str(int(t / SIZE)) + '_' + fname)

    os.rmdir(temp_dir)
    return transcript
def main(new_connection_index): """start bidirectional streaming from microphone input to speech API""" # diarization_config = { # "enable_speaker_diarization": True, # "min_speaker_count": 1, # "max_speaker_count": 6, # } # alternative_language_codes = ['zh'], client = speech_v1p1beta1.SpeechClient() config = speech_v1p1beta1.RecognitionConfig( encoding=speech_v1p1beta1.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=SAMPLE_RATE, language_code="en-US", max_alternatives=1, enable_word_time_offsets=True, use_enhanced=True, model="default", ) # model="video", # diarization_config=diarization_config, streaming_config = speech_v1p1beta1.StreamingRecognitionConfig( config=config, interim_results=True, single_utterance=False) # send data of the start of the speech recognition import requests url = "http://localhost:3000/api/zoom/recog_start" now = datetime.datetime.now() recog_start = now.strftime('%Y-%m-%d %H:%M:%S') offset_time_start = time.time() data = {"recog_start": recog_start} res = requests.post(url, json=data) mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE) print(mic_manager.chunk_size) sys.stdout.write(YELLOW) sys.stdout.write('\nListening, say "Quit" or "Exit" to stop.\n\n') sys.stdout.write("End (ms) Transcript Results/Status\n") sys.stdout.write("=====================================================\n") with mic_manager as stream: while not stream.closed: new_connection_index += 1 sys.stdout.write(YELLOW) sys.stdout.write("\n" + str(STREAMING_LIMIT * stream.restart_counter) + ": NEW REQUEST\n") print("#### new_connection_index: {}".format(new_connection_index)) stream.audio_input = [] audio_generator = stream.generator() requests = (speech_v1p1beta1.StreamingRecognizeRequest( audio_content=content) for content in audio_generator) responses = client.streaming_recognize(streaming_config, requests) # Now, put the transcription responses to use. listen_print_loop(responses, stream, new_connection_index) if stream.result_end_time > 0: stream.final_request_end_time = stream.is_final_end_time stream.result_end_time = 0 stream.last_audio_input = [] stream.last_audio_input = stream.audio_input stream.audio_input = [] stream.restart_counter = stream.restart_counter + 1 if not stream.last_transcript_was_final: sys.stdout.write("\n") stream.new_stream = True
def __init__(self, config, link_db):
    self.client = speech.SpeechClient()
def speech_to_text(audio_path, SPEECHTOTEXT_SPEAKER_COUNT):
    # Instantiates a client
    client = speech.SpeechClient()

    # Loads the audio into memory
    with io.open(audio_path, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        # encoding=enums.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_time_offsets=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=SPEECHTOTEXT_SPEAKER_COUNT,
        audio_channel_count=2,
        # model='video',
    )

    # async long-running recognition: audio file to text
    operation = client.long_running_recognize(config, audio)

    # Detects speech in the audio file
    print("Waiting for operation to complete...")
    # response = client.recognize(config, audio)
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    words_list = []
    # for result in response.results:
    result = response.results[-1]
    alternative = result.alternatives[0]
    print(u'Transcript: {}'.format(alternative.transcript))
    print('Confidence: {}'.format(alternative.confidence))

    for word_info in alternative.words:
        word = word_info.word
        start_time = word_info.start_time
        start_secs = start_time.seconds + start_time.nanos * 1e-9
        end_time = word_info.end_time
        end_secs = end_time.seconds + end_time.nanos * 1e-9
        print('Word: {}, start_time: {}, end_time: {}, speaker_tag: {}'.format(
            word,
            start_secs,
            end_secs,
            word_info.speaker_tag,
        ))
        words_list.append({
            'value': word_info.word,
            'start_secs': start_secs,
            'end_secs': end_secs,
            'speaker_tag': word_info.speaker_tag,
        })

    return words_list
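# A small follow-on sketch (not part of the original function): regroup the word
# dictionaries returned by speech_to_text() into per-speaker lines. The audio
# path and speaker count in the usage comment are assumed placeholder values.
def words_to_speaker_lines(words_list):
    lines = []
    for w in words_list:
        if lines and lines[-1]['speaker_tag'] == w['speaker_tag']:
            # same speaker as the previous word: extend the current line
            lines[-1]['text'] += ' ' + w['value']
        else:
            # speaker changed: start a new line
            lines.append({'speaker_tag': w['speaker_tag'], 'text': w['value']})
    return ["Speaker {}: {}".format(l['speaker_tag'], l['text']) for l in lines]

# e.g. print("\n".join(words_to_speaker_lines(speech_to_text('meeting.wav', 2))))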
class Transcriber(TranscriberBaseClass): name = NAME SUPPORTED_FORMATS = ["flac"] cost_per_15_seconds = [0.004, 0.006, 0.009] no_config_error_message = ( "Please sign up for the Google Speech-to-Text API " "and put the path to your credentials in an " 'environment variable "GOOGLE_APPLICATION_CREDENTIALS"') transcript_type = TRANSCRIPT_TYPE # https://cloud.google.com/speech-to-text/docs/languages # Array.from(document.querySelector('.devsite-table-wrapper').querySelectorAll('table tr')).slice(1).map(row => row.children[1].innerText) _language_list = [ "af-ZA", "am-ET", "hy-AM", "az-AZ", "id-ID", "ms-MY", "bn-BD", "bn-IN", "ca-ES", "cs-CZ", "da-DK", "de-DE", "en-AU", "en-CA", "en-GH", "en-GB", "en-IN", "en-IE", "en-KE", "en-NZ", "en-NG", "en-PH", "en-SG", "en-ZA", "en-TZ", "en-US", "es-AR", "es-BO", "es-CL", "es-CO", "es-CR", "es-EC", "es-SV", "es-ES", "es-US", "es-GT", "es-HN", "es-MX", "es-NI", "es-PA", "es-PY", "es-PE", "es-PR", "es-DO", "es-UY", "es-VE", "eu-ES", "fil-PH", "fr-CA", "fr-FR", "gl-ES", "ka-GE", "gu-IN", "hr-HR", "zu-ZA", "is-IS", "it-IT", "jv-ID", "kn-IN", "km-KH", "lo-LA", "lv-LV", "lt-LT", "hu-HU", "ml-IN", "mr-IN", "nl-NL", "ne-NP", "nb-NO", "pl-PL", "pt-BR", "pt-PT", "ro-RO", "si-LK", "sk-SK", "sl-SI", "su-ID", "sw-TZ", "sw-KE", "fi-FI", "sv-SE", "ta-IN", "ta-SG", "ta-LK", "ta-MY", "te-IN", "vi-VN", "tr-TR", "ur-PK", "ur-IN", "el-GR", "bg-BG", "ru-RU", "sr-RS", "uk-UA", "he-IL", "ar-IL", "ar-JO", "ar-AE", "ar-BH", "ar-DZ", "ar-SA", "ar-IQ", "ar-KW", "ar-MA", "ar-TN", "ar-OM", "ar-PS", "ar-QA", "ar-LB", "ar-EG", "fa-IR", "hi-IN", "th-TH", "ko-KR", "zh-TW", "yue-Hant-HK", "ja-JP", "zh-HK", "zh", ] if _check_for_config(): speech_client = speech.SpeechClient() storage_client = storage.Client() transcript_bucket = storage_client.get_bucket(BUCKET_NAME_TRANSCRIPT) def __init__(self, filepath): super().__init__(filepath) @classmethod def _setup(cls): super()._setup() if not shutil.which("gsutil"): raise exceptions.DependencyRequired( "Please install gcloud using the steps here:" "https://cloud.google.com/storage/docs/gsutil_install") cls._make_bucket_if_doesnt_exist(BUCKET_NAME_TRANSCRIPT) @classmethod def _make_bucket_if_doesnt_exist(cls, bucket_name): try: cls.storage_client.create_bucket(bucket_name) except gc_exceptions.Conflict: # this might fail if a bucket by the name exists *anywhere* on GCS? 
return else: print("made Google Cloud Storage Bucket for transcripts") def convert_file_format_if_needed(self): if self.file_format not in self.SUPPORTED_FORMATS: if not shutil.which("ffmpeg"): raise exceptions.DependencyRequired("please install ffmpeg") self.filepath = helpers.convert_file(self.filepath, "flac") @property def file_format(self): return pathlib.Path(self.filepath).suffix[1:].lower() @staticmethod def check_for_config() -> bool: return _check_for_config() def upload_file_if_too_big(self): """10MB limit as of Mar 7, 2019""" pass def transcribe(self, **kwargs) -> str: self.convert_file_format_if_needed() self.upload_file_if_too_big() self._request_transcription(**kwargs) def _check_if_transcript_exists(self, transcript_name=None): return storage.Blob(bucket=self.transcript_bucket, name=transcript_name or self.basename).exists(self.storage_client) def _request_transcription( self, language_code="en-US", enable_automatic_punctuation=True, enable_speaker_diarization=True, num_speakers=2, model="phone_call", use_enhanced=True, ) -> str: """Returns the job_name""" if self._check_if_transcript_exists(): raise exceptions.AlreadyExistsError( f"{self.basename} already exists on {NAME}") num_audio_channels = helpers.get_num_audio_channels(self.filepath) sample_rate = helpers.get_sample_rate(self.filepath) with io.open(self.filepath, "rb") as audio_file: content = audio_file.read() audio = speech.types.RecognitionAudio(content=content) if language_code != "en-US": model = None config = speech.types.RecognitionConfig( encoding=speech.enums.RecognitionConfig.AudioEncoding.FLAC, sample_rate_hertz=sample_rate, audio_channel_count=num_audio_channels, enable_separate_recognition_per_channel=True, enable_word_confidence=True, enable_word_time_offsets=True, language_code=language_code, enable_automatic_punctuation=enable_automatic_punctuation, enable_speaker_diarization=enable_speaker_diarization, diarization_speaker_count=num_speakers, model=model, use_enhanced=use_enhanced, ) self.operation = self.speech_client.long_running_recognize( config, audio) print("transcribing...") while not self.operation.done(): sleep(1) print(".") result_list = [] for result in self.operation.result().results: result_list.append(str(result)) print("saving transcript") transcript_path = "/tmp/transcript.txt" with open(transcript_path, "w") as fout: fout.write("\n".join(result_list)) print("uploading transcript") self.upload_file(BUCKET_NAME_TRANSCRIPT, transcript_path) os.remove(transcript_path) return self.basename @classmethod def retrieve_transcript(cls, transcription_job_name: str) -> TRANSCRIPT_TYPE: """Get transcript from BUCKET_NAME_TRANSCRIPT""" if not cls._check_if_transcript_exists( cls, transcript_name=transcription_job_name): raise exceptions.DoesntExistError("no such transcript!") blob = cls.transcript_bucket.blob(transcription_job_name) f = tempfile.NamedTemporaryFile(delete=False) f.close() blob.download_to_filename(f.name) with open(f.name) as fin: transcript_text = fin.read() os.remove(f.name) return transcript_text def upload_file(self, bucket_name, path): blob = self.transcript_bucket.blob(self.basename) blob.upload_from_filename(path) @classmethod def get_transcription_jobs(cls, job_name_query=None, status=None) -> List[dict]: if status and status.lower() != "completed": return [] jobs = [] for t in cls.transcript_bucket.list_blobs(): if job_name_query is not None and t.name != job_name_query: continue jobs.append({"name": t.name, "status": "COMPLETED"}) return jobs
def build_dataset(self): print("running") output_wav_path = "{}/wavs/".format(self.project_name) if not os.path.exists(self.project_name): os.mkdir(self.project_name) if not os.path.exists(output_wav_path): os.mkdir(output_wav_path) if self.split_method == 0: #Google API mode if not get_value("input_project_name") or not get_value( "label_wav_file_path"): print("Error, please choose text and/or audio files.") return set_value("label_build_status", "Detecting silences. This may take several minutes...") audio_name = self.wav_file_path w = AudioSegment.from_wav(audio_name) s_len = 1000 silence_cuts = silence.split_on_silence(w, min_silence_len=s_len, silence_thresh=-45, keep_silence=True) cuts = [] final_cuts = [] def split_wav(wav, l): if (wav.duration_seconds * 1000) < (self.cut_length * 1000): output = [] output.append(wav) return output too_long = False while True: l -= 50 if l == 0: print( "Error, could not find small enough silence period for split, giving up" ) output = [] output.append(wav) return output splits = silence.split_on_silence(wav, min_silence_len=l, silence_thresh=-45, keep_silence=True) print("Trying resplit...") for s in splits: if (s.duration_seconds * 1000) > (self.cut_length * 1000): too_long = True if too_long == True: too_long = False else: return splits # Keep splitting until all cuts are under max len for i, c in enumerate(silence_cuts): print(f"Checking phrase {i}...") c_splits = split_wav(c, 1000) for s in c_splits: cuts.append(s) # c_split_len = 1 # s_len_temp = s_len - 100 # for c in silence_cuts: # if (c.duration_seconds * 1000) > (self.cut_length * 1000): # # cut again, too long # #print("cutting again...") # while c_split_len == 1: # #print(s_len_temp) # c_split = split_wav(c, s_len_temp) # c_split_len = len(c_split) # s_len_temp -= 100 #reduce split time for hopefully more cuts # c_split_len = 1 # s_len_temp = s_len - 100 # for i in c_split: # cuts.append(i) # else: # cuts.append(c) # rebuild small cuts into larger, but below split len temp_cuts = AudioSegment.empty() prev_cuts = AudioSegment.empty() for i, c in enumerate(cuts): prev_cuts = temp_cuts temp_cuts = temp_cuts + c if i == (len(cuts) - 1): #on final entry if (temp_cuts.duration_seconds * 1000) > (self.cut_length * 1000): final_cuts.append(prev_cuts) final_cuts.append(c) else: final_cuts.append(temp_cuts) else: if ((temp_cuts.duration_seconds * 1000) + (cuts[i + 1].duration_seconds * 1000)) > ( self.cut_length * 1000): # combine failed, too long, add what has already been concatenated final_cuts.append(temp_cuts) temp_cuts = AudioSegment.empty() if not os.path.exists("{}/wavs".format(self.project_name)): os.mkdir("{}/wavs".format(self.project_name)) for i, w in enumerate(final_cuts): w.export("{}/wavs/{}.wav".format( self.project_name, i + int(get_value("input_starting_index"))), format="wav") # Process each cut into google API and add result to csv with open("{}/output.csv".format(self.project_name), 'w') as f: bucket_name = get_value("input_storage_bucket") newline = '' for i, c in enumerate(final_cuts): x = i + int(get_value("input_starting_index")) print(f"Transcribing entry {x}") self.upload_blob( bucket_name, "{}/wavs/{}.wav".format(self.project_name, x), "temp_audio.wav") gcs_uri = "gs://{}/temp_audio.wav".format(bucket_name) client = speech.SpeechClient() audio = speech.RecognitionAudio(uri=gcs_uri) info = mediainfo("{}/wavs/{}.wav".format( self.project_name, x)) sample_rate = info['sample_rate'] if get_value("input_use_videomodel") == 1: print("Using enchanced google model...") config = 
speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding. LINEAR16, sample_rate_hertz=int(sample_rate), language_code="en-US", enable_automatic_punctuation=True, enable_word_time_offsets=False, enable_speaker_diarization=False, # enhanced model for better performance? use_enhanced=True, model="video", #"phone_call or video" ) else: config = speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding. LINEAR16, sample_rate_hertz=int(sample_rate), language_code="en-US", enable_automatic_punctuation=True, enable_word_time_offsets=False, enable_speaker_diarization=False, ) operation = client.long_running_recognize(config=config, audio=audio) response = operation.result(timeout=28800) for result in response.results: text = result.alternatives[0].transcript # replace some symbols and google API word choice text = text.replace("%", " percent") text = text.replace("cuz", "cause") text = text.replace("-", " ") text = text.replace("&", "and") print(text) set_value("label_build_status", text) f.write("{}wavs/{}.wav|{}".format(newline, x, text)) newline = '\n' print('\a') #system beep set_value("label_build_status", "Done!") print("Done running builder!") else: # Aeneas mode if not get_value("input_project_name") or not get_value( "label_speaker_text_path") or not get_value( "label_wav_file_path"): print("Error, please choose text and/or audio files.") return if not os.path.exists("aeneas_out"): os.mkdir("aeneas_out") else: shutil.rmtree("aeneas_out") os.mkdir("aeneas_out") if not os.path.exists("aeneas_prepped"): os.mkdir("aeneas_prepped") else: shutil.rmtree("aeneas_prepped") os.mkdir("aeneas_prepped") audio_name = self.wav_file_path with open(self.speaker_text_path, 'r', encoding="utf8") as f: text = f.read() text = text.replace(';', '.') text = text.replace(':', '.') text = text.replace('-', ' ') text = text.replace('”', '') text = text.replace('“', '') text = text.replace('"', '.') text = text.replace('—', ' ') text = text.replace('’', '\'') text = text.replace(' –', '.') text = text.strip('\n') if self.contains_punc: #remove any duplicate whitespace between words text = " ".join(text.split()) phrase_splits = re.split( r'(?<=[\.\!\?])\s*', text) #split on white space between sentences phrase_splits = list(filter( None, phrase_splits)) #remove empty splits else: #no punctuation from speech to text, so we must divid text by word count phrase_splits = [] temp_line = [] text_split = text.split() word_count_limit = 16 while len(text_split) > 0: while len(temp_line) < word_count_limit and len( text_split) > 0: temp_line.append(text_split.pop(0)) phrase_splits.append(" ".join(temp_line)) temp_line = [] with open('aeneas_prepped/split_text', 'w') as f: newline = '' for s in phrase_splits: if s: stripped = s.strip() #remove whitespace f.write(newline + stripped) newline = '\n' #os.system('python -m aeneas.tools.execute_task ' + audio_name + ' aeneas_prepped/split_text "task_adjust_boundary_percent_value=50|task_adjust_boundary_algorithm=percent|task_language=en|is_text_type=plain|os_task_file_format=csv" ' + 'aeneas_out/' + audio_name_no_ext + '.csv') os.system( 'python -m aeneas.tools.execute_task ' + audio_name + ' aeneas_prepped/split_text "task_adjust_boundary_percent_value=50|task_adjust_boundary_algorithm=percent|task_language=en|is_text_type=plain|os_task_file_format=csv" ' + 'aeneas_out/' + self.project_name + '.csv') output_exists = False if os.path.exists("{}/output.csv".format(self.project_name)): #if file exists then prepare for append output_exists = True 
new_csv_file = open("{}/output.csv".format(self.project_name), 'a') if output_exists: new_csv_file.write("\n") with open('aeneas_out/' + self.project_name + '.csv', 'r') as csv_file: index_count = int(self.index_start) csv_reader = csv.reader(csv_file, delimiter=',') csv_reader = list(csv_reader) #convert to list row_count = len(csv_reader) newline = "" for row in csv_reader: beginning_cut = float(row[1]) end_cut = float(row[2]) text_out = row[3] text_out = text_out.strip() print("{} {} {} ".format(beginning_cut, end_cut, text_out)) c_length = end_cut - beginning_cut #if cut is longer than cut length then split it even more cut_length = float(self.cut_length) if c_length > cut_length: more_cuts = open("aeneas_prepped/temp.csv", 'w') #save the current cut wav file to run on aeneas again w = AudioSegment.from_wav(audio_name) wav_cut = w[(beginning_cut * 1000):(end_cut * 1000)] wav_cut.export("aeneas_prepped/tempcut.wav", format="wav") split_list = [] num_cuts = math.ceil(c_length / cut_length) text_list = text_out.split() text_list_len = len(text_list) split_len = math.ceil(text_list_len / num_cuts) print( "too long, making extra {} cuts. with length {}" .format(num_cuts, split_len)) for i in range(1, num_cuts + 1): words = [] for j in range(0, split_len): if not text_list: break words.append(text_list.pop(0)) split_list.append(" ".join(words)) print(split_list) print() newline_splits = '' for phrase in split_list: more_cuts.write(newline_splits + phrase) newline_splits = '\n' more_cuts.close() os.system( 'python -m aeneas.tools.execute_task ' + "aeneas_prepped/tempcut.wav" + ' aeneas_prepped/temp.csv "task_adjust_boundary_percent_value=50|task_adjust_boundary_algorithm=percent|task_language=en|is_text_type=plain|os_task_file_format=csv" ' + 'aeneas_out/temp_out.csv') csv_file_temp = open('aeneas_out/temp_out.csv', 'r') csv_reader_temp = csv.reader(csv_file_temp, delimiter=',') csv_reader_temp = list( csv_reader_temp) #convert to list row_count = len(csv_reader_temp) w = AudioSegment.from_wav( "aeneas_prepped/tempcut.wav") for row in csv_reader_temp: beginning_cut = float(row[1]) end_cut = float(row[2]) text_out = row[3] text_out = text_out.strip() wav_cut = w[(beginning_cut * 1000):(end_cut * 1000)] new_wav_filename = "wavs/" + str( index_count) + ".wav" new_csv_file.write("{}{}|{}".format( newline, new_wav_filename, text_out)) wav_cut.export("{}/{}".format( self.project_name, new_wav_filename), format="wav") index_count += 1 newline = '\n' csv_file_temp.close() else: w = AudioSegment.from_wav(audio_name) wav_cut = w[(beginning_cut * 1000):(end_cut * 1000)] new_wav_filename = "wavs/" + str( index_count) + ".wav" new_csv_file.write("{}{}|{}".format( newline, new_wav_filename, text_out)) wav_cut.export("{}/{}".format( self.project_name, new_wav_filename), format="wav") index_count += 1 newline = '\n' new_csv_file.close() set_value("label_build_status", "Building dataset done!") #Remove temporary directories shutil.rmtree("aeneas_prepped") shutil.rmtree("aeneas_out") print('\a') #system beep print("Done with Aeneas!")
def gen_transcript(filename: str, script_path: str, to_lang: str): """generates a transcript""" client = speech.SpeechClient() #upload to gcp uri_ = upload_to_gcp(filename) audio = speech.types.RecognitionAudio(uri=uri_) characters, sentences = script_sanitzer.santize(script_path, ['*,*', '[,]', '(,)']) phrases_ = [x[0] if len(x[0]) < 100 else x[0][:100] for x in sentences] config = speech.types.RecognitionConfig(encoding='FLAC', language_code='en-US', model='video', sample_rate_hertz=16000, enable_word_time_offsets=True) operation = client.long_running_recognize(config, audio) print('Waiting for operation to complete...') result_data = operation.result(timeout=1000) merged_transcript = "" merged_words = [] for result in result_data.results: alternative = result.alternatives[0] merged_transcript += alternative.transcript for word_info in alternative.words: word = word_info.word start_time = word_info.start_time end_time = word_info.end_time word_tup = (word, start_time.seconds + start_time.nanos * 1e-9, end_time.seconds + end_time.nanos * 1e-9) # print('Word: {}, start_time: {}, end_time: {}'.format( # word, # start_time.seconds + start_time.nanos * 1e-9, # end_time.seconds + end_time.nanos * 1e-9)) merged_words.append(word_tup) #santize the script print(merged_words) empty_queue = [] transcript_ptr = 0 start = -1.1 end = -1.1 prev_start = start prev_end = end for sentence in sentences: actualSize = findSize(sentence[0]) print(transcript_ptr) prev_start = start prev_end = end start = -1.0 end = -1.0 found = False for word in sentence[0].split(" "): if word.isspace(): continue if (found): break for word2 in merged_words[transcript_ptr:transcript_ptr + actualSize]: #find start if check_words_equal(word, word2[0]): start = word2[1] found = True break found = False for word in sentence[0].split(" ")[::-1]: print("WORD: " + str(word)) if word.isspace(): continue if (found): break for word2 in range(transcript_ptr + actualSize, transcript_ptr - 1, -1): if (word2 >= len(merged_words)): continue print(actualSize) print(sentence[0].split(" ")) #find start print(word2) print(len(merged_words)) print("WORD 2: " + str(merged_words[word2][0])) if check_words_equal(word, merged_words[word2][0]): end = merged_words[word2][2] transcript_ptr = word2 + 1 found = True break #Could not find the correct start or end times for first and last words #Time to estimate! if start < 0 or end < 0: ''' We know that, if all previous sentences were calculated correctly, The start and end time of this sentence must be after the previous end time of the last sentence (somewhere near the first word after the last sentence) or 0 if its the first sentence. Once we have the start we will calculate the average talking speed (wpm) of the characters. Using this speed we can define a low ball estimate for how long the sentence that couldnt be defined will take, allowing us to define the end time. If this is the first sentence we will attempt to use the average persons wpm (150 wpm). ''' #No previous sentences if len(empty_queue) == 0: start = merged_words[0][1] end = actualSize * (14 / 6) transcript_ptr = actualSize - int(actualSize * 1 / 4) else: start = merged_words[transcript_ptr][1] avg_wpm = findAverageWPM(empty_queue) end = actualSize * avg_wpm transcript_ptr += actualSize - int(actualSize * 1 / 4) else: #create nodes node_to_add = Node( sentence[1], translate.translate_phrase(sentence[0], to_lang), start, end) empty_queue.append(node_to_add) print(empty_queue) return empty_queue
def __init__(self):
    self._sound_to_text = speech.SpeechClient()
    self._text_to_sound = texttospeech.TextToSpeechClient()
    self._audio_config = texttospeech.types.AudioConfig(
        audio_encoding=texttospeech.enums.AudioEncoding.MP3)
    self._translator = translate.Client()
def __init__(self):
    threading.Thread.__init__(self)

    # Speech adaptation
    boost = 4  # boost for recognizing the listed phrases; recommended range [0, 20]
    speech_contexts = [{"phrases": p, "boost": boost} for p in [
        "ホロライブ", "しらかみ", "ふぶき", "うさだ", "ぺこら", "ぺこ", "よし",
        "よしょ", "えしょう", "ARK", "やめろ", "マリン", "まつり", "せんちょう"
    ]]

    # Speaker diarization settings
    speaker_diarization_config = speech.types.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=1,
        max_speaker_count=2)

    self.config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='ja-JP',
        max_alternatives=1,
        enable_automatic_punctuation=True,           # enable punctuation
        diarization_config=speaker_diarization_config,  # speaker diarization
        speech_contexts=speech_contexts              # speech adaptation
    )
    self.streaming_config = speech.types.StreamingRecognitionConfig(
        config=self.config, interim_results=True)

    self.translate_client = translate.Client()
    self.speech_client = speech.SpeechClient()
def __init__(self, sample_rate):
    self.client = speech_v1p1beta1.SpeechClient()
    self.sample_rate = sample_rate
def get_hypothesis(self, uri, configuration):
    """Asynchronously transcribes the audio uri specified by the gcs_uri."""
    import time
    client = speech.SpeechClient()
    config = {
        "model": configuration.get_model(),
        "use_enhanced": configuration.get_use_enhanced(),
        "encoding": configuration.get_encoding(),
        "sample_rate_hertz": configuration.get_sample_rate_hertz(),
        "language_code": configuration.get_language_code(),
        "alternative_language_codes": configuration.get_alternative_language_codes(),
        "audio_channel_count": configuration.get_audio_channel_count(),
        "enable_separate_recognition_per_channel": configuration.get_enable_separate_recognition_per_channel(),
        "enable_speaker_diarization": configuration.get_enableSpeakerDiarization(),
        "diarization_speaker_count": configuration.get_diarizationSpeakerCount(),
        "enable_automatic_punctuation": configuration.get_enableAutomaticPunctuation(),
        "speech_contexts": configuration.get_speech_context()
    }
    audio = {"uri": uri}

    try:
        operation = client.long_running_recognize(config=config, audio=audio)
    except google.api_core.exceptions.InvalidArgument as e:
        raise e

    # Poll the operation, printing progress, until it finishes or times out.
    count = 0
    sleep_time = 5
    while not operation.done() and count != 30000:
        print(
            f"{operation.metadata.progress_percent}% complete - updates every {sleep_time} seconds"
        )
        if count == 29999:
            raise TimeoutError("Time out processing audio")
        count += 1
        time.sleep(sleep_time)
    print(
        f"{operation.metadata.progress_percent}% complete - updates every {sleep_time} seconds"
    )

    response = operation.result(timeout=1200)
    transcript = str()
    for result in response.results:
        # First alternative is the most probable result
        transcript += " " + result.alternatives[0].transcript
    if not transcript:
        logger.debug('No transcript returned')
    utilities = Utilities()
    t = utilities.strip_puc(text=transcript)
    return t.lower()
def google_transcribe(audio_file_name):
    file_name = filepath + audio_file_name
    second_lang = "hi-IN"

    # The name of the audio file to transcribe
    frame_rate, channels = frame_rate_channel(file_name)
    if channels > 1:
        stereo_to_mono(file_name)

    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name

    upload_blob(bucket_name, source_file_name, destination_blob_name)

    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''

    credential_path = s.get("credential_path")
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        alternative_language_codes=[second_lang],
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    operation = client.long_running_recognize(request={
        "config": config,
        "audio": audio
    })
    response = operation.result(timeout=10000)
    result = response.results[-1]
    words_info = result.alternatives[0].words

    # Group word-level results into speaker-tagged lines.
    tag = 1
    speaker = ""
    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:
            transcript += "speaker {}: {}".format(tag, speaker) + '\n'
            tag = word_info.speaker_tag
            speaker = "" + word_info.word
    transcript += "speaker {}: {}".format(tag, speaker)

    # for result in response.results:
    #     transcript += result.alternatives[0].transcript

    storage_client = storage.Client()
    bucket_name = storage_client.get_bucket(bucket_name)
    transcript_filename = audio_file_name.split('.')[0] + '_transcript' + '.txt'
    blob_transcript_file = bucket_name.blob(transcript_filename)
    blob_transcript_file.upload_from_string(transcript)

    # delete_blob(bucket_name, destination_blob_name)
    return transcript
def transcriberDetail(blob_name, main):
    # check if already inserted to ssrDictionary using audio name/blob name
    flagDntInst = 0
    # mycursor = mydb.cursor()
    mydb._open_connection()
    sql = "select audioName from ssrDictionary where audioName='" + blob_name + "' LIMIT 2"
    mycursor.execute(sql)
    myresult = mycursor.fetchall()
    for x in myresult:
        flagDntInst = 1

    posts = []
    # urll = 'gs://bucketgcssr/SSR_8102019114925.wav'
    urll = 'gs://bucketgcssr/' + blob_name

    from google.cloud import speech_v1p1beta1 as speech  # GCP api
    client = speech.SpeechClient()
    audio = speech.types.RecognitionAudio(uri=urll)
    config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='ur-PK',  # language code
        # speaker diarization not working for Urdu for now
        enable_speaker_diarization=True,
        diarization_speaker_count=2,  # speaker count not working for Urdu now
        sample_rate_hertz=48000,  # audio sample rate
        audio_channel_count=1)  # number of channels used in audio

    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)

    transcrip = ""
    confidence = 0
    for result in response.results:
        alternative = result.alternatives[0]
        transcrip = format(alternative.transcript)
        confidence = alternative.confidence
        main.append({
            'transcrip': transcrip,
            'blob_name': blob_name,
            'confidence': confidence
        })
        for word_info in alternative.words:
            confidence = format(word_info.confidence)
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            posts.append({
                'word': word,
                'start_time': start_time.seconds + start_time.nanos * 1e-9,
                'end_time': end_time.seconds + end_time.nanos * 1e-9,
                'confidence': confidence
            })
            # insertion to MySQL for WordDictionary here
            if flagDntInst == 0:
                sql = "INSERT INTO ssrDictionary (words,audioName, confidance,endTime,startTime) VALUES (%s, %s, %s, %s, %s)"
                val = (word, blob_name, confidence,
                       end_time.seconds + end_time.nanos * 1e-9,
                       start_time.seconds + start_time.nanos * 1e-9)
                mycursor.execute(sql, val)
                mydb.commit()

    mydb.close()
    return posts
def transcribe_gcs(gcs_uri, num_speakers):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    # Imports the Google Cloud client library
    # from google.cloud import speech
    from google.cloud import speech_v1p1beta1 as speech

    # Instantiates a client
    client = speech.SpeechClient()

    # Construct a recognition metadata object
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.OTHER_INDOOR_DEVICE
    )
    metadata.audio_topic = "court trial hearing"
    metadata.original_mime_type = "audio/mp3"

    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=num_speakers,
        # Enhanced models cost more than standard models.
        use_enhanced=True,
        model="video",
        enable_word_time_offsets=True,
    )

    # Detects speech in the audio file -- long audio file
    operation = client.long_running_recognize(config=config, audio=audio)
    print("Waiting for operation to complete...")
    response = operation.result(timeout=300)

    # Writing results to json
    result_counter = 0
    word_counter = 0
    output_json = {}
    for result in response.results:
        alternative = result.alternatives[0]
        output_json[f"{result_counter}_Transcript"] = alternative.transcript
        output_json[f"{result_counter}_Confidence"] = alternative.confidence
        result_counter += 1

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            speaker_tag = word_info.speaker_tag

            output_json[f"{word_counter}_Word"] = word
            output_json[f"{word_counter}_start_time"] = start_time.total_seconds()
            output_json[f"{word_counter}_end_time"] = end_time.total_seconds()
            output_json[f"{word_counter}_speaker_tag"] = speaker_tag
            word_counter += 1

    with open("{}.json".format(gcs_uri.split('/')[-1][:-5]), "w+") as file:
        json.dump(output_json, file)
    print("Diarized and transcribed {}".format(gcs_uri.split('/')[-1]))
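# Hedged usage sketch for transcribe_gcs(): the URI and speaker count below are
# placeholders. Note the output file name strips the last five characters of the
# object name, so the function as written expects a ".flac" extension.
# transcribe_gcs("gs://court-audio/hearing_recording.flac", num_speakers=3)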
def manage_stream(mic, finals, STREAMING_LIMIT):
    client = speech.SpeechClient()
    config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US',
        max_alternatives=1,
        enable_speaker_diarization=True,
        enable_automatic_punctuation=True)
    streaming_config = speech.types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with mic as stream:
        while not stream.to_close:
            audio_generator = stream.generator()
            requests = (speech.types.StreamingRecognizeRequest(
                audio_content=content) for content in audio_generator)
            responses = client.streaming_recognize(streaming_config, requests)
            logging.info("Started new stream")

            for response in responses:
                logging.info("new response")
                if stream.to_close:
                    break
                if get_current_time() - stream.start_time > STREAMING_LIMIT:
                    stream.start_time = get_current_time()
                    break
                if not response.results:
                    continue
                result = response.results[0]
                if not result.alternatives:
                    continue

                if result.is_final:
                    transcript = result.alternatives[0].transcript
                    logging.info(transcript)
                    result_seconds = 0
                    result_nanos = 0
                    if result.result_end_time.seconds:
                        result_seconds = result.result_end_time.seconds
                    if result.result_end_time.nanos:
                        result_nanos = result.result_end_time.nanos
                    stream.end_time = int((result_seconds * 1000) +
                                          (result_nanos / 1000000))
                    finals.append(
                        f"Speaker {get_main_speaker(result)}: {transcript}<br/><br/>"
                    )
                    stream.last_interim = ""
                else:
                    stream.last_interim = result.alternatives[0].transcript

            stream.next_stream()
def transcribe_file(speech_file): """Transcribe the given audio file asynchronously.""" from google.cloud import speech_v1p1beta1 from google.cloud.speech_v1p1beta1 import enums from google.cloud.speech_v1p1beta1 import types client = speech_v1p1beta1.SpeechClient() # [START migration_async_request] with io.open(speech_file, 'rb') as audio_file: content = audio_file.read() #convert to mono from pydub import AudioSegment AudioSegment.converter = r'C:\FFMpeg' sound = AudioSegment.from_wav(args.path) sound = sound.set_frame_rate(44100) sound = sound.set_channels(1) sound.export(args.path, format="wav") #from pydub.utils import mediainfo #info = mediainfo(args.path) #print (info['sample_rate']) from scipy import io rate = scipy.io.wavfile.read(args.path) print(rate) print('Using ', args.path, ', with the below config:') print("") print("importing speech_v1p1beta1") print("language_code='en-US'") print("use_enhanced=True") print("enable_automatic_punctuation=False") print("enable_word_time_offsets=False") print("profanity_filter=True") print("sample_rate=48000hz") print("") print("Transcript is as follows") audio = types.RecognitionAudio(content=content) config = types.RecognitionConfig( encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=48000, language_code='en-US', use_enhanced=True, # A model must be specified to use enhanced model. model='phone_call', enable_automatic_punctuation=False, enable_word_time_offsets=False, profanity_filter=True, #speech_contexts=[speech.types.SpeechContext( # phrases=['Andy', 'Wisy', 'EEP', 'Project', 'Tom', 'Jeff'], # )], ) # [START migration_async_response] operation = client.long_running_recognize(config, audio) # [END migration_async_request] print('Waiting for operation to complete...') response = operation.result(timeout=90) # Each result is for a consecutive portion of the audio. Iterate through # them to get the transcripts for the entire audio file. for result in response.results: # The first alternative is the most likely one for this portion. print(('Transcript: {}'.format(result.alternatives[0].transcript))) print(('Confidence: {}'.format(result.alternatives[0].confidence)))
# Reference: https://cloud.google.com/speech-to-text/docs/async-recognize
# https://google-cloud-python.readthedocs.io/en/0.32.0/storage/blobs.html
from google.cloud import speech_v1p1beta1 as speech
from google.cloud.storage import Blob
from google.cloud import storage

client = speech.SpeechClient()


def diarize(data, context):
    speech_file = data['name']
    bucket = data['bucket']
    print('Bucket {}'.format(bucket))
    print('File {}'.format(speech_file))
    filename_uri = "gs://" + bucket + "/" + speech_file
    print('File name uri {}'.format(filename_uri))
    dest_file = speech_file + ".txt"

    audio = speech.types.RecognitionAudio(uri=filename_uri)
    config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
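# Hedged continuation sketch for diarize() above: the original snippet stops after
# starting the long-running operation even though it computes dest_file. Under the
# assumption that the caller wants the speaker-tagged transcript written back to
# the same bucket, a helper like the following could be called with the operation,
# bucket, and dest_file defined in diarize(). This is illustrative, not part of
# the original Cloud Function.
def _write_diarized_transcript(operation, bucket, dest_file):
    response = operation.result(timeout=3600)
    words = response.results[-1].alternatives[0].words
    lines, tag, chunk = [], None, []
    for w in words:
        if tag is not None and w.speaker_tag != tag:
            # speaker changed: flush the accumulated words as one line
            lines.append("speaker {}: {}".format(tag, " ".join(chunk)))
            chunk = []
        tag = w.speaker_tag
        chunk.append(w.word)
    if chunk:
        lines.append("speaker {}: {}".format(tag, " ".join(chunk)))
    # write the transcript next to the source audio in the triggering bucket
    storage.Client().bucket(bucket).blob(dest_file).upload_from_string("\n".join(lines))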
def main(input_queue, q_conversation, realTime=True, speech_file=None):
    global stop_loop
    global queueQA
    global arr
    global queueTranscripts
    global queueSpeakerTags
    global q_convo
    global queueThread

    q_convo = q_conversation
    max_conv_length = 1000
    # Shared flag used to stop all processes/loops when 'stop recording' is said.
    stop_loop = multiprocessing.Value('i')
    stop_loop.value = 0
    queueQA = multiprocessing.Queue()
    queueTranscripts = multiprocessing.Queue()
    queueSpeakerTags = multiprocessing.Queue()
    queueThread = multiprocessing.Queue()
    arr = multiprocessing.Array('i', max_conv_length)
    process = multiprocessing.Process(target=calculation,
                                      args=(arr, queueQA, queueTranscripts,
                                            queueSpeakerTags, input_queue,
                                            stop_loop, queueThread,))
    process.start()

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/RedLine/Desktop/Semester 8/FYP/FYP_final/FYP-key.json"

    if realTime == True:
        print("Starting real time process")
        client = speech.SpeechClient()
        config = speech.types.RecognitionConfig(
            encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=SAMPLE_RATE,
            language_code='en-US',
            enable_speaker_diarization=True,
            enable_automatic_punctuation=True,
            max_alternatives=1
            # enable_word_time_offsets=True
        )
        streaming_config = speech.types.StreamingRecognitionConfig(
            config=config, interim_results=True)

        mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)
        print('Say "Quit" or "Exit" to terminate the program.')

        with mic_manager as stream:
            while not stream.closed:
                sys.stdout.write('\n' + str(
                    STREAMING_LIMIT * stream.restart_counter) + ': NEW REQUEST\n')
                stream.audio_input = []
                audio_generator = stream.generator()
                requests = (speech.types.StreamingRecognizeRequest(
                    audio_content=content) for content in audio_generator)
                responses = client.streaming_recognize(streaming_config, requests)

                # Now, put the transcription responses to use.
                listen_print_loop(responses, stream)

                if stream.result_end_time > 0:
                    stream.final_request_end_time = stream.is_final_end_time
                stream.result_end_time = 0
                stream.last_audio_input = []
                stream.last_audio_input = stream.audio_input
                stream.audio_input = []
                stream.restart_counter = stream.restart_counter + 1

                if not stream.last_transcript_was_final:
                    sys.stdout.write('\n')
                stream.new_stream = True
    else:
        print("Starting Non real time process")
        client = speech.SpeechClient()
        storage_uri = 'gs://fyp_1/BEP313-Scrum-Meetings1_1.wav'

        # Sample rate in Hertz of the audio data sent
        # sample_rate_hertz = 16000

        # The language of the supplied audio
        language_code = "en-US"

        # Encoding of audio data sent. This sample sets this explicitly.
        # This field is optional for FLAC and WAV audio formats.
        encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
        config = {
            # "sample_rate_hertz": sample_rate_hertz,
            "enable_speaker_diarization": True,
            "enable_automatic_punctuation": True,
            "language_code": language_code,
            "encoding": encoding,
            "audio_channel_count": 2,
        }
        audio = {"uri": storage_uri}

        operation = client.long_running_recognize(config, audio)
        print(u"Waiting for operation to complete...")
        response = operation.result()

        result = response.results[-1]
        x = result.alternatives[0]
        words_info = x.words

        tag = 1
        tag_prev = 1
        speaker = ""
        transcript = ""

        # Regroup word-level results into "speaker N: ..." lines, starting a new
        # line only when the speaker changes at a sentence boundary.
        for word_info in words_info:
            if tag_prev == tag:
                tag_prev = tag
                tag = word_info.speaker_tag
                speaker = speaker + " " + word_info.word
            elif not (speaker[-1] == "." or speaker[-1] == "?"):
                speaker = speaker + " " + word_info.word
            else:
                transcript += "speaker {}: {}".format(tag_prev, speaker) + '\n'
                tag_prev = tag
                tag = word_info.speaker_tag
                speaker = "" + word_info.word

        transcript += "speaker {}: {}".format(tag_prev, speaker)
        print("transcript_1\n", transcript)

        f = open("transcript_1.txt", "w")
        f.write(transcript)
        f.close()

        f = open("transcript_1.txt")
        transcript = f.readlines()
        print("transcript_2\n", transcript)
        f.close()

        # Split each speaker line into individual sentences, keeping the tag.
        output = []
        for i in transcript:
            x = i.split(': ')
            sentence = x[-1]
            speaker_tag = x[0][-1]
            sentences = re.split(r', |\. |\? ', sentence)
            for j in sentences:
                output.append([j.rstrip(), speaker_tag])

        print('x: ', output)
        print(process_transcripts(output)[:])
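# Hypothetical usage sketch (not from the original project): main() above
# expects two multiprocessing queues; the queue names below are illustrative.
# realTime=False takes the non-real-time branch that transcribes the fixed GCS
# recording instead of streaming from the microphone.
if __name__ == "__main__":
    input_queue = multiprocessing.Queue()
    q_conversation = multiprocessing.Queue()
    main(input_queue, q_conversation, realTime=False)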
def async_transcribe(audio_file_paths,
                     bucket_name,
                     output_tsv_path,
                     sample_rate,
                     language_code,
                     speaker_count=0,
                     begin_sec=0.0):
    """Transcribe a given audio file using the async GCloud Speech-to-Text API.

    The async API has the advantage of being able to handle longer audio without
    state reset. Empirically, we've observed that the async calls lead to
    slightly better accuracy than streaming calls.

    Args:
      audio_file_paths: Paths to the audio files as a list of strings in the
        correct order.
      bucket_name: Name of GCS bucket used for holding objects temporarily.
      output_tsv_path: Path to the output TSV file.
      sample_rate: Audio sample rate.
      language_code: Language code for recognition.
      speaker_count: Number of speakers. If 0, speaker diarization will be
        disabled.
      begin_sec: Transcript begin timestamp in seconds.
    """
    tmp_audio_file = tempfile.mktemp(suffix=".flac")
    print("Temporary audio file: %s" % tmp_audio_file)
    audio_duration_s = concatenate_audio_files(audio_file_paths, tmp_audio_file)

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    destination_blob_name = os.path.basename(tmp_audio_file)
    blob = bucket.blob(destination_blob_name)
    print("Uploading %s to GCS bucket %s" % (tmp_audio_file, bucket_name))
    blob.upload_from_filename(tmp_audio_file)
    gcs_uri = "gs://%s/%s" % (bucket_name, destination_blob_name)
    print("Uploaded to GCS URI: %s" % gcs_uri)

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    enable_speaker_diarization = speaker_count > 0
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=sample_rate,
        language_code=language_code,
        enable_speaker_diarization=enable_speaker_diarization,
        diarization_speaker_count=speaker_count)

    operation = client.long_running_recognize(config=config, audio=audio)
    timeout_s = int(audio_duration_s * 0.25)
    print("Waiting for async ASR operation to complete "
          "(audio duration: %.3f s; ASR timeout: %d s)..." %
          (audio_duration_s, timeout_s))
    response = operation.result(timeout=timeout_s)
    blob.delete()
    os.remove(tmp_audio_file)

    utterances = []
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        alt = result.alternatives[0]
        utterances.append(alt.transcript)
        print(u"Transcript: {}".format(alt.transcript))
        diarized_words = [(word.word, word.speaker_tag,
                           word.start_time.total_seconds(),
                           word.end_time.total_seconds()) for word in alt.words]
        # print("Confidence: {}".format(result.alternatives[0].confidence))

    regrouped_utterances = regroup_utterances(utterances, diarized_words)

    with open(output_tsv_path, "w" if not begin_sec else "a") as f:
        if not begin_sec:
            # Write the TSV header.
            f.write(tsv_data.HEADER + "\n")
        utterance_counter = 0
        for (regrouped_utterance, speaker_index, start_time_sec,
             end_time_sec) in regrouped_utterances:
            utterance_counter += 1
            line = "%.3f\t%.3f\t%s\t%s [U%d] [Speaker #%d]" % (
                start_time_sec + begin_sec, end_time_sec + begin_sec,
                tsv_data.SPEECH_TRANSCRIPT_TIER, regrouped_utterance,
                utterance_counter, speaker_index)
            print(line)
            f.write(line + "\n")
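# Hedged usage sketch for async_transcribe() above; the audio paths, bucket
# name, output path, and sample rate are placeholders, not values from the
# source.
if __name__ == "__main__":
    async_transcribe(
        audio_file_paths=["part_000.wav", "part_001.wav"],  # placeholder paths
        bucket_name="my-temp-asr-bucket",                   # placeholder bucket
        output_tsv_path="session_01.tsv",
        sample_rate=16000,
        language_code="en-US",
        speaker_count=2)  # 0 disables diarization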
def sample_long_running_recognize(storage_uri):
    """
    Transcribe long audio file from Cloud Storage using asynchronous speech
    recognition

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/RedLine/Desktop/Semester 8/FYP/FYP_final/FYP-key.json"
    client = speech.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.raw'

    # Sample rate in Hertz of the audio data sent
    # sample_rate_hertz = 16000

    # The language of the supplied audio
    language_code = "en-US"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        # "sample_rate_hertz": sample_rate_hertz,
        "enable_speaker_diarization": True,
        "enable_automatic_punctuation": True,
        "language_code": language_code,
        "encoding": encoding,
        "audio_channel_count": 2,
    }
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)
    print(u"Waiting for operation to complete...")
    response = operation.result()

    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    tag_prev = 1
    speaker = ""
    transcript = ""

    for word_info in words_info:
        if tag_prev == tag:
            tag_prev = tag
            tag = word_info.speaker_tag
            speaker = speaker + " " + word_info.word
        elif not (speaker[-1] == "." or speaker[-1] == "?"):
            speaker = speaker + " " + word_info.word
        else:
            transcript += "speaker {}: {}".format(tag_prev, speaker) + '\n'
            tag_prev = tag
            tag = word_info.speaker_tag
            speaker = "" + word_info.word

    transcript += "speaker {}: {}".format(tag_prev, speaker)
    print("transcript : ", transcript)

    f = open("transcript.txt", "a")
    f.write(transcript)
    f.close()
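# Hedged usage sketch: the function above expects a Cloud Storage URI; the
# bucket and object name below are placeholders.
sample_long_running_recognize("gs://my-bucket/meeting_recording.wav")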
import io
import os

# Imports the Google Cloud client library
from google.cloud import speech_v1p1beta1
from google.cloud.speech_v1p1beta1 import enums
from google.cloud.speech_v1p1beta1 import types

# Instantiates a client
client = speech_v1p1beta1.SpeechClient()

print("Using testwav1m, 8k, with the below config:")
print("")
print("importing speech_v1p1beta1")
print("language_code='en-GB'")
print("use_enhanced=True")
print("enable_automatic_punctuation=True")
print("enable_word_time_offsets=True")
print("")
print("Transcript is as follows")

# The name of the audio file to transcribe
file_name = os.path.join(os.path.dirname(__file__), 'resources',
                         'test_8k_mocktheweek.wav')

# Loads the audio into memory
with io.open(file_name, 'rb') as audio_file:
    content = audio_file.read()
    audio = types.RecognitionAudio(content=content)

# Config fields reconstructed from the settings printed above.
config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code='en-GB',
    use_enhanced=True,
    enable_automatic_punctuation=True,
    enable_word_time_offsets=True)
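# Illustrative continuation (the original snippet ends at the config above): a
# minimal sketch of the recognition call and word-offset printout, assuming a
# synchronous recognize() on this short clip and the pre-2.0 client API used
# above.
response = client.recognize(config, audio)
for result in response.results:
    alternative = result.alternatives[0]
    print(alternative.transcript)
    for word in alternative.words:
        print("{} ({:.2f}s - {:.2f}s)".format(
            word.word,
            word.start_time.seconds + word.start_time.nanos * 1e-9,
            word.end_time.seconds + word.end_time.nanos * 1e-9))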
def sample_long_running_recognize(storage_uri):
    """
    Transcribe a long audio file using asynchronous speech recognition

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """
    # standard speech client
    # client = speech_v1.SpeechClient()

    # if utilizing speaker diarization
    client = speech_v1p1beta1.SpeechClient()

    # local_file_path = 'resources/brooklyn_bridge.raw'

    # The language of the supplied audio
    language_code = "en-US"

    # Enhanced model to use
    model = "phone_call"

    # Sample rate in Hertz of the audio data sent (taken from the command line)
    sample_rate_hertz = int(sys.argv[2])

    # Optional. Specifies the estimated number of speakers in the conversation.
    diarization_speaker_count = 2

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "model": model,
        "use_enhanced": True,
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        "encoding": encoding,
        "enable_automatic_punctuation": True,
        "enable_speaker_diarization": True,
        "diarization_speaker_count": diarization_speaker_count,
    }
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)
    print(u"Waiting for operation to complete...")
    response = operation.result()

    outtext = list()
    out_text_speaker = list()
    out_text_speaker_label = list()

    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        # print(u"Transcript: {}".format(alternative.transcript))
        outtext.append(alternative.transcript)
        for word in alternative.words:
            print(u"Speaker: {}, Word: {}".format(word.speaker_tag, word.word))
            out_text_speaker.append(word.word)
            out_text_speaker_label.append(word.speaker_tag)

    return outtext, out_text_speaker, out_text_speaker_label
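# Illustrative post-processing sketch (not from the source): regroup the
# word-level outputs returned above into contiguous per-speaker turns. The
# helper name is hypothetical.
def group_words_by_speaker(words, speaker_tags):
    turns = []  # list of (speaker_tag, joined_text) tuples
    for word, tag in zip(words, speaker_tags):
        if turns and turns[-1][0] == tag:
            turns[-1] = (tag, turns[-1][1] + " " + word)
        else:
            turns.append((tag, word))
    return turns

# Example:
#   _, words, tags = sample_long_running_recognize("gs://bucket/audio.wav")
#   for tag, text in group_words_by_speaker(words, tags):
#       print("Speaker {}: {}".format(tag, text))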