def transcribe_file(speech_file):
    """Transcribe the given audio file and return the full transcript.

    Args:
        speech_file: path to a local LINEAR16 (48 kHz, stereo) audio file.

    Returns:
        The concatenated top-alternative transcript of every result
        ('' when nothing was recognized).
    """
    from google.cloud import speech
    import io

    client = speech.SpeechClient()

    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        audio_channel_count=2,
        enable_separate_recognition_per_channel=True,
        language_code="en-US",
    )

    response = client.recognize(config=config, audio=audio)

    # Each result is for a consecutive portion of the audio. The original
    # returned from inside this loop, discarding everything after the first
    # portion; concatenate them all so callers get the entire transcript.
    return "".join(
        result.alternatives[0].transcript for result in response.results
    )
def transcribe_file(speech_file):
    """Transcribe the given audio file with word time offsets.

    Populates the module-level TIMES / TRANSCRIPT / FULL_TRANSCRIPT state
    via print_sentences() and returns it.

    Returns:
        (FULL_TRANSCRIPT, TRANSCRIPT, TIMES) — TODO confirm exact contents;
        they are filled in by print_sentences(), defined elsewhere.
    """
    # NOTE: in the original, this docstring appeared *after* the global
    # statement, so it was a plain string expression, not a docstring.
    global TIMES, sentence, TRANSCRIPT
    from google.cloud import speech
    import io

    client = speech.SpeechClient()

    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code="en-US",
        audio_channel_count=2,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
    )

    response = client.recognize(config=config, audio=audio)
    print_sentences(response)

    # Drop the trailing entry — presumably a sentinel appended by
    # print_sentences(); verify against its implementation.
    del TIMES[-1]
    return FULL_TRANSCRIPT, TRANSCRIPT, TIMES
def google_STT(self, audio):
    """Transcribe a local audio file with Google Cloud Speech-to-Text.

    Args:
        audio: path to the audio file (the name is rebound to a
            RecognitionAudio object below).

    Returns:
        The concatenated top-alternative transcript ('' if nothing was
        recognized).
    """
    client = speech_v1.SpeechClient.from_service_account_json(
        '/data/second-conquest-293723-05738e995f8f.json')

    # Loads the audio into memory
    with io.open(audio, "rb") as audio_file:
        content = audio_file.read()
    audio = speech_v1.RecognitionAudio(content=content)

    config = speech_v1.RecognitionConfig(
        encoding=speech_v1.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        sample_rate_hertz=22050,
        language_code="en-US",
        enable_automatic_punctuation=True,
    )

    # Detects speech in the audio file. Join the pieces once instead of the
    # original's repeated string concatenation; the unused time.time() call
    # is dropped.
    response = client.recognize(request={"config": config, "audio": audio})
    return ''.join(
        result.alternatives[0].transcript for result in response.results
    )
def parse_data(filename_weba, filename_wav):
    """Convert *filename_weba* to WAV via ffmpeg and wrap it for recognition.

    Args:
        filename_weba: input audio path (e.g. a .weba upload).
        filename_wav: output WAV path ffmpeg should produce.

    Returns:
        A speech_v1.RecognitionAudio on success, or None when ffmpeg did
        not produce the output file.
    """
    import shlex

    # Quote both paths so filenames containing spaces or shell
    # metacharacters are neither split nor interpreted by the shell.
    os.system("ffmpeg -i {} {} -y".format(
        shlex.quote(filename_weba), shlex.quote(filename_wav)))

    if os.path.exists(filename_wav):
        with io.open(filename_wav, "rb") as f:
            content = f.read()
        return speech_v1.RecognitionAudio(content=content)

    # ffmpeg failed: the original fell through and returned content (None).
    return None
def speech_to_text(local_speech_file):
    """Synchronously recognize *local_speech_file* (zh-TW, 16 kHz) and hand
    the response to print_sentences() for display.

    Returns:
        None — output happens via print_sentences().
    """
    client = speech.SpeechClient()

    with io.open(local_speech_file, "rb") as audio_file:
        audio_content = audio_file.read()

    audio = speech.RecognitionAudio(content=audio_content)
    # Build the config with keyword arguments, matching how
    # RecognitionConfig is constructed everywhere else in this file
    # (the original passed a positional dict).
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        sample_rate_hertz=16000,
        language_code="zh-TW",
    )

    response = client.recognize(config=config, audio=audio)
    print_sentences(response)
def STT(audio_path, save_path=None):
    """Transcribe *audio_path* and optionally persist the transcript.

    Args:
        audio_path: path to the input audio file.
        save_path: directory to write '<audio_name><datetime>.txt' into;
            when None, nothing is written.

    Returns:
        None — the transcript is printed (and optionally saved).
    """
    client = speech_v1.SpeechClient.from_service_account_json(
        '/data/second-conquest-293723-05738e995f8f.json')

    # Loads the audio into memory
    with io.open(audio_path, "rb") as audio_file:
        content = audio_file.read()
    audio = speech_v1.RecognitionAudio(content=content)

    config = speech_v1.RecognitionConfig(
        encoding=speech_v1.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        sample_rate_hertz=22050,
        language_code="en-US",
        enable_automatic_punctuation=True,
    )

    # Detects speech in the audio file
    start = time.time()
    response = client.recognize(request={"config": config, "audio": audio})

    text = ''.join(
        result.alternatives[0].transcript for result in response.results
    )
    print(text)

    audio_name = audio_path.split('/')[-1].replace('.mp3', '')
    save_file_name = audio_name + GetCurrentDatetime() + '.txt'
    if save_path is not None:
        os.makedirs(save_path, exist_ok=True)
        # Write to an explicit joined path instead of os.chdir(): mutating
        # the process-wide working directory is a surprising side effect
        # that leaks into every later relative-path operation.
        with open(os.path.join(save_path, save_file_name), 'w') as f:
            f.write(text)

    print('Inferred Audio File Name: ', audio_path)
    print('Transcribed Script File Saved: ', save_file_name)
    print('Processing Time: ', time.time() - start)
def convert(file, folder, pack):
    """Transcribe each 15-second chunk in directory *file* and write an SRT.

    Args:
        file: directory containing the (sorted) audio chunk files.
        folder: output directory for the subtitle file.
        pack: basename of the output '<pack>.srt'.

    Returns:
        None — writes '<folder>/<pack>.srt'.
    """
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "api-key.json"
    files = sorted(os.listdir(str(file) + '/'))
    all_text = []
    for f in files:
        name = str(file) + '/' + f
        print("Transcribing File- " + str(name))
        with open(name, "rb") as audio_file:
            content = audio_file.read()
        try:
            config = speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                language_code="en-US")
            audio = speech.RecognitionAudio(content=content)
            all_text.append(speech_to_text(config, audio))
        except Exception as e:
            # Deliberate best-effort: log and substitute a placeholder so
            # the chunk index (and therefore SRT timing) stays aligned.
            print(e)
            all_text.append("No Audio")

    # Each chunk covers 15 seconds; build one SRT cue per chunk. Collect
    # the pieces and join once instead of the original quadratic +=.
    segments = []
    for i, t in enumerate(all_text):
        total_seconds = i * 15
        m, s = divmod(total_seconds, 60)
        h, m = divmod(m, 60)
        total_seconds_n = total_seconds + 15
        m_n, s_n = divmod(total_seconds_n, 60)
        h_n, m_n = divmod(m_n, 60)
        segments.append(
            "{}\n{:0>2d}:{:0>2d}:{:0>2d},000 --> {:0>2d}:{:0>2d}:{:0>2d},000\n {}\n\n".format(
                i + 1, h, m, s, h_n, m_n, s_n, t))
    transcript = "".join(segments)

    print("Transcript completed- " + str(transcript))
    transcript_file = str(folder) + "/" + str(pack) + ".srt"
    with open(transcript_file, "w") as f:
        f.write(transcript)
def text_from_audio(wav_fname):
    """Read the WAV file at *wav_fname* and return its transcription.

    Thin wrapper: wraps the raw bytes in a RecognitionAudio and delegates
    the actual recognition to speech2text().
    """
    with io.open(wav_fname, "rb") as fh:
        audio_bytes = fh.read()
    return speech2text(speech.RecognitionAudio(content=audio_bytes))
def google_STT(self, path):
    """Transcribe *path* and split the transcript into sentences.

    A sentence boundary is any recognized word containing '.' or '?'
    (automatic punctuation is enabled in the config below).

    Args:
        path: path to the input audio file.

    Returns:
        (text, sent_start_time):
            text — list of sentence strings, each word prefixed by ' '.
            sent_start_time — starts at 0; each subsequent entry is the
            end time (seconds) of the previous sentence's final word.
    """
    client = speech_v1.SpeechClient.from_service_account_json(
        '/data/second-conquest-293723-05738e995f8f.json')

    # Loads the audio into memory
    with io.open(path, "rb") as audio_file:
        content = audio_file.read()
    audio = speech_v1.RecognitionAudio(content=content)

    config = speech_v1.RecognitionConfig(
        encoding=speech_v1.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        sample_rate_hertz=22050,
        language_code="en-US",
        # assumes .wav inputs here are stereo and everything else is mono
        # — TODO confirm against the files this is fed.
        audio_channel_count=2 if path.endswith('.wav') else 1,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
    )

    # Detects speech in the audio file. (Unused timer, sent_tokenize call,
    # end_word list and start_time binding from the original are removed —
    # none of them affected the result.)
    response = client.recognize(request={"config": config, "audio": audio})

    sent = ''
    text = []
    sent_start_time = [0]
    for result in response.results:
        alternative = result.alternatives[0]
        for word_info in alternative.words:
            word = word_info.word
            end_time = word_info.end_time
            sent = sent + ' ' + word
            if re.search(r'\.', word) is not None or re.search(r'\?', word) is not None:
                sent_start_time.append(end_time.total_seconds())
                text.append(sent)
                print(sent)
                sent = ''
                print(f"Word: {word}, end_time: {end_time.total_seconds()}")

    # NOTE(review): a trailing sentence that never hits '.'/'?' is silently
    # dropped here (sent is not flushed) — confirm that is intended.
    print(sent_start_time)
    print(text)
    return text, sent_start_time