def ds_process_audio(ds, audio_file, file_handle):
    """Run DeepSpeech inference on one audio segment produced by
    silenceRemoval and append the transcription to the SRT file.

    Args:
        ds : DeepSpeech Model
        audio_file : path to a WAV segment whose file name encodes the
            start/end times in seconds as "<base>_<start>-<end>.wav"
        file_handle : SRT file handle
    """
    global line_count

    # Context manager guarantees the WAV handle is closed even if
    # resampling or frame reading raises (the original leaked it then).
    with wave.open(audio_file, 'rb') as fin:
        fs_orig = fin.getframerate()
        desired_sample_rate = ds.sampleRate()

        # Check if sampling rate is the required rate (16000)
        if fs_orig != desired_sample_rate:
            print("Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition".format(fs_orig, desired_sample_rate), file=sys.stderr)
            audio = convert_samplerate(audio_file, desired_sample_rate)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    # Perform inference on audio segment
    infered_text = ds.stt(audio)

    # File name contains start and end times in seconds. Extract them.
    # os.path.basename is portable; the original split("/") broke on
    # Windows-style path separators.
    limits = os.path.basename(audio_file)[:-4].split("_")[-1].split("-")

    if infered_text:
        line_count += 1
        write_to_file(file_handle, infered_text, line_count, limits)
def ds_process_audio(ds, filepath, file_handle):
    """Transcribe a single WAV file with the DeepSpeech model and append
    the recognized text (newline-terminated) to file_handle."""
    target_rate = ds.sampleRate()

    reader = wave.open(filepath, 'r')
    source_rate = reader.getframerate()  # sampling rate of the input file
    raw_frames = reader.readframes(reader.getnframes())
    reader.close()

    # Feed the model 16-bit samples; resample first when the file's rate
    # differs from what the model expects.
    if source_rate == target_rate:
        samples = np.frombuffer(raw_frames, dtype=np.int16)
    else:
        samples = convert_samplerate(filepath, target_rate)

    # Use the DeepSpeech model to perform speech-to-text.
    text = ds.stt(samples)
    file_handle.write(text + "\n")
def ds_process_audio(ds, audio_file, file_handle, vtt):
    """Run DeepSpeech inference on each audio file generated after
    silenceRemoval and write to file pointed by file_handle.

    Args:
        ds : DeepSpeech Model
        audio_file : audio file
        file_handle : SRT file handle
    """
    global line_count

    reader = wave.open(audio_file, 'rb')
    source_rate = reader.getframerate()
    model_rate = ds.sampleRate()

    # Resample only when the rates differ; normally skipped because FFmpeg
    # has already converted the audio to 16 kHz.
    if source_rate == model_rate:
        audio = np.frombuffer(reader.readframes(reader.getnframes()), np.int16)
    else:
        print("Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition".format(source_rate, model_rate), file=sys.stderr)
        audio = convert_samplerate(audio_file, model_rate)
    reader.close()

    # Run inference, keeping per-token metadata for word timing cues.
    metadata = ds.sttWithMetadata(audio)
    tokens = metadata.transcripts[0].tokens
    infered_text = ''.join(token.text for token in tokens)

    # File name contains start and end times in seconds. Extract that.
    limits = audio_file.split(os.sep)[-1][:-4].split("_")[-1].split("-")

    # One cue at the segment start, then one per space token (word boundary),
    # each offset by the segment's absolute start time.
    segment_start = float(limits[0])
    cues = [segment_start]
    for token in tokens:
        if token.text == " ":
            cues.append(token.start_time + segment_start)

    if len(infered_text) != 0:
        line_count += 1
        write_to_file(file_handle, infered_text, line_count, limits, vtt, cues)