def vad_segment_generator(wavFile, aggressiveness):
    logging.debug("Caught the wav file @: %s" % (wavFile))
    audio, sample_rate, audio_length = wavSplit.read_wave(wavFile)
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = wavSplit.frame_generator(10, audio, sample_rate)
    frames = list(frames)
    segments = wavSplit.vad_collector(sample_rate, 10, 300, vad, frames)
    return segments, sample_rate, audio_length
def vad_segment_generator(wavFile, aggressiveness):
    logging.debug("Caught the wav file @: %s" % (wavFile))
    audio, sample_rate, audio_length = wavSplit.read_wave(wavFile)
    assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = wavSplit.frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = wavSplit.vad_collector(sample_rate, 30, 300, vad, frames)
    return segments, sample_rate, audio_length
def vad_segment_generator(wavFile, aggressiveness, model_sample_rate):
    logging.debug("Caught the wav file @: %s" % (wavFile))
    audio, sample_rate, audio_length = wavSplit.read_wave(wavFile)
    assert sample_rate == model_sample_rate, \
        "Audio sample rate must match sample rate of used model: {}Hz".format(model_sample_rate)
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = wavSplit.frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = wavSplit.vad_collector(sample_rate, 30, 300, vad, frames)
    return segments, sample_rate, audio_length
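In this parameterized version the expected sample rate is no longer hard-coded; it can be queried from the loaded acoustic model instead. A minimal sketch using the DeepSpeech Python API (the model file name, WAV path, and aggressiveness value are placeholders for illustration):

import deepspeech

# Hypothetical model file name; use whichever acoustic model you actually downloaded.
ds = deepspeech.Model("deepspeech-0.9.3-models.pbmm")

# sampleRate() reports the rate the model expects (16000 Hz for the released English models),
# so the sample-rate check inside vad_segment_generator follows the model rather than a constant.
segments, sample_rate, audio_length = vad_segment_generator(
    "audio/sample.wav", aggressiveness=1, model_sample_rate=ds.sampleRate())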
def vad_segment_generator(wav_file, aggressiveness):
    """
    Generate VAD segments. Filters out non-voiced audio frames.

    :param wav_file: Input wav file to run VAD on.
    :param aggressiveness: How aggressively non-speech is filtered out (between 0 and 3)

    :return: Returns tuple of
        segments: a bytearray of multiple smaller audio frames
                  (the longer audio split into multiple smaller ones)
        sample_rate: Sample rate of the input audio file
        audio_length: Duration of the input audio file
    """
    logging.debug("Caught the wav file @: %s" % wav_file)
    audio, sample_rate, audio_length = wavSplit.read_wave(wav_file)
    assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = wavSplit.frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = wavSplit.vad_collector(sample_rate, 30, 300, 0.5, vad, frames)
    return segments, sample_rate, audio_length
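The documented version above is a drop-in helper; calling it looks roughly like this (the file name is a placeholder, and the 16 kHz restriction from the assert still applies):

segments, sample_rate, audio_length = vad_segment_generator("recording.wav", aggressiveness=1)
print("Duration: {:.2f}s at {}Hz".format(audio_length, sample_rate))
for i, segment in enumerate(segments):
    # Each segment is raw 16-bit PCM bytes covering one voiced stretch of the input.
    print("Segment %d: %d bytes" % (i, len(segment)))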
# load and pre-process audio
audio_file = os.path.join(os.getcwd(), "data/testing/audio/2830-3980-0043.wav")
# audio_file = os.path.join(os.getcwd(), "data/testing/audio/my_name_is_jamie.wav")
# audio_file = os.path.join(os.getcwd(), "data/testing/audio/hello_liv.wav")
aggressiveness = 0

print("Reading and processing: {}".format(audio_file))

audio, sample_rate, audio_length = wavSplit.read_wave(audio_file)
assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"
vad = webrtcvad.Vad(int(aggressiveness))
frames = wavSplit.frame_generator(30, audio, sample_rate)
frames = list(frames)
segments = wavSplit.vad_collector(sample_rate, 30, 300, vad, frames)

# we now have the data in the following segments
# segments, sample_rate, audio_length
print("we have {} frames".format(len(frames)))

start = time.time()
for i, segment in enumerate(segments):
    # Run deepspeech on the chunk that just completed VAD
    print("Processing chunk %002d" % (i,))
    audio = np.frombuffer(segment, dtype=np.int16)

    # Run Deepspeech
    print('Running inference...')
    output = ds.stt(audio)
    print("Transcript: %s" % output)
end = time.time()
print("that took: {}".format(end - start))
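The ds object used in the loop above is assumed to be an already initialized DeepSpeech model; it is not created in this snippet. A minimal setup sketch covering that, plus the imports the snippet relies on (model and scorer file names are placeholders):

import os
import time

import numpy as np
import webrtcvad
from deepspeech import Model

import wavSplit

# Hypothetical file names; point these at the acoustic model and scorer you downloaded.
ds = Model("deepspeech-0.9.3-models.pbmm")
ds.enableExternalScorer("deepspeech-0.9.3-models.scorer")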