def vad_segment_generator(wavFile, aggressiveness): logging.debug("Caught the wav file @: %s" % (wavFile)) audio, sample_rate, audio_length = wavSplit.read_wave(wavFile) assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!" #vad = webrtcvad.Vad(int(aggressiveness)) frames = list(wavSplit.frame_generator(20000, audio, sample_rate)) return frames
def vad_segment_generator(wavFile, aggressiveness): logging.debug("Caught the wav file @: %s" % (wavFile)) audio, sample_rate, audio_length = wavSplit.read_wave(wavFile) vad = webrtcvad.Vad(int(aggressiveness)) frames = wavSplit.frame_generator(10, audio, sample_rate) frames = list(frames) segments = wavSplit.vad_collector(sample_rate, 10, 300, vad, frames) return segments, sample_rate, audio_length
def vad_segment_generator(wavFile, aggressiveness, frame_duration_ms=30, padding_duration_ms=300):
    logging.debug("Caught the wav file @: %s" % (wavFile))
    audio, sample_rate, audio_length = wavSplit.read_wave(wavFile)
    assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = wavSplit.frame_generator(frame_duration_ms, audio, sample_rate)
    frames = list(frames)
    segments = wavSplit.vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames)
    return [segment for segment in segments], sample_rate, audio_length

def vad_segment_generator(wavFile, aggressiveness): logging.debug("Caught the wav file @: %s" % (wavFile)) audio, sample_rate, audio_length = wavSplit.read_wave(wavFile) assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!" vad = webrtcvad.Vad(int(aggressiveness)) frames = wavSplit.frame_generator(30, audio, sample_rate) frames = list(frames) segments = wavSplit.vad_collector(sample_rate, 30, 300, vad, frames) return segments, sample_rate, audio_length
def vad_segment_generator(wavFile, aggressiveness, model_sample_rate):
    logging.debug("Caught the wav file @: %s" % (wavFile))
    audio, sample_rate, audio_length = wavSplit.read_wave(wavFile)
    assert sample_rate == model_sample_rate, \
        "Audio sample rate must match sample rate of used model: {}Hz".format(model_sample_rate)
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = wavSplit.frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = wavSplit.vad_collector(sample_rate, 30, 300, vad, frames)
    return segments, sample_rate, audio_length

def vad_segment_generator(wav_file, aggressiveness):
    """
    Generate VAD segments. Filters out non-voiced audio frames.

    :param wav_file: Input wav file to run VAD on.
    :param aggressiveness: How aggressively non-speech is filtered out (between 0 and 3)

    :return: Returns a tuple of
        segments: a bytearray of multiple smaller audio frames
                  (the longer audio split into multiple smaller ones)
        sample_rate: Sample rate of the input audio file
        audio_length: Duration of the input audio file
    """
    logging.debug("Caught the wav file @: %s" % wav_file)
    audio, sample_rate, audio_length = wavSplit.read_wave(wav_file)
    assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = wavSplit.frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = wavSplit.vad_collector(sample_rate, 30, 300, 0.5, vad, frames)
    return segments, sample_rate, audio_length

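All of the variants above delegate to a wavSplit module that is not shown here. As a reference, this is a minimal sketch of what read_wave and frame_generator typically look like, modelled on the standard py-webrtcvad example; the real module may differ, and the three-value return of read_wave is assumed from the call sites above.

import contextlib
import wave


def read_wave(path):
    """Read a mono 16-bit PCM wav file and return (pcm_bytes, sample_rate, duration_seconds)."""
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        assert wf.getnchannels() == 1
        assert wf.getsampwidth() == 2
        sample_rate = wf.getframerate()
        assert sample_rate in (8000, 16000, 32000, 48000)
        frames = wf.getnframes()
        pcm_data = wf.readframes(frames)
        duration = frames / float(sample_rate)
        return pcm_data, sample_rate, duration


class Frame(object):
    """A slice of PCM audio with its start timestamp and duration (both in seconds)."""
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration


def frame_generator(frame_duration_ms, audio, sample_rate):
    """Yield successive Frames of frame_duration_ms milliseconds from 16-bit mono PCM audio."""
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # bytes per frame (2 bytes per sample)
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    while offset + n < len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n
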
ds.enableExternalScorer(scorer)  # ds === deep speech model

# load and pre-process audio
audio_file = os.path.join(os.getcwd(), "data/testing/audio/2830-3980-0043.wav")
# audio_file = os.path.join(os.getcwd(), "data/testing/audio/my_name_is_jamie.wav")
# audio_file = os.path.join(os.getcwd(), "data/testing/audio/hello_liv.wav")
aggressiveness = 0
print("Reading and processing: {}".format(audio_file))
audio, sample_rate, audio_length = wavSplit.read_wave(audio_file)
assert sample_rate == 16000, "Only 16000Hz input WAV files are supported for now!"
vad = webrtcvad.Vad(int(aggressiveness))
frames = wavSplit.frame_generator(30, audio, sample_rate)
frames = list(frames)
segments = wavSplit.vad_collector(sample_rate, 30, 300, vad, frames)

# we now have the data in the following segments
# segments, sample_rate, audio_length
print("we have {} frames".format(len(frames)))

start = time.time()
for i, segment in enumerate(segments):
    # Run deepspeech on the chunk that just completed VAD
    print("Processing chunk %002d" % (i,))
    audio = np.frombuffer(segment, dtype=np.int16)
    # Run Deepspeech
    print('Running inference...')
    output = ds.stt(audio)
    print("Transcript: %s" % output)

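The script above, like most of the functions earlier, relies on wavSplit.vad_collector to merge voiced frames into segments. Below is a sketch of the standard py-webrtcvad implementation of that collector; note that the six-argument call in the docstringed variant above suggests a fork whose collector also takes an extra voiced/unvoiced ratio (the 0.5), which this sketch does not model.

import collections


def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """Filter out non-voiced frames and yield the voiced audio as byte segments.

    A ring buffer holding padding_duration_ms worth of frames decides when speech
    starts (mostly voiced frames in the window) and when it ends (mostly unvoiced).
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            if num_voiced > 0.9 * ring_buffer.maxlen:
                # enough voiced frames in the window: speech has started
                triggered = True
                voiced_frames.extend(f for f, _ in ring_buffer)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                # enough unvoiced frames in the window: speech has ended
                triggered = False
                yield b''.join(f.bytes for f in voiced_frames)
                ring_buffer.clear()
                voiced_frames = []
    if voiced_frames:
        yield b''.join(f.bytes for f in voiced_frames)
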
def transcribe_file(audio_path, ds, threadcount):
    audio, sample_rate, audio_length = wavSplit.read_wave(audio_path)
    segments = list(wavSplit.frame_generator(CHUNK_SIZE, audio, sample_rate))
    print("Beginning to process generated file chunks")
    inference_time = time.time()

    # make a set of queues for upstream and downstream communication
    WriteQueue = queue.Queue()
    ReadQueue = queue.Queue()
    workers = []
    threadcount = int(threadcount)
    for i in range(threadcount):
        # all chunks get the same queues and inference model
        x = ChunkWorker(WriteQueue, ReadQueue, ds)
        x.start()
        workers.append(x)

    if len(segments) <= 1:
        print("SINGLE SEGMENT LENGTH IDENTIFIED", file=sys.stderr, flush=True)
        # this can occur when the chunk size is larger than the audio file, resulting in no chunking
        # we need to do the transcription manually
        if len(segments) == 0:
            with wave.open(audio_path, 'rb') as audio_file:
                pcm = np.frombuffer(audio_file.readframes(audio_file.getnframes()), dtype=np.int16)
            output = ds.sttWithMetadata(pcm, 1)  # Run Deepspeech
            # NOTE: sttWithMetadata returns a Metadata object; it may need converting
            # to plain dicts before json.dumps can serialize it
            return json.dumps(output)

    print("Workers started...", file=sys.stderr, flush=True)
    for i, segment in enumerate(segments):
        print("Writing segment num {}".format(i), file=sys.stderr, flush=True)
        WriteQueue.put({'chunk': segment, 'index': i})
    print("All Chunks sent...", file=sys.stderr, flush=True)

    for i in range(threadcount):
        print("stopping worker {}".format(i), file=sys.stderr, flush=True)
        WriteQueue.put({"index": -1})  # send a sentinel value to all threads

    for i in range(threadcount):
        workers[i].join()  # wait for all threads to join
        print("worker {} has joined".format(i), file=sys.stderr, flush=True)

    processed_chunks = []
    for ele in list(ReadQueue.queue):
        print(ele, file=sys.stderr, flush=True)
        processed_chunks.append(ele)

    # each thread sends back both the inference result and the chunk index,
    # which is used to sort the ReadQueue contents, allowing for asynchronous processing
    processed_chunks.sort(key=lambda p: p.get('index'))

    # Because each chunk is processed discretely, each word has a time value between
    # zero and CHUNK_SIZE seconds. To associate each word with its proper time within
    # the media, and not within that per-chunk range, we add an offset to all tokens in
    # each segment - excluding the zeroth segment, which needs no adjustment.
    adjusted_tokens = []
    offset = CHUNK_SIZE / 1000
    current_offset = 0
    if len(processed_chunks) > 1:
        for chunk in processed_chunks:
            current_item = chunk.get('result')
            # if a chunk has no speech within it, such as instrumental music, it contributes no words
            if len(current_item) != 0:
                for current_word in current_item:
                    current_word['time'] = current_word['time'] + current_offset
                    adjusted_tokens.append(current_word)
            # every chunk spans CHUNK_SIZE of audio, so advance the offset even for silent chunks
            current_offset = current_offset + offset
    else:
        print("done with file; took {}".format(time.time() - inference_time),
              file=sys.stderr, flush=True)
        return json.dumps(processed_chunks[0].get('result'))

    print("done with file; took {}".format(time.time() - inference_time),
          file=sys.stderr, flush=True)
    return json.dumps(adjusted_tokens)
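transcribe_file depends on a ChunkWorker thread class and a CHUNK_SIZE constant defined elsewhere in the module. The following is a hypothetical sketch of what such a worker could look like, consistent with the queue protocol used above ({'chunk': ..., 'index': ...} jobs in, {'index': ..., 'result': [word dicts with a 'time' key]} results out, index -1 as the stop sentinel); the CHUNK_SIZE value and the word-grouping helper are assumptions, not the project's actual code.

import threading

import numpy as np


CHUNK_SIZE = 20000  # assumed chunk length in milliseconds; the real constant lives elsewhere


class ChunkWorker(threading.Thread):
    """Hypothetical worker thread: consumes {'chunk': ..., 'index': int} jobs from write_queue,
    runs DeepSpeech on each chunk, and pushes {'index': int, 'result': [word dicts]} to read_queue."""

    def __init__(self, write_queue, read_queue, ds):
        super().__init__()
        self.write_queue = write_queue
        self.read_queue = read_queue
        self.ds = ds

    def run(self):
        while True:
            job = self.write_queue.get()
            if job.get('index') == -1:  # sentinel: no more work for this worker
                break
            chunk = job['chunk']
            # frame_generator may yield Frame objects or raw bytes depending on the wavSplit version
            pcm = chunk.bytes if hasattr(chunk, 'bytes') else chunk
            audio = np.frombuffer(pcm, dtype=np.int16)
            metadata = self.ds.sttWithMetadata(audio, 1)
            words = self._words_with_times(metadata.transcripts[0])
            self.read_queue.put({'index': job['index'], 'result': words})

    @staticmethod
    def _words_with_times(transcript):
        # Collapse DeepSpeech's per-character tokens into {'word': ..., 'time': ...} dicts,
        # where 'time' is the word's start time in seconds within the chunk.
        words, current, start = [], '', 0.0
        for token in transcript.tokens:
            if token.text == ' ':
                if current:
                    words.append({'word': current, 'time': start})
                current = ''
            else:
                if not current:
                    start = token.start_time
                current += token.text
        if current:
            words.append({'word': current, 'time': start})
        return words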