def trim_long_silences(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a threshold
    determined by the VAD parameters in params.py.
    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav),
                           *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    # np.bool was removed in NumPy 1.24; the builtin bool behaves identically here
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)

    return wav[audio_mask == True]

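# Hypothetical usage sketch for trim_long_silences above, not part of the
# original source. The function expects module-level names normally imported
# from its params.py; the values below are assumptions chosen to satisfy
# webrtcvad's constraints (supported rates, 10/20/30 ms windows).
import struct

import librosa  # assumption: any loader yielding float PCM in [-1, 1] works
import numpy as np
import webrtcvad
from scipy.ndimage import binary_dilation

sampling_rate = 16000          # webrtcvad supports 8000/16000/32000/48000 Hz
vad_window_length = 30         # VAD window in ms; must be 10, 20 or 30
vad_moving_average_width = 8   # smoothing width in windows (assumed value)
vad_max_silence_length = 6     # max silence kept, in windows (assumed value)
int16_max = (2 ** 15) - 1

wav, _ = librosa.load("speech.wav", sr=sampling_rate)
trimmed = trim_long_silences(wav)
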
def main():
    RATE = 16000
    frame_duration_ms = 30
    CHUNK = int(RATE * (frame_duration_ms / 1000.0))
    FORMAT = pyaudio.paInt16
    CHANNELS = 1

    if not os.path.isdir('wavfile'):
        os.mkdir('wavfile')

    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    vad = webrtcvad.Vad(3)

    root = Tk()
    root.geometry("800x800")
    root.title('Result')
    lbl = Label(root, text="이름")  # Korean for "Name"
    lbl.config(width=50)
    lbl.config(font=("Courier", 44))
    lbl.place(relx=0.5, rely=0.5, anchor=CENTER)

    t1 = threading.Thread(target=vad_thread,
                          args=(RATE, frame_duration_ms, 300, vad, stream))
    t2 = threading.Thread(target=speaker_recog_thread, args=(lbl, ))
    t1.daemon = True
    t2.daemon = True
    t1.start()
    t2.start()

    try:
        root.mainloop()
    except:
        pass
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()

def main(args):
    if len(args) != 2:
        sys.stderr.write('Usage: example.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    audio, sample_rate = read_wave(args[1])
    # audio, sample_rate = read_m4a(args[1])
    # audio, sample_rate = read_libri(args[1])
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    total_wav = b""
    for i, segment in enumerate(segments):
        total_wav += segment
    path = 'test.wav'
    write_wave(path, total_wav, sample_rate)

def detect_audio(audio_file_name):
    return_dict = {}
    audio, sample_rate = read_wave(os.path.join("static", audio_file_name))
    vad = webrtcvad.Vad(2)
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    number = 0
    for i, segment in enumerate(segments):
        number += 1
    return_dict['count'] = number
    if number > 0:
        return_dict['msg'] = "Voice detected"
        return_dict['code'] = "D"
    else:
        return_dict['msg'] = "Voice not detected"
        return_dict['code'] = "ND"
    return return_dict

def detect_speech(signal, sample_rate, window_size, aggressiveness, window_size_dilate=None):
    # signal is expected to be an int16 tensor of shape (channels, samples),
    # since bytearray(chunk.numpy()) must yield raw 16-bit PCM for webrtcvad
    vad = webrtcvad.Vad(aggressiveness)
    frame_len = int(window_size * sample_rate)
    speech = torch.as_tensor([[len(chunk) == frame_len
                               and vad.is_speech(bytearray(chunk.numpy()), sample_rate)
                               for chunk in channel.split(frame_len)]
                              for channel in signal])
    # if window_size_dilate is not None:
    #     kernel_size = int(window_size_dilate / window_size)
    #     speech = F.max_pool1d(speech.unsqueeze(1).float(), stride=1,
    #                           kernel_size=kernel_size,
    #                           padding=kernel_size // 2).squeeze(1).to(speech.dtype)
    return speech.repeat_interleave(frame_len, dim=-1)[:, :signal.shape[1]]

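# Hypothetical usage sketch for detect_speech above, not part of the original
# source. It assumes a file at a webrtcvad-supported rate and an int16
# (channels, samples) tensor.
import torch
import torchaudio  # assumption: any loader producing (channels, samples) works

waveform, sample_rate = torchaudio.load("speech.wav")  # float32 in [-1, 1]
pcm16 = (waveform * 32767).to(torch.int16)             # raw 16-bit PCM samples
mask = detect_speech(pcm16, sample_rate, window_size=0.03, aggressiveness=2)
print(mask.shape)  # per-sample boolean speech mask, one row per channel
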
def apply_webrtc_vad(signal, sample_rate, frame_duration, agressiveness=3):
    vad = webrtcvad.Vad(agressiveness)
    # np.int was removed in NumPy 1.24; the builtin int behaves identically here
    frame_size = int(sample_rate * frame_duration / 1000)
    nb_frames = int(len(signal) / frame_size)
    signal_clean_vad = []
    no_speech = []
    for i in range(0, nb_frames):
        if vad.is_speech(signal[i * frame_size:(i + 1) * frame_size], sample_rate):
            signal_clean_vad = np.append(signal_clean_vad,
                                         signal[i * frame_size:(i + 1) * frame_size - 1])
            no_speech = np.append(no_speech, np.zeros(frame_size))
        else:
            no_speech = np.append(no_speech, np.ones(frame_size))
    signal_clean_vad = np.divide(signal_clean_vad, max(signal_clean_vad))
    max_signal = max(signal)
    for i in range(0, len(no_speech)):
        no_speech[i] = no_speech[i] * max_signal
    return signal_clean_vad, no_speech

def main(args):
    if len(args) != 2:
        sys.stderr.write('Usage: vad.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    audio, sample_rate = read_wave(args[1])
    vad = webrtcvad.Vad(int(args[0]))
    # frames = frame_generator(30, audio, sample_rate)
    frames = frame_generator(10, audio, sample_rate)  # increase resolution from 30 ms to 10 ms
    frames = list(frames)
    # segments = vad_collector(sample_rate, 30, 300, vad, frames)
    segments = vad_collector(sample_rate, 10, 300, vad, frames)  # 10 ms frames; padding kept at 300 ms
    for i, segment in enumerate(segments):
        path = 'C:\\Users\\rober\\Documents\\Projects\\vad_dataset\\chunk-%002d.wav' % (i, )
        print(' Writing %s' % (path, ))
        write_wave(path, segment, sample_rate)

def init(_recording_dir, _callback_func):
    # a single global statement avoids declaring recording_dir twice
    global vad, pa, stream, recording_dir, callback_func
    recording_dir = _recording_dir
    callback_func = _callback_func

    vad = webrtcvad.Vad(0)

    pa = pyaudio.PyAudio()
    stream = pa.open(format=FORMAT,
                     channels=CHANNELS,
                     rate=RATE,
                     input=True,
                     start=False,
                     # input_device_index=2,
                     frames_per_buffer=CHUNK_SIZE)

def __init__(self, config, audio_files=[]):
    # empty list
    self.audio_files = audio_files
    # create voice activity detector with aggressiveness of 3
    self.vad = webrtcvad.Vad(3)
    # config parameters, explained in its respective file
    print(config)
    self._PATH_IN = config.IN_PATH
    self._PATH_OUT = config.OUT_PATH
    self._FORMAT_IN = config.FORMAT_IN
    self._FORMAT_OUT = config.FORMAT_OUT
    self._PREFIX = config.PREFIX
    self._NUM_SAMPLES = config.NUM_SAMPLES
    self._MAX_NUM_SPEAKER = config.MAX_NUM_SPEAKER
    self._MIXTURE_DURATION = config.CLIP_DURATION

def vad_file(self, filename):
    print(f"Starting {filename}")
    t0 = time.time()
    input_path = os.path.join(self.input_folder, filename)
    audio, sample_rate = read_wave(input_path)
    vad = webrtcvad.Vad(self.aggressivity)
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)

    # Collect the voiced segments as bytes and join them into one clip
    concataudio = [segment for segment in segments]
    joinedaudio = b"".join(concataudio)

    # the output keeps the input's .wav filename
    output_path = os.path.join(self.output_folder, filename)
    write_wave(output_path, joinedaudio, sample_rate)
    print(f"{filename} done in {time.time()-t0:.3f}s")

def __init__(self,
             vad=webrtcvad.Vad(2),
             filter_order=1,
             filter_frequency=0.0002,
             threshold=0.2):
    """
    :param vad: webrtcvad.Vad object
    :param filter_order: order of the Butterworth lowpass filter
    :param filter_frequency: critical frequency of the lowpass filter
    :param threshold: decision threshold applied to the filtered VAD output
    """
    b, a = butter(filter_order, filter_frequency, 'low')
    self.vad = vad
    self.threshold = threshold
    self.filter_a = a
    self.filter_b = b
    self.frame_length_ms = 20
    self.sample_rate = 8000

def main(args):
    if len(args) != 2:
        sys.stderr.write('Usage: VAD_tool.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    audio, sample_rate = read_wave(args[1])
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    temp = b''
    for i, segment in enumerate(segments):
        path = 'chunk-%002d.wav' % (i, )
        # print(' Writing %s' % (path,))
        # print(segment)
        temp += segment
    write_wave(args[1][:len(args[1]) - 4] + '_vad.wav', temp, sample_rate)

def main():
    vad = webrtcvad.Vad(3)

    speech_count = 0
    chunks = []
    doa_chunks = int(DOA_FRAMES / VAD_FRAMES)

    try:
        with MicArray(RATE, CHANNELS, RATE * VAD_FRAMES / 1000) as mic:
            audInstance = mic.pyaudio_instance
            for chunk in mic.read_chunks():
                # ndarray.tostring() was removed in NumPy; tobytes() is equivalent
                wavframes.append(chunk.tobytes())
                # Use single channel audio to detect voice activity
                if vad.is_speech(chunk[0::CHANNELS].tobytes(), RATE):
                    speech_count += 1
                chunks.append(chunk)
                if len(chunks) == doa_chunks:
                    if speech_count > (doa_chunks / 2):
                        frames = np.concatenate(chunks)
                        direction = mic.get_direction(frames)
                        show(direction)
                        now = datetime.datetime.now()
                        file.write('{},{}\n'.format(now.strftime("%H:%M:%S %d-%m-%Y"),
                                                    int(direction)))
                        print('\n{},{}'.format(now.strftime("%H:%M:%S %d-%m-%Y"),
                                               int(direction)))
                    speech_count = 0
                    chunks = []
    except KeyboardInterrupt:
        file.close()
        wav = wave.open('session.wav', 'wb')
        wav.setnchannels(CHANNELS)
        wav.setsampwidth(audInstance.get_sample_size(pyaudio.paInt16))
        wav.setframerate(RATE)
        wav.writeframes(b''.join(wavframes))
        wav.close()
        print(" Audio recording is saved in file: session.wav")
        print(" Direction of arrival recorded in file: speaking.csv")
        print("Good Bye.....")

def vad_split(audio, rate, frame_duration, aggressiveness=1):
    """Splits the audio into audio segments on non-speech frames.

    Args:
        audio: A numpy ndarray, which has 1 dimension and values within
               -1.0 to 1.0 (inclusive)
        rate: An integer, which is the rate at which samples are taken
        frame_duration: A float, which is the duration of each frame to check
        aggressiveness: An integer between 0 and 3, which is how aggressively
                        non-speech frames are filtered out

    Returns:
        A list of 1-D numpy ndarrays of int16 samples, one per voiced segment
        (the whole converted audio if no speech is found)
    """
    assert rate in (8000, 16000, 32000, 48000), (
        'Invalid Rate, use 8000, 16000, 32000, or 48000')
    assert frame_duration in (.01, .02, .03), (
        'Invalid frame_dur, use .01, .02, .03')
    assert 0 <= aggressiveness <= 3, (
        'Invalid aggressiveness, must be between 0 and 3')

    audio = (audio * np.iinfo('int16').max).astype('int16')
    vad = webrtcvad.Vad(aggressiveness)
    frame_size = int(rate * frame_duration)
    offset = 0
    off = True
    voiced_frames = []
    while offset + frame_size < len(audio):
        frame = audio[offset:offset + frame_size]
        if vad.is_speech(frame.tobytes(), rate):
            if off is True:
                off = False
                voiced_frames.append([frame])
            else:
                voiced_frames[-1].append(frame)
        else:
            off = True
        offset += frame_size
    if len(voiced_frames) == 0:
        return np.array([audio])
    for ndx in range(len(voiced_frames)):
        voiced_frames[ndx] = np.hstack(voiced_frames[ndx])
    return voiced_frames

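# Hypothetical usage sketch for vad_split above, not part of the original
# source; the filename is illustrative and mono audio is assumed.
import numpy as np
import soundfile as sf  # assumption: any reader yielding floats in [-1, 1]

audio, rate = sf.read("speech.wav")  # rate must be 8, 16, 32 or 48 kHz
segments = vad_split(audio, rate, frame_duration=0.03, aggressiveness=2)
print(f"{len(segments)} voiced segments")
for seg in segments:
    print(f"{len(seg) / rate:.2f}s")  # segments come back as int16 samples
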
def record_vad(filePath="speech.wav", speechCount=40):
    """
    During testing, bird calls were also found to trigger detection;
    requiring 200-300 ms of continuous speech can help.
    :return:
    """
    audio = pyaudio.PyAudio()
    vad = webrtcvad.Vad()
    vad.set_mode(0)
    frame = []
    framesNum = 16000 * 20 // 1000
    stream = audio.open(format=paInt16,
                        channels=channels,
                        rate=framerate,
                        input=True,
                        frames_per_buffer=framesNum)
    stream.start_stream()
    print("Recording started")
    count = 0
    speechnum = 0
    while not (speechnum > speechCount and count >= 10):
        if count > 60:  # no speech for a long time; give up
            return -2
        data = stream.read(framesNum)
        frame.append(data)
        isSpeech = vad.is_speech(data, framerate)
        if isSpeech:
            count = 0
            speechnum += 1
        else:
            count += 1
        print("current speech:{}".format(isSpeech))
    stream.stop_stream()
    audio.terminate()
    with wave.open(filePath, "wb") as f:
        f.setframerate(framerate)
        f.setnchannels(channels)
        f.setsampwidth(audio.get_sample_size(paInt16))
        f.writeframes(b"".join(frame))
    print("Recording finished")
    pcm_path = filePath.split("wav")[0] + "pcm"  # derive the .pcm filename
    wav2pcm.wav2pcm(filePath, pcm_path)
    return 0

def home():
    global model
    model_path = './flask_app/static/h5_file/CRNN_04_epochs70_adam_CE_batch1_lr1e-05.pth.tar'
    if request.method == 'POST':
        # members = ['one', 'four', 'five', 'two', 'three', 'six']
        members = ['five', 'three', 'six', 'four', 'one', 'two']
        real = ['Ryan', 'Rick', 'Yanbo', 'Hsiaoen', 'Kunyu', 'Joyee']
        # real = ['Kunyu', 'Hsiaoen', 'Ryan', 'Joyee', 'Rick', 'Yanbo']
        # index = random.randint(0, len(members) - 1)
        # name = members[index]
        from flask_app.static.CRNN import CRNN_04 as model
        model = load_model(model, model_path)
        audio, sample_rate = read_wave('./flask_app/static/wav_file/predict.wav')
        vad = webrtcvad.Vad(1)
        frames = frame_generator(30, audio, sample_rate)
        frames = list(frames)
        segments = vad_collector(sample_rate, 30, 300, vad, frames)
        for segment in segments:
            path = './flask_app/static/wav_file/after_vad.wav'
            write_wave(path, segment, sample_rate)
        noisy, sr = librosa.load('./flask_app/static/wav_file/after_vad.wav',
                                 sr=16000, mono=True)
        noisy = noisy / max(abs(noisy))
        MFCC_fea = transforms.MFCC(16000, melkwargs={
            'n_fft': 512,
            'hop_length': 160
        })(torch.from_numpy(noisy)).squeeze().t()
        MFCC_fea = MFCC_fea.unsqueeze(0)
        pred = model(MFCC_fea).max(1)[1].numpy()
        # noisy, sr = librosa.load('./flask_app/static/wav_file/predict.wav', sr=16000, mono=True)
        # noisy = noisy[13000:26440]
        # mfcc = librosa.feature.mfcc(y=noisy, sr=sr, n_mfcc=40, dct_type=2, hop_length=256, n_fft=512, center=False)
        # fea = mfcc.transpose()
        # test = np.reshape(fea, (1, 51, -1, 1))
        # model = load_model('./flask_app/static/h5_file/model.43-0.04.h5')
        # pre = model.predict(test)
        name = members[int(pred)]
        real_name = real[int(pred)]
        return render_template('home.html', name=name, real_name=real_name)
    return render_template('home.html', name='')

def test_leak(self):
    sound, fs = self._load_wave('leak-test.wav')
    frame_ms = 0.010
    frame_len = int(round(fs * frame_ms))
    n = int(len(sound) / (2 * frame_len))
    nrepeats = 1000
    vad = webrtcvad.Vad(3)
    used_memory_before = memory_usage(-1)[0]
    for counter in range(nrepeats):
        find_voice = False
        for frame_ind in range(n):
            slice_start = frame_ind * 2 * frame_len
            slice_end = (frame_ind + 1) * 2 * frame_len
            if vad.is_speech(sound[slice_start:slice_end], fs):
                find_voice = True
        self.assertTrue(find_voice)
    used_memory_after = memory_usage(-1)[0]
    self.assertGreaterEqual(used_memory_before / 5.0,
                            used_memory_after - used_memory_before)

def main(args):
    if len(args) != 2:
        sys.stderr.write('Usage: example.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    audio, sample_rate = read_wave(args[1])
    # print(len(audio))
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    # print(len(segments))
    # print(segments[0])
    for i, segment in enumerate(segments):
        print(len(segment))
        # print(segment)
        path = 'chunk-%002d.wav' % (i, )
        print(' Writing %s' % (path, ))
        write_wave(path, segment, sample_rate)

def initialize(self):
    """Initialize a Hermes audio recorder."""
    self.logger.debug('Probing for available input devices...')
    for index in range(self.audio.get_device_count()):
        device = self.audio.get_device_info_by_index(index)
        name = device['name']
        channels = device['maxInputChannels']
        if channels:
            self.logger.debug('[%d] %s', index, name)

    try:
        self.audio_in = self.audio.get_default_input_device_info()['name']
    except OSError:
        raise NoDefaultAudioDeviceError('input')
    self.logger.info('Connected to audio input %s.', self.audio_in)

    if self.config.vad.enabled:
        self.logger.info('Voice Activity Detection enabled with mode %s.',
                         self.config.vad.mode)
        self.vad = webrtcvad.Vad(self.config.vad.mode)

def brain(df_list, agg, frame, padding, output):
    Process = os.getpid()
    try:
        for i in range(len(df_list)):
            abs_wav_files = df_list.values[i][0]
            abs_filename = os.path.splitext(os.path.basename(abs_wav_files))[0]
            audio, sample_rate = read_wave(abs_wav_files)
            vad = webrtcvad.Vad(int(agg))
            frames = frame_generator(frame, audio, sample_rate)
            frames = list(frames)
            segments = vad_collector(sample_rate, frame, padding, vad, frames)
            for j, segment in enumerate(segments):
                path = os.path.join(output, abs_filename) + '-%0002d.wav' % (j + 1, )
                print("Process {}: Created file {}".format(Process, path))
                write_wave(path, segment, sample_rate)
    except EOFError:
        # the original referenced an undefined abs_wav_files_filename here
        print("Empty file {}".format(abs_wav_files))

def vad_process(path, dataset):
    # VAD Process
    if dataset == "vox1":
        audio, sample_rate = read_wave(path)
    elif dataset == "vox2":
        audio, sample_rate = read_m4a(path)
    elif dataset == "librispeech":
        audio, sample_rate = read_libri(path)
    vad = webrtcvad.Vad(1)
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    total_wav = b""
    for i, segment in enumerate(segments):
        total_wav += segment
    # Without writing, unpack total_wav into a numpy [N, 1] array
    wav_arr = np.frombuffer(total_wav, dtype=np.int16)
    # print("read audio data from byte string. np array of shape:" + str(wav_arr.shape))
    return wav_arr, sample_rate

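# Hypothetical usage sketch for vad_process above, not part of the original
# source; the path is illustrative.
wav_arr, sr = vad_process("vox1/id10001/utt1.wav", dataset="vox1")
print(f"{len(wav_arr) / sr:.2f}s of voiced audio at {sr} Hz")
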
def __init__(self,
             aggressiveness=2,
             sample_rate=SAMPLE_RATE,
             min_utt_length=MIN_UTT_LENGTH,
             max_utt_length=MAX_UTT_LENGTH,
             max_utt_gap=MAX_UTT_GAP):
    self.sample_rate = sample_rate
    self.vad = webrtcvad.Vad()
    self.vad.set_mode(aggressiveness)
    self.state = STATE_IDLE
    self.buf = []
    self.buf_sent = 0
    self.min_buf_entries = int(min_utt_length * 1000) / BUFFER_DURATION
    self.max_buf_entries = int(max_utt_length * 1000) / BUFFER_DURATION
    self.max_gap = int(max_utt_gap * 1000) / BUFFER_DURATION

def detect_voices(path,
                  convert=True,
                  agg=3,
                  save_samples=False,
                  voices_folder='voices',
                  int16_folder='int16'):
    """
    path: path of the file to detect voices on. If it is not int16 PCM it
        must be converted first.
    convert: if True, convert the file to an int16 version, save it in
        int16_folder, and run the rest of the function using that file.
    agg: the aggressiveness of the voice detection. 3 is most aggressive.
        I suggest 3 all the time. (https://github.com/wiseman/py-webrtcvad)
    save_samples: if you want to save a file of all voices to make sure they
        are actually voices.
    voices_folder: where you want to save the tagged audio (voices).
    int16_folder: where you want to save the int16 version used for detecting voices.
    """
    print('detecting voices')
    if convert:
        # change path to the path of the converted file
        path = convert_file_format(path, int16_folder)
    audio, sample_rate = read_wave(path)
    vad = webrtcvad.Vad(int(agg))  # https://github.com/wiseman/py-webrtcvad
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)  # list of 30 ms frames
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    bytes = b''
    timestamps = []
    for segment in segments:
        timestamps.append([segment[0].timestamp, segment[-1].timestamp])
        if save_samples:
            for s in segment:
                bytes += s.bytes
    if save_samples:
        if not os.path.exists(voices_folder):
            os.makedirs(voices_folder)
        basename = os.path.basename(path).split('.')[0]
        voices_path = voices_folder + '/' + basename + '_voices.wav'
        # the voices file can be used to listen to the tagged audio
        write_wave(voices_path, bytes, sample_rate)
    return timestamps

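# Hypothetical usage sketch for detect_voices above, not part of the original
# source; paths and folder names are illustrative.
stamps = detect_voices('recordings/meeting.wav', convert=True, agg=3,
                       save_samples=True, voices_folder='voices',
                       int16_folder='int16')
for start, end in stamps:
    print(f"voice from {start:.2f}s to {end:.2f}s")
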
def __init__(self) -> None:
    self.count = 0  # for interval
    self.is_run = True
    self.stream = None
    with tempfile.NamedTemporaryFile() as f:
        self.path = f.name  # the temp file is deleted on exit; only its name is kept
    self.vad_mode = 3
    self.vad = webrtcvad.Vad(self.vad_mode)
    self.pipeline = torch.hub.load('pyannote/pyannote-audio', 'sad_ami', pipeline=True)
    self.rate = 16000
    self.chunk_duration_ms = 30  # supports 10, 20 and 30 (ms)
    self.chunk_size = int(self.rate * self.chunk_duration_ms / 1000)  # chunk to read
    self.voiced_frames_rate = 0.9
    self.unvoiced_frames_rate = 0.9
    self.max_voiced_frames = 100
    self.leave = False

def main(args):
    if len(args) != 2:
        sys.stderr.write('Usage: example.py <out_dir> <path to wav file>\n')
        sys.exit(1)
    audio, sample_rate = read_wave(args[1])
    out_dir = args[0]
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)
    vad = webrtcvad.Vad(3)
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    # print(len(frames))
    segments = vad_collector(sample_rate, 30, 900, vad, frames)
    for i, segment in enumerate(segments):
        path = os.path.join(out_dir, 'chunk-%0003d-%d.wav' % (i, segment[1]))
        # print(segment[1])
        # print(' Writing %s' % (path,))
        write_wave(path, segment[0], sample_rate)

def main(args):
    if len(args) != 2:
        sys.stderr.write('Usage: example.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    audio, sample_rate = read_wave(args[1])
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    if not os.path.isdir(args[1] + '_splitted'):
        os.mkdir(args[1] + '_splitted')
    for i, segment in enumerate(segments):
        path = args[1] + '_splitted/chunk-%002d.wav' % (i, )
        print(' Writing %s' % (path, ))
        write_wave(path, segment, sample_rate)
    print("\n")

def main(args):
    if len(args) != 2:
        sys.stderr.write('Usage: silenceremove.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    # the original unpacked into pcm_data but then used an undefined name audio
    audio, sample_rate, num_channels, sample_width = read_wave(args[1])

    # Aggressiveness mode: an integer between 0 and 3.
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)

    # Collect the voiced segments as bytes and join them into one clip
    concataudio = [segment for segment in segments]
    joinedaudio = b"".join(concataudio)
    write_wave("Non-Silenced-Audio.wav", joinedaudio, sample_rate)

def filter_voice(signal, sample_rate, mode=3, threshold_voice=80):
    signal = np.array(signal, dtype=np.int16)
    signal = np.ascontiguousarray(signal)
    vad = webrtcvad.Vad(mode)
    frames = frame_generator(10, signal, sample_rate)
    frames = list(frames)
    if len(frames) == 0:
        return 0
    match = 0
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        if is_speech:
            match += 1
    percentage_voice = match * 100 / len(frames)
    return percentage_voice > threshold_voice

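# Hypothetical usage sketch for filter_voice above, not part of the original
# source. frame_generator is assumed to accept the int16 sample array, as the
# function itself passes one.
import numpy as np
import soundfile as sf  # assumption: any reader returning floats in [-1, 1]

samples, rate = sf.read("speech.wav")  # rate must be 8, 16, 32 or 48 kHz
pcm16 = (samples * 32767).astype(np.int16)
if filter_voice(pcm16, rate, mode=3, threshold_voice=80):
    print("mostly voiced")
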
def main(args):
    if len(args) != 2:
        sys.stderr.write('Usage: example.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    fn = args[1]
    fn_base = os.path.splitext(fn)[0].split('/')[-1]
    audio, sample_rate = read_wave(fn)
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    for i, segment in enumerate(segments):
        path = '%s-chunk-%002d.wav' % (fn_base, i, )
        print(' Writing %s' % (path, ))
        write_wave(path, segment, sample_rate, stt=0)

def remove_silence(wav, sr=_sr, max_silence_ms=20):
    """
    Remove silences from speech.
    :param wav: waveform
    :param sr: sample rate of the input
    :param max_silence_ms: maximum silence to keep, in ms
    :return: waveform with silences removed
    """
    # Compute the voice detection window size
    wav = librosa.resample(wav, orig_sr=sr, target_sr=_sr)
    vad_window_length = 20
    vad_moving_average_width = 10
    samples_per_window = (vad_window_length * _sr) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav),
                           *(np.round(wav * _int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=_sr))
    voice_flags = np.array(voice_flags)
    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    # np.bool was removed in NumPy 1.24; the builtin bool behaves identically here
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(max_silence_ms + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)
    out = wav[audio_mask == True]
    out = librosa.resample(out, orig_sr=_sr, target_sr=sr)
    return out
