def run(self):
    #global my_mutex
    global numRecordings
    global audioNum
    global processNum
    megaArray = []
    #while processNum < numRecordings:
    while goingFlag == True or processNum < audioNum:
        while processNum >= audioNum:
            y = 0
        #print processNum
        if processNum > 0:
            subprocess.check_output(['rm', wavFile])
        #jsonFile = "/home/pi/results" + `processNum` + ".json"
        wavFile = "/home/pi/talking" + `processNum` + ".wav"
        print "Processing " + wavFile
        foFile = "fo" + `processNum` + ".fo"
        pmFile = "pm" + `processNum` + ".pm"
        timeEnd = q.get()
        print timeEnd
        #print subprocess.check_output(['python', '/home/pi/detectVoiceInWav.py', wavFile, jsonFile, str(timeEnd)])
        v = VoiceActivityDetector(wavFile)
        raw_detection = v.detect_speech()
        speech_labels = v.convert_windows_to_readible_labels(raw_detection, str(timeEnd))
        megaArray.append(speech_labels)
        save_to_file(megaArray, soundName)
        #print subprocess.check_output(['./reaper/REAPER/build/reaper', '-i', wavFile, '-f', foFile, '-p', pmFile, '-a'])
        processNum = processNum + 1
def getSpeechStartTime(self, fileName):
    v = VoiceActivityDetector(fileName)
    windows = v.detect_speech()
    for i in range(0, len(windows)):
        arr = windows[i]
        # VAD breaks the wav file into 20 ms windowed chunks with a 10 ms overlap,
        # so window i starts at i * 10 ms
        if arr[len(arr) - 1] == 1:
            return i * 10
    return -1
def get_score(file_name):
    v = VoiceActivityDetector(file_name)
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    start = 0
    end = 0
    if len(speech_labels) > 0:
        start = speech_labels[0]['speech_begin']
        end = speech_labels[-1]['speech_end']
    return end - start
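# A minimal usage sketch for get_score, assuming the 'speech_begin'/'speech_end'
# values are numeric seconds; the file names below are hypothetical examples,
# not from the source.
recordings = ['take1.wav', 'take2.wav']
spans = {wav: get_score(wav) for wav in recordings}
for wav, span in spans.items():
    print('%s: %.2f s between first speech onset and last offset' % (wav, span))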
def apply_VAD(phonation_files, new_wav_path):
    # speech_HD = {0:[0,0], 2:[0,0], 3:[0,0]}
    # for wav in phonation_files:
    #     optimal_thresh = 0.5
    #     v = VoiceActivityDetector(wav)
    #     v.speech_energy_threshold = optimal_thresh
    #     frames = v.detect_speech()
    #     speech_labels = v.convert_windows_to_readible_labels(frames)
    #     # print('spkr wav', wav.split('/')[-1])
    #     # print(speech_labels)
    #     # exit()
    #     spkr_id = wav.split('/')[-1].split('_')[0]
    #     if len(speech_labels) > 1:  # if more than 1 split
    #         speech_HD[early_late_balanced[spkr_id]][1] += 1
    #     else:
    #         speech_HD[early_late_balanced[spkr_id]][0] += 1
    # print(speech_HD)
    # exit()
    for thresh_sweep in [0.25, 0.3, 0.35]:  # 0.5 too high, no speech detected for some spkrs
        for wav in phonation_files:
            optimal_thresh = thresh_sweep
            v = VoiceActivityDetector(wav)
            v.speech_energy_threshold = optimal_thresh
            frames = v.detect_speech()
            speech_labels = v.convert_windows_to_readible_labels(frames)
            # output new wav
            old_basename = wav.split('/')[-1]
            new_wav_path_dir = os.path.join(new_wav_path, str(thresh_sweep))
            if not os.path.exists(new_wav_path_dir):
                os.makedirs(new_wav_path_dir)
            out_wav = os.path.join(new_wav_path_dir, old_basename)
            # print(frames)
            # for i in frames:
            #     if i[1] > 0:
            #         print(i)
            # print('wav', wav)
            # print(speech_labels)
            # print(thresh_sweep)
            if '38717' in wav:
                print(speech_labels)
                exit()
            first_vad_instance = speech_labels[0]
            beg_t = first_vad_instance['speech_begin']
            end_t = first_vad_instance['speech_end']
            transformer = sox.Transformer()
            transformer.trim(float(beg_t), float(end_t))
            transformer.convert(samplerate=32000, n_channels=1)
            transformer.build(wav, out_wav)
def human_voice_detect(wave_file):
    # noise reduction
    denoised_wave = denoise(wave_file)
    # band pass filter
    perform_bandpass(denoised_wave)
    # run VAD
    v = VoiceActivityDetector('filtered_file.wav')
    array = v.detect_speech()
    #print array
    return parse_voice_array(array.tolist())
def run(self):
    #global my_mutex
    global numRecordings
    global audioNum
    global processNum
    megaArray = []
    currDay = 1
    fileChar = "A"
    #while processNum < numRecordings:
    while goingFlag == True or processNum < audioNum:
        while processNum >= audioNum:
            y = 0
        #print processNum
        try:
            if processNum > 0:
                subprocess.check_output(['rm', wavFile])
            #jsonFile = "/home/pi/results" + `processNum` + ".json"
            wavFile = dir_path + "/talking" + `processNum` + ".wav"
            print "Processing " + wavFile
            #foFile = "fo" + `processNum` + ".fo"
            #pmFile = "pm" + `processNum` + ".pm"
            timeEnd = q.get()
            tempDay = currDay
            currDay = qSoundJson.get()
            if tempDay != currDay:
                megaArray = []
                soundName = dir_path + "/data/day" + str(currDay) + "/" + fileChar + "/sound" + str(fileNum) + ".json"
                if fileChar == "A":
                    fileChar = "B"
                else:
                    fileChar = "A"
            print timeEnd
            #print subprocess.check_output(['python', '/home/pi/detectVoiceInWav.py', wavFile, jsonFile, str(timeEnd)])
            v = VoiceActivityDetector(wavFile)
            raw_detection = v.detect_speech()
            speech_labels = v.convert_windows_to_readible_labels(raw_detection, str(timeEnd))
            megaArray.append(speech_labels)
            save_to_file(megaArray, soundName)
            #print subprocess.check_output(['./reaper/REAPER/build/reaper', '-i', wavFile, '-f', foFile, '-p', pmFile, '-a'])
            processNum = processNum + 1
        except subprocess.CalledProcessError as e:
            logging.exception("message")
    sense.clear()
    with open("processWav.txt", "w") as text_file:
        text_file.write("Total wavs processed: {}".format(str(processNum)))
    with open("createWav.txt", "w") as t2:
        t2.write("Total wavs supposed to be processed: {}".format(str(audioNum)))
def containsSpeech(inputfile, saveToFileFlag=False, outputfile='temp.json'):
    #if __name__ == "__main__":
    #parser = argparse.ArgumentParser(description='Analyze input wave-file and save detected speech interval to json file.')
    #parser.add_argument('inputfile', metavar='INPUTWAVE', help='the full path to input wave file')
    #parser.add_argument('outputfile', metavar='OUTPUTFILE', help='the full path to output json file to save detected speech intervals')
    #args = parser.parse_args()
    v = VoiceActivityDetector(inputfile)
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    if saveToFileFlag:
        save_to_file(speech_labels, outputfile)
    return len(speech_labels) != 0
def detect_speech(input_file, output_file="results.txt"):
    try:
        v = VoiceActivityDetector(input_file)
        raw_detection = v.detect_speech()
        logger.info("Speech detection complete")
        speech_labels = v.convert_windows_to_readible_labels(raw_detection)
        speech = get_speech_duration(speech_labels)
        logger.info("File: {}, Duration of video: {}, Duration of speech: {}".format(
            input_file, v.duration, speech))
        empty_venue = (speech / v.duration) < 0.05
        save_to_file(empty_venue, output_file, True)
        logger.info("File: {}, Empty: {}".format(input_file, empty_venue))
    except Exception as e:
        save_to_file(False, output_file, False)
        logger.error("Something went wrong while trying to detect speech: {}".format(e))
def vad_trial(file):
    # Create our detector
    v = VoiceActivityDetector(file)
    # Generate VAD intervals for the file
    speech_labels = v.convert_windows_to_readible_labels(v.detect_speech())
    # Get file info
    data = mediainfo(file)
    # Add all the times together
    total_time = 0
    for speech_data in speech_labels:
        # add the length of the speech
        total_time += speech_data['speech_end'] - speech_data['speech_begin']
    percent = total_time / float(data['duration'])
    return percent
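# Hedged usage sketch: vad_trial returns the fraction of the file's duration
# covered by detected speech, so it can be used to keep only sufficiently
# "speechy" clips. The file list and the 0.3 threshold are illustrative
# assumptions, not taken from the source.
clips = ['clip_a.wav', 'clip_b.wav']
speechy_clips = [f for f in clips if vad_trial(f) > 0.3]
print(speechy_clips)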
def __init__(self, wave_input_filename, output_File, duration_of_video):
    v = VoiceActivityDetector(wave_input_filename)
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    speech_labels = [float(i) for i in speech_labels]
    if len(speech_labels) == 0:
        with open(output_File, 'w') as fp:
            fp.write(str(0.00) + " " + str(duration_of_video) + "\n")
        return
    final = []
    if speech_labels[0] != 0.00:
        final.append(0.00)
        final.append(speech_labels[0] - 0.01)
    count = 1
    length = duration_of_video
    last = len(speech_labels) - 1
    while count <= last:
        final.append(speech_labels[count] + 0.01)
        count += 1
        if count >= last:
            final.append(length)
        else:
            final.append(speech_labels[count] - 0.01)
            count += 1
    count = 1
    while count < len(final):
        # 4 seconds is the minimum limit for a caption, as a sentence can't fit in less than 4 seconds
        if final[count] - final[count - 1] < 4:
            del final[count]
            del final[count - 1]
        else:
            count += 2
    if len(final) != 0:
        with open(output_File, 'w') as fp:
            index = 0
            while index < len(final):
                fp.write(str(final[index]) + " " + str(final[index + 1]) + "\n")
                index += 2
def split_wavs(inputfile, basedir):
    v = VoiceActivityDetector(inputfile)
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    out_file_list = []
    begin = speech_labels[0]["speech_begin"]
    last_end = speech_labels[0]["speech_end"]
    valid_begin = begin
    valid_end = last_end
    num = len(speech_labels)
    i = 0
    for labels in speech_labels[1:]:
        i += 1
        end = labels["speech_end"]
        if end - begin > 59:
            valid_end = last_end
            valid_begin = begin
            print valid_begin, valid_end
            outputfile = os.path.join(basedir, "out_" + str(i) + ".wav")
            save_wav(valid_begin, valid_end, inputfile, outputfile)
            out_file_list.append(outputfile)
        elif end - begin > 50:
            valid_end = end
            valid_begin = begin
            print valid_begin, valid_end
            outputfile = os.path.join(basedir, "out_" + str(i) + ".wav")
            save_wav(valid_begin, valid_end, inputfile, outputfile)
            out_file_list.append(outputfile)
            if i < num:
                begin = speech_labels[i + 1]["speech_begin"]
    if valid_end < speech_labels[-1]["speech_end"]:
        valid_begin = begin
        valid_end = speech_labels[-1]["speech_end"]
        print begin, speech_labels[-1]["speech_end"]
        outputfile = os.path.join(basedir, "out_" + str(i) + ".wav")
        save_wav(valid_begin, valid_end, inputfile, outputfile)
        out_file_list.append(outputfile)
    return out_file_list
def main():
    fn_s = glob.glob('./wav/171219101741/*.wav')  # set the wave data directory
    now = datetime.datetime.now().strftime('%y%m%d%H%M%S')
    save_path = os.path.join('./note_annotation', now)
    os.makedirs(save_path)
    for fn_full in fn_s:
        fn, ext = os.path.splitext(fn_full)
        v = VoiceActivityDetector(fn + ext)
        # dw: detected windows; sve: sum_voice_energy_s; pe: peak energy
        dw, sve, pe = v.detect_speech()
        # return the list of speech times based on the detected window lists
        st = v.convert_windows_to_readible_labels(dw)
        data_passed_hilbert_sum = data2hilbert(v.data, v.rate)  # hilbert transform
        note_time = split_frame_hilbert_peak(dw, st, data_passed_hilbert_sum)
        # here set frame length for putting in mfcc compute to 0.1
        df_note_time = pd.DataFrame(
            np.vstack((note_time[0][:], note_time[0][:] + 0.1, note_time[2][:])).T,
            columns=["start_time", "end_time", "phrase_label"])
        df_note_time_sel = df_note_time[df_note_time.phrase_label > 0]
        df_note_time_sel.reset_index()
        df_note_time_sel.to_csv(save_path + '/' + fn.split('/')[-1] + '_phrase_peak_time.csv')
        mfcc_s_str_100_wo = ''
        for i in range(len(df_note_time_sel.index)):
            mfcc_s_100 = get_mfcc_s(v.data, v.rate,
                                    df_note_time_sel['start_time'].iloc[i],
                                    df_note_time_sel['end_time'].iloc[i],
                                    frames=0.1, overlap=0.1)
            mfcc_s_str_100_wo += mfcc2str(mfcc_s_100) + '\n'
        with open(save_path + '/' + fn.split('/')[-1] + '_100msec_no_overlap_at_peak_loc.ghmm', 'w') as f_samples_100:
            f_samples_100.write(mfcc_s_str_100_wo)
        print('done for %s' % fn)
def main():
    parser = argparse.ArgumentParser(
        description='Analyze input wave-file and save detected speech interval to json file.')
    parser.add_argument('inputfile', metavar='INPUTWAVE',
                        help='the full path to input wave file')
    parser.add_argument('outputfile', metavar='OUTPUTFILE',
                        help='the full path to output json file to save detected speech intervals')
    args = parser.parse_args()
    v = VoiceActivityDetector(args.inputfile)
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    print("")
    print("Saving Detected Vocal Ranges to disk")
    save_to_file(speech_labels, args.outputfile)
    print("Done")
def main(wave_file, txt_output):
    wr = wave.open(wave_file, 'r')
    par = list(wr.getparams())
    par[3] = 0
    filtered_wav = wave_file[:-4]
    filtered_wav = filtered_wav + "_fltrd.wav"
    ww = wave.open(filtered_wav, 'w')
    ww.setparams(tuple(par))
    lowpass = 300
    highpass = 3400
    sz = wr.getframerate()
    c = int(wr.getnframes() / sz)
    for num in range(c):
        print('Processing {}/{} s'.format(num + 1, c))
        da = np.fromstring(wr.readframes(sz), dtype=np.int16)
        left, right = da[0::2], da[1::2]  # left and right channel
        lf, rf = np.fft.rfft(left), np.fft.rfft(right)
        lf[:lowpass], rf[:lowpass] = 0, 0  # cut frequencies below the low cutoff (band-pass lower edge)
        lf[55:66], rf[55:66] = 0, 0  # line noise
        lf[highpass:], rf[highpass:] = 0, 0  # cut frequencies above the high cutoff (band-pass upper edge)
        nl, nr = np.fft.irfft(lf), np.fft.irfft(rf)
        ns = np.column_stack((nl, nr)).ravel().astype(np.int16)
        ww.writeframes(ns.tostring())
    wr.close()
    ww.close()
    v = VoiceActivityDetector(filtered_wav)
    with open(txt_output, "w") as open_file:
        array = v.detect_speech()
        open_file.write(str(array))
def recognize_speech(audiofile):
    from vad import VoiceActivityDetector
    v = VoiceActivityDetector(audiofile)
    detected = v.detect_speech()
    # detected2 = v.convert_windows_to_readible_labels(detected)
    # returns array of window numbers and speech flags (1 - speech, 0 - nonspeech)
    return detected
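# A minimal consumer sketch for the array returned above, assuming each detected
# row ends with the 0/1 speech flag (as the other snippets here index it) and
# that windows advance in 10 ms steps; 'example.wav' is a hypothetical path.
windows = recognize_speech('example.wav')
speech_windows = sum(1 for row in windows if row[-1] == 1)
print('Approximate speech time: %.2f s' % (speech_windows * 0.01))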
def main(wave_file):
    v = VoiceActivityDetector(wave_file)
    array = v.detect_speech()
    print array.tolist()
from vad import VoiceActivityDetector
import argparse
import json


def save_to_file(data, filename):
    with open(filename, 'w') as fp:
        json.dump(data, fp)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Analyze input wave-file and save detected speech interval to json file.')
    # parser.add_argument('inputfile', metavar='INPUTWAVE',
    #                     help='the full path to input wave file')
    parser.add_argument('outputfile', metavar='OUTPUTFILE',
                        help='the full path to output json file to save detected speech intervals')
    args = parser.parse_args()
    v = VoiceActivityDetector('rec_unlimited_7iwka57k.wav')
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    save_to_file(speech_labels, args.outputfile)
def save_to_file(data, filename):
    with open(filename, 'w') as fp:
        json.dump(data, fp)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Analyze input wave-file and save detected speech interval to json file.')
    # parser.add_argument('inputfile', metavar='INPUTWAVE',
    #                     help='the full path to input wave file')
    parser.add_argument('outputfile', metavar='OUTPUTFILE',
                        help='the full path to output json file to save detected speech intervals')
    args = parser.parse_args()
    # Create a speech detection object and read in the audio data
    v = VoiceActivityDetector('FILE_NAME.wav')
    # Detect speech in the given data
    raw_detection = v.detect_speech()
    # Convert the detected speech signals to a readable format
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    # Save the converted and detected speech to a file
    save_to_file(speech_labels, args.outputfile)
from vad import VoiceActivityDetector
import argparse
import json


def save_to_file(data, filename):
    with open(filename, 'w') as fp:
        json.dump(data, fp)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Analyze input wave-file and save detected speech interval to json file.')
    parser.add_argument('inputfile', metavar='INPUTWAVE',
                        help='the full path to input wave file')
    parser.add_argument('outputfile', metavar='OUTPUTFILE',
                        help='the full path to output json file to save detected speech intervals')
    args = parser.parse_args()
    v = VoiceActivityDetector(args.inputfile)
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    print(speech_labels)
    save_to_file(speech_labels, args.outputfile)
def includes_speech(file):
    v = VoiceActivityDetector(file)
    raw_detection = v.detect_speech()
    return v.includes_speech(raw_detection, thresh=3)
EAnalysis = load_model("../Emotion_analysis")
sad = np.array([["Image", "Emotion"]], dtype=object)
predit = np.array(
    [['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']],
    dtype=object)
for file in os.listdir(he.FACEDIR):
    face = os.path.join(he.FACEDIR, file)
    img = cv2.imread(face)
    lastimg = cv2.resize(img, (48, 48))
    gray_image = lastimg / 255
    out = gray_image[np.newaxis, :, :]
    predictions = EAnalysis.predict(out)
    emotion = he.getEmotion(predictions)
    sad = np.append(sad, [[face, emotion]], axis=0)
    predit = np.append(predit, predictions, axis=0)
f = open('emotion.dat', 'ab')
np.savetxt(f, sad, fmt='%s')
f.close()
x = open('prediction.dat', 'ab')
np.savetxt(x, predit, fmt='%s')
x.close()
v = VoiceActivityDetector("audio.wav")
raw_detection = v.detect_speech()
speech_labels = v.convert_windows_to_readible_labels(raw_detection)
he.save_to_file(speech_labels, "vad.json")
def main():
    datadir = './dataset/train/audio/'
    labels = {
        'yes': 0, 'no': 1, 'up': 2, 'down': 3, 'left': 4,
        'right': 5, 'on': 6, 'off': 7, 'go': 8, 'stop': 9
    }
    labels_list = np.array([
        'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'go', 'stop'
    ])
    train_in = []
    train_out = []
    test_in = []
    test_out = []
    removal_thr = 0
    vadcount = 0
    for word, label in labels.items():
        print(word)
        dirpath = datadir + word
        files = os.listdir(dirpath)
        np.random.shuffle(files)
        for i, fn in enumerate(files):
            filepath = os.path.join(dirpath, fn)
            sample_rate, samples = wavfile.read(filepath)
            #print(filepath)
            # run voice activity detector
            v = VoiceActivityDetector(filepath)
            detected_windows = v.detect_speech()
            speechtime = v.convert_windows_to_readible_labels(detected_windows)
            silence_free = np.array([])
            if len(speechtime) != 0:
                for segment in speechtime:
                    n0 = int(segment['speech_begin'] * sample_rate)
                    n1 = int(segment['speech_end'] * sample_rate)
                    silence_free = np.append(silence_free, samples[n0:n1])
            #print(len(silence_free))
            # !!! IMPORTANT: check the quality of silence removal;
            # bad results will crash the spectrogram step
            if len(silence_free) >= 3200:
                vadcount += 1
                freqs, times, spectrogram = log_specgram(silence_free, sample_rate,
                                                         window_size=10, step_size=5)
            else:
                freqs, times, spectrogram = log_specgram(samples, sample_rate,
                                                         window_size=10, step_size=5)
            #plt.pcolormesh(freqs, times, spectrogram, cmap='coolwarm')
            #plt.show()
            # average spectrogram shape is (97.3, 161); note x and y are reversed in cv2
            spectrogram = cv2.resize(spectrogram, dsize=(161, 100),
                                     interpolation=cv2.INTER_CUBIC)
            spectrogram = spectrogram.reshape(100, 161, 1)
            if i < len(files) / 5:
                test_in.append(spectrogram)
                test_out.append(one_hot_encode(label))
            else:
                train_in.append(spectrogram)
                train_out.append(one_hot_encode(label))
    train_in = np.array(train_in, dtype=np.float32)
    train_out = np.array(train_out, dtype=np.int32)
    test_in = np.array(test_in, dtype=np.float32)
    test_out = np.array(test_out, dtype=np.int32)
    print(train_in.shape)
    print(train_out.shape)
    print(test_in.shape)
    print(test_out.shape)
    print(vadcount, ' audios were processed with VAD.')
    save_filename = './mini_speech_data2.npz'
    np.savez(save_filename,
             train_in=train_in, train_out=train_out,
             test_in=test_in, test_out=test_out,
             labels=labels_list)
from pydub import AudioSegment
from vad import VoiceActivityDetector

sound = AudioSegment.from_wav('Test1-master/Test/mic1.wav')
slicedSounds = sound[::500]
slicedSounds = list(slicedSounds)
for i in range(0, len(slicedSounds)):
    slicedSounds[i].export('trash.wav', format='wav')
    obj = VoiceActivityDetector('trash.wav')
    data = obj.detect_speech()
    Found = False
    # use a separate index for the detected windows so the outer slice index is not shadowed
    for j in range(0, len(data)):
        if data[j][len(data[j]) - 1] == 1:
            speech = AudioSegment.from_wav('trash.wav')
            speech.export('speech.wav', format='wav')
            Found = True
        if Found:
            break
    if Found:
        break
child_puzzle_wav = FLAGS.child_puzzle_wav
mom_puzzle_wav = FLAGS.mom_puzzle_wav
mom_puzzle_textgrid = FLAGS.mom_puzzle_textgrid
child_outfile_textgrid = FLAGS.child_outfile_textgrid
add_seconds_at_boundary = FLAGS.add_seconds_at_boundary
child_segment_wav_outdir = FLAGS.child_segment_wav_outdir
mom_segment_wav_outdir = FLAGS.mom_segment_wav_outdir

if not os.path.exists(child_segment_wav_outdir):
    os.makedirs(child_segment_wav_outdir)
if not os.path.exists(mom_segment_wav_outdir):
    os.makedirs(mom_segment_wav_outdir)

# detect child speech parts
v = VoiceActivityDetector(child_puzzle_wav)
data = v.data
total_time = len(data) * 1.0 / v.rate
total_time = float("{0:.2f}".format(total_time))
speech_time, mom_tier = child_speech_detector(mom_puzzle_textgrid, v)

# export detected child speech segments as wav files
turns = export_child_audio_segments(total_time, child_puzzle_wav,
                                    add_seconds_at_boundary,
                                    child_segment_wav_outdir, speech_time)
total_turns = turns
tier = write_to_txtgrids('Machine-Label-CS', turns)

# modify manually annotated mom speech segments, and export the wav segments
mom_turns = export_mom_audio_segments(mom_puzzle_wav, mom_tier, mom_segment_wav_outdir)
from vad import VoiceActivityDetector
import argparse
import json


def save_to_file(data, filename):
    with open(filename, 'w') as fp:
        json.dump(data, fp)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Analyze input wave-file and save detected speech interval to json file.')
    parser.add_argument('inputfile', metavar='INPUTWAVE',
                        help='the full path to input wave file')
    parser.add_argument('outputfile', metavar='OUTPUTFILE',
                        help='the full path to output json file to save detected speech intervals')
    args = parser.parse_args()
    v = VoiceActivityDetector(args.inputfile)
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    save_to_file(speech_labels, args.outputfile)
x = AudioSegment.from_wav(fullFileName)
x = x.set_channels(1)
x.export(FILE_FOLDER + FILE_NAME + '-mono.wav', format="wav")
waveFile = wave.open(FILE_FOLDER + FILE_NAME + '-mono.wav', 'r')

# Getting some variables from file
signal = waveFile.readframes(-1)
signal = np.fromstring(signal, 'Int16')
frameRate = waveFile.getframerate()
Time = np.linspace(0, len(signal) / frameRate, num=len(signal))
totalFrames = waveFile.getnframes()
wavFileLengthSec = totalFrames / frameRate

# VOICE ACTIVITY DETECTION (VAD)
vad = VoiceActivityDetector(fullFileName)
raw_detection = vad.detect_speech()
speech_labels, starters, enders = vad.convert_windows_to_readible_labels(raw_detection)

# list of beginnings of voice detected segments (in frames)
starterFrames = [int(i) for i in starters]
# list of endings of voice detected segments (in frames)
enderFrames = [int(i) for i in enders]
# if the last voice detected segment isn't closed, use the last frame as the end of this segment
if len(starterFrames) > len(enderFrames):
    enderFrames.append(totalFrames)
# list of beginnings of voice detected segments (in milliseconds)
starterMs = [int((i / frameRate) * 1000) for i in starterFrames]
def run(self):
    #global my_mutex
    global numRecordings
    global audioNum
    global processNum
    megaArray = []
    currDay = 1
    fileChar = "A"
    wavFile = ""
    #while processNum < numRecordings:
    while goingFlag == True or processNum < audioNum:
        while processNum >= audioNum:
            y = 0
        #print processNum
        try:
            if processNum > 0:
                subprocess.check_output(['rm', wavFile])
            #jsonFile = "/home/pi/results" + `processNum` + ".json"
            wavFile = dir_path + "/talking" + `processNum` + ".wav"
            print "Processing " + wavFile
            #foFile = "fo" + `processNum` + ".fo"
            #pmFile = "pm" + `processNum` + ".pm"
            timeEnd = q.get()
            tempDay = currDay
            currDay = qSoundJson.get()
            if tempDay != currDay:
                megaArray = []
                soundName = dir_path + "/data/day" + str(currDay) + "/" + fileChar + "/sound" + str(fileNum) + ".json"
                if fileChar == "A":
                    fileChar = "B"
                else:
                    fileChar = "A"
            print timeEnd
            #print subprocess.check_output(['python', '/home/pi/detectVoiceInWav.py', wavFile, jsonFile, str(timeEnd)])
            v = VoiceActivityDetector(wavFile)
            ampData = v._calculate_amplitude(v.data)
            if np.percentile(ampData, 75) < 200:
                print "AUTO"
                speech_labels = []
                speech_dict = {}
                speech_dict['percent_time_w_speech'] = 0.0
                speech_dict['time_started'] = round(timeEnd, 3)
                speech_dict['length_of_recording'] = round(lengthRecording, 3)
                speech_dict['mean_amp'] = round(float(float(sum(ampData)) / len(ampData)), 3)
                speech_dict['med_amp'] = round(np.median(ampData), 3)
                speech_dict['25_amp'] = round(np.percentile(ampData, 25), 3)
                speech_dict['75_amp'] = round(np.percentile(ampData, 75), 3)
                speech_dict['max_amp'] = round(np.amax(ampData), 3)
                speech_dict['processed'] = 0
                speech_labels.append(speech_dict)
                print "AUTO DONE"
            else:
                print "MANUAL"
                raw_detection = v.detect_speech()
                #print raw_detection
                speech_labels = v.convert_windows_to_readible_labels(raw_detection, str(timeEnd))
                print "MANUAL DONE"
            megaArray.append(speech_labels)
            save_to_file(megaArray, soundName)
            #print subprocess.check_output(['./reaper/REAPER/build/reaper', '-i', wavFile, '-f', foFile, '-p', pmFile, '-a'])
            processNum = processNum + 1
        except subprocess.CalledProcessError as e:
            logging.exception("message")
            subprocess.check_output(['rm', wavFile])
    sense.clear()
    with open("processWav.txt", "w") as text_file:
        text_file.write("Total wavs processed: {}".format(str(processNum)))
    with open("createWav.txt", "w") as t2:
        t2.write("Total wavs supposed to be processed: {}".format(str(audioNum)))
    currTime = time.time()
    currTime = currTime - timeStart
    with open("timeTaken.txt", "w") as t3:
        t3.write("Time taken to process wavs: {}".format(str(currTime)))
i = 0
total_speech = 0
try:
    while (True):
        speech_labels = {}
        if (i < 4):
            i = i + 1
        else:
            i = 0
        wav_name = 'file' + str(i) + '.wav'
        create_wav(wav_name)
        v = VoiceActivityDetector(wav_name)
        raw_detection = v.detect_speech()
        speech_labels, speech_in_wav = v.convert_windows_to_readible_labels(raw_detection)
        if (speech_in_wav > 0.5):
            print('we have speech in ' + wav_name)
        else:
            print('we dont have speech in ' + wav_name)
        output_txt = 'testingreal' + str(i) + '.txt'
        total_speech = total_speech + speech_in_wav
        save_to_file(speech_labels, output_txt)
except KeyboardInterrupt:
    print('Total time since program started:' + str(total_speech))
    print("Done")
from vad import VoiceActivityDetector

filename = '/home/leferrae/Desktop/These/Kunwok/tour_nabagardi/20190723_085214.wav'
v = VoiceActivityDetector(filename)
v.plot_detected_speech_regions()
from vad import VoiceActivityDetector
# import json

# Input file
audiofile = "abc.wav"
plotaudiosignal = VoiceActivityDetector(audiofile)
# Plot the audio signal so that we can identify where the audio contains silence
plotaudiosignal.plot_detected_speech_regions()

# def save_to_file(data, file):
#     with open(file, 'w') as fp:
#         json.dump(data, fp)
#
#
# v = VoiceActivityDetector(filename)
# raw_detection = v.detect_speech()
# print(raw_detection)
# speech_labels = v.convert_windows_to_readible_labels(raw_detection)
#
# save_to_file(speech_labels, "bala.json")
from vad import VoiceActivityDetector
import sys

v = VoiceActivityDetector(sys.argv[1])
raw_detection = v.detect_speech()
speech_labels = v.convert_windows_to_readible_labels(raw_detection)
print(speech_labels)