Example #1
    def run(self):
        #global my_mutex
        global numRecordings
        global audioNum
        global processNum

        megaArray = []
        #while processNum < numRecordings:
        while goingFlag == True or processNum < audioNum:
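            # busy-wait until audioNum (incremented elsewhere) exceeds processNum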
            while processNum >= audioNum:
                y = 0
            #print processNum
            if processNum > 0:
                subprocess.check_output(['rm', wavFile])
            #jsonFile = "/home/pi/results" + `processNum` + ".json"
            wavFile = "/home/pi/talking" + ` processNum ` + ".wav"
            print "Processing " + wavFile
            foFile = "fo" + ` processNum ` + ".fo"
            pmFile = "pm" + ` processNum ` + ".pm"
            timeEnd = q.get()
            print timeEnd
            #print subprocess.check_output(['python', '/home/pi/detectVoiceInWav.py', wavFile, jsonFile, str(timeEnd)])
            v = VoiceActivityDetector(wavFile)
            raw_detection = v.detect_speech()
            speech_labels = v.convert_windows_to_readible_labels(
                raw_detection, str(timeEnd))
            megaArray.append(speech_labels)
            save_to_file(megaArray, soundName)
            #print subprocess.check_output(['./reaper/REAPER/build/reaper', '-i', wavFile, '-f', foFile, '-p', pmFile, '-a'])
            processNum = processNum + 1
Example #2
 def getSpeechStartTime(self, fileName):
     v = VoiceActivityDetector(fileName)
     windows = v.detect_speech()
     for i in range(0, len(windows)):
         arr = windows[i]
         if arr[len(arr) - 1] == 1:
             # VAD breaks the wav file into 20 ms windowed chunks with a 10 ms overlap
             return i * 10 
     return -1 
Example #3
def get_score(file_name):
    v = VoiceActivityDetector(file_name)
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
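    # score: time span from the first detected speech onset to the last detected offset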
    
    start = 0
    end = 0
    if len(speech_labels) > 0:
        start = speech_labels[0]['speech_begin']
        end = speech_labels[-1]['speech_end']
    
    return end - start
Example #4
def apply_VAD(phonation_files, new_wav_path):
    # speech_HD = {0:[0,0], 2:[0,0], 3:[0,0]}
    # for wav in phonation_files:
    #     optimal_thresh = 0.5
    #     v = VoiceActivityDetector(wav)
    #     v.speech_energy_threshold = optimal_thresh
    #     frames = v.detect_speech()
    #     speech_labels = v.convert_windows_to_readible_labels(frames)
    #     # print('spkr wav', wav.split('/')[-1])
    #     # print(speech_labels)
    #     # exit()
    #     spkr_id = wav.split('/')[-1].split('_')[0]
    #     if len(speech_labels) > 1: #if more than 1 split
    #         speech_HD[early_late_balanced[spkr_id]][1]+=1
    #     else:
    #         speech_HD[early_late_balanced[spkr_id]][0]+=1
    # print(speech_HD)
    # exit()

    for thresh_sweep in [0.25, 0.3, 0.35]: #0.5 too high, no speech detected for some spkrs
        for wav in phonation_files:
            optimal_thresh = thresh_sweep
            v = VoiceActivityDetector(wav)
            v.speech_energy_threshold = optimal_thresh
            frames = v.detect_speech()
            speech_labels = v.convert_windows_to_readible_labels(frames)

            #output new wav
            old_basename = wav.split('/')[-1]
            new_wav_path_dir = os.path.join(new_wav_path, str(thresh_sweep))
            if not os.path.exists(new_wav_path_dir):
                os.makedirs(new_wav_path_dir)
            out_wav = os.path.join(new_wav_path_dir, old_basename)

            # print(frames)
            # for i in frames:
            #     if i[1] > 0:
            #         print(i)
            # print('wav', wav)
            # print(speech_labels)
            # print(thresh_sweep)
            if '38717' in wav:
                print(speech_labels)
                exit()
            first_vad_instance = speech_labels[0]
            beg_t = first_vad_instance['speech_begin']
            end_t = first_vad_instance['speech_end']

            transformer = sox.Transformer()
            transformer.trim(float(beg_t),float(end_t))
            transformer.convert(samplerate=32000, n_channels=1)
            transformer.build(wav, out_wav)
Example #5
def human_voice_detect(wave_file):
    # noise reduction
    denoised_wave = denoise(wave_file)

    # band pass filter
    perform_bandpass(denoised_wave)

    # run VAD
    v = VoiceActivityDetector('filtered_file.wav')
    array = v.detect_speech()
    #print array

    return parse_voice_array(array.tolist())
Example #6
 def run(self):
     #global my_mutex
     global numRecordings
     global audioNum
     global processNum
      
     megaArray = []
     currDay = 1
     fileChar = "A"
     #while processNum < numRecordings:
     while goingFlag == True or processNum < audioNum:
         while processNum >= audioNum:
            y=0
         #print processNum
         try:
             if processNum > 0:
                 subprocess.check_output(['rm', wavFile])
             #jsonFile = "/home/pi/results" + `processNum` + ".json"
             wavFile = dir_path + "/talking" + `processNum` + ".wav"
             print "Processing " + wavFile
             #foFile = "fo" + `processNum` + ".fo"
             #pmFile = "pm" + `processNum` + ".pm"
             timeEnd = q.get()
             tempDay = currDay
             currDay = qSoundJson.get()
             if tempDay != currDay:
                 megaArray = []
             soundName = dir_path + "/data/day" + str(currDay) + "/" + fileChar + "/sound" + str(fileNum) + ".json"
             if fileChar == "A":
                 fileChar = "B"
             else:
                 fileChar = "A"
             print timeEnd
             #print subprocess.check_output(['python', '/home/pi/detectVoiceInWav.py', wavFile, jsonFile, str(timeEnd)])
             v = VoiceActivityDetector(wavFile)
             raw_detection = v.detect_speech()
             speech_labels = v.convert_windows_to_readible_labels(raw_detection, str(timeEnd))
             megaArray.append(speech_labels)
             save_to_file(megaArray, soundName)
             #print subprocess.check_output(['./reaper/REAPER/build/reaper', '-i', wavFile, '-f', foFile, '-p', pmFile, '-a'])
             processNum = processNum + 1
         
         except subprocess.CalledProcessError as e:
             logging.exception("message")
     sense.clear()
     with open("processWav.txt", "w") as text_file:
         text_file.write("Total wavs processed: {}".format(str(processNum)))
     with open("createWav.txt", "w") as t2:
         t2.write("Total wavs supposed to be processed: {}".format(str(audioNum)))
Example #7
def containsSpeech(inputfile, saveToFileFlag=False, outputfile='temp.json'):
    #if __name__ == "__main__":
    #parser = argparse.ArgumentParser(description='Analyze input wave-file and save detected speech interval to json file.')
    #parser.add_argument('inputfile', metavar='INPUTWAVE',help='the full path to input wave file')
    #parser.add_argument('outputfile', metavar='OUTPUTFILE',help='the full path to output json file to save detected speech intervals')
    #args = parser.parse_args()
    v = VoiceActivityDetector(inputfile)
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    if (saveToFileFlag):
        save_to_file(speech_labels, outputfile)
    if (len(speech_labels) != 0):
        return True
    else:
        return False
Example #8
def detect_speech(input_file, output_file="results.txt"):
    try:
        v = VoiceActivityDetector(input_file)
        raw_detection = v.detect_speech()
        logger.info("Speech detection complete")
        speech_labels = v.convert_windows_to_readible_labels(raw_detection)
        speech = get_speech_duration(speech_labels)
        logger.info(
            "File: {}, Duration of video: {}, Duration of speech: {}".format(
                input_file, v.duration, speech))
        empty_venue = (speech / v.duration) < 0.05
        save_to_file(empty_venue, output_file, True)
        logger.info("File: {}, Empty: {}".format(input_file, empty_venue))
    except Exception as e:
        save_to_file(False, output_file, False)
        logger.error(
            "Something went wrong while trying to detect speech: {}".format(e))
Example #9
def vad_trial(file):
    # Create our detector
    v = VoiceActivityDetector(file)
    # Generate VAD intervals for the file
    speech_labels = v.convert_windows_to_readible_labels(v.detect_speech())

    # Get file info
    data = mediainfo(file)

    # Add all the times together
    total_time = 0
    for speech_data in speech_labels:
        # add the length of the speech
        total_time += speech_data['speech_end'] - speech_data['speech_begin']

    percent = total_time / float(data['duration'])

    return percent
Example #10
    def __init__(self, wave_input_filename, output_File, duration_of_video):
        v = VoiceActivityDetector(wave_input_filename)
        raw_detection = v.detect_speech()
        speech_labels = v.convert_windows_to_readible_labels(raw_detection)
        speech_labels = [float(i) for i in speech_labels]
        if len(speech_labels) == 0:
            with open(output_File, 'w') as fp:
                fp.write(str(0.00) + " " + str(duration_of_video) + "\n")
                return

        final = []
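        # collect (start, end) pairs for the gaps between detected speech segments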
        if speech_labels[0] != 0.00:
            final.append(0.00)
            final.append(speech_labels[0] - 0.01)
        count = 1
        length = duration_of_video
        last = len(speech_labels) - 1
        while count <= last:
            final.append(speech_labels[count] + 0.01)
            count += 1
            if count >= last:
                final.append(length)
            else:
                final.append(speech_labels[count] - 0.01)
            count += 1

        count = 1
        while count < len(final):
            # 4 seconds is the minimum caption length; a sentence can't fit in less
            if final[count] - final[count - 1] < 4:
                del final[count]
                del final[count - 1]
            else:
                count += 2

        if len(final) != 0:
            with open(output_File, 'w') as fp:
                index = 0
                while index < len(final):
                    fp.write(
                        str(final[index]) + " " + str(final[index + 1]) + "\n")
                    index += 2
Example #11
def split_wavs(inputfile, basedir):
    v = VoiceActivityDetector(inputfile)
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    out_file_list = []
    
    begin = speech_labels[0]["speech_begin"]
    last_end = speech_labels[0]["speech_end"]
    valid_begin = begin
    valid_end = last_end
    num = len(speech_labels)
    i = 0
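    # walk the detected segments and cut the input into chunks of roughly 50-60 seconds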
    for labels in speech_labels[1:]:
        i += 1
        end = labels["speech_end"]
        if end - begin > 59:
            valid_end = last_end
            valid_begin = begin
            print valid_begin, valid_end
            outputfile = os.path.join(basedir, "out_"+str(i)+".wav")
            save_wav(valid_begin, valid_end, inputfile, outputfile)
            out_file_list.append(outputfile)
        elif end - begin > 50:
            valid_end = end
            valid_begin = begin 
            print valid_begin, valid_end
            outputfile = os.path.join(basedir, "out_"+str(i)+".wav")
            save_wav(valid_begin, valid_end, inputfile, outputfile)
            out_file_list.append(outputfile)

            if i < num: 
                begin = speech_labels[i+1]["speech_begin"]

    if valid_end < speech_labels[-1]["speech_end"]:
        valid_begin= begin
        valid_end = speech_labels[-1]["speech_end"]
        print begin , speech_labels[-1]["speech_end"]
        outputfile = os.path.join(basedir, "out_"+str(i)+".wav")
        save_wav(valid_begin, valid_end, inputfile, outputfile)
        out_file_list.append(outputfile)
    return out_file_list
Example #12
def main():
    fn_s = glob.glob(
        './wav/171219101741/*.wav')  # set the wave data directory.
    now = datetime.datetime.now().strftime('%y%m%d%H%M%S')
    save_path = os.path.join('./note_annotation', now)
    os.makedirs(save_path)
    for fn_full in fn_s:
        fn, ext = os.path.splitext(fn_full)
        v = VoiceActivityDetector(fn + ext)
        # dw: detected windows; sve: sum_voice_energy_s; pe: peak energy
        dw, sve, pe = v.detect_speech()
        # list of speech times derived from the detected window lists
        st = v.convert_windows_to_readible_labels(dw)
        data_passed_hilbert_sum = data2hilbert(v.data,
                                               v.rate)  # hilbert transform
        note_time = split_frame_hilbert_peak(dw, st, data_passed_hilbert_sum)
        # frame length of 0.1 s is used for the MFCC computation below
        df_note_time = pd.DataFrame(
            np.vstack(
                (note_time[0][:], note_time[0][:] + 0.1, note_time[2][:])).T,
            columns=["start_time", "end_time", "phrase_label"])
        df_note_time_sel = df_note_time[df_note_time.phrase_label > 0]
        df_note_time_sel.reset_index()
        df_note_time_sel.to_csv(save_path + '/' + fn.split('/')[-1] +
                                '_phrase_peak_time.csv')
        mfcc_s_str_100_wo = ''
        for i in range(len(df_note_time_sel.index)):
            mfcc_s_100 = get_mfcc_s(v.data,
                                    v.rate,
                                    df_note_time_sel['start_time'].iloc[i],
                                    df_note_time_sel['end_time'].iloc[i],
                                    frames=0.1,
                                    overlap=0.1)
            mfcc_s_str_100_wo += mfcc2str(mfcc_s_100) + '\n'
        with open(
                save_path + '/' + fn.split('/')[-1] +
                '_100msec_no_overlap_at_peak_loc.ghmm', 'w') as f_samples_100:
            f_samples_100.write(mfcc_s_str_100_wo)
        print('done for %s' % fn)
Example #13
def main():
    parser = argparse.ArgumentParser(
        description=
        'Analyze input wave-file and save detected speech interval to json file.'
    )
    parser.add_argument('inputfile',
                        metavar='INPUTWAVE',
                        help='the full path to input wave file')
    parser.add_argument(
        'outputfile',
        metavar='OUTPUTFILE',
        help=
        'the full path to output json file to save detected speech intervals')
    args = parser.parse_args()

    v = VoiceActivityDetector(args.inputfile)
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    print("")
    print("Saving Detected Vocal Ranges to disk")
    save_to_file(speech_labels, args.outputfile)
    print("Done")
Example #14
def main(wave_file, txt_output):
	
	wr = wave.open(wave_file, 'r')
	par = list(wr.getparams())
	par[3] = 0
	
	filtered_wav = wave_file[:-4]
	filtered_wav = filtered_wav+"_fltrd.wav"

	ww = wave.open(filtered_wav, 'w')
	ww.setparams(tuple(par))

	lowpass = 300
	highpass = 3400

	sz = wr.getframerate()
	c = int(wr.getnframes()/sz)
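	# frames are 1 second long (sz samples at sz Hz), so rfft bin k is k Hz and lowpass/highpass double as bin indices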
	for num in range(c):
		print('Processing {}/{} s'.format(num+1, c))
		da = np.fromstring(wr.readframes(sz), dtype=np.int16)
		left, right = da[0::2], da[1::2] # left and right channel
		lf, rf = np.fft.rfft(left), np.fft.rfft(right)
		lf[:lowpass], rf[:lowpass] = 0, 0 # cut everything below the 300 Hz band edge
		lf[55:66], rf[55:66] = 0, 0 # line noise
		lf[highpass:], rf[highpass:] = 0,0 # cut everything above the 3400 Hz band edge
		nl, nr = np.fft.irfft(lf), np.fft.irfft(rf)
		ns = np.column_stack((nl,nr)).ravel().astype(np.int16)
		ww.writeframes(ns.tostring())
	wr.close()
	ww.close()

	v = VoiceActivityDetector(filtered_wav)
	with open(txt_output, "w") as open_file:
		array = v.detect_speech()
		open_file.write(str(array))
	open_file.close()
Example #15
def recognize_speech(audiofile):
    from vad import VoiceActivityDetector
    v = VoiceActivityDetector(audiofile)
    detected = v.detect_speech()
    # detected2 = v.convert_windows_to_readible_labels(detected)
    return detected  # returns array of window numbers and speech flags (1 - speech, 0 - nonspeech)
Example #16
def main(wave_file):
    v = VoiceActivityDetector(wave_file)
    array = v.detect_speech()
    print array.tolist()
Example #17
from vad import VoiceActivityDetector
import argparse
import json

def save_to_file(data, filename):
    with open(filename, 'w') as fp:
        json.dump(data, fp)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Analyze input wave-file and save detected speech interval to json file.')
    # parser.add_argument('inputfile', metavar='INPUTWAVE',
    #                    help='the full path to input wave file')
    parser.add_argument('outputfile', metavar='OUTPUTFILE',
                        help='the full path to output json file to save detected speech intervals')
    args = parser.parse_args()

    v = VoiceActivityDetector('rec_unlimited_7iwka57k.wav')
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)

    save_to_file(speech_labels, args.outputfile)
Example #18
def save_to_file(data, filename):
    with open(filename, 'w') as fp:
        json.dump(data, fp)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=
        'Analyze input wave-file and save detected speech interval to json file.'
    )
    # parser.add_argument('inputfile', metavar='INPUTWAVE',
    #                    help='the full path to input wave file')
    parser.add_argument(
        'outputfile',
        metavar='OUTPUTFILE',
        help=
        'the full path to output json file to save detected speech intervals')
    args = parser.parse_args()

    # Create a speech detection object and read in the audio data
    v = VoiceActivityDetector('FILE_NAME.wav')

    # Detect speech in the given data
    raw_detection = v.detect_speech()

    # Convert the detected speech signals to a readable format
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)

    # Save the converted and detected speech to a file
    save_to_file(speech_labels, args.outputfile)
Example #19
import argparse
import json


def save_to_file(data, filename):
    with open(filename, 'w') as fp:
        json.dump(data, fp)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=
        'Analyze input wave-file and save detected speech interval to json file.'
    )
    parser.add_argument('inputfile',
                        metavar='INPUTWAVE',
                        help='the full path to input wave file')
    parser.add_argument(
        'outputfile',
        metavar='OUTPUTFILE',
        help=
        'the full path to output json file to save detected speech intervals')
    args = parser.parse_args()

    v = VoiceActivityDetector(args.inputfile)
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    print(speech_labels)

    save_to_file(speech_labels, args.outputfile)
Example #20
def includes_speech(file):
    v = VoiceActivityDetector(file)
    raw_detection = v.detect_speech()
    return v.includes_speech(raw_detection, thresh=3)
Example #21
EAnalysis = load_model("../Emotion_analysis")

sad = np.array([["Image", "Emotion"]], dtype=object)
predit = np.array(
    [['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']],
    dtype=object)
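# run the emotion classifier on every saved face crop and collect the predictions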
for file in os.listdir(he.FACEDIR):
    face = os.path.join(he.FACEDIR, file)
    img = cv2.imread(face)
    lastimg = cv2.resize(img, (48, 48))
    gray_image = lastimg / 255
    out = gray_image[np.newaxis, :, :]
    predictions = EAnalysis.predict(out)
    emotion = he.getEmotion(predictions)
    sad = np.append(sad, [[face, emotion]], axis=0)
    predit = np.append(predit, predictions, axis=0)

f = open('emotion.dat', 'ab')
np.savetxt(f, sad, fmt='%s')
f.close()

x = open('prediction.dat', 'ab')
np.savetxt(x, predit, fmt='%s')
x.close()

v = VoiceActivityDetector("audio.wav")
raw_detection = v.detect_speech()
speech_labels = v.convert_windows_to_readible_labels(raw_detection)
he.save_to_file(speech_labels, "vad.json")
Example #22
def main():
    datadir = './dataset/train/audio/'
    labels = {
        'yes': 0,
        'no': 1,
        'up': 2,
        'down': 3,
        'left': 4,
        'right': 5,
        'on': 6,
        'off': 7,
        'go': 8,
        'stop': 9
    }
    labels_list = np.array([
        'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'go', 'stop'
    ])
    train_in = []
    train_out = []
    test_in = []
    test_out = []
    removal_thr = 0

    vadcount = 0
    for word, label in labels.items():
        print(word)
        dirpath = datadir + word
        files = os.listdir(dirpath)
        np.random.shuffle(files)

        for i, fn in enumerate(files):

            filepath = os.path.join(dirpath, fn)
            sample_rate, samples = wavfile.read(filepath)
            #print(filepath)
            # run voice activity detector
            v = VoiceActivityDetector(filepath)
            detected_windows = v.detect_speech()
            speechtime = v.convert_windows_to_readible_labels(detected_windows)

            silence_free = np.array([])
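            # keep only the samples that fall inside detected speech segments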
            if len(speechtime) != 0:
                for segment in speechtime:
                    n0 = int(segment['speech_begin'] * sample_rate)
                    n1 = int(segment['speech_end'] * sample_rate)
                    silence_free = np.append(silence_free, samples[n0:n1])
                #print(len(silence_free))
            # !!! IMPORTANT: check the quality of silence removal
            # bad ones will crash the spectrogram step
            if len(silence_free) >= 3200:
                vadcount += 1
                freqs, times, spectrogram = log_specgram(silence_free,
                                                         sample_rate,
                                                         window_size=10,
                                                         step_size=5)
            else:
                freqs, times, spectrogram = log_specgram(samples,
                                                         sample_rate,
                                                         window_size=10,
                                                         step_size=5)

            #plt.pcolormesh(freqs, times, spectrogram, cmap='coolwarm')
            #plt.show()
            # average spectrogram shape is (97.3, 161); note that x and y are swapped in cv2.resize below
            spectrogram = cv2.resize(spectrogram,
                                     dsize=(161, 100),
                                     interpolation=cv2.INTER_CUBIC)
            spectrogram = spectrogram.reshape(100, 161, 1)

            if i < len(files) / 5:
                test_in.append(spectrogram)
                test_out.append(one_hot_encode(label))
            else:
                train_in.append(spectrogram)
                train_out.append(one_hot_encode(label))
    train_in = np.array(train_in, dtype=np.float32)
    train_out = np.array(train_out, dtype=np.int32)
    test_in = np.array(test_in, dtype=np.float32)
    test_out = np.array(test_out, dtype=np.int32)
    print(train_in.shape)
    print(train_out.shape)
    print(test_in.shape)
    print(test_out.shape)
    print(vadcount, ' audios were processed with VAD.')

    save_filename = './mini_speech_data2.npz'
    np.savez(save_filename,
             train_in=train_in,
             train_out=train_out,
             test_in=test_in,
             test_out=test_out,
             labels=labels_list)
Example #23
from pydub import AudioSegment
from vad import VoiceActivityDetector

sound = AudioSegment.from_wav('Test1-master/Test/mic1.wav')
slicedSounds = sound[::500]
slicedSounds = list(slicedSounds)
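# scan the 500 ms slices and export the first one that contains detected speech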

for i in range(0, len(slicedSounds)):
    slicedSounds[i].export('trash.wav', format='wav')
    obj = VoiceActivityDetector('trash.wav')
    data = obj.detect_speech()
    Found = False
    for j in range(0, len(data)):
        if data[j][len(data[j]) - 1] == 1:
            speech = AudioSegment.from_wav('trash.wav')
            speech.export('speech.wav', format='wav')
            Found = True
    if Found:
        break
Example #24
child_puzzle_wav = FLAGS.child_puzzle_wav
mom_puzzle_wav = FLAGS.mom_puzzle_wav
mom_puzzle_textgrid = FLAGS.mom_puzzle_textgrid
child_outfile_textgrid = FLAGS.child_outfile_textgrid
add_seconds_at_boundary = FLAGS.add_seconds_at_boundary
child_segment_wav_outdir = FLAGS.child_segment_wav_outdir
mom_segment_wav_outdir = FLAGS.mom_segment_wav_outdir

if not os.path.exists(child_segment_wav_outdir):
    os.makedirs(child_segment_wav_outdir)

if not os.path.exists(mom_segment_wav_outdir):
    os.makedirs(mom_segment_wav_outdir)

# detects child speech parts
v = VoiceActivityDetector(child_puzzle_wav)
data = v.data
total_time = len(data) * 1.0 / v.rate
total_time = float("{0:.2f}".format(total_time))
speech_time, mom_tier = child_speech_detector(mom_puzzle_textgrid, v)

# export detected child speech segments wav
turns = export_child_audio_segments(total_time, child_puzzle_wav,
                                    add_seconds_at_boundary,
                                    child_segment_wav_outdir, speech_time)
total_turns = turns
tier = write_to_txtgrids('Machine-Label-CS', turns)

# modify manually annotated mom speech segments, and export the wav segments
mom_turns = export_mom_audio_segments(mom_puzzle_wav, mom_tier,
                                      mom_segment_wav_outdir)
Example #25
from vad import VoiceActivityDetector
import argparse
import json

def save_to_file(data, filename):
    with open(filename, 'w') as fp:
        json.dump(data, fp)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Analyze input wave-file and save detected speech interval to json file.')
    parser.add_argument('inputfile', metavar='INPUTWAVE',
                        help='the full path to input wave file')
    parser.add_argument('outputfile', metavar='OUTPUTFILE',
                        help='the full path to output json file to save detected speech intervals')
    args = parser.parse_args()
    
    v = VoiceActivityDetector(args.inputfile)
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    
    save_to_file(speech_labels, args.outputfile)
Example #26
    x = AudioSegment.from_wav(fullFileName)
    x = x.set_channels(1)
    x.export(FILE_FOLDER + FILE_NAME + '-mono.wav', format="wav")
    waveFile = wave.open(FILE_FOLDER + FILE_NAME + '-mono.wav', 'r')

# Getting some variables from file
signal = waveFile.readframes(-1)
signal = np.fromstring(signal, 'Int16')
frameRate = waveFile.getframerate()
Time = np.linspace(0, len(signal) / frameRate, num=len(signal))
totalFrames = waveFile.getnframes()
wavFileLengthSec = totalFrames / frameRate

#  VOICE ACTIVITY DETECTION (VAD)

vad = VoiceActivityDetector(fullFileName)
raw_detection = vad.detect_speech()
speech_labels, starters, enders = vad.convert_windows_to_readible_labels(
    raw_detection)

# list of beginnings of voice detected segments (in frames)
starterFrames = [int(i) for i in starters]
# list of endings of voice detected segments (in frames)
enderFrames = [int(i) for i in enders]

# if the last voice detected segment isn't closed, use the last frame as end of this segment
if len(starterFrames) > len(enderFrames):
    enderFrames.append(totalFrames)

# list of beginnings of voice detected segments (in milliseconds)
starterMs = [int((i / frameRate) * 1000) for i in starterFrames]
Example #27
    def run(self):
        #global my_mutex
        global numRecordings
        global audioNum
        global processNum

        megaArray = []
        currDay = 1
        fileChar = "A"
        wavFile = ""
        #while processNum < numRecordings:
        while goingFlag == True or processNum < audioNum:
            while processNum >= audioNum:
                y = 0
            #print processNum
            try:
                if processNum > 0:
                    subprocess.check_output(['rm', wavFile])
                #jsonFile = "/home/pi/results" + `processNum` + ".json"
                wavFile = dir_path + "/talking" + ` processNum ` + ".wav"
                print "Processing " + wavFile
                #foFile = "fo" + `processNum` + ".fo"
                #pmFile = "pm" + `processNum` + ".pm"
                timeEnd = q.get()
                tempDay = currDay
                currDay = qSoundJson.get()
                if tempDay != currDay:
                    megaArray = []
                soundName = dir_path + "/data/day" + str(
                    currDay) + "/" + fileChar + "/sound" + str(
                        fileNum) + ".json"
                if fileChar == "A":
                    fileChar = "B"
                else:
                    fileChar = "A"
                print timeEnd
                #print subprocess.check_output(['python', '/home/pi/detectVoiceInWav.py', wavFile, jsonFile, str(timeEnd)])
                v = VoiceActivityDetector(wavFile)
                ampData = v._calculate_amplitude(v.data)
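                # if the 75th-percentile amplitude is low, skip VAD and record an empty result with summary stats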

                if np.percentile(ampData, 75) < 200:
                    print "AUTO"
                    speech_labels = []
                    speech_dict = {}
                    speech_dict['percent_time_w_speech'] = 0.0
                    speech_dict['time_started'] = round(timeEnd, 3)
                    speech_dict['length_of_recording'] = round(
                        lengthRecording, 3)
                    speech_dict['mean_amp'] = round(
                        float(float(sum(ampData)) / len(ampData)), 3)
                    speech_dict['med_amp'] = round(np.median(ampData), 3)
                    speech_dict['25_amp'] = round(np.percentile(ampData, 25),
                                                  3)
                    speech_dict['75_amp'] = round(np.percentile(ampData, 75),
                                                  3)
                    speech_dict['max_amp'] = round(np.amax(ampData), 3)
                    speech_dict['processed'] = 0
                    speech_labels.append(speech_dict)
                    print "AUTO DONE"
                else:
                    print "MANUAL"
                    raw_detection = v.detect_speech()
                    #print raw_detection
                    speech_labels = v.convert_windows_to_readible_labels(
                        raw_detection, str(timeEnd))
                    print "MANUAL DONE"
                megaArray.append(speech_labels)
                save_to_file(megaArray, soundName)
                #print subprocess.check_output(['./reaper/REAPER/build/reaper', '-i', wavFile, '-f', foFile, '-p', pmFile, '-a'])
                processNum = processNum + 1

            except subprocess.CalledProcessError as e:
                logging.exception("message")

        subprocess.check_output(['rm', wavFile])
        sense.clear()
        with open("processWav.txt", "w") as text_file:
            text_file.write("Total wavs processed: {}".format(str(processNum)))
        with open("createWav.txt", "w") as t2:
            t2.write("Total wavs supposed to be processed: {}".format(
                str(audioNum)))
        currTime = time.time()
        currTime = currTime - timeStart
        with open("timeTaken.txt", "w") as t3:
            t3.write("Time taken to process wavs: {}".format(str(currTime)))
Example #28
    i = 0
    total_speech = 0
    try:
        while (True):
            speech_labels = {}
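            # cycle i through 0-4 so recordings rotate over five wav files (file0.wav .. file4.wav)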

            if (i < 4):

                i = i + 1

            else:
                i = 0

            wav_name = 'file' + str(i) + '.wav'
            create_wav(wav_name)
            v = VoiceActivityDetector(wav_name)
            raw_detection = v.detect_speech()
            speech_labels, speech_in_wav = v.convert_windows_to_readible_labels(
                raw_detection)
            if (speech_in_wav > 0.5):
                print('we have speech in ' + wav_name)

            else:
                print('we dont have speech in ' + wav_name)
            output_txt = 'testingreal' + str(i) + '.txt'
            total_speech = total_speech + speech_in_wav
            save_to_file(speech_labels, output_txt)

    except KeyboardInterrupt:
        print('Total time since program started:' + str(total_speech))
        print("Done")
Example #29
from vad import VoiceActivityDetector

filename = '/home/leferrae/Desktop/These/Kunwok/tour_nabagardi/20190723_085214.wav'
v = VoiceActivityDetector(filename)
v.plot_detected_speech_regions()
Example #30
from vad import VoiceActivityDetector
# import json

# Input file
audiofile = "abc.wav"

plotaudiosignal = VoiceActivityDetector(audiofile)

# Plot the audio signal so we can see where there is silence in the audio

plotaudiosignal.plot_detected_speech_regions()

#
# def save_to_file(data, file):
#     with open(file, 'w') as fp:
#         json.dump(data, fp)
#
#
#
# v = VoiceActivityDetector(filename)
# raw_detection = v.detect_speech()
# print(raw_detection)
# speech_labels = v.convert_windows_to_readible_labels(raw_detection)
#
# save_to_file(speech_labels, "bala.json")
Example #31
File: vad2.py  Project: truered8/vadar
from vad import VoiceActivityDetector
import sys

v = VoiceActivityDetector(sys.argv[1])
raw_detection = v.detect_speech()
speech_labels = v.convert_windows_to_readible_labels(raw_detection)
print(speech_labels)