def shortTermAnalyses(sound_type, filename, patient_name):
    fs, signal = wavfile.read(filename)
    window.refresh()
    if sound_type == 'speech':
        # Remove silent regions, then stitch the voiced segments back together.
        # Note: a 0.1 s offset is applied to each segment boundary.
        s = audioSegmentation.silence_removal(signal, fs, 0.5, 0.1, weight=0.2)
        signal2 = np.concatenate([signal[int((i[0] + 0.1) * fs):int((i[1] + 0.1) * fs)]
                                  for i in s])
        wavfile.write("database/{0}/speechFileSegmented.wav".format(patient_name),
                      fs, signal2)
        # Short-term features per channel (first 8 feature rows only).
        s1 = ShortTermFeatures.feature_extraction(signal[:, 0], fs, 0.05 * fs,
                                                  0.025 * fs, deltas=True)[0][:8]
        window.refresh()
        s2 = ShortTermFeatures.feature_extraction(signal[:, 1], fs, 0.05 * fs,
                                                  0.025 * fs, deltas=True)[0][:8]
        window.refresh()
        # Repeat for the second recording ("...1.wav").
        filename = filename[:-4] + "1.wav"
        fs, signal = wavfile.read(filename)
        s = audioSegmentation.silence_removal(signal, fs, 0.5, 0.1, weight=0.2)
        signal2 = np.concatenate([signal[int((i[0] + 0.1) * fs):int((i[1] + 0.1) * fs)]
                                  for i in s])
        wavfile.write("database/{0}/speechFileSegmented1.wav".format(patient_name),
                      fs, signal2)
        s3 = ShortTermFeatures.feature_extraction(signal[:, 0], fs, 0.05 * fs,
                                                  0.025 * fs, deltas=True)[0][:8]
        window.refresh()
        s4 = ShortTermFeatures.feature_extraction(signal[:, 1], fs, 0.05 * fs,
                                                  0.025 * fs, deltas=True)[0][:8]
        window.refresh()
        # Average the four feature matrices, truncated to a common shape.
        n = min(s1.shape[0], s2.shape[0], s3.shape[0], s4.shape[0])
        m = min(s1.shape[1], s2.shape[1], s3.shape[1], s4.shape[1])
        return (s1[:n, :m] + s2[:n, :m] + s3[:n, :m] + s4[:n, :m]) / 4
    else:
        return ShortTermFeatures.feature_extraction(signal, fs, 0.05 * fs,
                                                    0.025 * fs, deltas=True)[0][:8]
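
# --- Usage sketch (added; not part of the original snippet) -----------------
# Demonstrates the segment-and-concatenate pattern used above on a mono WAV.
# "speech.wav" and the output path are placeholders.
import numpy as np
from scipy.io import wavfile
from pyAudioAnalysis import audioSegmentation

fs, signal = wavfile.read("speech.wav")
segments = audioSegmentation.silence_removal(signal, fs, 0.5, 0.1, weight=0.2)
voiced = np.concatenate([signal[int(t0 * fs):int(t1 * fs)] for t0, t1 in segments])
wavfile.write("speech_voiced_only.wav", fs, voiced)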
def silence_detection(speech):
    '''
    Returns a list of lists containing the start and end time stamps of
    speech segments, e.g. [[0.05, 0.07], [0.09, 1.12]] --> speech exists in
    the intervals 0.05-0.07 and 0.09-1.12.
    Uses pyAudioAnalysis' semi-supervised silence removal (an SVM trained on
    low/high-energy frames) to determine speech activity.

    Parameters:
        speech (array): audio samples, e.g. data returned by
            bob.io.audio.reader
    Returns:
        intervals (list): list of lists containing intervals where speech
            segments exist, in sample indices
    '''
    config = configuration()['speech_params']
    intervals = aS.silence_removal(speech,
                                   config['sample_rate'].get(),
                                   config['frame_length'].get(),
                                   config['frame_overlap'].get(),
                                   smooth_window=config['smooth_window'].get(),
                                   weight=config['sil_rem_wt'].get(),
                                   plot=config['sil_rem_plt'].get())
    # Convert interval boundaries from seconds to sample indices.
    for i in range(len(intervals)):
        intervals[i] = [int(stamp * config['sample_rate'].get())
                        for stamp in intervals[i]]
    return intervals
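
# --- Worked example (added) --------------------------------------------------
# Shows the seconds -> sample-index conversion the function performs; the
# sample rate and interval values are illustrative only.
sr = 16000
intervals_sec = [[0.05, 0.07], [0.09, 1.12]]
intervals_smp = [[int(t * sr) for t in iv] for iv in intervals_sec]
print(intervals_smp)  # [[800, 1120], [1440, 17920]]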
def silenceRemovalWrapper(inputFile, outFolder, smoothingWindow, weight):
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")
    [fs, x] = audioBasicIO.read_audio_file(inputFile)
    segmentLimits = aS.silence_removal(x, fs, 0.05, 0.05, smoothingWindow,
                                       weight, True)
    # Use only the file's base name so the output lands inside outFolder
    # (the original concatenated the full input path into the output path).
    base = os.path.splitext(os.path.basename(inputFile))[0]
    for i, s in enumerate(segmentLimits):
        strOut = outFolder + "{0:s}_{1:.3f}-{2:.3f}.wav".format(base, s[0],
                                                                s[1])
        wavfile.write(strOut, fs, x[int(fs * s[0]):int(fs * s[1])])
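
# --- Usage sketch (added): paths are placeholders. --------------------------
# Writes one WAV file per detected non-silent segment into "segments/".
silenceRemovalWrapper("recording.wav", "segments/", smoothingWindow=1.0,
                      weight=0.3)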
def silence_detection(utter):
    intervals = aS.silence_removal(utter, hp.data.sr, 0.025, 0.010,
                                   smooth_window=1.0, weight=0.3, plot=False)
    for i in range(len(intervals)):
        intervals[i] = [int(stamp * hp.data.sr) for stamp in intervals[i]]
    return intervals
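
# --- Usage sketch (added): "hp" above comes from the surrounding project's
# hyperparameter module; a minimal stand-in is defined here for illustration,
# and "utterance.wav" is a placeholder path.
from types import SimpleNamespace
from scipy.io import wavfile

hp = SimpleNamespace(data=SimpleNamespace(sr=16000))
fs, utter = wavfile.read("utterance.wav")
print(silence_detection(utter))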
def extractAudio(input_file, output_dir, smoothing_window=1.0, weight=0.1):
    print("Detecting silences...")
    [fs, x] = read_audio_file(input_file)
    segmentLimits = silence_removal(x, fs, 0.05, 0.05, smoothing_window,
                                    weight)
    ifile_name = os.path.basename(input_file)
    os.makedirs(output_dir, exist_ok=True)
    files = []
    print("Writing segments...")
    for i, s in enumerate(segmentLimits):
        strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(ifile_name, s[0], s[1])
        strOut = os.path.join(output_dir, strOut)
        wavfile.write(strOut, fs, x[int(fs * s[0]):int(fs * s[1])])
        files.append(strOut)
    return files
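
# --- Usage sketch (added): paths are placeholders. --------------------------
# Returns the list of written segment files.
segment_files = extractAudio("interview.wav", "clips")
for f in segment_files:
    print(f)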
def segment(path, file):
    Fs, x = audioBasicIO.read_audio_file(path + file)
    segments = audioSegmentation.silence_removal(x, Fs, 0.020, 0.020,
                                                 smooth_window=1.0,
                                                 weight=0.3, plot=False)
    print(segments)
    X = x[0:0]  # empty array with the same dtype as x
    for i, seg in enumerate(segments):
        print(i)
        t0 = int(Fs * seg[0])
        t1 = int(Fs * seg[1])
        # wavfile.write(path + 'segments/' + str(i) + '.wav', Fs, x[t0:t1])
        X = np.concatenate((X, x[t0:t1]), axis=0)
    wavfile.write(path + 'seg_pyAudioAnalysis.wav', Fs, X)
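
# --- Usage sketch (added): paths are placeholders. --------------------------
# Concatenates all voiced parts of "match.wav" into "seg_pyAudioAnalysis.wav"
# inside the same directory.
segment("audio/", "match.wav")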
def main(argv):
    if argv[1] == "-shortTerm":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            F = ShortTermFeatures.feature_extraction(x, Fs, 0.050 * Fs,
                                                     0.050 * Fs)
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print("short-term feature extraction: "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-classifyFile":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            aT.file_classification("diarizationExample.wav", "svmSM", "svm")
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print("Mid-term feature extraction + classification \t "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-mtClassify":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            [flagsInd, classesAll, acc] = aS.mid_term_file_classification(
                "diarizationExample.wav", "svmSM", "svm", False, '')
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print("Fix-sized classification - segmentation \t "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-hmmSegmentation":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            aS.hmm_segmentation('diarizationExample.wav', 'hmmRadioSM',
                                False, '')
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print("HMM-based classification - segmentation \t "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-silenceRemoval":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            segments = aS.silence_removal(x, Fs, 0.050, 0.050,
                                          smooth_window=1.0, weight=0.3,
                                          plot=False)
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print("Silence removal \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-thumbnailing":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("scottish.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.time()
            # find thumbnail endpoints
            [A1, A2, B1, B2, Smatrix] = aS.music_thumbnailing(x1, Fs1, 1.0,
                                                              1.0, 15.0)
            t2 = time.time()
            perTime1 = duration1 / (t2 - t1)
            print("Thumbnail \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-diarization-noLDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.time()
            aS.speaker_diarization("diarizationExample.wav", 4, lda_dim=0,
                                   plot_res=False)
            t2 = time.time()
            perTime1 = duration1 / (t2 - t1)
            print("Diarization \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-diarization-LDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.time()
            aS.speaker_diarization("diarizationExample.wav", 4, plot_res=False)
            t2 = time.time()
            perTime1 = duration1 / (t2 - t1)
            print("Diarization \t {0:.1f} x realtime".format(perTime1))
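
# --- Usage sketch (added): "benchmark.py" is a placeholder name for whatever
# file holds main(); each flag times one pyAudioAnalysis operation against
# realtime, assuming nExp and the example WAV files are available, e.g.:
#
#   python benchmark.py -silenceRemoval
#   python benchmark.py -diarization-LDA
import sys

if __name__ == '__main__':
    main(sys.argv)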
def vadFolderWrapperMergedByTh(inputFolder, outFolder, smoothingWindow,
                               weight, model_name, threshold):
    if not os.path.isfile(model_name):
        print("fileClassification: input model_name not found!")
        return
    classifier, mean, std, classes, mid_window, mid_step, short_window, \
        short_step, compute_beat = aT.load_model(model_name)
    types = ('*.wav', '*.mp3')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(inputFolder + files))
    wavFilesList = sorted(wavFilesList)
    if len(wavFilesList) == 0:
        print("No WAV files found!")
        return
    for wavFile in wavFilesList:
        if not os.path.isfile(wavFile):
            raise Exception("Input audio file not found!")
        base = os.path.splitext(os.path.basename(wavFile))[0]
        folder = outFolder + base + '/'
        if not os.path.exists(folder):
            os.makedirs(folder)
        segfile = open(os.path.join(folder, 'segments'), 'w+')
        segfile2 = open(os.path.join(folder, 'segments_details'), 'w+')
        stack = deque()
        [fs, x] = audioBasicIO.read_audio_file(wavFile)
        segmentLimits = aS.silence_removal(x, fs, 0.05, 0.05, smoothingWindow,
                                           weight, False)
        merge = True
        for i, st in enumerate(segmentLimits):
            signal = audioBasicIO.stereo_to_mono(
                x[int(fs * st[0]):int(fs * st[1])])
            if fs == 0:
                continue  # audio file IO problem
            if signal.shape[0] / float(fs) < mid_window:
                mid_window = signal.shape[0] / float(fs)
            # feature extraction:
            mid_features, s, _ = aF.mid_feature_extraction(
                signal, fs, mid_window * fs, mid_step * fs,
                round(fs * short_window), round(fs * short_step))
            # long-term averaging of mid-term statistics
            mid_features = mid_features.mean(axis=1)
            if compute_beat:
                beat, beat_conf = aF.beat_extraction(s, short_step)
                mid_features = np.append(mid_features, beat)
                mid_features = np.append(mid_features, beat_conf)
            feature_vector = (mid_features - mean) / std  # normalization
            class_id = classifier.predict(feature_vector.reshape(1, -1))[0]
            label = classes[int(class_id)]
            if label == 'speech':
                if merge:
                    # Merge this segment with the previous one unless the
                    # merged span would exceed the threshold.
                    seg_prev = []
                    if len(stack) > 0:
                        seg_prev = stack.pop()
                    if len(seg_prev) > 0 and st[1] - seg_prev[0] > threshold:
                        stack.append(seg_prev)
                        stack.append([st[0], st[1], label])
                    elif len(seg_prev) > 0:
                        stack.append([seg_prev[0], st[1], label])
                    else:
                        stack.append([st[0], st[1], label])
                else:
                    stack.append([st[0], st[1], label])
                merge = True
            else:
                merge = False
        # Write out the merged speech segments and their metadata.
        for sn in stack:
            strName = base + "_" + "{:.3f}".format(sn[0]) + "_" + \
                "{:.3f}".format(sn[1])
            if sn[2] == 'speech':
                strOut = folder + strName + ".wav"
                wavfile.write(strOut, fs, x[int(fs * sn[0]):int(fs * sn[1])])
                segfile.write(strName + ' ' + base + ' ' +
                              "{:.3f}".format(sn[0]) + ' ' +
                              "{:.3f}".format(sn[1]) + "\n")
            segfile2.write(strName + ' ' + "{:.3f}".format(sn[0]) + ' ' +
                           "{:.3f}".format(sn[1]) + ' ' + sn[2] + "\n")
        segfile.close()
        segfile2.close()
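
# --- Usage sketch (added): all paths and the model name are placeholders;
# the model is assumed to be a pyAudioAnalysis classifier whose classes
# include 'speech'. Consecutive speech segments are merged up to 10 s spans.
vadFolderWrapperMergedByTh("recordings/", "vad_out/", smoothingWindow=1.0,
                           weight=0.3, model_name="svm_speech_music",
                           threshold=10.0)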
from pyAudioAnalysis import audioBasicIO as aIO
from pyAudioAnalysis import audioSegmentation as aS

audio_path = "../project_files/project_dataset/audio/soccer.wav"
[Fs, x] = aIO.read_audio_file(audio_path)
segments = aS.silence_removal(x, Fs, 0.020, 0.020, smooth_window=0.1,
                              weight=0.6, plot=False)
for i in range(len(segments)):
    print("[" + str(segments[i][0]) + "," + str(segments[i][1]) + "]")
    # Map segment boundaries from seconds to 30-second block indices.
    segments[i][0] = segments[i][0] // 30
    segments[i][1] = segments[i][1] // 30
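
# --- Worked example (added): floor-dividing by 30 maps a time stamp in
# seconds to the index of the 30-second block that contains it.
print(65.3 // 30, 91.7 // 30)  # 2.0 3.0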
def audio_based_feature_extraction(input_file, models_directory,
                                   raudio_features_discard=0,
                                   pyaudio_num_features="all", mode=0,
                                   pyaudio_params=None):
    """
    Export all features for a wav file (silence based + classifier based)
    :param input_file: the audio file
    :param models_directory: the directory which contains all trained
        classifiers (models' files + MEANS files)
    :param raudio_features_discard: number of leading silence/model-based
        features to drop
    :param pyaudio_num_features: "all" or the number of pyAudioAnalysis
        features to keep
    :param mode: 0 = silence + model-based features only, 1 = all features,
        2 = pyAudioAnalysis features only
    :param pyaudio_params: dict with mid/short window and step sizes (sec)
    :return: features, feature_names, metadata
    """
    # A. silence features
    fs, dur = get_wav_properties(input_file)
    fs, x = aio.read_audio_file(input_file)
    print(input_file)
    print(len(x) / fs)
    # get the silence estimates using the pyAudioAnalysis semi-supervised
    # approach for different windows and steps
    if dur < 6.2:
        seg_limits_short = [[0, dur]]
        seg_limits_long = [[0, dur]]
    else:
        seg_limits_short = aS.silence_removal(x, fs, 0.5, 0.25, 0.5)
        seg_limits_long = aS.silence_removal(x, fs, 1.0, 0.25, 0.5)
    # short windows
    silence_features_short, number_of_pauses_short, total_speech_short = \
        silence_features(seg_limits_short, dur)
    # long windows
    silence_features_long, number_of_pauses_long, total_speech_long = \
        silence_features(seg_limits_long, dur)

    features = []
    feature_names = []
    if mode < 2:
        # B. segment model-based features
        # Load classifiers:
        dictionaries = []
        for filename in os.listdir(models_directory):
            model_path = os.path.join(models_directory, filename)
            dictionary = predict_audio_labels(input_file, model_path)[0]
            dictionaries.append(dictionary)
        # list of features and feature names
        feature_names = [
            "Average silence duration short (sec)",
            "Average silence duration long (sec)",
            "Silence segments per minute short (segments/min)",
            "Silence segments per minute long (segments/min)",
            "Std short", "Std long",
            "Speech ratio short (sec)", "Speech ratio long (sec)",
            "Word rate in speech short (words/sec)",
            "Word rate in speech long (words/sec)"
        ]
        for i in range(len(silence_features_short)):
            features.append(silence_features_short[i])
            features.append(silence_features_long[i])
        for dictionary in dictionaries:
            for label in dictionary:
                feature_string = label + "(%)"
                feature_value = dictionary[label]
                feature_names.append(feature_string)
                features.append(feature_value)
    if raudio_features_discard != 0:
        features = features[raudio_features_discard:]
        feature_names = feature_names[raudio_features_discard:]
    # C. pyaudio features
    if mode > 0:
        (segment_features_stats, segment_features,
         pyaudio_feature_names) = aF.mid_feature_extraction(
            x, fs,
            round(pyaudio_params['mid_window'] * fs),
            round(pyaudio_params['mid_step'] * fs),
            round(fs * pyaudio_params['short_window']),
            round(fs * pyaudio_params['short_step']))
        pyaudio_list = list(segment_features_stats.mean(axis=1))
        if pyaudio_num_features != "all":
            # pyaudio_num_features = int(pyaudio_num_features)
            pyaudio_list = pyaudio_list[:pyaudio_num_features - 1]
            pyaudio_feature_names = \
                pyaudio_feature_names[:pyaudio_num_features - 1]
        features = features + pyaudio_list
        feature_names = feature_names + pyaudio_feature_names
    metadata = {
        "Number of pauses short": number_of_pauses_short,
        "Number of pauses long": number_of_pauses_long,
        "Total speech duration short (sec)": total_speech_short,
        "Total speech duration long (sec)": total_speech_long
    }
    return features, feature_names, metadata
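
# --- Usage sketch (added): paths and parameter values are placeholders; the
# pyaudio_params keys follow the dict accesses inside the function above.
params = {'mid_window': 3.0, 'mid_step': 1.0,
          'short_window': 0.05, 'short_step': 0.05}
feats, names, meta = audio_based_feature_extraction(
    "patient_01.wav", "models/", mode=1, pyaudio_params=params)
print(len(feats), meta)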