def segmentclassifyFileWrapper(inputWavFile, model_name, model_type):
    if not os.path.isfile(model_name):
        raise Exception("Input model_name not found!")
    if not os.path.isfile(inputWavFile):
        raise Exception("Input audio file not found!")
    gtFile = ""
    if inputWavFile[-4:] == ".wav":
        gtFile = inputWavFile.replace(".wav", ".segments")
    if inputWavFile[-4:] == ".mp3":
        gtFile = inputWavFile.replace(".mp3", ".segments")
    aS.mtFileClassification(inputWavFile, model_name, model_type, True, gtFile)
def find_music(audio_file):
    modelName = "pyAA/data/svmSM"
    [Fs, x] = aIO.readAudioFile(audio_file)
    duration = x.shape[0] / float(Fs)
    t1 = time.clock()
    flagsInd, classNames, acc, CMt = aS.mtFileClassification(
        audio_file, modelName, "svm", False, '')
    [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
     computeBEAT] = aT.loadSVModel(modelName)
    t2 = time.clock()
    perTime1 = duration / (t2 - t1)
    flags = [classNames[int(f)] for f in flagsInd]
    (segs, classes) = aS.flags2segs(flags, mtStep)
    i = 0  # len(classes)-1
    file_parts = []
    cbn = sox.Combiner()
    if len(classes) > 1:
        for c in classes:
            if c == 'music':
                start = segs[i][0]
                if i != 0:
                    start -= 0.5
                end = segs[i][1]
                if i != len(classes) - 1:
                    end += 2.5
                file_parts.append((int(start * 1000), int(end * 1000)))
            i += 1
    return file_parts
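# Hypothetical follow-up (not in the original source): find_music() returns
# (start_ms, end_ms) tuples but never uses the sox.Combiner it creates. A
# minimal sketch of exporting those ranges with pysox's Transformer, where
# "out_dir" and the output file naming are illustrative:
import os
import sox

def export_music_parts(audio_file, file_parts, out_dir="music_parts"):
    os.makedirs(out_dir, exist_ok=True)
    for k, (start_ms, end_ms) in enumerate(file_parts):
        tfm = sox.Transformer()
        tfm.trim(start_ms / 1000.0, end_ms / 1000.0)  # trim() takes seconds
        tfm.build(audio_file, os.path.join(out_dir, "part_{}.wav".format(k)))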
def func():
    res = [flagsInd, classesAll, acc] = aS.mtFileClassification(
        PATH_TO_WAV, PATH_TO_SVM, "svm", False, PATH_TO_SEGMENTS_FILE)
    print(res)
    segments = getSegments(res[0])
    print(segments)
    f = open(PATH_TO_DIR + "segments", "w")
    f.write(str(segments))
    f.close()
    cutSegments(segments)
def aud_classify(direc, wav_file, model):  # test on one file
    global j, l
    print("classifying " + direc + wav_file)
    [flagsInd, classesAll, acc, CM] = aS.mtFileClassification(
        direc + wav_file, "models/" + model, "svm", True)
    print(classesAll)
    j = flagsInd
    l = classesAll
    return j, l, model
def aud_classify(direc, wav_file):  # test on one file
    global j
    global l
    print("classifying " + direc + wav_file)
    [flagsInd, classesAll, acc, CM] = aS.mtFileClassification(
        direc + wav_file, "svm_amlo_v1", "svm", True)
    print(classesAll)
    j = flagsInd
    l = classesAll
    return j, l
def seg(filename, rec_type):
    [flagsInd, classesAll, acc, CM] = aS.mtFileClassification(
        'audiofiles/' + filename, "./" + rec_type + 'MusicGenre2',
        rec_type, False)
    flags = array2list(flagsInd)
    audio_step0, _ = audiofilter(flags)
    with open(os.path.join('seg_result/', filename + '.seg'), 'w') as f:
        for s in audio_step0:
            f.write(str(s) + '\n')
    seg_points = read_seg_points(audio_step0, 8)
    #seg_points = read_segment_points(audio_step0, 8)
    audio_segment('audiofiles/' + filename, seg_points)
    return audio_step0, len(flagsInd)
def find_applause(inputfile, outputfile, to_csv, plot, default_speaker,
                  buffer_secs, script_path):
    wav_source = True
    if inputfile.lower()[-4:] != '.wav':  # Creates a temporary WAV
        wav_source = False                # if input is MP3
        temp_filename = inputfile.split('/')[-1] + '_temp.wav'
        wav_path = '/var/tmp/' + temp_filename  # Pathname for temp WAV
        # '-y' option overwrites existing file if present
        subprocess.call(['ffmpeg', '-y', '-i', inputfile, wav_path])
    else:
        wav_path = inputfile
    classifier_model_path = os.path.join(script_path, 'data/svm_applause_model')
    output, classesAll, acc, CM = aS.mtFileClassification(
        wav_path, classifier_model_path, "svm")
    output = list(output)
    applause_secs = []
    for i, x in enumerate(output):
        if float(x) == 1.0:
            applause_secs.append(i)
    applause_ranges = seconds_list_to_ranges(applause_secs)
    if plot and len(applause_ranges) > 0:
        import matplotlib.pyplot as plt
        import pandas as pd
        print(applause_ranges)
        print('\n')
        pd.Series(output).plot()
        plt.title(inputfile.split('/')[-1])
        plt.xlabel('Seconds')
        plt.ylabel('Applause Classification')
        plt.show()
    if not wav_source:
        os.remove(wav_path)
    if to_csv:
        if outputfile == '':
            outputfile = inputfile[:-4] + '_applause.csv'
        if default_speaker == '':
            with open(outputfile, 'w') as csv_fo:
                applause_ranges_expanded = [
                    (start + buffer_secs, 0, duration - buffer_secs)
                    for start, duration in applause_ranges]
                csv_writer = csv.writer(csv_fo)
                csv_writer.writerows(applause_ranges_expanded)
        else:
            with open(outputfile, 'w') as csv_fo:
                # was the string '0.0', which breaks the str/int comparison below
                prev_end = 0.0
                csv_writer = csv.writer(csv_fo)
                for start, duration in applause_ranges:
                    if float(start) - float(prev_end) - (float(buffer_secs) * 2) > 0.0:
                        csv_writer.writerow([
                            float(prev_end) + buffer_secs, 1,
                            float(start) - float(prev_end) - (float(buffer_secs) * 2),
                            default_speaker.replace(',', ';')])
                    if float(duration) - buffer_secs > 0:
                        csv_writer.writerow([
                            start + buffer_secs, 0,
                            float(duration) - buffer_secs, 'Applause'])
                    prev_end = start + duration
                if prev_end < len(output):
                    # "-1" is a kluge to make sure the final tag doesn't
                    # exceed the length of the audio file
                    if float(len(output) - prev_end) - buffer_secs - 1 > 0.0:
                        csv_writer.writerow([
                            float(prev_end) + buffer_secs, 1,
                            float(len(output) - prev_end) - buffer_secs - 1,
                            default_speaker.replace(',', ';')])
def classify_file(audiofile, model, model_type, model_color):
    # sets duration of audiofile, for getting the timestamps of each classification
    with contextlib.closing(wave.open(audiofile, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / float(rate)
    print(duration)  # only used for console output, can be removed to speed up runtime

    # function will throw error because of true flag at end,
    # console log is still displayed in spite of
    #try:
    #    aS.mtFileClassification(audiofile, model, "svm", True)
    #except TypeError:
    #    print("TypeError")

    # pulls all the data given from the classification function
    [flagsInd, classesAll, acc, CM] = aS.mtFileClassification(audiofile, model, "svm")
    # print(flagsInd)
    # print(classesAll)
    # print(acc)
    # print(CM)

    flag_len = len(flagsInd)       # amount of segments made
    segment = duration / flag_len  # length of each time segment

    # dictionary to be built of timestamps and categories
    classify_dict = {'name': model_type, 'color': model_color, 'data': []}
    classify_dict['data'].append({"category": "NO", "time": 0})
    for index in range(flag_len):
        timestamp = segment * index + 1  # current timestamp
        # builds dictionary
        classify_dict['data'].append({
            "category": classesAll[int(flagsInd[index])],
            "time": timestamp
        })
        # used for console logging
        # print(str("{ category: '" + classesAll[int(flagsInd[index])]) + "', time: " + str(timestamp) + " },")
    return classify_dict
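# Hypothetical usage (not in the original source): classify_dict is shaped
# for a charting front end, so a natural next step is serializing it. The
# file name and model arguments below are illustrative only:
import json

result = classify_file("input.wav", "data/svmSM", "speech/music", "#3366cc")
with open("speech_music.json", "w") as fo:
    json.dump(result, fo, indent=2)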
def audio_segmentation(wav_file_path, classifier_type, classifier_path):
    """
    Main function: takes a .wav file, and a scikit-learn classifier as
    arguments. Returns a partition of the sound track into segments of
    silence, of speech or of music.
    Example: audio_segmentation(path_to_wav_file, 'svm', path_to_svm_pickle_object)
    See pyAudioAnalysis's code and documentation for the list of possible
    classifiers. For this to work, the 'classifier' and 'classifier.arff'
    files must be in the same directory, as
    ('classifier' + 'classifier.arff' = trained_classifier)
    """
    sound_segments = aS.mtFileClassification(wav_file_path, classifier_path,
                                             classifier_type,
                                             return_for_user=True)
    segments = sound_segments['segments']
    classes = sound_segments['classes']
    silences = detect_silences(wav_file_path, 2, 1)
    final_segmentation = incorporate_silences_to_segments(
        segments, classes, silences)
    return final_segmentation
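# The helper detect_silences() is not shown above. One possible
# implementation (an assumption, not the original author's code) builds on
# aS.silenceRemoval, which returns [start, end] pairs of NON-silent audio;
# silences are then the gaps between those pairs. How the original maps its
# numeric arguments (2, 1) onto silenceRemoval's parameters is a guess:
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioSegmentation as aS

def detect_silences(wav_file_path, smooth_window=2, weight_x10=1):
    [Fs, x] = audioBasicIO.readAudioFile(wav_file_path)
    x = audioBasicIO.stereo2mono(x)
    voiced = aS.silenceRemoval(x, Fs, 0.050, 0.050,
                               smoothWindow=smooth_window,
                               Weight=weight_x10 / 10.0, plot=False)
    duration = x.shape[0] / float(Fs)
    silences = []
    prev_end = 0.0
    for start, end in voiced:
        if start > prev_end:
            silences.append((prev_end, start))
        prev_end = end
    if prev_end < duration:
        silences.append((prev_end, duration))
    return silences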
def vad(wav_in, wav_out):
    results, a, b, c = aS.mtFileClassification(
        wav_in, '/home/seni/git/pyAudioAnalysis/data/svmSM', 'svm', False)
    results = [(val + 1) % 2 for val in results]  # flipping 1 and 0s
    fs, data = wav.read(wav_in)
    num_segments = len(results)
    padded_results = [1, 1]
    padded_results.extend(results)
    padded_results.extend([1, 1])
    speech_data = []  # hopefully
    for i in range(2, num_segments + 2):
        #print(padded_results[i])
        segment = data[(i - 2) * fs:(i - 1) * fs]  # -2
        if sum(padded_results[i - 2:i + 3]) == 0:
            continue
        else:
            speech_data.extend(segment)
    speech_data = np.array(speech_data)
    wav.write(wav_out, fs, speech_data)
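# Hypothetical usage (not in the original source). The function assumes the
# svmSM model's 0/1 class ordering, a 1-second mid-term step (hence the
# fs-sized slices), and these imports at module level:
import numpy as np
import scipy.io.wavfile as wav
from pyAudioAnalysis import audioSegmentation as aS

vad("meeting.wav", "meeting_speech_only.wav")  # file names are illustrative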
def getMusicSegmentsFromFile(inputFile):
    modelType = "svm"
    modelName = "data/svmMovies8classes"
    dirOutput = inputFile[0:-4] + "_musicSegments"

    if os.path.exists(dirOutput) and dirOutput != ".":
        shutil.rmtree(dirOutput)
    os.makedirs(dirOutput)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)

    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         compute_beat] = aT.load_model(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         compute_beat] = aT.load_model_knn(modelName)

    flagsInd, classNames, acc, CM = aS.mtFileClassification(
        inputFile, modelName, modelType, plotResults=False, gtFile="")
    segs, classes = aS.flags2segs(flagsInd, mtStep)
    for i, s in enumerate(segs):
        # minDuration is a module-level constant (minimum segment length, seconds)
        if (classNames[int(classes[i])] == "Music") and (s[1] - s[0] >= minDuration):
            strOut = "{0:s}{1:.3f}-{2:.3f}.wav".format(dirOutput + os.sep, s[0], s[1])
            wavfile.write(strOut, Fs, x[int(Fs * s[0]):int(Fs * s[1])])
def classify(self):
    if self.algo == "ina":
        self.seg = Segmenter()
    counter = 0
    for audioPath in self.media:
        startTime = int(round(time.time()))
        vid = audioPath.split("/")[-1]
        print("### {}/{} Processing {} ###".format(counter, len(self.media), vid))
        if self.algo == "ina":
            tmp = self.seg(audioPath)
            tmp2 = str(tmp)
            self.segmentation.append(tmp)
            if ("Male" in tmp2 or "Female" in tmp2) and "Music" in tmp2:
                self.results.append("Mixed")
            elif "Music" in tmp2:
                self.results.append("Music")
            elif "Male" in tmp2 or "Female" in tmp2:
                self.results.append("Speech")
        elif self.algo == "paa":
            [flagsInd, classesAll, acc, CM] = aS.mtFileClassification(
                audioPath, "svmSM/svmSM", "svm", False, '')
            res = np.array(flagsInd).mean()
            if res <= 0.1:
                self.results.append("Speech")
            elif res >= 0.9:
                self.results.append("Music")
            else:
                self.results.append("Mixed")
        endTime = int(round(time.time()))
        self.times.append(endTime - startTime)
        counter += 1
def main(argv): if argv[1]=="--file": getMusicSegmentsFromFile(argv[2]) classifyFolderWrapper(argv[2][0:-4] + "_musicSegments", "svm", "data/svmMusicGenre8", True) elif argv[1]=="--dir": analyzeDir(argv[2]) elif argv[1]=="--sim": csvFile = argv[2] f = [] fileNames = [] with open(csvFile, 'rb') as csvfile: spamreader = csv.reader(csvfile, delimiter='\t', quotechar='|') for j,row in enumerate(spamreader): if j>0: ftemp = [] for i in range(1,9): ftemp.append(float(row[i])) f.append(ftemp) R = row[0] II = R.find(".wav"); fileNames.append(row[0][0:II]) f = numpy.array(f) Sim = numpy.zeros((f.shape[0], f.shape[0])) for i in range(f.shape[0]): for j in range(f.shape[0]): Sim[i,j] = scipy.spatial.distance.cdist(numpy.reshape(f[i,:], (f.shape[1],1)).T, numpy.reshape(f[j,:], (f.shape[1],1)).T, 'cosine') Sim1 = numpy.reshape(Sim, (Sim.shape[0]*Sim.shape[1], 1)) plt.hist(Sim1) plt.show() fo = open(csvFile + "_simMatrix", "wb") cPickle.dump(fileNames, fo, protocol = cPickle.HIGHEST_PROTOCOL) cPickle.dump(f, fo, protocol = cPickle.HIGHEST_PROTOCOL) cPickle.dump(Sim, fo, protocol = cPickle.HIGHEST_PROTOCOL) fo.close() elif argv[1]=="--loadsim": try: fo = open(argv[2], "rb") except IOError: print( "didn't find file") return try: fileNames = cPickle.load(fo) f = cPickle.load(fo) Sim = cPickle.load(fo) except: fo.close() fo.close() print(fileNames) Sim1 = numpy.reshape(Sim, (Sim.shape[0]*Sim.shape[1], 1)) plt.hist(Sim1) plt.show() elif argv[1]=="--audio-event-dir": files = "*.wav" inputFolder = argv[2] if os.path.isdir(inputFolder): strFilePattern = os.path.join(inputFolder, files) else: strFilePattern = inputFolder + files wavFilesList = [] wavFilesList.extend(glob.glob(strFilePattern)) wavFilesList = sorted(wavFilesList) for i,w in enumerate(wavFilesList): [flagsInd, classesAll, acc, CM] = aS.mtFileClassification(w, "data/svmMovies8classes", "svm", False, '') histTemp = numpy.zeros( (len(classesAll), ) ) for f in flagsInd: histTemp[int(f)] += 1.0 histTemp /= histTemp.sum() if i==0: print( "".ljust(100)+"\t",) for C in classesAll: print( C.ljust(12)+"\t",) print() print (w.ljust(100)+"\t",) for h in histTemp: print( "{0:.2f}".format(h).ljust(12)+"\t",) print() return 0
def run(wavFileName2, bagFile2):
    time = 0
    segmentDuration = 0
    segments = []

    # >> Open WAV file
    # ----------------------
    # audioGlobals.wavFileName -> global variable
    audioGlobals.wavFileName = wavFileName2
    audioGlobals.bagFile = bagFile2
    audioGlobals.spf = wave.open(audioGlobals.wavFileName, 'r')

    # Extract raw audio from WAV file
    audioGlobals.signal = audioGlobals.spf.readframes(-1)
    audioGlobals.signal = np.fromstring(audioGlobals.signal, 'Int16')
    #self.axes.clear()

    # Get wavFile audioGlobals.duration
    frames = audioGlobals.spf.getnframes()
    rate = audioGlobals.spf.getframerate()
    audioGlobals.duration = frames / float(rate)

    # >> Open CSV file
    # ----------------------
    # check if .csv exists
    csvFileName = audioGlobals.bagFile.replace(".bag", "_audio.csv")
    if os.path.isfile(csvFileName):
        annotationFile = open(csvFileName, 'rb')
        read = csv.reader(annotationFile)
        for row in read:
            row[0] = float(row[0])
            row[1] = float(row[1])
            audioGlobals.annotations.append([row[0], row[1], row[2]])

        # get each speaker's unique color for the annotation plot and Gantt chart
        #print len(audioGlobals.GreenShades)
        for shadeIndex in range(len(audioGlobals.annotations)):
            if audioGlobals.annotations[shadeIndex][2][:8] == 'Speech::':
                #print audioGlobals.greenIndex, len(audioGlobals.GreenShades)-1
                if audioGlobals.greenIndex >= (len(audioGlobals.GreenShades) - 1):
                    audioGlobals.greenIndex = 0
                else:
                    audioGlobals.greenIndex = audioGlobals.greenIndex + 1
                #print audioGlobals.greenIndex, shadeIndex
                audioGlobals.shadesAndSpeaker.append(
                    [audioGlobals.annotations[shadeIndex][2],
                     audioGlobals.GreenShades[audioGlobals.greenIndex]])

    # >> Call classifier in case the CSV file does not exist
    # ----------------------
    else:
        [flagsInd, classesAll, acc, CM] = aS.mtFileClassification(
            audioGlobals.wavFileName,
            os.path.abspath('audio/ClassifierMethods/svmModelTest'),
            'svm', False)

        # declare classes
        [segs, classes] = aS.flags2segs(flagsInd, 1)
        lengthClass = len(classesAll)
        className = np.arange(lengthClass, dtype=np.float)
        for j in range(len(segs)):
            # no annotation for Silence segments
            for i in range(len(classesAll)):
                if classes[j] == className[i] and classesAll[i] != 'Silence':
                    audioGlobals.annotations.append(
                        [segs[j][0] * 1000, segs[j][1] * 1000, classesAll[i]])

        # >> Write annotations in csv file
        csvFileName = audioGlobals.bagFile.replace(".bag", "_audio.csv")
        annotationFile = open(csvFileName, 'w')
        write = csv.writer(annotationFile)
        write.writerows(audioGlobals.annotations)
        annotationFile.close()
from pyAudioAnalysis import audioSegmentation as aS

[flagsInd, classesAll, acc, CM] = aS.mtFileClassification("keys.wav", "svmTaps",
                                                          "svm", True)
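# A possible follow-up (not in the original snippet): turn the per-window
# flags into (start, end, label) segments with aS.flags2segs, assuming the
# model was trained with a 1.0-second mid-term step:
flags = [classesAll[int(f)] for f in flagsInd]
segs, classes = aS.flags2segs(flags, 1.0)
for (start, end), label in zip(segs, classes):
    print("{:6.1f}s - {:6.1f}s : {}".format(start, end, label))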
from pyAudioAnalysis import audioTrainTest as aT
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
from pyAudioAnalysis import audioSegmentation as aS
import matplotlib.pyplot as plt
import numpy as np
from scipy import fftpack

audiofile = '/Users/kaixiwang/Documents/USC/CSCI-576/FinalProject/dataset2/Videos/data_test2.wav'
adaudio = ['/Users/kaixiwang/Documents/USC/CSCI-576/FinalProject/dataset/Ads/Subway_Ad_15s.wav',
           '/Users/kaixiwang/Documents/USC/CSCI-576/FinalProject/dataset/Ads/Starbucks_Ad_15s.wav']

[flagsInd, classesAll, acc, CM] = aS.mtFileClassification(
    audiofile, "data/svmSM", "svm", True, 'data/scottish.segments')

[Fs, x] = audioBasicIO.readAudioFile(audiofile)
X = fftpack.fft(x)
freqs = fftpack.fftfreq(len(x)) * Fs

fig, ax = plt.subplots()
ax.stem(freqs, np.abs(X))
ax.set_xlabel('Frequency in Hertz [Hz]')
ax.set_ylabel('Frequency Domain (Spectrum) Magnitude')
ax.set_xlim(-Fs / 2, Fs / 2)
ax.set_ylim(-5, 110)

[Fs, x] = audioBasicIO.readAudioFile(audiofile)
F, f_names = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)
plt.subplot(2, 1, 1)
plt.plot(F[0, :])
plt.xlabel('Frame no')
plt.ylabel(f_names[0])
plt.subplot(2, 1, 2)
plt.plot(F[1, :])
plt.xlabel('Frame no')
plt.ylabel(f_names[1])
plt.show()

#========================
# channel1, ad1 and fft are defined elsewhere in the original script
n = len(channel1)
fourier = fft.fft(ad1)
def run(wavFileName2, bagFile2):
    time = 0
    segmentDuration = 0
    segments = []

    # >> Open WAV file
    # ----------------------
    # audioGlobals.wavFileName -> global variable
    audioGlobals.wavFileName = wavFileName2
    audioGlobals.bagFile = bagFile2
    audioGlobals.spf = wave.open(audioGlobals.wavFileName, 'r')

    # Extract raw audio from WAV file
    audioGlobals.signal = audioGlobals.spf.readframes(-1)
    audioGlobals.signal = np.fromstring(audioGlobals.signal, 'Int16')
    #self.axes.clear()

    # Get wavFile audioGlobals.duration
    frames = audioGlobals.spf.getnframes()
    rate = audioGlobals.spf.getframerate()
    audioGlobals.duration = frames / float(rate)

    # >> Open CSV file
    # ----------------------
    # check if .csv exists
    csvFileName = audioGlobals.bagFile.replace(".bag", "_audio.csv")
    if os.path.isfile(csvFileName):
        annotationFile = open(csvFileName, 'rb')
        read = csv.reader(annotationFile)
        for row in read:
            row[0] = float(row[0])
            row[1] = float(row[1])
            audioGlobals.annotations.append([row[0], row[1], row[2]])

        # get each speaker's unique color for the annotation plot and Gantt chart
        #print len(audioGlobals.GreenShades)
        for shadeIndex in range(len(audioGlobals.annotations)):
            if audioGlobals.annotations[shadeIndex][2][:8] == 'Speech::':
                #print audioGlobals.greenIndex, len(audioGlobals.GreenShades)-1
                if audioGlobals.greenIndex >= (len(audioGlobals.GreenShades) - 1):
                    audioGlobals.greenIndex = 0
                else:
                    audioGlobals.greenIndex = audioGlobals.greenIndex + 1
                #print audioGlobals.greenIndex, shadeIndex
                audioGlobals.shadesAndSpeaker.append([
                    audioGlobals.annotations[shadeIndex][2],
                    audioGlobals.GreenShades[audioGlobals.greenIndex]
                ])

    # >> Call classifier in case the CSV file does not exist
    # ----------------------
    else:
        [flagsInd, classesAll, acc, CM] = aS.mtFileClassification(
            audioGlobals.wavFileName, 'svmModelTest', 'svm', False)

        # declare classes
        [segs, classes] = aS.flags2segs(flagsInd, 1)
        lengthClass = len(classesAll)
        className = np.arange(lengthClass, dtype=np.float)
        for j in range(len(segs)):
            # no annotation for Silence segments
            for i in range(len(classesAll)):
                if classes[j] == className[i] and classesAll[i] != 'Silence':
                    audioGlobals.annotations.append(
                        [segs[j][0] * 1000, segs[j][1] * 1000, classesAll[i]])

        # >> Write annotations in csv file
        csvFileName = audioGlobals.bagFile.replace(".bag", "_audio.csv")
        annotationFile = open(csvFileName, 'w')
        write = csv.writer(annotationFile)
        write.writerows(audioGlobals.annotations)
        annotationFile.close()
"""! @brief Example 31B @details: Speech music discrimination and segmentation (using a trained speech - music segment classifier) Important: Need to run 31A first to extract speech music model (stored in svm_speech_music) @author Theodoros Giannakopoulos {[email protected]} """ from pyAudioAnalysis.audioSegmentation import mtFileClassification if __name__ == '__main__': au = "../data/scottish_radio.wav" gt = "../data/scottish_radio.segments" # au = "../data/musical_genres_small/hiphop/run_dmc_peter_riper.wav" mtFileClassification(au, "svm_speech_music", "svm_rbf", True, gt)
def Classify(wavFileName):
    # Segmentation and Classification
    #os.chdir('Home/Documents/python/audioGraph')
    [flagsInd, classesAll, acc] = aS.mtFileClassification(wavFileName,
                                                          'svmModelTest',
                                                          'svm', False)
    print(flagsInd, classesAll)
'''
Segmentation
'''
import subprocess
from pyAudioAnalysis import audioSegmentation as aS

'''
Fixed-segment Segmentation & Classification
'''
# [flagsInd, classesAll, acc, CM] = aS.mtFileClassification("../../audio-source/SMTest/voice_speech.mp3", "pyAudioAnalysis/data/svmSM", "svm", True, 'output/voice_speech.segments')
[flagsInd, classesAll, acc, CM] = aS.mtFileClassification(
    "pyAudioAnalysis/data/scottish.wav", "pyAudioAnalysis/data/svmSM", "svm",
    True, 'output/scottish.segments')

# Command-line use:
# python audioAnalysis.py segmentClassifyFile -i <inputFile> --model <model type (svm or knn)> --modelName <path to classifier model>
# Example:
# python audioAnalysis.py segmentClassifyFile -i data/scottish.wav --model svm --modelName data/svmSM
# subprocess.call("cd pyAudioAnalysis; "
#                 "python audioAnalysis.py segmentClassifyFile -i data/scottish.wav --model svm --modelName data/svmSM", shell=True)
from pyAudioAnalysis import audioSegmentation as aS

[flagsIndknn, classesAll, acc, CM] = aS.mtFileClassification(
    "/Users/mclaugh/Desktop/MLK_2/Martin Luther King The Three Evils of Society-j8d-IYSM-08.WAV",
    "/Volumes/McLaughlin-6TB-1/Dropbox/test_set_616_clips/knn_MLK_bg",
    "knn", True)
def main(argv): if argv[1] == "-shortTerm": for i in range(nExp): [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav"); duration = x.shape[0] / float(Fs) t1 = time.clock() F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs); t2 = time.clock() perTime1 = duration / (t2-t1); print "short-term feature extraction: {0:.1f} x realtime".format(perTime1) elif argv[1] == "-classifyFile": for i in range(nExp): [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav"); duration = x.shape[0] / float(Fs) t1 = time.clock() aT.fileClassification("diarizationExample.wav", "svmSM","svm") t2 = time.clock() perTime1 = duration / (t2-t1); print "Mid-term feature extraction + classification \t {0:.1f} x realtime".format(perTime1) elif argv[1] == "-mtClassify": for i in range(nExp): [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav"); duration = x.shape[0] / float(Fs) t1 = time.clock() [flagsInd, classesAll, acc] = aS.mtFileClassification("diarizationExample.wav", "svmSM", "svm", False, '') t2 = time.clock() perTime1 = duration / (t2-t1); print "Fix-sized classification - segmentation \t {0:.1f} x realtime".format(perTime1) elif argv[1] == "-hmmSegmentation": for i in range(nExp): [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav"); duration = x.shape[0] / float(Fs) t1 = time.clock() aS.hmmSegmentation('diarizationExample.wav', 'hmmRadioSM', False, '') t2 = time.clock() perTime1 = duration / (t2-t1); print "HMM-based classification - segmentation \t {0:.1f} x realtime".format(perTime1) elif argv[1] == "-silenceRemoval": for i in range(nExp): [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav"); duration = x.shape[0] / float(Fs) t1 = time.clock() [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav"); segments = aS.silenceRemoval(x, Fs, 0.050, 0.050, smoothWindow = 1.0, Weight = 0.3, plot = False) t2 = time.clock() perTime1 = duration / (t2-t1); print "Silence removal \t {0:.1f} x realtime".format(perTime1) elif argv[1] == "-thumbnailing": for i in range(nExp): [Fs1, x1] = audioBasicIO.readAudioFile("scottish.wav") duration1 = x1.shape[0] / float(Fs1) t1 = time.clock() [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x1, Fs1, 1.0, 1.0, 15.0) # find thumbnail endpoints t2 = time.clock() perTime1 = duration1 / (t2-t1); print "Thumbnail \t {0:.1f} x realtime".format(perTime1) elif argv[1] == "-diarization-noLDA": for i in range(nExp): [Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav") duration1 = x1.shape[0] / float(Fs1) t1 = time.clock() aS.speakerDiarization("diarizationExample.wav", 4, LDAdim = 0, PLOT = False) t2 = time.clock() perTime1 = duration1 / (t2-t1); print "Diarization \t {0:.1f} x realtime".format(perTime1) elif argv[1] == "-diarization-LDA": for i in range(nExp): [Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav") duration1 = x1.shape[0] / float(Fs1) t1 = time.clock() aS.speakerDiarization("diarizationExample.wav", 4, PLOT = False) t2 = time.clock() perTime1 = duration1 / (t2-t1); print "Diarization \t {0:.1f} x realtime".format(perTime1)
from pyAudioAnalysis import audioSegmentation as aS

[flagsInd, classesAll, acc, CM] = aS.mtFileClassification(
    "./audiofiles/c3_0629.wav", "./knnMusicGenre2", "knn", False,
    './audiofiles/c3_0629.segments')
specgram, TimeAxis, FreqAxis = audioFeatureExtraction.stSpectogram(  # call head reconstructed; the excerpt began mid-call
    x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)

print("\n\n\n * * * TEST 3 * * * \n\n\n")
[Fs, x] = audioBasicIO.read_audio_file(root_data_path + "pyAudioAnalysis/data/doremi.wav")
x = audioBasicIO.stereo_to_mono(x)
specgram, TimeAxis, FreqAxis = audioFeatureExtraction.stChromagram(
    x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)

print("\n\n\n * * * TEST 4 * * * \n\n\n")
aT.featureAndTrain([root_data_path + "SM/speech", root_data_path + "SM/music"],
                   1.0, 1.0, 0.2, 0.2, "svm", "temp", True)

print("\n\n\n * * * TEST 5 * * * \n\n\n")
[flagsInd, classesAll, acc, CM] = aS.mtFileClassification(
    root_data_path + "pyAudioAnalysis/data/scottish.wav",
    root_data_path + "pyAudioAnalysis/data/svmSM", "svm", True,
    root_data_path + 'pyAudioAnalysis/data/scottish.segments')

print("\n\n\n * * * TEST 6 * * * \n\n\n")
aS.trainHMM_fromFile(root_data_path + 'radioFinal/train/bbc4A.wav',
                     root_data_path + 'radioFinal/train/bbc4A.segments',
                     'hmmTemp1', 1.0, 1.0)
aS.trainHMM_fromDir(root_data_path + 'radioFinal/small', 'hmmTemp2', 1.0, 1.0)
aS.hmmSegmentation(root_data_path + 'pyAudioAnalysis/data/scottish.wav', 'hmmTemp1',
                   True, root_data_path + 'pyAudioAnalysis/data/scottish.segments')  # test 1
aS.hmmSegmentation(root_data_path + 'pyAudioAnalysis/data/scottish.wav', 'hmmTemp2',
                   True, root_data_path + 'pyAudioAnalysis/data/scottish.segments')  # test 2

print("\n\n\n * * * TEST 7 * * * \n\n\n")
print("\n\n\n * * * TEST 2 * * * \n\n\n") [Fs, x] = audioBasicIO.readAudioFile(root_data_path + "pyAudioAnalysis/data/doremi.wav") x = audioBasicIO.stereo2mono(x) specgram, TimeAxis, FreqAxis = audioFeatureExtraction.stSpectogram(x, Fs, round(Fs * 0.040), round(Fs * 0.040), True) print("\n\n\n * * * TEST 3 * * * \n\n\n") [Fs, x] = audioBasicIO.readAudioFile(root_data_path + "pyAudioAnalysis/data/doremi.wav") x = audioBasicIO.stereo2mono(x) specgram, TimeAxis, FreqAxis = audioFeatureExtraction.stChromagram(x, Fs, round(Fs * 0.040), round(Fs * 0.040), True) print("\n\n\n * * * TEST 4 * * * \n\n\n") aT.featureAndTrain([root_data_path +"SM/speech",root_data_path + "SM/music"], 1.0, 1.0, 0.2, 0.2, "svm", "temp", True) print("\n\n\n * * * TEST 5 * * * \n\n\n") [flagsInd, classesAll, acc, CM] = aS.mtFileClassification(root_data_path + "pyAudioAnalysis/data//scottish.wav", root_data_path + "pyAudioAnalysis/data/svmSM", "svm", True, root_data_path + 'pyAudioAnalysis/data/scottish.segments') print("\n\n\n * * * TEST 6 * * * \n\n\n") aS.trainHMM_fromFile(root_data_path + 'radioFinal/train/bbc4A.wav', root_data_path + 'radioFinal/train/bbc4A.segments', 'hmmTemp1', 1.0, 1.0) aS.trainHMM_fromDir(root_data_path + 'radioFinal/small', 'hmmTemp2', 1.0, 1.0) aS.hmmSegmentation(root_data_path + 'pyAudioAnalysis/data//scottish.wav', 'hmmTemp1', True, root_data_path + 'pyAudioAnalysis/data//scottish.segments') # test 1 aS.hmmSegmentation(root_data_path + 'pyAudioAnalysis/data//scottish.wav', 'hmmTemp2', True, root_data_path + 'pyAudioAnalysis/data//scottish.segments') # test 2 print("\n\n\n * * * TEST 7 * * * \n\n\n") aT.featureAndTrainRegression(root_data_path + "pyAudioAnalysis/data/speechEmotion", 1, 1, 0.050, 0.050, "svm_rbf", "temp.mod", compute_beat=False) print(aT.fileRegression(root_data_path + "pyAudioAnalysis/data/speechEmotion/01.wav", "temp.mod", "svm_rbf")) print("\n\n\n * * * TEST 8 * * * \n\n\n") aT.featureAndTrainRegression(root_data_path + "pyAudioAnalysis/data/speechEmotion", 1, 1, 0.050, 0.050, "svm", "temp.mod", compute_beat=False) print(aT.fileRegression(root_data_path + "pyAudioAnalysis/data/speechEmotion/01.wav", "temp.mod", "svm"))
def run(wavFileName2, bagFile2):
    global wavFileName
    global bagFile
    global xStart
    global xEnd
    global annotationFlag, annotations, shadesAndSpeaker, greenIndex
    global spf, duration, signal

    time = 0
    segmentDuration = 0
    segments = []

    # >> Open WAV file
    # ----------------------
    # wavFileName -> global variable
    wavFileName = wavFileName2
    bagFile = bagFile2
    spf = wave.open(wavFileName, 'r')

    # Extract raw audio from WAV file
    signal = spf.readframes(-1)
    signal = np.fromstring(signal, 'Int16')
    #self.axes.clear()

    # Get wavFile duration
    frames = spf.getnframes()
    rate = spf.getframerate()
    duration = frames / float(rate)

    # >> Open CSV file
    # ----------------------
    # check if .csv exists
    csvFileName = bagFile.replace(".bag", "_audio.csv")
    if os.path.isfile(csvFileName):
        # print '.csv Found !'
        annotationFile = open(csvFileName, 'rb')
        read = csv.reader(annotationFile)
        for row in read:
            row[0] = float(row[0])
            row[1] = float(row[1])
            annotations.append([row[0], row[1], row[2]])

        # get each speaker's unique color for the annotation plot and Gantt chart
        for shadeIndex in range(len(annotations)):
            if annotations[shadeIndex][2][:8] == 'Speech::':
                shadesAndSpeaker.append(
                    [annotations[shadeIndex][2], GreenShades[greenIndex]])
                # was "> len(GreenShades)", which could index past the end
                if greenIndex >= len(GreenShades) - 1:
                    greenIndex = 0
                else:
                    greenIndex = greenIndex + 1

    # >> Call classifier in case the CSV file does not exist
    # ----------------------
    else:
        # print 'classifier...'
        [flagsInd, classesAll, acc] = aS.mtFileClassification(wavFileName,
                                                              'svmModelTest',
                                                              'svm', False)
        # declare classes
        [segs, classes] = aS.flags2segs(flagsInd, 1)
        lengthClass = len(classesAll)
        className = np.arange(lengthClass, dtype=np.float)
        for j in range(len(segs)):
            # no annotation for Silence segments
            for i in range(len(classesAll)):
                if classes[j] == className[i] and classesAll[i] != 'Silence':
                    annotations.append(
                        [segs[j][0] * 1000, segs[j][1] * 1000, classesAll[i]])

    # >> Initialize GUI
    # ----------------------
    qApp = QtWidgets.QApplication(sys.argv)
    aw = ApplicationWindow()
    aw.setWindowTitle("Audio")
    aw.show()

    # >> Terminate GUI
    # ----------------------
    sys.exit(qApp.exec_())
def evaluateSpeechMusic(fileName, modelName, method="svm", postProcess=0, postProcessModelName="", PLOT=False): # load grount truth file (matlab annotation) matFile = fileName.replace(".wav", "_true.mat") if os.path.isfile(matFile): matfile = loadmat(matFile) segs_gt = matfile["segs_r"] classes_gt1 = matfile["classes_r"] classes_gt = [] for c in classes_gt1[0]: if c == "M": classes_gt.append("music") if c == "S" or c == "E": classes_gt.append("speech") flagsIndGT, classesAllGT = audioSegmentation.segs2flags( [s[0] for s in segs_gt], [s[1] for s in segs_gt], classes_gt, 1.0) if method == "svm" or method == "randomforest" or method == "gradientboosting" or method == "extratrees": # speech-music segmentation: [flagsInd, classesAll, acc, CM] = audioSegmentation.mtFileClassification(fileName, modelName, method, False, '') elif method == "hmm": [flagsInd, classesAll, _, _] = audioSegmentation.hmmSegmentation(fileName, modelName, PLOT=False, gtFileName="") elif method == "cnn": WIDTH_SEC = 2.4 [Fs, x] = io.readAudioFile(fileName) x = io.stereo2mono(x) [flagsInd, classesAll, CNNprobs] = mtCNN_classification(x, Fs, WIDTH_SEC, 1.0, RGB_singleFrame_net, SOUND_mean_RGB, transformer_RGB, classNamesCNN) for i in range(flagsIndGT.shape[0]): flagsIndGT[i] = classesAll.index(classesAllGT[flagsIndGT[i]]) #plt.plot(flagsIndGT, 'r') #plt.plot(flagsInd) #plt.show() #print classesAllGT, classesAll if postProcess >= 1: # medfilt here! flagsInd = scipy.signal.medfilt(flagsInd, 11) if postProcess >= 2: #load HMM try: fo = open(postProcessModelName, "rb") except IOError: print "didn't find file" return try: hmm = cPickle.load(fo) classesAll = cPickle.load(fo) except: fo.close() #Features = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs); # feature extraction #[Features, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050), round(Fs * 0.050)) flagsInd = hmm.predict(CNNprobs) flagsInd = scipy.signal.medfilt(flagsInd, 3) if PLOT: plt.plot(flagsInd + 0.01) plt.plot(flagsIndGT, 'r') plt.show() CM = np.zeros((2, 2)) for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])): CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1 print CM return CM, classesAll
def main(argv): if argv[1] == "-shortTerm": for i in range(nExp): [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav") duration = x.shape[0] / float(Fs) t1 = time.clock() F = audioFeatureExtraction.stFeatureExtraction( x, Fs, 0.050 * Fs, 0.050 * Fs) t2 = time.clock() perTime1 = duration / (t2 - t1) print "short-term feature extraction: {0:.1f} x realtime".format( perTime1) elif argv[1] == "-classifyFile": for i in range(nExp): [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav") duration = x.shape[0] / float(Fs) t1 = time.clock() aT.fileClassification("snakehit.wav", "svmSM", "svm") t2 = time.clock() perTime1 = duration / (t2 - t1) print "Mid-term feature extraction + classification \t {0:.1f} x realtime".format( perTime1) elif argv[1] == "-mtClassify": for i in range(nExp): [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav") duration = x.shape[0] / float(Fs) t1 = time.clock() [flagsInd, classesAll, acc] = aS.mtFileClassification("snakehit.wav", "svmSM", "svm", False, '') t2 = time.clock() perTime1 = duration / (t2 - t1) print "Fix-sized classification - segmentation \t {0:.1f} x realtime".format( perTime1) elif argv[1] == "-hmmSegmentation": for i in range(nExp): [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav") duration = x.shape[0] / float(Fs) t1 = time.clock() aS.hmmSegmentation('snakehit.wav', 'hmmRadioSM', False, '') t2 = time.clock() perTime1 = duration / (t2 - t1) print "HMM-based classification - segmentation \t {0:.1f} x realtime".format( perTime1) elif argv[1] == "-silenceRemoval": for i in range(nExp): [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav") duration = x.shape[0] / float(Fs) t1 = time.clock() [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav") segments = aS.silenceRemoval(x, Fs, 0.050, 0.050, smoothWindow=1.0, Weight=0.3, plot=False) t2 = time.clock() perTime1 = duration / (t2 - t1) print "Silence removal \t {0:.1f} x realtime".format(perTime1) elif argv[1] == "-thumbnailing": for i in range(nExp): [Fs1, x1] = audioBasicIO.readAudioFile("scottish.wav") duration1 = x1.shape[0] / float(Fs1) t1 = time.clock() [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x1, Fs1, 1.0, 1.0, 15.0) # find thumbnail endpoints t2 = time.clock() perTime1 = duration1 / (t2 - t1) print "Thumbnail \t {0:.1f} x realtime".format(perTime1) elif argv[1] == "-diarization-noLDA": for i in range(nExp): [Fs1, x1] = audioBasicIO.readAudioFile("snakehit.wav") duration1 = x1.shape[0] / float(Fs1) t1 = time.clock() aS.speakerDiarization("snakehit.wav", 4, LDAdim=0, PLOT=False) t2 = time.clock() perTime1 = duration1 / (t2 - t1) print "Diarization \t {0:.1f} x realtime".format(perTime1) elif argv[1] == "-diarization-LDA": for i in range(nExp): [Fs1, x1] = audioBasicIO.readAudioFile("snakehit.wav") duration1 = x1.shape[0] / float(Fs1) t1 = time.clock() aS.speakerDiarization("snakehit.wav", 4, PLOT=False) t2 = time.clock() perTime1 = duration1 / (t2 - t1) print "Diarization \t {0:.1f} x realtime".format(perTime1)
if not os.path.isfile(modelName):
    # (indentation reconstructed; the excerpt's original nesting was lost)
    modelName = default_modelName
    if not os.path.isfile(modelName):
        print('Cannot locate model file {}'.format(modelName))
    else:
        # detect mic configuration by analyzing input wav file
        modelName = get_model_path(args['inputWavFile'])
if args['debug']:
    print('\tusing: {}'.format(modelName))

model_time = time.time() - start_time

modelType = "svm"
gtFile = ""
returnVal = aS.mtFileClassification(args['inputWavFile'], modelName,
                                    modelType, False, gtFile)
flagsInd = returnVal[0]
classNames = returnVal[1]
flags = [classNames[int(f)] for f in flagsInd]
(segs, classes) = aS.flags2segs(flags, 1)

for s in range(len(segs)):
    sg = segs[s]
    diff = int(sg[1]) - int(sg[0])
    if args['debug']:
        # the original applied .format() to print()'s return value,
        # which raises AttributeError
        print('{:>6} - {:>6} ({:>6}) : {}'.format(sg[0], sg[1], diff, classes[s]))
    my_segments.append(Segment(int(sg[0]), int(sg[1]), str(classes[s])))

# Speech and non speech lists
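# The trailing comment suggests the excerpt went on to split my_segments into
# speech and non-speech lists. A hypothetical sketch, assuming Segment keeps
# its third constructor argument in a "label" attribute (the real attribute
# name is not shown in the excerpt):
speech_segments = [s for s in my_segments if s.label == "speech"]
non_speech_segments = [s for s in my_segments if s.label != "speech"]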
def save_out(test_file):
    [flags_ind, classes_all, acc] = aS.mtFileClassification(test_file,
                                                            args.model,
                                                            "knn", False)
    np.save(test_file, flags_ind)
    return classes_all
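# Hypothetical round trip (not in the original source): np.save appends
# ".npy" to the given name, so the saved flags can be reloaded and mapped
# back to labels via the returned class names ("test.wav" is illustrative):
import numpy as np

classes_all = save_out("test.wav")
flags_ind = np.load("test.wav.npy")
labels = [classes_all[int(f)] for f in flags_ind]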
""" @details: Speech Non-speech discrimination and segmentation (using a trained speech - non segment classifier) Important: Need to run speech_non_speech.py first to extract speech non_speech model (stored in svm_speech_non_speech) """ from pyAudioAnalysis.audioSegmentation import mtFileClassification if __name__ == '__main__': au = "/media/vlachos/4e757fbf-09d9-4276-a1f4-af671280a9bb/NCSR-UOP/Multimodal Information Processing and Analysis/audio/speech_non_speech_test.wav" gt = "/media/vlachos/4e757fbf-09d9-4276-a1f4-af671280a9bb/NCSR-UOP/Multimodal Information Processing and Analysis/audio/speech_non_speech_test.txt" mtFileClassification(au, "svm_speech_non_speech", "svm_rbf", True, gt)
# boundary speech model
# (the opening of get_model_path, including where mic_model and lapel_model
# are defined, is not part of this excerpt; the def line is reconstructed
# from the calls below)
def get_model_path(wavFile):
    boundary_model = "model/svmNoLapelSpeechModel"
    # run the classification model on the audio file
    [Result, P, classNames] = aT.fileClassification(wavFile, mic_model, "svm")
    Result = int(Result)
    # if the winner class is boundary_speech, return the path of the boundary
    # speech model; otherwise return the path of the lapel speech model
    if classNames[Result] == "boundry_speech":
        return boundary_model
    else:
        return lapel_model

# argument handler
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--input", required=True, help="path to the audio file")
args = vars(ap.parse_args())
audio_file = args["input"]

# determine speech model for audio file
speech_model = get_model_path(audio_file)

# run predicted speech model to segment audio file
segmentation = aS.mtFileClassification(audio_file, speech_model, "svm",
                                       False, gtFile="")
# # check for silence
# silent = True
# # wave frame samples are stored in little endian**
# # this example works for a single channel 16-bit per sample encoding
# unpacked_signed_value = struct.unpack("<h", current_frame)  # *
# if abs(unpacked_signed_value[0]) > 500:
#     silent = False
#
# if silent:
#     print("Frame %s is silent." % wave_file.tell())
# else:
#     print("Frame %s is not silent." % wave_file.tell())

# rate, data = wf.read('testing.wav')
# # data0 is the data from channel 0.
# data0 = data[:, 0]
# print(data0)

# from pydub import AudioSegment
# from pydub.silence import detect_silence, detect_nonsilent
# song = AudioSegment.from_wav("soundaudio.wav")
# val = detect_silence(song)
# print(val)

from pyAudioAnalysis import audioSegmentation as aS

[flagsInd, classesAll, acc, CM] = aS.mtFileClassification(
    "data/scottish.wav", "data/svmSM", "svm", True, 'data/scottish.segments')
def get_classification():
    au = "../audio/Adam_Driver_and_Michael_Shannon.wav"
    gt = "annotated_data/Adam_Driver_and_Michael_Shannon.segments"
    # au = "../data/musical_genres_small/hiphop/run_dmc_peter_riper.wav"
    mtFileClassification(au, "diarization", "knn", True, gt)