def thumbnailWrapper(inputFile, thumbnailWrapperSize):
    st_window = 0.5
    st_step = 0.5
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")

    [fs, x] = audioBasicIO.readAudioFile(inputFile)
    if fs == -1:    # could not read file
        return

    [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x, fs, st_window,
                                                     st_step,
                                                     thumbnailWrapperSize)

    # write thumbnailWrappers to WAV files:
    if inputFile.endswith(".wav"):
        thumbnailWrapperFileName1 = inputFile.replace(".wav", "_thumb1.wav")
        thumbnailWrapperFileName2 = inputFile.replace(".wav", "_thumb2.wav")
    if inputFile.endswith(".mp3"):
        thumbnailWrapperFileName1 = inputFile.replace(".mp3", "_thumb1.mp3")
        thumbnailWrapperFileName2 = inputFile.replace(".mp3", "_thumb2.mp3")
    wavfile.write(thumbnailWrapperFileName1, fs, x[int(fs * A1):int(fs * A2)])
    wavfile.write(thumbnailWrapperFileName2, fs, x[int(fs * B1):int(fs * B2)])
    print("1st thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec"
          " -- {2:4.1f}sec".format(thumbnailWrapperFileName1, A1, A2))
    print("2nd thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec"
          " -- {2:4.1f}sec".format(thumbnailWrapperFileName2, B1, B2))

    # Plot self-similarity matrix:
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect="auto")
    plt.imshow(Smatrix)
    # Plot best-similarity diagonal:
    Xcenter = (A1 / st_step + A2 / st_step) / 2.0
    Ycenter = (B1 / st_step + B2 / st_step) / 2.0

    e1 = matplotlib.patches.Ellipse((Ycenter, Xcenter),
                                    thumbnailWrapperSize * 1.4, 3, angle=45,
                                    linewidth=3, fill=False)
    ax.add_patch(e1)

    plt.plot([B1 / st_step, Smatrix.shape[0]], [A1 / st_step, A1 / st_step],
             color="k", linestyle="--", linewidth=2)
    plt.plot([B2 / st_step, Smatrix.shape[0]], [A2 / st_step, A2 / st_step],
             color="k", linestyle="--", linewidth=2)
    plt.plot([B1 / st_step, B1 / st_step], [A1 / st_step, Smatrix.shape[0]],
             color="k", linestyle="--", linewidth=2)
    plt.plot([B2 / st_step, B2 / st_step], [A2 / st_step, Smatrix.shape[0]],
             color="k", linestyle="--", linewidth=2)

    plt.xlim([0, Smatrix.shape[0]])
    plt.ylim([Smatrix.shape[1], 0])

    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()

    plt.xlabel("frame no")
    plt.ylabel("frame no")
    plt.title("Self-similarity matrix")

    plt.show()
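# Usage sketch (added for illustration, not in the original source;
# "song.wav" is a hypothetical input): extract two 20-second thumbnails
# and show the self-similarity matrix.
thumbnailWrapper("song.wav", 20.0)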
def fileChromagramWrapper(wav_file):
    if not os.path.isfile(wav_file):
        raise Exception("Input audio file not found!")
    [fs, x] = audioBasicIO.readAudioFile(wav_file)
    x = audioBasicIO.stereo2mono(x)
    specgram, TimeAxis, FreqAxis = aF.stChromagram(x, fs, round(fs * 0.040),
                                                   round(fs * 0.040), True)
def load_validation_set():
    """
    Output a tuple of features:
    (fft features, mfcc features, mean-std features)

    Description: extracts three types of features from the validation set.
    """
    ffts = dict()
    mfccs = dict()
    mean_stds = dict()
    for i in validation_ids:
        path = './validation/validation.{i}.wav'.format(i=i)
        _, X = read_wav(path)
        # FFT
        fft = np.array(abs(sp.fft(X)[:1000]))
        ffts.update({i: fft})
        # MFCC
        ceps, mspec, spec = mfcc(X)
        num_ceps = len(ceps)
        x = np.mean(ceps[int(num_ceps * 1 / 10):int(num_ceps * 9 / 10)],
                    axis=0)
        mfccs.update({i: x})
        # Mean-Std
        [Fs, x] = audioBasicIO.readAudioFile(path)
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs,
                                                       0.025 * Fs)
        mean_std = []
        for f in F:
            mean_std.extend([f.mean(), f.std()])
        mean_stds.update({i: np.array(mean_std)})
    return (ffts, mfccs, mean_stds)
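# Usage sketch (illustrative; assumes validation_ids, read_wav and mfcc are
# defined as above and ./validation/*.wav files exist):
ffts, mfccs, mean_stds = load_validation_set()
print("validation files processed:", len(ffts))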
def POST(self):
    x = web.input(myfile={})
    filename = 'tmp/' + uuid.uuid4().hex + '.wav'
    file = open(filename, 'wb')  # binary mode: the upload is raw WAV bytes
    file.seek(0)
    file.write(x['myfile'].value)
    file.close()
    [Fs, x] = audioBasicIO.readAudioFile(filename)
    #os.remove(filename)
    x = audioBasicIO.stereo2mono(x)
    [F, _] = audioFeatureExtraction.mtFeatureExtraction(
        x, Fs, round(Fs * 1.0), round(Fs * 1.0), round(Fs * 0.050),
        round(Fs * 0.050))
    F = F.transpose()
    # accumulate votes across all mid-term feature vectors
    results = {}
    current_highest = ""
    current_highest_value = 0
    for vec in F:
        vec = numpy.around(vec.astype(numpy.float), 6)
        current = model.getNN(vec)
        result = current[0][1].partition("_")[0]
        if result in results:
            results[result] = results[result] + 1
        else:
            results[result] = 1
        if results[result] > current_highest_value:
            current_highest_value = results[result]
            current_highest = result
    print(results)
    print(current_highest)
    raise web.seeother('/')
def beatExtractionWrapper(wav_file, plot):
    if not os.path.isfile(wav_file):
        raise Exception("Input audio file not found!")
    [fs, x] = audioBasicIO.readAudioFile(wav_file)
    F, _ = aF.stFeatureExtraction(x, fs, 0.050 * fs, 0.050 * fs)
    bpm, ratio = aF.beatExtraction(F, 0.050, plot)
    print("Beat: {0:d} bpm ".format(int(bpm)))
    print("Ratio: {0:.2f} ".format(ratio))
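# Usage sketch (illustrative; "track.wav" is a hypothetical file):
# estimate tempo from 50 ms non-overlapping short-term features.
beatExtractionWrapper("track.wav", plot=True)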
def silenceRemovalWrapper(inputFile, smoothingWindow, weight):
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")
    [fs, x] = audioBasicIO.readAudioFile(inputFile)
    segmentLimits = aS.silenceRemoval(x, fs, 0.05, 0.05,
                                      smoothingWindow, weight, True)
    for i, s in enumerate(segmentLimits):
        strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(inputFile[0:-4],
                                                    s[0], s[1])
        wavfile.write(strOut, fs, x[int(fs * s[0]):int(fs * s[1])])
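# Usage sketch (illustrative; "speech.wav" is hypothetical): a 1 s smoothing
# window and a 0.3 probability weight are common starting values.
silenceRemovalWrapper("speech.wav", smoothingWindow=1.0, weight=0.3)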
def process_mp3_files():
    files = read_input()
    os.system("touch test.wav")
    for mp3_file in files:
        mean_value = []
        sound = AudioSegment.from_mp3(mp3_file)
        sound.export("test.wav", format="wav")
        # print mp3_file
        [Fs, x] = audioBasicIO.readAudioFile("test.wav")
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs,
                                                       0.025 * Fs)
        for i in range(len(F)):
            mean_value.append(numpy.mean(F[i]))
        compute_emotion(mean_value)
def dirWavFeatureExtractionNoAveraging(dirName, mt_win, mt_step, st_win,
                                       st_step):
    """
    This function extracts the mid-term features of the WAVE
    files of a particular folder without averaging each file.

    ARGUMENTS:
        - dirName:          the path of the WAVE directory
        - mt_win, mt_step:  mid-term window and step (in seconds)
        - st_win, st_step:  short-term window and step (in seconds)
    RETURNS:
        - X:                A feature matrix
        - Y:                A matrix of file labels
        - filenames:
    """
    all_mt_feats = numpy.array([])
    signal_idx = numpy.array([])
    process_times = []

    types = ('*.wav', '*.aif', '*.aiff', '*.ogg')
    wav_file_list = []
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(dirName, files)))

    wav_file_list = sorted(wav_file_list)

    for i, wavFile in enumerate(wav_file_list):
        [fs, x] = audioBasicIO.readAudioFile(wavFile)
        if isinstance(x, int):
            continue
        x = audioBasicIO.stereo2mono(x)
        [mt_term_feats, _, _] = mtFeatureExtraction(x, fs,
                                                    round(mt_win * fs),
                                                    round(mt_step * fs),
                                                    round(fs * st_win),
                                                    round(fs * st_step))
        mt_term_feats = numpy.transpose(mt_term_feats)
        if len(all_mt_feats) == 0:  # append feature vector
            all_mt_feats = mt_term_feats
            signal_idx = numpy.zeros((mt_term_feats.shape[0], ))
        else:
            all_mt_feats = numpy.vstack((all_mt_feats, mt_term_feats))
            signal_idx = numpy.append(
                signal_idx, i * numpy.ones((mt_term_feats.shape[0], )))

    return (all_mt_feats, signal_idx, wav_file_list)
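# Usage sketch (illustrative; "wavs" is a hypothetical folder): one row of X
# per 1 s mid-term segment, with signal_idx mapping rows back to files.
X, signal_idx, files = dirWavFeatureExtractionNoAveraging("wavs", 1.0, 1.0,
                                                          0.050, 0.050)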
def mtFeatureExtractionToFile(fileName, midTermSize, midTermStep,
                              shortTermSize, shortTermStep, outPutFile,
                              storeStFeatures=False, storeToCSV=False,
                              PLOT=False):
    """
    This function is used as a wrapper to:
    a) read the content of a WAV file
    b) perform mid-term feature extraction on that signal
    c) write the mid-term feature sequences to a numpy file
    """
    [fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    if storeStFeatures:
        [mtF, stF, _] = mtFeatureExtraction(x, fs,
                                            round(fs * midTermSize),
                                            round(fs * midTermStep),
                                            round(fs * shortTermSize),
                                            round(fs * shortTermStep))
    else:
        [mtF, _, _] = mtFeatureExtraction(x, fs, round(fs * midTermSize),
                                          round(fs * midTermStep),
                                          round(fs * shortTermSize),
                                          round(fs * shortTermStep))
    # save mt features to numpy file
    numpy.save(outPutFile, mtF)
    if PLOT:
        print("Mid-term numpy file: " + outPutFile + ".npy saved")
    if storeToCSV:
        numpy.savetxt(outPutFile + ".csv", mtF.T, delimiter=",")
        if PLOT:
            print("Mid-term CSV file: " + outPutFile + ".csv saved")

    if storeStFeatures:
        # save st features to numpy file
        numpy.save(outPutFile + "_st", stF)
        if PLOT:
            print("Short-term numpy file: " + outPutFile + "_st.npy saved")
        if storeToCSV:
            # store st features to CSV file
            numpy.savetxt(outPutFile + "_st.csv", stF.T, delimiter=",")
            if PLOT:
                print("Short-term CSV file: " + outPutFile + "_st.csv saved")
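# Usage sketch (illustrative; file names are hypothetical): stores
# "sample_feats.npy" plus CSV and short-term variants.
mtFeatureExtractionToFile("sample.wav", 1.0, 1.0, 0.050, 0.050,
                          "sample_feats", storeStFeatures=True,
                          storeToCSV=True, PLOT=False)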
def showFeatures(name):
    print("processing - " + name)
    [Fs, x] = audioBasicIO.readAudioFile(name)
    # print(x)
    F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.50 * Fs,
                                                   0.25 * Fs)
    # print(x.size, Fs, 0.50 * Fs, 0.25 * Fs)
    # a = F[0, :]
    # numpy.savetxt("foo.csv", a, delimiter=",")
    # plt.subplot(3, 1, 1)
    # plt.plot(F[0, :])
    # plt.xlabel('Frame no')
    # plt.ylabel('ZCR')
    #
    # plt.subplot(3, 1, 2)
    # plt.plot(F[1, :])
    # plt.xlabel('Frame no')
    # plt.ylabel('Energy')
    #
    # plt.subplot(3, 1, 3)
    # plt.plot(F[3, :])
    # plt.xlabel('Frame no')
    # plt.ylabel('SC')
    #
    # plt.show()
    # items = ' '.join(map(str, a))
    # print(items)
    # print("--", F[0, :])
    vec = [
        F[0, :].mean(), F[1, :].mean(), F[4, :].mean(), F[5, :].mean(),
        F[6, :].mean(), F[7, :].mean(), F[0, :].std(), F[1, :].std(),
        F[4, :].std(), F[5, :].std(), F[6, :].std(), F[7, :].std()
    ]
    vecstr = ' '.join(map(str, vec))
    print("vector in audio.py : ", vecstr)
    melfeat = melfeature(F)
    # chromafeat = chromafeature(F)
    return vecstr + " " + melfeat
def trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep):
    '''
    This function trains a HMM model for segmentation-classification using
    a directory where WAV files and .segment (ground-truth) files are stored.
    ARGUMENTS:
        - dirPath:       the path of the data directory
        - hmmModelName:  the name of the HMM model to be stored
        - mtWin:         mid-term window size
        - mtStep:        mid-term window step
    RETURNS:
        - hmm:           an object to the resulting HMM
        - classNames:    a list of classNames

    After training, hmm, classNames, along with the mtWin and mtStep values
    are stored in the hmmModelName file.
    '''
    flagsAll = numpy.array([])
    initializedFall = False
    classesAll = []
    # for each WAV file
    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):
        wavFile = f
        # open the annotation file
        gtFile = f.replace('.wav', '.segments')
        # if current WAV file does not have annotation -> skip
        if not os.path.isfile(gtFile):
            continue
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flags, classNames = segs2flags(segStart, segEnd, segLabels,
                                       mtStep)  # convert to flags
        # update classnames:
        for c in classNames:
            if c not in classesAll:
                classesAll.append(c)
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read audio data
        # feature extraction
        [F, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                        round(Fs * 0.050),
                                        round(Fs * 0.050))

        lenF = F.shape[1]
        lenL = len(flags)
        MIN = min(lenF, lenL)
        F = F[:, 0:MIN]
        flags = flags[0:MIN]

        flagsNew = []
        for j, fl in enumerate(flags):  # append features and labels
            flagsNew.append(classesAll.index(classNames[flags[j]]))

        flagsAll = numpy.append(flagsAll, numpy.array(flagsNew))

        if not initializedFall:
            Fall = F
            initializedFall = True
        else:
            Fall = numpy.concatenate((Fall, F), axis=1)

    # compute HMM statistics
    startprob, transmat, means, cov = trainHMM_computeStatistics(Fall,
                                                                 flagsAll)
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")  # train HMM
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmmModelName, "wb")  # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classesAll, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classesAll
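# Usage sketch (illustrative; "annotated" is a hypothetical folder holding
# paired .wav/.segments files): train and store an HMM segmenter with a
# 1.0 s mid-term window and step.
hmm, classNames = trainHMM_fromDir("annotated", "hmmModel", 1.0, 1.0)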
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt

[Fs, x] = audioBasicIO.readAudioFile("demo.wav")
F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)

plt.subplot(2, 1, 1)
plt.plot(F[0, :])
plt.xlabel('Frame no')
plt.ylabel('ZCR')

plt.subplot(2, 1, 2)
plt.plot(F[1, :])
plt.xlabel('Frame no')
plt.ylabel('Energy')

plt.show()
# from pyAudioAnalysis import audioTrainTest as aT
# aT.featureAndTrain(["data/uniform_ah_18/1", "data/uniform_ah_18/2"],
#                    1.0, 1.0, aT.shortTermWindow, aT.shortTermStep,
#                    "svm", "svmSMtemp", False)
# aT.fileClassification("data/doremi.wav", "svmSMtemp", "svm")

from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt

[Fs, x] = audioBasicIO.readAudioFile("data/english.wav")
F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)
# print len(F)
'''
Feature ID  Feature Name        Description
1           Zero Crossing Rate  The rate of sign-changes of the signal during
                                the duration of a particular frame.
2           Energy              The sum of squares of the signal values,
                                normalized by the respective frame length.
3           Entropy of Energy   The entropy of sub-frames' normalized
                                energies. It can be interpreted as a measure
                                of abrupt changes.
4           Spectral Centroid   The center of gravity of the spectrum.
5           Spectral Spread     The second central moment of the spectrum.
6           Spectral Entropy    Entropy of the normalized spectral energies
                                for a set of sub-frames.
7           Spectral Flux       The squared difference between the normalized
                                magnitudes of the spectra of the two
                                successive frames.
8           Spectral Rolloff    The frequency below which 90% of the
                                magnitude distribution of the spectrum is
                                concentrated.
9-21        MFCCs               Mel Frequency Cepstral Coefficients form a
                                cepstral representation where the frequency
                                bands are not linear but distributed
                                according to the mel-scale.
22-33       Chroma Vector       A 12-element representation of the spectral
                                energy where the bins represent the 12
                                equal-tempered pitch classes of western-type
                                music (semitone spacing).
34          Chroma Deviation    The standard deviation of the 12 chroma
                                coefficients.
'''

fig, ax = plt.subplots(figsize=(12, 15))
fig.suptitle('pyAudioAnalysis', fontsize=14, fontweight='bold')
plt.subplot(13, 1, 1)
plt.plot(F[8, :])
plt.xlabel('Frame no')
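# Illustrative mapping (added, not in the original) from the 1-based feature
# IDs in the table above to the 0-based rows of F, so F[8, :] plotted above
# is the first MFCC:
zcr = F[0, :]        # ID 1:  zero crossing rate
energy = F[1, :]     # ID 2:  energy
mfcc_1 = F[8, :]     # ID 9:  first MFCC coefficient
chroma_1 = F[21, :]  # ID 22: first chroma bin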
def main(argv):
    if argv[1] == "-shortTerm":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs,
                                                           0.050 * Fs)
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print("short-term feature extraction: "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-classifyFile":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            aT.fileClassification("diarizationExample.wav", "svmSM", "svm")
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print("Mid-term feature extraction + classification \t "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-mtClassify":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            [flagsInd, classesAll, acc] = aS.mtFileClassification(
                "diarizationExample.wav", "svmSM", "svm", False, '')
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print("Fix-sized classification - segmentation \t "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-hmmSegmentation":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            aS.hmmSegmentation('diarizationExample.wav', 'hmmRadioSM',
                               False, '')
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print("HMM-based classification - segmentation \t "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-silenceRemoval":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            segments = aS.silenceRemoval(x, Fs, 0.050, 0.050,
                                         smoothWindow=1.0, Weight=0.3,
                                         plot=False)
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print("Silence removal \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-thumbnailing":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("scottish.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(
                x1, Fs1, 1.0, 1.0, 15.0)  # find thumbnail endpoints
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print("Thumbnail \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-diarization-noLDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            aS.speakerDiarization("diarizationExample.wav", 4, LDAdim=0,
                                  PLOT=False)
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print("Diarization \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-diarization-LDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            aS.speakerDiarization("diarizationExample.wav", 4, PLOT=False)
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print("Diarization \t {0:.1f} x realtime".format(perTime1))
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

audio_path = "/home/brainlab/Desktop/Rudas/Data/Propofol/Taken-[AudioTrimmer.com].wav"
[Fs, x] = audioBasicIO.readAudioFile(audio_path)
x = audioBasicIO.stereo2mono(x)

tr = 2
F, f_names = audioFeatureExtraction.stFeatureExtraction(x, Fs, tr * Fs,
                                                        tr * Fs)
np.savetxt('audio_predictors.txt', np.transpose(F[:21]), fmt='%10.6f',
           delimiter=',')

from nilearn.signal import clean
#F = clean(signals=F,
#          detrend=False,
#          standardize=True,
#          ensure_finite=False)

#for feature in range(2):
#    plt.subplot(2, 1, feature + 1)
#    plt.plot(F[feature, :])
def Audio_Feature_Extraction_Extract_Directory(self):
    pathToSaveFiles = QFileDialog.getExistingDirectory(
        self.somethingToPass, "Select Directory to Save Files")
    i = 0
    path = self.Audio_Feature_Extraction_DirectoryPath.text()
    extractionList = []
    nameList = []
    tempHold = []
    for root, directs, files in os.walk(path):
        for x in files:
            extractionList.append(root + "/" + x)
            nameList.append(x)
    for name in nameList:
        x = name.split('.')
        tempHold.append(x[0])
    nameList = tempHold
    if self.Audio_Feature_Extraction_DirectoryWindowTerm.currentText() == \
            "ShortTerm":
        for audio in extractionList:
            [Fs, x] = audioBasicIO.readAudioFile(audio)
            print(audio)
            stFeatures = audioFeatureExtraction.stFeatureExtraction(
                x, Fs,
                float(self.Audio_Feature_Extraction_DirectoryWindowSize
                      .text()) * Fs,
                float(self.Audio_Feature_Extraction_DirectoryStepSize
                      .text()) * Fs)
            # I think the files are overwriting each other. I am getting
            # 299 files but it should be 5134. I changed the namelist[i] to
            # just i
            numpy.savetxt(pathToSaveFiles + "/" + str(i) + ".csv",
                          stFeatures, delimiter=',')
            i += 1
        QMessageBox.about(self.somethingToPass, "Files Created",
                          "Files have been saved as CSV files.")
    else:
        for x in extractionList:
            audioFeatureExtraction.mtFeatureExtractionToFile(
                x,
                float(self.Audio_Feature_Extraction_DirectorymidTermWindowSize
                      .text()),
                float(self
                      .Audio_Feature_Extraction_DirectorymidTermWindowStepSize
                      .text()),
                float(self.Audio_Feature_Extraction_DirectoryWindowSize
                      .text()),
                float(self.Audio_Feature_Extraction_DirectoryStepSize.text()),
                pathToSaveFiles + "/" + nameList[i],
                storeStFeatures=True, storeToCSV=True, PLOT=False)
            i += 1
        QMessageBox.about(
            self.somethingToPass, "Files Created",
            "Files have been saved as CSV files and .npy files. There ")
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt

[Fs1, x1] = audioBasicIO.readAudioFile("happy.wav")
[Fs2, x2] = audioBasicIO.readAudioFile("sad.wav")
# Fs is frequency
# x is real data

th = 100  # fixed feature length
k12 = (len(x1) - 800) / th / float(Fs1)
k22 = (len(x2) - 800) / th / float(Fs2)

F1, f_names1 = audioFeatureExtraction.stFeatureExtraction(x1, Fs1,
                                                          0.05 * Fs1,
                                                          k12 * Fs1)
F2, f_names2 = audioFeatureExtraction.stFeatureExtraction(x2, Fs2,
                                                          0.05 * Fs2,
                                                          k22 * Fs2)
# stFeatureExtraction(signal, fs, win, step):
#   signal:  the input signal samples
#   fs:      the sampling freq (in Hz)
#   win:     the short-term window size (in samples)
#   step:    the short-term window step (in samples)
'''
here,
window size = 0.05*Fs = 0.05*16000 = 800
step size   = 0.025*Fs = 0.025*16000 = 400
we can get n frames from a signal with length 23776:
400*n + 800 = 23776 -> n = 57.44 = 58
as below, F.shape = (34, 58)
'''
def main(argv):
    dirName = argv[1]
    types = ('*.wav', )
    filesList = []
    for files in types:
        filesList.extend(glob.glob(os.path.join(dirName, files)))
    filesList = sorted(filesList)

    WIDTH_SEC = 2.4
    stWin = 0.020
    stStep = 0.015
    WIDTH = WIDTH_SEC / stStep

    for f in filesList:
        [Fs, x] = audioBasicIO.readAudioFile(f)
        print(Fs)
        x = audioBasicIO.stereo2mono(x)
        specgramOr, TimeAxis, FreqAxis = aF.stSpectogram(
            x, Fs, round(Fs * stWin), round(Fs * stStep), False)
        if specgramOr.shape[0] > WIDTH:
            specgram = specgramOr[
                int(specgramOr.shape[0] / 2) - WIDTH / 2:
                int(specgramOr.shape[0] / 2) + WIDTH / 2, :]
            specgram = scipy.misc.imresize(
                specgram, float(227.0) / float(specgram.shape[0]),
                interp='bilinear')
            print(specgram.shape)
            im = Image.fromarray(numpy.uint8(
                matplotlib.cm.jet(specgram) * 255))
            #plt.imshow(im)
            scipy.misc.imsave(f.replace(".wav", ".jpg"), im)

            if int(specgramOr.shape[0] / 2) - WIDTH / 2 - \
                    int((0.2) / stStep) > 0:
                specgram = specgramOr[
                    int(specgramOr.shape[0] / 2) - WIDTH / 2 -
                    int((0.2) / stStep):
                    int(specgramOr.shape[0] / 2) + WIDTH / 2 -
                    int((0.2) / stStep), :]
                specgram = scipy.misc.imresize(
                    specgram, float(227.0) / float(specgram.shape[0]),
                    interp='bilinear')
                im = Image.fromarray(numpy.uint8(
                    matplotlib.cm.jet(specgram) * 255))
                print(specgram.shape)
                scipy.misc.imsave(f.replace(".wav", "_02A.jpg"), im)

                specgram = specgramOr[
                    int(specgramOr.shape[0] / 2) - WIDTH / 2 +
                    int((0.2) / stStep):
                    int(specgramOr.shape[0] / 2) + WIDTH / 2 +
                    int((0.2) / stStep), :]
                specgram = scipy.misc.imresize(
                    specgram, float(227.0) / float(specgram.shape[0]),
                    interp='bilinear')
                print(specgram.shape)
                im = Image.fromarray(numpy.uint8(
                    matplotlib.cm.jet(specgram) * 255))
                scipy.misc.imsave(f.replace(".wav", "_02B.jpg"), im)

                # ONLY FOR SPEECH (fewer samples). Must comment for music
                """specgram = specgramOr[int(specgramOr.shape[0]/2) - WIDTH/2 - int((0.1) / stStep):int(specgramOr.shape[0]/2) + WIDTH/2 - int((0.1) / stStep), :]"""
def remove_silence(filename, out_dir, smoothing=1.0, weight=0.3, plot=False):
    """
    A function that implements pyAudioAnalysis' silence extraction module
    and creates wav files of the participant-specific portions of audio.
    The smoothing and weight parameters were tuned for the AVEC 2016 dataset.

    Parameters
    ----------
    filename : filepath
        path to the input wav file
    out_dir : filepath
        path to the desired directory (where a participant folder will be
        created containing a 'PXXX_no_silence.wav' file)
    smoothing : float
        tunable parameter to compensate for sparseness of recordings
    weight : float
        probability threshold for silence removal used in SVM
    plot : bool
        plots SVM probabilities of silence (used in tuning)

    Returns
    -------
    A folder for each participant containing a single wav file (named
    'PXXX_no_silence.wav') with the vast majority of silence and virtual
    interviewer speech removed. Feature extraction is performed on these
    segmented wav files.
    """
    # print(filename.split('/')[-1].split('_')[0], 'filename')
    partic_id = 'P' + \
        filename.split('/')[-1].split('_')[0].split('\\')[1]  # PXXX
    print(partic_id, 'partic_id')
    if is_segmentable(partic_id):
        # create participant directory for segmented wav files
        participant_dir = os.path.join(out_dir, partic_id)
        if not os.path.exists(participant_dir):
            os.makedirs(participant_dir)
        os.chdir(participant_dir)
        # print(participant_dir, 'participant_dir')
        [Fs, x] = aIO.readAudioFile(filename)
        segments = aS.silenceRemoval(x, Fs, 0.020, 0.020,
                                     smoothWindow=smoothing, weight=weight,
                                     plot=plot)
        # print(segments)
        for s in segments:
            # filename = partic_id + s[0] + s[1]
            # seg_name = "%.s_%.2f-%.2f.wav".format(partic_id, s[0], s[1])
            # print(s[0])
            # print(s[1])
            # seg_name = '/' + str(partic_id) + '_' + str(s[0]).replace('.', 'b') + '_' + str(s[1]).replace('.', 'b') + '.wav'
            seg_name = '/' + '_' + str(s[0]).replace('.', 'b') + '_' + \
                str(s[1]).replace('.', 'b') + '.wav'
            # print(seg_name, 'seg_name')
            wavfile.write(participant_dir + seg_name, Fs,
                          x[int(Fs * s[0]):int(Fs * s[1])])
        # concatenate segmented wave files within participant directory
        concatenate_segments(participant_dir, partic_id)
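# Usage sketch (illustrative): the function above parses the participant ID
# from AVEC-style names containing a backslash, e.g. "audio\\301_AUDIO.wav"
# yields partic_id "P301". Paths here are hypothetical.
remove_silence('audio\\301_AUDIO.wav', 'segmented', smoothing=1.0,
               weight=0.3, plot=False)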
with open(speaker_file, 'r') as data:
    speaker_features = ujson.load(data)

for i, dirname in enumerate(os.listdir(datadir)):
    if dirname == ".DS_Store":
        continue
    speaker = dirname
    for filename in os.listdir(datadir + dirname + '/audio_trimmed/pedal/'):
        if filename == ".DS_Store":
            continue
        if "lie" in filename:
            #labels.append((filename, 1))
            labels[filename] = 1
        else:
            labels[filename] = 0
        [Fs, x] = audioBasicIO.readAudioFile(
            datadir + dirname + '/audio_trimmed/pedal/' + filename)
        # we might want to play with the timeframe here - as it is, this is
        # giving us up to ~1.5k frames for our sequences
        speaker_feat = speaker_features[dirname]
        st_features = audioFeatureExtraction.stFeatureExtraction(
            x, Fs, frame_size * Fs, frame_stepsize * Fs)
        num_features, num_windows = st_features.shape
        new_features = np.zeros((num_features, num_windows))
        for i in range(num_features):
            new_features[i] = (st_features[i] - speaker_feat[i]) / \
                speaker_feat[i]
        st_features = np.concatenate((st_features, new_features))
        features[filename] = st_features.tolist()
        total += 1
    print(i)

print(total)
with open('labels_{}_{}.json'.format(frame_size, frame_stepsize),
    return x[15:17]


def getThird(val):
    return val[2]


os.chdir('C:/Users/konst_000/Desktop/Σχολή/6ο Εξάμηνο/ΨΕΣ/Speech Emotion Recognition/Audio Database/Complete')
fileList = os.listdir('C:/Users/konst_000/Desktop/Σχολή/6ο Εξάμηνο/ΨΕΣ/Speech Emotion Recognition/Audio Database/Complete')

# list of lists used to store the extracted features of each training sample
featureList = []
# list of strings used to store the labels (emotions) for each training sample
labelList = []
# list of strings used to store the speaker identity
speakerList = []

for f in fileList:
    label = getEmotionLabel(f)
    if (label != '02' and label != '03' and label != '04' and
            label != '05' and label != '06'):
        continue
    [Fs, sample] = audioBasicIO.readAudioFile(f)
    # feature extraction can be performed only on mono signals
    sample = audioBasicIO.stereo2mono(sample)
    speaker = getSpeakerLabel(f)
    features = emoFeatExtract(sample, Fs, 0.050 * Fs, 0.025 * Fs)
    featureList.append(features)
    labelList.append(label)
    speakerList.append(speaker)

final = []
for i in range(len(featureList)):
    l = [featureList[i]]
    l.append(labelList[i])
    l.append(speakerList[i])
    final.append(l)
# this list of wav files is consistent with labels
# checked with == operator (data_id == files_id)
#files = [os.path.basename(x) for x in glob.glob(os.path.join(data_path + './session?/*/?/', '*.wav'))]
files = glob.glob(os.path.join(data_path + './session?/*/?/', '*.wav'))
files.sort(key=lambda x: x[-30:])

feat_train = []  # commented out in the source, but used below, so restored
feat_test = []
hfs_train = []
hfs_test = []

for f in files:
    if int(ntpath.basename(f)[18]) in range(1, 6):
        print("Process..., ", f)
        [Fs, x] = audioBasicIO.readAudioFile(f)
        F, f_names = audioFeatureExtraction.stFeatureExtraction(
            x, Fs, 0.025 * Fs, 0.010 * Fs)
        mean_train = np.mean(F, axis=1)
        std_train = np.std(F, axis=1)
        feat_hfs_train = np.hstack([mean_train, std_train])
        hfs_train.append(feat_hfs_train)
        feat_train.append(F.transpose())
    elif int(ntpath.basename(f)[18]) == 6:
        print("Process..., ", f)
        [Fs, x] = audioBasicIO.readAudioFile(f)
        F, f_names = audioFeatureExtraction.stFeatureExtraction(
            x, Fs, 0.025 * Fs, 0.010 * Fs)
        mean_test = np.mean(F, axis=1)
        std_test = np.std(F, axis=1)
    # Sanity cleaning to remove empty strings
    files = [f for f in files if f]
    return files


data_set = []
for file in os.listdir("training_dataset/unhappy"):
    temp = []
    mean_value = []
    if file.endswith(".mp3"):
        #print "training_dataset/unhappy/"+file
        sound = AudioSegment.from_mp3("training_dataset/unhappy/" + file)
        sound.export("test.wav", format="wav")
        [Fs, x] = audioBasicIO.readAudioFile("test.wav")
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs,
                                                       0.025 * Fs)
        for i in range(len(F)):
            temp.append(numpy.mean(F[i]))
        mean_value.append(temp)
        mean_value.append(1)
        data_set.append(mean_value)

for file in os.listdir("training_dataset/happy"):
    temp = []
    mean_value = []
    if file.endswith(".mp3"):
        #print "training_dataset/happy/"+file
        sound = AudioSegment.from_mp3("training_dataset/happy/" + file)
        sound.export("test.wav", format="wav")
        [Fs, x] = audioBasicIO.readAudioFile("test.wav")
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs,
                                                       0.025 * Fs)
def speakerDiarization(fileName, sRange=xrange(2, 10), mtSize=2.0, mtStep=0.2,
                       stWin=0.05, LDAdim=35):
    Fs, x = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / Fs

    Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, \
        computeBEAT1 = aT.loadKNNModel(os.path.join(
            '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data',
            'knnSpeakerAll'))
    Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, \
        computeBEAT2 = aT.loadKNNModel(os.path.join(
            '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data',
            'knnSpeakerFemaleMale'))

    MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction(
        x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin),
        round(Fs * stWin * 0.5))
    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2),
         MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
        Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = \
            MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1):, i] = \
            P2 + 0.0001

    MidTermFeatures = MidTermFeatures2
    iFeaturesSelect = range(8, 21) + range(41, 54)
    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]
    MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]
    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / \
        numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    if LDAdim > 0:
        mtWinRatio, mtStepRatio, mtFeaturesToReduce, numOfFeatures, \
            numOfStatistics = int(round(mtSize / stWin)), \
            int(round(stWin / stWin)), list(), len(ShortTermFeatures), 2
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append(list())
        for i in range(numOfFeatures):
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1, N2 = curPos, curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(
                    numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)

        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) +
             len(classNames2), mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
            Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = \
                mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:
                                mtFeaturesToReduce.shape[0] +
                                len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] +
                                len(classNames1):, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        mtFeaturesToReduce, MEAN, STD = \
            aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T

        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    clsAll, silAll, centersAll = list(), list(), list()

    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        clsAll.append(cls)
        centersAll.append(means)
        silA, silB = list(), list()
        for c in range(iSpeakers):
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clusterPerCent < 0.02:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = list()
                for c2 in range(iSpeakers):
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(
                            cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = \
                            MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) *
                                     (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))
        silA, silB, sil = numpy.array(silA), numpy.array(silB), list()
        for c in range(iSpeakers):
            sil.append((silB[c] - silA[c]) /
                       (max(silB[c], silA[c]) + 0.00001))
        silAll.append(numpy.mean(sil))

    imax = numpy.argmax(silAll)
    nSpeakersFinal = sRange[imax]

    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]
    startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls)
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag')
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov
    cls = hmm.predict(MidTermFeaturesNormOr.T)
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)
    sil = silAll[imax]
    classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)]

    return cls, classNames, duration, mtStep, silAll
import operator
import sys  # needed for sys.argv below
import wave
import numpy as np
from pyAudioAnalysis import audioFeatureExtraction as aF
from pyAudioAnalysis import audioTrainTest as aT
from pyAudioAnalysis import audioSegmentation as aS
from pyAudioAnalysis import audioVisualization as aV
from pyAudioAnalysis import audioBasicIO

if __name__ == '__main__':
    # csv and wav file as argument
    csvFileName = sys.argv[1]
    wavFileName = sys.argv[2]
    Fs, x = audioBasicIO.readAudioFile(wavFileName)
    annotations = []
    silence = []
    folderName = None
    fileCounter = 0
    start, end = 0, 0

    # duration of wavFile
    spf = wave.open(wavFileName, 'r')
    # Get wavFile duration
    frames = spf.getnframes()
    rate = spf.getframerate()
    duration = frames / float(rate)
    #duration = int((duration))
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt

[Fs, x] = audioBasicIO.readAudioFile("../audio_data/doremi.wav")
print(Fs)
print(len(x))

# using a frame size of 50 msecs and a frame step of 25 msecs (50% overlap)
F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)
"""
stFeatureExtraction

This function implements the short-term windowing process.
For each short-term window a set of features is extracted.
This results in a sequence of feature vectors, stored in a numpy matrix.

ARGUMENTS
    signal:      the input signal samples
    Fs:          the sampling freq (in Hz)
    Win:         the short-term window size (in samples)
    Step:        the short-term window step (in samples)
RETURNS
    stFeatures:  a numpy array (numOfFeatures x numOfShortTermWindows)
"""
print(len(F))

plt.subplot(2, 1, 1)
plt.plot(F[0, :])
plt.xlabel('Frame no')
plt.ylabel('ZCR')

plt.subplot(2, 1, 2)
plt.plot(F[1, :])
plt.xlabel('Frame no')
plt.ylabel('Energy')

plt.show()
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt

[Fs, x] = audioBasicIO.readAudioFile("data/20170621_16sec.wav")
F = audioFeatureExtraction.stFeatureExtraction(x, float(Fs),
                                               float(0.1 * Fs),
                                               float(0.1 * Fs))
print(Fs)
print(x)

# plt.subplot(2,1,1); plt.plot(F[0,:]); plt.xlabel('Frame no'); plt.ylabel('ZCR');
plt.subplot(2, 1, 1)
plt.plot(F[1, :])
plt.xlabel('Frame no')
plt.ylabel('Energy')
plt.show()
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2,
                       stWin=0.05, LDAdim=35, PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin (opt)      short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT (opt)       0 for not plotting the results, 1 for plotting
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1,
     computeBEAT1] = aT.loadKNNModel(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data", "knnSpeakerAll"))
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2,
     computeBEAT2] = aT.loadKNNModel(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data", "knnSpeakerFemaleMale"))

    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(
        x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin),
        round(Fs * stWin * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2),
         MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = \
            MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = \
            P2 + 0.0001

    MidTermFeatures = MidTermFeatures2    # TODO
    # SELECT FEATURES:
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];  # SET 0A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];  # SET 0B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];  # SET 0C
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
                       53]  # SET 1A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];  # SET 1B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];  # SET 1C
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];  # SET 2A
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];  # SET 2B
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];  # SET 2C
    #iFeaturesSelect = range(100);  # SET 3
    #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN, STD) = \
        aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(MidTermFeatures[1,:])
    #EnergyMean = numpy.mean(MidTermFeatures[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    #print iNonOutLiers

    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / \
        numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        #for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])
        for i in range(numOfFeatures):
            # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(
                    numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) +
             len(classNames2), mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = \
                mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:
                                mtFeaturesToReduce.shape[0] +
                                len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] +
                                len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN, STD) = \
            aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        #MDistancesAll = numpy.mean(DistancesAll)
        #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []

    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []
        silB = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                # get subset of feature vectors
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(
                            cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = \
                            MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) *
                                     (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                # ... and keep the minimum value (i.e. the distance
                # from the "nearest" cluster)
                silB.append(min(silBs))
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) /
                       (max(silB[c], silA[c]) + 0.00001))  # compute silhouette
        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILLOUETTE

    #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum sillouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows: this is achieved by
    # giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(
            MidTermFeaturesNormOr, cls)
        hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0],
                                       "diag")  # hmm training
        hmm.startprob_ = startprob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]  # final sillouette
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gtFile = fileName.replace('.wav', '.segments')  # annotated file
    if os.path.isfile(gtFile):  # if ground-truth exists
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels,
                                           mtStep)  # convert to flags

    if PLOT:
        fig = plt.figure()
        if numOfSpeakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(classNames))))
        ax1.axis((0, Duration, -1, len(classNames)))
        ax1.set_yticklabels(classNames)
        ax1.plot(numpy.array(range(len(cls))) * mtStep + mtStep / 2.0, cls)

    if os.path.isfile(gtFile):
        if PLOT:
            ax1.plot(numpy.array(range(len(flagsGT))) * mtStep +
                     mtStep / 2.0, flagsGT, 'r')
        purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(
            cls, flagsGT)
        print("{0:.1f}\t{1:.1f}".format(100 * purityClusterMean,
                                        100 * puritySpeakerMean))
        if PLOT:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purityClusterMean, 100 * puritySpeakerMean))
    if PLOT:
        plt.xlabel("time (seconds)")
        #print sRange, silAll
        if numOfSpeakers <= 0:
            plt.subplot(212)
            plt.plot(sRange, silAll)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
    return cls
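# Usage sketch (illustrative; "meeting.wav" is a hypothetical recording):
# cluster into 4 speakers; pass numOfSpeakers <= 0 to auto-select the
# speaker count via the silhouette criterion above.
cls = speakerDiarization("meeting.wav", 4, PLOT=False)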
def trainHMM_fromDir(dirPath, hmm_model_name, mt_win, mt_step):
    """
    This function trains a HMM model for segmentation-classification using
    a directory where WAV files and .segment (ground-truth) files are stored.
    ARGUMENTS:
        - dirPath:         the path of the data directory
        - hmm_model_name:  the name of the HMM model to be stored
        - mt_win:          mid-term window size
        - mt_step:         mid-term window step
    RETURNS:
        - hmm:             an object to the resulting HMM
        - class_names:     a list of class_names

    After training, hmm, class_names, along with the mt_win and mt_step
    values are stored in the hmm_model_name file.
    """
    flags_all = np.array([])
    classes_all = []
    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):
        # for each WAV file
        wav_file = f
        gt_file = f.replace('.wav', '.segments')
        if not os.path.isfile(gt_file):
            continue
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags, class_names = segs2flags(seg_start, seg_end, seg_labs, mt_step)
        for c in class_names:
            # update class names:
            if c not in classes_all:
                classes_all.append(c)
        [fs, x] = audioBasicIO.readAudioFile(wav_file)
        [F, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                           round(fs * 0.050),
                                           round(fs * 0.050))

        lenF = F.shape[1]
        lenL = len(flags)
        min_sm = min(lenF, lenL)
        F = F[:, 0:min_sm]
        flags = flags[0:min_sm]

        flagsNew = []
        for j, fl in enumerate(flags):  # append features and labels
            flagsNew.append(classes_all.index(class_names[flags[j]]))

        flags_all = np.append(flags_all, np.array(flagsNew))

        if i == 0:
            f_all = F
        else:
            f_all = np.concatenate((f_all, F), axis=1)

    # compute HMM statistics
    start_prob, transmat, means, cov = trainHMM_computeStatistics(f_all,
                                                                  flags_all)
    # train the HMM
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmm_model_name, "wb")  # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classes_all, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classes_all
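# Usage sketch (illustrative; "hmm_train" is a hypothetical folder of paired
# .wav/.segments files), mirroring the camelCase variant earlier:
hmm, class_names = trainHMM_fromDir("hmm_train", "hmm_model", 1.0, 1.0)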
for emotion in sorted(glob.glob('train_wavdata/*')):
    #print (spct/float(total_sp))*100.0, '% completed'
    emotion_name = emotion.replace('train_wavdata/', '')
    #print emotion_name
    emotions.update({emotion_name: spct})
    all_emotion_Fs, all_emotion_data = 0, []
    for sample_file in glob.glob(emotion + '/*.wav'):
        [Fs, x] = audioBasicIO.readAudioFile(sample_file)
        if all_emotion_Fs == 0:
            all_emotion_Fs = Fs
        if Fs == all_emotion_Fs:
            features = extract_MFCCs(x, Fs, window * Fs,
                                     window_overlap * Fs,
                                     voiced_threshold_mul,
                                     voiced_threshold_range, calc_deltas)
            all_emotion_data.append(features)
        else:
            print(sample_file + " skipped due to mismatch in frame rate")
    all_emotion_data = np.concatenate(all_emotion_data, 0)
def mtFileClassification(input_file, model_name, model_type,
                         plot_results=False, gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:    path of the input WAV file
        - model_name:    name of the classification model
        - model_type:    svm or knn depending on the classifier type
        - plot_results:  True if results are to be plotted using
                         matplotlib along with a set of statistics
    RETURNS:
        - segs:     a sequence of segment's endpoints: segs[i] is the
                    endpoint of the i-th segment (in seconds)
        - classes:  a sequence of class flags: class[i] is the
                    class ID of the i-th segment
    """
    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
         compute_beat] = aT.load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
         compute_beat] = aT.load_model(model_name)
    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in segmentation")
        return (-1, -1, -1, -1)
    [fs, x] = audioBasicIO.readAudioFile(input_file)  # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono
    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs,
                                              mt_step * fs,
                                              round(fs * st_win),
                                              round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(mt_feats.shape[1]):
        cur_fv = (mt_feats[:, i] - MEAN) / STD  # normalize current feature v
        # classify vector:
        [res, P] = aT.classifierWrapper(classifier, model_type, cur_fv)
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(np.max(P))  # update probability matrix
    flags_ind = np.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(
                    class_names.index(class_names_gt[flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = np.array(flags_ind_gt)
        cm = np.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        cm = []
        flags_ind_gt = np.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt,
                                  class_names, mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
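# Usage sketch (illustrative; file and model names are hypothetical):
flags, classes, acc, cm = mtFileClassification("radio.wav", "svmSM", "svm",
                                               plot_results=False,
                                               gt_file="radio.segments")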
def dirWavFeatureExtraction(dirName, mt_win, mt_step, st_win, st_step, compute_beat=False): """ This function extracts the mid-term features of the WAVE files of a particular folder. The resulting feature vector is extracted by long-term averaging the mid-term features. Therefore ONE FEATURE VECTOR is extracted for each WAV file. ARGUMENTS: - dirName: the path of the WAVE directory - mt_win, mt_step: mid-term window and step (in seconds) - st_win, st_step: short-term window and step (in seconds) """ all_mt_feats = numpy.array([]) process_times = [] types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au', '*.ogg') wav_file_list = [] for files in types: wav_file_list.extend(glob.glob(os.path.join(dirName, files))) wav_file_list = sorted(wav_file_list) wav_file_list2, mt_feature_names = [], [] for i, wavFile in enumerate(wav_file_list): print("Analyzing file {0:d} of " "{1:d}: {2:s}".format(i+1, len(wav_file_list), wavFile)) if os.stat(wavFile).st_size == 0: print(" (EMPTY FILE -- SKIPPING)") continue [fs, x] = audioBasicIO.readAudioFile(wavFile) if isinstance(x, int): continue t1 = time.clock() x = audioBasicIO.stereo2mono(x) if x.shape[0]<float(fs)/5: print(" (AUDIO FILE TOO SMALL - SKIPPING)") continue wav_file_list2.append(wavFile) if compute_beat: [mt_term_feats, st_features, mt_feature_names] = \ mtFeatureExtraction(x, fs, round(mt_win * fs), round(mt_step * fs), round(fs * st_win), round(fs * st_step)) [beat, beat_conf] = beatExtraction(st_features, st_step) else: [mt_term_feats, _, mt_feature_names] = \ mtFeatureExtraction(x, fs, round(mt_win * fs), round(mt_step * fs), round(fs * st_win), round(fs * st_step)) mt_term_feats = numpy.transpose(mt_term_feats) mt_term_feats = mt_term_feats.mean(axis=0) # long term averaging of mid-term statistics if (not numpy.isnan(mt_term_feats).any()) and \ (not numpy.isinf(mt_term_feats).any()): if compute_beat: mt_term_feats = numpy.append(mt_term_feats, beat) mt_term_feats = numpy.append(mt_term_feats, beat_conf) if len(all_mt_feats) == 0: # append feature vector all_mt_feats = mt_term_feats else: all_mt_feats = numpy.vstack((all_mt_feats, mt_term_feats)) t2 = time.clock() duration = float(len(x)) / fs process_times.append((t2 - t1) / duration) if len(process_times) > 0: print("Feature extraction complexity ratio: " "{0:.1f} x realtime".format((1.0 / numpy.mean(numpy.array(process_times))))) return (all_mt_feats, wav_file_list2, mt_feature_names)
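# Hedged usage sketch for dirWavFeatureExtraction: one long-term-averaged
# feature vector per file in a folder (the directory name and window sizes
# here are placeholders):
feats, file_list, feat_names = dirWavFeatureExtraction(
    "data/music", mt_win=1.0, mt_step=1.0, st_win=0.050, st_step=0.050)
print(feats.shape)   # (number of files kept, number of mid-term features)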
def speakerDiarization(filename, n_speakers, mt_size=2.0, mt_step=0.2,
                       st_win=0.05, lda_dim=35, plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mt_size (opt)    mid-term window size
        - mt_step (opt)    mid-term window step
        - st_win  (opt)    short-term window size
        - lda_dim (opt)    LDA dimension (0 for no LDA)
        - plot_res (opt)   1 for plotting the results, 0 otherwise
    """
    [fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / fs

    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
     stStep1, computeBEAT1] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data", "knnSpeakerAll"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
     stStep2, computeBEAT2] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data", "knnSpeakerFemaleMale"))

    [mt_feats, st_feats, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
                                                     mt_step * fs,
                                                     round(fs * st_win),
                                                     round(fs * st_win * 0.5))

    MidTermFeatures2 = np.zeros((mt_feats.shape[0] + len(classNames1) +
                                 len(classNames2), mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = \
            P2 + 0.0001

    mt_feats = MidTermFeatures2    # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                       41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = np.min(mt_feats[1,:])
    #EnergyMean = np.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    #print i_non_outliers

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
        # st_win * fs, round(fs*st_win), round(fs*st_win));
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        #for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for i in range(num_of_features):
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(np.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].append(
                    np.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)

        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(classNames1) + len(classNames2),
                                      mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], i] = \
                mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:
                              mt_feats_to_red.shape[0] + len(classNames1),
                              i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::,
                              i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += np.random.rand(mt_feats_to_red.shape[0],
        #                                  mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN, STD) = \
            aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = np.mean(dist_all)
        #iNonOutLiers2 = np.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = np.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_
        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                            float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(np.mean(Yt) *
                                     (clust_per_cent + clust_per_cent_2) / 2.0)
                silBs = np.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        # for each cluster (speaker) compute the silhouette:
        for c in range(iSpeakers):
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE:
        sil_all.append(np.mean(sil))

    imax = np.argmax(sil_all)       # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for i in range(n_wins):
        j = np.argmin(np.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground truth exists:
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mt_step + mt_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) * mt_step +
                     mt_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls
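# Hedged usage sketch for speakerDiarization (the WAV path is a placeholder;
# n_speakers <= 0 lets the silhouette criterion pick the speaker count from
# the 2..9 range tried above):
cls = speakerDiarization("meeting.wav", n_speakers=0, plot_res=False)
print(cls)   # one cluster (speaker) label per mt_step = 0.2 s window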
from pyAudioAnalysis import audioBasicIO from pyAudioAnalysis import audioFeatureExtraction import matplotlib.pyplot as plt [Fs, x] = audioBasicIO.readAudioFile("in_Data/rattle.wav") F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs); plt.subplot(2,1,1); plt.plot(F[0,:]); plt.xlabel('Frame no'); plt.ylabel('ZCR'); plt.subplot(2,1,2); plt.plot(F[1,:]); plt.xlabel('Frame no'); plt.ylabel('Energy'); plt.show()
model_file_path = 'Models/neural_net_model.model'
model_weights_path = 'Models/neural_net_model.weights'
model = keras.models.load_model(model_file_path)

out_audio_for_test_path = 'output_audio_for_testing'
dir_to_test = os.path.join(os.path.dirname(os.getcwd()),
                           out_audio_for_test_path)
file_list = os.listdir(dir_to_test)

feat_list = []
for file in file_list:
    file_path = os.path.join(dir_to_test, file)
    [Fs, x] = audioBasicIO.readAudioFile(file_path)
    F, f_names = audioFeatureExtraction.stFeatureExtraction(
        x, Fs, 0.200 * Fs, 0.150 * Fs)
    feat_list.append(F)

# Flatten each (n_features, n_frames) matrix into a single (646,) vector so
# it can be fed to a Dense-input neural network:
audio_feature_set = []
for item in feat_list:
    flat = []
    for feature in item:
        for frame in feature:
            flat.append(frame)
    audio_feature_set.append(flat)

feat_list = np.array(feat_list)
audio_feature_set = np.array(audio_feature_set)
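# The block above only prepares the feature matrix; a hedged sketch of the
# missing inference step (it assumes the Dense model outputs one softmax row
# per file, which is not stated in the original):
predictions = model.predict(audio_feature_set)   # shape (n_files, n_classes)
for fname, p in zip(file_list, predictions):
    print(fname, int(np.argmax(p)))              # predicted class index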
emotion_names = {all_emotions[k].replace('../Ravdess_Dataset/test_wavdata/', ''): k
                 for k in range(len(all_emotions))}
total_emotions = len(num_test_cases)
confusion_matrix = np.zeros((total_emotions, total_emotions))
for emotion in all_emotions:
    emotion_name = emotion.replace('../Ravdess_Dataset/test_wavdata/', '')
    # speaker_name = speaker.replace(emotion+'/','')
    for testcasefile in glob.glob(emotion + '/*.wav'):
        [Fs, x] = audioBasicIO.readAudioFile(testcasefile)
        mfcc_features = extract_MFCCs(x, Fs, window * Fs,
                                      window_overlap * Fs, calc_deltas)
        actual_file_name = testcasefile.replace(emotion + "/", '')
        prosody_features = extract_prosody(actual_file_name, emotion_name)
        lpcc_features = extract_lpcc(actual_file_name, emotion_name)
        # the three feature streams must have the same number of frames;
        # if not, truncate all of them to the shortest one:
        if mfcc_features.shape[0] == prosody_features.shape[0] and \
                prosody_features.shape[0] == lpcc_features.shape[0]:
            pass
        else:
            min_shape = min([mfcc_features.shape[0],
                             prosody_features.shape[0],
                             lpcc_features.shape[0]])
            if mfcc_features.shape[0] != min_shape:
                mfcc_features = mfcc_features[0:min_shape]
            if prosody_features.shape[0] != min_shape:
                prosody_features = prosody_features[0:min_shape]
            if lpcc_features.shape[0] != min_shape:
                lpcc_features = lpcc_features[0:min_shape]
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue May 22 11:13:09 2018

@author: bara
"""
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt
import numpy as np

[fs, x_good] = audioBasicIO.readAudioFile("samples/good/5.wav")
x_good = x_good / (2.**15)  # scale 16-bit samples to [-1, 1]
times = np.arange(len(x_good)) / float(fs)

plt.subplot(2, 1, 1)
plt.plot(times, x_good)
plt.xlabel('Time (s)')
plt.ylabel('Amplitude')
plt.show()
def getfileintoframe(file, itt, file1, file2, file3):
    # 'opens' in the original is taken to mean wave.open, since the handle
    # is used with getnframes()/readframes() below:
    waveFile = wave.open(file, 'rb')
    [Fs, x] = audioBasicIO.readAudioFile(file)
    length = waveFile.getnframes()
    # Read the first itt samples into the frames array
    samples = []
    sample1 = []
    start = 0
    prev = ""
    for i in range(start, itt):
        waveData = waveFile.readframes(1)
        data = struct.unpack("%ih" % 1, waveData)
        sample1.append(int(data[0]))
        start = start + 1
    samples = np.array(sample1)
    signal = np.double(samples)
    signal = signal / (2.0 ** 15)
    DC = signal.mean()
    MAX = (np.abs(signal)).max()
    signal = (signal - DC) / MAX

    N = len(signal)  # total number of samples
    curPos = 0
    countFrames = 0
    nFFT = int(1500 / 2)
    X = abs(fft(signal))  # get fft magnitude
    X = X[0:nFFT]
    X = X / len(X)  # normalize fft
    prev = X
    itt = itt + itt
    count = 0
    flag_check = True
    while True:
        for i in range(start, itt):
            waveData = waveFile.readframes(1)
            try:
                data = struct.unpack("%ih" % 1, waveData)
            except struct.error:
                pass  # past end of file: reuse the last successfully read value
            sample1.append(int(data[0]))
            start = start + 1
        samples = np.array(sample1)
        signal = np.double(samples)
        signal = signal / (2.0 ** 15)
        DC = signal.mean()
        MAX = (np.abs(signal)).max()
        signal = (signal - DC) / MAX
        N = len(signal)  # total number of samples
        curPos = 0
        countFrames = 0
        nFFT = int(1500 / 2)
        X = abs(fft(signal))  # get fft magnitude
        X = X[0:nFFT]
        X = X / len(X)  # normalize fft
        itt = itt + itt

        # write short-term features of the current chunk to file1:
        file1.write("Energy ")
        file1.write(str(stEnergy(X)))
        file1.write("\n")
        file1.write("entropy ")
        file1.write(str(stEnergyEntropy(X)))
        file1.write("\n")
        file1.write("flux ")
        file1.write(str(stSpectralFlux(X, prev)))
        file1.write("\n")
        file1.write("spectral_roll_off ")
        file1.write(str(stSpectralRollOff(X, stEnergy(X), Fs)))
        file1.write("\n")

        # write chroma information to file2:
        [nChroma, nFreqsPerChroma] = stChromaFeaturesInit(1500, Fs)
        [nChroma1, nFreqsPerChroma1] = stChromaFeatures(X, Fs, nChroma,
                                                        nFreqsPerChroma)
        if flag_check:
            # write the chroma names once, on the first iteration only:
            for each in nChroma1:
                file2.write(each + " ")
            flag_check = False
        file2.write("\n")
        for each in nFreqsPerChroma1:
            for each1 in each:
                str1 = str(each1).replace('[', ' ')
                str1 = str1.replace(']', ' ')
                str1 = str1.split(' ')
                for each2 in str1:
                    if each2 != " ":
                        try:
                            value = float(each2)
                            file2.write(str(value) + " ")
                        except ValueError:
                            pass
        file2.write("\n")

        count = count + 1
        if count >= 10:
            break
        prev = X

    # dump the non-zero MFCC filter-bank values to file3:
    for each in mfccInitFilterBanks(Fs, 1500):
        for each1 in each:
            try:
                for each2 in each1:
                    if float(each2) != 0:
                        file3.write(str(each2) + " ")
                file3.write("\n")
            except TypeError:   # each1 is a scalar, not a row
                if float(each1) != 0:
                    file3.write(str(each1) + " ")
                file3.write("\n")
def main(argv):
    if argv[2] == 'full':
        dirName = argv[1]
        types = ('*.wav', )
        filesList = []
        for files in types:
            filesList.extend(glob.glob(os.path.join(dirName, files)))
        filesList = sorted(filesList)
        stWin = 0.020
        stStep = 0.015
        for f in filesList:
            [Fs, x] = audioBasicIO.readAudioFile(f)
            x = audioBasicIO.stereo2mono(x)
            createSpectrogramFile(x, Fs, f.replace(".wav", ".png"),
                                  stWin, stStep)
    else:
        dirName = argv[1]
        dirNameIrrelevant = argv[2]
        types = ('*.wav', )
        filesList = []
        for files in types:
            filesList.extend(glob.glob(os.path.join(dirName, files)))
        filesList = sorted(filesList)

        filesListIrr = []
        for files in types:
            filesListIrr.extend(
                glob.glob(os.path.join(dirNameIrrelevant, files)))
        filesListIrr = sorted(filesListIrr)
        print filesListIrr

        WIDTH_SEC = 1.5
        stWin = 0.040
        stStep = 0.005
        WIDTH = WIDTH_SEC / stStep

        for f in filesList:
            print f
            [Fs, x] = audioBasicIO.readAudioFile(f)
            x = audioBasicIO.stereo2mono(x)
            x = x.astype(float) / x.max()
            for i in range(3):
                if x.shape[0] > WIDTH_SEC * Fs + 200:
                    randStartSignal = random.randrange(
                        0, int(x.shape[0] - WIDTH_SEC * Fs - 200))
                    x2 = x[randStartSignal:randStartSignal +
                           int((WIDTH_SEC + stStep) * Fs)]
                    createSpectrogramFile(x2, Fs, f.replace(".wav", ".png"),
                                          stWin, stStep)    # ORIGINAL

                    if len(dirNameIrrelevant) > 0:
                        # AUGMENTED: mix the clip with a random chunk of a
                        # random "irrelevant" (noise) file at six different
                        # signal-to-noise weights R; the R values and output
                        # suffixes 1..6 match the original unrolled code:
                        randIrrelevant = random.randrange(
                            0, len(filesListIrr))
                        [Fs, xnoise] = audioBasicIO.readAudioFile(
                            filesListIrr[randIrrelevant])
                        xnoise = xnoise.astype(float) / xnoise.max()
                        for j, R in enumerate([5, 4, 3, 6, 2, 1]):
                            randStartNoise = random.randrange(
                                0, xnoise.shape[0] - WIDTH_SEC * Fs - 200)
                            xN = (R * x2.astype(float) +
                                  xnoise[randStartNoise:randStartNoise +
                                         x2.shape[0]].astype(float)) / \
                                float(R + 1)
                            wavfile.write(
                                f.replace(".wav",
                                          "_rnoise{0:d}{1:d}.wav".format(
                                              i, j + 1)),
                                Fs, (16000 * xN).astype('int16'))
                            # createSpectrogramFile(xN, Fs, ...) was left
                            # commented out in the original
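# For reference, a hedged sketch of how this script is invoked (the script
# filename is an assumption; argv comes from sys.argv):
#   python make_spectrograms.py wavs/ full     -> spectrograms only
#   python make_spectrograms.py wavs/ noise/   -> spectrograms plus
#                                                 noise-augmented WAV copies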
def main(argv): if argv[1] == "-shortTerm": for i in range(nExp): [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav"); duration = x.shape[0] / float(Fs) t1 = time.clock() F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs); t2 = time.clock() perTime1 = duration / (t2-t1); print "short-term feature extraction: {0:.1f} x realtime".format(perTime1) elif argv[1] == "-classifyFile": for i in range(nExp): [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav"); duration = x.shape[0] / float(Fs) t1 = time.clock() aT.fileClassification("diarizationExample.wav", "svmSM","svm") t2 = time.clock() perTime1 = duration / (t2-t1); print "Mid-term feature extraction + classification \t {0:.1f} x realtime".format(perTime1) elif argv[1] == "-mtClassify": for i in range(nExp): [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav"); duration = x.shape[0] / float(Fs) t1 = time.clock() [flagsInd, classesAll, acc] = aS.mtFileClassification("diarizationExample.wav", "svmSM", "svm", False, '') t2 = time.clock() perTime1 = duration / (t2-t1); print "Fix-sized classification - segmentation \t {0:.1f} x realtime".format(perTime1) elif argv[1] == "-hmmSegmentation": for i in range(nExp): [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav"); duration = x.shape[0] / float(Fs) t1 = time.clock() aS.hmmSegmentation('diarizationExample.wav', 'hmmRadioSM', False, '') t2 = time.clock() perTime1 = duration / (t2-t1); print "HMM-based classification - segmentation \t {0:.1f} x realtime".format(perTime1) elif argv[1] == "-silenceRemoval": for i in range(nExp): [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav"); duration = x.shape[0] / float(Fs) t1 = time.clock() [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav"); segments = aS.silenceRemoval(x, Fs, 0.050, 0.050, smoothWindow = 1.0, Weight = 0.3, plot = False) t2 = time.clock() perTime1 = duration / (t2-t1); print "Silence removal \t {0:.1f} x realtime".format(perTime1) elif argv[1] == "-thumbnailing": for i in range(nExp): [Fs1, x1] = audioBasicIO.readAudioFile("scottish.wav") duration1 = x1.shape[0] / float(Fs1) t1 = time.clock() [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x1, Fs1, 1.0, 1.0, 15.0) # find thumbnail endpoints t2 = time.clock() perTime1 = duration1 / (t2-t1); print "Thumbnail \t {0:.1f} x realtime".format(perTime1) elif argv[1] == "-diarization-noLDA": for i in range(nExp): [Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav") duration1 = x1.shape[0] / float(Fs1) t1 = time.clock() aS.speakerDiarization("diarizationExample.wav", 4, LDAdim = 0, PLOT = False) t2 = time.clock() perTime1 = duration1 / (t2-t1); print "Diarization \t {0:.1f} x realtime".format(perTime1) elif argv[1] == "-diarization-LDA": for i in range(nExp): [Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav") duration1 = x1.shape[0] / float(Fs1) t1 = time.clock() aS.speakerDiarization("diarizationExample.wav", 4, PLOT = False) t2 = time.clock() perTime1 = duration1 / (t2-t1); print "Diarization \t {0:.1f} x realtime".format(perTime1)
def train_classifier(): data_set = [] for file in os.listdir("training_dataset/unhappy"): temp = [] mean_value = [] if file.endswith(".mp3"): # print "training_dataset/unhappy/"+file sound = AudioSegment.from_mp3("training_dataset/unhappy/" + file) sound.export("test.wav", format="wav") [Fs, x] = audioBasicIO.readAudioFile("test.wav") F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs) for i in range(len(F)): temp.append(numpy.mean(F[i])) mean_value.append(temp) mean_value.append(1) data_set.append(mean_value) for file in os.listdir("training_dataset/happy"): temp = [] mean_value = [] if file.endswith(".mp3"): # print "training_dataset/happy/"+file sound = AudioSegment.from_mp3("training_dataset/happy/" + file) sound.export("test.wav", format="wav") [Fs, x] = audioBasicIO.readAudioFile("test.wav") F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs) for i in range(len(F)): temp.append(numpy.mean(F[i])) mean_value.append(temp) mean_value.append(2) data_set.append(mean_value) for file in os.listdir("training_dataset/angry"): temp = [] mean_value = [] if file.endswith(".mp3"): # print "training_dataset/angry/"+file sound = AudioSegment.from_mp3("training_dataset/angry/" + file) sound.export("test.wav", format="wav") [Fs, x] = audioBasicIO.readAudioFile("test.wav") F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs) for i in range(len(F)): temp.append(numpy.mean(F[i])) mean_value.append(temp) mean_value.append(3) data_set.append(mean_value) for file in os.listdir("training_dataset/neutral"): temp = [] mean_value = [] if file.endswith(".mp3"): # print "training_dataset/neutral/"+file sound = AudioSegment.from_mp3("training_dataset/neutral/" + file) sound.export("test.wav", format="wav") [Fs, x] = audioBasicIO.readAudioFile("test.wav") F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs) for i in range(len(F)): temp.append(numpy.mean(F[i])) mean_value.append(temp) mean_value.append(4) data_set.append(mean_value) x = [] y = [] for i in range(len(data_set)): x.append(data_set[i][0]) y.append(data_set[i][1]) clf = RandomForestClassifier(n_estimators=30, max_features=6, max_depth=None, min_samples_split=1, bootstrap=True) clf = clf.fit(x, y) f2 = open("classifier.pickle", "wb") pickle.dump(clf, f2) f2.close()
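# A hedged sketch of consuming the classifier.pickle written above; the
# 1..4 label mapping follows the folder order in train_classifier, and
# `feature_vector` is an assumed name for the 34 per-feature means of a new
# clip, computed exactly as inside train_classifier:
import pickle

with open("classifier.pickle", "rb") as fh:
    clf = pickle.load(fh)
labels = {1: "unhappy", 2: "happy", 3: "angry", 4: "neutral"}
print(labels[clf.predict([feature_vector])[0]])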
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt

[Fs, x] = audioBasicIO.readAudioFile("happy.wav")
# Fs is the sampling frequency (Hz)
# x is the array of signal samples
F, f_names = audioFeatureExtraction.stFeatureExtraction(x, Fs,
                                                        0.050*Fs, 0.025*Fs)
# stFeatureExtraction(signal, fs, win, step):
#   signal: the input signal samples
#   fs:     the sampling freq (in Hz)
#   win:    the short-term window size (in samples)
#   step:   the short-term window step (in samples)
'''
here, window size = 0.05*Fs = 0.05*16000 = 800
      step size   = 0.025*Fs = 0.025*16000 = 400
for a signal of length 23776 the number of frames is
      floor((23776 - 800) / 400) + 1 = 57 + 1 = 58
so F.shape = (34, 58)
'''
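# Frame-count sanity check for the window/step arithmetic above (the signal
# length 23776 and Fs = 16000 are taken from the comment):
Fs, n_samples = 16000, 23776
win, step = int(0.050 * Fs), int(0.025 * Fs)   # 800, 400
n_frames = (n_samples - win) // step + 1
print(n_frames)                                # -> 58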
def main(path): ds = Dataset(path) loader = Loader(path + "/train/", 32, 16) X = [] y = [] Z = [] ii = 0 for p in ds.trainTracks(): f = p.split("/") name = f[len(f) - 1] labelTeller = loader.loadLabelsForSoundfile(name) [Fs, x] = audioBasicIO.readAudioFile(p) F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.032 * Fs, 0.016 * Fs) G = zip(*F) N = 0 if len(G) > labelTeller.tellNoOfAllBlocks(): N = labelTeller.tellNoOfAllBlocks() else: N = len(G) for i in range(N): Z.append([G[i], labelTeller.tell(i)]) # i = 0 # for w in ds.windows(x,44100, 1410, 705): # mf = mfcc(w) # row = [i] # Z.append([mf[0],labelTeller.tell(i)]) # i = i+1 print p + " " + str(ii) + "/61" ii = ii + 1 print "shuffle" random.shuffle(Z) Z = zip(*Z) NN = 20000 L = NN R = NN FINAL = [[], []] for i in range(len(Z[0])): if Z[1][i] == "sing" and L > 0: L = L - 1 FINAL[0].append(Z[0][i]) FINAL[1].append(Z[1][i]) if Z[1][i] == "nosing" and R > 0: R = R - 1 FINAL[0].append(Z[0][i]) FINAL[1].append(Z[1][i]) clf = svm.SVC(cache_size=2000) print "######### " + str(len(Z[0])) clf.fit(FINAL[0], FINAL[1]) loader = Loader(path + "/test/", 32, 16) print "Loading test" for p in ds.validationTracks(): X = [] y = [] f = p.split("/") name = f[len(f) - 1] labelTeller = loader.loadLabelsForSoundfile(name) i = 0 [Fs, x] = audioBasicIO.readAudioFile(p) F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.032 * Fs, 0.016 * Fs) G = zip(*F) N = 0 if len(G) > labelTeller.tellNoOfAllBlocks(): N = labelTeller.tellNoOfAllBlocks() else: N = len(G) for i in range(N): X.append(G[i]) y.append(labelTeller.tell(i)) print "Starting prediction " + p Y = clf.predict(X) ok = 0 al = 0 for i in range(len(y)): if y[i] == Y[i]: ok = ok + 1 al = al + 1 print ok / float(al)
def fileClassification(inputFile, modelName, modelType): # Load classifier: if not os.path.isfile(modelName): print("fileClassification: input modelName not found!") return (-1, -1, -1) if not os.path.isfile(inputFile): print("fileClassification: wav file not found!") return (-1, -1, -1) if (modelType) == 'svm' or (modelType == 'svm_rbf'): [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = loadSVModel(modelName) elif modelType == 'knn': [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = loadKNNModel(modelName) elif modelType == 'randomforest': [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = loadRandomForestModel(modelName) elif modelType == 'gradientboosting': [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = loadGradientBoostingModel(modelName) elif modelType == 'extratrees': [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = loadExtraTreesModel(modelName) # read audio file and convert to mono [Fs, x] = audioBasicIO.readAudioFile(inputFile) x = audioBasicIO.stereo2mono(x) if isinstance(x, int): # audio file IO problem return (-1, -1, -1) if x.shape[0] / float(Fs) <= mtWin: return (-1, -1, -1) # feature extraction: [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep)) # long term averaging of mid-term statistics MidTermFeatures = MidTermFeatures.mean(axis=1) if computeBEAT: [beat, beatConf] = aF.beatExtraction(s, stStep) MidTermFeatures = numpy.append(MidTermFeatures, beat) MidTermFeatures = numpy.append(MidTermFeatures, beatConf) curFV = (MidTermFeatures - MEAN) / STD # normalization [Result, P] = classifierWrapper(Classifier, modelType, curFV) # classification return Result, P, classNames
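# Hedged usage sketch for fileClassification (file and model names are
# placeholders for any model trained with the matching train function):
Result, P, classNames = fileClassification("example.wav", "svmSM", "svm")
if not isinstance(classNames, int):   # (-1, -1, -1) signals an IO problem
    print(classNames[int(Result)], numpy.max(P))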
def Audio_Feature_Extraction_Extract(self): if self.Audio_Feature_Extraction_SingleFileWindowTerm.currentText( ) == "ShortTerm": [Fs, x] = audioBasicIO.readAudioFile( self.Audio_Feature_Extraction_SingleFilePath.text()) stFeatures = audioFeatureExtraction.stFeatureExtraction( x, Fs, float( self.Audio_Feature_Extraction_SingleFileWindowSize.text()) * Fs, float(self.Audio_Feature_Extraction_SingleFileStepSize.text()) * Fs) options = QFileDialog.Options() options |= QFileDialog.DontUseNativeDialog fileName, _ = QFileDialog.getSaveFileName(self.somethingToPass, "Where to save?", "", "CSV files (*.csv)", options=options) numpy.savetxt(fileName + ".csv", stFeatures, delimiter=',') QMessageBox.about(self.somethingToPass, "Files Created", "File has been saved as CSV file") ## Let the User specify how many features and what features to see eventually. #============================================================================== # if self.Audio_Feature_Extraction_SingleFileDataVisualisation.isChecked(): # stFeatures = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs) # # labels = ["Zero Crossing Rate", "Energy", "Entropy of Energy", "Spectral Centroid", # "Spectral Spread", "Spectral Entropy", "Spectral Flux", "Spectral Rolloff", # "MFCC 1", "MFCC 2", "MFCC 3", "MFCC 4", # "MFCC 5", "MFCC 6", "MFCC 7", "MFCC 8", # "MFCC 9", "MFCC 10", "MFCC 11", "MFCC 12", "MFCC 13", # "Chroma Vector 1", "Chroma Vector 2", "Chroma Vector 3", "Chroma Vector 4", # "Chroma Vector 5", "Chroma Vector 6", "Chroma Vector 7", "Chroma Vector 8", # "Chroma Vector 9", "Chroma Vector 10", "Chroma Vector 11","Chroma Vector 12", "Chroma Deviation"] # # # for x in range(0, len(labels)-1): # plt.subplot(34,1,x+1); plt.plot(stFeatures[x,:]); plt.xlabel('Frame no'); plt.ylabel(labels[x]) # # plt.show() # #============================================================================== else: options = QFileDialog.Options() options |= QFileDialog.DontUseNativeDialog fileName, _ = QFileDialog.getSaveFileName(self.somethingToPass, "Where to save?", "", "CSV files (*.csv)", options=options) audioFeatureExtraction.mtFeatureExtractionToFile( self.Audio_Feature_Extraction_SingleFilePath.text(), float( self.Audio_Feature_Extraction_SingleFilemidTermWindowSize. text()), float(self. Audio_Feature_Extraction_SingleFilemidTermWindowStepSize. text()), float( self.Audio_Feature_Extraction_SingleFileWindowSize.text()), float(self.Audio_Feature_Extraction_SingleFileStepSize.text()), fileName, storeStFeatures=True, storeToCSV=True, PLOT=False) QMessageBox.about( self.somethingToPass, "Files Created", "Files have been saved as CSV files and .npy files")
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import numpy as np
import math
import matplotlib.pyplot as plt

# main process
[Fs, x] = audioBasicIO.readAudioFile("data/diarizationExample.wav")

TIME_OF_WINDOW = 0.050  # a window = 0.05 s
TIME_OF_STEP = 0.025    # step = 0.025 s
SIZE_OF_WINDOW = int(TIME_OF_WINDOW * Fs)  # number of samples per window
SIZE_OF_STEP = int(TIME_OF_STEP * Fs)      # number of samples per step
BLOCK_SIZE = 4   # a block spans (BLOCK_SIZE * SIZE_OF_STEP) samples
BLOCK_STEP = 2

# variables
END_OF_FILE = 0
FIRST_PAIR = 1
INDEX_BOUCLE = 1


def getMFCCs(block_start, block_end):
    return attribute[8:20, block_start:block_end+1]


def getMFCCsFromTime(moment_start, moment_end):
    block_start = int(moment_start / BLOCK_STEP / TIME_OF_STEP - 1)
    block_end = int(moment_end / BLOCK_STEP / TIME_OF_STEP - 1)
    return getMFCCs(block_start, block_end)


def gauss(x, mean, cov):
    # Multivariate Gaussian density evaluated row-wise on x (n x d).
    # The original snippet was truncated after the shape unpacking; the
    # body below is the standard density formula, filled in as the obvious
    # completion.
    [n, d] = x.shape
    x_c = x - mean
    inv_cov = np.linalg.inv(cov)
    norm = 1.0 / (math.pow(2 * math.pi, d / 2.0) *
                  math.sqrt(np.linalg.det(cov)))
    return norm * np.exp(-0.5 * np.sum(x_c.dot(inv_cov) * x_c, axis=1))
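# Quick hedged sanity check for gauss() on synthetic points (the values are
# arbitrary): densities of 5 random 2-D points under a standard N(0, I).
pts = np.random.randn(5, 2)
print(gauss(pts, np.zeros(2), np.eye(2)))   # 5 positive density values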
def save(csv_, wav): #csv and wav file as argument print csv_, wav csvFileName = csv_ wavFileName = wav Fs, x = audioBasicIO.readAudioFile(wavFileName) annotations = [] silence = [] folderName = None fileCounter = 0 start, end = 0, 0 #duration of wavFile spf = wave.open(wavFileName, 'r') #Get wavFile duration frames = spf.getnframes() rate = spf.getframerate() duration = frames / float(rate) #duration = int((duration)) csvFile = open(csvFileName, 'rb') # >> Empty csv file is 1 Byte -> check for empty if os.path.getsize(csvFileName) > 1: read = csv.reader(csvFile) #startTimeToPlay, endTimeToPlay -> str2float for row in read: row[0] = round(float(row[0]) / 1000, 2) row[1] = round(float(row[1]) / 1000, 2) #print row[0], row[1] if row[2][:8] == "Speech::": folderName = row[2][:6] else: folderName = row[2] annotations.append([row[0], row[1], folderName]) #check if the directory exists and create it if necessary if not os.path.exists(folderName): os.makedirs(folderName) #sort annotations alphabetically based on class name annotations = sorted(annotations, key=operator.itemgetter(2), reverse=False) # >> Save audio segments in folders, based on annotation class for i, an in enumerate(annotations): #find file ID for existing files in directory to continue writing.. directory = os.listdir(an[2]) #check for empty directory if directory: index = directory[0].index('_') fileCounter = 0 for i in range(len(directory)): if directory[i][index + 1] > fileCounter: fileCounter = directory[i][index + 1] fileCounter = int(fileCounter) + 1 else: fileCounter = 0 strOut = an[2] + "/{1:s}_{2:d}.wav".format( wavFileName.replace(".wav", ""), an[2], fileCounter) fileCounter = fileCounter + 1 #print strOut, int(Fs * an[0]), int(Fs * an[1]) folderName = an[2] wavfile.write(strOut, Fs, x[int(Fs * an[0]):int(Fs * an[1])]) # >> Find silence in audio file #sort annotations by start time annotations = sorted(annotations, key=operator.itemgetter(0), reverse=False) time = np.arange(0, duration, 0.01) #Get silence before-between-after annotations for i in range(len(annotations)): tS = np.searchsorted(time, annotations[i][0]) tE = np.searchsorted(time, annotations[i][1]) end = round(time[tS], 2) silence.append([start, end]) start = round(time[tE], 2) silence.append([start, duration]) #remove overlapping for i, s in enumerate(silence): if s[0] > s[1]: silence.remove(s) folderName = 'Silence' if not os.path.exists(folderName): os.makedirs(folderName) #find file ID for Silence directory = os.listdir(folderName) if directory: index = directory[0].index('_') fileCounter = 0 for i in range(len(directory)): if directory[i][index + 1] > fileCounter: fileCounter = directory[i][index + 1] fileCounter = int(fileCounter) + 1 else: fileCounter = 0 #save silence segment for i, s in enumerate(silence): strOut = folderName + "/Silence_{1:d}.wav".format( wavFileName.replace(".wav", ""), fileCounter) fileCounter = fileCounter + 1 wavfile.write(strOut, Fs, x[int(Fs * s[0]):int(Fs * s[1])]) print 'Finish saving audio segments...'
#import pyAudioAnalysis
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt
from pydub import AudioSegment

sound = AudioSegment.from_mp3("../1.mp3")
sound = sound.set_channels(1)   # down-mix to mono before analysis
sound.export("../1.mp3", format="mp3")

[Fs, x] = audioBasicIO.readAudioFile("../1.mp3")
F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)

plt.subplot(2, 1, 1)
plt.plot(F[0, :])
plt.xlabel('Frame no')
plt.ylabel('ZCR')
plt.subplot(2, 1, 2)
plt.plot(F[1, :])
plt.xlabel('Frame no')
plt.ylabel('Energy')
plt.show()
# Clean-up pass after save(): walk the output tree and delete silence
# segments whose WAV payload is empty (44 bytes is just the WAV header).
for root, dirs, files in os.walk("/mydir"):
    for silenceFile in files:
        fullPath = os.path.join(root, silenceFile)
        if os.path.getsize(fullPath) <= 44:
            os.remove(fullPath)
from __future__ import print_function
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
from pyAudioAnalysis import audioTrainTest as aT
from pyAudioAnalysis import audioSegmentation as aS
import matplotlib.pyplot as plt

root_data_path = "/Users/tyiannak/ResearchData/Audio Dataset/pyAudioAnalysisData/"

print("\n\n\n * * * TEST 1 * * * \n\n\n")
[Fs, x] = audioBasicIO.readAudioFile(root_data_path + "pyAudioAnalysis/data/count.wav")
F, f_names = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs)
plt.subplot(2,1,1); plt.plot(F[0,:]); plt.xlabel('Frame no'); plt.ylabel(f_names[0])
plt.subplot(2,1,2); plt.plot(F[1,:]); plt.xlabel('Frame no'); plt.ylabel(f_names[1]); plt.show()

print("\n\n\n * * * TEST 2 * * * \n\n\n")
[Fs, x] = audioBasicIO.readAudioFile(root_data_path + "pyAudioAnalysis/data/doremi.wav")
x = audioBasicIO.stereo2mono(x)
specgram, TimeAxis, FreqAxis = audioFeatureExtraction.stSpectogram(x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)

print("\n\n\n * * * TEST 3 * * * \n\n\n")
[Fs, x] = audioBasicIO.readAudioFile(root_data_path + "pyAudioAnalysis/data/doremi.wav")
x = audioBasicIO.stereo2mono(x)
specgram, TimeAxis, FreqAxis = audioFeatureExtraction.stChromagram(x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)

print("\n\n\n * * * TEST 4 * * * \n\n\n")
aT.featureAndTrain([root_data_path + "SM/speech", root_data_path + "SM/music"], 1.0, 1.0, 0.2, 0.2, "svm", "temp", True)

print("\n\n\n * * * TEST 5 * * * \n\n\n")
[flagsInd, classesAll, acc, CM] = aS.mtFileClassification(root_data_path + "pyAudioAnalysis/data/scottish.wav", root_data_path + "pyAudioAnalysis/data/svmSM", "svm", True, root_data_path + 'pyAudioAnalysis/data/scottish.segments')