def speaker_diarization():
    """Demo: diarize one hard-coded recording and write one WAV per speaker.

    Runs ``aS.speaker_diarization`` on a fixed file with two speakers, then
    walks the returned (position, label) pairs and concatenates the samples of
    every window into the bucket of its speaker.  A small margin (``cut_num``
    samples, 0.01% of the signal length) is trimmed from both ends of each
    window before it is appended.  The two buckets are written to ``./0.wav``
    and ``./1.wav`` as 16-bit integers.
    """
    wav_path = '/home/daiab/machine_disk/data/voice_identity/dianxin/1.wav'
    use_LDA = False
    plot = True
    num_speaker = 2

    # Only the keyword arguments differ between the LDA and the non-LDA run.
    if use_LDA:
        diar_kwargs = dict(mt_size=4.0, mt_step=0.1, st_win=0.05, st_step=0.01)
    else:
        diar_kwargs = dict(lda_dim=0)
    pos, cls = aS.speaker_diarization(wav_path, num_speaker, plot=plot, **diar_kwargs)

    fr, x = audio_basic_io.read_audio_file(wav_path)
    per_speaker = [[], []]
    cut_num = int(x.shape[0] * 0.0001)
    print('cut_num', cut_num)

    window_start = 0
    for idx, label in enumerate(cls):
        speaker = int(label)
        # pos[idx] * fr is used as the sample index at which this window ends
        window_end = int(pos[idx] * fr)
        trimmed = x[window_start + cut_num: window_end - cut_num]
        per_speaker[speaker].extend(trimmed.tolist())
        window_start = window_end

    print(len(per_speaker[0]), len(per_speaker[1]))
    for speaker, out_path in enumerate(('./0.wav', './1.wav')):
        wavfile.write(out_path, fr, np.array(per_speaker[speaker], dtype=np.int16))
def fileChromagramWrapper(wavFileName):
    """Run ``aF.stChromagram`` on an audio file.

    ARGUMENTS:
     - wavFileName:    path of the audio file to analyse

    The signal is converted to mono and analysed with a window and a step of
    ``round(Fs * 0.040)`` each.  The last argument passed to ``stChromagram``
    is ``True`` (presumably the plot flag -- confirm against ``aF``).  Nothing
    is returned.

    Raises ``Exception`` when ``wavFileName`` is not an existing file.
    """
    file_exists = os.path.isfile(wavFileName)
    if not file_exists:
        raise Exception("Input audio file not found!")

    sampling_rate, samples = audio_basic_io.read_audio_file(wavFileName)
    mono = audio_basic_io.stereo2mono(samples)
    frame = round(sampling_rate * 0.040)  # same value is used for window and step
    _chroma, _time_axis, _freq_axis = aF.stChromagram(mono, sampling_rate, frame, frame, True)
def mtFeatureExtractionToFile(fileName, midTermSize, midTermStep, shortTermSize, shortTermStep, outPutFile,
                              storeStFeatures=False, storeToCSV=False, PLOT=False):
    """
    Read an audio file, extract its mid-term features and store them on disk.

    ARGUMENTS:
     - fileName:                     path of the audio file
     - midTermSize, midTermStep:     mid-term window and step (multiplied by Fs before use)
     - shortTermSize, shortTermStep: short-term window and step (multiplied by Fs before use)
     - outPutFile:                   output path prefix
     - storeStFeatures:              also store the short-term features (suffix "_st")
     - storeToCSV:                   additionally write ".csv" copies (transposed)
     - PLOT:                         print a message for every file that is written
    """
    sampling_rate, signal = audio_basic_io.read_audio_file(fileName)  # read the audio file
    signal = audio_basic_io.stereo2mono(signal)                       # convert to MONO if required

    # The extraction call is the same whether or not the short-term features
    # are stored; they are simply ignored below when not requested.
    mid_features, short_features = mt_feature_extraction(signal, sampling_rate,
                                                         round(sampling_rate * midTermSize),
                                                         round(sampling_rate * midTermStep),
                                                         round(sampling_rate * shortTermSize),
                                                         round(sampling_rate * shortTermStep))

    # mid-term features
    numpy.save(outPutFile, mid_features)
    if PLOT:
        print("Mid-term numpy file: " + outPutFile + ".npy saved")
    if storeToCSV:
        numpy.savetxt(outPutFile + ".csv", mid_features.T, delimiter=",")
        if PLOT:
            print("Mid-term CSV file: " + outPutFile + ".csv saved")

    if not storeStFeatures:
        return

    # short-term features
    numpy.save(outPutFile + "_st", short_features)
    if PLOT:
        print("Short-term numpy file: " + outPutFile + "_st.npy saved")
    if storeToCSV:
        numpy.savetxt(outPutFile + "_st.csv", short_features.T, delimiter=",")
        if PLOT:
            print("Short-term CSV file: " + outPutFile + "_st.csv saved")
def getMusicSegmentsFromFile(inputFile):
    """Cut the "Music" segments out of an audio file and store each one as a WAV file.

    ARGUMENTS:
     - inputFile:    path of the audio file; its last four characters are
                     stripped to build the output directory name
                     ``<inputFile minus 4 chars>_musicSegments``

    The output directory is deleted first if it already exists, then
    re-created.  The file is classified with the fixed SVM model
    ``data/svmMovies8classes``; every segment labelled "Music" that lasts at
    least ``minDuration`` (a name defined outside this function) is written
    to ``<dir>/<start>-<end>.wav``.
    """
    modelType = "svm"
    modelName = "data/svmMovies8classes"

    dirOutput = inputFile[0:-4] + "_musicSegments"
    if dirOutput != "." and os.path.exists(dirOutput):
        shutil.rmtree(dirOutput)
    os.makedirs(dirOutput)

    sampling_rate, signal = audio_basic_io.read_audio_file(inputFile)

    # Only the mid-term step of the stored model is needed below.
    if modelType == 'svm':
        _, _, _, _, _, mtStep, _, _, _ = aT.loadSVModel(modelName)
    elif modelType == 'knn':
        _, _, _, _, _, mtStep, _, _, _ = aT.loadKNNModel(modelName)

    flagsInd, classNames, acc, CM = aS.mtFileClassification(inputFile, modelName, modelType,
                                                            plotResults=False, gtFile="")
    segs, classes = aS.flags2segs(flagsInd, mtStep)

    for idx, seg in enumerate(segs):
        if classNames[int(classes[idx])] != "Music":
            continue
        seg_start, seg_end = seg[0], seg[1]
        if not (seg_end - seg_start >= minDuration):
            continue
        strOut = "{0:s}{1:.3f}-{2:.3f}.wav".format(dirOutput + os.sep, seg_start, seg_end)
        wavfile.write(strOut, sampling_rate,
                      signal[int(sampling_rate * seg_start):int(sampling_rate * seg_end)])
def plot_spectorgram():
    """Demo: run ``aF.stSpectogram`` on ``example_file``.

    ``example_file`` is a name defined outside this function.  The signal is
    converted to mono and analysed with a window and a step of
    ``round(Fs * 0.040)`` each; the last argument passed is ``True``
    (presumably the plot flag -- confirm against ``audioFeatureExtraction``).
    """
    import audioFeatureExtraction as aF

    sampling_rate, samples = audio_basic_io.read_audio_file(example_file)
    mono = audio_basic_io.stereo2mono(samples)
    frame = round(sampling_rate * 0.040)  # same value is used for window and step
    _spec, _time_axis, _freq_axis = aF.stSpectogram(mono, sampling_rate, frame, frame, True)
def beatExtractionWrapper(wavFileName, plot):
    """Print the beat estimate of an audio file.

    ARGUMENTS:
     - wavFileName:    path of the audio file
     - plot:           forwarded unchanged to ``aF.beatExtraction``

    Short-term features are extracted with window = step = ``0.050 * Fs`` and
    handed to ``aF.beatExtraction``; the two returned values are printed as
    "Beat" (truncated to an integer) and "Ratio".

    Raises ``Exception`` when ``wavFileName`` is not an existing file.
    """
    if not os.path.isfile(wavFileName):
        raise Exception("Input audio file not found!")

    sampling_rate, samples = audio_basic_io.read_audio_file(wavFileName)
    window = 0.050 * sampling_rate  # used for both window and step
    st_features = aF.st_feature_extraction(samples, sampling_rate, window, window)
    bpm, ratio = aF.beatExtraction(st_features, 0.050, plot)

    beat_line = "Beat: {0:d} bpm ".format(int(bpm))
    print(beat_line)
    ratio_line = "Ratio: {0:.2f} ".format(ratio)
    print(ratio_line)
def fileRegression(inputFile, modelName, modelType):
    """Apply every regression model matching ``modelName + "_*"`` to one audio file.

    ARGUMENTS:
     - inputFile:    path of the audio file
     - modelName:    common prefix of the model files; every path matching
                     ``modelName + "_*"`` that does not end in "MEANS" is
                     treated as one regression model
     - modelType:    'svm', 'svm_rbf' or 'randomforest'

    RETURNS:
     - (R, regressionNames) on success: ``R[i]`` is the output of
       ``regressionWrapper`` for the i-th model and ``regressionNames[i]`` is
       the part of its file name after the last underscore
     - (-1, -1, -1) on failure: missing input file, no model files found,
       unsupported ``modelType`` or a model path that is not a regular file.
       NOTE: the failure tuple has three elements although the success tuple
       has two; this is kept for backward compatibility with callers.
    """
    if not os.path.isfile(inputFile):
        print("fileRegression: wav file not found!")
        return (-1, -1, -1)

    # Collect the model files (the "...MEANS" companions are not models).
    regressionModels = [m for m in glob.glob(modelName + "_*") if m[-5:] != "MEANS"]
    if not regressionModels:
        # previously this case crashed with IndexError on regressionModels[0]
        print("fileRegression: no regression models found for the given modelName!")
        return (-1, -1, -1)
    regressionNames = [m[m.rfind("_") + 1:] for m in regressionModels]

    # Pick the loader once; previously an unknown type ended in a NameError.
    if modelType == 'svm' or modelType == "svm_rbf":
        load_model = loadSVModel
    elif modelType == 'randomforest':
        load_model = loadRandomForestModel
    else:
        print("fileRegression: unsupported modelType!")
        return (-1, -1, -1)

    # FEATURE EXTRACTION
    # The first model is loaded only for its window parameters (mtWin, etc).
    [_, _, _, mtWin, mtStep, stWin, stStep, computeBEAT] = load_model(regressionModels[0], True)

    [Fs, x] = audio_basic_io.read_audio_file(inputFile)  # read audio file and convert to mono
    x = audio_basic_io.stereo2mono(x)
    [MidTermFeatures, s] = aF.mt_feature_extraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                                    round(Fs * stWin), round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(axis=1)  # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)

    # REGRESSION
    R = []
    for model_path in regressionModels:
        if not os.path.isfile(model_path):
            print("fileRegression: input modelName not found!")
            return (-1, -1, -1)
        Model, MEAN, STD = load_model(model_path, True)[:3]
        curFV = (MidTermFeatures - MEAN) / STD  # normalization
        R.append(regressionWrapper(Model, modelType, curFV))
    return R, regressionNames
def silenceRemovalWrapper(inputFile, smoothingWindow, weight):
    """Split an audio file into the segments returned by ``aS.silenceRemoval``.

    ARGUMENTS:
     - inputFile:          path of the audio file
     - smoothingWindow:    forwarded to ``aS.silenceRemoval``
     - weight:             forwarded to ``aS.silenceRemoval``

    ``aS.silenceRemoval`` is called with 0.05 for both of its window/step
    arguments and ``True`` as last argument.  Every returned segment
    ``(start, end)`` is written to
    ``<inputFile minus last 4 chars>_<start>-<end>.wav``.

    Raises ``Exception`` when ``inputFile`` is not an existing file.
    """
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")

    sampling_rate, samples = audio_basic_io.read_audio_file(inputFile)
    segmentLimits = aS.silenceRemoval(samples, sampling_rate, 0.05, 0.05, smoothingWindow, weight, True)

    for limits in segmentLimits:
        seg_start, seg_end = limits[0], limits[1]
        out_name = "{0:s}_{1:.3f}-{2:.3f}.wav".format(inputFile[0:-4], seg_start, seg_end)
        first = int(sampling_rate * seg_start)
        last = int(sampling_rate * seg_end)
        wavfile.write(out_name, sampling_rate, samples[first:last])
def remove_silence():
    """Demo: split one hard-coded recording into the segments found by ``aS.silenceRemoval``.

    Uses a smoothing value of 1.0 and a weight of 0.5, and 0.05 for both
    window/step arguments.  Each returned segment ``(start, end)`` is written
    next to the input as ``<input minus last 4 chars>_<start>-<end>.wav``.
    """
    example_file = '/home/yulongwu/d/voice/wav/2nU95KARZwk.wav'
    smoothing, weight = 1.0, 0.5

    fr, x = audio_basic_io.read_audio_file(example_file)
    print('x shape', x.shape)

    segment_limits = aS.silenceRemoval(x, fr, 0.05, 0.05, smoothing, weight, True)
    for limits in segment_limits:
        seg_start, seg_end = limits[0], limits[1]
        name = "{0:s}_{1:.3f}-{2:.3f}.wav".format(example_file[0:-4], seg_start, seg_end)
        first = int(fr * seg_start)
        last = int(fr * seg_end)
        wavfile.write(name, fr, x[first:last])
def classifyFolderWrapper(inputFolder, modelType, modelName, outputMode=False):
    """Classify every ``*.wav`` file of a folder and aggregate the class probabilities.

    ARGUMENTS:
     - inputFolder:    a directory, or a path prefix to which "*.wav" is appended
     - modelType:      'svm' or 'knn'
     - modelName:      path of the stored model (must be an existing file)
     - outputMode:     when true, print per-file results and the class
                       histogram, and show a bar plot of the percentages

    RETURNS:
     - (classNames, PsAll) where PsAll is the sum of each file's class
       probabilities weighted by the file duration (samples / Fs), divided by
       its total
     - None (after printing a message) when no WAV file is found

    Raises ``Exception`` when ``modelName`` is not an existing file.
    """
    if not os.path.isfile(modelName):
        raise Exception("Input modelName not found!")

    # Only the class names of the stored model are needed here.
    if modelType == 'svm':
        _, _, _, classNames, _, _, _, _, _ = aT.loadSVModel(modelName)
    elif modelType == 'knn':
        _, _, _, classNames, _, _, _, _, _ = aT.loadKNNModel(modelName)

    PsAll = numpy.zeros((len(classNames),))

    files = "*.wav"
    strFilePattern = os.path.join(inputFolder, files) if os.path.isdir(inputFolder) else inputFolder + files
    wavFilesList = sorted(glob.glob(strFilePattern))
    if not wavFilesList:
        print("No WAV files found!")
        return

    Results = []
    for wavFile in wavFilesList:
        sampling_rate, samples = audio_basic_io.read_audio_file(wavFile)
        signalLength = samples.shape[0] / float(sampling_rate)
        Result, P, classNames = aT.fileClassification(wavFile, modelName, modelType)
        PsAll += numpy.array(P) * signalLength  # duration-weighted probabilities
        Result = int(Result)
        Results.append(Result)
        if outputMode:
            print("{0:s}\t{1:s}".format(wavFile, classNames[Result]))

    # distribution of the winning classes
    Histogram, _ = numpy.histogram(numpy.array(Results), bins=numpy.arange(len(classNames) + 1))
    if outputMode:
        for class_idx, count in enumerate(Histogram):
            print("{0:20s}\t\t{1:d}".format(classNames[class_idx], count))

    PsAll = PsAll / numpy.sum(PsAll)

    if outputMode:
        num_classes = len(classNames)
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plt.title("Classes percentage " + inputFolder.replace('Segments', ''))
        ax.axis((0, num_classes + 1, 0, 1))
        ax.set_xticks(numpy.array(range(num_classes + 1)))
        ax.set_xticklabels([" "] + classNames)
        ax.bar(numpy.array(range(num_classes)) + 0.5, PsAll)
        plt.show()

    return classNames, PsAll
def extract_feat():
    """Demo: plot the first two short-term feature rows of ``example_file``.

    ``example_file`` is a name defined outside this function.  Short-term
    features are extracted with a window of ``0.050 * Fs`` and a step of
    ``0.025 * Fs``; row 0 is plotted with the y-label "ZCR" and row 1 with the
    y-label "Energy", one subplot each.
    """
    import audio_basic_io
    import audioFeatureExtraction
    import matplotlib.pyplot as plt

    sampling_rate, samples = audio_basic_io.read_audio_file(example_file)
    features = audioFeatureExtraction.st_feature_extraction(samples, sampling_rate,
                                                            0.050 * sampling_rate,
                                                            0.025 * sampling_rate)

    for row, y_label in enumerate(('ZCR', 'Energy')):
        plt.subplot(2, 1, row + 1)
        plt.plot(features[row, :])
        plt.xlabel('Frame no')
        plt.ylabel(y_label)
    plt.show()
def hmmSegmentation(wavFileName, hmmModelName, PLOT=False, gtFileName=""):
    '''
    Segment an audio file with a stored HMM and optionally evaluate against ground truth.

    ARGUMENTS:
     - wavFileName:     path of the audio file
     - hmmModelName:    path of the pickled model; it must contain, in this
                        order: the HMM, the class names, mtWin and mtStep
     - PLOT:            ``not PLOT`` is passed as last argument to plotSegmentationResults
     - gtFileName:      optional path of a ground-truth segment file

    RETURNS:
     - None (after printing a message) when the model file cannot be opened
     - (flagsInd, classNamesGT, acc, CM) when plotSegmentationResults reports acc >= 0
     - (flagsInd, classesAll, -1, -1) otherwise

    Errors raised while unpickling the model now propagate to the caller (the
    file is still closed).  Previously they were swallowed by a bare
    ``except`` and the function failed later on undefined names.
    '''
    # Open the model first so a missing model is reported before the audio is decoded.
    try:
        fo = open(hmmModelName, "rb")
    except IOError:
        print("didn't find file")
        return

    # NOTE(security): unpickling can execute arbitrary code -- only load model
    # files that come from a trusted source.
    with fo:
        hmm = cPickle.load(fo)
        classesAll = cPickle.load(fo)
        mtWin = cPickle.load(fo)
        mtStep = cPickle.load(fo)

    [Fs, x] = audio_basic_io.read_audio_file(wavFileName)  # read audio data
    [Features, _] = aF.mt_feature_extraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                             round(Fs * 0.050), round(Fs * 0.050))
    flagsInd = hmm.predict(Features.T)  # apply model

    # Defaults used when no ground-truth file is available.
    classNamesGT = classesAll
    CM = -1
    if os.path.isfile(gtFileName):
        [segStart, segEnd, segLabels] = readSegmentGT(gtFileName)
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)
        # "align" labels with GT: map every ground-truth flag to the index of
        # its class name in the model's class list (-1 when the model does
        # not know the class)
        flagsGTNew = []
        for fl in flagsGT:
            gt_name = classNamesGT[fl]
            flagsGTNew.append(classesAll.index(gt_name) if gt_name in classesAll else -1)
        CM = np.zeros((len(classNamesGT), len(classNamesGT)))
        flagsIndGT = np.array(flagsGTNew)
        # NOTE(review): a -1 ground-truth index addresses the LAST row of CM,
        # and CM is sized by the ground-truth class count although it is
        # indexed with model class indices -- confirm this is intended.
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    else:
        flagsIndGT = np.array([])

    acc = plotSegmentationResults(flagsInd, flagsIndGT, classesAll, mtStep, not PLOT)
    if acc >= 0:
        print("Overall Accuracy: {0:.2f}".format(acc))
        return (flagsInd, classNamesGT, acc, CM)
    else:
        return (flagsInd, classesAll, -1, -1)
def dirWavFeatureExtractionNoAveraging(dirName, mtWin, mtStep, stWin, stStep):
    """
    Extract the mid-term features of the audio files of a folder without
    averaging each file.

    ARGUMENTS:
        - dirName:          the path of the audio directory (*.wav, *.aif, *.aiff)
        - mtWin, mtStep:    mid-term window and step (multiplied by Fs before use)
        - stWin, stStep:    short-term window and step (multiplied by Fs before use)
    RETURNS:
        - X:                feature matrix, one row per mid-term window
                            (an empty array when nothing could be extracted)
        - Y:                for every row of X, the index of the source file
                            in ``filenames`` (float array)
        - filenames:        sorted list of ALL matching files, including the
                            ones that were skipped
    """
    types = ('*.wav', '*.aif', '*.aiff')
    wavFilesList = []
    for pattern in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, pattern)))
    wavFilesList = sorted(wavFilesList)

    feature_blocks = []
    index_blocks = []
    for i, wavFile in enumerate(wavFilesList):
        [Fs, x] = audio_basic_io.read_audio_file(wavFile)  # read file
        if isinstance(x, int):  # an int in place of the samples marks a file that is skipped
            continue
        x = audio_basic_io.stereo2mono(x)  # convert stereo to mono
        [MidTermFeatures, _] = mt_feature_extraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs),
                                                     round(Fs * stWin), round(Fs * stStep))
        MidTermFeatures = numpy.transpose(MidTermFeatures)
        feature_blocks.append(MidTermFeatures)
        # Always label rows with the file's real index.  The previous code
        # used zeros for the first readable file, which was wrong whenever
        # earlier files had been skipped.
        index_blocks.append(i * numpy.ones((MidTermFeatures.shape[0],)))

    if not feature_blocks:
        return (numpy.array([]), numpy.array([]), wavFilesList)

    # Stack once at the end instead of re-copying the matrix for every file.
    allMtFeatures = numpy.vstack(feature_blocks)
    signalIndices = numpy.concatenate(index_blocks)
    return (allMtFeatures, signalIndices, wavFilesList)
def trainHMM_fromFile(wavFile, gtFile, hmmModelName, mtWin, mtStep):
    '''
    This function trains a HMM model for segmentation-classification using a single annotated audio file
    ARGUMENTS:
     - wavFile:        the path of the audio filename
     - gtFile:         the path of the ground truth filename
                       (a csv file of the form <segment start in seconds>,<segment end in seconds>,<segment label> in each row
     - hmmModelName:   the name of the HMM model to be stored
     - mtWin:          mid-term window size
     - mtStep:         mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - classNames:     a list of classNames

    After training, hmm, classNames, along with the mtWin and mtStep values are stored in the hmmModelName file
    '''
    [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read ground truth data
    flags, classNames = segs2flags(segStart, segEnd, segLabels, mtStep)  # convert to fix-sized sequence of flags

    [Fs, x] = audio_basic_io.read_audio_file(wavFile)  # read audio data
    [F, _] = aF.mt_feature_extraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                      round(Fs * 0.050), round(Fs * 0.050))  # feature extraction
    # compute HMM statistics (priors, transition matrix, etc)
    startprob, transmat, means, cov = trainHMM_computeStatistics(F, flags)

    # The HMM parameters are assigned directly from the statistics above.
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    # output to file; "with" closes the file even if pickling raises
    with open(hmmModelName, "wb") as fo:
        cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)

    return hmm, classNames
def annotation2files(wavFile, csvFile):
    '''
    Break an audio stream to segments of interest, defined by a csv file

     - wavFile:    path to input wavfile
     - csvFile:    path to csvFile of segment limits

    Input CSV file must be tab-separated, one segment per row:
    start time, end time, label.  A decimal comma in the times is accepted.
    Blank rows are ignored.

    Each segment is written to "<wavFile>_<label>_<T1>_<T2>.wav" (spaces in
    the name are replaced by underscores).  The annotation file is parsed
    completely before any audio is read or written, so a malformed row fails
    before any output file is created, and an annotation file without
    segments returns without decoding the audio.
    '''
    # csv.reader needs a text-mode file on Python 3 (the previous 'rb' mode
    # makes it raise); newline='' is the mode the csv documentation asks for.
    segments = []
    with open(csvFile, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='|')
        for row in reader:
            if not row:  # blank line
                continue
            T1 = float(row[0].replace(",", "."))
            T2 = float(row[1].replace(",", "."))
            segments.append((T1, T2, row[2]))

    if not segments:
        return

    [Fs, x] = audio_basic_io.read_audio_file(wavFile)
    for T1, T2, segment_label in segments:
        label = "%s_%s_%.2f_%.2f.wav" % (wavFile, segment_label, T1, T2)
        label = label.replace(" ", "_")
        xtemp = x[int(round(T1 * Fs)):int(round(T2 * Fs))]
        print(T1, T2, label, xtemp.shape)
        wavfile.write(label, Fs, xtemp)
def trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep):
    '''
    This function trains a HMM model for segmentation-classification using
    a directory where WAV files and .segments (ground-truth) files are stored
    ARGUMENTS:
     - dirPath:        the path of the data directory
     - hmmModelName:   the name of the HMM model to be stored
     - mtWin:          mid-term window size
     - mtStep:         mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - classNames:     a list of classNames
    RAISES:
     - ValueError:     when no WAV file of the directory has a .segments file

    After training, hmm, classNames, along with the mtWin and mtStep values are stored in the hmmModelName file
    '''
    flagsAll = np.array([])
    classesAll = []
    feature_blocks = []
    for wavFile in glob.glob(dirPath + os.sep + '*.wav'):  # for each WAV file
        # annotation file: same path with the extension swapped
        gtFile = os.path.splitext(wavFile)[0] + '.segments'
        if not os.path.isfile(gtFile):  # no annotation -> skip
            continue
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flags, classNames = segs2flags(segStart, segEnd, segLabels, mtStep)  # convert to flags
        for c in classNames:  # update the global list of class names
            if c not in classesAll:
                classesAll.append(c)

        [Fs, x] = audio_basic_io.read_audio_file(wavFile)  # read audio data
        [F, _] = aF.mt_feature_extraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                          round(Fs * 0.050), round(Fs * 0.050))  # feature extraction

        # features and flags may differ in length: keep the common prefix
        MIN = min(F.shape[1], len(flags))
        F = F[:, 0:MIN]
        flags = flags[0:MIN]

        # re-index the per-file flags against the global class list
        flagsNew = [classesAll.index(classNames[fl]) for fl in flags]
        flagsAll = np.append(flagsAll, np.array(flagsNew))
        # Collected in a list: the previous "if i == 0" initialisation broke
        # with a NameError whenever the first WAV file had no annotation.
        feature_blocks.append(F)

    if not feature_blocks:
        raise ValueError("trainHMM_fromDir: no annotated WAV files found in " + str(dirPath))
    Fall = np.concatenate(feature_blocks, axis=1)

    startprob, transmat, means, cov = trainHMM_computeStatistics(Fall, flagsAll)  # compute HMM statistics
    # The HMM parameters are assigned directly from the statistics above.
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    # save HMM model; "with" closes the file even if pickling raises
    with open(hmmModelName, "wb") as fo:
        cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classesAll, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)

    return hmm, classesAll
def dirWavFeatureExtraction(dirName, mtWin, mtStep, stWin, stStep, computeBEAT=False):
    """
    This function extracts the mid-term features of the audio files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each audio file.

    ARGUMENTS:
        - dirName:        the path of the audio directory (*.wav, *.aif, *.aiff, *.mp3, *.au)
        - mtWin, mtStep:  mid-term window and step (multiplied by Fs before use)
        - stWin, stStep:  short-term window and step (multiplied by Fs before use)
        - computeBEAT:    append the two values returned by beatExtraction to each vector
    RETURNS:
        - features:       one row per accepted file.  NOTE: with exactly one
                          accepted file this is a 1-D vector, with none it is
                          an empty array (kept for backward compatibility)
        - filenames:      the accepted files, aligned with the feature rows

    Files are skipped when they are empty on disk, cannot be read, are shorter
    than a tenth of a second, or yield NaN/Inf features.
    """
    allMtFeatures = numpy.array([])
    processingTimes = []

    types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au')
    wavFilesList = []
    for pattern in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, pattern)))
    wavFilesList = sorted(wavFilesList)

    wavFilesList2 = []
    for i, wavFile in enumerate(wavFilesList):
        # The path is formatted as text: formatting the utf-8 encoded bytes
        # with "{:s}" raises TypeError on Python 3.
        print("Analyzing file {0:d} of {1:d}: {2:s}".format(i + 1, len(wavFilesList), wavFile))
        if os.stat(wavFile).st_size == 0:
            print(" (EMPTY FILE -- SKIPPING)")
            continue
        [Fs, x] = audio_basic_io.read_audio_file(wavFile)  # read file
        if isinstance(x, int):  # an int in place of the samples marks a file that is skipped
            continue

        # time.clock() was removed in Python 3.8; perf_counter() is its replacement.
        t1 = time.perf_counter()
        x = audio_basic_io.stereo2mono(x)  # convert stereo to mono
        if x.shape[0] < float(Fs) / 10:
            print(" (AUDIO FILE TOO SMALL - SKIPPING)")
            continue

        # mid-term feature extraction for current file
        [MidTermFeatures, stFeatures] = mt_feature_extraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs),
                                                              round(Fs * stWin), round(Fs * stStep))
        MidTermFeatures = numpy.transpose(MidTermFeatures)
        MidTermFeatures = MidTermFeatures.mean(axis=0)  # long term averaging of mid-term statistics
        if numpy.isnan(MidTermFeatures).any() or numpy.isinf(MidTermFeatures).any():
            continue  # unusable features: the file is left out of BOTH outputs

        if computeBEAT:
            [beat, beatConf] = beatExtraction(stFeatures, stStep)
            MidTermFeatures = numpy.append(MidTermFeatures, beat)
            MidTermFeatures = numpy.append(MidTermFeatures, beatConf)

        # Register the file only now that its vector is accepted.  It used to
        # be appended before the NaN/Inf check, which could leave the file
        # list longer than (and misaligned with) the feature matrix.
        wavFilesList2.append(wavFile)
        if len(allMtFeatures) == 0:  # append feature vector
            allMtFeatures = MidTermFeatures
        else:
            allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))

        t2 = time.perf_counter()
        duration = float(len(x)) / Fs
        processingTimes.append((t2 - t1) / duration)

    if len(processingTimes) > 0:
        print("Feature extraction complexity ratio: {0:.1f} x realtime".format(
            (1.0 / numpy.mean(numpy.array(processingTimes)))))
    return (allMtFeatures, wavFilesList2)
def thumbnailWrapper(inputFile, thumbnailWrapperSize):
    """Extract the two thumbnails found by ``aS.musicThumbnailing`` and plot the result.

    ARGUMENTS:
     - inputFile:               path of the audio file
     - thumbnailWrapperSize:    forwarded to ``aS.musicThumbnailing`` as the
                                thumbnail size

    The two thumbnails are written next to the input as
    ``<name>_thumb1<ext>`` and ``<name>_thumb2<ext>``.  The output keeps the
    input's extension, but the data is always written with ``wavfile.write``.
    The limits are printed, and the self-similarity matrix is shown with the
    selected region marked.

    Returns None.  Raises ``Exception`` when ``inputFile`` is not an existing
    file; returns silently when the file could not be read (Fs == -1).
    """
    stWindow = 1.0
    stStep = 1.0
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")

    [Fs, x] = audio_basic_io.read_audio_file(inputFile)
    if Fs == -1:  # could not read file
        return

    [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x, Fs, stWindow, stStep, thumbnailWrapperSize)

    # write thumbnailWrappers to WAV files:
    # Output names are derived from the real extension.  The previous
    # endswith()/replace() pairs left the names undefined for anything other
    # than ".wav"/".mp3" and rewrote every ".wav"/".mp3" occurring in the path.
    base, ext = os.path.splitext(inputFile)
    thumbnailWrapperFileName1 = base + "_thumb1" + ext
    thumbnailWrapperFileName2 = base + "_thumb2" + ext
    wavfile.write(thumbnailWrapperFileName1, Fs, x[int(Fs * A1):int(Fs * A2)])
    wavfile.write(thumbnailWrapperFileName2, Fs, x[int(Fs * B1):int(Fs * B2)])
    print("1st thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec"
          " -- {2:4.1f}sec".format(thumbnailWrapperFileName1, A1, A2))
    print("2nd thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec"
          " -- {2:4.1f}sec".format(thumbnailWrapperFileName2, B1, B2))

    # Plot self-similarity matrix:
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect="auto")
    plt.imshow(Smatrix)

    # Plot best-similarity diagonal:
    Xcenter = (A1 / stStep + A2 / stStep) / 2.0
    Ycenter = (B1 / stStep + B2 / stStep) / 2.0
    e1 = matplotlib.patches.Ellipse((Ycenter, Xcenter), thumbnailWrapperSize * 1.4, 3,
                                    angle=45, linewidth=3, fill=False)
    ax.add_patch(e1)

    # dashed guide lines from the thumbnail limits to the matrix border
    side = Smatrix.shape[0]
    plt.plot([B1, side], [A1, A1], color="k", linestyle="--", linewidth=2)
    plt.plot([B2, side], [A2, A2], color="k", linestyle="--", linewidth=2)
    plt.plot([B1, B1], [A1, side], color="k", linestyle="--", linewidth=2)
    plt.plot([B2, B2], [A2, side], color="k", linestyle="--", linewidth=2)

    plt.xlim([0, Smatrix.shape[0]])
    plt.ylim([Smatrix.shape[1], 0])

    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()

    plt.xlabel("frame no")
    plt.ylabel("frame no")
    plt.title("Self-similarity matrix")

    plt.show()
def mtFileClassification(inputFile, modelName, modelType, plotResults=False, gtFile=""):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
    ARGUMENTS:
        - inputFile:        path of the input audio file
        - modelName:        path of the classification model (must be an existing file)
        - modelType:        'svm', 'svm_rbf', 'knn', 'randomforest',
                            'gradientboosting' or 'extratrees'
        - plotResults:      ``not plotResults`` is passed as last argument to
                            plotSegmentationResults
        - gtFile:           optional path of a ground-truth segment file
    RETURNS:
        - (-1, -1, -1, -1) on failure: missing model file, unsupported
          modelType, a model that contains beat features, or an unreadable
          input file
        - (flagsInd, classNamesGT, acc, CM) when plotSegmentationResults reports acc >= 0
        - (flagsInd, classNames, acc, CM) otherwise
          flagsInd holds one class index per mid-term window (after 1-window
          smoothing); CM is the confusion matrix against the ground truth, or
          [] when no ground-truth file exists
    '''
    if not os.path.isfile(modelName):
        # the missing thing is the model file, not the type
        print("mtFileClassificationError: input modelName not found!")
        return (-1, -1, -1, -1)

    # Load classifier:
    if (modelType == 'svm') or (modelType == 'svm_rbf'):
        model = aT.loadSVModel(modelName)
    elif modelType == 'knn':
        model = aT.loadKNNModel(modelName)
    elif modelType == 'randomforest':
        model = aT.loadRandomForestModel(modelName)
    elif modelType == 'gradientboosting':
        model = aT.loadGradientBoostingModel(modelName)
    elif modelType == 'extratrees':
        model = aT.loadExtraTreesModel(modelName)
    else:
        # previously an unknown type ended in a NameError further down
        print("mtFileClassificationError: unsupported modelType!")
        return (-1, -1, -1, -1)
    [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = model

    if computeBEAT:
        print("Model " + modelName + " contains long-term music features (beat etc) and cannot be used in segmentation")
        return (-1, -1, -1, -1)

    [Fs, x] = audio_basic_io.read_audio_file(inputFile)  # load input file
    if Fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audio_basic_io.stereo2mono(x)  # convert stereo (if) to mono

    # mid-term feature extraction:
    [MidTermFeatures, _] = aF.mt_feature_extraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                                    round(Fs * stWin), round(Fs * stStep))
    flags = []
    flagsInd = []
    for i in range(MidTermFeatures.shape[1]):  # for each feature vector (i.e. for each fix-sized segment):
        curFV = (MidTermFeatures[:, i] - MEAN) / STD  # normalize current feature vector
        [Result, P] = aT.classifierWrapper(Classifier, modelType, curFV)  # classify vector
        flagsInd.append(Result)
        flags.append(classNames[int(Result)])  # update class label matrix
    flagsInd = np.array(flagsInd)

    # 1-window smoothing: a window whose two neighbours agree takes their label
    for i in range(1, len(flagsInd) - 1):
        if flagsInd[i - 1] == flagsInd[i + 1]:
            flagsInd[i] = flagsInd[i + 1]

    # NOTE(review): segs/classes are computed from the UNSMOOTHED labels and
    # are not used by anything below; the two statements are kept only so that
    # failure behaviour stays identical.
    (segs, classes) = flags2segs(flags, mtStep)  # convert fix-sized flags to segments and classes
    segs[-1] = len(x) / float(Fs)

    # Load ground-truth:
    if os.path.isfile(gtFile):
        [segStartGT, segEndGT, segLabelsGT] = readSegmentGT(gtFile)
        flagsGT, classNamesGT = segs2flags(segStartGT, segEndGT, segLabelsGT, mtStep)
        # "align" labels with GT: map every ground-truth flag to the index of
        # its class name in the model's class list (-1 when unknown)
        flagsIndGT = []
        for fl in flagsGT:
            gt_name = classNamesGT[fl]
            flagsIndGT.append(classNames.index(gt_name) if gt_name in classNames else -1)
        flagsIndGT = np.array(flagsIndGT)
        CM = np.zeros((len(classNamesGT), len(classNamesGT)))
        # NOTE(review): a -1 ground-truth index addresses the LAST row of CM -- confirm intended.
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    else:
        CM = []
        flagsIndGT = np.array([])
        classNamesGT = classNames  # fallback so the acc >= 0 return below is always defined

    acc = plotSegmentationResults(flagsInd, flagsIndGT, classNames, mtStep, not plotResults)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flagsInd, classNamesGT, acc, CM)
    else:
        return (flagsInd, classNames, acc, CM)
def speaker_diarization(file_name, num_speaker, mt_size=2.0, mt_step=0.2, st_win=0.05, st_step=0.025, lda_dim=35,
                        plot=False):
    '''
    Cluster the mid-term windows of a recording into speakers.

    ARGUMENTS:
        - file_name     path of the audio file to be analyzed
        - num_speaker   the number of speakers (clusters) in the recording
                        (<=0 for unknown: 2..9 clusters are tried and the one
                        with the highest average silhouette is kept)
        - mt_size (opt) mid-term window size (multiplied by fr before use)
        - mt_step (opt) mid-term window step (multiplied by fr before use)
        - st_win  (opt) short-term window size (multiplied by fr before use)
        - st_step (opt) short-term window step (multiplied by fr before use)
        - lda_dim (opt) LDA dimension (0 for no LDA)
        - plot    (opt) truthy to plot the results
    RETURNS:
        - x             np.arange(len(cls)) * mt_step + mt_step / 2.0, i.e. one
                        position per entry of cls
        - cls           one cluster label per mid-term window, after HMM
                        smoothing and two median filters

    The two KNN models are loaded from the RELATIVE paths data/knnSpeakerAll and
    data/knnSpeakerFemaleMale, so the result depends on the working directory.
    If a "<file_name with .wav replaced by .segments>" file exists it is used as
    ground truth: purities are printed and (when plotting) drawn.
    '''
    fr, x = audio_basic_io.read_audio_file(file_name)
    x = audio_basic_io.stereo2mono(x)
    duration = len(x) / fr  # only used for the x-axis range of the plot

    # Two pre-trained KNN models; only classifier, mean, std and class names are used below.
    classifier1, mean1, std1, class_names1, mt_win1, mt_step1, st_win1, st_step1, compute_beta1 = aT.loadKNNModel(
        os.path.join("data", "knnSpeakerAll"))
    classifier2, mean2, std2, class_names2, mt_win2, mt_step2, st_win2, st_step2, compute_beta2 = aT.loadKNNModel(
        os.path.join("data", "knnSpeakerFemaleMale"))

    mid_term_features, short_term_features = aF.mt_feature_extraction(signal=x, fr=fr, mt_win=mt_size * fr,
                                                                      mt_step=mt_step * fr,
                                                                      st_win=round(fr * st_win),
                                                                      st_step=round(fr * st_step))
    # example of printed shapes: (68, 329) (34, 2630)
    print(mid_term_features.shape, short_term_features.shape)

    # Extend every mid-term feature vector with the class probabilities of both
    # KNN models (plus a small constant 0.0001).
    mid_term_features2 = np.zeros((mid_term_features.shape[0] + len(class_names1) + len(class_names2),
                                   mid_term_features.shape[1]))

    for i in range(mid_term_features.shape[1]):
        cur_f1 = (mid_term_features[:, i] - mean1) / std1
        cur_f2 = (mid_term_features[:, i] - mean2) / std2
        result, p1 = aT.classifierWrapper(classifier1, "knn", cur_f1)
        result, p2 = aT.classifierWrapper(classifier2, "knn", cur_f2)
        mid_term_features2[0:mid_term_features.shape[0], i] = mid_term_features[:, i]
        mid_term_features2[mid_term_features.shape[0]:mid_term_features.shape[0] + len(class_names1), i] = p1 + 0.0001
        mid_term_features2[mid_term_features.shape[0] + len(class_names1)::, i] = p2 + 0.0001

    mid_term_features = mid_term_features2  # TODO
    # SELECT FEATURES (row indices kept for clustering; alternatives left for reference):
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];                                            # SET 0A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];                                    # SET 0B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,
    #                    74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    #                    97,98, 99,100];                                                                   # SET 0C
    i_features_select = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
                         53]  # SET 1A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; # SET 1B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,
    #                    48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,
    #                    87,88,89,90,91,92,93,94,95,96,97,98, 99,100];                                     # SET 1C
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,
    #                    36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];                           # SET 2A
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,
    #                    36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];                   # SET 2B
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,
    #                    36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,
    #                    76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];    # SET 2C
    # iFeaturesSelect = range(100);                                                                        # SET 3
    # MidTermFeatures += np.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    mid_term_features = mid_term_features[i_features_select, :]

    mid_term_features_norm, mean, std = aT.normalizeFeatures([mid_term_features.T])
    mid_term_features_norm = mid_term_features_norm[0].T
    num_of_windows = mid_term_features.shape[1]

    # remove outliers: keep only the windows whose summed distance to all
    # windows is below 1.2 times the mean of those sums
    distances_all = np.sum(distance.squareform(distance.pdist(mid_term_features_norm.T)), axis=0)
    m_distances_all = np.mean(distances_all)
    i_non_out_liers = np.nonzero(distances_all < 1.2 * m_distances_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(MidTermFeatures[1,:])
    # EnergyMean = np.mean(MidTermFeatures[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # iNonOutLiers = np.nonzero(MidTermFeatures[1,:] > Thres)[0]
    # print(iNonOutLiers
    # per_out_lier = (100.0 * (num_of_windows - i_non_out_liers.shape[0])) / num_of_windows

    # The full (outliers included) normalized matrix is kept for the HMM smoothing step.
    mid_term_features_norm_or = mid_term_features_norm
    mid_term_features_norm = mid_term_features_norm[:, i_non_out_liers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # Mid-term statistics (mean and std) are recomputed from the short-term
        # features with a window of mt_win_ratio short-term frames.
        mt_win_ratio = int(round(mt_size / st_win))
        # NOTE(review): st_win / st_win evaluates to 1, i.e. a step of one
        # short-term frame -- confirm this is the intended step.
        mt_step_ratio = int(round(st_win / st_win))
        mt_features_to_reduce = []
        num_of_features = len(short_term_features)
        num_of_statistics = 2
        # for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(num_of_statistics * num_of_features):
            mt_features_to_reduce.append([])

        for i in range(num_of_features):  # for each of the short-term features:
            cur_pos = 0
            n = len(short_term_features[i])
            while cur_pos < n:
                n1 = cur_pos
                n2 = cur_pos + mt_win_ratio
                if n2 > n:
                    n2 = n
                cur_st_features = short_term_features[i][n1:n2]
                mt_features_to_reduce[i].append(np.mean(cur_st_features))
                mt_features_to_reduce[i + num_of_features].append(np.std(cur_st_features))
                cur_pos += mt_step_ratio
        mt_features_to_reduce = np.array(mt_features_to_reduce)

        # Same augmentation with the KNN class probabilities as done above for
        # the mid-term features.
        mt_features_to_reduce2 = np.zeros((mt_features_to_reduce.shape[0] + len(class_names1) + len(class_names2),
                                           mt_features_to_reduce.shape[1]))
        for i in range(mt_features_to_reduce.shape[1]):
            cur_f1 = (mt_features_to_reduce[:, i] - mean1) / std1
            cur_f2 = (mt_features_to_reduce[:, i] - mean2) / std2
            result, p1 = aT.classifierWrapper(classifier1, "knn", cur_f1)
            result, p2 = aT.classifierWrapper(classifier2, "knn", cur_f2)
            mt_features_to_reduce2[0:mt_features_to_reduce.shape[0], i] = mt_features_to_reduce[:, i]
            mt_features_to_reduce2[mt_features_to_reduce.shape[0]:mt_features_to_reduce.shape[0] + len(class_names1),
                                   i] = p1 + 0.0001
            mt_features_to_reduce2[mt_features_to_reduce.shape[0] + len(class_names1)::, i] = p2 + 0.0001
        mt_features_to_reduce = mt_features_to_reduce2
        mt_features_to_reduce = mt_features_to_reduce[i_features_select, :]
        # mtFeaturesToReduce += np.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        mt_features_to_reduce, mean, std = aT.normalizeFeatures([mt_features_to_reduce.T])
        mt_features_to_reduce = mt_features_to_reduce[0].T
        # DistancesAll = np.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        # MDistancesAll = np.mean(DistancesAll)
        # iNonOutLiers2 = np.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        # mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]

        # Labels for the LDA fit are derived from the window index only:
        # int(i * st_win / (lda_step / st_win)).
        # NOTE(review): this equals i * st_win**2 / lda_step -- confirm the formula.
        labels = np.zeros((mt_features_to_reduce.shape[1],))
        lda_step = 1.0
        lda_step_ratio = lda_step / st_win
        # print(LDAstep, LDAstepRatio
        for i in range(labels.shape[0]):
            labels[i] = int(i * st_win / lda_step_ratio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_features_to_reduce.T, labels)
        mid_term_features_norm = (clf.transform(mid_term_features_norm.T)).T

    # candidate numbers of clusters
    if num_speaker <= 0:
        s_range = range(2, 10)
    else:
        s_range = [num_speaker]
    cls_all = []
    sil_all = []
    centers_all = []

    # example of printed shape: (26, 314)
    print('mid_term_features_norm', mid_term_features_norm.shape)

    for i_speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=i_speakers)
        k_means.fit(mid_term_features_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        cls_all.append(cls)
        centers_all.append(means)
        sil_a = []
        sil_b = []
        for c in range(i_speakers):  # for each speaker (i.e. for each extracted cluster)
            cluster_percent = np.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if cluster_percent < 0.020:
                # clusters holding less than 2% of the windows contribute zeros
                sil_a.append(0.0)
                sil_b.append(0.0)
            else:
                mid_term_features_norm_temp = mid_term_features_norm[:, cls == c]  # get subset of feature vectors
                # compute average distance between samples that belong to the cluster (a values)
                yt = distance.pdist(mid_term_features_norm_temp.T)
                sil_a.append(np.mean(yt) * cluster_percent)
                sil_bs = []
                for c2 in range(i_speakers):  # compute distances from samples of other clusters
                    if c2 != c:
                        cluster_percent2 = np.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        mid_term_features_norm_temp2 = mid_term_features_norm[:, cls == c2]
                        yt = distance.cdist(mid_term_features_norm_temp.T, mid_term_features_norm_temp2.T)
                        sil_bs.append(np.mean(yt) * (cluster_percent + cluster_percent2) / 2.0)
                sil_bs = np.array(sil_bs)
                # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
                sil_b.append(min(sil_bs))
        sil_a = np.array(sil_a)
        sil_b = np.array(sil_b)
        sil = []
        for c in range(i_speakers):  # for each cluster (speaker)
            sil.append((sil_b[c] - sil_a[c]) / (max(sil_b[c], sil_a[c]) + 0.00001))  # compute silhouette
        sil_all.append(np.mean(sil))  # keep the AVERAGE SILHOUETTE

    # silAll = silAll * (1.0/(np.power(np.array(sRange),0.5)))
    imax = np.argmax(sil_all)  # position of the maximum silhouette value
    n_speakers_final = s_range[imax]  # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their nearest non-outlier window)
    cls = np.zeros((num_of_windows,))
    for i in range(num_of_windows):
        j = np.argmin(np.abs(i - i_non_out_liers))
        cls[i] = cls_all[imax][j]

    # Post-process method 1: hmm smoothing
    # (a single pass; the HMM parameters are assigned directly from the
    # statistics of the current labels, then the labels are re-predicted)
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(mid_term_features_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")  # hmm training
        hmm.startprob_ = startprob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mid_term_features_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]  # final silhouette
    class_names = ["speaker{0:d}".format(c) for c in range(n_speakers_final)]

    # load ground-truth if available
    gt_file = file_name.replace('.wav', '.segments')  # open for annotated file
    if os.path.isfile(gt_file):  # if ground truth exists
        seg_start, seg_end, seg_labels = readSegmentGT(gt_file)  # read GT data
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labels, mt_step)  # convert to flags

    # From here on x no longer holds the audio samples: it is re-bound to one
    # position per label (window index * mt_step + mt_step / 2) and returned.
    x = np.arange(len(cls)) * mt_step + mt_step / 2.0

    if plot:
        fig = plt.figure()
        if num_speaker > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(x, cls)

    if os.path.isfile(gt_file):
        if plot:
            ax1.plot(np.array(range(len(flags_gt))) * mt_step + mt_step / 2.0, flags_gt, 'r')
        purity_cluster_mean, purity_speaker_mean = evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_mean, 100 * purity_speaker_mean))
        if plot:
            plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100 * purity_cluster_mean,
                                                                                  100 * purity_speaker_mean))
    if plot:
        plt.xlabel("time (seconds)")
        # print(sRange, silAll)
        if num_speaker <= 0:
            # second subplot: average silhouette for every tried cluster count
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
    return x, cls
def fileClassification(inputFile, modelName, modelType):
    """
    Classify a single audio file with a previously stored model.

    ARGUMENTS:
        - inputFile:    path of the audio file to classify
        - modelName:    path of the stored model file
        - modelType:    one of 'svm', 'svm_rbf', 'knn', 'randomforest',
                        'gradientboosting' or 'extratrees'

    RETURNS:
        - (Result, P, classNames): the two values returned by
          classifierWrapper, followed by the class names stored with the
          model
        - (-1, -1, -1) if the model file or the audio file does not exist,
          if modelType is not one of the supported values, if the audio
          could not be read, or if the signal is not longer than the
          model's mid-term window
    """
    # Validate the inputs before doing any expensive work:
    if not os.path.isfile(modelName):
        print("fileClassification: input modelName not found!")
        return (-1, -1, -1)
    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    # Select the loader that matches the requested model type:
    if modelType == 'svm' or modelType == 'svm_rbf':
        loader = loadSVModel
    elif modelType == 'knn':
        loader = loadKNNModel
    elif modelType == 'randomforest':
        loader = loadRandomForestModel
    elif modelType == 'gradientboosting':
        loader = loadGradientBoostingModel
    elif modelType == 'extratrees':
        loader = loadExtraTreesModel
    else:
        # Without this branch the model parameters below would never be
        # bound and the function would crash with UnboundLocalError.
        print("fileClassification: unknown modelType!")
        return (-1, -1, -1)

    # Load classifier:
    [Classifier, MEAN, STD, classNames,
     mtWin, mtStep, stWin, stStep, computeBEAT] = loader(modelName)

    # read audio file and convert to mono
    [Fs, x] = audio_basic_io.read_audio_file(inputFile)
    x = audio_basic_io.stereo2mono(x)
    if isinstance(x, int):  # audio file IO problem
        return (-1, -1, -1)
    # The signal (samples / Fs) must be longer than one mid-term window:
    if x.shape[0] / float(Fs) <= mtWin:
        return (-1, -1, -1)

    # feature extraction:
    [MidTermFeatures, s] = aF.mt_feature_extraction(x, Fs,
                                                    mtWin * Fs,
                                                    mtStep * Fs,
                                                    round(Fs * stWin),
                                                    round(Fs * stStep))
    # long term averaging of mid-term statistics
    MidTermFeatures = MidTermFeatures.mean(axis=1)
    if computeBEAT:
        # The model was stored with beat features enabled, so append them
        # to the averaged feature vector before normalization.
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)

    curFV = (MidTermFeatures - MEAN) / STD  # normalization
    [Result, P] = classifierWrapper(Classifier, modelType, curFV)  # classification
    return Result, P, classNames