def getMusicSegmentsFromFile(inputFile): modelType = "svm" modelName = "data/svmMovies8classes" dirOutput = inputFile[0:-4] + "_musicSegments" if os.path.exists(dirOutput) and dirOutput != ".": shutil.rmtree(dirOutput) os.makedirs(dirOutput) [Fs, x] = audioBasicIO.readAudioFile(inputFile) if modelType == 'svm': [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = aT.loadSVModel(modelName) elif modelType == 'knn': [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = aT.loadKNNModel(modelName) flagsInd, classNames, acc, CM = aS.mtFileClassification(inputFile, modelName, modelType, plotResults=False, gtFile="") segs, classes = aS.flags2segs(flagsInd, mtStep) for i, s in enumerate(segs): if (classNames[int(classes[i])] == "Music") and (s[1] - s[0] >= minDuration): strOut = "{0:s}{1:.3f}-{2:.3f}.wav".format(dirOutput + os.sep, s[0], s[1]) wavfile.write(strOut, Fs, x[int(Fs * s[0]):int(Fs * s[1])])
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=0, PLOT=False): ''' ARGUMENTS: - fileName: the name of the WAV file to be analyzed - numOfSpeakers the number of speakers (clusters) in the recording (<=0 for unknown) - mtSize (opt) mid-term window size - mtStep (opt) mid-term window step - stWin (opt) short-term window size - LDAdim (opt) LDA dimension (0 for no LDA) - PLOT (opt) 0 for not plotting the results 1 for plottingy ''' [Fs, x] = pyAudioAnalysis.audioBasicIO.readAudioFile(fileName) x = pyAudioAnalysis.audioBasicIO.stereo2mono(x) Duration = len(x) / Fs #[Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel(os.path.join("data","knnSpeakerAll")) #[Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel(os.path.join("data","knnSpeakerFemaleMale")) [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel("pyAudioAnalysis/data/knnSpeakerAll") [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel("pyAudioAnalysis/data/knnSpeakerFemaleMale") [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs*stWin * 0.5)) MidTermFeatures2 = numpy.zeros((MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1])) for i in range(MidTermFeatures.shape[1]): curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1 curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2 [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1) [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2) MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i] MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0]+len(classNames1), i] = P1 + 0.0001 MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = P2 + 0.0001 MidTermFeatures = MidTermFeatures2 # TODO # SELECT FEATURES: #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20]; # SET 0A #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100]; # SET 0B #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96, # 97,98, 99,100]; # SET 0C iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53] # SET 1A #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; # SET 1B #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; # SET 1C #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]; # SET 2A #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; # SET 2B #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; # SET 2C #iFeaturesSelect = range(100); # SET 3 #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010 MidTermFeatures = MidTermFeatures[iFeaturesSelect, :] (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T]) MidTermFeaturesNorm = MidTermFeaturesNorm[0].T numOfWindows = MidTermFeatures.shape[1] # remove outliers: DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0) MDistancesAll = numpy.mean(DistancesAll) iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0] # TODO: Combine energy threshold for outlier removal: #EnergyMin = numpy.min(MidTermFeatures[1,:]) #EnergyMean = numpy.mean(MidTermFeatures[1,:]) #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0 #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0] #print iNonOutLiers perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows MidTermFeaturesNormOr = MidTermFeaturesNorm MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers] # LDA dimensionality reduction: if LDAdim > 0: #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin)); # extract mid-term features with minimum step: mtWinRatio = int(round(mtSize / stWin)) mtStepRatio = int(round(stWin / stWin)) mtFeaturesToReduce = [] numOfFeatures = len(ShortTermFeatures) numOfStatistics = 2 #for i in range(numOfStatistics * numOfFeatures + 1): for i in range(numOfStatistics * numOfFeatures): mtFeaturesToReduce.append([]) for i in range(numOfFeatures): # for each of the short-term features: curPos = 0 N = len(ShortTermFeatures[i]) while (curPos < N): N1 = curPos N2 = curPos + mtWinRatio if N2 > N: N2 = N curStFeatures = ShortTermFeatures[i][N1:N2] mtFeaturesToReduce[i].append(numpy.mean(curStFeatures)) mtFeaturesToReduce[i+numOfFeatures].append(numpy.std(curStFeatures)) curPos += mtStepRatio mtFeaturesToReduce = numpy.array(mtFeaturesToReduce) mtFeaturesToReduce2 = numpy.zeros((mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1])) for i in range(mtFeaturesToReduce.shape[1]): curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1 curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2 [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1) [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2) mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i] mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001 mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]+len(classNames1)::, i] = P2 + 0.0001 mtFeaturesToReduce = mtFeaturesToReduce2 mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :] #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010 (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T]) mtFeaturesToReduce = mtFeaturesToReduce[0].T #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0) #MDistancesAll = numpy.mean(DistancesAll) #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0] #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2] Labels = numpy.zeros((mtFeaturesToReduce.shape[1], )); LDAstep = 1.0 LDAstepRatio = LDAstep / stWin #print LDAstep, LDAstepRatio for i in range(Labels.shape[0]): Labels[i] = int(i*stWin/LDAstepRatio); clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=LDAdim) clf.fit(mtFeaturesToReduce.T, Labels) MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T if numOfSpeakers <= 0: sRange = range(2, 10) else: sRange = [numOfSpeakers] clsAll = [] silAll = [] centersAll = [] for iSpeakers in sRange: k_means = sklearn.cluster.KMeans(n_clusters = iSpeakers) k_means.fit(MidTermFeaturesNorm.T) cls = k_means.labels_ means = k_means.cluster_centers_ # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T)) clsAll.append(cls) centersAll.append(means) silA = []; silB = [] for c in range(iSpeakers): # for each speaker (i.e. for each extracted cluster) clusterPerCent = numpy.nonzero(cls==c)[0].shape[0] / float(len(cls)) if clusterPerCent < 0.020: silA.append(0.0) silB.append(0.0) else: MidTermFeaturesNormTemp = MidTermFeaturesNorm[:,cls==c] # get subset of feature vectors Yt = distance.pdist(MidTermFeaturesNormTemp.T) # compute average distance between samples that belong to the cluster (a values) silA.append(numpy.mean(Yt)*clusterPerCent) silBs = [] for c2 in range(iSpeakers): # compute distances from samples of other clusters if c2!=c: clusterPerCent2 = numpy.nonzero(cls==c2)[0].shape[0] / float(len(cls)) MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,cls==c2] Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T) silBs.append(numpy.mean(Yt)*(clusterPerCent+clusterPerCent2)/2.0) silBs = numpy.array(silBs) silB.append(min(silBs)) # ... and keep the minimum value (i.e. the distance from the "nearest" cluster) silA = numpy.array(silA); silB = numpy.array(silB); sil = [] for c in range(iSpeakers): # for each cluster (speaker) sil.append( ( silB[c] - silA[c]) / (max(silB[c], silA[c])+0.00001) ) # compute silhouette silAll.append(numpy.mean(sil)) # keep the AVERAGE SILLOUETTE #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5))) imax = numpy.argmax(silAll) # position of the maximum sillouette value nSpeakersFinal = sRange[imax] # optimal number of clusters # generate the final set of cluster labels # (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window) cls = numpy.zeros((numOfWindows,)) for i in range(numOfWindows): j = numpy.argmin(numpy.abs(i-iNonOutLiers)) cls[i] = clsAll[imax][j] # Post-process method 1: hmm smoothing for i in range(1): startprob, transmat, means, cov = trainHMM_computeStatistics(MidTermFeaturesNormOr, cls) hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag") # hmm training hmm.startprob_ = startprob hmm.transmat_ = transmat hmm.means_ = means; hmm.covars_ = cov cls = hmm.predict(MidTermFeaturesNormOr.T) # Post-process method 2: median filtering: cls = scipy.signal.medfilt(cls, 13) cls = scipy.signal.medfilt(cls, 11) sil = silAll[imax] # final sillouette classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]; # load ground-truth if available gtFile = fileName.replace('.wav', '.segments'); # open for annotated file if os.path.isfile(gtFile): # if groundturh exists [segStart, segEnd, segLabels] = readSegmentGT(gtFile) # read GT data flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep) # convert to flags if PLOT: fig = plt.figure() if numOfSpeakers>0: ax1 = fig.add_subplot(111) else: ax1 = fig.add_subplot(211) ax1.set_yticks(numpy.array(range(len(classNames)))) ax1.axis((0, Duration, -1, len(classNames))) ax1.set_yticklabels(classNames) ax1.plot(numpy.array(range(len(cls)))*mtStep+mtStep/2.0, cls) if os.path.isfile(gtFile): if PLOT: ax1.plot(numpy.array(range(len(flagsGT)))*mtStep+mtStep/2.0, flagsGT, 'r') purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT) print "{0:.1f}\t{1:.1f}".format(100*purityClusterMean, 100*puritySpeakerMean) if PLOT: plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100*purityClusterMean, 100*puritySpeakerMean) ) if PLOT: plt.xlabel("time (seconds)") #print sRange, silAll if numOfSpeakers<=0: plt.subplot(212) plt.plot(sRange, silAll) plt.xlabel("number of clusters"); plt.ylabel("average clustering's sillouette"); plt.show() return cls
def mtFileClassification(inputFile, modelName, modelType, plotResults=False, gtFile=""): ''' This function performs mid-term classification of an audio stream. Towards this end, supervised knowledge is used, i.e. a pre-trained classifier. ARGUMENTS: - inputFile: path of the input WAV file - modelName: name of the classification model - modelType: svm or knn depending on the classifier type - plotResults: True if results are to be plotted using matplotlib along with a set of statistics RETURNS: - segs: a sequence of segment's endpoints: segs[i] is the endpoint of the i-th segment (in seconds) - classes: a sequence of class flags: class[i] is the class ID of the i-th segment ''' if not os.path.isfile(modelName): print("mtFileClassificationError: input modelType not found!") return (-1, -1, -1, -1) # Load classifier: if (modelType == 'svm') or (modelType == 'svm_rbf'): [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadSVModel(modelName) elif modelType == 'knn': [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadKNNModel(modelName) elif modelType == 'randomforest': [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadRandomForestModel(modelName) elif modelType == 'gradientboosting': [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadGradientBoostingModel(modelName) elif modelType == 'extratrees': [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadExtraTreesModel(modelName) if computeBEAT: print("Model " + modelName + " contains long-term music features (beat etc) and cannot be used in segmentation") return (-1, -1, -1, -1) [Fs, x] = audioBasicIO.readAudioFile(inputFile) # load input file if Fs == -1: # could not read file return (-1, -1, -1, -1) # convert stereo (if) to mono x = audioBasicIO.stereo2mono(x) Duration = len(x) / Fs # mid-term feature extraction: [MidTermFeatures, _] = aF.mtFeatureExtraction( x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep)) flags = [] Ps = [] flagsInd = [] # for each feature vector (i.e. for each fix-sized segment): for i in range(MidTermFeatures.shape[1]): # normalize current feature vector curFV = (MidTermFeatures[:, i] - MEAN) / STD [Result, P] = aT.classifierWrapper( Classifier, modelType, curFV) # classify vector flagsInd.append(Result) # update class label matrix flags.append(classNames[int(Result)]) # update probability matrix Ps.append(numpy.max(P)) flagsInd = numpy.array(flagsInd) # 1-window smoothing for i in range(1, len(flagsInd) - 1): if flagsInd[i - 1] == flagsInd[i + 1]: flagsInd[i] = flagsInd[i + 1] # convert fix-sized flags to segments and classes (segs, classes) = flags2segs(flags, mtStep) segs[-1] = len(x) / float(Fs) # Load grount-truth: if os.path.isfile(gtFile): [segStartGT, segEndGT, segLabelsGT] = readSegmentGT(gtFile) flagsGT, classNamesGT = segs2flags( segStartGT, segEndGT, segLabelsGT, mtStep) flagsIndGT = [] for j, fl in enumerate(flagsGT): # "align" labels with GT if classNamesGT[flagsGT[j]] in classNames: flagsIndGT.append(classNames.index(classNamesGT[flagsGT[j]])) else: flagsIndGT.append(-1) flagsIndGT = numpy.array(flagsIndGT) CM = numpy.zeros((len(classNamesGT), len(classNamesGT))) for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])): CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1 else: CM = [] flagsIndGT = numpy.array([]) acc = plotSegmentationResults( flagsInd, flagsIndGT, classNames, mtStep, not plotResults) if acc >= 0: print("Overall Accuracy: {0:.3f}".format(acc)) return (flagsInd, classNamesGT, acc, CM) else: return (flagsInd, classNames, acc, CM)
def speakerDiarization(fileName, sRange=xrange(2, 10), mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=35): Fs, x = audioBasicIO.readAudioFile(fileName) x = audioBasicIO.stereo2mono(x) duration = len(x) / Fs Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1 = aT.loadKNNModel( os.path.join( '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerAll')) Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2 = aT.loadKNNModel( os.path.join( '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerFemaleMale')) MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction( x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5)) MidTermFeatures2 = numpy.zeros( (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1])) for i in range(MidTermFeatures.shape[1]): curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1 curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2 Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1) Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2) MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i] MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001 MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1):, i] = P2 + 0.0001 MidTermFeatures = MidTermFeatures2 iFeaturesSelect = range(8, 21) + range(41, 54) MidTermFeatures = MidTermFeatures[iFeaturesSelect, :] MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T]) MidTermFeaturesNorm = MidTermFeaturesNorm[0].T numOfWindows = MidTermFeatures.shape[1] DistancesAll = numpy.sum(distance.squareform( distance.pdist(MidTermFeaturesNorm.T)), axis=0) MDistancesAll = numpy.mean(DistancesAll) iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0] perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows MidTermFeaturesNormOr = MidTermFeaturesNorm MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers] if LDAdim > 0: mtWinRatio, mtStepRatio, mtFeaturesToReduce, numOfFeatures, numOfStatistics = int( round(mtSize / stWin)), int(round( stWin / stWin)), list(), len(ShortTermFeatures), 2 for i in range(numOfStatistics * numOfFeatures): mtFeaturesToReduce.append(list()) for i in range(numOfFeatures): curPos = 0 N = len(ShortTermFeatures[i]) while (curPos < N): N1, N2 = curPos, curPos + mtWinRatio if N2 > N: N2 = N curStFeatures = ShortTermFeatures[i][N1:N2] mtFeaturesToReduce[i].append(numpy.mean(curStFeatures)) mtFeaturesToReduce[i + numOfFeatures].append( numpy.std(curStFeatures)) curPos += mtStepRatio mtFeaturesToReduce = numpy.array(mtFeaturesToReduce) mtFeaturesToReduce2 = numpy.zeros( (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1])) for i in range(mtFeaturesToReduce.shape[1]): curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1 curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2 Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1) Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2) mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i] mtFeaturesToReduce2[ mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001 mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1):, i] = P2 + 0.0001 mtFeaturesToReduce = mtFeaturesToReduce2 mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :] mtFeaturesToReduce, MEAN, STD = aT.normalizeFeatures( [mtFeaturesToReduce.T]) mtFeaturesToReduce = mtFeaturesToReduce[0].T Labels = numpy.zeros((mtFeaturesToReduce.shape[1], )) LDAstep = 1.0 LDAstepRatio = LDAstep / stWin for i in range(Labels.shape[0]): Labels[i] = int(i * stWin / LDAstepRatio) clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis( n_components=LDAdim) clf.fit(mtFeaturesToReduce.T, Labels) MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T clsAll, silAll, centersAll = list(), list(), list() for iSpeakers in sRange: k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers) k_means.fit(MidTermFeaturesNorm.T) cls = k_means.labels_ means = k_means.cluster_centers_ clsAll.append(cls) centersAll.append(means) silA, silB = list(), list() for c in range(iSpeakers): clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float( len(cls)) if clusterPerCent < 0.02: silA.append(0.0) silB.append(0.0) else: MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c] Yt = distance.pdist(MidTermFeaturesNormTemp.T) silA.append(numpy.mean(Yt) * clusterPerCent) silBs = list() for c2 in range(iSpeakers): if c2 != c: clusterPerCent2 = numpy.nonzero( cls == c2)[0].shape[0] / float(len(cls)) MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2] Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T) silBs.append( numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0) silBs = numpy.array(silBs) silB.append(min(silBs)) silA, silB, sil = numpy.array(silA), numpy.array(silB), list() for c in range(iSpeakers): sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001)) silAll.append(numpy.mean(sil)) imax = numpy.argmax(silAll) nSpeakersFinal = sRange[imax] cls = numpy.zeros((numOfWindows, )) for i in range(numOfWindows): j = numpy.argmin(numpy.abs(i - iNonOutLiers)) cls[i] = clsAll[imax][j] startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls) hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag') hmm.startprob_ = startprob hmm.transmat_ = transmat hmm.means_ = means hmm.covars_ = cov cls = hmm.predict(MidTermFeaturesNormOr.T) cls = scipy.signal.medfilt(cls, 13) cls = scipy.signal.medfilt(cls, 11) sil = silAll[imax] classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)] return cls, classNames, duration, mtStep, silAll
def recordAnalyzeAudio(duration, outputWavFile, midTermBufferSizeSec, modelName, modelType): ''' recordAnalyzeAudio(duration, outputWavFile, midTermBufferSizeSec, modelName, modelType) This function is used to record and analyze audio segments, in a fix window basis. ARGUMENTS: - duration total recording duration - outputWavFile path of the output WAV file - midTermBufferSizeSec (fix)segment length in seconds - modelName classification model name - modelType classification model type ''' if modelType == 'svm': [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = aT.loadSVModel(modelName) elif modelType == 'knn': [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = aT.loadKNNModel(modelName) else: Classifier = None inp = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NONBLOCK) inp.setchannels(1) inp.setrate(Fs) inp.setformat(alsaaudio.PCM_FORMAT_S16_LE) inp.setperiodsize(512) midTermBufferSize = int(midTermBufferSizeSec * Fs) allData = [] midTermBuffer = [] curWindow = [] count = 0 while len(allData) < duration * Fs: # Read data from device l, data = inp.read() if l: for i in range(l): curWindow.append(audioop.getsample(data, 2, i)) if (len(curWindow) + len(midTermBuffer) > midTermBufferSize): samplesToCopyToMidBuffer = midTermBufferSize - \ len(midTermBuffer) else: samplesToCopyToMidBuffer = len(curWindow) midTermBuffer = midTermBuffer + \ curWindow[0:samplesToCopyToMidBuffer] del (curWindow[0:samplesToCopyToMidBuffer]) if len(midTermBuffer) == midTermBufferSize: count += 1 if Classifier != None: [mtFeatures, stFeatures] = aF.mtFeatureExtraction(midTermBuffer, Fs, 2.0 * Fs, 2.0 * Fs, 0.020 * Fs, 0.020 * Fs) curFV = (mtFeatures[:, 0] - MEAN) / STD [result, P] = aT.classifierWrapper(Classifier, modelType, curFV) print(classNames[int(result)]) allData = allData + midTermBuffer plt.clf() plt.plot(midTermBuffer) plt.show(block=False) plt.draw() midTermBuffer = [] allDataArray = numpy.int16(allData) wavfile.write(outputWavFile, Fs, allDataArray)
def speakerDiarization(fileName, sRange = xrange(2, 10), mtSize = 2.0, mtStep = 0.2, stWin = 0.05, LDAdim = 35): Fs, x = audioBasicIO.readAudioFile(fileName) x = audioBasicIO.stereo2mono(x) duration = len(x) / Fs Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1 = aT.loadKNNModel(os.path.join('/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerAll')) Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2 = aT.loadKNNModel(os.path.join('/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerFemaleMale')) MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5)) MidTermFeatures2 = numpy.zeros((MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1])) for i in range(MidTermFeatures.shape[1]): curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1 curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2 Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1) Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2) MidTermFeatures2[0: MidTermFeatures.shape[0], i] = MidTermFeatures[:, i] MidTermFeatures2[MidTermFeatures.shape[0]: MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001 MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1):, i] = P2 + 0.0001 MidTermFeatures = MidTermFeatures2 iFeaturesSelect = range(8, 21) + range(41, 54) MidTermFeatures = MidTermFeatures[iFeaturesSelect, :] MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T]) MidTermFeaturesNorm = MidTermFeaturesNorm[0].T numOfWindows = MidTermFeatures.shape[1] DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis = 0) MDistancesAll = numpy.mean(DistancesAll) iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0] perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows MidTermFeaturesNormOr = MidTermFeaturesNorm MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers] if LDAdim > 0: mtWinRatio, mtStepRatio, mtFeaturesToReduce, numOfFeatures, numOfStatistics = int(round(mtSize / stWin)), int(round(stWin / stWin)), list(), len(ShortTermFeatures), 2 for i in range(numOfStatistics * numOfFeatures): mtFeaturesToReduce.append(list()) for i in range(numOfFeatures): curPos = 0 N = len(ShortTermFeatures[i]) while (curPos < N): N1, N2 = curPos, curPos + mtWinRatio if N2 > N: N2 = N curStFeatures = ShortTermFeatures[i][N1: N2] mtFeaturesToReduce[i].append(numpy.mean(curStFeatures)) mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures)) curPos += mtStepRatio mtFeaturesToReduce = numpy.array(mtFeaturesToReduce) mtFeaturesToReduce2 = numpy.zeros((mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1])) for i in range(mtFeaturesToReduce.shape[1]): curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1 curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2 Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1) Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2) mtFeaturesToReduce2[0: mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i] mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]: mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001 mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1):, i] = P2 + 0.0001 mtFeaturesToReduce = mtFeaturesToReduce2 mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :] mtFeaturesToReduce, MEAN, STD = aT.normalizeFeatures([mtFeaturesToReduce.T]) mtFeaturesToReduce = mtFeaturesToReduce[0].T Labels = numpy.zeros((mtFeaturesToReduce.shape[1], )) LDAstep = 1.0 LDAstepRatio = LDAstep / stWin for i in range(Labels.shape[0]): Labels[i] = int(i * stWin / LDAstepRatio) clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components = LDAdim) clf.fit(mtFeaturesToReduce.T, Labels) MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T clsAll, silAll, centersAll = list(), list(), list() for iSpeakers in sRange: k_means = sklearn.cluster.KMeans(n_clusters = iSpeakers) k_means.fit(MidTermFeaturesNorm.T) cls = k_means.labels_ means = k_means.cluster_centers_ clsAll.append(cls) centersAll.append(means) silA, silB = list(), list() for c in range(iSpeakers): clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls)) if clusterPerCent < 0.02: silA.append(0.0) silB.append(0.0) else: MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c] Yt = distance.pdist(MidTermFeaturesNormTemp.T) silA.append(numpy.mean(Yt) * clusterPerCent) silBs = list() for c2 in range(iSpeakers): if c2 != c: clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls)) MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2] Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T) silBs.append(numpy.mean(Yt) * (clusterPerCent+clusterPerCent2) / 2.0) silBs = numpy.array(silBs) silB.append(min(silBs)) silA, silB, sil = numpy.array(silA), numpy.array(silB), list() for c in range(iSpeakers): sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001)) silAll.append(numpy.mean(sil)) imax = numpy.argmax(silAll) nSpeakersFinal = sRange[imax] cls = numpy.zeros((numOfWindows, )) for i in range(numOfWindows): j = numpy.argmin(numpy.abs(i - iNonOutLiers)) cls[i] = clsAll[imax][j] startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls) hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag') hmm.startprob_ = startprob hmm.transmat_ = transmat hmm.means_ = means hmm.covars_ = cov cls = hmm.predict(MidTermFeaturesNormOr.T) cls = scipy.signal.medfilt(cls, 13) cls = scipy.signal.medfilt(cls, 11) sil = silAll[imax] classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)] return cls, classNames, duration, mtStep, silAll
def classifyFolderWrapper(inputFolder, modelType, modelName, outputMode=False): if not os.path.isfile(modelName): raise Exception("Input modelName not found!") if modelType == 'svm': [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = aT.loadSVModel(modelName) elif modelType == 'knn': [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = aT.loadKNNModel(modelName) PsAll = numpy.zeros((len(classNames), )) files = "*.wav" if os.path.isdir(inputFolder): strFilePattern = os.path.join(inputFolder, files) else: strFilePattern = inputFolder + files wavFilesList = [] wavFilesList.extend(glob.glob(strFilePattern)) wavFilesList = sorted(wavFilesList) if len(wavFilesList) == 0: print("No WAV files found!") return Results = [] for wavFile in wavFilesList: [Fs, x] = audioBasicIO.readAudioFile(wavFile) signalLength = x.shape[0] / float(Fs) [Result, P, classNames] = aT.fileClassification(wavFile, modelName, modelType) PsAll += (numpy.array(P) * signalLength) Result = int(Result) Results.append(Result) if outputMode: print("{0:s}\t{1:s}".format(wavFile, classNames[Result])) Results = numpy.array(Results) # print distribution of classes: [Histogram, _] = numpy.histogram(Results, bins=numpy.arange(len(classNames) + 1)) if outputMode: for i, h in enumerate(Histogram): print("{0:20s}\t\t{1:d}".format(classNames[i], h)) PsAll = PsAll / numpy.sum(PsAll) if outputMode: fig = plt.figure() ax = fig.add_subplot(111) plt.title("Classes percentage " + inputFolder.replace('Segments', '')) ax.axis((0, len(classNames) + 1, 0, 1)) ax.set_xticks(numpy.array(range(len(classNames) + 1))) ax.set_xticklabels([" "] + classNames) ax.bar(numpy.array(range(len(classNames))) + 0.5, PsAll) plt.show() return classNames, PsAll