Exemplo n.º 1
0
def mtFileClassification(inputFile, modelName, modelType, plotResults = False, gtFile = ""):
	'''
	This function performs mid-term classification of an audio stream.
	Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
	ARGUMENTS:
		- inputFile:		path of the input WAV file
		- modelName:		name of the classification model
		- modelType:		svm or knn depending on the classifier type
		- plotResults:		True if results are to be plotted using matplotlib along with a set of statistics
	
	RETURNS:
	  	- segs:			a sequence of segment's endpoints: segs[i] is the endpoint of the i-th segment (in seconds)
		- classes:		a sequence of class flags: class[i] is the class ID of the i-th segment
	'''

	if not os.path.isfile(modelName):
		print "mtFileClassificationError: input modelType not found!"
		return (-1,-1,-1)
	# Load classifier:
	if modelType=='svm':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadSVModel(modelName)
	elif modelType=='knn':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadKNNModel(modelName)
	if computeBEAT:
		print "Model " + modelName + " contains long-term music features (beat etc) and cannot be used in segmentation"	
		return (-1,-1,-1)
	[Fs, x] = audioBasicIO.readAudioFile(inputFile)					# load input file
	if Fs == -1:									# could not read file
		return  (-1,-1,-1)
	x = audioBasicIO.stereo2mono(x);						# convert stereo (if) to mono
	Duration = len(x) / Fs					
											# mid-term feature extraction:
	[MidTermFeatures, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs*stWin), round(Fs*stStep));
	flags = []; Ps = []; flagsInd = []
	for i in range(MidTermFeatures.shape[1]): 					# for each feature vector (i.e. for each fix-sized segment):
		curFV = (MidTermFeatures[:, i] - MEAN) / STD;				# normalize current feature vector					
		[Result, P] = aT.classifierWrapper(Classifier, modelType, curFV)	# classify vector
		flagsInd.append(Result)
		flags.append(classNames[int(Result)])					# update class label matrix
		Ps.append(numpy.max(P))							# update probability matrix
	flagsInd = numpy.array(flagsInd)

	# 1-window smoothing
	for i in range(1, len(flagsInd)-1):
		if flagsInd[i-1]==flagsInd[i+1]:
			flagsInd[i] = flagsInd[i+1]
	(segs, classes) = flags2segs(flags, mtStep)					# convert fix-sized flags to segments and classes
	segs[-1] = len(x) / float(Fs)

	# Load grount-truth:
	if os.path.isfile(gtFile):
		[segStartGT, segEndGT, segLabelsGT] = readSegmentGT(gtFile)		
		flagsGT, classNamesGT = segs2flags(segStartGT, segEndGT, segLabelsGT, mtStep)
		flagsIndGT = []
		for j, fl in enumerate(flagsGT):					# "align" labels with GT
			if classNamesGT[flagsGT[j]] in classNames:
				flagsIndGT.append( classNames.index( classNamesGT[flagsGT[j]] ) )
			else:
				flagsIndGT.append( -1 )
		flagsIndGT = numpy.array(flagsIndGT)
	else:
		flagsIndGT = numpy.array([])
	acc = plotSegmentationResults(flagsInd, flagsIndGT, classNames, mtStep, not plotResults)
	if acc>=0:
		print "Overall Accuracy: {0:.3f}".format(acc)
	return (flagsInd, classNames, acc)
Exemplo n.º 2
0
def speakerDiarization(fileName, numOfSpeakers, mtSize = 2.0, mtStep=0.2, stWin=0.05, LDAdim = 35, PLOT = False):
	'''
	ARGUMENTS:
		- fileName:		the name of the WAV file to be analyzed
		- numOfSpeakers	the number of speakers (clusters) in the recording (<=0 for unknown)
		- mtSize (opt)	mid-term window size
		- mtStep (opt)	mid-term window step
		- stWin  (opt)	short-term window size
		- LDAdim (opt)	LDA dimension (0 for no LDA)
		- PLOT	 (opt)	0 for not plotting the results 1 for plottingy
	'''
	[Fs, x] = audioBasicIO.readAudioFile(fileName)
	x = audioBasicIO.stereo2mono(x);
	Duration = len(x) / Fs

	[Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel("data/knnSpeakerAll")
	[Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel("data/knnSpeakerFemaleMale")

	[MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs*stWin), round(Fs*stWin*0.5));

	MidTermFeatures2 = numpy.zeros( (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1] ) )

	for i in range(MidTermFeatures.shape[1]):
		curF1 = (MidTermFeatures[:,i] - MEAN1)  / STD1
		curF2 = (MidTermFeatures[:,i] - MEAN2)  / STD2
		[Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
		[Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
		MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
		MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0]+len(classNames1), i] = P1 + 0.0001;
		MidTermFeatures2[MidTermFeatures.shape[0]+len(classNames1)::, i] = P2 + 0.0001;
	
	MidTermFeatures = MidTermFeatures2	# TODO	
	# SELECT FEATURES:
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20]; 																											# SET 0A
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100]; 																									# SET 0B
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 0C
	
	iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53]; 																	# SET 1A
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; 															# SET 1B
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 1C
	
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]; 			# SET 2A		
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; 	# SET 2B
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 2C
	
	#iFeaturesSelect = range(100);																									# SET 3	
	#MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010  
	
	MidTermFeatures = MidTermFeatures[iFeaturesSelect,:]		
	
	(MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
	MidTermFeaturesNorm = MidTermFeaturesNorm[0].T	
	numOfWindows = MidTermFeatures.shape[1]

	# remove outliers:
	DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
	MDistancesAll = numpy.mean(DistancesAll)
	iNonOutLiers = numpy.nonzero(DistancesAll < 1.2*MDistancesAll)[0]
	
	# TODO: Combine energy threshold for outlier removal:
	#EnergyMin = numpy.min(MidTermFeatures[1,:])
	#EnergyMean = numpy.mean(MidTermFeatures[1,:])
	#Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
	#iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
	#print iNonOutLiers

	perOutLier = (100.0*(numOfWindows-iNonOutLiers.shape[0])) / numOfWindows	
	MidTermFeaturesNormOr = MidTermFeaturesNorm
	MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]
	
	# LDA dimensionality reduction:
	if LDAdim > 0:
		#[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));		
		# extract mid-term features with minimum step:
		mtWinRatio  = int(round(mtSize  / stWin));
		mtStepRatio = int(round(stWin / stWin));
		mtFeaturesToReduce = []			
		numOfFeatures = len(ShortTermFeatures)
		numOfStatistics = 2;			
		#for i in range(numOfStatistics * numOfFeatures + 1):
		for i in range(numOfStatistics * numOfFeatures):
			mtFeaturesToReduce.append([])

		for i in range(numOfFeatures):		# for each of the short-term features:
			curPos = 0
			N = len(ShortTermFeatures[i])
			while (curPos<N):
				N1 = curPos
				N2 = curPos + mtWinRatio
				if N2 > N:
					N2 = N
				curStFeatures = ShortTermFeatures[i][N1:N2]
				mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
				mtFeaturesToReduce[i+numOfFeatures].append(numpy.std(curStFeatures))				
				curPos += mtStepRatio		
		mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
				
		mtFeaturesToReduce2 = numpy.zeros( (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1] ) )
		for i in range(mtFeaturesToReduce.shape[1]):
			curF1 = (mtFeaturesToReduce[:,i] - MEAN1)  / STD1
			curF2 = (mtFeaturesToReduce[:,i] - MEAN2)  / STD2
			[Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
			[Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
			mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0]+len(classNames1), i] = P1 + 0.0001;
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]+len(classNames1)::, i] = P2 + 0.0001;
		mtFeaturesToReduce = mtFeaturesToReduce2		
		mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect,:]		
		#mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
		(mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])	
		mtFeaturesToReduce = mtFeaturesToReduce[0].T
		#DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
		#MDistancesAll = numpy.mean(DistancesAll)
		#iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
		#mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
		Labels = numpy.zeros((mtFeaturesToReduce.shape[1],));
		LDAstep = 1.0
		LDAstepRatio = LDAstep / stWin
		#print LDAstep, LDAstepRatio
		for i in range(Labels.shape[0]):
			Labels[i] = int(i*stWin/LDAstepRatio);		
		clf = LDA(n_components=LDAdim)
		clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
		MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

	if numOfSpeakers<=0:
		sRange = range(2,10)
	else:
		sRange = [numOfSpeakers]
	clsAll = []; silAll = []; centersAll = []
	
	for iSpeakers in sRange:
		cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)		# perform k-means clustering
		
		#YDist =   distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
		#print distance.squareform(YDist).shape
		#hc = mlpy.HCluster()
		#hc.linkage(YDist)
		#cls = hc.cut(14.5)
		#print cls

		# Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
		clsAll.append(cls)
		centersAll.append(means)
		silA = []; silB = []
		for c in range(iSpeakers):								# for each speaker (i.e. for each extracted cluster)
			clusterPerCent = numpy.nonzero(cls==c)[0].shape[0] / float(len(cls))
			if clusterPerCent < 0.020:
				silA.append(0.0)
				silB.append(0.0)
			else:
				MidTermFeaturesNormTemp = MidTermFeaturesNorm[:,cls==c]			# get subset of feature vectors
				Yt = distance.pdist(MidTermFeaturesNormTemp.T)				# compute average distance between samples that belong to the cluster (a values)
				silA.append(numpy.mean(Yt)*clusterPerCent)
				silBs = []
				for c2 in range(iSpeakers):						# compute distances from samples of other clusters
					if c2!=c:
						clusterPerCent2 = numpy.nonzero(cls==c2)[0].shape[0] / float(len(cls))
						MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,cls==c2]
						Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
						silBs.append(numpy.mean(Yt)*(clusterPerCent+clusterPerCent2)/2.0)
				silBs = numpy.array(silBs)							
				silB.append(min(silBs))							# ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
		silA = numpy.array(silA); 
		silB = numpy.array(silB); 
		sil = []
		for c in range(iSpeakers):								# for each cluster (speaker)
			sil.append( ( silB[c] - silA[c]) / (max(silB[c],  silA[c])+0.00001)  )		# compute silhouette

		silAll.append(numpy.mean(sil))								# keep the AVERAGE SILLOUETTE

	#silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
	imax = numpy.argmax(silAll)									# position of the maximum sillouette value
	nSpeakersFinal = sRange[imax]									# optimal number of clusters

	# generate the final set of cluster labels
	# (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
	cls = numpy.zeros((numOfWindows,))
	for i in range(numOfWindows):
		j = numpy.argmin(numpy.abs(i-iNonOutLiers))		
		cls[i] = clsAll[imax][j]
		
	# Post-process method 1: hmm smoothing
	for i in range(1):
		startprob, transmat, means, cov = trainHMM_computeStatistics(MidTermFeaturesNormOr, cls)
		hmm = sklearn.hmm.GaussianHMM(startprob.shape[0], "diag", startprob, transmat)			# hmm training
		hmm.means_ = means; hmm.covars_ = cov
		cls = hmm.predict(MidTermFeaturesNormOr.T)					
	
	# Post-process method 2: median filtering:
	cls = scipy.signal.medfilt(cls, 13)
	cls = scipy.signal.medfilt(cls, 11)

	sil = silAll[imax]										# final sillouette
	classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)];


	# load ground-truth if available
	gtFile = fileName.replace('.wav', '.segments');							# open for annotated file
	if os.path.isfile(gtFile):									# if groundturh exists
		[segStart, segEnd, segLabels] = readSegmentGT(gtFile)					# read GT data
		flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)			# convert to flags

	if PLOT:
		fig = plt.figure()	
		if numOfSpeakers>0:
			ax1 = fig.add_subplot(111)
		else:
			ax1 = fig.add_subplot(211)
		ax1.set_yticks(numpy.array(range(len(classNames))))
		ax1.axis((0, Duration, -1, len(classNames)))
		ax1.set_yticklabels(classNames)
		ax1.plot(numpy.array(range(len(cls)))*mtStep+mtStep/2.0, cls)

	if os.path.isfile(gtFile):
		if PLOT:
			ax1.plot(numpy.array(range(len(flagsGT)))*mtStep+mtStep/2.0, flagsGT, 'r')
		purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT)
		print "{0:.1f}\t{1:.1f}".format(100*purityClusterMean, 100*puritySpeakerMean)
		if PLOT:
			plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100*purityClusterMean, 100*puritySpeakerMean) )
	if PLOT:
		plt.xlabel("time (seconds)")
		#print sRange, silAll	
		if numOfSpeakers<=0:
			plt.subplot(212)
			plt.plot(sRange, silAll)
			plt.xlabel("number of clusters");
			plt.ylabel("average clustering's sillouette");
		plt.show()
def speakerDiarization(filename,
                       n_speakers,
                       mt_size=2.0,
                       mt_step=0.2,
                       st_win=0.05,
                       lda_dim=35,
                       plot_res=False):
    '''
	ARGUMENTS:
		- filename:        the name of the WAV file to be analyzed
		- n_speakers    the number of speakers (clusters) in the recording (<=0 for unknown)
		- mt_size (opt)     mid-term window size
		- mt_step (opt)     mid-term window step
		- st_win  (opt)     short-term window size
		- lda_dim (opt)     LDA dimension (0 for no LDA)
		- plot_res     (opt)   0 for not plotting the results 1 for plottingy
	'''
    [fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / fs

    # [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.load_model_knn(os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "knnSpeakerAll"))
    # [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.load_model_knn(os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "knnSpeakerFemaleMale"))
    [
        classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
        stStep1, computeBEAT1
    ] = aT.load_model_knn("data/knnSpeakerAll")
    [
        classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
        stStep2, computeBEAT2
    ] = aT.load_model_knn("data/knnSpeakerFemaleMale")

    [mt_feats, st_feats, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
                                                     mt_step * fs,
                                                     round(fs * st_win),
                                                     round(fs * st_win * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (mt_feats.shape[0] + len(classNames1) + len(classNames2),
         mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::,
                         i] = P2 + 0.0001

    mt_feats = MidTermFeatures2  # TODO
    iFeaturesSelect = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]

    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                         axis=0)
    m_dist_all = numpy.mean(dist_all)
    i_non_outliers = numpy.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(mt_feats[1,:])
    #EnergyMean = numpy.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = numpy.nonzero(mt_feats[1,:] > Thres)[0]
    #print i_non_outliers

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs, st_win * fs, round(fs*st_win), round(fs*st_win));
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        #for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        for i in range(
                num_of_features):  # for each of the short-term features:
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(numpy.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].append(
                    numpy.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = numpy.array(mt_feats_to_red)
        mt_feats_to_red_2 = numpy.zeros(
            (mt_feats_to_red.shape[0] + len(classNames1) + len(classNames2),
             mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0],
                              i] = mt_feats_to_red[:, i]
            mt_feats_to_red_2[
                mt_feats_to_red.shape[0]:mt_feats_to_red.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::,
                              i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += numpy.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN,
         STD) = aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = numpy.mean(dist_all)
        #iNonOutLiers2 = numpy.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = numpy.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = numpy.nonzero(cls == c)[0].shape[0] / \
                 float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(numpy.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = numpy.nonzero(cls == c2)[0].shape[0] /\
                               float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(
                            numpy.mean(Yt) *
                            (clust_per_cent + clust_per_cent_2) / 2.0)
                silBs = numpy.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = numpy.array(sil_1)
        sil_2 = numpy.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append(
                (sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILLOUETTE
        sil_all.append(numpy.mean(sil))

    imax = numpy.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = numpy.zeros((n_wins, ))
    for i in range(n_wins):
        j = numpy.argmin(numpy.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
         trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if groundturh exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(numpy.array(range(len(cls))) * mt_step + mt_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(
                numpy.array(range(len(flags_gt))) * mt_step + mt_step / 2.0,
                flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
         evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        #print s_range, sil_all
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        #plt.show()
        plt.savefig('output/outImg.jpg')
    return cls
Exemplo n.º 4
0
def speaker_diarization(file_name, num_speaker, mt_size=2.0,
                        mt_step=0.2, st_win=0.05, st_step=0.025,
                        lda_dim=35,
                        plot=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT     (opt)   0 for not plotting the results 1 for plottingy
    '''
    fr, x = audio_basic_io.read_audio_file(file_name)
    x = audio_basic_io.stereo2mono(x)
    duration = len(x) / fr

    classifier1, mean1, std1, class_names1, mt_win1, mt_step1, st_win1, st_step1, compute_beta1 = aT.loadKNNModel(
        os.path.join("data", "knnSpeakerAll"))
    classifier2, mean2, std2, class_names2, mt_win2, mt_step2, st_win2, st_step2, compute_beta2 = aT.loadKNNModel(
        os.path.join("data", "knnSpeakerFemaleMale"))

    mid_term_features, short_term_features = aF.mt_feature_extraction(signal=x,
                                                                      fr=fr,
                                                                      mt_win=mt_size * fr,
                                                                      mt_step=mt_step * fr,
                                                                      st_win=round(fr * st_win),
                                                                      st_step=round(fr * st_step))

    # (68, 329) (34, 2630)
    print(mid_term_features.shape, short_term_features.shape)
    mid_term_features2 = np.zeros((mid_term_features.shape[0] + len(class_names1) + len(class_names2),
                                   mid_term_features.shape[1]))

    for i in range(mid_term_features.shape[1]):
        cur_f1 = (mid_term_features[:, i] - mean1) / std1
        cur_f2 = (mid_term_features[:, i] - mean2) / std2
        result, p1 = aT.classifierWrapper(classifier1, "knn", cur_f1)
        result, p2 = aT.classifierWrapper(classifier2, "knn", cur_f2)
        mid_term_features2[0:mid_term_features.shape[0], i] = mid_term_features[:, i]
        mid_term_features2[mid_term_features.shape[0]:mid_term_features.shape[0] + len(class_names1), i] = p1 + 0.0001
        mid_term_features2[mid_term_features.shape[0] + len(class_names1)::, i] = p2 + 0.0001

    mid_term_features = mid_term_features2  # TODO
    # SELECT FEATURES:
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];     # SET 0A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];    # SET 0B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,
    # 74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    # 97,98, 99,100];     # SET 0C

    i_features_select = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                         42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]  # SET 1A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; # SET 1B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,
    # 48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,
    # 87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C

    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,
    # 36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];  # SET 2A
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,
    # 36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];     # SET 2B
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,
    # 36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,
    # 76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];  # SET 2C

    # iFeaturesSelect = range(100);   # SET 3
    # MidTermFeatures += np.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    mid_term_features = mid_term_features[i_features_select, :]

    mid_term_features_norm, mean, std = aT.normalizeFeatures([mid_term_features.T])
    mid_term_features_norm = mid_term_features_norm[0].T
    num_of_windows = mid_term_features.shape[1]

    # remove outliers:
    distances_all = np.sum(distance.squareform(distance.pdist(mid_term_features_norm.T)), axis=0)
    m_distances_all = np.mean(distances_all)
    i_non_out_liers = np.nonzero(distances_all < 1.2 * m_distances_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(MidTermFeatures[1,:])
    # EnergyMean = np.mean(MidTermFeatures[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # iNonOutLiers = np.nonzero(MidTermFeatures[1,:] > Thres)[0]
    # print(iNonOutLiers

    # per_out_lier = (100.0 * (num_of_windows - i_non_out_liers.shape[0])) / num_of_windows
    mid_term_features_norm_or = mid_term_features_norm
    mid_term_features_norm = mid_term_features_norm[:, i_non_out_liers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_features_to_reduce = []
        num_of_features = len(short_term_features)
        num_of_statistics = 2
        # for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(num_of_statistics * num_of_features):
            mt_features_to_reduce.append([])

        for i in range(num_of_features):  # for each of the short-term features:
            cur_pos = 0
            n = len(short_term_features[i])
            while cur_pos < n:
                n1 = cur_pos
                n2 = cur_pos + mt_win_ratio
                if n2 > n:
                    n2 = n
                cur_st_features = short_term_features[i][n1:n2]
                mt_features_to_reduce[i].append(np.mean(cur_st_features))
                mt_features_to_reduce[i + num_of_features].append(np.std(cur_st_features))
                cur_pos += mt_step_ratio
        mt_features_to_reduce = np.array(mt_features_to_reduce)
        mt_features_to_reduce2 = np.zeros((mt_features_to_reduce.shape[0] + len(class_names1) + len(class_names2),
                                           mt_features_to_reduce.shape[1]))
        for i in range(mt_features_to_reduce.shape[1]):
            cur_f1 = (mt_features_to_reduce[:, i] - mean1) / std1
            cur_f2 = (mt_features_to_reduce[:, i] - mean2) / std2
            result, p1 = aT.classifierWrapper(classifier1, "knn", cur_f1)
            result, p2 = aT.classifierWrapper(classifier2, "knn", cur_f2)
            mt_features_to_reduce2[0:mt_features_to_reduce.shape[0], i] = mt_features_to_reduce[:, i]
            mt_features_to_reduce2[mt_features_to_reduce.shape[0]:mt_features_to_reduce.shape[0] + len(class_names1),
            i] = p1 + 0.0001
            mt_features_to_reduce2[mt_features_to_reduce.shape[0] + len(class_names1)::, i] = p2 + 0.0001
        mt_features_to_reduce = mt_features_to_reduce2
        mt_features_to_reduce = mt_features_to_reduce[i_features_select, :]
        # mtFeaturesToReduce += np.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        mt_features_to_reduce, mean, std = aT.normalizeFeatures([mt_features_to_reduce.T])
        mt_features_to_reduce = mt_features_to_reduce[0].T
        # DistancesAll = np.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        # MDistancesAll = np.mean(DistancesAll)
        # iNonOutLiers2 = np.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        # mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        labels = np.zeros((mt_features_to_reduce.shape[1],))
        lda_step = 1.0
        lda_step_ratio = lda_step / st_win
        # print(LDAstep, LDAstepRatio
        for i in range(labels.shape[0]):
            labels[i] = int(i * st_win / lda_step_ratio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_features_to_reduce.T, labels)
        mid_term_features_norm = (clf.transform(mid_term_features_norm.T)).T

    if num_speaker <= 0:
        s_range = range(2, 10)
    else:
        s_range = [num_speaker]
    cls_all = []
    sil_all = []
    centers_all = []
    # (26, 314)
    print('mid_term_features_norm', mid_term_features_norm.shape)
    for i_speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=i_speakers)
        k_means.fit(mid_term_features_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        cls_all.append(cls)
        centers_all.append(means)
        sil_a = []
        sil_b = []
        for c in range(i_speakers):  # for each speaker (i.e. for each extracted cluster)
            cluster_percent = np.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if cluster_percent < 0.020:
                sil_a.append(0.0)
                sil_b.append(0.0)
            else:
                mid_term_features_norm_temp = mid_term_features_norm[:, cls == c]  # get subset of feature vectors
                # compute average distance between samples that belong to the cluster (a values)
                yt = distance.pdist(mid_term_features_norm_temp.T)
                sil_a.append(np.mean(yt) * cluster_percent)
                sil_bs = []
                for c2 in range(i_speakers):  # compute distances from samples of other clusters
                    if c2 != c:
                        cluster_percent2 = np.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        mid_term_features_norm_temp2 = mid_term_features_norm[:, cls == c2]
                        yt = distance.cdist(mid_term_features_norm_temp.T, mid_term_features_norm_temp2.T)
                        sil_bs.append(np.mean(yt) * (cluster_percent + cluster_percent2) / 2.0)
                sil_bs = np.array(sil_bs)
                # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
                sil_b.append(min(sil_bs))
        sil_a = np.array(sil_a)
        sil_b = np.array(sil_b)
        sil = []
        for c in range(i_speakers):  # for each cluster (speaker)
            sil.append((sil_b[c] - sil_a[c]) / (max(sil_b[c], sil_a[c]) + 0.00001))  # compute silhouette
        sil_all.append(np.mean(sil))  # keep the AVERAGE SILLOUETTE

    # silAll = silAll * (1.0/(np.power(np.array(sRange),0.5)))
    imax = np.argmax(sil_all)  # position of the maximum sillouette value
    n_speakers_final = s_range[imax]  # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their nearest non-outlier window)
    cls = np.zeros((num_of_windows,))
    for i in range(num_of_windows):
        j = np.argmin(np.abs(i - i_non_out_liers))
        cls[i] = cls_all[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(mid_term_features_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")  # hmm training
        hmm.startprob_ = startprob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mid_term_features_norm_or.T)

        # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]  # final sillouette
    class_names = ["speaker{0:d}".format(c) for c in range(n_speakers_final)]

    # load ground-truth if available
    gt_file = file_name.replace('.wav', '.segments')  # open for annotated file
    if os.path.isfile(gt_file):  # if groundturh exists
        seg_start, seg_end, seg_labels = readSegmentGT(gt_file)  # read GT data
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labels, mt_step)  # convert to flags

    x = np.arange(len(cls)) * mt_step + mt_step / 2.0
    if plot:
        fig = plt.figure()
        if num_speaker > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(x, cls)

    if os.path.isfile(gt_file):
        if plot:
            ax1.plot(np.array(range(len(flags_gt))) * mt_step + mt_step / 2.0, flags_gt, 'r')
        purity_cluster_mean, purity_speaker_mean = evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_mean, 100 * purity_speaker_mean))
        if plot:
            plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100 * purity_cluster_mean,
                                                                                   100 * purity_speaker_mean))
    if plot:
        plt.xlabel("time (seconds)")
        # print(sRange, silAll)
        if num_speaker <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
    return x, cls
def mtFileClassification(input_file,
                         model_name,
                         model_type,
                         plot_results=False,
                         gt_file=""):
    '''
	This function performs mid-term classification of an audio stream.
	Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
	ARGUMENTS:
		- input_file:        path of the input WAV file
		- model_name:        name of the classification model
		- model_type:        svm or knn depending on the classifier type
		- plot_results:      True if results are to be plotted using
							 matplotlib along with a set of statistics

	RETURNS:
		  - segs:           a sequence of segment's endpoints: segs[i] is the
							endpoint of the i-th segment (in seconds)
		  - classes:        a sequence of class flags: class[i] is the
							class ID of the i-th segment
	'''

    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step, compute_beat] = \
         aT.load_model_knn(model_name)
    else:
        [
            classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
            st_step, compute_beat
        ] = aT.load_model(model_name)

    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in "
              "segmentation")
        return (-1, -1, -1, -1)
    [fs, x] = audioBasicIO.readAudioFile(input_file)  # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono
    duration = len(x) / fs
    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                              round(fs * st_win),
                                              round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    for i in range(
            mt_feats.shape[1]
    ):  # for each feature vector (i.e. for each fix-sized segment):
        cur_fv = (mt_feats[:, i] -
                  MEAN) / STD  # normalize current feature vector
        [res, P] = aT.classifierWrapper(classifier, model_type,
                                        cur_fv)  # classify vector
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(numpy.max(P))  # update probability matrix
    flags_ind = numpy.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load grount-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(
                    class_names.index(class_names_gt[flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = numpy.array(flags_ind_gt)
        cm = numpy.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        cm = []
        flags_ind_gt = numpy.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt, class_names,
                                  mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
def recordAnalyzeAudio(duration, outputWavFile, midTermBufferSizeSec,
                       modelName, modelType):
    '''
	recordAnalyzeAudio(duration, outputWavFile, midTermBufferSizeSec, modelName, modelType)

	This function is used to record and analyze audio segments, in a fix window basis.

	ARGUMENTS: 
	- duration			total recording duration
	- outputWavFile			path of the output WAV file
	- midTermBufferSizeSec		(fix)segment length in seconds
	- modelName			classification model name
	- modelType			classification model type

	'''
    if modelType == 'neuralnet':
        neuralNetClassidication(duration, midTermBufferSizeSec, modelName)
    else:

        if modelType == 'svm':
            [
                Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin,
                stStep, computeBEAT
            ] = aT.loadSVModel(modelName)
        elif modelType == 'knn':
            [
                Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin,
                stStep, computeBEAT
            ] = aT.loadKNNModel(modelName)
        else:
            Classifier = None

        inp = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NONBLOCK)
        inp.setchannels(1)
        inp.setrate(Fs)
        inp.setformat(alsaaudio.PCM_FORMAT_S16_LE)
        inp.setperiodsize(512)
        midTermBufferSize = int(midTermBufferSizeSec * Fs)
        allData = []
        midTermBuffer = []
        curWindow = []
        count = 0
        # a sequence of samples
        # process a sequence
        # speed
        # emergency vehicle detection what have they done? emergency vehicle classification patents
        # plot features!!!
        # patents extracted!
        # latex literature review
        # writing a paper
        #
        while len(allData) < duration * Fs:
            # Read data from device
            l, data = inp.read()
            if l:
                for i in range(l):
                    curWindow.append(audioop.getsample(data, 2, i))
                if (len(curWindow) + len(midTermBuffer) > midTermBufferSize):
                    samplesToCopyToMidBuffer = midTermBufferSize - len(
                        midTermBuffer)
                else:
                    samplesToCopyToMidBuffer = len(curWindow)
                midTermBuffer = midTermBuffer + curWindow[
                    0:samplesToCopyToMidBuffer]
                del (curWindow[0:samplesToCopyToMidBuffer])
            if len(midTermBuffer) == midTermBufferSize:
                count += 1
                if Classifier != None:
                    [mtFeatures, stFeatures
                     ] = aF.mtFeatureExtraction(midTermBuffer, Fs, 2.0 * Fs,
                                                2.0 * Fs, 0.020 * Fs,
                                                0.020 * Fs)
                    curFV = (mtFeatures[:, 0] - MEAN) / STD
                    [result, P] = aT.classifierWrapper(Classifier, modelType,
                                                       curFV)
                    print classNames[int(result)]
                allData = allData + midTermBuffer

                plt.clf()
                plt.plot(midTermBuffer)
                plt.show(block=False)
                plt.draw()

                midTermBuffer = []

        allDataArray = numpy.int16(allData)
        wavfile.write(outputWavFile, Fs, allDataArray)
def recordAnalyzeAudio(duration, outputWavFile, midTermBufferSizeSec, modelName, modelType):
	'''
	recordAnalyzeAudio(duration, outputWavFile, midTermBufferSizeSec, modelName, modelType)

	This function is used to record and analyze audio segments, in a fix window basis.

	ARGUMENTS: 
	- duration			total recording duration
	- outputWavFile			path of the output WAV file
	- midTermBufferSizeSec		(fix)segment length in seconds
	- modelName			classification model name
	- modelType			classification model type

	'''

	if modelType=='svm':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, compute_beat] = aT.load_model(modelName)
	elif modelType=='knn':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, compute_beat] = aT.load_model_knn(modelName)
	else:
		Classifier = None

	inp = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NONBLOCK)
	inp.setchannels(1)
	inp.setrate(Fs)
	inp.setformat(alsaaudio.PCM_FORMAT_S16_LE)
	inp.setperiodsize(512)
	midTermBufferSize = int(midTermBufferSizeSec * Fs)
	allData = []
	midTermBuffer = []
	curWindow = []
	count = 0

	while len(allData)<duration*Fs:
		# Read data from device
		l,data = inp.read()
	    	if l:
			for i in range(l):
				curWindow.append(audioop.getsample(data, 2, i))		
			if (len(curWindow)+len(midTermBuffer)>midTermBufferSize):
				samplesToCopyToMidBuffer = midTermBufferSize - len(midTermBuffer)
			else:
				samplesToCopyToMidBuffer = len(curWindow)
			midTermBuffer = midTermBuffer + curWindow[0:samplesToCopyToMidBuffer];
			del(curWindow[0:samplesToCopyToMidBuffer])
		if len(midTermBuffer) == midTermBufferSize:
			count += 1						
			if Classifier!=None:
				[mtFeatures, stFeatures, _] = aF.mtFeatureExtraction(midTermBuffer, Fs, 2.0*Fs, 2.0*Fs, 0.020*Fs, 0.020*Fs)
				curFV = (mtFeatures[:,0] - MEAN) / STD;
				[result, P] = aT.classifierWrapper(Classifier, modelType, curFV)
				print classNames[int(result)]
			allData = allData + midTermBuffer

			plt.clf()
			plt.plot(midTermBuffer)
			plt.show(block = False)
			plt.draw()


			midTermBuffer = []

	allDataArray = numpy.int16(allData)
	wavfile.write(outputWavFile, Fs, allDataArray)
def speakerDiarization(fileName,
                       numOfSpeakers,
                       mtSize=2.0,
                       mtStep=0.2,
                       stWin=0.05,
                       LDAdim=35,
                       PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT     (opt)   0 for not plotting the results 1 for plottingy
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    [
        Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
        stStep1, computeBEAT1
    ] = aT.loadKNNModel(os.path.join("data", "knnSpeakerAll"))
    [
        Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
        stStep2, computeBEAT2
    ] = aT.loadKNNModel(os.path.join("data", "knnSpeakerFemaleMale"))

    [MidTermFeatures,
     ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs,
                                                 mtSize * Fs, mtStep * Fs,
                                                 round(Fs * stWin),
                                                 round(Fs * stWin * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2),
         MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::,
                         i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2  # TODO
    # SELECT FEATURES:
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];                                                                                         # SET 0A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];                                                                                 # SET 0B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    #   97,98, 99,100];     # SET 0C

    iFeaturesSelect = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]  # SET 1A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];                                          # SET 1B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C

    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];             # SET 2A
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];     # SET 2B
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 2C

    #iFeaturesSelect = range(100);                                                                                                    # SET 3
    #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN,
     STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)),
                             axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(MidTermFeatures[1,:])
    #EnergyMean = numpy.mean(MidTermFeatures[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    #print iNonOutLiers

    perOutLier = (100.0 *
                  (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        #for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(
                    numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2),
             mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0],
                                i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[
                mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] +
                                len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN,
         STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        #MDistancesAll = numpy.mean(DistancesAll)
        #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []

    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []
        silB = []
        for c in range(iSpeakers
                       ):  # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(
                len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls ==
                                                              c]  # get subset of feature vectors
                Yt = distance.pdist(
                    MidTermFeaturesNormTemp.T
                )  # compute average distance between samples that belong to the cluster (a values)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(
                        iSpeakers
                ):  # compute distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(
                            cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,
                                                                       cls ==
                                                                       c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(
                            numpy.mean(Yt) *
                            (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(
                    min(silBs)
                )  # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) /
                       (max(silB[c], silA[c]) + 0.00001))  # compute silhouette

        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILLOUETTE

    #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum sillouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(
            MidTermFeaturesNormOr, cls)
        hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0],
                                       "diag")  # hmm training
        hmm.startprob_ = startprob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]  # final sillouette
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    #debug
    segslist = [list() for x in range(numOfSpeakers)]
    start = 0
    for i in range(0, len(cls) - 1):
        if cls[i] != cls[i + 1]:
            segTemp = dict()
            segTemp['start'] = start
            segTemp['end'] = i * mtStep + mtStep
            speakerID = int(cls[i])
            print speakerID, segTemp
            segslist[speakerID].append(segTemp)
            start = segTemp['end']
    segTemp = dict()
    segTemp['start'] = start
    segTemp['end'] = (len(cls) - 1) * mtStep + mtStep
    speakerID = int(cls[-1])
    print speakerID
    print segTemp
    segslist[speakerID].append(segTemp)
    print segslist
    conversation = list()
    sound = AudioSegment.from_file(fileName)
    for speakerID, speaker in enumerate(segslist):
        for segID, seg in enumerate(speaker):
            chunk = sound[seg['start'] * 1000:seg['end'] * 1000]
            output_name = 'speaker{}_{}.wav'.format(speakerID, segID)
            chunk.export(output_name, format="wav")
            r = sr.Recognizer()
            with sr.AudioFile(output_name) as source:
                audio = r.record(source)  # read the entire audio file
                # recognize speech using Sphinx
                try:
                    print("Sphinx thinks you said: " +
                          r.recognize_sphinx(audio))
                    content = dict()
                    content['text'] = r.recognize_sphinx(audio)
                    content['speakerID'] = speakerID
                    content['start'] = seg['start']
                    conversation.append(content)
                except sr.UnknownValueError:
                    print("Sphinx could not understand audio")
                except sr.RequestError as e:
                    print("Sphinx error; {0}".format(e))

    conversation.sort(key=operator.itemgetter('start'))
    text_file = open('text.txt', 'w')
    for c in conversation:
        line = 'Speaker{}: {}\n'.format(c['speakerID'], c['text'])
        text_file.write(line)

    print conversation
    return cls
Exemplo n.º 9
0
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=35, PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT     (opt)   0 for not plotting the results 1 for plottingy
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel(
        "data/knnSpeakerAll")
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel(
        "data/knnSpeakerFemaleMale")

    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin),
                                                                  round(Fs * stWin * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2  # TODO
    # SELECT FEATURES:
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];                                                                                         # SET 0A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];                                                                                 # SET 0B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    #   97,98, 99,100];     # SET 0C

    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
                       53]  # SET 1A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];                                          # SET 1B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C

    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];             # SET 2A
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];     # SET 2B
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 2C

    # iFeaturesSelect = range(100);                                                                                                    # SET 3
    # MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = numpy.min(MidTermFeatures[1,:])
    # EnergyMean = numpy.mean(MidTermFeatures[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    # print iNonOutLiers

    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        # [mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        # for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1),
            i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        # mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        # DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        # MDistancesAll = numpy.mean(DistancesAll)
        # iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        # mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1],))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        # print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio);
        clf = LDA(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []

    for iSpeakers in sRange:
        cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)  # perform k-means clustering

        # YDist =   distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
        # print distance.squareform(YDist).shape
        # hc = mlpy.HCluster()
        # hc.linkage(YDist)
        # cls = hc.cut(14.5)
        # print cls

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = [];
        silB = []
        for c in range(iSpeakers):  # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]  # get subset of feature vectors
                Yt = distance.pdist(
                    MidTermFeaturesNormTemp.T)  # compute average distance between samples that belong to the cluster (a values)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):  # compute distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))  # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))  # compute silhouette

        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILLOUETTE

    # silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum sillouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters

    return nSpeakersFinal