def getStVectorPerWav(wavFile, stWin, stStep):
    # given a wav file, extract the entire short-term feature sequence
    [Fs, x] = getTotalAudio([wavFile])
    ShortTermFeatures = aF.stFeatureExtraction(x, Fs, stWin * Fs, stStep * Fs)
    # normalize to 0-mean 1-std
    [featuresNormSS, MEANSS, STDSS] = aT.normalizeFeatures([ShortTermFeatures])
    # note: featureListToVectors expects the list of normalized matrices itself
    [X, y] = featureListToVectors(featuresNormSS)
    return X, y, Fs
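# A minimal numpy sketch (an illustration, not the pyAudioAnalysis
# implementation) of the zero-mean / unit-std normalization that
# aT.normalizeFeatures performs: statistics are computed over the rows of
# *all* matrices in the list, and the same MEAN/STD are applied to each.
import numpy as np

def normalize_features_sketch(feature_matrices):
    # feature_matrices: list of (n_samples_i x n_feats) numpy arrays
    all_rows = np.vstack(feature_matrices)
    mean = np.mean(all_rows, axis=0)
    std = np.std(all_rows, axis=0) + 1e-14   # guard against zero variance
    normalized = [(f - mean) / std for f in feature_matrices]
    return normalized, mean, std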
def ExtractFeatures(newPath):
    [fs, x] = audioBasicIO.readAudioFile(newPath)
    mt_size, mt_step, st_win = 1, 1, 0.5
    [mt_feats, st_feats, _] = mT(x, fs, mt_size * fs, mt_step * fs,
                                 round(fs * st_win), round(fs * st_win * 0.5))
    (mt_feats_norm, MEAN, STD) = normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0]
    mt_feats_normal = mt_feats_norm[:55]
    return mt_feats_normal
def evaluateClassifier(argv):
    dirName = argv[2]
    useAccelerometer = argv[3] in ["1", "2", "3", "4"]
    useAccelerometerOnlyX = (argv[3] == "1")
    useAccelerometerOnlyY = (argv[3] == "2")
    useAccelerometerOnlyZ = (argv[3] == "3")
    useImage = (argv[4] == "1")
    fileList = sorted(glob.glob(os.path.join(dirName, "*.csv")))
    GTs = []
    eX = []
    eY = []
    eZ = []
    featuresAll = []
    classNames = []
    for i, m in enumerate(fileList):
        gt = int(ntpath.basename(m).split("_")[-1].replace(".csv", ""))
        className = ntpath.basename(m).split("_")[1]
        if className not in classNames:
            classNames.append(className)
            featuresAll.append([])
        # if gt > 0:
        if True:
            GTs.append(gt)
            FeatureVectorFusion = featureExtraction(m, useAccelerometer,
                                                    useAccelerometerOnlyX,
                                                    useAccelerometerOnlyY,
                                                    useAccelerometerOnlyZ,
                                                    useImage)
            print(FeatureVectorFusion.shape)
            idx = classNames.index(className)
            if len(featuresAll[idx]) == 0:
                featuresAll[idx] = FeatureVectorFusion
            else:
                featuresAll[idx] = numpy.vstack((featuresAll[idx],
                                                 FeatureVectorFusion))
    # featuresAll = featuresY
    (featuresAll, MEAN, STD) = aT.normalizeFeatures(featuresAll)
    # bestParam = aT.evaluateClassifier(featuresAll, classNames, 1000, "svm",
    #     [0.05, 0.1, 0.5, 1, 2, 3, 5, 10, 15, 20, 25, 50, 100, 200], 0, perTrain=0.80)
    bestParam = aT.evaluateClassifier(featuresAll, classNames, 1000, "svm",
                                      [0.05, 0.1, 0.5], 0, perTrain=0.80)
    MEAN = MEAN.tolist()
    STD = STD.tolist()
    # STEP C: Save the classifier to file
    Classifier = aT.trainSVM(featuresAll, bestParam)
    modelName = argv[5]
    with open(modelName, 'wb') as fid:  # save to file
        cPickle.dump(Classifier, fid)
    fo = open(modelName + "MEANS", "wb")
    cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()
def ExtractFeatures(newPath):
    [fs, x] = audioBasicIO.readAudioFile(newPath)
    mt_size, mt_step, st_win = 1, 1, 0.5
    [mt_feats, st_feats, _] = mT(x, fs, mt_size * fs, mt_step * fs,
                                 round(fs * st_win), round(fs * st_win * 0.5))
    (mt_feats_norm, MEAN, STD) = normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    # F, name = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs)
    # print(np.shape(F))
    return mt_feats_norm
def train(files):
    # extract features
    features, classes, filenames = aF.dirsWavFeatureExtraction(
        files, 1.0, 1.0, aT.shortTermWindow, aT.shortTermStep)
    # normalize
    [featuresNorm, MEAN, STD] = aT.normalizeFeatures(features)
    [X, Y] = aT.listOfFeatures2Matrix(featuresNorm)
    # train using SVM
    clf = sklearn.svm.SVC(kernel='linear', probability=True)
    clf.fit(X, Y)
    return clf, MEAN, STD
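# Hypothetical usage sketch for train() above: classify one new feature
# vector with the returned (clf, MEAN, STD) triple. The directory paths and
# the extract_one_file() helper are assumptions for illustration only.
# clf, MEAN, STD = train(["wavs/classA", "wavs/classB"])
# fv = extract_one_file("query.wav")           # hypothetical feature extractor
# fv_norm = (fv - MEAN) / STD                  # same normalization as training
# probs = clf.predict_proba(fv_norm.reshape(1, -1))[0]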
def trainNN(listOfDirs, mtWin, mtStep, stWin, stStep, computeBEAT=False):
    # Feature Extraction
    [features, classNames, _] = aF.dirsWavFeatureExtraction(
        listOfDirs, mtWin, mtStep, stWin, stStep, computeBEAT=computeBEAT)
    if len(features) == 0:
        print("feature ERROR")
        return
    numOfFeatures = features[0].shape[1]
    featureNames = ["features" + str(d + 1) for d in range(numOfFeatures)]
    # NOTE: modelName must be defined in an enclosing scope
    aT.writeTrainDataToARFF(modelName, features, classNames, featureNames)
    for i, f in enumerate(features):
        if len(f) == 0:
            print("feature ERROR")
            return
    C = len(classNames)
    [featuresNorm, MEAN, STD] = aT.normalizeFeatures(features)  # normalize features
    MEAN = MEAN.tolist()
    STD = STD.tolist()
    featuresNew = featuresNorm
    bestParam = evaluate(featuresNew, classNames, 100,
                         numpy.array([1, 2, 3, 4, 5, 6]), 0, perTrain=0.80)
    clf = train(featuresNew, bestParam)
    with open(modelName, 'wb') as fid:
        cPickle.dump(clf, fid)
    fo = open(modelName + "MEANS", "wb")
    cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(stWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(stStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(computeBEAT, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()
def selfSimilarityMatrix(featureVectors):
    '''
    This function computes the self-similarity matrix for a sequence
    of feature vectors.
    ARGUMENTS:
     - featureVectors:    a numpy matrix (nDims x nVectors) whose i-th column
                          corresponds to the i-th feature vector
    RETURNS:
     - S:                 the self-similarity matrix (nVectors x nVectors)
    '''
    [nDims, nVectors] = featureVectors.shape
    [featureVectors2, MEAN, STD] = aT.normalizeFeatures([featureVectors.T])
    featureVectors2 = featureVectors2[0].T
    S = 1.0 - distance.squareform(distance.pdist(featureVectors2.T, 'cosine'))
    return S
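# Quick standalone check of the self-similarity idea above, on random data
# (illustrative only; it calls scipy directly rather than the helper).
# The diagonal of S is exactly 1.0, since the cosine distance of a vector
# to itself is zero.
import numpy as np
from scipy.spatial import distance as _distance
F = np.random.rand(10, 50)                             # 10 dims, 50 vectors
S = 1.0 - _distance.squareform(_distance.pdist(F.T, 'cosine'))
print(S.shape)                                         # (50, 50)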
def train_SVM(st_feats):
    st_energy = st_feats[1, :]
    en = np.sort(st_energy)
    l1 = int(len(en) / 10)
    # mean of the lowest 10% of energies -> low threshold
    t1 = np.mean(en[0:l1]) + 0.000000000000001
    # mean of the highest 10% of energies -> high threshold
    t2 = np.mean(en[-l1:-1]) + 0.000000000000001
    # frames with energy below the low threshold -> class1
    class1 = st_feats[:, np.where(st_energy <= t1)[0]]
    # frames with energy above the high threshold -> class2
    class2 = st_feats[:, np.where(st_energy >= t2)[0]]
    feats_s = [class1.T, class2.T]  # e.g. class1.T: (58, 68), class2.T: (38, 68)
    # z-score normalization: subtract mean, divide by std
    [feats_s_norm, means_s, stds_s] = aT.normalizeFeatures(feats_s)
    svm = aT.trainSVM(feats_s_norm, 1.0)
    return svm, means_s, stds_s
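# Illustrative sketch of the 10% energy-threshold heuristic used above, on a
# synthetic energy sequence (an assumption: this mirrors, not reproduces, the
# library logic). Frames below the "low 10%" mean seed the silence class;
# frames above the "high 10%" mean seed the active class.
import numpy as np
energy = np.abs(np.random.randn(200))        # fake short-term energy sequence
en = np.sort(energy)
l1 = int(len(en) / 10)
t_low = np.mean(en[0:l1]) + 1e-15
t_high = np.mean(en[-l1:-1]) + 1e-15
low_idx = np.where(energy <= t_low)[0]       # candidate silence frames
high_idx = np.where(energy >= t_high)[0]     # candidate active frames
print(len(low_idx), len(high_idx))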
def trainDirs(self, dir_root):
    """
    Train on all wav files within the directories under dir_root.
    The class name is derived as the last entry after splitting /path/to/dir.
    """
    dir_list = glob.glob(dir_root + '/*')
    features = []  # a list of feature matrices, one for each class
    self.classNames = []
    for d in dir_list:
        log.logv('featurize %s\n' % (d))
        self.classNames.append(d.split('/')[-1])
        first = True
        class_features = np.array([])
        for w in os.listdir(d):
            if w.endswith('.wav'):
                # featurize() returns a matrix of numBlocks x numFeatures
                _f = self.featurize(os.path.join(d, w))
                if first:
                    first = False
                    class_features = _f
                else:
                    class_features = np.vstack((class_features, _f))
        if class_features.shape[0] > 0:
            # class_features is a matrix of M x numFeatures
            features.append(class_features)
    classifierParams = np.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0])
    # parameter mode: 0 for best accuracy, 1 for best f1 score
    [featuresNew, self.MEAN, self.STD] = aT.normalizeFeatures(features)  # normalize features
    bestParam = aT.evaluateClassifier(features, self.classNames, 100, "svm",
                                      classifierParams, 0, perTrain=0.90)
    print("Selected params: {0:.5f}".format(bestParam))
    # TODO
    # 1. normalize before evaluating?
    # 2. try gaussian kernel?
    self.Classifier = aT.trainSVM(featuresNew, bestParam)
def getTotalEnergyVector(folder_to_wavs):
    # given a single list of wav paths, return their aggregate 10% energy vector
    [Fs, x] = getTotalAudio(folder_to_wavs)
    ShortTermFeatures = aF.stFeatureExtraction(x, Fs, stWin * Fs, stStep * Fs)
    EnergySt = ShortTermFeatures[1, :]
    E = np.sort(EnergySt)
    L1 = int(len(E) / 10)
    T1 = np.mean(E[0:L1]) + 0.000000000000001   # compute "lower" 10% energy threshold
    T2 = np.mean(E[-L1:-1]) + 0.000000000000001  # compute "higher" 10% energy threshold
    # get all features that correspond to low energy
    Class1 = ShortTermFeatures[:, np.where(EnergySt <= T1)[0]]
    # Class1 = ShortTermFeatures[1,:][np.where(EnergySt <= T1)[0]]  # purely energy
    # get all features that correspond to high energy
    Class2 = ShortTermFeatures[:, np.where(EnergySt >= T2)[0]]
    # Class2 = ShortTermFeatures[1,:][np.where(EnergySt >= T2)[0]]  # purely energy
    featuresSS = [Class1.T, Class2.T]  # form the binary classification task
    # normalize to 0-mean 1-std
    [featuresNormSS, MEANSS, STDSS] = aT.normalizeFeatures(featuresSS)
    [X, y] = featureListToVectors(featuresNormSS)
    return X, y, Fs
[res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
[res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] + len(classNames1), i] = P1 + 0.0001
MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = P2 + 0.0001
mt_feats = MidTermFeatures2  # TODO
iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42,
                   43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
mt_feats = mt_feats[iFeaturesSelect, :]
(mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
mt_feats_norm = mt_feats_norm[0].T
n_wins = mt_feats.shape[1]
# remove outliers:
dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)), axis=0)
m_dist_all = np.mean(dist_all)
i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]
# TODO: Combine energy threshold for outlier removal:
#EnergyMin = np.min(mt_feats[1,:])
#EnergyMean = np.mean(mt_feats[1,:])
#Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
#i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
#print(i_non_outliers)
for a in classNames:
    temp = numpy.load(os.path.dirname(os.path.realpath(sys.argv[0])) +
                      '/classifier_data/' + a + '.npy')
    features.append(temp)
classifierParams = numpy.array([0.001, 0.01, 0.5, 1.0, 5.0])
nExp = 50
bestParam = audioTrainTest.evaluateClassifier(features, classNames, nExp, "svm",
                                              classifierParams, 0, perTrain=0.01)
[featuresNorm, MEAN, STD] = audioTrainTest.normalizeFeatures(features)  # normalize features
MEAN = MEAN.tolist()
STD = STD.tolist()
featuresNew = featuresNorm
Classifier = audioTrainTest.trainSVM(featuresNew, bestParam)
Classifier.save_model(os.path.dirname(os.path.realpath(sys.argv[0])) +
                      '/classifier_data/' + modelName)
fo = open(os.path.dirname(os.path.realpath(sys.argv[0])) + '/classifier_data/' +
          modelName + "MEANS", "wb")
cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
cPickle.dump(0, fo, protocol=cPickle.HIGHEST_PROTOCOL)
cPickle.dump(0, fo, protocol=cPickle.HIGHEST_PROTOCOL)
def main(rootName, modelType, classifierParam, signal_type):
    CMall = numpy.zeros((2, 2))
    if modelType != "svm" and modelType != "svm_rbf":
        C = [int(classifierParam)]
    else:
        C = [classifierParam]
    F1s = []
    Accs = []
    for ifold in range(0, 10):  # for each fold
        dirName = rootName + os.sep + "fold_{0:d}".format(ifold)  # fold path
        # TRAINING data feature extraction
        classNamesTrain, featuresTrain = dirFeatureExtraction([
            os.path.join(dirName, "train", "fail"),
            os.path.join(dirName, "train", "success")], signal_type)
        # internal cross-validation (for param selection)
        bestParam = aT.evaluateClassifier(featuresTrain, classNamesTrain, 2,
                                          modelType, C, 0, 0.90)
        # TESTING data feature extraction
        classNamesTest, featuresTest = dirFeatureExtraction([
            os.path.join(dirName, "test", "fail"),
            os.path.join(dirName, "test", "success")], signal_type)
        # training features normalization
        [featuresTrainNew, MEAN, STD] = aT.normalizeFeatures(featuresTrain)
        # classifier training
        if modelType == "svm":
            Classifier = aT.trainSVM(featuresTrainNew, bestParam)
        elif modelType == "svm_rbf":
            Classifier = aT.trainSVM_RBF(featuresTrainNew, bestParam)
        elif modelType == "randomforest":
            Classifier = aT.trainRandomForest(featuresTrainNew, bestParam)
        elif modelType == "gradientboosting":
            Classifier = aT.trainGradientBoosting(featuresTrainNew, bestParam)
        elif modelType == "extratrees":
            Classifier = aT.trainExtraTrees(featuresTrainNew, bestParam)
        # evaluation on testing data
        CM = numpy.zeros((2, 2))
        for iC, f in enumerate(featuresTest):  # for each class
            for i in range(f.shape[0]):  # for each testing sample (feature vector)
                curF = f[i, :]
                curF = (curF - MEAN) / STD  # normalize test feature vector
                # classify and get winner class
                winnerClass = classNamesTrain[int(
                    aT.classifierWrapper(Classifier, modelType, curF)[0])]
                trueClass = classNamesTest[iC]  # ground-truth class
                # update confusion matrix
                CM[classNamesTrain.index(trueClass)][classNamesTrain.index(winnerClass)] += 1
        CMall += CM  # update overall confusion matrix
        # get recall, precision and F1 (per class)
        Recall, Precision, F1 = computePreRec(CM, classNamesTrain)
        Acc = numpy.diagonal(CM).sum() / CM.sum()  # overall accuracy
        F1s.append(numpy.mean(F1))  # append average F1
        Accs.append(Acc)  # append classification accuracy
    print("")
    print("FINAL RESULTS")
    print("")
    print("----------------------------------")
    print("fold\tacc\tf1")
    print("----------------------------------")
    for i in range(len(F1s)):
        print("{0:d}\t{1:.1f}\t{2:.1f}".format(i, 100 * Accs[i], 100 * F1s[i]))
    Acc = numpy.diagonal(CMall).sum() / CMall.sum()
    Recall, Precision, F1 = computePreRec(CMall, classNamesTrain)
    print("----------------------------------")
    print("{0:s}\t{1:.1f}\t{2:.1f}".format("Avg", 100 * numpy.mean(Accs),
                                           100 * numpy.mean(F1s)))
    print("{0:s}\t{1:.1f}\t{2:.1f}".format("Av CM", 100 * Acc, 100 * numpy.mean(F1)))
    print("----------------------------------")
    print("")
    print("Overall Confusion matrix:")
    aT.printConfusionMatrix(CMall, classNamesTrain)
    print("")
    print("FAIL Recall = {0:.1f}".format(100 * Recall[classNamesTrain.index("fail")]))
    print("FAIL Precision = {0:.1f}".format(100 * Precision[classNamesTrain.index("fail")]))
    print("SUCCESS Recall = {0:.1f}".format(100 * Recall[classNamesTrain.index("success")]))
    print("SUCCESS Precision = {0:.1f}".format(100 * Precision[classNamesTrain.index("success")]))
    return CMall, Acc, Recall, Precision, F1
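# computePreRec() is referenced above but not shown; a minimal sketch of a
# compatible helper (an assumption about its behavior, inferred from how its
# return values are used): per-class recall, precision and F1 from a
# confusion matrix whose rows are true classes and columns are predictions.
import numpy as np

def computePreRec_sketch(CM, class_names):
    recall = [CM[i, i] / np.sum(CM[i, :]) for i in range(len(class_names))]
    precision = [CM[i, i] / np.sum(CM[:, i]) for i in range(len(class_names))]
    f1 = [2 * r * p / (r + p + 1e-10) for r, p in zip(recall, precision)]
    return recall, precision, f1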
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2,
                       stWin=0.05, LDAdim=0, PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording
                           (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin (opt)      short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT (opt)       0 for not plotting the results, 1 for plotting
    '''
    [Fs, x] = pyAudioAnalysis.audioBasicIO.readAudioFile(fileName)
    x = pyAudioAnalysis.audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs
    #[Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel(os.path.join("data", "knnSpeakerAll"))
    #[Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel(os.path.join("data", "knnSpeakerFemaleMale"))
    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1,
     computeBEAT1] = aT.loadKNNModel("pyAudioAnalysis/data/knnSpeakerAll")
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2,
     computeBEAT2] = aT.loadKNNModel("pyAudioAnalysis/data/knnSpeakerFemaleMale")
    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(
        x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5))
    MidTermFeatures2 = numpy.zeros((MidTermFeatures.shape[0] + len(classNames1)
                                    + len(classNames2), MidTermFeatures.shape[1]))
    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = P2 + 0.0001
    MidTermFeatures = MidTermFeatures2  # TODO
    # SELECT FEATURES:
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20]  # SET 0A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100]  # SET 0B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]  # SET 0C
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42,
                       43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]  # SET 1A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]  # SET 1B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]  # SET 1C
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]  # SET 2A
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]  # SET 2B
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]  # SET 2C
    #iFeaturesSelect = range(100)  # SET 3
    #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010
    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]
    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]
    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]
    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(MidTermFeatures[1,:])
    #EnergyMean = numpy.mean(MidTermFeatures[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    #print(iNonOutLiers)
    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]
    # LDA dimensionality reduction:
    if LDAdim > 0:
        # extract mid-term features with minimum step:
        #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin))
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        #for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])
        for i in range(numOfFeatures):  # for each of the short-term features
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros((mtFeaturesToReduce.shape[0] + len(classNames1)
                                           + len(classNames2), mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        #MDistancesAll = numpy.mean(DistancesAll)
        #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        #print(LDAstep, LDAstepRatio)
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T
    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []
    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_
        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []
        silB = []
        for c in range(iSpeakers):  # for each speaker (i.e. each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                # get subset of feature vectors
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                # compute average distance between samples that belong
                # to the cluster (a values)
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):  # distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                # ... and keep the minimum value (i.e. the distance
                # from the "nearest" cluster)
                silB.append(min(silBs))
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker), compute silhouette
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))
        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILHOUETTE
    #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange), 0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters
    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows; this is achieved by
    # giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows,))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]
    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        startprob, transmat, means, cov = trainHMM_computeStatistics(MidTermFeaturesNormOr, cls)
        hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")
        hmm.startprob_ = startprob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)
    # Post-process method 2: median filtering
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)
    sil = silAll[imax]  # final silhouette
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]
    # load ground-truth if available
    gtFile = fileName.replace('.wav', '.segments')  # annotation file
    if os.path.isfile(gtFile):  # if ground-truth exists
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)  # convert to flags
    if PLOT:
        fig = plt.figure()
        if numOfSpeakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(classNames))))
        ax1.axis((0, Duration, -1, len(classNames)))
        ax1.set_yticklabels(classNames)
        ax1.plot(numpy.array(range(len(cls))) * mtStep + mtStep / 2.0, cls)
    if os.path.isfile(gtFile):
        if PLOT:
            ax1.plot(numpy.array(range(len(flagsGT))) * mtStep + mtStep / 2.0, flagsGT, 'r')
        purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT)
        print("{0:.1f}\t{1:.1f}".format(100 * purityClusterMean, 100 * puritySpeakerMean))
        if PLOT:
            plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(
                100 * purityClusterMean, 100 * puritySpeakerMean))
    if PLOT:
        plt.xlabel("time (seconds)")
        #print(sRange, silAll)
        if numOfSpeakers <= 0:
            plt.subplot(212)
            plt.plot(sRange, silAll)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls
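# Hypothetical invocation of speakerDiarization() above; the WAV path is an
# assumption for illustration. cls holds one speaker label per mid-term
# window, spaced mtStep (default 0.2 s) apart.
# cls = speakerDiarization("meeting.wav", numOfSpeakers=0, PLOT=False)
# for i, label in enumerate(cls):
#     print("{0:.1f}s -> speaker {1:d}".format(i * 0.2, int(label)))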
def evaluate(features, ClassNames, nExp, Params, parameterMode, perTrain=0.80):
    (featuresNorm, MEAN, STD) = aT.normalizeFeatures(features)
    nClasses = len(features)
    CAll = []
    acAll = []
    F1All = []
    PrecisionClassesAll = []
    RecallClassesAll = []
    ClassesAll = []
    F1ClassesAll = []
    CMsAll = []
    # compute total number of samples:
    nSamplesTotal = 0
    for f in features:
        nSamplesTotal += f.shape[0]
    if nSamplesTotal > 1000 and nExp > 50:
        nExp = 50
        print("Number of training experiments changed to 50 due to high number of samples")
    if nSamplesTotal > 2000 and nExp > 10:
        nExp = 10
        print("Number of training experiments changed to 10 due to high number of samples")
    for Ci, C in enumerate(Params):  # for each param value
        CM = numpy.zeros((nClasses, nClasses))
        for e in range(nExp):  # for each cross-validation iteration
            print("Param = {0:.5f} - Classifier Evaluation Experiment {1:d} of {2:d}".format(C, e + 1, nExp))
            featuresTrain, featuresTest = aT.randSplitFeatures(featuresNorm, perTrain)
            Classifier = train(featuresTrain, C)
            CMt = numpy.zeros((nClasses, nClasses))
            for c1 in range(nClasses):
                nTestSamples = len(featuresTest[c1])
                Results = numpy.zeros((nTestSamples, 1))
                for ss in range(nTestSamples):
                    [Results[ss], _] = classify(Classifier, featuresTest[c1][ss])
                for c2 in range(nClasses):
                    CMt[c1][c2] = float(len(numpy.nonzero(Results == c2)[0]))
            CM = CM + CMt
        CM = CM + 0.0000000010
        Rec = numpy.zeros((CM.shape[0], ))
        Pre = numpy.zeros((CM.shape[0], ))
        for ci in range(CM.shape[0]):
            Rec[ci] = CM[ci, ci] / numpy.sum(CM[ci, :])
            Pre[ci] = CM[ci, ci] / numpy.sum(CM[:, ci])
        PrecisionClassesAll.append(Pre)
        RecallClassesAll.append(Rec)
        F1 = 2 * Rec * Pre / (Rec + Pre)
        F1ClassesAll.append(F1)
        acAll.append(numpy.sum(numpy.diagonal(CM)) / numpy.sum(CM))
        CMsAll.append(CM)
        F1All.append(numpy.mean(F1))
    print("\t\t", end="")
    for i, c in enumerate(ClassNames):
        if i == len(ClassNames) - 1:
            print("{0:s}\t\t".format(c), end="")
        else:
            print("{0:s}\t\t\t".format(c), end="")
    print("OVERALL")
    print("\tC", end="")
    for c in ClassNames:
        print("\tPRE\tREC\tF1", end="")
    print("\t{0:s}\t{1:s}".format("ACC", "F1"))
    bestAcInd = numpy.argmax(acAll)
    bestF1Ind = numpy.argmax(F1All)
    for i in range(len(PrecisionClassesAll)):
        print("\t{0:.3f}".format(Params[i]), end="")
        for c in range(len(PrecisionClassesAll[i])):
            print("\t{0:.1f}\t{1:.1f}\t{2:.1f}".format(
                100.0 * PrecisionClassesAll[i][c],
                100.0 * RecallClassesAll[i][c],
                100.0 * F1ClassesAll[i][c]), end="")
        print("\t{0:.1f}\t{1:.1f}".format(100.0 * acAll[i], 100.0 * F1All[i]), end="")
        if i == bestF1Ind:
            print("\t best F1", end="")
        if i == bestAcInd:
            print("\t best Acc", end="")
        print()
    return Params[bestF1Ind]
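# aT.randSplitFeatures() is used above; a minimal illustrative equivalent
# (an assumption, not the library code): randomly split each class matrix
# into train/test row subsets with proportion per_train.
import numpy as np

def rand_split_features_sketch(features, per_train):
    f_train, f_test = [], []
    for f in features:
        idx = np.random.permutation(f.shape[0])
        n_train = int(round(per_train * f.shape[0]))
        f_train.append(f[idx[0:n_train], :])
        f_test.append(f[idx[n_train::], :])
    return f_train, f_test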
def featureAndTrainRegression(dir_name, mt_win, mt_step, st_win, st_step,
                              model_type, model_name, compute_beat=False,
                              feats=["gfcc", "mfcc"]):
    '''
    This function is used as a wrapper to segment-based audio feature
    extraction and regression-model training.
    ARGUMENTS:
        dir_name:        path of directory containing the WAV files and regression CSVs
        mt_win, mt_step: mid-term window length and step
        st_win, st_step: short-term window and step
        model_type:      "svm" or "knn" or "randomforest"
        model_name:      name of the model to be saved
    RETURNS:
        None. The resulting regression model, along with the respective model
        parameters, is saved to files.
    '''
    # STEP A: Feature Extraction:
    [features, _, filenames] = aF.dirsWavFeatureExtraction([dir_name], mt_win,
                                                           mt_step, st_win,
                                                           st_step,
                                                           compute_beat=compute_beat,
                                                           feats=feats)
    features = features[0]
    filenames = [ntpath.basename(f) for f in filenames[0]]
    # Read CSVs:
    CSVs = glob.glob(dir_name + os.sep + "*.csv")
    regression_labels = []
    regression_names = []
    f_final = []
    for c in CSVs:  # for each CSV
        cur_regression_labels = []
        f_temp = []
        # open the csv file that contains the current target value's annotations
        with open(c, 'rt') as csvfile:
            CSVreader = csv.reader(csvfile, delimiter=',', quotechar='|')
            for row in CSVreader:
                if len(row) == 2:  # the row contains two fields (filename, target value)
                    # ... and the current filename exists in the list of filenames
                    if row[0] in filenames:
                        index = filenames.index(row[0])
                        cur_regression_labels.append(float(row[1]))
                        f_temp.append(features[index, :])
                    else:
                        print("Warning: {} not found in list of files.".format(row[0]))
                else:
                    print("Warning: Row with unknown format in regression file")
        f_final.append(numpy.array(f_temp))
        # cur_regression_labels is the list of values for the current regression problem
        regression_labels.append(numpy.array(cur_regression_labels))
        regression_names.append(ntpath.basename(c).replace(".csv", ""))  # regression task name
    if len(features) == 0:
        print("ERROR: No data found in any input folder!")
        return
    n_feats = f_final[0].shape[1]
    # TODO: ARFF WRITE????
    # STEP B: regression model evaluation and parameter selection:
    if model_type == "svm" or model_type == "svm_rbf":
        model_params = numpy.array([0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5,
                                    1.0, 5.0, 10.0])
    elif model_type == "randomforest":
        model_params = numpy.array([5, 10, 25, 50, 100])
    # elif model_type == "knn":
    #     model_params = numpy.array([1, 3, 5, 7, 9, 11, 13, 15])
    errors = []
    errors_base = []
    best_params = []
    for iRegression, r in enumerate(regression_names):
        # get optimal regression parameter:
        print("Regression task " + r)
        bestParam, error, berror = evaluateRegression(f_final[iRegression],
                                                      regression_labels[iRegression],
                                                      100, model_type, model_params)
        errors.append(error)
        errors_base.append(berror)
        best_params.append(bestParam)
        print("Selected params: {0:.5f}".format(bestParam))
        [features_norm, MEAN, STD] = normalizeFeatures([f_final[iRegression]])  # normalize features
        # STEP C: Save the model to file
        if model_type == "svm":
            classifier, _ = trainSVMregression(features_norm[0],
                                               regression_labels[iRegression],
                                               bestParam)
        if model_type == "svm_rbf":
            classifier, _ = trainSVMregression_rbf(features_norm[0],
                                                   regression_labels[iRegression],
                                                   bestParam)
        if model_type == "randomforest":
            classifier, _ = trainRandomForestRegression(features_norm[0],
                                                        regression_labels[iRegression],
                                                        bestParam)
        if model_type == "svm" or model_type == "svm_rbf" or model_type == "randomforest":
            with open(model_name + "_" + r, 'wb') as fid:
                cPickle.dump(classifier, fid)
            fo = open(model_name + "_" + r + "MEANS", "wb")
            cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump(st_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump(st_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump(compute_beat, fo, protocol=cPickle.HIGHEST_PROTOCOL)
            fo.close()
    return errors, errors_base, best_params
def trainTextClassifiers(directoryPath, classifierType, classifierName):
    subdirectories = get_immediate_subdirectories(directoryPath)
    #tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')
    dicts = loadDictionaries("myDicts/")
    classNames = []
    Features = []
    # extract features from corpus
    for si, s in enumerate(subdirectories):  # for each directory in training data
        print("Training folder {0:d} of {1:d} ({2:s})".format(
            si + 1, len(subdirectories), s), end="")
        # get list of files in directory
        files = getListOfFilesInDir(directoryPath + os.sep + s, "*")
        if MAX_FILES_PER_CLASS > 0 and MAX_FILES_PER_CLASS < len(files):
            files = random.sample(files, MAX_FILES_PER_CLASS)
        print(" - {0:d} files".format(len(files)))
        classNames.append(s)
        for ifile, fi in enumerate(files):  # for each file in current class
            with open(fi) as f:
                content = f.read()
            curF = getFeaturesFromText(content, dicts)  # get feature vector
            if ifile == 0:  # update feature matrix
                Features.append(curF.T)
            else:
                Features[-1] = numpy.concatenate((Features[-1], curF.T), axis=0)
    # define classifier parameters
    if classifierType == "svm":
        classifierParams = numpy.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0])
    elif classifierType == "randomforest":
        classifierParams = numpy.array([10, 25, 50, 100, 200, 500])
    elif classifierType == "knn":
        classifierParams = numpy.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifierType == "gradientboosting":
        classifierParams = numpy.array([10, 25, 50, 100, 200, 500])
    elif classifierType == "extratrees":
        classifierParams = numpy.array([10, 25, 50, 100, 200, 500])
    # evaluate classifier and select best param
    nExp = 10
    bestParam = audioTrainTest.evaluateClassifier(Features, subdirectories, nExp,
                                                  classifierType, classifierParams,
                                                  0, 0.9)
    # normalize features
    C = len(classNames)
    [featuresNorm, MEAN, STD] = audioTrainTest.normalizeFeatures(Features)
    MEAN = MEAN.tolist()
    STD = STD.tolist()
    featuresNew = featuresNorm
    # save the classifier to file
    if classifierType == "svm":
        Classifier = audioTrainTest.trainSVM(featuresNew, bestParam)
    elif classifierType == "randomforest":
        Classifier = audioTrainTest.trainRandomForest(featuresNew, bestParam)
    elif classifierType == "gradientboosting":
        Classifier = audioTrainTest.trainGradientBoosting(featuresNew, bestParam)
    elif classifierType == "extratrees":
        Classifier = audioTrainTest.trainExtraTrees(featuresNew, bestParam)
    if 'Classifier' in locals():
        with open(classifierName, 'wb') as fid:  # save to file
            cPickle.dump(Classifier, fid)
        fo = open(classifierName + "MEANS", "wb")
        cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        fo.close()
def silenceCounter(x, fs, st_win, st_step, smoothWindow=0.5, weight=0.5, plot=False):
    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01
    # Step 1: feature extraction
    x = audioBasicIO.stereo2mono(x)
    st_feats, _ = aF.stFeatureExtraction(x, fs, st_win * fs, st_step * fs)
    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]
    en = numpy.sort(st_energy)
    # number of 10% of the total short-term windows
    l1 = int(len(en) / 10)
    # compute "lower" 10% energy threshold
    t1 = numpy.mean(en[0:l1]) + 0.000000000000001
    # compute "higher" 10% energy threshold
    t2 = numpy.mean(en[-l1:-1]) + 0.000000000000001
    # get all features that correspond to low energy
    class1 = st_feats[:, numpy.where(st_energy <= t1)[0]]
    # get all features that correspond to high energy
    class2 = st_feats[:, numpy.where(st_energy >= t2)[0]]
    # form the binary classification task and ...
    # change the order of the array
    # faets_s = [class1.T, class2.T]
    # changing the order gives the segments with silence
    faets_s = [class2.T, class1.T]
    # normalize and train the respective svm probabilistic model
    # (SILENCE vs ONSET)
    [faets_s_norm, means_s, stds_s] = aT.normalizeFeatures(faets_s)
    svm = aT.trainSVM(faets_s_norm, 1.0)
    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for i in range(st_feats.shape[1]):  # for each frame
        cur_fv = (st_feats[:, i] - means_s) / stds_s
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = numpy.array(prob_on_set)
    # smooth probability:
    prob_on_set = smoothMovingAvg(prob_on_set, smoothWindow / st_step)
    # Step 4A: detect onset frame indices:
    prog_on_set_sort = numpy.sort(prob_on_set)
    # find probability Threshold as a weighted average
    # of top 10% and lower 10% of the values
    Nt = int(prog_on_set_sort.shape[0] / 10)
    T = (numpy.mean((1 - weight) * prog_on_set_sort[0:Nt]) +
         weight * numpy.mean(prog_on_set_sort[-Nt::]))
    # get the indices of the frames that satisfy the thresholding
    max_idx = numpy.where(prob_on_set > T)[0]
    i = 0
    time_clusters = []
    seg_limits = []
    # Step 4B: group frame indices to onset segments
    while i < len(max_idx):  # for each of the detected onset indices
        cur_cluster = [max_idx[i]]
        if i == len(max_idx) - 1:
            break
        while max_idx[i + 1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_idx[i + 1])
            i += 1
            if i == len(max_idx) - 1:
                break
        i += 1
        time_clusters.append(cur_cluster)
        seg_limits.append([cur_cluster[0] * st_step, cur_cluster[-1] * st_step])
    # Step 5: Post process: remove very small segments:
    min_dur = 0.2
    seg_limits_2 = []
    for s in seg_limits:
        if s[1] - s[0] > min_dur:
            seg_limits_2.append(s)
    print(f"SEGMENTS 0.2: {seg_limits_2}")
    print(f"SEGMENTS: {seg_limits}")
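# smoothMovingAvg() is called above but not defined here; a simple
# moving-average sketch with numpy convolution (an assumption about its
# behavior, enough to make the probability-smoothing step reproducible).
import numpy as np

def smooth_moving_avg_sketch(signal, win_len):
    win_len = max(int(win_len), 1)
    kernel = np.ones(win_len) / win_len
    return np.convolve(signal, kernel, mode='same')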
def speakerDiarization(fileName, sRange=range(2, 10), mtSize=2.0, mtStep=0.2,
                       stWin=0.05, LDAdim=35):
    Fs, x = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / Fs
    Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, \
        computeBEAT1 = aT.loadKNNModel(os.path.join(
            '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data',
            'knnSpeakerAll'))
    Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, \
        computeBEAT2 = aT.loadKNNModel(os.path.join(
            '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data',
            'knnSpeakerFemaleMale'))
    MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction(
        x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin),
        round(Fs * stWin * 0.5))
    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2),
         MidTermFeatures.shape[1]))
    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
        Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1):, i] = P2 + 0.0001
    MidTermFeatures = MidTermFeatures2
    # list(range(...)) so that the concatenation also works on Python 3
    iFeaturesSelect = list(range(8, 21)) + list(range(41, 54))
    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]
    MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]
    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]
    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]
    if LDAdim > 0:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = list()
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append(list())
        for i in range(numOfFeatures):
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1, N2 = curPos, curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2),
             mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
            Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1):, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        mtFeaturesToReduce, MEAN, STD = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T
    clsAll, silAll, centersAll = list(), list(), list()
    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_
        clsAll.append(cls)
        centersAll.append(means)
        silA, silB = list(), list()
        for c in range(iSpeakers):
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.02:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = list()
                for c2 in range(iSpeakers):
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))
        silA, silB, sil = numpy.array(silA), numpy.array(silB), list()
        for c in range(iSpeakers):
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))
        silAll.append(numpy.mean(sil))
    imax = numpy.argmax(silAll)
    nSpeakersFinal = sRange[imax]
    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]
    startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls)
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag')
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov
    cls = hmm.predict(MidTermFeaturesNormOr.T)
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)
    sil = silAll[imax]
    classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)]
    return cls, classNames, duration, mtStep, silAll
classNames = classNames.split()
for a in classNames:
    temp = numpy.load(os.path.dirname(os.path.realpath(sys.argv[0])) +
                      '/classifier_data/' + a + '.npy')
    features.append(temp)
classifierParams = numpy.array([0.001, 0.01, 0.5, 1.0, 5.0])
nExp = 50
bestParam = audioTrainTest.evaluateClassifier(features, classNames, nExp, "svm",
                                              classifierParams, 0, perTrain=0.01)
[featuresNorm, MEAN, STD] = audioTrainTest.normalizeFeatures(features)
MEAN = MEAN.tolist()
STD = STD.tolist()
Classifier = audioTrainTest.trainSVM(featuresNorm, bestParam)
#todo
#featureAndTrain("/home/fnaser/Music", )
#Classifier.save_model(os.path.dirname(os.path.realpath(sys.argv[0]))+'/classifier_data/'+modelName)
with open(os.path.dirname(os.path.realpath(sys.argv[0])) + '/classifier_data/' +
          modelName, 'wb') as fid:
    cPickle.dump(Classifier, fid)
fo = open(os.path.dirname(os.path.realpath(sys.argv[0])) + '/classifier_data/' +
          modelName + "MEANS", "wb")
cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
def fileGreenwaySpeakerDiarization(filename, output_folder,
                                   speech_key="52fe944f29784ae288482e5eb3092e2a",
                                   service_region="eastus2",
                                   n_speakers=2, mt_size=2.0, mt_step=0.2,
                                   st_win=0.05, lda_dim=35):
    """
    ARGUMENTS:
        - filename:       the name of the WAV file to be analyzed;
                          the filename should have a suffix of the form ..._min_3,
                          which informs the service that the audio file corresponds
                          to the 3rd minute of the dialogue
        - output_folder:  the folder location for saving the audio snippets
                          generated from diarization
        - speech_key:     Azure speech service subscription key
        - service_region: Azure speech service region
        - n_speakers:     the number of speakers (clusters) in the recording
                          (<=0 for unknown)
        - mt_size (opt):  mid-term window size
        - mt_step (opt):  mid-term window step
        - st_win (opt):   short-term window size
        - lda_dim (opt):  LDA dimension (0 for no LDA)

    OUTPUTS:
        - cls:    a vector with speaker ids in chronological sequence of the
                  speaker dialogue
        - output: a list of python dictionaries containing dialogue sequence
                  information: dialogue_id, sequence_id, start_time, end_time, text
    """
    filename_only = filename if "/" not in filename else filename.split("/")[-1]
    nameoffile = filename_only.split("_min_")[0]
    timeoffile = filename_only.split("_min_")[1]

    [fs, x] = audioBasicIO.read_audio_file(filename)
    x = audioBasicIO.stereo_to_mono(x)
    duration = len(x) / fs

    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1,
     computeBEAT1] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "pyAudioAnalysis/data/models", "knn_speaker_10"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2,
     computeBEAT2] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "pyAudioAnalysis/data/models", "knn_speaker_male_female"))

    [mt_feats, st_feats, _] = aF.mid_feature_extraction(x, fs, mt_size * fs,
                                                        mt_step * fs,
                                                        round(fs * st_win),
                                                        round(fs * st_win * 0.5))
    MidTermFeatures2 = np.zeros((mt_feats.shape[0] + len(classNames1) +
                                 len(classNames2), mt_feats.shape[1]))
    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = P2 + 0.0001
    mt_feats = MidTermFeatures2  # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42,
                       43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
    mt_feats = mt_feats[iFeaturesSelect, :]
    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]
    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)), axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]
    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = np.min(mt_feats[1,:])
    #EnergyMean = np.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    #print(i_non_outliers)
    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # [mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
        #     st_win * fs, round(fs*st_win), round(fs*st_win))
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        # for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])
        # for each of the short-term features:
        for i in range(num_of_features):
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(np.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].append(np.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] + len(classNames1)
                                      + len(classNames2), mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], i] = mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:mt_feats_to_red.shape[0] + len(classNames1), i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::, i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += np.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN, STD) = aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = np.mean(dist_all)
        #iNonOutLiers2 = np.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = np.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        # print(LDAstep, LDAstepRatio)
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []
    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_
        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):  # for each speaker (i.e. each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples that
                # belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(np.mean(Yt) * (clust_per_cent + clust_per_cent_2) / 2.0)
                silBs = np.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))
    imax = np.argmax(sil_all)
    nSpeakersFinal = s_range[imax]  # optimal number of clusters
    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for i in range(n_wins):
        j = np.argmin(np.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]
    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)
    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)
    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]
    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground-truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs, mt_step)
    # plotting / purity evaluation (disabled):
    # if plot_res:
    #     fig = plt.figure()
    #     if n_speakers > 0:
    #         ax1 = fig.add_subplot(111)
    #     else:
    #         ax1 = fig.add_subplot(211)
    #     ax1.set_yticks(np.array(range(len(class_names))))
    #     ax1.axis((0, duration, -1, len(class_names)))
    #     ax1.set_yticklabels(class_names)
    #     ax1.plot(np.array(range(len(cls))) * mt_step + mt_step / 2.0, cls)
    # if os.path.isfile(gt_file):
    #     if plot_res:
    #         ax1.plot(np.array(range(len(flags_gt))) * mt_step + mt_step / 2.0, flags_gt, 'r')
    #     purity_cluster_m, purity_speaker_m = \
    #         evaluateSpeakerDiarization(cls, flags_gt)
    #     print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m, 100 * purity_speaker_m))
    #     if plot_res:
    #         plt.title("Cluster purity: {0:.1f}% - "
    #                   "Speaker purity: {1:.1f}%".format(100 * purity_cluster_m,
    #                                                     100 * purity_speaker_m))
    # if plot_res:
    #     plt.xlabel("time (seconds)")
    #     # print(s_range, sil_all)
    #     if n_speakers <= 0:
    #         plt.subplot(212)
    #         plt.plot(s_range, sil_all)
    #         plt.xlabel("number of clusters")
    #         plt.ylabel("average clustering's silhouette")
    #     if save_plot:
    #         plt.savefig(f"{output_folder}{filename_only}".replace(".wav", ".png"))
    #     else:
    #         pass
    #     plt.show()

    # Create Time Vector
    time_vec = np.array(range(len(cls))) * mt_step + mt_step / 2.0

    # Find Change Points
    speaker_change_index = np.where(np.roll(cls, 1) != cls)[0]

    # Create List of dialogue convos
    output_list = []
    temp = {}
    for ind, sc in enumerate(speaker_change_index):
        temp['dialogue_id'] = str(datetime.now()).strip()
        temp['sequence_id'] = str(ind)
        temp['speaker'] = list(cls)[sc]
        temp['start_time'] = time_vec[sc]
        temp['end_time'] = time_vec[speaker_change_index[ind + 1] - 1] \
            if ind + 1 < len(speaker_change_index) else time_vec[-1]
        temp["text"] = ""
        output_list.append(temp)
        temp = {}

    def snip_transcribe(output_list, filename, output_folder=output_folder,
                        speech_key=speech_key, service_region=service_region):
        speech_config = speechsdk.SpeechConfig(subscription=speech_key,
                                               region=service_region)
        speech_config.enable_dictation

        def recognized_cb(evt):
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                # append the recognized text to the current snippet
                output_list[ind]['text'] = output_list[ind]['text'] + str(evt.result.text)
                print(evt.result.text)

        for ind, diag in enumerate(output_list):
            t1 = diag['start_time']
            t2 = diag['end_time']
            newAudio = AudioSegment.from_wav(filename)
            chunk = newAudio[t1 * 1000:t2 * 1000]
            filename_out = output_folder + f"snippet_{diag['sequence_id']}.wav"
            # Export to a wav file in the current path
            chunk.export(filename_out, format="wav")
            done = False

            def stop_cb(evt):
                """callback that signals to stop continuous recognition
                upon receiving an event `evt`"""
                print('CLOSING on {}'.format(evt))
                nonlocal done
                done = True

            audio_input = speechsdk.AudioConfig(filename=filename_out)
            speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config,
                                                           audio_config=audio_input)
            output_list[ind]['snippet_path'] = filename_out
            speech_recognizer.recognized.connect(recognized_cb)
            speech_recognizer.session_stopped.connect(stop_cb)
            speech_recognizer.canceled.connect(stop_cb)
            # Start continuous speech recognition
            speech_recognizer.start_continuous_recognition()
            while not done:
                time.sleep(.5)
            speech_recognizer.stop_continuous_recognition()
        return output_list

    output = snip_transcribe(output_list, filename, output_folder=output_folder)
    output_json = {filename_only: output}
    with open(f"{output_folder}{nameoffile}_{timeoffile}.txt", "w") as outfile:
        json.dump(output_json, outfile)
    return cls, output_json
""" import os, readchar, sklearn.cluster from pyAudioAnalysis.audioFeatureExtraction import mtFeatureExtraction as mT from pyAudioAnalysis.audioBasicIO import readAudioFile, stereo2mono from pyAudioAnalysis.audioSegmentation import flags2segs from pyAudioAnalysis.audioTrainTest import normalizeFeatures if __name__ == '__main__': # read signal and get normalized segment features: input_file = "../data/song1.mp3" fs, x = readAudioFile(input_file) x = stereo2mono(x) mt_size, mt_step, st_win = 5, 0.5, 0.05 [mt_feats, st_feats, _] = mT(x, fs, mt_size * fs, mt_step * fs, round(fs * st_win), round(fs * st_win * 0.5)) (mt_feats_norm, MEAN, STD) = normalizeFeatures([mt_feats.T]) mt_feats_norm = mt_feats_norm[0].T # perform clustering (k = 4) n_clusters = 4 k_means = sklearn.cluster.KMeans(n_clusters=n_clusters) k_means.fit(mt_feats_norm.T) cls = k_means.labels_ segs, c = flags2segs(cls, mt_step) # convert flags to segment limits for sp in range(n_clusters): # play each cluster's segment for i in range(len(c)): if c[i] == sp and segs[i, 1] - segs[i, 0] > 5: # play long segments of current cluster (only win_to_play seconds) d = segs[i, 1] - segs[i, 0] win_to_play = 10 if win_to_play > d: win_to_play = d
    [bestParam, result_matrix, precision_classes_all, recall_classes_all,
     f1_classes_all, f1_all, ac_all] = \
        AudioClassifierManager.getResultMatrixAndBestParam(
            features, classNames, model,
            AudioClassifierManager.BEST_ACCURACY, perTrain=pT)
    print("Selected params: {0:.5f}".format(bestParam))
    AudioClassifierManager.saveConfusionMatrix(result_matrix, classNames,
                                               model_name)
    AudioClassifierManager.saveParamsFromClassification(
        classNames,
        AudioClassifierManager.getListParamsForClassifierType(model),
        model_name, precision_classes_all, recall_classes_all,
        f1_classes_all, ac_all, f1_all)
    # Feature normalization:
    (features_norm, MEAN, STD) = aT.normalizeFeatures(features)
    MEAN = MEAN.tolist()
    STD = STD.tolist()
    featuresNew = features_norm
    # Re-apply classification with normalized features and best param
    finalClassifier = AudioClassifierManager.getTrainClassifier(
        featuresNew, model, bestParam)
    # Save final model
    AudioClassifierManager.saveClassifierModel(featuresNew, model_name, model,
                                               finalClassifier, MEAN, STD,
                                               classNames, bestParam)
def featureAndTrain(list_of_dirs, mt_win, mt_step, st_win, st_step,
                    classifier_type, model_name, compute_beat=False,
                    perTrain=0.90, feats=["gfcc", "mfcc"]):
    '''
    This function is used as a wrapper to segment-based audio feature
    extraction and classifier training.
    ARGUMENTS:
        list_of_dirs:    list of paths of directories. Each directory
                         contains a single audio class whose samples
                         are stored in separate WAV files.
        mt_win, mt_step: mid-term window length and step
        st_win, st_step: short-term window and step
        classifier_type: "svm" or "knn" or "randomforest" or
                         "gradientboosting" or "extratrees"
        model_name:      name of the model to be saved
    RETURNS:
        None. Resulting classifier along with the respective model
        parameters are saved on files.
    '''
    # STEP A: Feature Extraction:
    [features, classNames, _] = aF.dirsWavFeatureExtraction(
        list_of_dirs, mt_win, mt_step, st_win, st_step,
        compute_beat=compute_beat, feats=feats)
    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return
    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]
    writeTrainDataToARFF(model_name, features, classNames, feature_names)
    for i, f in enumerate(features):
        if len(f) == 0:
            print("trainSVM_feature ERROR: " + list_of_dirs[i] +
                  " folder is empty or non-existing!")
            return
    # STEP B: classifier Evaluation and Parameter Selection:
    if classifier_type == "svm" or classifier_type == "svm_rbf":
        classifier_par = numpy.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "randomforest":
        classifier_par = numpy.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "knn":
        classifier_par = numpy.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type == "gradientboosting":
        classifier_par = numpy.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "extratrees":
        classifier_par = numpy.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "logisticregression":
        classifier_par = numpy.array([0.01, 0.1, 1, 5])
    # get optimal classifier parameter
    # (first, filter out feature vectors with NaN/Inf values):
    features2 = []
    for f in features:
        fTemp = []
        for i in range(f.shape[0]):
            temp = f[i, :]
            if (not numpy.isnan(temp).any()) and (not numpy.isinf(temp).any()):
                fTemp.append(temp.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        features2.append(numpy.array(fTemp))
    features = features2
    bestParam = evaluateclassifier(features, classNames, 300, classifier_type,
                                   classifier_par, 0, perTrain)
    print("Selected params: {0:.5f}".format(bestParam))
    C = len(classNames)
    [features_norm, MEAN, STD] = normalizeFeatures(features)  # normalize features
    MEAN = MEAN.tolist()
    STD = STD.tolist()
    featuresNew = features_norm
    # STEP C: Save the classifier to file
    if classifier_type == "svm":
        classifier = trainSVM(featuresNew, bestParam)
    elif classifier_type == "svm_rbf":
        classifier = trainSVM_RBF(featuresNew, bestParam)
    elif classifier_type == "randomforest":
        classifier = trainRandomForest(featuresNew, bestParam)
    elif classifier_type == "gradientboosting":
        classifier = trainGradientBoosting(featuresNew, bestParam)
    elif classifier_type == "extratrees":
        classifier = trainExtraTrees(featuresNew, bestParam)
    elif classifier_type == "logisticregression":
        classifier = trainLogisticRegression(featuresNew, bestParam)
    if classifier_type == "knn":
        [X, Y] = listOfFeatures2Matrix(featuresNew)
        X = X.tolist()
        Y = Y.tolist()
        fo = open(model_name, "wb")
        cPickle.dump(X, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(Y, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(bestParam, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(compute_beat, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        fo.close()
    elif classifier_type == "svm" or classifier_type == "svm_rbf" or \
            classifier_type == "randomforest" or \
            classifier_type == "gradientboosting" or \
            classifier_type == "extratrees" or \
            classifier_type == "logisticregression":
        with open(model_name, 'wb') as fid:
            cPickle.dump(classifier, fid)
        fo = open(model_name + "MEANS", "wb")
        cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(compute_beat, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        fo.close()
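# Sketch of how to load a model saved by featureAndTrain above. The "MEANS"
# file is a sequence of pickle dumps, so it must be read back in exactly the
# order it was written; "my_model" is a hypothetical model_name.
import pickle  # cPickle on Python 2

with open("my_model", "rb") as fid:
    classifier = pickle.load(fid)
with open("my_modelMEANS", "rb") as fo:
    MEAN = pickle.load(fo)
    STD = pickle.load(fo)
    classNames = pickle.load(fo)
    mt_win, mt_step = pickle.load(fo), pickle.load(fo)
    st_win, st_step = pickle.load(fo), pickle.load(fo)
    compute_beat = pickle.load(fo)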
def silenceRemoval(x, fs, st_win, st_step, smoothWindow=0.5, weight=0.5,
                   plot=False):
    '''
    Event Detection (silence removal)
    ARGUMENTS:
        - x:               the input audio signal
        - fs:              sampling freq
        - st_win, st_step: window size and step in seconds
        - smoothWindow:    (optional) smooth window (in seconds)
        - weight:          (optional) weight factor (0 < weight < 1);
                           the higher, the more strict
        - plot:            (optional) True if results are to be plotted
    RETURNS:
        - seg_limits:      list of segment limits in seconds (e.g.
                           [[0.1, 0.9], [1.4, 3.0]] means that the
                           resulting segments are (0.1 - 0.9) seconds
                           and (1.4 - 3.0) seconds
    '''
    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01
    # Step 1: feature extraction
    x = audioBasicIO.stereo2mono(x)
    st_feats, _ = aF.stFeatureExtraction(x, fs, st_win * fs, st_step * fs)
    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]
    en = numpy.sort(st_energy)
    # 10% of the total number of short-term windows
    l1 = int(len(en) / 10)
    # compute "lower" 10% energy threshold
    t1 = numpy.mean(en[0:l1]) + 0.000000000000001
    # compute "higher" 10% energy threshold
    t2 = numpy.mean(en[-l1:-1]) + 0.000000000000001
    # get all features that correspond to low energy
    class1 = st_feats[:, numpy.where(st_energy <= t1)[0]]
    # get all features that correspond to high energy
    class2 = st_feats[:, numpy.where(st_energy >= t2)[0]]
    # form the binary classification task and ...
    faets_s = [class1.T, class2.T]
    # normalize and train the respective svm probabilistic model
    # (ONSET vs SILENCE)
    [faets_s_norm, means_s, stds_s] = aT.normalizeFeatures(faets_s)
    svm = aT.trainSVM(faets_s_norm, 1.0)
    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for i in range(st_feats.shape[1]):
        # for each frame
        cur_fv = (st_feats[:, i] - means_s) / stds_s
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = numpy.array(prob_on_set)
    # smooth probability:
    prob_on_set = smoothMovingAvg(prob_on_set, smoothWindow / st_step)
    # Step 4A: detect onset frame indices:
    prog_on_set_sort = numpy.sort(prob_on_set)
    # find probability Threshold as a weighted average
    # of top 10% and lower 10% of the values
    Nt = int(prog_on_set_sort.shape[0] / 10)
    T = (numpy.mean((1 - weight) * prog_on_set_sort[0:Nt]) +
         weight * numpy.mean(prog_on_set_sort[-Nt::]))
    # get the indices of the frames that satisfy the thresholding
    max_idx = numpy.where(prob_on_set > T)[0]
    i = 0
    time_clusters = []
    seg_limits = []
    # Step 4B: group frame indices to onset segments
    while i < len(max_idx):
        # for each of the detected onset indices
        cur_cluster = [max_idx[i]]
        if i == len(max_idx) - 1:
            break
        while max_idx[i + 1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_idx[i + 1])
            i += 1
            if i == len(max_idx) - 1:
                break
        i += 1
        time_clusters.append(cur_cluster)
        seg_limits.append([cur_cluster[0] * st_step,
                           cur_cluster[-1] * st_step])
    # Step 5: Post process: remove very small segments:
    min_dur = 0.2
    seg_limits_2 = []
    for s in seg_limits:
        if s[1] - s[0] > min_dur:
            seg_limits_2.append(s)
    seg_limits = seg_limits_2
    if plot:
        timeX = numpy.arange(0, x.shape[0] / float(fs), 1.0 / fs)
        plt.subplot(2, 1, 1)
        plt.plot(timeX, x)
        plt.title('Signal')
        for s in seg_limits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.subplot(2, 1, 2)
        plt.plot(numpy.arange(0, prob_on_set.shape[0] * st_step, st_step),
                 prob_on_set)
        plt.title('svm Probability')
        for s in seg_limits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.show()
    return seg_limits
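# Hypothetical usage of silenceRemoval above; the file name and window
# parameters are illustrative only.
from pyAudioAnalysis import audioBasicIO

fs, x = audioBasicIO.readAudioFile("speech_sample.wav")
segments = silenceRemoval(x, fs, 0.020, 0.020, smoothWindow=1.0, weight=0.3)
for s in segments:
    print("event: {0:.2f}s - {1:.2f}s".format(s[0], s[1]))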
def evaluateclassifier(features, class_names, n_exp, classifier_name, Params,
                       parameterMode, perTrain=0.90):
    '''
    ARGUMENTS:
        features:        a list ([numOfClasses x 1]) whose elements contain
                         numpy matrices of features. Each matrix features[i]
                         of class i is [n_samples x numOfDimensions]
        class_names:     list of class names (strings)
        n_exp:           number of cross-validation experiments
        classifier_name: svm or knn or randomforest
        Params:          list of classifier parameters (for parameter
                         tuning during cross-validation)
        parameterMode:   0: choose parameters that lead to maximum overall
                            classification ACCURACY
                         1: choose parameters that lead to maximum overall
                            f1 MEASURE
    RETURNS:
        bestParam: the value of the input parameter that optimizes the
                   selected performance measure
    '''
    # feature normalization:
    (features_norm, MEAN, STD) = normalizeFeatures(features)
    #features_norm = features;
    n_classes = len(features)
    ac_all = []
    f1_all = []
    precision_classes_all = []
    recall_classes_all = []
    f1_classes_all = []
    cms_all = []
    # compute total number of samples:
    n_samples_total = 0
    for f in features:
        n_samples_total += f.shape[0]
    if n_samples_total > 1000 and n_exp > 50:
        n_exp = 50
        print("Number of training experiments changed to 50 "
              "due to high number of samples")
    if n_samples_total > 2000 and n_exp > 10:
        n_exp = 10
        print("Number of training experiments changed to 10 "
              "due to high number of samples")
    for Ci, C in enumerate(Params):
        # for each param value
        cm = numpy.zeros((n_classes, n_classes))
        for e in range(n_exp):
            # for each cross-validation iteration:
            print("Param = {0:.5f} - classifier Evaluation "
                  "Experiment {1:d} of {2:d}".format(C, e + 1, n_exp))
            # split features:
            f_train, f_test = randSplitFeatures(features_norm, perTrain)
            # train multi-class svms:
            if classifier_name == "svm":
                classifier = trainSVM(f_train, C)
            elif classifier_name == "svm_rbf":
                classifier = trainSVM_RBF(f_train, C)
            elif classifier_name == "knn":
                classifier = trainKNN(f_train, C)
            elif classifier_name == "randomforest":
                classifier = trainRandomForest(f_train, C)
            elif classifier_name == "gradientboosting":
                classifier = trainGradientBoosting(f_train, C)
            elif classifier_name == "extratrees":
                classifier = trainExtraTrees(f_train, C)
            elif classifier_name == "logisticregression":
                classifier = trainLogisticRegression(f_train, C)
            cmt = numpy.zeros((n_classes, n_classes))
            for c1 in range(n_classes):
                n_test_samples = len(f_test[c1])
                res = numpy.zeros((n_test_samples, 1))
                for ss in range(n_test_samples):
                    [res[ss], _] = classifierWrapperHead(classifier,
                                                         classifier_name,
                                                         f_test[c1][ss])
                for c2 in range(n_classes):
                    cmt[c1][c2] = float(len(numpy.nonzero(res == c2)[0]))
            cm = cm + cmt
        cm = cm + 0.0000000010
        rec = numpy.zeros((cm.shape[0], ))
        pre = numpy.zeros((cm.shape[0], ))
        for ci in range(cm.shape[0]):
            rec[ci] = cm[ci, ci] / numpy.sum(cm[ci, :])
            pre[ci] = cm[ci, ci] / numpy.sum(cm[:, ci])
        precision_classes_all.append(pre)
        recall_classes_all.append(rec)
        f1 = 2 * rec * pre / (rec + pre)
        f1_classes_all.append(f1)
        ac_all.append(numpy.sum(numpy.diagonal(cm)) / numpy.sum(cm))
        cms_all.append(cm)
        f1_all.append(numpy.mean(f1))
    print("\t\t", end="")
    for i, c in enumerate(class_names):
        if i == len(class_names) - 1:
            print("{0:s}\t\t".format(c), end="")
        else:
            print("{0:s}\t\t\t".format(c), end="")
    print("OVERALL")
    print("\tC", end="")
    for c in class_names:
        print("\tPRE\tREC\tf1", end="")
    print("\t{0:s}\t{1:s}".format("ACC", "f1"))
    best_ac_ind = numpy.argmax(ac_all)
    best_f1_ind = numpy.argmax(f1_all)
    for i in range(len(precision_classes_all)):
        print("\t{0:.3f}".format(Params[i]), end="")
        for c in range(len(precision_classes_all[i])):
            print("\t{0:.1f}\t{1:.1f}\t{2:.1f}".format(
                100.0 * precision_classes_all[i][c],
                100.0 * recall_classes_all[i][c],
                100.0 * f1_classes_all[i][c]), end="")
        print("\t{0:.1f}\t{1:.1f}".format(100.0 * ac_all[i],
                                          100.0 * f1_all[i]), end="")
        if i == best_f1_ind:
            print("\t best f1", end="")
        if i == best_ac_ind:
            print("\t best Acc", end="")
        print("")
    if parameterMode == 0:
        # keep parameters that maximize overall classification accuracy:
        print("Confusion Matrix:")
        printConfusionMatrix(cms_all[best_ac_ind], class_names)
        return Params[best_ac_ind]
    elif parameterMode == 1:
        # keep parameters that maximize overall f1 measure:
        print("Confusion Matrix:")
        printConfusionMatrix(cms_all[best_f1_ind], class_names)
        return Params[best_f1_ind]
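# Worked example of the per-class measures computed above, on an assumed
# 2x2 confusion matrix: recall is the diagonal over the row sums, precision
# the diagonal over the column sums, and f1 their harmonic mean.
import numpy as np

cm = np.array([[40., 10.],
               [5., 45.]])
rec = np.diag(cm) / cm.sum(axis=1)  # [0.80, 0.90]
pre = np.diag(cm) / cm.sum(axis=0)  # [0.89, 0.82]
f1 = 2 * rec * pre / (rec + pre)
acc = np.diag(cm).sum() / cm.sum()  # 0.85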
import os
import sys
import numpy
import rospy
import cPickle
from pyAudioAnalysis import audioTrainTest

if __name__ == '__main__':
    rospy.init_node("classifier_train_node")
    modelName = rospy.get_param('~classifier_name', 'modelSVM')
    features = []
    classNames = rospy.get_param('~classes', 'silence speech')
    classNames = classNames.split()
    for a in classNames:
        temp = numpy.load(os.path.dirname(os.path.realpath(sys.argv[0])) +
                          '/classifier_data/' + a + '.npy')
        features.append(temp)
    classifierParams = numpy.array([0.001, 0.01, 0.5, 1.0, 5.0])
    nExp = 50
    bestParam = audioTrainTest.evaluateClassifier(features, classNames, nExp,
                                                  "svm", classifierParams, 0,
                                                  perTrain=0.01)
    # normalize features
    [featuresNorm, MEAN, STD] = audioTrainTest.normalizeFeatures(features)
    MEAN = MEAN.tolist()
    STD = STD.tolist()
    featuresNew = featuresNorm
    Classifier = audioTrainTest.trainSVM(featuresNew, bestParam)
    Classifier.save_model(os.path.dirname(os.path.realpath(sys.argv[0])) +
                          '/classifier_data/' + modelName)
    fo = open(os.path.dirname(os.path.realpath(sys.argv[0])) +
              '/classifier_data/' + modelName + "MEANS", "wb")
    cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(0, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(0, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(0, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(0, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(0, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()
def evaluateClassifier(argv):
    save = argv[5]
    dirName = argv[2]  # path to csv files
    fileList = sorted(glob.glob(os.path.join(dirName, "*.csv")))
    #data = {}
    #data['user'] = {}
    user = []
    exercise = []
    repetition = []
    time = []
    emg_raw = []
    gt_labels = []
    feature_vectors_nofatigue = []
    feature_vectors_fatigue = []
    for file in fileList:
        with open(file, 'r') as f:
            x = f.readlines()
        if not x:
            continue
        time.append([float(label.split(',')[0]) for label in x])
        emg_raw.append([float(label.split(',')[1]) for label in x])
        gt_labels.append([int(label.split(',')[2].rstrip()) for label in x])
        # split the sample into the positive and negative classes
        feature_vectors, gtWindowLabels = featureExtraction(
            emg_raw[-1], time[-1], gt_labels[-1], 2, 1, 0.25, 0.25)
        for i, w in enumerate(gtWindowLabels):
            if w == 0:
                feature_vectors_nofatigue.append(feature_vectors[:, i])
            else:
                feature_vectors_fatigue.append(feature_vectors[:, i])
        user.append(file.split('/')[-1].split('E')[0][1:])
        exercise.append(file.split('/')[-1].split('R')[0][-1])
        repetition.append(file.split('/')[-1].split('.')[0][-1])
        if argv[-1] == '-s':
            showEMGData(emg_raw[-1], time[-1][-1] - time[-1][0],
                        gt_labels[-1])
    # Collect all features
    featuresAll = []
    featuresAll.append(np.array(feature_vectors_nofatigue))
    featuresAll.append(np.array(feature_vectors_fatigue))
    labelsAll = ['0:NoFatigue', '1:Fatigue']
    # Normalize features
    (featuresAll, MEAN, STD) = aT.normalizeFeatures(featuresAll)
    clf = argv[3][1:]
    params = argv[4]
    bestParam = aT.evaluateclassifier(featuresAll, labelsAll, 1000, clf,
                                      params, 0, perTrain=0.80)
    MEAN = MEAN.tolist()
    STD = STD.tolist()
    model = Classify(clf, featuresAll, bestParam)
    if save:
        saveClassifier(clf, bestParam, model, MEAN, STD, labelsAll)
    print('Training of', clf, 'completed')
    return clf, model, labelsAll, MEAN, STD, bestParam
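# Hypothetical invocation, following the argv layout parsed above (argv[2]:
# csv directory, argv[3]: classifier name with a leading '-', argv[4]: the
# parameter grid, argv[5]: save flag); values are illustrative only.
clf, model, labels, MEAN, STD, bestParam = evaluateClassifier(
    ['script.py', '', 'emg_data/', '-svm', [0.5, 1.0, 5.0], True])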
def visualizeFeaturesFolder(folder, dimReductionMethod, priorKnowledge="none"):
    '''
    This function generates a chord-diagram visualization for the recordings
    of the provided path.
    ARGUMENTS:
        - folder:             path of the folder that contains the WAV
                              files to be processed
        - dimReductionMethod: method used to reduce the dimension of the
                              initial feature space before computing the
                              similarity.
        - priorKnowledge:     if this is set equal to "artist", the artist
                              name (the filename prefix before " --- ") is
                              used as the label for the LDA-based reduction
    '''
    if dimReductionMethod == "pca":
        allMtFeatures, wavFilesList, _ = aF.dirWavFeatureExtraction(
            folder, 30.0, 30.0, 0.050, 0.050, compute_beat=True)
        if allMtFeatures.shape[0] == 0:
            print("Error: No data found! Check input folder")
            return
        namesCategoryToVisualize = [
            ntpath.basename(w).replace('.wav', '').split(" --- ")[0]
            for w in wavFilesList
        ]
        namesToVisualize = [
            ntpath.basename(w).replace('.wav', '') for w in wavFilesList
        ]
        (F, MEAN, STD) = aT.normalizeFeatures([allMtFeatures])
        F = np.concatenate(F)
        # check that the new PCA dimension is at most equal
        # to the number of samples
        K1 = 2
        K2 = 10
        if K1 > F.shape[0]:
            K1 = F.shape[0]
        if K2 > F.shape[0]:
            K2 = F.shape[0]
        pca1 = sklearn.decomposition.PCA(n_components=K1)
        pca1.fit(F)
        pca2 = sklearn.decomposition.PCA(n_components=K2)
        pca2.fit(F)
        finalDims = pca1.transform(F)
        finalDims2 = pca2.transform(F)
    else:
        # long-term statistics cannot be applied in this context
        # (LDA needs mid-term features)
        allMtFeatures, Ys, wavFilesList = \
            aF.dirWavFeatureExtractionNoAveraging(folder, 20.0, 5.0,
                                                  0.040, 0.040)
        if allMtFeatures.shape[0] == 0:
            print("Error: No data found! Check input folder")
            return
        namesCategoryToVisualize = [
            ntpath.basename(w).replace('.wav', '').split(" --- ")[0]
            for w in wavFilesList
        ]
        namesToVisualize = [
            ntpath.basename(w).replace('.wav', '') for w in wavFilesList
        ]
        ldaLabels = Ys
        if priorKnowledge == "artist":
            uNamesCategoryToVisualize = list(set(namesCategoryToVisualize))
            YsNew = np.zeros(Ys.shape)
            for i, uname in enumerate(uNamesCategoryToVisualize):
                # for each unique artist name:
                indicesUCategories = [
                    j for j, x in enumerate(namesCategoryToVisualize)
                    if x == uname
                ]
                for j in indicesUCategories:
                    indices = np.nonzero(Ys == j)
                    YsNew[indices] = i
            ldaLabels = YsNew
        (F, MEAN, STD) = aT.normalizeFeatures([allMtFeatures])
        F = np.array(F[0])
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=10)
        clf.fit(F, ldaLabels)
        reducedDims = clf.transform(F)
        pca = sklearn.decomposition.PCA(n_components=2)
        pca.fit(reducedDims)
        reducedDims = pca.transform(reducedDims)
        # TODO: check this... should LDA be used in the semi-supervised
        # case only?
        # uLabels must have as many labels as the number of wavFilesList
        # elements
        uLabels = np.sort(np.unique((Ys)))
        reducedDimsAvg = np.zeros((uLabels.shape[0], reducedDims.shape[1]))
        finalDims = np.zeros((uLabels.shape[0], 2))
        for i, u in enumerate(uLabels):
            indices = [j for j, x in enumerate(Ys) if x == u]
            f = reducedDims[indices, :]
            finalDims[i, :] = f.mean(axis=0)
        finalDims2 = reducedDims
    for i in range(finalDims.shape[0]):
        plt.text(finalDims[i, 0], finalDims[i, 1],
                 ntpath.basename(wavFilesList[i].replace('.wav', '')),
                 horizontalalignment='center',
                 verticalalignment='center', fontsize=10)
        plt.plot(finalDims[i, 0], finalDims[i, 1], '*r')
    plt.xlim([1.2 * finalDims[:, 0].min(), 1.2 * finalDims[:, 0].max()])
    plt.ylim([1.2 * finalDims[:, 1].min(), 1.2 * finalDims[:, 1].max()])
    plt.show()
    SM = 1.0 - distance.squareform(distance.pdist(finalDims2, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0
    chordialDiagram("visualization", SM, 0.50, namesToVisualize,
                    namesCategoryToVisualize)
    SM = 1.0 - distance.squareform(distance.pdist(F, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0
    chordialDiagram("visualizationInitial", SM, 0.50, namesToVisualize,
                    namesCategoryToVisualize)
    # plot super-categories (i.e. artist names)
    uNamesCategoryToVisualize = sorted(list(set(namesCategoryToVisualize)))
    finalDimsGroup = np.zeros((len(uNamesCategoryToVisualize),
                               finalDims2.shape[1]))
    for i, uname in enumerate(uNamesCategoryToVisualize):
        indices = [
            j for j, x in enumerate(namesCategoryToVisualize) if x == uname
        ]
        f = finalDims2[indices, :]
        finalDimsGroup[i, :] = f.mean(axis=0)
    SMgroup = 1.0 - distance.squareform(
        distance.pdist(finalDimsGroup, 'cosine'))
    for i in range(SMgroup.shape[0]):
        SMgroup[i, i] = 0.0
    chordialDiagram("visualizationGroup", SMgroup, 0.50,
                    uNamesCategoryToVisualize, uNamesCategoryToVisualize)
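# Standalone sketch of the similarity matrix fed to chordialDiagram above:
# pairwise cosine similarity with the self-similarity diagonal zeroed, so
# no node is linked to itself. The data here is random, for illustration.
import numpy as np
from scipy.spatial import distance

X = np.random.rand(5, 10)  # e.g. 5 recordings x 10 reduced dimensions
SM = 1.0 - distance.squareform(distance.pdist(X, 'cosine'))
np.fill_diagonal(SM, 0.0)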
def silenceRemoval(x, Fs, stWin, stStep, smoothWindow=0.5, Weight=0.5,
                   plot=False):
    '''
    Event Detection (silence removal)
    ARGUMENTS:
        - x:             the input audio signal
        - Fs:            sampling freq
        - stWin, stStep: window size and step in seconds
        - smoothWindow:  (optional) smooth window (in seconds)
        - Weight:        (optional) weight factor (0 < Weight < 1);
                         the higher, the more strict
        - plot:          (optional) True if results are to be plotted
    RETURNS:
        - segmentLimits: list of segment limits in seconds (e.g.
                         [[0.1, 0.9], [1.4, 3.0]] means that the resulting
                         segments are (0.1 - 0.9) seconds
                         and (1.4 - 3.0) seconds
    '''
    if Weight >= 1:
        Weight = 0.99
    if Weight <= 0:
        Weight = 0.01
    # Step 1: feature extraction
    x = audioBasicIO.stereo2mono(x)  # convert to mono
    ShortTermFeatures = aF.stFeatureExtraction(
        x, Fs, stWin * Fs, stStep * Fs)  # extract short-term features
    # Step 2: train binary SVM classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    EnergySt = ShortTermFeatures[1, :]
    E = numpy.sort(EnergySt)  # sort the energy feature values
    L1 = int(len(E) / 10)  # 10% of the total number of short-term windows
    # compute "lower" 10% energy threshold
    T1 = numpy.mean(E[0:L1]) + 0.000000000000001
    # compute "higher" 10% energy threshold
    T2 = numpy.mean(E[-L1:-1]) + 0.000000000000001
    # get all features that correspond to low energy
    Class1 = ShortTermFeatures[:, numpy.where(EnergySt <= T1)[0]]
    # get all features that correspond to high energy
    Class2 = ShortTermFeatures[:, numpy.where(EnergySt >= T2)[0]]
    # form the binary classification task and ...
    featuresSS = [Class1.T, Class2.T]
    # normalize and ...
    [featuresNormSS, MEANSS, STDSS] = aT.normalizeFeatures(featuresSS)
    # train the respective SVM probabilistic model (ONSET vs SILENCE)
    SVM = aT.trainSVM(featuresNormSS, 1.0)
    # Step 3: compute onset probability based on the trained SVM
    ProbOnset = []
    for i in range(ShortTermFeatures.shape[1]):  # for each frame
        # normalize feature vector
        curFV = (ShortTermFeatures[:, i] - MEANSS) / STDSS
        # get SVM probability (that it belongs to the ONSET class)
        ProbOnset.append(SVM.predict_proba(curFV.reshape(1, -1))[0][1])
    ProbOnset = numpy.array(ProbOnset)
    # smooth probability
    ProbOnset = smoothMovingAvg(ProbOnset, smoothWindow / stStep)
    # Step 4A: detect onset frame indices:
    # find probability Threshold as a weighted average of top 10%
    # and lower 10% of the values
    ProbOnsetSorted = numpy.sort(ProbOnset)
    Nt = int(ProbOnsetSorted.shape[0] / 10)
    T = (numpy.mean((1 - Weight) * ProbOnsetSorted[0:Nt]) +
         Weight * numpy.mean(ProbOnsetSorted[-Nt::]))
    # get the indices of the frames that satisfy the thresholding
    MaxIdx = numpy.where(ProbOnset > T)[0]
    i = 0
    timeClusters = []
    segmentLimits = []
    # Step 4B: group frame indices to onset segments
    while i < len(MaxIdx):  # for each of the detected onset indices
        curCluster = [MaxIdx[i]]
        if i == len(MaxIdx) - 1:
            break
        while MaxIdx[i + 1] - curCluster[-1] <= 2:
            curCluster.append(MaxIdx[i + 1])
            i += 1
            if i == len(MaxIdx) - 1:
                break
        i += 1
        timeClusters.append(curCluster)
        segmentLimits.append([curCluster[0] * stStep,
                              curCluster[-1] * stStep])
    # Step 5: Post process: remove very small segments:
    minDuration = 0.2
    segmentLimits2 = []
    for s in segmentLimits:
        if s[1] - s[0] > minDuration:
            segmentLimits2.append(s)
    segmentLimits = segmentLimits2
    if plot:
        timeX = numpy.arange(0, x.shape[0] / float(Fs), 1.0 / Fs)
        plt.subplot(2, 1, 1)
        plt.plot(timeX, x)
        plt.title('Signal')
        for s in segmentLimits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.subplot(2, 1, 2)
        plt.plot(numpy.arange(0, ProbOnset.shape[0] * stStep, stStep),
                 ProbOnset)
        plt.title('SVM Probability')
        for s in segmentLimits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.show()
    return segmentLimits
def speakerDiarization(filename, n_speakers, mt_size=2.0, mt_step=0.2,
                       st_win=0.05, lda_dim=35, plot_res=False):
    '''
    ARGUMENTS:
        - filename:   the name of the WAV file to be analyzed
        - n_speakers: the number of speakers (clusters) in
                      the recording (<=0 for unknown)
        - mt_size:    (opt) mid-term window size
        - mt_step:    (opt) mid-term window step
        - st_win:     (opt) short-term window size
        - lda_dim:    (opt) LDA dimension (0 for no LDA)
        - plot_res:   (opt) 0 for not plotting the results, 1 for plotting
    '''
    [fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / fs
    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
     stStep1, computeBEAT1] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data", "knnSpeakerAll"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
     stStep2, computeBEAT2] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data", "knnSpeakerFemaleMale"))
    [mt_feats, st_feats, _] = aF.mtFeatureExtraction(
        x, fs, mt_size * fs, mt_step * fs, round(fs * st_win),
        round(fs * st_win * 0.5))
    MidTermFeatures2 = numpy.zeros((mt_feats.shape[0] + len(classNames1) +
                                    len(classNames2), mt_feats.shape[1]))
    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = \
            P2 + 0.0001
    mt_feats = MidTermFeatures2  # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                       41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
    mt_feats = mt_feats[iFeaturesSelect, :]
    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]
    # remove outliers:
    dist_all = numpy.sum(distance.squareform(
        distance.pdist(mt_feats_norm.T)), axis=0)
    m_dist_all = numpy.mean(dist_all)
    i_non_outliers = numpy.nonzero(dist_all < 1.2 * m_dist_all)[0]
    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(mt_feats[1,:])
    #EnergyMean = numpy.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = numpy.nonzero(mt_feats[1,:] > Thres)[0]
    #print i_non_outliers
    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]
    # LDA dimensionality reduction:
    if lda_dim > 0:
        #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs,
        #    mt_size * fs, st_win * fs, round(fs*st_win), round(fs*st_win))
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        #for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])
        # for each of the short-term features:
        for i in range(num_of_features):
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(numpy.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].append(
                    numpy.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = numpy.array(mt_feats_to_red)
        mt_feats_to_red_2 = numpy.zeros((mt_feats_to_red.shape[0] +
                                         len(classNames1) + len(classNames2),
                                         mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], i] = \
                mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:
                              mt_feats_to_red.shape[0] + len(classNames1),
                              i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::,
                              i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += numpy.random.rand(mt_feats_to_red.shape[0],
        #    mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN, STD) = \
            aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = numpy.sum(distance.squareform(
        #    distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = numpy.mean(dist_all)
        #iNonOutLiers2 = numpy.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = numpy.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T
    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []
    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_
        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = numpy.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(numpy.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = \
                            numpy.nonzero(cls == c2)[0].shape[0] / \
                            float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) *
                                     (clust_per_cent + clust_per_cent_2) / 2.0)
                silBs = numpy.array(silBs)
                # ... and keep the minimum value (i.e. the distance
                # from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = numpy.array(sil_1)
        sil_2 = numpy.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(numpy.mean(sil))
    imax = numpy.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]
    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = numpy.zeros((n_wins, ))
    for i in range(n_wins):
        j = numpy.argmin(numpy.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]
    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)
    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)
    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]
    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground-truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)
    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(numpy.array(range(len(cls))) * mt_step + mt_step / 2.0, cls)
    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(numpy.array(range(len(flags_gt))) * mt_step +
                     mt_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        #print s_range, sil_all
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls
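# The cluster-selection loop above uses a hand-rolled, cluster-level variant
# of the silhouette; for comparison, a sketch of the same model selection
# with scikit-learn's per-sample silhouette (random data here, not the
# function's actual features):
import numpy as np
import sklearn.cluster
from sklearn.metrics import silhouette_score

X = np.random.rand(100, 26)  # stand-in for mt_feats_norm.T
s_range = range(2, 10)
sil_all = [silhouette_score(
    X, sklearn.cluster.KMeans(n_clusters=n).fit_predict(X))
    for n in s_range]
n_best = list(s_range)[int(np.argmax(sil_all))]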
def speakerDiarization(fileName, sRange=xrange(2, 10), mtSize=2.0,
                       mtStep=0.2, stWin=0.05, LDAdim=35):
    Fs, x = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / Fs
    Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, \
        stStep1, computeBEAT1 = aT.loadKNNModel(os.path.join(
            '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data',
            'knnSpeakerAll'))
    Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, \
        stStep2, computeBEAT2 = aT.loadKNNModel(os.path.join(
            '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data',
            'knnSpeakerFemaleMale'))
    MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction(
        x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin),
        round(Fs * stWin * 0.5))
    MidTermFeatures2 = numpy.zeros((MidTermFeatures.shape[0] +
                                    len(classNames1) + len(classNames2),
                                    MidTermFeatures.shape[1]))
    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
        Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = \
            MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1):, i] = \
            P2 + 0.0001
    MidTermFeatures = MidTermFeatures2
    iFeaturesSelect = range(8, 21) + range(41, 54)
    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]
    MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]
    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]
    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / \
        numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]
    if LDAdim > 0:
        mtWinRatio, mtStepRatio, mtFeaturesToReduce, numOfFeatures, \
            numOfStatistics = int(round(mtSize / stWin)), \
            int(round(stWin / stWin)), list(), len(ShortTermFeatures), 2
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append(list())
        for i in range(numOfFeatures):
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1, N2 = curPos, curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(
                    numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros((mtFeaturesToReduce.shape[0] +
                                           len(classNames1) +
                                           len(classNames2),
                                           mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
            Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = \
                mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:
                                mtFeaturesToReduce.shape[0] +
                                len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] +
                                len(classNames1):, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        mtFeaturesToReduce, MEAN, STD = \
            aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T
    clsAll, silAll, centersAll = list(), list(), list()
    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_
        clsAll.append(cls)
        centersAll.append(means)
        silA, silB = list(), list()
        for c in range(iSpeakers):
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clusterPerCent < 0.02:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = list()
                for c2 in range(iSpeakers):
                    if c2 != c:
                        clusterPerCent2 = \
                            numpy.nonzero(cls == c2)[0].shape[0] / \
                            float(len(cls))
                        MidTermFeaturesNormTemp2 = \
                            MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) *
                                     (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))
        silA, silB, sil = numpy.array(silA), numpy.array(silB), list()
        for c in range(iSpeakers):
            sil.append((silB[c] - silA[c]) /
                       (max(silB[c], silA[c]) + 0.00001))
        silAll.append(numpy.mean(sil))
    imax = numpy.argmax(silAll)
    nSpeakersFinal = sRange[imax]
    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]
    startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls)
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag')
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov
    cls = hmm.predict(MidTermFeaturesNormOr.T)
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)
    sil = silAll[imax]
    classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)]
    return cls, classNames, duration, mtStep, silAll
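# Hypothetical call of this Python 2 variant (the kNN speaker-model paths
# are hard-coded inside the function, so they must exist on disk):
cls, classNames, duration, mtStep, silAll = speakerDiarization('meeting.wav')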