def testClassifiers(dir_clf, dir_test, species, feature, clf=None, pca=False):
    """
    Load previously trained classifiers and run them on a separate data set.

    The test data is read from ``<dir_test>/<species>_<feature>.tsv``
    (tab separated, no header row); the last column is taken as the target
    and every other column as a feature. Saved models are read from
    ``<dir_clf>/<species>_<feature>_<name>.joblib``.

    :param dir_clf: path to the saved classifiers
    :param dir_test: path to the test dataset
    :param species: species name
    :param feature: 'WEraw_all', 'WEraw_band', 'WEraw_spnodes' ...
    :param clf: classifier name e.g. 'SVM'; when None, the MLP, kNN, SVM,
                GP, DT, RF, Boost and XGB models are all tested in turn
    :param pca: when True, project the features with PCA before testing
    :return: None; results are reported by ``Learning.performTest``
             (a confusion matrix, per the original documentation)
    """
    # Read the test dataset and split it into features and targets.
    table = pd.read_csv(os.path.join(dir_test, species + '_' + feature + '.tsv'), sep="\t", header=None)
    data = table.values
    targets = data[:, -1]
    data = data[:, 0:-1]

    if pca:
        # A float n_components keeps as many components as are needed to
        # explain that fraction of the variance, i.e. 80% here.
        # NOTE(review): this fits a *new* PCA on the test set, so the
        # projection will generally differ from the one the saved models were
        # trained on. The training-time PCA is not persisted anywhere visible
        # here - confirm and consider saving/reusing it.
        pca1 = PCA(n_components=0.8)
        data = pca1.fit_transform(data)

    # testFraction=1 hands every row to the test split (all nodes/columns are
    # used; restricting to species-specific nodes would be done by slicing
    # the columns of `data` before this call).
    learners = Learning(data, targets, testFraction=1)

    def run_saved_model(tag):
        # Load one stored classifier by its file tag and evaluate it.
        model = load(os.path.join(dir_clf, species + '_' + feature + '_' + tag + '.joblib'))
        learners.performTest(model)

    if clf is None:
        rule = "--------------------------------"
        # (file tag, banner label) in the order the models are reported.
        saved_models = (
            ('MLP', 'MLP'),
            ('kNN', 'kNN'),
            ('SVM', 'SVM'),
            ('GP', 'GP'),
            ('DT', 'DT'),
            ('RF', 'RF'),
            ('Boost', 'Boosting'),
            ('XGB', 'XGB'),
        )
        for tag, label in saved_models:
            print(label + rule)
            run_saved_model(tag)
        print("######################################################")
    else:
        run_saved_model(clf)
def TrainClassifier(dir, species, feature, clf=None, pca=False):
    """
    Use wavelet energy/MFCC as features, train, and save the classifiers for later use.

    Recommended to use fit_GridSearchCV and plot validation/learning curves to
    determine hyper-parameter values and see how learning improves with more
    data, at what point it gets stable.

    Choose what features to show to the classifier. Currently lots of
    variations of WE and MFCC.
    (1) Wavelet Energies - All 62 nodes, extracted from raw recordings (feature = 'weraw_all')
    (2) Wavelet Energies - Limit nodes to match frequency range of the species, extracted from raw recordings
    (3) Wavelet Energies - Limit to optimum nodes for species, extracted from raw recordings
    (4) Wavelet Energies - All 62 nodes, extracted with bandpass filter
    (5) Wavelet Energies - Limit nodes to match frequency range of the species, extracted with bandpass filter
    (6) Wavelet Energies - Limit to optimum nodes for species, extracted with bandpass filter
    (7) Wavelet Energies - All 62 nodes, extracted from denoised
    (8) Wavelet Energies - Limit nodes to match frequency range of the species, extracted from denoised
    (9) Wavelet Energies - Limit to optimum nodes for species, extracted from denoised
    (10) Wavelet Energies - All 62 nodes, extracted from denoised + bandpassed
    (11) Wavelet Energies - Limit nodes to match frequency range of the species, extracted from denoised + bandpassed
    (12) Wavelet Energies - Limit to optimum nodes for species, extracted from denoised + bandpassed
    (13) MFCC - Full range extracted from raw ('mfccraw_all')
    (14) MFCC - Limit to match frequency range of the species extracted from raw ('mfccraw_band')
    (15) MFCC - Full range extracted from bandpassed ('mfccbp_all')
    (16) MFCC - Limit to match frequency range of the species extracted from bandpassed
    (17) MFCC - Full range extracted from denoised
    (18) MFCC - Limit to match frequency range of the species extracted from denoised
    (19) MFCC - Full range extracted from bandpassed + denoised
    (20) MFCC - Limit to match frequency range of the species extracted from bandpassed + denoised

    The data is read from ``<dir>/<species>_<feature>.tsv`` (tab separated,
    no header row, target in the last column) and balanced by randomly
    sampling the same number of rows with target 1 and target 0.

    :param dir: path to the dataset; trained models are written here too
    :param species: species name so that the classifier can be saved accordingly
    :param feature: 'WEraw_all', 'WEraw_band', 'WEraw_spnodes', 'WEbp_all', 'WEbp_band', 'WEbp_spnodes',
                    'WEd_all', 'WEd_band', 'WEd_spnodes', 'WEbpd_all', 'WEbpd_band', 'WEbpd_spnodes',
                    'MFCCraw_all', 'mfccraw_band', 'MFCCbp_all', 'mfccbp_band', 'MFCCd_all', 'MFCCd_band',
                    'MFCCbpd_all', 'MFCCbpd_band'
    :param clf: name of the classifier to train: 'MLP', 'kNN', 'SVM', 'GP',
                'DT', 'RF', 'Boost', 'XGB' or 'GMM'. When None, all of them
                except GMM are trained, saved and tested (expensive option).
    :param pca: when True, project the features with PCA before training
    :return: None; each trained classifier is saved as
             ``<dir>/<species>_<feature>_<clf>.joblib``
    :raises ValueError: if ``clf`` is not None and not a known classifier name
    """
    rule = "--------------------------------"

    # Trainers take the Learning instance as argument so the tables can be
    # built (and `clf` validated) before any data is loaded.
    # Single-classifier mode: name -> (banner label, trainer).
    single_trainers = {
        'MLP': ('MLP', lambda lrn: lrn.trainMLP(structure=(250, ), learningrate=0.001, solver='adam',
                                                epochs=200, alpha=1, shuffle=True, early_stopping=True)),
        'kNN': ('kNN', lambda lrn: lrn.trainKNN(K=3)),
        'SVM': ('SVM', lambda lrn: lrn.trainSVM(kernel="rbf", C=1, gamma=0.00018)),
        'GP': ('GP', lambda lrn: lrn.trainGP()),
        'DT': ('DT', lambda lrn: lrn.trainDecisionTree()),
        'RF': ('RF', lambda lrn: lrn.trainRandomForest()),
        'Boost': ('Boosting', lambda lrn: lrn.trainBoosting()),
        'XGB': ('XGB', lambda lrn: lrn.trainXGBoost()),
        'GMM': ('GMM', lambda lrn: lrn.trainGMM(covType='full', maxIts=200, nClasses=4)),
    }
    # Train-everything mode: (file tag, banner label, trainer), in run order.
    # NOTE(review): MLP and SVM use different hyper-parameters here than in
    # single-classifier mode; kept exactly as found - confirm this is intended.
    batch_trainers = (
        ('MLP', 'MLP', lambda lrn: lrn.trainMLP(structure=(25, ), learningrate=0.001, solver='adam',
                                                epochs=200, alpha=1, shuffle=True, early_stopping=False)),
        ('kNN', 'kNN', lambda lrn: lrn.trainKNN(K=3)),
        ('SVM', 'SVM', lambda lrn: lrn.trainSVM(kernel="rbf", C=1, gamma=0.03)),
        ('GP', 'GP', lambda lrn: lrn.trainGP()),
        ('DT', 'DT', lambda lrn: lrn.trainDecisionTree()),
        ('RF', 'RF', lambda lrn: lrn.trainRandomForest()),
        ('Boost', 'Boosting', lambda lrn: lrn.trainBoosting()),
        ('XGB', 'XGB', lambda lrn: lrn.trainXGBoost()),
    )

    # Fail fast on a misspelt classifier name instead of loading and
    # balancing the data only to train nothing.
    if clf is not None and clf not in single_trainers:
        raise ValueError("Unknown classifier %r; expected one of %s or None"
                         % (clf, sorted(single_trainers)))

    # Read previously stored data as required.
    table = pd.read_csv(os.path.join(dir, species + '_' + feature + '.tsv'), sep="\t", header=None)
    data = table.values
    targets = data[:, -1]
    data = data[:, 0:-1]

    # Balance the data set: randomly keep n rows of each class, where n is
    # the size of the smaller class.
    pos_rows = np.where(targets == 1)[0].tolist()
    neg_rows = np.where(targets == 0)[0].tolist()
    n = min(len(pos_rows), len(neg_rows))
    pos_rows = random.sample(pos_rows, n)
    neg_rows = random.sample(neg_rows, n)
    inds = pos_rows + neg_rows
    data = data[inds, :]
    targets = targets[inds]

    if pca:
        # A float n_components keeps as many components as are needed to
        # explain that fraction of the variance, i.e. 80% here.
        # NOTE(review): the fitted PCA is not saved, so the same projection
        # cannot be re-applied to new data at test time - consider persisting it.
        pca1 = PCA(n_components=0.8)
        data = pca1.fit_transform(data)

    # testFraction=0.5 is passed to Learning; presumably half of the balanced
    # rows are held out for performTest - verify against Learning.
    learners = Learning(data, targets, testFraction=0.5)

    def model_path(tag):
        # Location of the saved model for one classifier tag.
        return os.path.join(dir, species + '_' + feature + '_' + tag + '.joblib')

    if clf is None:
        # Train, save and test every classifier (expensive option).
        for tag, label, train in batch_trainers:
            print(label + rule)
            model = train(learners)
            dump(model, model_path(tag))
            learners.performTest(model)
        print("######################################################")
    else:
        # Train and save only the requested classifier (no test run here).
        label, train = single_trainers[clf]
        print(label + rule)
        model = train(learners)
        dump(model, model_path(clf))