def train_svm(paths, mid_window, mid_step, short_window, short_step,
              model_name, compute_beat=False, train_percentage=0.90):
    """
    This function is used as a wrapper to segment-based audio
    feature extraction and SVM classifier training.
    ARGUMENTS:
        paths:                      list of paths of directories, passed
                                    as-is to the feature extractor
        mid_window, mid_step:       mid-term window length and step
        short_window, short_step:   short-term window and step
        model_name:                 name of the model to be saved
        compute_beat:               forwarded to the feature extractor
        train_percentage:           forwarded to aT.evaluate_classifier
    RETURNS:
        None. The extracted features are written through
        write_modearff_file and the trained classifier, together with the
        normalization vectors and extraction parameters, through
        write_mode_file. Nothing is written when no features were extracted.
    """
    # STEP A: Feature Extraction:
    features, class_names, _ = aF.multiple_directory_feature_extraction(
        paths, mid_window, mid_step, short_window, short_step,
        compute_beat=compute_beat)

    # Guard before touching features[0]: an empty result would otherwise
    # surface as a bare IndexError two lines below.
    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]
    write_modearff_file(model_name, features, class_names, feature_names)

    # STEP B: Classifier evaluation and parameter selection.
    # Drop every feature vector that contains a NaN or an infinity before
    # the data reaches the evaluation / training routines.
    temp_features = []
    for feat in features:
        temp = []
        for i in range(feat.shape[0]):
            temp_fv = feat[i, :]
            if (not np.isnan(temp_fv).any()) and \
                    (not np.isinf(temp_fv).any()):
                temp.append(temp_fv.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        temp_features.append(np.array(temp))
    features = temp_features

    # Get the optimal classifier parameter:
    best_param = aT.evaluate_classifier(features, class_names, 100, "svm",
                                        step_params.container.get_svm(),
                                        0, train_percentage)
    print("Selected params: {0:.5f}".format(best_param))

    features_norm, mean, std = normalize_features(features)
    mean = mean.tolist()
    std = std.tolist()

    # STEP C: Save the classifier to file
    classifier = aT.train_svm(features_norm, best_param)
    write_mode_file(model_name, classifier, mean, std, class_names,
                    mid_window, mid_step, short_window, short_step,
                    compute_beat)
def extract_features(music_parent_directory):
    """
    Run mid-term feature extraction over the entries of a parent folder.
    ARGUMENTS:
        music_parent_directory: path whose directory entries are each
                                handed to the extractor as one input
                                directory ('.DS_Store' is left out)
    RETURNS:
        Whatever aF.multiple_directory_feature_extraction returns for
        those directories, using 1.0 for both mid-term window and step and
        aT.shortTermWindow / aT.shortTermStep for the short-term ones.
    """
    # os.listdir never yields the same name twice, so filtering is
    # equivalent to removing the single '.DS_Store' entry.
    class_dirs = [
        music_parent_directory + '/' + entry
        for entry in os.listdir(music_parent_directory)
        if entry != '.DS_Store'
    ]
    # NOTE(review): the trailing positional True is presumably
    # compute_beat - confirm against the extractor's signature.
    return aF.multiple_directory_feature_extraction(
        class_dirs, 1.0, 1.0, aT.shortTermWindow, aT.shortTermStep, True)
def feature_extraction_train_regression(folder_name, mid_window, mid_step,
                                        short_window, short_step, model_type,
                                        model_name, compute_beat=False):
    """
    This function is used as a wrapper to segment-based audio
    feature extraction and regression model training.
    ARGUMENTS:
        folder_name:                path of directory containing the WAV
                                    files and Regression CSVs
        mid_window, mid_step:       mid-term window length and step
        short_window, short_step:   short-term window and step
        model_type:                 "svm" or "svm_rbf" or "randomforest"
        model_name:                 name of the model to be saved
        compute_beat:               forwarded to the feature extractor
    RETURNS:
        (errors, errors_base, best_params): three lists with one entry per
        regression CSV found in folder_name, holding the values returned by
        evaluate_regression for that task.
        None if the feature extractor produced no data.
        For every CSV "<task>.csv" the trained model is pickled to
        model_name + "_" + <task> and its parameters are saved through
        save_parameters to model_name + "_" + <task> + "MEANS".
    RAISES:
        ValueError: if model_type is not one of the supported values.
    """
    # Choose the parameter grid first, so that an unsupported model type
    # fails with a clear message before the feature extraction is run
    # (it used to surface later as an unbound-local error).
    if model_type == "svm" or model_type == "svm_rbf":
        model_params = np.array(
            [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 5.0, 10.0])
    elif model_type == "randomforest":
        model_params = np.array([5, 10, 25, 50, 100])
    else:
        raise ValueError(
            "Unsupported model_type {!r}: expected 'svm', 'svm_rbf' "
            "or 'randomforest'".format(model_type))

    # STEP A: Feature Extraction:
    features, _, filenames = \
        aF.multiple_directory_feature_extraction([folder_name], mid_window,
                                                 mid_step, short_window,
                                                 short_step,
                                                 compute_beat=compute_beat)

    # The emptiness check must come before features[0] is taken and before
    # any row of it is indexed, otherwise it can never be reached.
    if len(features) == 0 or len(features[0]) == 0:
        print("ERROR: No data found in any input folder!")
        return

    features = features[0]
    filenames = [ntpath.basename(f) for f in filenames[0]]

    # Map each base name to its first row index (same row list.index would
    # pick) so every CSV row is resolved with a single lookup.
    row_of_file = {}
    for row_index, name in enumerate(filenames):
        row_of_file.setdefault(name, row_index)

    # Read CSVs:
    csv_files = glob.glob(folder_name + os.sep + "*.csv")
    regression_labels = []
    regression_names = []
    f_final = []
    for c in csv_files:
        cur_regression_labels = []
        f_temp = []
        # open the csv file that contains the current
        # target value's annotations
        with open(c, 'rt') as csvfile:
            csv_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            for row in csv_reader:
                if len(row) != 2:
                    print(
                        "Warning: Row with unknown format in regression file")
                    continue
                # ... and only use the row if its filename exists
                # in the list of extracted files
                if row[0] in row_of_file:
                    cur_regression_labels.append(float(row[1]))
                    f_temp.append(features[row_of_file[row[0]], :])
                else:
                    print("Warning: {} not found "
                          "in list of files.".format(row[0]))
        f_final.append(np.array(f_temp))
        # cur_regression_labels is the list of values
        # for the current regression problem
        regression_labels.append(np.array(cur_regression_labels))
        # regression task name
        regression_names.append(ntpath.basename(c).replace(".csv", ""))

    # TODO: ARFF write?

    # STEP B: Model evaluation and parameter selection:
    errors = []
    errors_base = []
    best_params = []
    for r, task_features, task_labels in zip(regression_names, f_final,
                                             regression_labels):
        # get optimal model parameter:
        print("Regression task " + r)
        bestParam, error, berror = evaluate_regression(
            task_features, task_labels, 100, model_type, model_params)
        errors.append(error)
        errors_base.append(berror)
        best_params.append(bestParam)
        print("Selected params: {0:.5f}".format(bestParam))

        features_norm, mean, std = normalize_features([task_features])

        # STEP C: Train the final model and save it to file
        if model_type == "svm":
            classifier, _ = train_svm_regression(
                features_norm[0], task_labels, bestParam)
        elif model_type == "svm_rbf":
            classifier, _ = train_svm_regression(
                features_norm[0], task_labels, bestParam, kernel='rbf')
        else:  # "randomforest" (model_type was validated above)
            classifier, _ = train_random_forest_regression(
                features_norm[0], task_labels, bestParam)

        with open(model_name + "_" + r, 'wb') as fid:
            cPickle.dump(classifier, fid)
        save_path = model_name + "_" + r + "MEANS"
        save_parameters(save_path, mean, std, mid_window, mid_step,
                        short_window, short_step, compute_beat)

    return errors, errors_base, best_params
def extract_features_and_train(paths, mid_window, mid_step, short_window,
                               short_step, classifier_type, model_name,
                               compute_beat=False, train_percentage=0.90):
    """
    Wrapper around segment-based audio feature extraction followed by
    classifier parameter selection, training and saving.
    ARGUMENTS:
        paths:                      list of paths of directories. Each
                                    directory contains a single audio
                                    class whose samples are stored in
                                    separate WAV files.
        mid_window, mid_step:       mid-term window length and step
        short_window, short_step:   short-term window and step
        classifier_type:            "svm" or "svm_rbf" or "knn" or
                                    "randomforest" or "gradientboosting"
                                    or "extratrees"
        model_name:                 name of the model to be saved
        compute_beat:               forwarded to the feature extractor
        train_percentage:           forwarded to evaluate_classifier
    RETURNS:
        None. Resulting classifier along with the respective model
        parameters are saved on files.
    """
    # STEP A: Feature Extraction:
    extracted = aF.multiple_directory_feature_extraction(
        paths, mid_window, mid_step, short_window, short_step,
        compute_beat=compute_beat)
    features, class_names, _ = extracted

    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features{}".format(k)
                     for k in range(1, n_feats + 1)]
    write_train_data_arff(model_name, features, class_names, feature_names)

    for class_idx, class_feats in enumerate(features):
        if len(class_feats) == 0:
            print("trainSVM_feature ERROR: " + paths[class_idx] +
                  " folder is empty or non-existing!")
            return

    # STEP B: Classifier evaluation and parameter selection.
    # The three tree-ensemble types share one grid of candidate values.
    if classifier_type in ("svm", "svm_rbf"):
        classifier_par = np.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "knn":
        classifier_par = np.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type in ("randomforest", "gradientboosting",
                             "extratrees"):
        classifier_par = np.array([10, 25, 50, 100, 200, 500])

    # Keep only the feature vectors free of NaN / infinite values.
    cleaned = []
    for class_feats in features:
        kept_rows = []
        for row_idx in range(class_feats.shape[0]):
            vector = class_feats[row_idx, :]
            if np.isnan(vector).any() or np.isinf(vector).any():
                print("NaN Found! Feature vector not used for training")
                continue
            kept_rows.append(vector.tolist())
        cleaned.append(np.array(kept_rows))
    features = cleaned

    # Get the optimal classifier parameter:
    best_param = evaluate_classifier(features, class_names, 100,
                                     classifier_type, classifier_par,
                                     0, train_percentage)
    print("Selected params: {0:.5f}".format(best_param))

    features_norm, mean, std = normalize_features(features)
    mean = mean.tolist()
    std = std.tolist()

    # STEP C: Save the classifier to file.
    # kNN has no fitted estimator: the normalized training matrix and its
    # labels are stored instead.
    if classifier_type == "knn":
        feature_matrix, labels = features_to_matrix(features_norm)
        save_parameters(model_name, feature_matrix.tolist(),
                        labels.tolist(), mean, std, class_names, best_param,
                        mid_window, mid_step, short_window, short_step,
                        compute_beat)
        return

    # Every remaining type trains an estimator; lambdas keep the lookup of
    # each training function lazy, so only the selected one is resolved.
    trainers = {
        "svm": lambda: train_svm(features_norm, best_param),
        "svm_rbf": lambda: train_svm(features_norm, best_param,
                                     kernel='rbf'),
        "randomforest": lambda: train_random_forest(features_norm,
                                                    best_param),
        "gradientboosting": lambda: train_gradient_boosting(features_norm,
                                                            best_param),
        "extratrees": lambda: train_extra_trees(features_norm, best_param),
    }
    classifier = trainers[classifier_type]()

    with open(model_name, 'wb') as fid:
        cPickle.dump(classifier, fid)
    save_parameters(model_name + "MEANS", mean, std, class_names,
                    mid_window, mid_step, short_window, short_step,
                    compute_beat)
def featureAndTrain(list_of_dirs, mt_win, mt_step, st_win, st_step,
                    classifier_type, model_name,
                    compute_beat=False, perTrain=0.90):
    """
    This function is used as a wrapper to segment-based audio feature
    extraction and classifier training.
    ARGUMENTS:
        list_of_dirs:       list of paths of directories. Each directory
                            contains a single audio class whose samples
                            are stored in separate WAV files.
        mt_win, mt_step:    mid-term window length and step
        st_win, st_step:    short-term window and step
        classifier_type:    "svm" or "svm_rbf" or "knn" or "randomforest"
                            or "gradientboosting" or "extratrees"
        model_name:         name of the model to be saved
        compute_beat:       forwarded to the feature extractor
        perTrain:           forwarded to evaluateclassifier
    RETURNS:
        None. Resulting classifier along with the respective model
        parameters are saved on files: for "knn" everything is pickled
        into model_name; for the other types the classifier is pickled
        into model_name and the parameters into model_name + "MEANS".
    """
    # STEP A: Feature Extraction:
    features, classNames, _ = aF.multiple_directory_feature_extraction(
        list_of_dirs, mt_win, mt_step, st_win, st_step,
        compute_beat=compute_beat)

    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]

    writeTrainDataToARFF(model_name, features, classNames, feature_names)

    for i, f in enumerate(features):
        if len(f) == 0:
            print("trainSVM_feature ERROR: " + list_of_dirs[i] +
                  " folder is empty or non-existing!")
            return

    # STEP B: Classifier evaluation and parameter selection:
    if classifier_type == "svm" or classifier_type == "svm_rbf":
        classifier_par = np.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "randomforest":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "knn":
        classifier_par = np.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type == "gradientboosting":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "extratrees":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])

    # Drop every feature vector containing a NaN or an infinity.
    clean_features = []
    for f in features:
        kept = []
        for i in range(f.shape[0]):
            temp = f[i, :]
            if (not np.isnan(temp).any()) and (not np.isinf(temp).any()):
                kept.append(temp.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        clean_features.append(np.array(kept))
    features = clean_features

    # Get the optimal classifier parameter:
    bestParam = evaluateclassifier(features, classNames, 100,
                                   classifier_type, classifier_par,
                                   0, perTrain)

    print("Selected params: {0:.5f}".format(bestParam))

    features_norm, MEAN, STD = normalizeFeatures(features)
    MEAN = MEAN.tolist()
    STD = STD.tolist()

    # STEP C: Save the classifier to file
    if classifier_type == "svm":
        classifier = trainSVM(features_norm, bestParam)
    elif classifier_type == "svm_rbf":
        classifier = trainSVM_RBF(features_norm, bestParam)
    elif classifier_type == "randomforest":
        classifier = trainRandomForest(features_norm, bestParam)
    elif classifier_type == "gradientboosting":
        classifier = trainGradientBoosting(features_norm, bestParam)
    elif classifier_type == "extratrees":
        classifier = trainExtraTrees(features_norm, bestParam)

    if classifier_type == "knn":
        X, Y = listOfFeatures2Matrix(features_norm)
        # The objects are written as consecutive pickles; readers depend
        # on exactly this order.
        knn_payload = (X.tolist(), Y.tolist(), MEAN, STD, classNames,
                       bestParam, mt_win, mt_step, st_win, st_step,
                       compute_beat)
        # "with" closes the file even if one of the dumps raises.
        with open(model_name, "wb") as fo:
            for item in knn_payload:
                cPickle.dump(item, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    elif classifier_type == "svm" or classifier_type == "svm_rbf" or \
            classifier_type == "randomforest" or \
            classifier_type == "gradientboosting" or \
            classifier_type == "extratrees":
        with open(model_name, 'wb') as fid:
            cPickle.dump(classifier, fid)
        # Same sequential-pickle layout as above, minus the kNN-only items.
        means_payload = (MEAN, STD, classNames, mt_win, mt_step,
                         st_win, st_step, compute_beat)
        with open(model_name + "MEANS", "wb") as fo:
            for item in means_payload:
                cPickle.dump(item, fo, protocol=cPickle.HIGHEST_PROTOCOL)
def extract_features(paths, mid_window, mid_step, short_window, short_step,
                     compute_beat=False, train_percentage=0.90):
    """
    This function is used as a wrapper to segment-based audio feature
    extraction followed by cleaning and normalization of the features.
    ARGUMENTS:
        paths:                      list of paths of directories. Each
                                    directory contains a single audio
                                    class whose samples are stored in
                                    separate WAV files.
        mid_window, mid_step:       mid-term window length and step
        short_window, short_step:   short-term window and step
        compute_beat:               forwarded to the feature extractor
        train_percentage:           not used by this function; kept so
                                    that existing callers keep working
    RETURNS:
        The normalized features, i.e. the first element returned by
        normalize_features for the cleaned per-class feature matrices.
        None if no data was found or one of the folders produced no
        features.
    """
    # STEP A: Feature Extraction:
    features, _, _ = \
        aF.multiple_directory_feature_extraction(paths, mid_window, mid_step,
                                                 short_window, short_step,
                                                 compute_beat=compute_beat)

    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    for i, feat in enumerate(features):
        if len(feat) == 0:
            print("trainSVM_feature ERROR: " + paths[i] +
                  " folder is empty or non-existing!")
            return

    # Drop every feature vector containing a NaN or an infinity.
    clean_features = []
    for feat in features:
        kept = []
        for i in range(feat.shape[0]):
            temp = feat[i, :]
            if (not np.isnan(temp).any()) and (not np.isinf(temp).any()):
                kept.append(temp.tolist())
            else:
                # print() call: the former print statement was a syntax
                # error under Python 3.
                print("NaN Found! Feature vector not used for training")
        clean_features.append(np.array(kept))

    # Only the normalized features are returned; the mean / std vectors
    # computed alongside are not needed here.
    features_norm, _, _ = normalize_features(clean_features)
    return features_norm
def extract_features_and_train(paths, mid_window, mid_step, short_window,
                               short_step, classifier_type, model_name,
                               compute_beat=False, train_percentage=0.90,
                               dict_of_ids=None, use_smote=False):
    """
    This function is used as a wrapper to segment-based audio feature
    extraction and classifier training.
    ARGUMENTS:
        paths:                      list of paths of directories. Each
                                    directory contains a single audio
                                    class whose samples are stored in
                                    separate WAV files.
        mid_window, mid_step:       mid-term window length and step
        short_window, short_step:   short-term window and step
        classifier_type:            "svm" or "svm_rbf" or "knn" or
                                    "randomforest" or "gradientboosting"
                                    or "extratrees"
        model_name:                 name of the model to be saved
        compute_beat:               forwarded to the feature extractor
        train_percentage:           forwarded to evaluate_classifier
        dict_of_ids:                a dictionary which has as keys the
                                    full path of audio files and as values
                                    the respective group ids
        use_smote:                  if True, SMOTE resampling is applied to
                                    the training matrix before scaling and
                                    smote=True is passed to
                                    evaluate_classifier
    RETURNS:
        None. Resulting classifier along with the respective model
        parameters are saved on files.
    RAISES:
        ValueError: if classifier_type is not one of the supported values.
    """
    # Choose the parameter grid first, so that an unsupported classifier
    # type fails with a clear message before the feature extraction is run
    # (it used to surface later as an unbound-local error).
    if classifier_type == "svm" or classifier_type == "svm_rbf":
        classifier_par = np.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "knn":
        classifier_par = np.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type in ("randomforest", "gradientboosting",
                             "extratrees"):
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    else:
        raise ValueError(
            "Unsupported classifier_type {!r}: expected 'svm', 'svm_rbf', "
            "'knn', 'randomforest', 'gradientboosting' or "
            "'extratrees'".format(classifier_type))

    # STEP A: Feature Extraction:
    features, class_names, file_names = \
        aF.multiple_directory_feature_extraction(paths, mid_window, mid_step,
                                                 short_window, short_step,
                                                 compute_beat=compute_beat)
    # Flatten the per-directory file lists into a single list.
    file_names = [item for sublist in file_names for item in sublist]
    if dict_of_ids:
        list_of_ids = [dict_of_ids[file_name] for file_name in file_names]
    else:
        list_of_ids = None

    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    for i, feat in enumerate(features):
        if len(feat) == 0:
            print("trainSVM_feature ERROR: " + paths[i] +
                  " folder is empty or non-existing!")
            return

    # STEP B: Classifier evaluation and parameter selection.
    # Drop every feature vector containing a NaN or an infinity.
    temp_features = []
    for feat in features:
        temp = []
        for i in range(feat.shape[0]):
            temp_fv = feat[i, :]
            if (not np.isnan(temp_fv).any()) and \
                    (not np.isinf(temp_fv).any()):
                temp.append(temp_fv.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        temp_features.append(np.array(temp))
    features = temp_features

    # Get the optimal classifier parameter:
    best_param = evaluate_classifier(features, class_names, classifier_type,
                                     classifier_par, 1, list_of_ids,
                                     n_exp=-1,
                                     train_percentage=train_percentage,
                                     smote=use_smote)

    print("Selected params: {0:.5f}".format(best_param))

    # STEP C: Train and save the classifier to file
    # Get features in the X, y format:
    features, labels = features_to_matrix(features)

    # Apply smote if necessary:
    if use_smote:
        sm = SMOTE(random_state=2)
        features, labels = sm.fit_resample(features, labels)

    # Use mean/std standard feature scaling:
    scaler = StandardScaler()
    features = scaler.fit_transform(features)
    mean = scaler.mean_.tolist()
    std = scaler.scale_.tolist()

    # And save the model to a file, along with
    # - the scaling -mean/std- vectors
    # - the feature extraction parameters
    if classifier_type == "knn":
        # kNN has no fitted estimator: the scaled training matrix and its
        # labels are stored instead.
        feature_matrix = features.tolist()
        labels = labels.tolist()
        save_parameters(model_name, feature_matrix, labels, mean, std,
                        class_names, best_param, mid_window, mid_step,
                        short_window, short_step, compute_beat)
        return

    # Then train the final classifier
    if classifier_type == "svm":
        classifier = train_svm(features, labels, best_param)
    elif classifier_type == "svm_rbf":
        classifier = train_svm(features, labels, best_param, kernel='rbf')
    elif classifier_type == "randomforest":
        classifier = train_random_forest(features, labels, best_param)
    elif classifier_type == "gradientboosting":
        classifier = train_gradient_boosting(features, labels, best_param)
    else:  # "extratrees" (classifier_type was validated above)
        classifier = train_extra_trees(features, labels, best_param)

    with open(model_name, 'wb') as fid:
        cPickle.dump(classifier, fid)
    save_path = model_name + "MEANS"
    save_parameters(save_path, mean, std, class_names, mid_window, mid_step,
                    short_window, short_step, compute_beat)
values, labels = torch.max(test_result, 1) y_pred = labels.data.numpy() return f1_score(y_pred, test_labels) from pyAudioAnalysis import MidTermFeatures as mt from pyAudioAnalysis import audioTrainTest as aT import numpy as np import os if os.path.isfile("features.npy"): with open('features.npy', 'rb') as f: X = np.load(f) y = np.load(f) else: features, class_names, file_names = mt.multiple_directory_feature_extraction( ["audio/speech", "audio/noise"], 1, 1, 0.1, 0.1, False) X, y = aT.features_to_matrix(features) with open('features.npy', 'wb') as f: np.save(f, np.array(X)) np.save(f, np.array(y)) dimensions = X.shape[1] # Split to train/test X_train = X[::2, :] y_train = y[::2] X_test = X[1::2, :] y_test = y[1::2] n_nodes = 256