# NOTE(review): the lines below are the tail of a function whose `def` line is
# above this chunk — it bundles the computed features/labels into a dict,
# saves them to a MATLAB .mat file via scipy.io (`scio`), and returns them.
    save_dict = {}
    save_dict['train_features'] = train_features
    save_dict['train_labels'] = train_labels
    save_dict['test_features'] = test_features
    save_dict['test_labels'] = test_labels
    scio.savemat(save_path, save_dict)
    return train_features, train_labels, test_features, test_labels


if __name__ == '__main__':
    # Extract visual words (one-off preprocessing step, kept for reference):
    # extract_words(
    #     '/home/give/homework/CV/dataset/affNIST/training_and_validation_batches',
    #     '/home/give/homework/CV/dataset/affNIST/test.mat',
    #     '/home/give/homework/CV/dataset/affNIST/patches/'
    # )
    # Randomly sample words to build the vocabulary (one-off, kept for reference):
    # patches = load_words('/home/give/homework/CV/dataset/affNIST/patches', save_path='./vocabulary.npy')
    # print(np.shape(patches))
    # # get_KMeans_model(patches)
    # Build Bag-of-Visual-Words features for affNIST (reload=True reuses the
    # cached BoVW model) and train/evaluate an SVM on them.
    train_features, train_labels, test_features, test_labels = get_features(
        '/home/give/homework/CV/dataset/affNIST/training_and_validation_batches',
        '/home/give/homework/CV/dataset/affNIST/test.mat',
        './BoVW_model.m',
        reload=True)
    from classification import SVM
    SVM.do(train_features, train_labels, test_features, test_labels)
def main(): params_dict = parse_option() # Read data train = np.genfromtxt(open(params_dict['training_set_fn'],'rb'), delimiter=',') print "Number of training samples: {0}.".format(train.shape[0]) print "Number of features: {0}.".format(train.shape[1]) target = np.genfromtxt(open(params_dict['target_set_fn'],'rb'), delimiter=',') len_train_set = train.shape[0] if params_dict["test_set_flag"]: test = np.genfromtxt(open(params_dict['test_set_fn'],"rb"), delimiter=',') if not params_dict["overnight_simulation"]: print "Visualizing features for understanding the most suitable scaling type." if params_dict["test_set_flag"]: plot_features(np.vstack((train,test))) else: plot_features(train) plt.show() balances = classes_balance(target) counter = 0 for b in balances: print "For class {0} the balance is {1:.4f}.".format(counter, b) counter += 1 n_feat = train.shape[1] num_samples = train.shape[0] #features scaling print "Starting features preprocessing ..." if params_dict["sparse_filtering_flag"]: print "Performing sparse filtering..." if params_dict["load_sf_flag"]: sf, train_sf, test_sf = load_sf_features(params_dict["load_sf_path"]) else: sf = SparseFilter(n_layers=params_dict["n_layers_sf"],n_features=params_dict["n_features_sf"], n_iterations=params_dict["n_iterations_sf"]) if params_dict["test_set_flag"]: sf.fit(np.r_[train,test]) train_sf = sf.transform(train) test_sf = sf.transform(test) else: sf.fit(train) train_sf = sf.transform(train) if params_dict["save_sf_flag"]: if params_dict["test_set_flag"]: save_sf_features(sf, train_sf, test_sf, params_dict["save_sf_path"]) else: save_sf_features(sf, train_sf, None, params_dict["save_sf_path"]) print "Features sparse filtering performed!" print train_sf.shape if params_dict["test_set_flag"]: dataset = np.r_[train, test] else: dataset = train if params_dict["pca_flag"]: print "Performing PCA..." 
pca = PCA(variance_retain = params_dict["pca_variance_retain"]) pca.fit(dataset) dataset_pca = pca.transform(dataset) if params_dict["test_set_flag"]: train_pca = dataset_pca[:len_train_set,:] test_pca = dataset_pca[len_train_set:,:] else: train_pca = dataset_pca n_feat_pca = dataset_pca.shape[1] print "Number of features after PCA: {0}.".format(n_feat_pca) else: dataset_pca = dataset train_pca = train if params_dict["test_set_flag"]: test_pca = test n_feat_pca = dataset_pca.shape[1] print "Number of features after PCA: {0}.".format(n_feat_pca) if params_dict["pca_flag"]: if not params_dict["overnight_simulation"]: print "Visualizing features after PCA..." plot_features(dataset_pca) plt.show() if params_dict["scaling_flag"]: scaler = Scaler(bias_and_variance_flag = True, log10_flag = False, log2_flag = False, log1p_flag = False) if params_dict["test_set_flag"]: dataset_scaled = scaler.fit(np.r_[train_pca,test_pca]) train_scaled = dataset_scaled[:len_train_set,:] test_scaled = dataset_scaled[len_train_set:,:] else: dataset_scaled = scaler.fit(train_pca) train_scaled = dataset_scaled else: train_scaled = train_pca if params_dict["test_set_flag"]: test_scaled = test_pca if params_dict["scaling_flag"]: if not params_dict["overnight_simulation"]: print "Visualizing features after features preprocessing.." plot_features(dataset_scaled) plt.show() if params_dict["sparse_filtering_flag"]: train_data = np.c_[train_scaled, train_sf] if params_dict["test_set_flag"]: test_data = np.c_[test_scaled, test_sf] else: train_data = train_scaled if params_dict["test_set_flag"]: test_data = test_scaled print "Features preprocessing done!" if params_dict["rf_features_selection_flag"]: print "Starting features selection by means of random forests..." 
fsrf = FeaturesSelectionRandomForests() fsrf.fit(train_data, target) if not params_dict["overnight_simulation"]: fsrf.plot_features_importance() fsrf_mask = fsrf.features_mask train_data = fsrf.transform(train_data) if params_dict["test_set_flag"]: test_data = fsrf.transform(test_data) n_feat_fsrf = train_data.shape[1] print "Random forests features selection done!" classification_obj=SVM() if not params_dict["skip_model_selection"]: print "Starting model selection ..." if not params_dict.has_key("C_list"): C_list = [0.0001, 0.001,0.01,0.1,1,10,100,1000,10000] else: C_list = params_dict["C_list"] if params_dict["kernel"] == SVM_RBF: if not params_dict.has_key("gamma_list"): gamma_list = [0.0001, 0.001,0.01,0.1,1,10,100,1000,10000] else: gamma_list = params_dict["gamma_list"] else: gamma_list = None #performing model selection ms_result = classification_obj.model_selection(train_data,target, kernel = params_dict["kernel"], n_iterations=params_dict["n_iterations_ms"], C_list = C_list, gamma_list = gamma_list, show_accuracy_flag = params_dict["show_accuracy_flag"], show_precision_flag = params_dict["show_precision_flag"], show_recall_flag = params_dict["show_recall_flag"], show_f1_score_flag = params_dict["show_f1score_flag"], max_num_cpus = params_dict["max_num_cpus"]) if not params_dict["overnight_simulation"]: #displaying model selection if params_dict["kernel"] == SVM_RBF: if params_dict["show_accuracy_flag"]: plot_3d(x=ms_result["gamma_list"], y=ms_result["C_list"], z=ms_result["acc_by_C_and_gamma"], zlabel="accuracy", title="Accuracy by C and gamma") if params_dict["show_precision_flag"]: plot_3d(x=ms_result["gamma_list"], y=ms_result["C_list"], z=ms_result["recall_by_C_and_gamma"], zlabel="recall", title="Recall by C and gamma") if params_dict["show_recall_flag"]: plot_3d(x=ms_result["gamma_list"], y=ms_result["C_list"], z=ms_result["prec_by_C_and_gamma"], zlabel="precision", title="Precision by C and gamma") if params_dict["show_f1score_flag"]: 
plot_3d(x=ms_result["gamma_list"], y=ms_result["C_list"], z=ms_result["f1_by_C_and_gamma"], zlabel="accuracy", title="f1 score by C and gamma") if params_dict["show_trerr_flag"]: plot_3d(x=ms_result["gamma_list"], y=ms_result["C_list"], z=ms_result["tr_err_by_C_and_gamma"], zlabel="training error", title="Training error score by C and gamma") if params_dict["show_cverr_flag"]: plot_3d(x=ms_result["gamma_list"], y=ms_result["C_list"], z=ms_result["cv_err_by_C_and_gamma"], zlabel="cross-validation error", title="Cross-validation error score by C and gamma") elif params_dict["kernel"] == SVM_linear or params_dict["kernel"] == SVM_RBF_Chi2_squared: if params_dict["show_accuracy_flag"]: plot_2d(x=ms_result["C_list"], y=ms_result["acc_by_C"], ylabel="accuracy", title="Accuracy by C") if params_dict["show_precision_flag"]: plot_2d(x=ms_result["C_list"], y=ms_result["recall_by_C"], ylabel="recall", title="Recall by C") if params_dict["show_recall_flag"]: plot_2d(x=ms_result["C_list"], y=ms_result["prec_by_C"], ylabel="precision", title="Precision by C and gamma") if params_dict["show_f1score_flag"]: plot_2d(x=ms_result["C_list"], y=ms_result["f1_by_C"], ylabel="accuracy", title="f1 score by C") if params_dict["show_trerr_flag"]: plot_2d(x=ms_result["C_list"], y=ms_result["tr_err_by_C"], ylabel="training error", title="Training error score by C") if params_dict["show_cverr_flag"]: plot_2d(x=ms_result["C_list"], y=ms_result["cv_err_by_C"], ylabel="cross-validation error", title="Cross-validation error score by C") else: raise Exception("Unsupported kernel type!") plt.show() if not params_dict["overnight_simulation"]: #entering the C and gamma chosen print "Plotted graphics for model selection. Choose the best C and gamma ..." while True: C_str = raw_input("Enter the C value suggested by model selection:") try: C = float(C_str) except Exception as e: print "Invalid C inserted. C has to be numeric. 
Exception: {0}".format(e) continue break if params_dict["kernel"] == SVM_RBF: while True: gamma_str = raw_input("Enter the gamma value suggested by model selection:") try: gamma = float(gamma_str) except Exception as e: print "Invalid gamma inserted. gamma has to be numeric. Exception: {0}".format(e) continue break if params_dict["kernel"] == SVM_linear or params_dict["kernel"] == SVM_RBF_Chi2_squared: print "Parameters selection performed! C = {0}.".format(C) else: print "Parameters selection performed! C = {0}, gamma = {1}".format(C, gamma) else: if params_dict["kernel"] == SVM_linear or params_dict["kernel"] == SVM_RBF_Chi2_squared: C,accuracy = classification_obj.best_accuracy_C(ms_result) elif params_dict["kernel"] == SVM_RBF: C,gamma,accuracy = classification_obj.best_accuracy_C_and_gamma(ms_result) else: raise Exception("Unsupported kernel type!") print "C automatically selected equals to {0}.".format(C) if params_dict["kernel"] == SVM_RBF: print "gamma automatically selected equals to {0}.".format(gamma) print "The accuracy attained by those parameters during model selection is {0}.".format(accuracy) else: if params_dict.has_key("C"): C = params_dict["C"] print "C specified by the user: {0}.".format(C) if params_dict.has_key("gamma"): gamma = params_dict["gamma"] print "gamma specified by the user: {0}".format(gamma) if params_dict["rfe_features_selection_flag"]: print "Performing recursive features elimination..." 
if params_dict["kernel"] == SVM_linear or params_dict["kernel"] == SVM_RBF_Chi2_squared: rfe = RecursiveFeaturesElimination(C=C,kernel=SVM_linear, n_iterations=params_dict["n_iterations_rfe"], test_size=0.3) elif params_dict["kernel"] == SVM_RBF: rfe = RecursiveFeaturesElimination(C=C,gamma=gamma,kernel=params_dict["kernel"], n_iterations=params_dict["n_iterations_rfe"], test_size=0.3) else: raise Exception("Unsupported kernel type!") tr_err_rfe, cv_err_rfe, accuracy_rfe,recall_rfe, precision_rfe, f1_score_rfe = rfe.rfe_curves(train_data, target) if not params_dict["overnight_simulation"]: if params_dict["show_accuracy_flag"]: plot_rfe_curve(accuracy_rfe,"accuracy") if params_dict["show_precision_flag"]: plot_rfe_curve(precision_rfe,"precision") if params_dict["show_recall_flag"]: plot_rfe_curve(recall_rfe,"recall") if params_dict["show_f1score_flag"]: plot_rfe_curve(f1_score_rfe,"f1 score") if params_dict["show_trerr_flag"]: plot_rfe_curve(tr_err_rfe,"training error") if params_dict["show_cverr_flag"]: plot_rfe_curve(cv_err_rfe,"cross-validation error") plt.show() train_data, rfe_mask = rfe.select_features(train_data, accuracy_rfe) if params_dict["test_set_flag"]: test_data = rfe.apply_features_selection(test_data) n_feat_rfe = train_data.shape[1] print "Number of features after Recursive Features Elimination: {0}.".format(n_feat_rfe) print "Recursive features elimination done!." #training print "Performing training..." if params_dict["kernel"] == SVM_linear or params_dict["kernel"] == SVM_RBF_Chi2_squared: model = classification_obj.training(train_data, target, kernel = SVM_linear, C=C) elif params_dict["kernel"] == SVM_RBF: model = classification_obj.training(train_data, target, kernel = params_dict["kernel"], C=C, gamma=gamma) else: raise Exception("Unsupported kernel type!") print "Training performed!" if params_dict["test_set_flag"]: #prediction on kaggle test set print "Performing classification on the test set..." 
predicted = classification_obj.classify(test_data) print "Classification performed on the test set!" #save data in the submission format save_csv_submitted_labels(predicted, os.path.join(params_dict["dest_path"],params_dict["predicted_set_fn"])) if params_dict["kernel"] == SVM_linear or params_dict["kernel"] == SVM_RBF_Chi2_squared: acc, prec, rec, f1 = classification_obj.performance_estimation(train_data, target, kernel = params_dict["kernel"], C = C, n_iterations = params_dict["n_iterations_performance_estimation"]) elif params_dict["kernel"] == SVM_RBF: acc, prec, rec, f1 = classification_obj.performance_estimation(train_data, target, kernel = params_dict["kernel"], C = C, gamma = gamma, n_iterations = params_dict["n_iterations_performance_estimation"]) print "Estimated performances:\nAccuracy: {0}\nPrecision: {1}\nRecall: {2}\nf1 Score: {3}".format(acc, prec, rec, f1) seedid = random.randint(0,100) today = date.today() if today.day < 10: day = "0%s" % today.day else: day = "%s" % today.day if today.month < 10: month = "0%s" % today.month else: month = "%s" % today.month bn = "{name}_{year}_{month}_{day}_rand{seed}_acc{acc:.4f}_prec{prec:4f}_rec{rec:4f}".format(name=get_model_name(params_dict),seed=seedid,year=today.year, month=month, day=day, acc=acc, prec=prec, rec=rec) bn = bn.replace(".","") """ FILLING MODEL DICT Making the predicted model persistent! 
""" model_dict = dict() dumped_model = pickle.dumps(model) model_dict["classifier"] = dumped_model model_dict["scaling_flag"] = params_dict["scaling_flag"] if model_dict["scaling_flag"]: dumped_scaler = pickle.dumps(scaler) model_dict["scaler"] = dumped_scaler model_dict["pca_flag"] = params_dict["pca_flag"] if params_dict["pca_flag"]: dumped_pca = pickle.dumps(pca) model_dict["pca"] = dumped_pca model_dict["fsrf_flag"] = params_dict["rf_features_selection_flag"] if params_dict["rf_features_selection_flag"]: dumped_fsrf_mask = pickle.dumps(fsrf_mask) model_dict["fsrf_mask"] = dumped_fsrf_mask else: model_dict["fsrf_mask"] = None model_dict["rfe_flag"] = params_dict["rfe_features_selection_flag"] if params_dict["rfe_features_selection_flag"]: dumped_rfe_mask = pickle.dumps(rfe_mask) model_dict["rfe_mask"] = dumped_rfe_mask else: model_dict["rfe_mask"] = None json_model = json.dumps(model_dict, sort_keys=True, indent=4, separators=(',', ': ')) models_path = os.path.join(params_dict["dest_path"],"models") if not os.path.exists(models_path): os.makedirs(models_path) fn = "model_%s.json" % bn f = open(os.path.join(models_path, fn),"w") f.write(json_model) f.close() """ FILLING EXPERIMENT DICT Saving a summary of the experiment, useful for the data scientist. 
""" experiment_dict = dict() experiment_dict["01) number of samples dataset"] = num_samples experiment_dict["02) number of features dataset"] = n_feat balances_dict = dict() for i in xrange(len(balances)): balances_dict["{0}".format(i)] = balances[i] experiment_dict["01b) Balance of the classes of the dataset"] = balances_dict if params_dict["kernel"] == SVM_linear: experiment_dict["03) classifier type"] = "SVM linear" elif params_dict["kernel"] == SVM_RBF: experiment_dict["03) classifier type"] = "SVM RBF" elif params_dict["kernel"] == SVM_RBF_Chi2_squared: experiment_dict["03) classifier type"] = "SVM RBF Chi2" else: experiment_dict["03) classifier type"] = "Not specified" if not params_dict["skip_model_selection"]: experiment_dict["04) C list"] = C_list if params_dict["kernel"] == SVM_RBF and not params_dict["skip_model_selection"]: experiment_dict["05) gamma list"] = gamma_list experiment_dict["06) selected_C"] = C if params_dict["kernel"] == SVM_RBF: experiment_dict["07) selected gamma"] = gamma experiment_dict["08) accuracy"] = acc experiment_dict["09) precision"] = prec experiment_dict["10) recall"] = rec experiment_dict["11) f1 score"] = f1 experiment_dict["12) number iterations in model selection"] = params_dict["n_iterations_ms"] experiment_dict["13) pca flag"] = params_dict["pca_flag"] if params_dict["pca_flag"]: experiment_dict["14) pca retain"] = params_dict["pca_variance_retain"] experiment_dict["16) number of features after pca"] = n_feat_pca experiment_dict["17) features scaling"] = params_dict["scaling_flag"] experiment_dict["18) random forests features selection"] = params_dict["rf_features_selection_flag"] if params_dict["rf_features_selection_flag"]: experiment_dict["19) number of features after random forests features selection"] = n_feat_fsrf experiment_dict["20) recursive features elimination"] = params_dict["rfe_features_selection_flag"] if params_dict["rfe_features_selection_flag"]: experiment_dict["21) number of iterations in rfe"] = 
params_dict["n_iterations_rfe"] json_experiment = json.dumps(experiment_dict, sort_keys=True, indent=4, separators=(',', ': ')) experiments_path = os.path.join(params_dict["dest_path"],"experiments") if not os.path.exists(experiments_path): os.makedirs(experiments_path) fn = "experiment_%s.json" % bn f = open(os.path.join(experiments_path, fn),"w") f.write(json_experiment) f.close() if not params_dict["skip_model_selection"]: if params_dict["kernel"] == SVM_RBF: acc_table = print_model_selection_results(results = ms_result["acc_by_C_and_gamma"], C_list = ms_result["C_list"], gamma_list = ms_result["gamma_list"] ) prec_table = print_model_selection_results(results = ms_result["prec_by_C_and_gamma"], C_list = ms_result["C_list"], gamma_list = ms_result["gamma_list"] ) recall_table = print_model_selection_results(results = ms_result["recall_by_C_and_gamma"], C_list = ms_result["C_list"], gamma_list = ms_result["gamma_list"] ) f1_table = print_model_selection_results(results = ms_result["f1_by_C_and_gamma"], C_list = ms_result["C_list"], gamma_list = ms_result["gamma_list"] ) else: acc_table = print_model_selection_results(results = ms_result["acc_by_C"], C_list = ms_result["C_list"], gamma_list = None ) prec_table = print_model_selection_results(results = ms_result["prec_by_C"], C_list = ms_result["C_list"], gamma_list = None ) recall_table = print_model_selection_results(results = ms_result["recall_by_C"], C_list = ms_result["C_list"], gamma_list = None ) f1_table = print_model_selection_results(results = ms_result["f1_by_C"], C_list = ms_result["C_list"], gamma_list = None ) acc_str = "Accuracy:\n{0}\n".format(acc_table) prec_str = "Precision:\n{0}\n".format(prec_table) recall_str = "Recall:\n{0}\n".format(recall_table) f1_str = "f1_score:\n{0}\n".format(f1_table) fn = "experiment_%s_results.txt" % bn f = open(os.path.join(experiments_path, fn),"w") f.write(acc_str) f.write(prec_str) f.write(recall_str) f.write(f1_str) f.close() basename = os.path.basename(bn) print 
"Results saved in %s." % basename
# Compare four classifiers from the project's `classification` module on a
# small slice of the shared document corpus, printing sklearn's per-class
# report and overall accuracy for each.
from classification import Naive_bayesian, KNN, random_forest, SVM
from sklearn.metrics import accuracy_score, classification_report
from classification import test_document_list, train_document_list
# NOTE(review): this import shadows classification.Naive_bayesian imported
# above — the search-module implementation is the one actually called below.
from search import Naive_bayesian

# The ground-truth class label is the first element of each test document.
true_labels = [doc[0] for doc in test_document_list]

# --- Naive Bayes ---
nb_predictions = Naive_bayesian(train_document_list[1:500], test_document_list[1:20])
# print(true_labels, nb_predictions)
print(classification_report(true_labels[1:20], nb_predictions))
print(accuracy_score(true_labels[1:20], nb_predictions))

# --- k-nearest neighbours (k = 5) ---
knn_predictions = KNN(train_document_list[1:200], test_document_list[1:20], 5)
# print(true_labels[1:20], knn_predictions)
# KNN returns integer labels, so the reference labels are cast for the report.
print(
    classification_report([int(label) for label in true_labels[1:20]],
                          knn_predictions))
print(accuracy_score(true_labels[1:20], knn_predictions))

# --- Support vector machine ---
svm_predictions = SVM(train_document_list[1:500], test_document_list[1:20])
print(classification_report(true_labels[1:20], svm_predictions))
print(accuracy_score(true_labels[1:20], svm_predictions))

# --- Random forest ---
rf_predictions = random_forest(train_document_list[1:500], test_document_list[1:20])
print(classification_report(true_labels[1:20], rf_predictions))
print(accuracy_score(true_labels[1:20], rf_predictions))
kernel_func = linear # Use this for clusters data. output_path = '../outputs/' # Where to save the plots. class_colors = {-1: 'b', 1: 'r'} # Colors for plotting. # Load data. x_train, y_train = load_data_csv(os.path.join(data_folder, dataset_name+'_train.csv')) x_test, y_test = load_data_csv(os.path.join(data_folder, dataset_name+'_test.csv')) plot_points(x_train, y_train, class_colors=class_colors, title='Train - correct labels') plot_points(x_test, y_test, class_colors=class_colors, title='Test - correct labels') # Train the SVM classifier on the training data. C_group = [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4] for i in range(len(C_group)): C = C_group[i] svm = SVM(kernel_func=kernel_func, C=C) print('Training...') svm.train(x_train, y_train) print('Plotting...') plot_svm_decision_boundary(svm, x_train, y_train, title='SVM decision boundary on training data', output_path=output_path, file_name=str(dataset_name) + '_support_vectors_train.png', class_colors=class_colors) # Make predictions on train and test data. y_train_pred = svm.predict(x_train) y_test_pred = svm.predict(x_test) plot_points(x_train, y_train_pred, class_colors=class_colors, title='Your predictions for training data') plot_points(x_test, y_test_pred, class_colors=class_colors,
# NOTE(review): this fragment opens inside an if/else whose `if` line is above
# this chunk — presumably it decides between regenerating training features
# and loading them from a cached CSV; confirm against the preceding lines.
    feature_df = generate_training_feature_vectors(training_dir)
else:
    feature_df = pd.read_csv(training_features)

# Interactively drive the classification stage; `process()` is assumed to
# return two (X_train, X_test, y_train, y_test) splits — TODO confirm.
user_input = input('Perform Classification (y/n)?')
if (user_input == "y"):
    data = process()
    c0 = data[0]
    c1 = data[1]
    X_train_0, X_test_0, y_train_0, y_test_0 = c0[0], c0[1], c0[2], c0[3]
    X_train_1, X_test_1, y_train_1, y_test_1 = c1[0], c1[1], c1[2], c1[3]
    user_input = input('Perform SVM Classification (y/n)?')
    if (user_input == "y"):
        SVM(X_train_0, X_test_0, y_train_0, y_test_0)
    user_input = input(
        'Perform Grid Search Classification (this can take a while) (y/n)? '
    )
    if (user_input == "y"):
        GridSearch(X_train_0, X_test_0, y_train_0, y_test_0)
    user_input = input('Perform Sequential Classification (y/n)?')
    if (user_input == "y"):
        # NOTE(review): split-1 features are paired with split-0 labels here —
        # likely should be y_train_1, y_test_1; confirm before changing.
        seq(X_train_1, X_test_1, y_train_0, y_test_0)

# For each image in training set
##### Image Pre-processing #####
print "Reading the dataset from file..." # Read data train = np.genfromtxt(open(os.path.join(testdir, 'train.csv'),'rb'), delimiter=',') target = np.genfromtxt(open(os.path.join(testdir, 'trainLabels.csv'),'rb'), delimiter=',') test = np.genfromtxt(open(os.path.join(testdir, 'test.csv'),'rb'), delimiter=',') print "Dataset loaded!" #features scaling print "Starting features preprocessing ..." dataset_scaled, scaler = dataset_scaling(np.vstack((train,test))) train_scaled = dataset_scaled[:1000] test_scaled = dataset_scaled[1000:] print "Features preprocessing done!" classification_obj=SVM() print "Starting model selection ..." #performing model selection C_list = [0.0001,0.001,0.01,0.1,1,10,100,1000,10000] gamma_list = [0.0001,0.001,0.01,0.1,1,10, 100,10000] ms_result = classification_obj.model_selection(train_scaled,target,n_iterations=3, C_list=C_list, gamma_list=gamma_list) #displaying model selection plot_3d(x=ms_result["gamma_list"], y=ms_result["C_list"], z=ms_result["acc_by_C_and_gamma"], zlabel="accuracy", title="Accuracy by C and gamma") plot_3d(x=ms_result["gamma_list"], y=ms_result["C_list"], z=ms_result["recall_by_C_and_gamma"], zlabel="recall", title="Recall by C and gamma")