def train_and_validate(algorithm):
    """Train one classifier and report its testing accuracy.

    This function is mandated by the requirements. Argument 'algorithm'
    takes values of 'naive_bayes', 'decision_tree', 'knn', and 'svm',
    and the function will train the respective classifier on training
    data, predict on the testing data, and print that classifier's
    testing accuracy.

    Returns the testing accuracy as a float in addition to printing it,
    which is backward-compatible with callers that ignore the return
    value.

    Raises ValueError if 'algorithm' is not one of the supported names
    (previously this surfaced as an opaque KeyError from the dict
    lookup).
    """
    # I'm going to assume we should use the specified classifier by
    # itself, and using it as the base classifier in an ensemble is
    # cheating.
    #
    # Below maps algorithm name to a function producing a model
    # from training data:
    algos = {
        "naive_bayes": train_naive_bayes,
        "decision_tree": train_decision_tree,
        "knn": train_knn,
        "svm": train_svm,
    }
    # Fail fast with a clear message before doing any expensive work:
    if algorithm not in algos:
        raise ValueError(
            "algorithm must be one of {0}, got {1!r}".format(
                sorted(algos), algorithm))
    # Load data and train respective model:
    train_X, train_y, test_X, test_y = data_preprocessing.get_processed_data()
    train_fn = algos[algorithm]
    model = train_fn(train_X, train_y)
    # Get testing accuracy:
    predict_y = model.predict(test_X)
    test_acc = sklearn.metrics.accuracy_score(test_y, predict_y)
    print(test_acc)
    return test_acc
# --- Continuation of a greedy forward-feature-selection function; its
# `def`, the outer loop header, and the improvement test that guards
# this `break` all precede this chunk (outside this view).  Indentation
# below is reconstructed — TODO confirm against the full file. ---
            break
        # NOTE(review): positional arg 0 (`i`) is never referenced by
        # the format string; {1}..{3} select best_feature,
        # base_accuracy, best_accuracy, so the printed output is still
        # correct.
        print("Feature \"{1}\" raises accuracy from {2} to {3}".format(
            i, best_feature, base_accuracy, best_accuracy))
        # Accept the winning feature: advance the accuracy baseline,
        # record the pick, and remove it from the remaining candidates.
        base_accuracy = best_accuracy
        best_features.append(best_feature)
        incr_accuracy.append(best_accuracy)
        print("{0} features: {1}".format(len(best_features), best_features))
        features.remove(best_feature)
    # Return the selection trace: one row per accepted feature with the
    # cumulative accuracy reached at that step.
    df = pd.DataFrame.from_dict({
        "Feature": best_features,
        "Accuracy": incr_accuracy,
    })
    return df


# --- Top-level script: grid-search kNN hyperparameters using only the
# three features listed below. ---
train_X, train_y, test_X, test_y = data_preprocessing.get_processed_data()

print("-" * 70)
print("kNN:")
print("-" * 70)

# n_jobs=-1: parallelize neighbor queries across all CPU cores.
knn = sklearn.neighbors.KNeighborsClassifier(n_jobs=-1)
# Search k in [1, 16] crossed with both weighting schemes.
params = {'n_neighbors': range(1, 17), 'weights': ['distance', 'uniform']}
# Presumably the three features chosen by the forward selection above —
# verify against the selection output.
features = [
    'education_num', 'marital_status_Married-civ-spouse', 'net_capital'
]
# GridSearchCV with default cross-validation settings.
clf = sklearn.model_selection.GridSearchCV(knn, params)
clf.fit(train_X[features], train_y)
print(clf.cv_results_)
print("Best score: {0}".format(clf.best_score_))
print("Best params: {0}".format(clf.best_params_))