示例#1
0
def train_and_validate(algorithm):
    """Train one named classifier and print its accuracy on the test set.

    This entry point is mandated by the requirements.  The chosen
    classifier is used on its own (not wrapped as the base learner of
    an ensemble, which is assumed to be against the spirit of the
    requirements).

    Args:
        algorithm: One of 'naive_bayes', 'decision_tree', 'knn', or
            'svm', selecting which training function to run.

    Raises:
        KeyError: If 'algorithm' is not one of the four names above.
            The lookup happens after the data has been loaded.
    """
    # Dispatch table: algorithm name -> function that builds a fitted
    # model from (training features, training labels).
    trainers = dict(
        naive_bayes=train_naive_bayes,
        decision_tree=train_decision_tree,
        knn=train_knn,
        svm=train_svm,
    )
    # Fetch the preprocessed split, then fit the requested model on the
    # training portion only.
    train_X, train_y, test_X, test_y = data_preprocessing.get_processed_data()
    fitted = trainers[algorithm](train_X, train_y)
    # Score the model's predictions against the held-out test labels.
    predictions = fitted.predict(test_X)
    print(sklearn.metrics.accuracy_score(test_y, predictions))
示例#2
0
            break
        print("Feature \"{1}\" raises accuracy from {2} to {3}".format(
            i, best_feature, base_accuracy, best_accuracy))
        base_accuracy = best_accuracy
        best_features.append(best_feature)
        incr_accuracy.append(best_accuracy)
        print("{0} features: {1}".format(len(best_features), best_features))
        features.remove(best_feature)
    df = pd.DataFrame.from_dict({
        "Feature": best_features,
        "Accuracy": incr_accuracy,
    })
    return df


# Load the preprocessed train/test split used by the experiment below.
train_X, train_y, test_X, test_y = data_preprocessing.get_processed_data()

# Section banner for the kNN experiment (three lines of output).
print("-" * 70, "kNN:", "-" * 70, sep="\n")

# Feature subset the search is fitted on; train_X is indexed with this
# list below (assumes train_X supports column selection by a list of
# names, e.g. a pandas DataFrame -- TODO confirm).
features = [
    'education_num',
    'marital_status_Married-civ-spouse',
    'net_capital',
]

# Search space: neighbour counts 1 through 16, with both weighting schemes.
params = dict(n_neighbors=range(1, 17), weights=['distance', 'uniform'])
knn = sklearn.neighbors.KNeighborsClassifier(n_jobs=-1)
clf = sklearn.model_selection.GridSearchCV(estimator=knn, param_grid=params)
clf.fit(train_X[features], train_y)

# Report the full cross-validation results, then the best score and the
# parameter combination that achieved it.
print(clf.cv_results_)
print("Best score: {0}".format(clf.best_score_))
print("Best params: {0}".format(clf.best_params_))