예제 #1
0
def main():
    """Forward-select predictors of CleanedFare and compare linear
    regression and a neural network against an assume-average-fare
    baseline, tabulating and plotting train/test errors."""
    linear = titanic_data.linear()
    data = linear[:-89]           # training rows
    gen_est_data = linear[-89:]   # held-out rows for generalisation estimates
    regr_baseline = fwd.get_baseline(data)
    predictor_features = data.columns.drop('CleanedFare')

    # Linear Regression
    lin_best, lin_scorechart = fwd.forward_select(data, [], predictor_features, regr_baseline, fwd.score_regr, scores=[])
    lin_labels, lin_scores, lin_error_rates = fwd.process_scorechart(lin_scorechart)
    lin_gen_est_errors = fwd.estimate_generalisation_errors(lin_scorechart, data, gen_est_data, fwd.score_regr)
    fwd.table_display(lin_labels, lin_scores, lin_gen_est_errors)

    # Baseline: predict the training-set mean fare for every held-out row
    # and score it as 1 - R^2.  (The original built this with
    # np.empty/.fill and then called .reshape(1, -1) without assigning
    # the result -- a no-op, since reshape returns a new array; np.full
    # yields the same 1-D array directly.)
    avg_fare = np.mean(data.CleanedFare.values)
    avg_arr = np.full(len(gen_est_data), avg_fare)
    print("avg_arr", avg_arr)
    avg_fare_error = 1 - metrics.r2_score(gen_est_data.CleanedFare.values, avg_arr)
    print("Average:", avg_fare)
    print("Average Error:", avg_fare_error)

    # Neural Network
    nn_baseline = fwd.get_baseline(data, test_func=fwd.score_nn)
    nn_best, nn_scorechart = fwd.forward_select(data, [], predictor_features, nn_baseline, test_func=fwd.score_nn, est_func=fwd.test_kfold, scores=[])
    nn_labels, nn_scores, nn_error_rates = fwd.process_scorechart(nn_scorechart)
    nn_gen_est_errors = fwd.estimate_generalisation_errors(nn_scorechart, data, gen_est_data, fwd.score_nn)
    fwd.table_display(nn_labels, nn_scores, nn_gen_est_errors)

    # Graphing: one subplot per model family, with the constant
    # assume-average-fare error drawn as a horizontal reference line.
    f, axarr = plot.subplots(1, 2, sharey=True)

    fwd.plot_scores(axarr[0], lin_scores, "Linear Regression", "Train")
    fwd.plot_scores(axarr[0], lin_gen_est_errors, "Linear Regression", "Test")
    axarr[0].plot(np.full(len(lin_scores), avg_fare_error), label="Assume Average Fare")
    axarr[0].legend()

    fwd.plot_scores(axarr[1], nn_scores, "Neural Network", "Train")
    fwd.plot_scores(axarr[1], nn_gen_est_errors, "Neural Network", "Test")
    axarr[1].plot(np.full(len(nn_scores), avg_fare_error), label="Assume Average Fare")
    axarr[1].legend()

    # Shared axis labels for the whole figure.
    f.text(0.5, 0.04, 'Model index', ha='center', va='center')
    f.text(0.06, 0.5, 'Generalisation error', ha='center', va='center', rotation='vertical')

    plot.show()
def main():
    """Forward-select Fare predictors with linear regression, comparing
    holdout and k-fold error estimation side by side."""
    full_set = titanic_data.linear()
    data, gen_est_data = full_set[:-50], full_set[-50:]
    root_baseline = fwd.get_baseline(data)
    candidate_features = data.columns.drop('CleanedFare')

    # Holdout estimation
    ho_best, ho_scorechart = fwd.forward_select(
        data, [], candidate_features, root_baseline,
        fwd.score_regr, fwd.test_holdout, [])
    ho_labels, ho_scores, ho_error_rates = fwd.process_scorechart(ho_scorechart)
    ho_gen_est_errors = fwd.estimate_generalisation_errors(
        ho_scorechart, data, gen_est_data, fwd.score_regr)
    fwd.table_display(ho_labels, ho_error_rates, ho_gen_est_errors)
    print("ho_gen_est is", ho_gen_est_errors)

    # K-fold estimation
    k_best, k_scorechart = fwd.forward_select(
        data, [], candidate_features, root_baseline,
        fwd.score_regr, fwd.test_kfold, [])
    k_labels, k_scores, k_error_rates = fwd.process_scorechart(k_scorechart)
    k_gen_est_errors = fwd.estimate_generalisation_errors(
        k_scorechart, data, gen_est_data, fwd.score_regr)
    fwd.table_display(k_labels, k_error_rates, k_gen_est_errors)

    # Plot train/test curves for each estimation strategy side by side.
    fig, axes = plot.subplots(1, 2)
    for axis, train_curve, test_curve, title in (
            (axes[0], ho_scores, ho_gen_est_errors, "Holdout method"),
            (axes[1], k_scores, k_gen_est_errors, "K-Fold method")):
        fwd.plot_scores(axis, train_curve, title, "Train")
        fwd.plot_scores(axis, test_curve, title, "Test")

    # Shared axis labels for the whole figure.
    fig.text(0.5, 0.04, 'Model index', ha='center', va='center')
    fig.text(0.06, 0.5, 'Generalisation error',
             ha='center', va='center', rotation='vertical')

    plot.show()
예제 #3
0
import titanic_data
import forward_select as fwd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Forward-select Survived predictors with a decision tree and plot
# training error vs. estimated generalisation error for each candidate
# model found during selection.
data = titanic_data.linear()
train = data[:-89]      # training rows
gen_test = data[-89:]   # held-out rows for generalisation estimates

best_model, scorechart = fwd.forward_select(train, [],
                                            data.columns.drop('Survived'),
                                            1,
                                            fwd.score_tree,
                                            target='Survived')
gen_est_errors = fwd.estimate_generalisation_errors(scorechart, train,
                                                    gen_test, fwd.score_tree,
                                                    'Survived')
labels, scores, errors = fwd.process_scorechart(scorechart)
plt.plot(scores, label="Training error", marker='o')
plt.plot(gen_est_errors, label="Test error", marker='o')
plt.title("Decision tree")
plt.xlabel('Model index')
plt.ylabel('Generalisation error')
# Label ticks 1..len(scores).  The original range(1, len(scores)) yielded
# one label fewer than tick positions (off-by-one; cf. the K-Neighbours
# script in this file, which uses len(...) + 1).
plt.xticks(range(len(scores)), range(1, len(scores) + 1))
plt.legend()
plt.show()
예제 #4
0
import titanic_data
import forward_select as fwd
from sklearn.preprocessing import StandardScaler

# Forward-select Fare predictors with the neural-network scorer and
# k-fold error estimation, then tabulate the resulting models.
linear = titanic_data.linear()
data, gen_est_data = linear[:-50], linear[-50:]
predictor_features = data.columns.drop('CleanedFare')

nn_baseline = fwd.get_baseline(data, test_func=fwd.score_nn)
nn_best, nn_scorechart = fwd.forward_select(
    data, [], predictor_features, nn_baseline,
    test_func=fwd.score_nn, est_func=fwd.test_kfold)
nn_labels, nn_scores, nn_error_rates = fwd.process_scorechart(nn_scorechart)
fwd.table_display(nn_labels, nn_error_rates)
예제 #5
0
import titanic_data
import forward_select as fwd
from sklearn.model_selection import train_test_split
from sklearn import neighbors
import matplotlib.pyplot as plt

# Forward-select Survived predictors with k-nearest-neighbours and plot
# training vs. estimated test error per selected model.
data = titanic_data.linear()
train, gen_test = data[:-50], data[-50:]

knn_best_model, knn_scorechart = fwd.forward_select(
    train, [], data.columns.drop('Survived'), 1, fwd.score_knn,
    target='Survived')
knn_gen_est_errors = fwd.estimate_generalisation_errors(
    knn_scorechart, train, gen_test, fwd.score_knn, 'Survived')
knn_labels, knn_scores, knn_errors = fwd.process_scorechart(knn_scorechart)

# Train and test curves on the same axes, one marker per model.
for curve, curve_label in ((knn_scores, "Training error"),
                           (knn_gen_est_errors, "Test error")):
    plt.plot(curve, label=curve_label, marker='o')
plt.title("K-Neighbours")
plt.xlabel('Model index')
plt.ylabel('Generalisation error')
plt.xticks(range(len(knn_scores)), range(1, len(knn_scores) + 1))
plt.yticks([0, 0.5, 1])
plt.legend()
plt.show()
예제 #6
0
def main():
    """Forward-select predictors of Survived and compare a decision tree,
    a neural network, and k-nearest-neighbours against an
    assume-largest-class baseline, tabulating and plotting the errors."""
    linear = titanic_data.linear()
    data = linear[:-89]           # training rows
    gen_est_data = linear[-89:]   # held-out rows for generalisation estimates
    tree_baseline = fwd.get_baseline(data, test_func=fwd.score_tree, target='Survived')
    predictor_features = data.columns.drop('Survived')

    # Decision Tree.  Score with fwd.score_tree to match tree_baseline
    # above; the original passed fwd.score_cnn here, which mismatched the
    # decision-tree baseline and this section's intent.
    tree_best, tree_scorechart = fwd.forward_select(data, [], predictor_features, tree_baseline, fwd.score_tree, scores=[], target='Survived')
    tree_labels, tree_scores, tree_error_rates = fwd.process_scorechart(tree_scorechart)
    tree_gen_est_errors = fwd.estimate_generalisation_errors(tree_scorechart, data, gen_est_data, fwd.score_tree, 'Survived')
    print("Decision Tree")
    fwd.table_display(tree_labels, tree_scores, tree_gen_est_errors)

    # Neural Network.  Baseline pinned at 1; the original computed a
    # score_cnn baseline and immediately overwrote it with 1, so that
    # discarded call has been dropped.
    nn_baseline = 1
    nn_best, nn_scorechart = fwd.forward_select(data, [], predictor_features, nn_baseline, test_func=fwd.score_nn, est_func=fwd.test_kfold, scores=[], target='Survived')
    nn_labels, nn_scores, nn_error_rates = fwd.process_scorechart(nn_scorechart)
    nn_gen_est_errors = fwd.estimate_generalisation_errors(nn_scorechart, data, gen_est_data, fwd.score_nn, 'Survived')
    print("Neural Network")
    fwd.table_display(nn_labels, nn_scores, nn_gen_est_errors)

    # K-Neighbours
    knn_best_model, knn_scorechart = fwd.forward_select(data, [], data.columns.drop('Survived'), 1, fwd.score_knn, target='Survived')
    knn_gen_est_errors = fwd.estimate_generalisation_errors(knn_scorechart, data, gen_est_data, fwd.score_knn, 'Survived')
    knn_labels, knn_scores, knn_errors = fwd.process_scorechart(knn_scorechart)
    print("K-Neighbour")
    fwd.table_display(knn_labels, knn_scores, knn_gen_est_errors)

    # Assume largest class - largest class did not survive (0), so the
    # baseline error is the mean squared difference from all-zeros.
    all_died = np.zeros(np.shape(gen_est_data.Survived.values))
    all_died_error = np.mean((all_died - gen_est_data.Survived.values) ** 2)

    # Graphing: one subplot per classifier, with the constant
    # largest-class error drawn as a horizontal reference line.
    f, axarr = plot.subplots(1, 3)

    fwd.plot_scores(axarr[0], tree_scores, "Decision Tree", "Train")
    fwd.plot_scores(axarr[0], tree_gen_est_errors, "Decision Tree", "Test")
    axarr[0].plot(np.full(len(tree_scores), all_died_error), label="Assume Largest Class")
    axarr[0].legend()
    plot.yticks([0, .25, .5, .75, 1])

    fwd.plot_scores(axarr[1], nn_scores, "Neural Network", "Train")
    fwd.plot_scores(axarr[1], nn_gen_est_errors, "Neural Network", "Test")
    axarr[1].plot(np.full(len(nn_scores), all_died_error), label="Assume Largest Class")
    axarr[1].legend()
    plot.yticks([0, .25, .5, .75, 1])

    fwd.plot_scores(axarr[2], knn_scores, "K-Neighbors", "Train")
    fwd.plot_scores(axarr[2], knn_gen_est_errors, "K-Neighbors", "Test")
    axarr[2].plot(np.full(len(knn_scores), all_died_error), label="Assume Largest Class")
    axarr[2].legend()
    plot.yticks([0, .25, .5, .75, 1])

    # Shared axis labels for the whole figure.
    f.text(0.5, 0.04, 'Model index', ha='center', va='center')
    f.text(0.06, 0.5, 'Generalisation error', ha='center', va='center', rotation='vertical')

    plot.show()