def main():
    """Forward-select features for predicting CleanedFare with linear
    regression and a neural network, then plot train vs. test error for
    both alongside a predict-the-mean-fare baseline.
    """
    linear = titanic_data.linear()
    data = linear[:-89]           # training portion
    gen_est_data = linear[-89:]   # held back for generalisation estimates

    regr_baseline = fwd.get_baseline(data)
    predictor_features = data.columns.drop('CleanedFare')

    # Linear Regression
    lin_best, lin_scorechart = fwd.forward_select(
        data, [], predictor_features, regr_baseline, fwd.score_regr, scores=[])
    lin_labels, lin_scores, lin_error_rates = fwd.process_scorechart(lin_scorechart)
    lin_gen_est_errors = fwd.estimate_generalisation_errors(
        lin_scorechart, data, gen_est_data, fwd.score_regr)
    fwd.table_display(lin_labels, lin_scores, lin_gen_est_errors)

    # Baseline: always predict the mean training fare.
    avg_fare = np.mean(data.CleanedFare.values)
    # BUG FIX: the old code built the array with empty()+fill() and then
    # called avg_arr.reshape(1, -1) and discarded the result — reshape is
    # not in-place, so that call was a no-op. np.full replaces all three.
    avg_arr = np.full(len(gen_est_data), avg_fare)
    print("avg_arr", avg_arr)
    avg_fare_error = 1 - metrics.r2_score(gen_est_data.CleanedFare.values, avg_arr)
    print("Average:", avg_fare)
    print("Average Error:", avg_fare_error)

    # Neural Network
    nn_baseline = fwd.get_baseline(data, test_func=fwd.score_nn)
    nn_best, nn_scorechart = fwd.forward_select(
        data, [], predictor_features, nn_baseline,
        test_func=fwd.score_nn, est_func=fwd.test_kfold, scores=[])
    nn_labels, nn_scores, nn_error_rates = fwd.process_scorechart(nn_scorechart)
    nn_gen_est_errors = fwd.estimate_generalisation_errors(
        nn_scorechart, data, gen_est_data, fwd.score_nn)
    fwd.table_display(nn_labels, nn_scores, nn_gen_est_errors)

    # Graphing code: one panel per model, shared y axis, with the
    # mean-fare baseline overlaid on each.
    f, axarr = plot.subplots(1, 2, sharey=True)
    fwd.plot_scores(axarr[0], lin_scores, "Linear Regression", "Train")
    fwd.plot_scores(axarr[0], lin_gen_est_errors, "Linear Regression", "Test")
    axarr[0].plot(np.full(len(lin_scores), avg_fare_error),
                  label="Assume Average Fare")
    axarr[0].legend()
    fwd.plot_scores(axarr[1], nn_scores, "Neural Network", "Train")
    fwd.plot_scores(axarr[1], nn_gen_est_errors, "Neural Network", "Test")
    axarr[1].plot(np.full(len(nn_scores), avg_fare_error),
                  label="Assume Average Fare")
    axarr[1].legend()
    f.text(0.5, 0.04, 'Model index', ha='center', va='center')
    f.text(0.06, 0.5, 'Generalisation error', ha='center', va='center',
           rotation='vertical')
    plot.show()
def main():
    """Run forward selection on the fare-regression task twice — once
    scored with holdout, once with k-fold — and compare training error
    against the generalisation estimate for each.
    """
    full = titanic_data.linear()
    train, reserve = full[:-50], full[-50:]
    baseline = fwd.get_baseline(train)
    features = train.columns.drop('CleanedFare')

    # Holdout estimation
    _, ho_chart = fwd.forward_select(train, [], features, baseline,
                                     fwd.score_regr, fwd.test_holdout, [])
    ho_labels, ho_scores, ho_rates = fwd.process_scorechart(ho_chart)
    ho_gen = fwd.estimate_generalisation_errors(ho_chart, train, reserve,
                                                fwd.score_regr)
    fwd.table_display(ho_labels, ho_rates, ho_gen)
    print("ho_gen_est is", ho_gen)

    # K-fold estimation
    _, k_chart = fwd.forward_select(train, [], features, baseline,
                                    fwd.score_regr, fwd.test_kfold, [])
    k_labels, k_scores, k_rates = fwd.process_scorechart(k_chart)
    k_gen = fwd.estimate_generalisation_errors(k_chart, train, reserve,
                                               fwd.score_regr)
    fwd.table_display(k_labels, k_rates, k_gen)

    # Side-by-side train/test curves for the two estimators.
    fig, axes = plot.subplots(1, 2)
    fwd.plot_scores(axes[0], ho_scores, "Holdout method", "Train")
    fwd.plot_scores(axes[0], ho_gen, "Holdout method", "Test")
    fwd.plot_scores(axes[1], k_scores, "K-Fold method", "Train")
    fwd.plot_scores(axes[1], k_gen, "K-Fold method", "Test")
    fig.text(0.5, 0.04, 'Model index', ha='center', va='center')
    fig.text(0.06, 0.5, 'Generalisation error', ha='center', va='center',
             rotation='vertical')
    plot.show()
"""Forward-select features for a decision-tree survival classifier and
plot training vs. estimated generalisation error per model."""
import titanic_data
import forward_select as fwd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

data = titanic_data.linear()
train = data[:-89]      # training portion
gen_test = data[-89:]   # held back for generalisation estimates

best_model, scorechart = fwd.forward_select(
    train, [], data.columns.drop('Survived'), 1, fwd.score_tree,
    target='Survived')
gen_est_errors = fwd.estimate_generalisation_errors(
    scorechart, train, gen_test, fwd.score_tree, 'Survived')
labels, scores, errors = fwd.process_scorechart(scorechart)

plt.plot(scores, label="Training error", marker='o')
plt.plot(gen_est_errors, label="Test error", marker='o')
plt.title("Decision tree")
plt.xlabel('Model index')
plt.ylabel('Generalisation error')
# BUG FIX: the tick labels were range(1, len(scores)) — one label short of
# the len(scores) tick positions. The sibling KNN script uses + 1 correctly.
plt.xticks(range(len(scores)), range(1, len(scores) + 1))
plt.legend()
plt.show()
"""Forward-select features for the fare neural network, scoring each
candidate model with k-fold estimation, and print the result table."""
import titanic_data
import forward_select as fwd
from sklearn.preprocessing import StandardScaler

full_set = titanic_data.linear()
data = full_set[:-50]          # training portion
gen_est_data = full_set[-50:]  # reserved rows (not referenced below)

candidates = data.columns.drop('CleanedFare')
baseline = fwd.get_baseline(data, test_func=fwd.score_nn)
best, chart = fwd.forward_select(data, [], candidates, baseline,
                                 test_func=fwd.score_nn,
                                 est_func=fwd.test_kfold)
labels, train_scores, error_rates = fwd.process_scorechart(chart)
fwd.table_display(labels, error_rates)
"""Forward selection with a k-nearest-neighbours survival classifier,
plotting training error against the generalisation estimate."""
import titanic_data
import forward_select as fwd
from sklearn.model_selection import train_test_split
from sklearn import neighbors
import matplotlib.pyplot as plt

full_set = titanic_data.linear()
train, holdback = full_set[:-50], full_set[-50:]

_, chart = fwd.forward_select(train, [], full_set.columns.drop('Survived'),
                              1, fwd.score_knn, target='Survived')
gen_errors = fwd.estimate_generalisation_errors(chart, train, holdback,
                                                fwd.score_knn, 'Survived')
labels, train_scores, _errors = fwd.process_scorechart(chart)

plt.plot(train_scores, label="Training error", marker='o')
plt.plot(gen_errors, label="Test error", marker='o')
plt.title("K-Neighbours")
plt.xlabel('Model index')
plt.ylabel('Generalisation error')
plt.xticks(range(len(train_scores)), range(1, len(train_scores) + 1))
plt.yticks([0, 0.5, 1])
plt.legend()
plt.show()
def main():
    """Compare forward selection for three Survived classifiers (decision
    tree, neural network, k-neighbours) and plot each against an
    assume-the-largest-class baseline.
    """
    linear = titanic_data.linear()
    data = linear[:-89]           # training portion
    gen_est_data = linear[-89:]   # held back for generalisation estimates
    tree_baseline = fwd.get_baseline(data, test_func=fwd.score_tree,
                                     target='Survived')
    predictor_features = data.columns.drop('Survived')

    # Decision Tree
    # NOTE(review): the baseline above uses fwd.score_tree but selection and
    # generalisation estimation below score with fwd.score_cnn — confirm this
    # mismatch is intentional before trusting the "Decision Tree" numbers.
    tree_best, tree_scorechart = fwd.forward_select(
        data, [], predictor_features, tree_baseline, fwd.score_cnn,
        scores=[], target='Survived')
    tree_labels, tree_scores, tree_error_rates = fwd.process_scorechart(tree_scorechart)
    tree_gen_est_errors = fwd.estimate_generalisation_errors(
        tree_scorechart, data, gen_est_data, fwd.score_cnn, 'Survived')
    # BUG FIX: these section headers were Python 2 `print "..."` statements —
    # a SyntaxError under Python 3, which every other block here targets.
    print("Decision Tree")
    fwd.table_display(tree_labels, tree_scores, tree_gen_est_errors)

    # Neural Network
    nn_baseline = fwd.get_baseline(data, test_func=fwd.score_cnn, target='Survived')
    # NOTE(review): the computed baseline is immediately clobbered with 1
    # (preserved here); either drop the get_baseline call or stop overwriting.
    nn_baseline = 1
    nn_best, nn_scorechart = fwd.forward_select(
        data, [], predictor_features, nn_baseline, test_func=fwd.score_nn,
        est_func=fwd.test_kfold, scores=[], target='Survived')
    nn_labels, nn_scores, nn_error_rates = fwd.process_scorechart(nn_scorechart)
    nn_gen_est_errors = fwd.estimate_generalisation_errors(
        nn_scorechart, data, gen_est_data, fwd.score_nn, 'Survived')
    print("Neural Network")
    fwd.table_display(nn_labels, nn_scores, nn_gen_est_errors)

    # K-Neighbours
    knn_best_model, knn_scorechart = fwd.forward_select(
        data, [], data.columns.drop('Survived'), 1, fwd.score_knn,
        target='Survived')
    knn_gen_est_errors = fwd.estimate_generalisation_errors(
        knn_scorechart, data, gen_est_data, fwd.score_knn, 'Survived')
    knn_labels, knn_scores, knn_errors = fwd.process_scorechart(knn_scorechart)
    print("K-Neighbour")
    fwd.table_display(knn_labels, knn_scores, knn_gen_est_errors)

    # Assume largest class - largest class did not survive (0).
    all_died = np.zeros(np.shape(gen_est_data.Survived.values))
    all_died_error = np.mean((all_died - gen_est_data.Survived.values) ** 2)

    # Graphing code: one panel per classifier, baseline overlaid on each.
    f, axarr = plot.subplots(1, 3)
    panels = [
        (tree_scores, tree_gen_est_errors, "Decision Tree"),
        (nn_scores, nn_gen_est_errors, "Neural Network"),
        (knn_scores, knn_gen_est_errors, "K-Neighbors"),
    ]
    for ax, (train_scores, test_scores, title) in zip(axarr, panels):
        fwd.plot_scores(ax, train_scores, title, "Train")
        fwd.plot_scores(ax, test_scores, title, "Test")
        ax.plot(np.full(len(train_scores), all_died_error),
                label="Assume Largest Class")
        ax.legend()
        plot.yticks([0, .25, .5, .75, 1])
    f.text(0.5, 0.04, 'Model index', ha='center', va='center')
    f.text(0.06, 0.5, 'Generalisation error', ha='center', va='center',
           rotation='vertical')
    plot.show()