mlp = consider_neural_net()

print("Best Logistic Regression", logit)
print("Best Perceptron", perceptron)
print("Best DTree", dtree)
print("Best RForest", rforest)
print("Best MLP", mlp)

#%% Plot Results

# Helper method to make a series of box-plots from a dictionary:
simple_boxplot(
    {
        "Logistic Regression": bootstrap_accuracy(logit.model, X_vali, y_vali),
        "Perceptron": bootstrap_accuracy(perceptron.model, X_vali, y_vali),
        "Decision Tree": bootstrap_accuracy(dtree.model, X_vali, y_vali),
        "RandomForest": bootstrap_accuracy(rforest.model, X_vali, y_vali),
        "MLP/NN": bootstrap_accuracy(mlp.model, X_vali, y_vali),
    },
    title="Validation Accuracy",
    xlabel="Model",
    ylabel="Accuracy",
    save="model-cmp.png",
)

TODO("1. Understand consider_decision_trees; I have 'tuned' it.")
TODO("2. Find appropriate max_iter settings to stop warning messages.")
TODO(
    "3. Pick a model: {perceptron, logistic regression, neural_network} and optimize it!"
)
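#%% A minimal sketch for TODO 2, assuming X_train/y_train from the split above:
# raise max_iter until sklearn stops warning. Turning ConvergenceWarning into an
# error lets us find the smallest setting that actually converges.
import warnings

from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression

for max_iter in [100, 500, 1000, 5000]:
    with warnings.catch_warnings():
        warnings.simplefilter("error", ConvergenceWarning)
        try:
            LogisticRegression(max_iter=max_iter).fit(X_train, y_train)
            print("converged at max_iter={}".format(max_iter))
            break
        except ConvergenceWarning:
            print("max_iter={}: not converged yet".format(max_iter))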
# RandomForestRegressor lives in sklearn.ensemble; the tree import alone is not
# enough to build the forest below:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# Direct feature-importances (can think of them as how many times a feature was used):
rf = RandomForestRegressor(random_state=RAND, n_estimators=100)
rf.fit(train_X, train_y)

# Loop over each tree and ask it how important each feature was!
importances = dict((name, []) for name in numberer.feature_names_)
for tree in rf.estimators_:
    for name, weight in zip(numberer.feature_names_, tree.feature_importances_):
        importances[name].append(weight)

# Think: what does 'how many splits' actually measure? Usefulness, or something else?
simple_boxplot(
    importances,
    title="Tree Importances",
    ylabel="Decision Tree Criterion Importances",
    save="graphs/DecisionTree-importances.png",
)

#%%
# graphs: T.Dict[str, T.List[float]] = {}
# @dataclass
# class Model:
#     vali_score: float
#     m: T.Any
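#%% The boxplot above shows impurity-based importances, which reward features that are
# split on often, not necessarily features the model needs. Permutation importance is a
# more direct check: shuffle one feature at a time and watch the score drop. A hedged
# sketch; vali_X/vali_y are assumed names for a held-out split, not from the original:
from sklearn.inspection import permutation_importance

perm = permutation_importance(rf, vali_X, vali_y, n_repeats=10, random_state=RAND)
perm_importances = {
    name: list(perm.importances[i]) for i, name in enumerate(numberer.feature_names_)
}
simple_boxplot(
    perm_importances,
    title="Permutation Importances",
    ylabel="Score drop when feature is shuffled",
    save="graphs/DecisionTree-permutation-importances.png",
)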
print(
    "Linear[{}] AUC: {:.3}".format(
        linear, roc_auc_score(y_score=scores, y_true=y_vali)
    )
)
# bootstrap AUC: (doing this manually because the helper function doesn't accept scores out of nowhere!)
dist = []
# do the bootstrap:
for trial in range(100):
    sample_pred, sample_truth = resample(
        scores, y_vali, random_state=trial + RANDOM_SEED
    )  # type:ignore
    score = roc_auc_score(y_true=sample_truth, y_score=sample_pred)  # type:ignore
    dist.append(score)
results["Linear[{}]".format(linear)] = dist

#%% Boxplot all AUC results:
simple_boxplot(results, ylabel="AUC", save="{}-text-AUC.png".format(dataset))

from shared import TODO

TODO(
    "1. Explore alpha and linear parameters; make a decision about what a good choice for this dataset might be."
)
# 2 is once again a choose-your-own:
TODO(
    "2A. Explore ngrams, lowercase v. uppercase, etc. (how changing CountVectorizer changes performance, or not)"
)
TODO(
    "2B. Explore the difference between today's approaches to the WIKI dataset and yesterday's."
)
TODO(
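#%% A starting point for TODO 2A, sketched under assumptions: text_train/text_vali are
# the raw text splits and y_train/y_vali their labels (illustrative names, not from the
# original code above).
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

for lowercase in [True, False]:
    for ngram_range in [(1, 1), (1, 2)]:
        vec = CountVectorizer(lowercase=lowercase, ngram_range=ngram_range)
        tX_train = vec.fit_transform(text_train)
        tX_vali = vec.transform(text_vali)
        clf = LogisticRegression(max_iter=1000).fit(tX_train, y_train)
        print(
            "lowercase={} ngrams={}: vali-acc={:.3}".format(
                lowercase, ngram_range, clf.score(tX_vali, y_vali)
            )
        )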
    ca = ca_restart()
    # note: this scores on the validation split, so call it what it is:
    vali_score = ca.score(X_vali, y_vali)
    if vali_score > best_score or best_model is None:
        best_score = vali_score
        best_model = ca
    print("ca[{}] = {:.3}".format(i, vali_score))

accuracy_graphs["CoordinateAscent"] = bootstrap_accuracy(best_model, X_vali, y_vali)
auc_graphs["CoordinateAscent"] = bootstrap_auc(best_model, X_vali, y_vali)

do_slow_AUC_experiment = False
if do_slow_AUC_experiment:
    best_score = 0.0
    best_model = None
    for i in range(20):
        ca = ca_restart(measure=lambda m, X, y: m.compute_auc(X, y))
        vali_score = ca.score(X_vali, y_vali)
        if vali_score > best_score or best_model is None:
            best_score = vali_score
            best_model = ca
        print("ca-AUC[{}] = {:.3}".format(i, vali_score))

    accuracy_graphs["CoordinateAscent-AUC"] = bootstrap_accuracy(
        best_model, X_vali, y_vali
    )
    auc_graphs["CoordinateAscent-AUC"] = bootstrap_auc(best_model, X_vali, y_vali)

simple_boxplot(auc_graphs, "Linear Model AUC", save="graphs/p11-AUC.png")
simple_boxplot(accuracy_graphs, "Linear Model Accuracy", save="graphs/p11-Accuracy.png")
)
vali_sX = np.hstack(
    [
        best_textual.m.predict_proba(vali_xd["textual"]),
        vali_xd["numeric"],
    ]
)
test_sX = np.hstack(
    [
        best_textual.m.predict_proba(test_xd["textual"]),
        test_xd["numeric"],
    ]
)

stacked = LogisticRegression(random_state=RAND)
stacked.fit(train_sX, train_y)

graphs = {
    "textual": bootstrap_auc(best_textual.m, test_xd["textual"], test_y),
    "numeric": bootstrap_auc(best_numeric.m, test_xd["numeric"], test_y),
    "merged": bootstrap_auc(best_merged.m, test_xd["merged"], test_y),
    "stacked": bootstrap_auc(stacked, test_sX, test_y),
}

simple_boxplot(
    graphs, ylabel="AUC", xlabel="method", save="graphs/p10-early-vs-stacked.png"
)
# %%
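#%% A caveat on the stacking step above: train_sX stacks predict_proba outputs computed
# on the same data best_textual was fit on, which makes those probabilities optimistic.
# A hedged sketch of the usual fix, out-of-fold predictions (assumes train_xd mirrors
# vali_xd/test_xd and that best_textual.m is a clonable sklearn estimator):
from sklearn.base import clone
from sklearn.model_selection import cross_val_predict

oof_probs = cross_val_predict(
    clone(best_textual.m), train_xd["textual"], train_y, cv=5, method="predict_proba"
)
train_sX_oof = np.hstack([oof_probs, train_xd["numeric"]])
stacked_oof = LogisticRegression(random_state=RAND).fit(train_sX_oof, train_y)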
# plot area from means & stddev
plt.fill_between(sample_subset, means - std, means + std, alpha=0.2)

# Manage axes/show:
plt.xlabel("Training Data by 50")
plt.ylabel("Mean Accuracy")
plt.xlim([0, N])
plt.title("Shaded Accuracy Plot")
plt.savefig("graphs/p09-area-Accuracy.png")
plt.show()

# Second look at the boxplots in-order: (I like this better, IMO)
simple_boxplot(
    scores,
    "Learning Curve",
    xlabel="Percent Training Data",
    ylabel="Accuracy",
    save="graphs/p09-boxplots-Accuracy.png",
)

# TODO: (practical tasks)
# 1. Swap in a better, but potentially more expensive classifier.
#    - Even DecisionTreeClassifier has some more interesting behavior on these plots.
# 2. Change the plots to operate over multiples of 50 samples, instead of percentages.
#    - This will likely be how you want to make these plots for your project.
# OPTIONAL CHALLENGE:
#   Refactor the code so that you can evaluate multiple models in this fashion.
#   Two different models at the same time will likely max out the visual utility of the plot.
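#%% A sketch for TODO 2 above, under assumptions: X_train/y_train and X_vali/y_vali
# exist, and LogisticRegression stands in for whatever classifier the curve uses.
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

per_50_scores = {}
for n in range(50, len(y_train) + 1, 50):
    trials = []
    for trial in range(10):
        X_sub, y_sub = resample(
            X_train, y_train, n_samples=n, replace=False, random_state=trial
        )
        clf = LogisticRegression(max_iter=1000).fit(X_sub, y_sub)
        trials.append(clf.score(X_vali, y_vali))
    per_50_scores[str(n)] = trials

simple_boxplot(
    per_50_scores,
    "Learning Curve (per 50 samples)",
    xlabel="Num Training Samples",
    ylabel="Accuracy",
    save="graphs/p09-boxplots-per50-Accuracy.png",
)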
# TODO: C is the most important value for an SVM.
#       1/C controls how strongly the model is pushed to stay small (regularized).
# TODO: RBF Kernel is the best; explore its 'gamma' parameter.
for cfg in configs:
    variants: T.List[ModelInfo] = []
    for class_weights in [None, "balanced"]:
        for c_val in [10, 25, 30, 50]:
            # gamma only matters for rbf/poly/sigmoid kernels; for a linear kernel
            # there is nothing to explore, so try just the default:
            gammas = ["scale"] if cfg["kernel"] == "linear" else ["scale", "auto"]
            for gam in gammas:
                # pass gamma through so the trained model matches the printed name:
                svm = SVMClassifier(
                    C=c_val, class_weight=class_weights, gamma=gam, **cfg
                )
                svm.fit(X_train, y_train)
                name = "k={}{} C={} {}, gamma={}".format(
                    cfg["kernel"],
                    cfg.get("degree", ""),
                    c_val,
                    class_weights or "",
                    gam,
                )
                accuracy = svm.score(X_vali, y_vali)
                print("{}. score= {:.3}".format(name, accuracy))
                variants.append(ModelInfo(name, accuracy, svm))
    best = max(variants, key=lambda x: x.accuracy)
    graphs[best.name] = bootstrap_accuracy(best.model, X_vali, y_vali)

simple_boxplot(
    graphs,
    title="Kernelized Models for Poetry",
    ylabel="Accuracy",
    save="graphs/p15-kernel-cmp.png",
)
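#%% "scale" and "auto" are only two heuristics; numeric gamma values often matter more
# for the RBF kernel. A quick hedged sweep (C fixed at 30 purely for illustration):
for gam in [0.001, 0.01, 0.1, 1.0]:
    svm = SVMClassifier(kernel="rbf", C=30, gamma=gam)
    svm.fit(X_train, y_train)
    print("rbf gamma={}: vali-acc={:.3}".format(gam, svm.score(X_vali, y_vali)))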
importances = dict((name, []) for name in feature_numbering.feature_names_)
for tree in rf.estimators_:
    for name, weight in zip(
        feature_numbering.feature_names_, tree.feature_importances_
    ):
        importances[name].append(weight)

# keep only the features whose mean importance clears a threshold:
im = {}
import statistics as st

for name in importances.keys():
    if st.mean(importances[name]) > 0.04:
        im[name] = importances[name]

from shared import simple_boxplot, bootstrap_r2

simple_boxplot(
    im,
    title="Tree Importances",
    ylabel="Decision Tree Criterion Importances",
    save="graphs/project/feature-importance.png",
)

import typing as T
from dataclasses import dataclass

from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

graphs: T.Dict[str, T.List[float]] = {}


@dataclass
class Model:
    vali_score: float
    m: T.Any
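#%% The 0.04 mean-importance cutoff above is arbitrary; an alternative sketch keeps the
# top-k features ranked by mean importance instead (top_k here is an illustrative choice):
top_k = 10
ranked = sorted(importances.keys(), key=lambda n: st.mean(importances[n]), reverse=True)
im_topk = {name: importances[name] for name in ranked[:top_k]}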
# Coordinate ascent; try them:
best_score = 0.0
best_model = None
for i in range(3):
    ca = ca_restart()
    vali_score = ca.score(X_vali, y_vali)  # a validation score, not a training score
    if vali_score > best_score or best_model is None:
        best_score = vali_score
        best_model = ca
    print("ca[{}] = {:.3}".format(i, vali_score))

graphs["CoordinateAscent"] = bootstrap_r2(best_model, X_vali, y_vali)

# --- now try some nonlinear models --- #
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)
graphs["RF"] = bootstrap_r2(rf, X_vali, y_vali)

knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
graphs["KNN"] = bootstrap_r2(knn, X_vali, y_vali)

# Graph everything:
simple_boxplot(graphs, "{} R**2".format(PREDICT_COL), save="graphs/p11-r2-score.png")

##
# TODO:
# 1. Remove the 'best-of-random' graph, so you can see the other ones!
# 2. See if there's anything here that might help your project.
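#%% Default hyperparameters are a weak baseline; a quick hedged sweep over n_neighbors
# for the KNN regressor (this k grid is illustrative):
for k in [3, 5, 9, 15, 25]:
    knn_k = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    print("knn k={}: vali R**2 = {:.3}".format(k, knn_k.score(X_vali, y_vali)))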
f_single.fit(X_train, y_train)
y_pred = f_single.predict(X_vali)
assert f_single.score(X_vali, y_vali) == accuracy_score(y_vali, y_pred)  # type:ignore

# do the bootstrap:
for trial in range(N_SAMPLES):
    sample_pred, sample_truth = resample(
        y_pred, y_vali, random_state=trial + RANDOM_SEED
    )  # type:ignore
    score = accuracy_score(y_true=sample_truth, y_pred=sample_pred)  # type:ignore
    bootstrap_based_accuracies.append(score)

plot = simple_boxplot(
    {
        "Seed-Based (Best)": splitter_best_seeds,
        "Seed-Based (Random)": splitter_random_seeds,
        "Bootstrap-Based": bootstrap_based_accuracies,
    },
    xlabel="Sampling Method",
    ylabel="Accuracy",
    show=False,
    save="split-compare.png",
)
plot.ylim([0.8, 1.0])
plot.show()

# If plt.show is not working, try opening the result of plt.savefig instead!
# plot.savefig("dtree-variance.png")  # This doesn't work well on repl.it.
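#%% The boxplot shows the spread visually; the same bootstrap samples also yield a
# numeric confidence interval. A minimal sketch (bootstrap_ci is a hypothetical helper):
import numpy as np

def bootstrap_ci(scores, level=0.95):
    # central (level) interval of the bootstrap distribution:
    lo = (1.0 - level) / 2.0
    return np.quantile(scores, lo), np.quantile(scores, 1.0 - lo)

print("Bootstrap 95% CI:", bootstrap_ci(bootstrap_based_accuracies))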
# ExperimentResult(vali_acc=0.7033403492268693, params={'n_neighbors': 8, 'weights': 'uniform'}, model=KNeighborsRegressor(n_neighbors=8))

# A linear model does not work for this dataset: the features are highly correlated,
# which makes it very unstable, and it needs too many iterations (too slow to train)
# to reach a good optimum.
del X_temp, y_temp

from shared import simple_boxplot, bootstrap_regressor

simple_boxplot(
    {
        "Decision Tree": bootstrap_regressor(dtree.model, X_vali, y_vali),
        "knn": bootstrap_regressor(knn.model, X_vali, y_vali),
        # "MLP/NN": bootstrap_accuracy(mlp.model, X_vali, y_vali),
    },
    title="Validation Performance",
    xlabel="Model",
    ylabel="Mean Squared Error",
    save="graphs/project/model-cmp.png",
)

# The decision tree performs better than knn for this dataset. The bootstrapped boxplot
# shows that this dataset is of rather high quality, without many outliers or much variance.
del dtree, knn

#%% Is my dataset large enough?
#%% Compute performance for each % of training data