auc_graphs = {}

# Baseline: random search over linear models; keep the best on validation data.
best_score = 0.0
best_model = None
for i in range(1000):
    m = LinearModel.random(D)
    vali_score = m.score(X_vali, y_vali)
    if vali_score > best_score or best_model is None:
        best_score = vali_score
        best_model = m
        print("rand[{}] = {:.3}".format(i, vali_score))

print(["{:1.3f}".format(x[0]) for x in best_model.weights.tolist()])
accuracy_graphs["Random"] = bootstrap_accuracy(best_model, X_vali, y_vali)
auc_graphs["Random"] = bootstrap_auc(best_model, X_vali, y_vali)

# Reset the search so the "SGD" graphs record an SGD model, not the random winner.
best_score = 0.0
best_model = None
for i in range(20):
    sgd = SGDClassifier(random_state=i + RANDOM_SEED)
    sgd.fit(X_train, y_train)
    vali_score = sgd.score(X_vali, y_vali)
    if vali_score > best_score or best_model is None:
        best_score = vali_score
        best_model = sgd
        print("sgd[{}] = {:.3}".format(i, vali_score))

accuracy_graphs["SGD"] = bootstrap_accuracy(best_model, X_vali, y_vali)
auc_graphs["SGD"] = bootstrap_auc(best_model, X_vali, y_vali)


def mini_ca():
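# --- Hedged sketch (not from the original file): `LinearModel` above comes from
# course-provided helpers, so this is an assumed, minimal reconstruction that
# matches only the calls used above: LinearModel.random(D), .score(X, y), and a
# `.weights` column of shape (D, 1).
import numpy as np


class SketchLinearModel:
    def __init__(self, weights):
        self.weights = weights  # shape: (D, 1)

    @classmethod
    def random(cls, D):
        # A random direction in weight space; the search above keeps the best.
        return cls(np.random.randn(D, 1))

    def decision_function(self, X):
        return (X @ self.weights).ravel()

    def predict(self, X):
        return (self.decision_function(X) > 0).astype(int)

    def score(self, X, y):
        # Accuracy, so "higher is better" matches the search loop above.
        return float(np.mean(self.predict(X) == y))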
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Import data; choose the feature space:
from dataset_poetry import y_train, Xd_train, y_vali, Xd_vali

X_train = Xd_train["numeric"]
X_vali = Xd_vali["numeric"]

#%% Baseline: unregularized sklearn LogisticRegression for comparison.
from sklearn.linear_model import LogisticRegression

m = LogisticRegression(random_state=RANDOM_SEED, penalty="none", max_iter=2000)
m.fit(X_train, y_train)
print("skLearn-LR AUC: {:.3}".format(np.mean(bootstrap_auc(m, X_vali, y_vali))))
print("skLearn-LR Acc: {:.3}".format(m.score(X_vali, y_vali)))


def nearly_eq(x, y, tolerance=1e-6):
    return abs(x - y) < tolerance


#%% Convert the numpy arrays to torch tensors:
(N, D) = X_train.shape
X = torch.from_numpy(X_train).float()
y = torch.from_numpy(y_train).long()
Xv = torch.from_numpy(X_vali).float()
yv = torch.from_numpy(y_vali).long()
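# --- Hedged sketch (an assumption about what comes next, not the original
# code): the tensors above are presumably fed to a hand-rolled PyTorch logistic
# regression, to be compared against the sklearn baseline. A minimal version
# could look like this:
import torch.nn as nn


def sketch_train_torch_lr(X, y, D, num_iter=500, lr=0.1):
    model = nn.Linear(D, 2)  # two logits; CrossEntropyLoss applies log-softmax
    objective = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    for _ in range(num_iter):
        optimizer.zero_grad()
        loss = objective(model(X), y)
        loss.backward()
        optimizer.step()
    return model


# Usage with the tensors defined above, e.g.:
#   torch_lr = sketch_train_torch_lr(X, y, D)
#   vali_acc = (torch_lr(Xv).argmax(dim=1) == yv).float().mean().item()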
#%% Try sklearn MultinomialNB:
## sklearn has its own Multinomial Naive Bayes,
# and it uses alpha / additive smoothing to deal with zeros!
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score

# Try a couple of alpha values (what to do with zero-probability words!)
# Alpha can really be anything positive!
for alpha in [0.05, 0.1, 1.0, 10.0, 50.0]:
    m = MultinomialNB(alpha=alpha)
    m.fit(X_train, y_train)
    scores = m.predict_proba(X_vali)[:, 1]
    print(
        "Accuracy: {:.3}, AUC: {:.3}".format(
            m.score(X_vali, y_vali), roc_auc_score(y_score=scores, y_true=y_vali)
        )
    )
    print("What I called log(beta)={}".format(m.class_log_prior_[1]))
    results["MNB(alpha={})".format(alpha)] = bootstrap_auc(m, X_vali, y_vali)

#%% Showcase linear smoothing:
from collections import Counter
from dataclasses import dataclass, field
import typing

# Classify by: P(x|POETRY) / P(x|EVERYTHING) > some constant?


@dataclass
class CountLanguageModel:
    """The number of times each word has been observed."""
    counts: typing.Counter[str] = field(default_factory=Counter)
    # default_factory: a zero-argument callable that is called whenever a
    # default value is needed for this field.
    """The total number of observed words (any word)."""
    total: int = 0
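# --- Hedged sketch (illustration only): "linear smoothing" here means linearly
# interpolating a class-specific model with a background model, so words unseen
# in the poetry counts still get nonzero probability:
#
#     P_smooth(w) = lam * P(w | POETRY) + (1 - lam) * P(w | EVERYTHING)
#
# The helper below is hypothetical; it only demonstrates the formula.
def sketch_linear_smooth(p_class: float, p_background: float, lam: float = 0.5) -> float:
    assert 0.0 <= lam <= 1.0
    return lam * p_class + (1.0 - lam) * p_background


# e.g., a word with zero poetry probability but background probability 1e-4:
#   sketch_linear_smooth(0.0, 1e-4, lam=0.9) == 1e-05  (small, but not zero)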
)
vali_sX = np.hstack(
    [
        best_textual.m.predict_proba(vali_xd["textual"]),
        vali_xd["numeric"],
    ]
)
test_sX = np.hstack(
    [
        best_textual.m.predict_proba(test_xd["textual"]),
        test_xd["numeric"],
    ]
)

# Late fusion: fit a second-stage classifier on the stacked features.
stacked = LogisticRegression(random_state=RAND)
stacked.fit(train_sX, train_y)

graphs = {
    "textual": bootstrap_auc(best_textual.m, test_xd["textual"], test_y),
    "numeric": bootstrap_auc(best_numeric.m, test_xd["numeric"], test_y),
    "merged": bootstrap_auc(best_merged.m, test_xd["merged"], test_y),
    "stacked": bootstrap_auc(stacked, test_sX, test_y),
}

simple_boxplot(
    graphs, ylabel="AUC", xlabel="method", save="graphs/p10-early-vs-stacked.png"
)
# %%
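# --- Hedged aside (not in the original file): sklearn also ships a built-in
# stacking ensemble. The manual hstack-of-predict_proba above is the same idea;
# StackingClassifier additionally uses cross-validated base predictions when
# fitting the final estimator. The base estimators below are placeholders.
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier

sketch_stack = StackingClassifier(
    estimators=[("tree", DecisionTreeClassifier()), ("lr", LogisticRegression())],
    final_estimator=LogisticRegression(),
    passthrough=True,  # also pass the raw features to the final estimator
)
# Note: StackingClassifier requires all base models to share one feature matrix;
# the hand-rolled version above exists precisely because "textual" and "numeric"
# are different feature spaces.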
    for _ in range(num_iter):
        # One "epoch": n_samples minibatches, each a gradient step.
        for _ in range(n_samples):
            X_mb, y_mb = resample(X_train, y_train, n_samples=minibatch_size)
            m.weights += alpha * compute_gradient_update(m, X_mb, y_mb)
        # Record performance once per pass:
        plot.add_sample(m, X_train, y_train, X_vali, y_vali)
    return m


# 2. Pick a smaller max_iter that gets good performance.
#    When num_iter is 1000, both the training and the validation curves have
#    more or less flattened out.
for alpha in [0.05, 0.1, 0.5, 1.0, 2.0]:
    m = train_logistic_regression_sgd_opt("LR-SGD", alpha, num_iter=1000)
    print("LR-SGD AUC: {:.3}".format(np.mean(bootstrap_auc(m, X_vali, y_vali))))
    print("LR-SGD Acc: {:.3}".format(m.score(X_vali, y_vali)))

# (A) Explore Learning Rates:
#
# 3. Make ``alpha``, the learning rate, a parameter of the train function.
# 4. Make a graph including some faster and slower alphas.
#    ... what do you notice?
## Both the training and the validation curves move up (and flatten out sooner)
## as alpha increases, but the change becomes less noticeable for alphas of 0.5
## or greater.

## Create training-curve plots:
import matplotlib.pyplot as plt

for key, dataset in learning_curves.items():
    scores = m.decision_function(X_vali)
else:
    scores = m.predict_proba(X_vali)[:, 1]
print("\tVali-AUC: {:.3}".format(roc_auc_score(y_score=scores, y_true=y_vali)))

from sklearn.utils import resample
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Is it randomness? Use simple_boxplot and bootstrap_auc/bootstrap_accuracy to
# see if the differences are meaningful!
from shared import bootstrap_accuracy, bootstrap_auc

f = DecisionTreeClassifier()
f.fit(X_train, y_train)
# Name the results distinctly so we don't shadow the imported helper functions:
boot_acc = bootstrap_accuracy(f=f, X=X_vali, y=y_vali)
boot_auc = bootstrap_auc(f=f, X=X_vali, y=y_vali)
print(boot_acc[:1])
print(boot_auc[:1])

plt.boxplot([boot_acc, boot_auc])
plt.xticks(ticks=[1, 2], labels=["bootstrap_acc", "bootstrap_auc"])
plt.xlabel("DecisionTree bootstraps")
plt.ylabel("score")  # the two boxes are accuracy and AUC, not accuracy alone
plt.ylim([0.3, 1.0])
plt.show()

# 2.D. Is it randomness? Control for random_state parameters!
"""
Results should be something like: