def solve(tX, y):
    """Fit a ridge regression on an 80/20 split and score both parts.

    The data is split with ``split_data`` (ratio 0.8, seed 2019), a ridge
    regression with ``lambda_ = 1`` is fitted on the training part, and the
    labels predicted for each part are scored with ``compute_accuracy``.

    Parameters
    ----------
    tX:
        Feature matrix handed to ``split_data``.
    y:
        Labels handed to ``split_data``.

    Returns
    -------
    tuple
        ``(acc_tr, acc_te)``: the accuracy on the training part and on the
        held-out part.
    """
    train_x, train_y, test_x, test_y = split_data(
        tX, y, ratio=0.8, seed=2019)

    # Fixed regularisation strength; the second return value is not needed.
    model_w, _ = ridge_regression(train_y, train_x, 1)

    # Predict on both parts first, then score them, train before test.
    predicted = [predict_labels(model_w, part) for part in (train_x, test_x)]
    return tuple(
        compute_accuracy(truth, guess)
        for truth, guess in zip((train_y, test_y), predicted)
    )
def main():
    """Train one ridge model per data group and write the test submission.

    Steps, in order:

    1. Load the train and test CSV files.
    2. Preprocess both sets stacked together (so log / root features get
       the same shifts) and split them back apart.
    3. Partition each set with ``divide_data`` (by PRI_jet_num value).
    4. For every group, pick a lambda with ``lambda_cv`` and fit a ridge
       regression on that group's training rows.
    5. Predict on both sets, print the overall training accuracy and the
       predicted class distribution on the test set, and write the
       submission to ``OUTPUT_PATH``.
    """
    y_train, tX_train, _ = load_csv_data(DATA_TRAIN_PATH)
    _, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
    np.random.seed(2019)

    # Preprocess data together to have the same shifts while creating log or
    # root features
    tX_stacked = np.vstack((tX_train, tX_test))
    prep_param = {
        "bias": True,
        "fill": True,
        "standardize": False,
        "degree": 8,
        "log": True,
        "root": True,
    }
    tX_stacked_prep, *_ = preprocess_data(tX_stacked, None, prep_param)
    tX_train_prep, tX_test_prep = np.split(tX_stacked_prep, [len(tX_train)])

    # Split data according to PRI_jet_num value
    tX_tr_splitted, indices_tr = divide_data(tX_train_prep)
    tX_te_splitted, indices_te = divide_data(tX_test_prep)
    n_models = len(indices_tr)

    # Train one model per group
    weights = []
    for i in range(n_models):
        y_tr_group = y_train[indices_tr[i]]
        lambda_ = lambda_cv(tX_tr_splitted[i], y_tr_group)
        print(f"Class {i}, lambda: {lambda_}")
        weights.append(
            ridge_regression(y_tr_group, tX_tr_splitted[i], lambda_)[0])

    # Predict: every group writes into its own rows of the full vectors
    y_pr_tr = np.zeros(tX_train.shape[0])
    y_pr_te = np.zeros(tX_test.shape[0])
    for i in range(n_models):
        y_pr_tr[indices_tr[i]] = predict_labels(weights[i], tX_tr_splitted[i])
        y_pr_te[indices_te[i]] = predict_labels(weights[i], tX_te_splitted[i])

    acc_tr = compute_accuracy(y_train, y_pr_tr)
    print(f"Total accuracy train: {acc_tr}")

    # Count by label value rather than indexing the result of np.unique:
    # when the predictions do not contain exactly the two classes, the
    # positional counts are either missing (IndexError) or belong to a
    # different value than the message claims.
    # Assumes predict_labels emits -1 / +1, as the message below states.
    n_neg = np.count_nonzero(y_pr_te == -1)
    n_pos = np.count_nonzero(y_pr_te == 1)
    print(
        f"Distribution on test data class -1: {n_neg}, class +1: {n_pos}"
    )
    create_csv_submission(ids_test, y_pr_te, OUTPUT_PATH)
def train_3models(tX, y):
    """Train and evaluate one ridge model per data group on an 80/20 split.

    The data is preprocessed, split into a fitting part and a validation
    part (ratio 0.8, seed 2019), and each part is partitioned with
    ``divide_data`` (by PRI_jet_num value). A ridge regression is fitted per
    group with a lambda chosen by ``lambda_cv``. Group shapes, chosen
    lambdas, weight lengths and accuracies (overall and per group) are
    printed; nothing is returned.

    Parameters
    ----------
    tX:
        Feature matrix handed to ``preprocess_data``.
    y:
        Labels handed to ``preprocess_data``.
    """
    # Preprocess data together to have the same shifts while creating log or
    # root features
    settings = dict(
        bias=True,
        fill=True,
        standardize=False,
        degree=8,
        log=True,
        root=True,
    )
    features, labels, _ = preprocess_data(tX, y, settings)
    x_fit, y_fit, x_val, y_val = split_data(
        features, labels, ratio=0.8, seed=2019)

    # Split data according to PRI_jet_num value
    fit_groups, fit_rows = divide_data(x_fit)
    val_groups, val_rows = divide_data(x_val)
    group_count = len(fit_groups)

    # Labels of every fitting group, reporting each group's shape on the way
    y_groups = []
    for g in range(len(fit_rows)):
        y_groups.append(y_fit[fit_rows[g]])
        print(fit_groups[g].shape)

    # Train
    models = []
    for g in range(group_count):
        best_lambda = lambda_cv(fit_groups[g], y_groups[g])
        print(f"Class {g}, lambda: {best_lambda}")
        group_w = ridge_regression(y_groups[g], fit_groups[g], best_lambda)[0]
        models.append(group_w)
        print(len(group_w))

    # Predict: every group fills in its own rows
    pred_fit = np.zeros(y_fit.shape)
    pred_val = np.zeros(y_val.shape)
    for g in range(group_count):
        pred_fit[fit_rows[g]] = predict_labels(models[g], fit_groups[g])
        pred_val[val_rows[g]] = predict_labels(models[g], val_groups[g])

    # Get accuracy, first overall and then group by group
    total_fit = compute_accuracy(y_fit, pred_fit)
    total_val = compute_accuracy(y_val, pred_val)
    print(f"Total accuracy tr: {total_fit}, te: {total_val}")
    for g in range(group_count):
        rows_fit = fit_rows[g]
        group_fit = compute_accuracy(y_fit[rows_fit], pred_fit[rows_fit])
        rows_val = val_rows[g]
        group_val = compute_accuracy(y_val[rows_val], pred_val[rows_val])
        print(f"Class {g}, Accuracy tr: {group_fit}, te: {group_val}")
def least_squares_demo(y, x, k):
    """Run k-fold cross validation for the least squares model and report it.

    The weights of the fold with the smallest test MSE are kept and used to
    predict labels on the whole data set. The smallest MSE and the resulting
    accuracy are printed; the function returns nothing.

    Parameters
    ----------
    y:
        Labels; compared element-wise with the predicted labels.
    x:
        Raw features, expanded with ``helpers.build_poly(x, 1)``.
    k:
        Number of folds.
    """
    seed = 1
    tx = helpers.build_poly(x, 1)
    # split data in k fold
    k_indices = helpers.build_k_indices(y, k, seed)

    mse_errors = []
    weights = []
    for i in range(k):
        mse_te, fold_w = cross_validation_ls(y, tx, k_indices, i)
        mse_errors.append(mse_te)
        weights.append(fold_w)

    # Look the best fold up once and reuse the index for the weights.
    best = np.argmin(mse_errors)
    mse = np.min(mse_errors)
    # The weights are handed over as a single column, hence the wrap + .T
    y_model = helpers.predict_labels(np.array([weights[best]]).T, tx)

    # Computing accuracy
    print("   mse={mse}".format(mse=mse))
    accuracy = np.mean(np.ravel(y_model) == np.ravel(y))
    print("   accuracy={acc:.3f}".format(acc=accuracy))
def experiment_for_submitting():
    """Compare preprocessing options by the test-set class distribution.

    For every entry of ``preprocessing_options`` the train and test sets are
    preprocessed together with that entry, a ridge regression is fitted on
    the training part with a lambda chosen by ``lambda_cv``, and the
    predicted class counts on the test set are printed and recorded. The
    collected table is written to ``"Submitting experiment.csv"``.

    Each option set is passed to ``preprocess_data`` itself, so the printed
    and recorded counts belong to the option they are labelled with.
    """
    y_train, tX_train, _ = load_csv_data(DATA_TRAIN_PATH)
    _, tX_test, _ = load_csv_data(DATA_TEST_PATH)
    np.random.seed(2019)
    results = pd.DataFrame(
        columns=["Preprocessing", "Class -1 count", "Class +1 count"])

    for preprocessing_param in preprocessing_options:
        # Re-stack on every iteration: whether preprocess_data modifies its
        # input in place is not visible here, so never reuse the array.
        tX_stacked = np.vstack((tX_train, tX_test))
        tX_stacked_prep, _, desc_prep = preprocess_data(
            tX_stacked, None, preprocessing_param)
        tX_train_prep, tX_test_prep = np.split(tX_stacked_prep,
                                               [len(tX_train)])

        lambda_ = lambda_cv(tX_train_prep, y_train)
        print(f"Best lambda: {lambda_}")
        w, _ = ridge_regression(y_train, tX_train_prep, lambda_)
        y_pred = predict_labels(w, tX_test_prep)

        # Count by label value: indexing np.unique's counts by position
        # fails (or mislabels) when only one class is predicted.
        # Assumes predict_labels emits -1 / +1, as the messages state.
        n_neg = np.count_nonzero(y_pred == -1)
        n_pos = np.count_nonzero(y_pred == 1)
        print(preprocessing_param,
              f"Class -1: {n_neg}, Class +1: {n_pos}")
        results.loc[len(results)] = (desc_prep, n_neg, n_pos)

    results.to_csv("Submitting experiment.csv", sep=";")
def accuracy(y, tx, w):
    """
    Measures the share of samples whose predicted label equals the true one.

    Parameters
    ----------
    y: ndarray
        The true labels, checked to lie in {0, 1}
    tx: ndarray
        The feature matrix
    w: ndarray
        The learned weights

    Returns
    -------
    float
        Number of matching labels divided by ``y.shape[0]``
    """
    predicted = map_0_1(predict_labels(w, tx))
    assert predicted.shape == y.shape

    # Both label vectors have to stay within the {0, 1} encoding.
    for labels in (y, predicted):
        assert labels.min() in (0, 1)
        assert labels.max() in (0, 1)

    hits = (y == predicted).astype(int)
    return hits.sum() / y.shape[0]
def lrr_demo(y, x, k):
    """Grid-search regularized logistic regression with k-fold validation.

    Every (gamma, lambda) pair is scored by the mean test loss over the k
    folds, and the fold weights are averaged. The pair with the smallest
    non-NaN mean loss is reported together with the training accuracy of its
    averaged weights. Results are printed; nothing is returned.

    Parameters
    ----------
    y:
        Labels; compared element-wise with the predicted labels.
    x:
        Raw features, expanded with ``helpers.build_poly(x, 4)``.
    k:
        Number of folds.

    Raises
    ------
    ValueError
        From ``np.nanargmin`` when every candidate produced a NaN loss.
    """
    tx = helpers.build_poly(x, 4)
    seed = 1
    max_iters = 50
    # num=1: each grid currently holds the single value 1e-4
    lambdas = np.logspace(-4, -3, 1)
    gammas = np.logspace(-4, -3, 1)
    hyperparams = [(gamma, lambda_) for gamma in gammas for lambda_ in lambdas]
    w_initial = np.zeros(tx.shape[1])
    # split data in k fold
    k_indices = helpers.build_k_indices(y, k, seed)

    result_loss = []
    result_opt_w = []
    for gamma, lambda_ in hyperparams:
        loss_errors = []
        weights = []
        for i in range(k):
            loss_te, fold_w = cross_validation_lrr(
                y, tx, k_indices, i, lambda_, gamma, max_iters, w_initial)
            loss_errors.append(loss_te)
            # wrapped so that the fold average keeps a leading axis of 1
            weights.append([fold_w])
        result_loss.append(np.mean(loss_errors))
        result_opt_w.append(np.mean(weights, axis=0))

    # Ignore diverged (NaN) candidates, like the other *_demo functions do,
    # and use one index for loss, hyper-parameters and weights.
    best = np.nanargmin(result_loss)
    mse = result_loss[best]
    hyper_opt = hyperparams[best]
    print("   gamma={g:.3f}, mse={mse:.3f} lambda={l:.3f}".format(
        mse=mse, g=hyper_opt[0], l=hyper_opt[1]))
    opt_w = result_opt_w[best]

    # Training Accuracy
    y_predicted = helpers.predict_labels(opt_w.T, tx)
    accuracy = np.mean(np.ravel(y_predicted) == np.ravel(y))
    print("   accuracy={acc:.3f}".format(acc=accuracy))
def lr_demo(y, x, k):
    """Grid-search the step size of logistic regression with k-fold validation.

    Every gamma is scored by the mean test loss over the k folds, and the
    fold weights are averaged. The gamma with the smallest non-NaN mean loss
    is reported with that same loss and with the training accuracy of its
    averaged weights. Results are printed; nothing is returned.

    Parameters
    ----------
    y:
        Labels; compared element-wise with the predicted labels.
    x:
        Raw features, expanded with ``helpers.build_poly(x, 1)``.
    k:
        Number of folds.

    Raises
    ------
    ValueError
        From ``np.nanargmin`` when every candidate produced a NaN loss.
    """
    max_iters = 100
    # num=1: the grid currently holds the single value 1e-4
    gammas = np.logspace(-4, -3, 1)
    seed = 1
    tx = helpers.build_poly(x, 1)
    # Initialization
    w_initial = np.zeros(tx.shape[1])
    # split data in k fold
    k_indices = helpers.build_k_indices(y, k, seed)

    gen_opt_w = []
    gen_loss = []
    # gamma selection
    for gamma in gammas:
        weights = []
        loss_errors = []
        for i in range(k):
            loss_te, fold_w = cross_validation_lr(
                y, tx, k_indices, i, gamma, max_iters, w_initial)
            loss_errors.append(loss_te)
            # wrapped so that the fold average keeps a leading axis of 1
            weights.append([fold_w])
        gen_loss.append(np.mean(loss_errors))
        gen_opt_w.append(np.mean(weights, axis=0))

    best = np.nanargmin(gen_loss)
    opt_gamma = gammas[best]
    opt_w = gen_opt_w[best]
    # Report the loss of the selected gamma: np.min over the whole list
    # would print nan as soon as any other candidate diverged.
    print("   gamma={l:.3f},loss={loss:.3f}".format(
        loss=gen_loss[best], l=opt_gamma))

    # Training Accuracy
    y_predicted = helpers.predict_labels(opt_w.T, tx)
    accuracy = np.mean(np.ravel(y_predicted) == np.ravel(y))
    print("   accuracy={acc:.3f}".format(acc=accuracy))
def LS_SGD_demo(y, x, k):
    """Grid-search least-squares SGD with k-fold cross validation.

    Every (batch_size, gamma) pair is scored by the mean test MSE over the
    k folds, and the fold weights are averaged. The pair with the smallest
    non-NaN mean MSE is reported with the training accuracy of its averaged
    weights. Results are printed; nothing is returned.

    Parameters
    ----------
    y:
        Labels; compared element-wise with the predicted labels.
    x:
        Raw features, expanded with ``helpers.build_poly(x, 1)``.
    k:
        Number of folds.

    Raises
    ------
    ValueError
        From ``np.nanargmin`` when every candidate produced a NaN MSE.
    """
    tx = helpers.build_poly(x, 1)
    seed = 1
    max_iters = 50
    gammas = np.logspace(-3, 0, 10)
    batch_sizes = np.array([1])
    # Initialization
    w_initial = np.zeros(tx.shape[1])
    # split data in k fold
    k_indices = helpers.build_k_indices(y, k, seed)

    temp_mse = []
    temp_opt_w = []
    hyperparams = [(batch_size, gamma)
                   for batch_size in batch_sizes for gamma in gammas]
    for batch_size, gamma in hyperparams:
        mse_errors = []
        weights = []
        for i in range(k):
            mse_te, fold_w = cross_validation_ls_SGD(
                y, tx, k_indices, i, gamma, max_iters, w_initial, batch_size)
            mse_errors.append(mse_te)
            # wrapped so that the fold average keeps a leading axis of 1
            weights.append([fold_w])
        temp_mse.append(np.mean(mse_errors))
        temp_opt_w.append(np.mean(weights, axis=0))

    # One NaN-aware index drives the reported mse, the reported
    # hyper-parameters and the evaluated weights, so they always describe
    # the same candidate even when some step sizes diverge.
    best = np.nanargmin(temp_mse)
    mse = temp_mse[best]
    hyper_opt = hyperparams[best]
    print("   gamma={g:.3f}, batch={b:.2f}, mse={mse:.3f}".format(
        mse=mse, g=hyper_opt[1], b=hyper_opt[0]))
    opt_w = temp_opt_w[best]

    # Training Accuracy
    y_predicted = helpers.predict_labels(opt_w.T, tx)
    accuracy = np.mean(np.ravel(y) == np.ravel(y_predicted))
    print("   accuracy={acc:.3f}".format(acc=accuracy))
def LS_GD_demo(y, x, k):
    """Grid-search the step size of least-squares GD with k-fold validation.

    Each of ten log-spaced step sizes between 1e-3 and 1 is scored by the
    mean test MSE over the k folds, and the fold weights are averaged. The
    step size with the smallest non-NaN mean MSE is printed together with
    the training accuracy of its averaged weights. Nothing is returned.
    """
    n_iterations = 50
    step_sizes = np.logspace(-3, 0, 10)
    tx = helpers.build_poly(x, 1)
    # Initialization
    start_w = np.zeros(tx.shape[1])
    # split data in k fold
    fold_indices = helpers.build_k_indices(y, k, 1)

    mean_errors = []
    mean_weights = []
    # gamma selection
    for step in step_sizes:
        fold_results = [
            cross_validation_ls_GD(
                y, tx, fold_indices, fold, step, n_iterations, start_w)
            for fold in range(k)
        ]
        fold_errors = [err for err, _ in fold_results]
        # each weight vector is wrapped so the average keeps a leading axis
        fold_weights = [[fold_w] for _, fold_w in fold_results]
        mean_errors.append(np.mean(fold_errors))
        mean_weights.append(np.mean(fold_weights, axis=0))

    best = np.nanargmin(mean_errors)
    opt_gamma = step_sizes[best]
    opt_w = mean_weights[best]
    mse_LS_GD = np.nanmin(mean_errors)
    print("   gamma={l:.3f}, mse={mse:.3f}".format(mse=mse_LS_GD, l=opt_gamma))

    # Training Accuracy
    y_predicted = helpers.predict_labels(opt_w.T, tx)
    matches = list(y == y_predicted.flatten())
    accuracy = matches.count(True) / len(y)
    print("   accuracy={acc:.3f}".format(acc=accuracy))
def ridge_regression_demo(y, x, degree, k_fold):
    """Select the ridge penalty by k-fold cross validation and report it.

    Twenty log-spaced lambdas between 10**-1.1 and 10**-0.8 are scored by
    the train / test RMSE accumulated over the folds, and the curves are
    handed to ``helpers.visualization``. The lambda with the smallest test
    RMSE is then used to fit a ridge regression on the full polynomial
    expansion; its lambda, MSE and training accuracy are printed. Nothing
    is returned.
    """
    candidates = np.logspace(-1.1, -0.8, 20)
    # split data in k fold
    fold_indices = helpers.build_k_indices(y, k_fold, 1)

    # one RMSE per lambda, for training data and for test data
    rmse_tr, rmse_te = [], []
    for penalty in candidates:
        fold_errors = [
            cross_validation_rr(y, x, fold_indices, fold, penalty, degree)
            for fold in range(k_fold)
        ]
        total_tr = sum(err[0] for err in fold_errors)
        total_te = sum(err[1] for err in fold_errors)
        rmse_tr.append(np.sqrt(2 * total_tr / k_fold))
        rmse_te.append(np.sqrt(2 * total_te / k_fold))
    helpers.visualization(candidates, rmse_tr, rmse_te)

    # find the best lambda: first position holding the smallest test RMSE
    best = min(range(len(rmse_te)), key=lambda pos: rmse_te[pos])
    lambda_opt = candidates[best]

    x_poly = helpers.build_poly(x, degree)
    w_opt, mse = imp.ridge_regression(y, x_poly, lambda_opt)
    print("   lambda={l:.3f}, mse={mse:.3f}".format(mse=mse, l=lambda_opt))

    # Training Accuracy
    y_predicted = helpers.predict_labels(w_opt.T, x_poly)
    matches = list(y == y_predicted.flatten())
    accuracy = matches.count(True) / len(y)
    print("   accuracy={acc:.3f}".format(acc=accuracy))
# Build the model initial_w = np.random.randn(D) optimal_gamma, optimal_lambda_, measure_tr, measure_te = \ gamma_lambda_selection_cv(y_train_subset, X_train_subset, k_fold, initial_w, max_iters, gammas[i], lambdas[i], seed = seed, batch_size = batch_size, metric = metric, model = model) print('CA_bs:', CA_baseline) print('Iter:', i, ' Best gamma:', optimal_gamma, ' Best lambda:', optimal_lambda_, '\n') # Update the expected training error exp_measure_tr += measure_tr * X_train_subset.shape[0] / X_train.shape[0] exp_measure_te += measure_te * X_test_subset.shape[0] / X_test.shape[0] # Build the model with the best hyperparameters w = get_model(model, y_train_subset, X_train_subset, initial_w, max_iters, optimal_gamma, optimal_lambda_, batch_size) # Get predictions y_pred_test = np.array(map_minus_1_1(predict_labels(w, X_test_subset))) # Insert the ids and predictions to the ids and y_pred arrays ids = np.concatenate((ids, ids_test_subset)) y_pred = np.concatenate((y_pred, y_pred_test)) # Sort the ids and y_pred arrays ids, y_pred = sort_arr(ids, y_pred) # Create the submission CSV file create_csv_submission(ids, y_pred, sumbission_fname) print("Expected training accuracy / loss:", exp_measure_tr) print("Expected test accuracy / loss:", exp_measure_te)
def cross_validation(y, tX, gamma, method='logistic_regression'):
    """Estimate the accuracy of a model with 4-fold cross validation.

    @param y: 1-D array of labels
    @param tX: 2-D feature matrix, one row per sample
    @param gamma: learning rate (used by the iterative methods only)
    @param method: one of 'logistic_regression', 'reg_logistic_regression',
        'least_squares_GD', 'least_squares_SGD', 'least_squares',
        'ridge_regression'
    @return: the average accuracy over the four folds, followed by the list
        of per-fold accuracies
    @raise ValueError: if ``method`` is not one of the names above
    """
    _, D = tX.shape

    # Model parameters
    max_iters = 100

    # Cross validation parameters
    seed = 1
    k_fold = 4
    k_indices = build_k_indices(y, k_fold, seed)

    acc = []
    for k in range(k_fold):
        test_idx = k_indices[k]
        # Gather the rows of all other folds at once, in fold order.
        train_idx = np.concatenate(
            [k_indices[i] for i in range(k_fold) if i != k])
        yTe, xTe = y[test_idx], tX[test_idx]
        # float64 keeps the dtype the training arrays have always had,
        # whatever the dtype of the inputs.
        yTr = y[train_idx].astype(np.float64)
        xTr = tX[train_idx].astype(np.float64)

        if method in ('logistic_regression', 'reg_logistic_regression'):
            # The logistic models are started from a column vector.
            initial_w = np.zeros((D, 1))
            if method == 'logistic_regression':
                w, _ = logistic_regression(yTr, xTr, initial_w, max_iters,
                                           gamma)
            else:
                lambda_ = 0.1
                w, _ = reg_logistic_regression(yTr, xTr, lambda_, initial_w,
                                               max_iters, gamma)
            y_est = sigmoid(np.dot(xTe, w))
            # Threshold the estimates at 0.5 to obtain 0 / 1 labels.
            y_label = [0 if p < 0.5 else 1 for p in y_est]
        elif method == 'least_squares_GD':
            w, _ = least_squares_GD(yTr, xTr, np.zeros(D), max_iters, gamma)
            y_label = predict_labels(w, xTe)
        elif method == 'least_squares_SGD':
            w, _ = least_squares_SGD(yTr, xTr, np.zeros(D), max_iters, gamma)
            y_label = predict_labels(w, xTe)
        elif method == 'least_squares':
            w, _ = least_squares(yTr, xTr)
            y_label = predict_labels(w, xTe)
        elif method == 'ridge_regression':
            w, _ = ridge_regression(yTr, xTr, 0.1)
            y_label = predict_labels(w, xTe)
        else:
            raise ValueError('Invalid method')

        n_correct = sum(
            1 for predicted, truth in zip(y_label, yTe) if predicted == truth)
        # Divide by the real size of the test fold: N / k_fold is only
        # equal to it when N is an exact multiple of k_fold.
        acc.append(n_correct / len(yTe))

    return (sum(acc) / k_fold), acc
k_indices = k_fold_indices(train_data_split.shape[0], 5, SEED) for i, deg in enumerate(POSSIBLE_DEGREES): train_data, _ = preprocessing_pipeline(train_data_split, degree=deg) train_set_folds = k_fold_cross_split_data(train_classes_split, train_data, k_indices) for j, lambda_ in enumerate(POSSIBLE_LAMBDA_VALUES): folds_train_accuracy = [] folds_validation_accuracy = [] # Train a Ridge Regression model on each fold for x_train, y_train, x_test, y_test in train_set_folds: w, train_loss = ridge_regression(y_train, x_train, lambda_) folds_train_accuracy.append( compute_accuracy(predict_labels(w, x_train), y_train)) folds_validation_accuracy.append( compute_accuracy(predict_labels(w, x_test), y_test)) train_accuracy_matrix[jet_num, 0, i, j] = \ (np.mean(folds_train_accuracy), np.std(folds_train_accuracy)) validation_accuracy_matrix[jet_num, 0, i, j] = \ (np.mean(folds_validation_accuracy), np.std(folds_validation_accuracy)) train_data_log_svm = preprocessing_pipeline(train_data_split, degree=deg, norm_first=False) train_set_folds = k_fold_cross_split_data(train_classes_split, train_data_log_svm, k_indices) for j, lambda_ in enumerate(POSSIBLE_LAMBDA_LOG):
y_train_subset = map_0_1(y_train_subset) # Standardize the data X_train_subset, X_test_subset = standardize(X_train_subset, X_test_subset) # Build the polynomial features and expand the data print(f"Train shape before feature expansion: {str(X_train_subset.shape):>12} Test shape: {str(X_test_subset.shape):>12}") X_train_subset, X_test_subset = build_poly(X_train_subset, max_degree[i]), build_poly(X_test_subset, max_degree[i]) print(f"Train shape after feature expansion: {str(X_train_subset.shape):>12} Test shape: {str(X_test_subset.shape):>12}") # Set the maximum number of iterations for building the model max_iters = 440 # Set batch size to 1 to enforce SGD batch_size = 1 # Set the initial coefficients randomly initial_w = np.random.rand(X_train_subset.shape[1]) # Get the coefficients of the optimal regularized logistic regression model w = get_model("LOG_REG_GD", y_train_subset, X_train_subset, initial_w, max_iters, gammas_opt[i], lambdas_opt[i], batch_size) # Get the predictions y_pred_test = np.array(predict_labels(w, X_test_subset)) # Insert the ids and predictions to the ids and y_pred arrays ids = np.concatenate((ids, ids_test_subset)) y_pred = np.concatenate((y_pred, y_pred_test)) # Sort the ids and y_pred arrays ids, y_pred = sort_arr(ids, y_pred) # Create the submission CSV file create_csv_submission(ids, y_pred, sumbission_fname)
PRI_JET_NUM_INDEX) # We achieved our best results using Regularized Logistic Regression, # so we only load only those previously computed optimal params to generate the submission logistic_best_params = np.load("results/logistic_best_params.npy", allow_pickle=True) logistic_best_models = [] for (lambda_, deg, gamma), train_classes_split, train_data_split in \ zip(logistic_best_params, train_classes_jet_num_splits, train_data_jet_num_splits): data_split, columns_to_remove, mean, std = preprocessing_pipeline(train_data_split, degree=np.int(deg), cross_term=True, norm_first=False) initial_w = np.zeros((data_split.shape[1],)) w, loss = reg_logistic_regression(train_classes_split, data_split, lambda_, initial_w, 500, gamma, 1) print(f'Loss: {loss:.3f} Accuracy : {compute_accuracy(predict_labels(w, data_split), train_classes_split)}') logistic_best_models.append((w, loss, columns_to_remove, mean, std)) # Calculate the predictions for each of the 4 subsets using the weights and then combine them results = None for (w, _, col_to_rm, mean, std), (_, deg, _), test_classes_split, test_data_split, test_ids_split in \ zip(logistic_best_models, logistic_best_params, test_classes_jet_num_splits, test_data_jet_num_splits, test_ids_jet_num_splits): test_data_split, _, _, _ = preprocessing_pipeline(test_data_split, degree=np.int(deg), columns_to_remove=col_to_rm, cross_term=True, norm_first=False, mean=mean, std=std) pred = predict_labels(w, test_data_split) out = np.stack((test_ids_split, pred), axis=-1) results = out if results is None else np.vstack((results, out)) # Create the submission create_csv_submission(results[:, 0], results[:, 1], 'results/logistic_submission.csv')