def cross_validation_ridge(y, x, k_indices, k, lambda_, degree):
    """Average train/test RMSE of polynomial ridge regression over k folds.

    Each of the first ``k`` rows of ``k_indices`` is used once as the test
    fold; every other sample index of ``y`` is used for training.

    Args:
        y: target values, indexable with integer index arrays.
        x: input samples, indexed the same way as ``y``.
        k_indices: per-fold arrays of sample indices.
        k: number of folds to iterate over.
        lambda_: ridge penalty, also forwarded to ``compute_rmse_ridge``.
        degree: polynomial degree forwarded to ``build_poly``.

    Returns:
        Tuple ``(mean train RMSE, mean test RMSE)`` over the folds.
    """
    fold_rmse_train, fold_rmse_test = [], []

    for fold in range(k):
        # Held-out fold vs. everything else.
        held_out = k_indices[fold]
        kept = np.setdiff1d(np.arange(len(y)), held_out)

        x_test, x_train = x[held_out], x[kept]
        y_test, y_train = y[held_out], y[kept]

        # Polynomial expansion of both parts.
        poly_test = build_poly(x_test, degree)
        poly_train = build_poly(x_train, degree)

        # Fit on the training part only.
        weights, _ = ridge_regression(y_train, poly_train, lambda_)

        # Score both parts with the same weights.
        train_score = compute_rmse_ridge(y_train, poly_train, weights, lambda_)
        test_score = compute_rmse_ridge(y_test, poly_test, weights, lambda_)
        fold_rmse_train.append(train_score)
        fold_rmse_test.append(test_score)

    return np.mean(fold_rmse_train), np.mean(fold_rmse_test)
def ridge_regression_demo(y, tx, lamb, degree):
    """Fit ridge regression on polynomial features and print the loss.

    Args:
        y: target values.
        tx: raw features, expanded with ``im.build_poly`` before fitting.
        lamb: ridge penalty.
        degree: polynomial degree of the expansion.

    Returns:
        Tuple ``(weight, loss)`` exactly as returned by
        ``im.ridge_regression``. The loss is printed under the label
        "Training RMSE"; whether it really is an RMSE depends on that helper.
    """
    expanded = im.build_poly(tx, degree)
    fit = im.ridge_regression(y, expanded, lamb)
    weight, loss = fit
    print("Training RMSE={tr:.3f}".format(tr=loss))
    return weight, loss
def lambda_cv(tX, y, plot=False):
    """Pick the ridge penalty with the best hold-out accuracy.

    The data is split 80/20 with a fixed seed. Ridge regression is fitted on
    the training part for 15 log-spaced penalties between 1e-5 and 1e5, and
    the penalty with the highest accuracy on the held-out part is returned
    (the first one in the grid on ties).

    Args:
        tX: feature matrix.
        y: labels.
        plot: when True, plot train/test accuracy against lambda and mark the
            selected value.

    Returns:
        The selected entry of the lambda grid.
    """
    lambdas = np.logspace(-5, 5, 15)
    tX_tr, y_tr, tX_te, y_te = split_data(tX, y, ratio=0.8, seed=1)

    accs_tr = []
    accs_te = []
    for lambda_ in lambdas:
        w, _ = implementations.ridge_regression(y_tr, tX_tr, lambda_)
        y_pr_tr = predict_labels(w, tX_tr)
        y_pr_te = predict_labels(w, tX_te)
        accs_tr.append(compute_accuracy(y_tr, y_pr_tr))
        accs_te.append(compute_accuracy(y_te, y_pr_te))

    # np.argmax accepts the plain list and returns the first maximum. The
    # previous `accs_te == max(accs_te)` compared a list with a scalar, which
    # only broadcasts element-wise when the accuracies are NumPy scalars.
    best_idx = int(np.argmax(accs_te))
    best_acc = accs_te[best_idx]
    best_lambda = lambdas[best_idx]

    if plot:
        plt.plot(lambdas, accs_tr, label="Train")
        plt.plot(lambdas, accs_te, label="Test")
        plt.plot(best_lambda, best_acc, "*", label="Best value")
        plt.xlabel("Lambda")
        plt.ylabel("Accuracy")
        plt.legend()
        plt.show()

    return best_lambda
def cross_validation(y, x, k_fold, lambda_, degree):
    """
    Return the mean train and test loss of ridge regression for the given
    lambda_ and degree, estimated by k-fold cross validation.

    Arguments:
        - y: the column of ground truth results
        - x: the features matrix
        - k_fold: the number of folds used for cross validation
        - lambda_: penalizing parameter for ridge regression
        - degree: the degree of the polynomial data augmentation

    Returns:
        (mean training loss, mean test loss) over the k_fold folds.
    """
    # Fold assignment uses the fixed seed 1, so results are reproducible.
    k_indices = build_k_indices(y, k_fold, 1)

    losses_tr = []
    losses_te = []
    for k in range(k_fold):
        test_idx = k_indices[k]
        # Merge the remaining folds into ONE flat index vector. Deleting a
        # fold from `x[k_indices]` instead keeps a leading fold axis, i.e. a
        # (k_fold - 1, fold_size, ...) training array rather than samples
        # along the first axis.
        train_idx = np.delete(k_indices, k, axis=0).reshape(-1)

        x_test, y_test = x[test_idx], y[test_idx]
        x_train, y_train = x[train_idx], y[train_idx]

        phi_x_train = build_poly(x_train, degree)
        phi_x_test = build_poly(x_test, degree)

        # The result is unpacked as (loss, weights), as in the original code.
        loss_tr, weights = implementations.ridge_regression(
            y_train, phi_x_train, lambda_)
        loss_te = implementations.compute_mse(y_test, phi_x_test, weights)

        losses_tr.append(loss_tr)
        losses_te.append(loss_te)

    return np.array(losses_tr).mean(), np.array(losses_te).mean()
def __init__(self, model_name, w=None, learning_param=None, debug=True):
    """Bind the output, loss, prediction and training callables of a model.

    Args:
        model_name: one of 'logistic_regression', 'reg_logistic_regression',
            'least_squares_GD', 'ridge_regression' or 'least_squares'. Any
            other value leaves the four callables unset.
        w: initial weights, stored on the instance as-is.
        learning_param: mapping read for 'max_iters', 'gamma' and/or
            'lambda_' depending on the model; not read for 'least_squares'.
        debug: when truthy, a Debugger tracking 'loss' and 'w' is attached,
            otherwise ``self.dbg`` is None.
    """
    self.w = w
    self.dbg = debugger.Debugger(['loss', 'w']) if debug else None

    # Every `self.learn` takes (y, x, w, dbg); the closed-form solvers simply
    # ignore the arguments they do not need.
    if model_name == 'logistic_regression':
        self.model_output = misc.lr_output
        self.compute_loss = cost.compute_loss_ce
        self.predict_output = misc.map_prediction
        n_iters, step = learning_param['max_iters'], learning_param['gamma']
        self.learn = lambda y, x, w, dbg: impl.logistic_regression(
            y, x, w, n_iters, step, dbg)
    elif model_name == 'reg_logistic_regression':
        self.model_output = misc.lr_output
        self.compute_loss = cost.compute_loss_reg_ce
        self.predict_output = misc.map_prediction
        n_iters, step = learning_param['max_iters'], learning_param['gamma']
        penalty = learning_param['lambda_']
        self.learn = lambda y, x, w, dbg: impl.reg_logistic_regression(
            y, x, penalty, w, n_iters, step, dbg)
    elif model_name == 'least_squares_GD':
        self.model_output = np.dot
        self.compute_loss = cost.compute_loss_ls
        self.predict_output = misc.predict_ls
        n_iters, step = learning_param['max_iters'], learning_param['gamma']
        self.learn = lambda y, x, w, dbg: impl.least_squares_GD(
            y, x, w, n_iters, step, dbg)
    elif model_name == 'ridge_regression':
        self.model_output = np.dot
        self.compute_loss = cost.compute_loss_ls
        self.predict_output = misc.predict_ls
        penalty = learning_param['lambda_']
        self.learn = lambda y, x, w, dbg: impl.ridge_regression(y, x, penalty)
    elif model_name == 'least_squares':
        self.model_output = np.dot
        self.compute_loss = cost.compute_loss_ls
        self.predict_output = misc.predict_ls
        self.learn = lambda y, x, w, dbg: impl.least_squares(y, x)
def cross_validation_ridge(y, x, k_indices, k, lambda_, degree):
    """Run one fold of k-fold cross validation for polynomial ridge regression.

    :param y: outputs/labels, numpy array (-1 = background and 1 = signal)
    :param x: data samples
    :param k_indices: 2-D array holding one row of sample indices per fold
    :param k: index of the fold used as test set
    :param lambda_: regularization factor (penalty factor)
    :param degree: maximum degree of the polynomial basis
    :return: train loss, test loss, test accuracy, weights
    """
    # Row k is held out; all other rows are merged into the training set.
    held_out = k_indices[k]
    is_other_fold = np.arange(k_indices.shape[0]) != k
    kept = k_indices[is_other_fold].reshape(-1)

    y_test, y_train = y[held_out], y[kept]
    X_test, X_train = x[held_out], x[kept]

    # Polynomial feature expansion.
    tx_train = build_poly(X_train, degree)
    tx_test = build_poly(X_test, degree)

    # Fit on the training part.
    w, fit_loss = imp.ridge_regression(y_train, tx_train, lambda_)

    # Both losses are passed through imp.calculate_rmse.
    loss_train = imp.calculate_rmse(fit_loss)
    raw_test_loss = imp.compute_loss(y_test, tx_test, w)
    loss_test = imp.calculate_rmse(raw_test_loss)

    predicted = predict_labels(w, tx_test)
    accuracy = calculate_accuracy(y_test, predicted)

    return loss_train, loss_test, accuracy, w
def experiment_for_submitting():
    """Train ridge models on the full training set and log class counts.

    For every entry of ``preprocessing_options`` the train and test features
    are preprocessed together, lambda is chosen with ``lambda_cv``, a ridge
    model is fitted, and the number of -1 / +1 predictions on the test set is
    printed and collected. The collected rows are written to
    "Submitting experiment.csv".
    """
    y_train, tX_train, ids = load_csv_data(DATA_TRAIN_PATH)
    _, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
    np.random.seed(2019)

    results = pd.DataFrame(
        columns=["Preprocessing", "Class -1 count", "Class +1 count"])

    for preprocessing_param in preprocessing_options:
        # Stacked inside the loop on purpose: preprocess_data may modify its
        # input, so each iteration starts from a fresh copy.
        tX_stacked = np.vstack((tX_train, tX_test))
        # NOTE(review): the loop variable is only printed below; the
        # preprocessing always uses these fixed settings. Confirm whether
        # `preprocessing_param` was meant to be passed instead.
        prep_param = {
            "bias": True,
            "fill": True,
            "standardize": False,
            "degree": 11,
            "log": True,
            "root": True,
        }
        tX_stacked_prep, _, desc_prep = preprocess_data(
            tX_stacked, None, prep_param)
        tX_train_prep, tX_test_prep = np.split(tX_stacked_prep,
                                               [len(tX_train)])

        lambda_ = lambda_cv(tX_train_prep, y_train)
        print(f"Best lambda: {lambda_}")

        w, _ = ridge_regression(y_train, tX_train_prep, lambda_)
        y_pred = predict_labels(w, tX_test_prep)

        # Count each label explicitly. Indexing the counts returned by
        # np.unique raised IndexError (or reported the wrong class) whenever
        # only one class was predicted.
        n_neg = int(np.count_nonzero(y_pred == -1))
        n_pos = int(np.count_nonzero(y_pred == 1))
        print(preprocessing_param, f"Class -1: {n_neg}, Class +1: {n_pos}")
        results.loc[len(results)] = (desc_prep, n_neg, n_pos)

    results.to_csv("Submitting experiment.csv", sep=";")
def cross_validation(y, x, degree, k, k_indices, method, error, feature_augmentation, hyperparams):
    """Train and evaluate one cross-validation fold with the chosen method.

    Fold ``k`` of ``k_indices`` is the test set and the remaining folds form
    the training set. Both parts go through ``feature_processing`` (the test
    part reuses the reference returned for the training part), optional
    ``feat_augmentation``, ``build_poly`` and ``standardize`` (the test part
    reuses the training mean/std).

    ``method`` selects the learner: 'rr', 'ls', 'lsGD', 'lsSGD', 'log' or
    'rlog'; anything else raises NotImplementedError. ``hyperparams`` is
    indexed positionally by each learner, and its last entry is passed as
    ``suppr_outliers`` for the training part.

    Returns:
        (train loss, test loss, weights, test accuracy)
    """
    from helpers_data import feature_processing, feat_augmentation, standardize, build_poly
    from implementations import (ridge_regression, least_squares,
                                 least_squares_GD, least_squares_SGD,
                                 logistic_regression, reg_logistic_regression)

    # Fold k is held out, every other fold is used for training.
    held_out = k_indices[k]
    kept = k_indices[np.arange(k_indices.shape[0]) != k].reshape(-1)
    y_te, y_tr = y[held_out], y[kept]
    x_te, x_tr = x[held_out], x[kept]

    x_tr, y_tr, median = feature_processing(
        x_tr, y_tr, 'mean', replace_feature=True,
        suppr_outliers=hyperparams[-1], threshold=3, ref_median=[])
    x_te, y_te, _ = feature_processing(
        x_te, y_te, 'mean', replace_feature=True,
        suppr_outliers=False, threshold=3, ref_median=median)

    tx_tr_aug, tx_te_aug = [], []
    if feature_augmentation:
        tx_tr_aug, index = feat_augmentation(x_tr, 0.003)
        tx_te_aug, _ = feat_augmentation(x_te, 0.003, False, index)

    # Polynomial expansion, then standardisation with the training statistics.
    tx_tr = build_poly(x_tr, degree, feature_augmentation, tx_tr_aug)
    tx_te = build_poly(x_te, degree, feature_augmentation, tx_te_aug)
    tx_tr, mean, std = standardize(tx_tr)
    tx_te, _, _ = standardize(tx_te, mean, std)

    # Fit the requested learner.
    if method == 'rr':
        fit = ridge_regression(y_tr, tx_tr, hyperparams[0])
    elif method == 'ls':
        fit = least_squares(y_tr, tx_tr)
    elif method == 'lsGD':
        fit = least_squares_GD(y_tr, tx_tr, hyperparams[0], hyperparams[1],
                               hyperparams[2])
    elif method == 'lsSGD':
        fit = least_squares_SGD(y_tr, tx_tr, hyperparams[0], hyperparams[1],
                                hyperparams[2], hyperparams[3])
    elif method == 'log':
        fit = logistic_regression(y_tr, tx_tr, hyperparams[0], hyperparams[1],
                                  hyperparams[2])
    elif method == 'rlog':
        fit = reg_logistic_regression(y_tr, tx_tr, hyperparams[3],
                                      np.zeros(tx_tr.shape[1]),
                                      hyperparams[1], hyperparams[2])
    else:
        raise NotImplementedError
    w, _ = fit

    # The logistic learners use their own likelihood-based losses.
    if method == 'log':
        loss_tr = cal_loglike(y_tr, tx_tr, w)
        loss_te = cal_loglike(y_te, tx_te, w)
    elif method == 'rlog':
        loss_tr = cal_loglike_r(y_tr, tx_tr, w, hyperparams[3])
        loss_te = cal_loglike_r(y_te, tx_te, w, hyperparams[3])
    else:
        loss_tr = compute_loss(y_tr, tx_tr, w, error)
        loss_te = compute_loss(y_te, tx_te, w, error)

    y_pred = predict_labels(np.array(w).T, tx_te)
    acc = accuracy(y_te, y_pred)

    return loss_tr, loss_te, w, acc
def main():
    """Train one ridge model per data group and write the test predictions.

    ``prepareData`` is indexed 0..3 here, i.e. four groups of labels,
    features and ids. One ridge model is fitted per group, each model
    predicts its own test group, and the concatenated predictions are written
    to OUTPUT_PATH.
    """
    # Model parameters
    degree = 13
    whis = 2.5
    lambda_ = 0.0001
    n_groups = 4

    print("Loading the training Datas...")
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

    print("Clean and prepare the training datas...")
    y_train, tX_train, ids_train = prepareData(y, tX, ids, degree, whis)

    print("Train the models...")
    weights = []
    for group in range(n_groups):
        group_weights, _ = ridge_regression(y_train[group], tX_train[group],
                                            lambda_)
        weights.append(group_weights)

    print("Loading the testing Datas...")
    y_test, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

    # The test set goes through the same preparation as the training set.
    print("Clean and prepare the testing datas...")
    y_test, tX_test, ids_test = prepareData(y_test, tX_test, ids_test, degree,
                                            whis)

    print("Predict the testing datas...")
    group_predictions = [
        predict_labels(weights[group], tX_test[group])
        for group in range(n_groups)
    ]

    # Predictions and ids are concatenated in the same group order.
    y_pred = np.concatenate(group_predictions)
    ids_test = np.concatenate([ids_test[group] for group in range(n_groups)])

    print("Writing the results...")
    create_csv_submission(ids_test, y_pred, OUTPUT_PATH)
    print("DONE!, your predictions are available in ", OUTPUT_PATH)
def cross_validation(y, tx, mlfunction, split_number=5, lambda_=1e-6, gamma=0.001):
    '''Cross-validate the learner named by ``mlfunction`` on the training set.

    Args:
        y: labels.
        tx: feature matrix.
        mlfunction: one of 'ridge_regression', 'least_squares',
            'logistic_regression', 'reg_logistic_regression',
            'least_squares_sgd' or 'least_squares_gd'.
        split_number: number of folds (5 by default).
        lambda_: penalty passed to the regularised learners.
        gamma: step size passed to the gradient-based least-squares learners.

    Returns:
        (mean train MSE, mean test MSE, mean train accuracy,
        mean test accuracy) over all folds, or None (after printing an error)
        when ``mlfunction`` is not recognised.
    '''
    train_loss_ = []
    test_loss_ = []
    train_accuracy_ = []
    test_accuracy_ = []

    k_indices = build_k_indices(len(y), split_number)

    for fold in k_indices:
        # The current fold is the test set, all other samples are training.
        test_idx = np.asarray(fold)
        train_idx = np.delete(np.arange(len(y)), test_idx)

        train_tX, train_y = tx[train_idx], y[train_idx]
        test_tX, test_y = tx[test_idx], y[test_idx]

        if mlfunction == 'ridge_regression':
            w, loss = impl.ridge_regression(train_y, train_tX, lambda_)
        elif mlfunction == 'least_squares':
            w, loss = impl.least_squares(train_y, train_tX)
        elif mlfunction == 'logistic_regression':
            w, loss = impl.logistic_regression(train_y, train_tX)
        elif mlfunction == 'reg_logistic_regression':
            w, loss = impl.reg_logistic_regression(train_y, train_tX, lambda_)
        elif mlfunction == 'least_squares_sgd':
            w, loss = impl.least_squares_SGD(train_y, train_tX, gamma)
        elif mlfunction == 'least_squares_gd':
            w, loss = impl.least_squares_GD(train_y, train_tX, gamma)
        else:
            print('ERROR: ml_function not recognized')
            print(
                'least_squares, least_squares_gd, least_squares_sgd, logistic_regression, reg_logistic_regression'
            )
            return None

        train_loss_.append(impl.compute_loss_mse(train_y, train_tX, w))
        test_loss_.append(impl.compute_loss_mse(test_y, test_tX, w))
        # Append per-fold accuracies. They were previously assigned, which
        # replaced the lists, so the returned "mean" was the last fold only.
        train_accuracy_.append(impl.compute_accuracy(train_y, train_tX, w))
        test_accuracy_.append(impl.compute_accuracy(test_y, test_tX, w))

    return (np.mean(train_loss_), np.mean(test_loss_),
            np.mean(train_accuracy_), np.mean(test_accuracy_))
def cross_validation_ridge(y_train, x_train, num_folds, lambda_, seed=1):
    """Return the validation accuracy of ridge regression for every fold.

    NumPy's global random generator is seeded with ``seed`` before the folds
    are produced by ``k_fold_splits``.

    Returns:
        1-D array with one accuracy (fraction of validation labels predicted
        exactly) per fold.
    """
    np.random.seed(seed)

    def fold_accuracy(x_fit, x_val, y_fit, y_val):
        # Fit on the training part, score on the validation part.
        weights, _ = ridge_regression(y_fit, x_fit, lambda_)
        predicted = predict_labels(weights, x_val)
        return np.mean(predicted == y_val)

    return np.array([
        fold_accuracy(x_fit, x_val, y_fit, y_val)
        for x_fit, x_val, y_fit, y_val in k_fold_splits(
            y_train, x_train, num_folds)
    ])
def get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size):
    """
    Train the model identified by `model` and return its learned weights.

    Only the arguments relevant to the selected model are forwarded; the
    loss returned by the training function is discarded.

    Parameters
    ----------
    model: string
        One of "MSE_GD", "MSE_SGD", "MSE_OPT", "MSE_OPT_REG", "LOG_GD",
        "LOG_REG_GD", "LOG_REG_L1" or "MSE_GD_L1"
    y: ndarray
        The labels
    tx: ndarray
        The feature matrix
    initial_w: ndarray
        The initial weights (iterative models only)
    max_iters: integer
        The number of steps to run (iterative models only)
    gamma:
        The step size (iterative models only)
    lambda_:
        The regularization parameter (regularised models only)
    batch_size: integer
        The batch size (used by "MSE_SGD" only)

    Returns
    -------
    The learned weights

    Raises
    ------
    UnknownModel
        If `model` is none of the names listed above
    """
    if model == "MSE_GD":
        fit = least_squares_GD(y, tx, initial_w, max_iters, gamma)
    elif model == "MSE_SGD":
        fit = least_squares_SGD(y, tx, initial_w, batch_size, max_iters, gamma)
    elif model == "MSE_OPT":
        fit = least_squares(y, tx)
    elif model == "MSE_OPT_REG":
        fit = ridge_regression(y, tx, lambda_)
    elif model == "LOG_GD":
        fit = logistic_regression(y, tx, initial_w, max_iters, gamma)
    elif model == "LOG_REG_GD":
        fit = reg_logistic_regression(y, tx, lambda_, initial_w, max_iters,
                                      gamma)
    elif model == "LOG_REG_L1":
        fit = reg_logistic_regression_L1(y, tx, lambda_, initial_w, max_iters,
                                         gamma)
    elif model == "MSE_GD_L1":
        fit = least_squares_GD_L1(y, tx, lambda_, initial_w, max_iters, gamma)
    else:
        raise UnknownModel

    # Every trainer returns a (weights, loss) pair; only the weights are kept.
    weights, _ = fit
    return weights
def solve(tX, y):
    """Fit ridge regression (lambda = 1) on an 80/20 split and score it.

    The split uses the fixed seed 2019.

    Returns:
        Tuple ``(train accuracy, test accuracy)``.
    """
    lambda_ = 1
    tX_tr, y_tr, tX_te, y_te = split_data(tX, y, ratio=0.8, seed=2019)

    weights, _ = ridge_regression(y_tr, tX_tr, lambda_)

    predicted_tr = predict_labels(weights, tX_tr)
    predicted_te = predict_labels(weights, tX_te)

    return (compute_accuracy(y_tr, predicted_tr),
            compute_accuracy(y_te, predicted_te))
def run(self, data_y, data_x, data_ids, test_x, test_ids):
    """Preprocess the training data, try all hyper parameters, and predict.

    The training features are optionally cleaned of -999 values, expanded
    with ``modifiers.build_poly`` and optionally standardised. A ridge
    regression with lambda 0.1 supplies the initial weights. ``self._run`` is
    then called once per hyper-parameter set from
    ``self._obtain_hyper_params()``, and the weights with the smallest error
    are passed to ``self._make_predictions``.

    Returns:
        The result of ``self._make_predictions(w, test_x, test_ids)``, or
        None when the best error is NaN.
    """
    if self.do_drop_minus_999_features:
        print('Dropping features containing at least one -999 value...',
              end=' ', flush=True)
        data_x = modifiers.drop_minus_999_features(data_x)
        print('DONE')

    if self.do_eliminate_minus_999:
        print(
            'Eliminating -999 values by setting them to feature median...',
            end=' ', flush=True)
        data_x = modifiers.eliminate_minus_999(data_x)
        print('DONE')

    # Build polynomial
    data_x = modifiers.build_poly(data_x, self.degree, True)

    if self.do_std:
        print('Standardising...', end=' ', flush=True)
        data_x = modifiers.standardize(data_x)
        print('DONE')

    # Find a good initial w
    initial_w, _ = impl.ridge_regression(data_y, data_x, lambda_=0.1)

    w_err_hyper_tuples = []  # ((w, err, acc), hyper_params) accumulator
    for hyper_params in self._obtain_hyper_params():
        print('Running with hyper parameters:', end=' ')
        print_dict(hyper_params)
        print()
        result = self._run(data_y, data_x, data_ids, initial_w,
                           **hyper_params)
        w_err_hyper_tuples.append((result, hyper_params))

    # Pick the entry with the smallest error. NaN errors are ranked last:
    # every comparison with NaN is False, so a plain min() on the error
    # returned a NaN entry whenever it happened to come first.
    (w, err, acc), hyper_params = min(
        w_err_hyper_tuples,
        key=lambda entry: (bool(np.isnan(entry[0][1])), entry[0][1]))
    print('Found optimal w with error={err}, accuracy={acc}'.format(
        err=err, acc=acc), 'and hyper parameters:', end=' ')
    print_dict(hyper_params)
    print()

    # A NaN best error now means that every run produced NaN.
    if np.isnan(err):
        print('Error is infinite, computation has probably diverged.',
              'Abandoning predictions!')
        return

    # NOTE(review): test_x is passed on without the preprocessing applied to
    # data_x above; presumably _make_predictions handles it. Confirm.
    return self._make_predictions(w, test_x, test_ids)
def main():
    """Train one ridge model per data subset and write the submission file.

    Train and test features are preprocessed together, divided into subsets
    by ``divide_data``, and one ridge model (lambda chosen by ``lambda_cv``)
    is fitted per subset. The training accuracy and the distribution of the
    test predictions are printed, then the predictions are written to
    OUTPUT_PATH.
    """
    y_train, tX_train, ids = load_csv_data(DATA_TRAIN_PATH)
    _, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
    np.random.seed(2019)

    # Preprocess data together to have the same shifts while creating log or
    # root features
    tX_stacked = np.vstack((tX_train, tX_test))
    prep_param = {
        "bias": True,
        "fill": True,
        "standardize": False,
        "degree": 8,
        "log": True,
        "root": True,
    }
    tX_stacked_prep, *_ = preprocess_data(tX_stacked, None, prep_param)
    tX_train_prep, tX_test_prep = np.split(tX_stacked_prep, [len(tX_train)])

    # Split data according to PRI_jet_num value
    tX_tr_splitted, indices_tr = divide_data(tX_train_prep)
    tX_te_splitted, indices_te = divide_data(tX_test_prep)
    n_models = len(indices_tr)
    y_tr_splitted = [y_train[indices_tr[i]] for i in range(n_models)]

    # Train
    weights = []
    for i in range(n_models):
        lambda_ = lambda_cv(tX_tr_splitted[i], y_tr_splitted[i])
        print(f"Class {i}, lambda: {lambda_}")
        weights.append(
            ridge_regression(y_tr_splitted[i], tX_tr_splitted[i], lambda_)[0])

    # Predict: every subset writes into its own slots of the full vectors.
    y_pr_tr = np.zeros(tX_train.shape[0])
    y_pr_te = np.zeros(tX_test.shape[0])
    for i in range(n_models):
        y_pr_tr[indices_tr[i]] = predict_labels(weights[i], tX_tr_splitted[i])
        y_pr_te[indices_te[i]] = predict_labels(weights[i], tX_te_splitted[i])

    acc_tr = compute_accuracy(y_train, y_pr_tr)
    print(f"Total accuracy train: {acc_tr}")

    # Count each label explicitly. Indexing the counts returned by np.unique
    # fails when only one class is predicted, and is shifted when any slot
    # was left at its initial 0.
    n_neg = np.count_nonzero(y_pr_te == -1)
    n_pos = np.count_nonzero(y_pr_te == 1)
    print(
        f"Distribution on test data class -1: {n_neg}, class +1: {n_pos}"
    )

    create_csv_submission(ids_test, y_pr_te, OUTPUT_PATH)
def cross_validation_rr(y, x, k_indices, k, lambda_, degree):
    """Fit ridge regression on all folds but ``k`` and score fold ``k``.

    Returns:
        Tuple ``(train loss, test loss)``. The train loss is the value
        returned by ``imp.ridge_regression``; the test loss comes from
        ``imp.compute_mse``.
    """
    held_out = k_indices[k]

    # Held-out rows form the test set; the same rows are removed from the
    # data to obtain the training set.
    x_test, y_test = x[held_out], y[held_out]
    x_train = np.delete(x, [held_out], axis=0)
    y_train = np.delete(y, [held_out], axis=0)

    poly_train = helpers.build_poly(x_train, degree)
    poly_test = helpers.build_poly(x_test, degree)

    weights, loss_tr = imp.ridge_regression(y_train, poly_train, lambda_)
    return loss_tr, imp.compute_mse(y_test, poly_test, weights)
def test_ridge_regression(y_train, tx_train, y_test, tx_test):
    """
    Fits ridge_regression on the training split with a fixed penalty of 1e-08
    and reports the prediction accuracy on the test split.

    Args:
        y_train: training labels after the splitting
        tx_train: training features after the splitting
        y_test: test labels after the splitting
        tx_test: test features after the splitting
    """
    penalty = 1e-08
    print('\nTesting ridge_regression...')
    weights, _ = ridge_regression(y_train, tx_train, penalty)
    report_prediction_accuracy(y_test, tx_test, weights)
    print('... testing completed.')
def train_3models(tX, y):
    """Train one ridge model per data subset and print accuracies.

    The data is preprocessed, split 80/20 with seed 2019, and both parts are
    divided into subsets by ``divide_data``. For each subset lambda is chosen
    with ``lambda_cv`` and a ridge model is fitted. Overall and per-subset
    train/test accuracies are printed; nothing is returned.
    """
    # All samples are preprocessed together so that log/root features share
    # the same shifts.
    prep_param = {
        "bias": True,
        "fill": True,
        "standardize": False,
        "degree": 8,
        "log": True,
        "root": True,
    }
    tX_new, y_new, _ = preprocess_data(tX, y, prep_param)
    tX_tr, y_tr, tX_te, y_te = split_data(tX_new, y_new, ratio=0.8, seed=2019)

    # Split data according to PRI_jet_num value
    parts_tr, rows_tr = divide_data(tX_tr)
    parts_te, rows_te = divide_data(tX_te)
    n_models = len(parts_tr)

    labels_tr = []
    for i, rows in enumerate(rows_tr):
        labels_tr.append(y_tr[rows])
        print(parts_tr[i].shape)

    # Train one model per subset.
    weights = []
    for i in range(n_models):
        lambda_ = lambda_cv(parts_tr[i], labels_tr[i])
        print(f"Class {i}, lambda: {lambda_}")
        model_weights = ridge_regression(labels_tr[i], parts_tr[i], lambda_)[0]
        weights.append(model_weights)
        print(len(weights[-1]))

    # Predict: every subset fills its own rows of the full vectors.
    y_pr_tr = np.zeros(y_tr.shape)
    y_pr_te = np.zeros(y_te.shape)
    for i in range(n_models):
        y_pr_tr[rows_tr[i]] = predict_labels(weights[i], parts_tr[i])
        y_pr_te[rows_te[i]] = predict_labels(weights[i], parts_te[i])

    # Overall accuracy, then accuracy per subset.
    acc_tr = compute_accuracy(y_tr, y_pr_tr)
    acc_te = compute_accuracy(y_te, y_pr_te)
    print(f"Total accuracy tr: {acc_tr}, te: {acc_te}")

    for i in range(n_models):
        sel_tr, sel_te = rows_tr[i], rows_te[i]
        acc_tr = compute_accuracy(y_tr[sel_tr], y_pr_tr[sel_tr])
        acc_te = compute_accuracy(y_te[sel_te], y_pr_te[sel_te])
        print(f"Class {i}, Accuracy tr: {acc_tr}, te: {acc_te}")
def cross_validation_ridge_regression(y, x, k_indices, k, lambdas, degrees):
    """
    Runs one fold of k-fold cross-validation with per-jet ridge regression.

    Fold `k` is the test set. Train and test sets are each divided with
    `get_jet_masks`; for every subset a ridge model is fitted on polynomial
    features using that subset's own entry of `lambdas` and `degrees`.

    Returns the overall (train accuracy, test accuracy).
    """
    # Fold k is held out, the remaining folds are flattened for training.
    test_rows = k_indices[k]
    train_rows = np.delete(k_indices, k, axis=0).ravel()

    x_train_full, x_test_full = x[train_rows, :], x[test_rows, :]
    y_train_full, y_test_full = y[train_rows], y[test_rows]

    # One mask per jet subset, computed separately for train and test.
    jets_train = get_jet_masks(x_train_full)
    jets_test = get_jet_masks(x_test_full)

    # Predictions are written back into full-length vectors subset by subset.
    pred_train = np.zeros(len(y_train_full))
    pred_test = np.zeros(len(y_test_full))

    for jet in range(len(jets_train)):
        mask_train = jets_train[jet]
        mask_test = jets_test[jet]

        x_tr = x_train_full[mask_train]
        x_te = x_test_full[mask_test]
        y_tr = y_train_full[mask_train]

        # Pre-processing, polynomial expansion and constant column.
        x_tr, x_te = process_data(x_tr, x_te, False)
        phi_tr = build_poly(x_tr, degrees[jet])
        phi_te = build_poly(x_te, degrees[jet])
        phi_tr = add_constant_column(phi_tr)
        phi_te = add_constant_column(phi_te)

        weights, _ = ridge_regression(y=y_tr, tx=phi_tr, lambda_=lambdas[jet])

        pred_train[mask_train] = predict_labels(weights, phi_tr)
        pred_test[mask_test] = predict_labels(weights, phi_te)

    return (compute_accuracy(pred_train, y_train_full),
            compute_accuracy(pred_test, y_test_full))
def cross_validation_ridge(y, x, k_indices, k, lambda_):
    """Performs one iteration of the k-fold cross validation using ridge regression.

    Fold ``k`` is the validation set. Both parts are passed through
    ``prepare_for_training`` with ``logistic=False`` before fitting.

    Returns:
        (weights, training loss, validation loss, validation accuracy). The
        validation loss is the MSE plus the penalty term
        ``2 * lambda_ * ||w||^2``.
    """
    val_indices = k_indices[k]
    is_other_fold = ~(np.arange(len(k_indices)) == k)
    train_indices = k_indices[is_other_fold].reshape(-1)

    x_val, y_val = prepare_for_training(
        x[val_indices], y[val_indices], logistic=False)
    x_train, y_train = prepare_for_training(
        x[train_indices], y[train_indices], logistic=False)

    w, loss_tr = ridge_regression(y_train, x_train, lambda_)

    val_mse = compute_mse_loss(y_val, x_val, w)
    penalty = 2 * lambda_ * np.linalg.norm(w)**2
    loss_val = val_mse + penalty

    acc = compute_accuracy(y_val, x_val, w)
    return w, loss_tr, loss_val, acc
def ridge_regression_demo(y, x, degree, k_fold):
    """Select lambda by cross validation, refit on all data and print results.

    Twenty log-spaced lambdas between 10**-1.1 and 10**-0.8 are evaluated
    with ``cross_validation_rr``. The lambda with the smallest test RMSE (the
    first one on ties) is used to refit on the whole data set. The chosen
    lambda, the loss of the final fit and the training accuracy are printed.
    """
    seed = 1
    lambdas = np.logspace(-1.1, -0.8, 20)
    k_indices = helpers.build_k_indices(y, k_fold, seed)

    rmse_tr, rmse_te = [], []
    for penalty in lambdas:
        # Sum the per-fold errors, then turn the average into an RMSE.
        total_tr = 0
        total_te = 0
        for k in range(k_fold):
            fold_err = cross_validation_rr(y, x, k_indices, k, penalty, degree)
            total_tr += fold_err[0]
            total_te += fold_err[1]
        rmse_tr.append(np.sqrt(2 * total_tr / k_fold))
        rmse_te.append(np.sqrt(2 * total_te / k_fold))

    helpers.visualization(lambdas, rmse_tr, rmse_te)

    # Index of the first strictly smallest test RMSE.
    best = min(range(len(rmse_te)), key=rmse_te.__getitem__)
    lambda_opt = lambdas[best]

    # Refit on the complete data set with the selected lambda.
    x_poly = helpers.build_poly(x, degree)
    w_opt, mse = imp.ridge_regression(y, x_poly, lambda_opt)
    print("   lambda={l:.3f}, mse={mse:.3f}".format(mse = mse, l = lambda_opt))

    # Training accuracy
    y_predicted = helpers.predict_labels(w_opt.T, x_poly)
    matches = list(y == y_predicted.flatten())
    accuracy = matches.count(True) / len(y)
    print("   accuracy={acc:.3f}".format(acc = accuracy))
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Run one cross-validation fold of polynomial ridge regression.

    Fold ``k`` of ``k_indices`` is the test set. The reported losses are
    misclassification rates, i.e. the fraction of predicted labels that
    differ from the true labels.

    Returns:
        (train error rate, test error rate, weights)
    """
    held_out = k_indices[k]
    kept = k_indices[np.arange(k_indices.shape[0]) != k].reshape(-1)

    y_te, y_tr = y[held_out], y[kept]
    x_te, x_tr = x[held_out], x[kept]

    tx_tr = build_poly(x_tr, degree)
    tx_te = build_poly(x_te, degree)

    w, _ = ridge_regression(y_tr, tx_tr, lambda_)

    predicted_tr = predict_labels(w, tx_tr)
    predicted_te = predict_labels(w, tx_te)

    error_rate_tr = sum(predicted_tr != y_tr) / len(y_tr)
    error_rate_te = sum(predicted_te != y_te) / len(y_te)
    return error_rate_tr, error_rate_te, w
def find_optimal_w(tX, y, implementation, log_initial_w, log_max_iters, log_gamma, decreasing_gamma, log_regulator, ridge_lambda):
    """
    Train the selected implementation and return its weights

    Parameters
    ----------
    tX: array
        The feature matrix
    y: array
        The output
    implementation: integer
        0 for least squares, 1 for ridge regression, 2 for regularized
        logistic regression
    log_initial_w: array
        initial weights for the regularized logistic regression
    log_max_iters: integer
        number of iterations for the regularized logistic regression
    log_gamma: float
        gamma parameter for the regularized logistic regression
    decreasing_gamma:
        forwarded unchanged to impl.reg_logistic_regression
    log_regulator: float
        lambda for the regularized logistic regression
    ridge_lambda: float
        lambda for ridge regression

    Return
    ------
    optimal_w: array
        The learned weights, or None for any other `implementation` value.
    """
    if implementation == 0:
        weights, _ = impl.least_squares(y, tX)
        return weights

    if implementation == 1:
        weights, _ = impl.ridge_regression(y, tX, ridge_lambda)
        return weights

    if implementation == 2:
        weights, _ = impl.reg_logistic_regression(
            y, tX, log_regulator, log_initial_w, log_max_iters, log_gamma,
            decreasing_gamma)
        return weights

    return None
def learn(predictions, ids_predicted, y_train_jets, tx_train_jets, tx_test_jets, ids_test_jets, lambda_best_jets, degree_best_jets):
    """Fit ridge regression per jet number and collect the test predictions.

    For each jet number 0..3 the training features are transformed with
    ``feature_engineering`` (third argument is ``jet_num > 1``), a ridge
    model is fitted with that jet's lambda, and the predictions for that
    jet's test set are appended to ``predictions`` together with the matching
    ids in ``ids_predicted``. Both lists are modified in place; nothing is
    returned.
    """
    print('\nLearning by ridge regression...')

    for jet_num in range(4):
        print('\nLearning from training set with jet number ', str(jet_num),
              ' using optimal hyperparameters...')

        # Train on this jet's subset.
        y_train = y_train_jets[jet_num]
        tx_train = tx_train_jets[jet_num]
        degree = degree_best_jets[jet_num]
        high_jet = jet_num > 1
        tx_train = feature_engineering(tx_train, degree, high_jet)
        w_best, _ = ridge_regression(y_train, tx_train,
                                     lambda_best_jets[jet_num])

        # Predict this jet's test subset with the same feature pipeline.
        tx_test = tx_test_jets[jet_num]
        ids_test = ids_test_jets[jet_num]
        tx_test = feature_engineering(tx_test, degree_best_jets[jet_num],
                                      jet_num > 1)
        predictions.append(predict_labels(w_best, tx_test))
        ids_predicted.append(ids_test)

        print('\nReporting prediction accuracy for the training set... \n')
        report_prediction_accuracy(y_train, tx_train, w_best)
        print('\n... this gives a rough idea about the training success.')
        print('\n... predicted labels for test set with jet number ',
              str(jet_num))

    print('\n... ,predicted labels for each test set.')
def cross_validation(y, augmented_tx, k_indices, k, lambda_, report_predictions=False):
    """
    Run cross validation with partition ``k`` held out as the test set.

    :param y: label data
    :param augmented_tx: augmented features
    :param k_indices: array of randomly partitioned index subsets
    :param k: index of the partition used as test set
    :param lambda_: regularization parameter
    :param report_predictions: accepted but not used by this function
    :return: value of ``compute_rmse`` applied to the training loss, and the
        result of ``report_prediction_accuracy`` on the test partition
    """
    held_out = k_indices[k]

    # Test partition, and everything else as training data.
    tx_holdout, y_holdout = augmented_tx[held_out], y[held_out]
    y_fit = np.delete(y, held_out)
    tx_fit = np.delete(augmented_tx, held_out, axis=0)

    weights, fit_loss = ridge_regression(y_fit, tx_fit, lambda_)
    test_score = report_prediction_accuracy(y_holdout, tx_holdout, weights,
                                            False)
    return compute_rmse(fit_loss), test_score
def cross_validation(y, augmented_tx, k_indices, k, lambda_, report_predictions = False):
    """
    Performs cross_validation with the kth partition used as test set.

    Args:
        y: labels
        augmented_tx: augmented features
        k_indices: an array of k sub-indices that are randomly partitioned
        k: index of the partition used as test set
        lambda_: regularization parameter for the ridge regression
        report_predictions: accepted but not used by this function

    Returns:
        rmse_training: ``compute_rmse`` applied to the training loss
        pred: result of ``report_prediction_accuracy`` for the test set
            (returned instead of a test RMSE)
    """
    test_rows = k_indices[k]

    y_test = y[test_rows]
    augmented_tx_test = augmented_tx[test_rows]

    # Removing the test rows leaves the training set.
    y_training = np.delete(y, test_rows)
    augmented_tx_training = np.delete(augmented_tx, test_rows, axis = 0)

    fit = ridge_regression(y_training, augmented_tx_training, lambda_)
    w, loss_training = fit

    pred = report_prediction_accuracy(y_test, augmented_tx_test, w, False)
    rmse_training = compute_rmse(loss_training)
    return rmse_training, pred
def regress(x, y, lamb=0):
    """Return only the weights of a ridge regression of ``y`` on ``x``.

    ``lamb`` is the ridge penalty (0 by default); the loss returned by
    ``ridge_regression`` is discarded.
    """
    weights, _discarded_loss = ridge_regression(y, x, lamb)
    return weights
avg_test_accuracy_LR = cross_validation_LR(X_train, y_train, k_fold=4, seed=1) # Cross validation over both gamma and lambda g, l, avg_test_accuracy_RLR = cross_validation_RLR(X_train, y_train, k_fold=4, seed=1) #%% Testing functions #np.random.seed(42) gamma = 0.2 lambda_ = 4E-5 w, loss = least_squares(y = y_train, tx = X_train) # w, loss = least_squares_SGD(y = y_train, tx = X_train, initial_w = np.random.random(size=num_features)*0.01, max_iters = 200000, gamma = gamma) # w, loss = ridge_regression(y = y_train, tx = X_train, lambda_ = lambda_) # w, loss = logistic_regression(y = y_train, tx = X_train, initial_w = np.random.random(size=num_features)*10, max_iters = 125000, gamma = gamma) # w, loss = reg_logistic_regression(y = y_train, tx = X_train, lambda_ = lambda_, initial_w = np.random.random(size=num_features)*0.01, max_iters = 200000, gamma = gamma) plt.plot(w) #%% Predictive step y_test = X_test @ w plt.hist(y_test, bins=200) y_pred = predict_labels(w, X_test) #%% Create submission
# Get train and test data train_index = jet_train_samples[i] test_index = jet_test_samples[i] x_tr, y_tr = x_train[train_index], y_train[train_index] x_te, y_te = x_test[test_index], y_test[test_index] # Clean train and test data x_tr, x_te = clean_data(x_tr, x_te) # Build polynomial data x_tr, y_tr = augment_data(x_tr, y_tr, degree) x_te, y_te = augment_data(x_te, y_te, degree) # Train model weights, loss = ridge_regression(y_tr, x_tr, lambda_) accuracy = predict_accuracy(y_tr, x_tr, weights) f1_score = compute_f1_score(y_tr, x_tr, weights) y_prediction_test[test_index] = predict_labels(weights, x_te) print(" Accuracy = {acc} \n F1-score = {f1} \n".format(acc=accuracy, f1=f1_score)) mean_accuracy += train_index.shape[0] * accuracy mean_f1_score += train_index.shape[0] * f1_score mean_accuracy /= x_train.shape[0] mean_f1_score /= x_train.shape[0] print("Final accuracy = {acc} \nFinal F1-score = {f1} \n".format( acc=mean_accuracy, f1=mean_f1_score)) # Save ouput for submission OUTPUT_PATH = "../data/submission.csv"
def cross_validation(x, y, k, mode, gamma=None, lambda_=None, max_iters=None, initial_w=None):
    """
    Run k-fold cross validation for the model selected by `mode`.

    The data is split into k consecutive folds (no shuffling). For every
    fold the model is trained on the other folds and evaluated on the
    held-out fold.

    INPUT:
        @x : input data, dimensions (NxD)
        @y : target labels; a label equal to 1 counts as positive
        @k : number of folds
        @mode : 'linear_regression_eq', 'ridge_regression_eq',
                'linear_regression_GD', 'linear_regression_SGD',
                'logistic_regression' or 'reg_logistic_regression'
        @gamma, @max_iters, @initial_w : forwarded to the iterative modes
        @lambda_ : forwarded to 'ridge_regression_eq'
    OUTPUT:
        acc, tpr, fpr, losses : lists with one entry per fold (accuracy,
        true-positive rate, false-positive rate, training loss)
    RAISES:
        ValueError if `mode` is not one of the names above.
    """
    logistic_modes = ('logistic_regression', 'reg_logistic_regression')
    known_modes = ('linear_regression_eq', 'ridge_regression_eq',
                   'linear_regression_GD', 'linear_regression_SGD'
                   ) + logistic_modes
    # Fail early with a clear message; an unknown mode previously surfaced
    # as a NameError on the first fold.
    if mode not in known_modes:
        raise ValueError('unknown cross validation mode: {!r}'.format(mode))

    x_split = np.array_split(x, k, axis=0)
    y_split = np.array_split(y, k, axis=0)

    acc = []
    tpr = []
    fpr = []
    losses = []

    for fold in range(k):
        # Train on every fold except the current one.
        x_train = np.concatenate(
            [x_split[i] for i in range(k) if i != fold], axis=0)
        y_train = np.concatenate(
            [y_split[i] for i in range(k) if i != fold], axis=0)
        x_val = x_split[fold]
        y_val = y_split[fold]

        if mode == 'linear_regression_eq':
            update, loss = imp.least_squares(y_train, x_train)
        elif mode == 'ridge_regression_eq':
            update, loss = imp.ridge_regression(y_train, x_train, lambda_)
        elif mode == 'linear_regression_GD':
            update, loss = imp.least_squares_GD(y_train, x_train, initial_w,
                                                max_iters, gamma)
        elif mode == 'linear_regression_SGD':
            update, loss = imp.least_squares_SGD(y_train, x_train, initial_w,
                                                 max_iters, gamma)
        elif mode == 'logistic_regression':
            update, loss = imp.logistic_regression(y_train, x_train,
                                                   initial_w, max_iters, gamma)
        else:
            # NOTE(review): lambda_ is not passed to the regularised logistic
            # regression here; confirm against imp.reg_logistic_regression.
            update, loss = imp.reg_logistic_regression(
                y_train, x_train, initial_w, max_iters, gamma)

        losses.append(loss)

        predictions = np.dot(x_val, update)
        if mode in logistic_modes:
            # Logistic models are thresholded at probability 0.5. An
            # unconditional mean threshold used to overwrite this decision.
            pr_bool = H.sigmoid(predictions) > 0.5
        else:
            # Linear models: positive when the score reaches the fold mean.
            pr_bool = predictions >= np.mean(predictions)

        y_bool = y_val == 1
        correct = pr_bool == y_bool
        tp = np.logical_and(correct, y_bool)
        fp = np.logical_and(np.logical_not(correct), pr_bool)

        acc.append(sum(correct) / float(len(y_val)))
        tpr.append(sum(tp) / float(sum(y_bool)))
        fpr.append(sum(fp) / float(sum(np.logical_not(y_bool))))

    return acc, tpr, fpr, losses