def ridge_regression_demo(y, tx, lamb, degree):
    """Fit ridge regression on a polynomial expansion of ``tx`` and print the loss.

    Args:
        y: target values passed through to ``im.ridge_regression``.
        tx: raw feature matrix, expanded with ``im.build_poly``.
        lamb: regularisation strength passed to ``im.ridge_regression``.
        degree: polynomial degree passed to ``im.build_poly``.

    Returns:
        The ``(weight, loss)`` pair produced by ``im.ridge_regression``.
    """
    # Expand the features first, then fit on the expanded matrix.
    expanded = im.build_poly(tx, degree)
    fit = im.ridge_regression(y, expanded, lamb)
    weight, loss = fit

    # The message labels the value as an RMSE; whether it really is one
    # depends on what im.ridge_regression returns -- TODO confirm.
    print("Training RMSE={tr:.3f}".format(tr=loss))
    return weight, loss
def trial_high_dimension():
    """Search for a usable gradient-descent step size on degree-8 features.

    This was a failed attempt at finding a good ``gamma`` hyperparameter,
    kept because the experiment was interesting anyway.  The step size is
    divided by ten on every trial (``5 / 10**i``) until the model reaches
    either an F1 score of 0.4 or an accuracy of 0.6 on one held-out fold.

    Returns:
        numpy.ndarray: labels predicted by ``predict_labels`` for the test
        set, using the last model that was fitted.
    """
    # Reload from disk so that any normalisation applied elsewhere is lost.
    y, tX, _ = load_csv_data(DATA_TRAIN_PATH)
    _, tX_test, _ = load_csv_data(DATA_TEST_PATH)

    tX_completed = implementations.Datas_completion_lacking_values_predicted(
        tX)
    tX_test_completed = implementations.Datas_completion_lacking_values_predicted(
        tX_test)

    tX_completed = implementations.normalize(tX_completed)
    # Fix: the normalised test matrix used to be stored in ``tX_test`` and was
    # never read again, so the final predictions were computed on
    # un-normalised features while the model was trained on normalised ones.
    tX_test_completed = implementations.normalize(tX_test_completed)

    # The training labels are remapped from -1 to 0 in place.
    y[y == -1] = 0

    tX_completed = implementations.build_poly(tX_completed, 8)
    tX_test_completed = implementations.build_poly(tX_test_completed, 8)

    # Hold out fold number 2 of a 9-fold split for evaluation.
    indices = implementations.build_k_indices(len(y), 9)
    x_train, y_train, x_test, y_test = implementations.cross_validation_split(
        y, tX_completed, indices, 2)

    i = 0
    w = []
    accuracy = 0
    F1 = 0
    # NOTE(review): nothing bounds this loop; if neither threshold is ever
    # reached, gamma keeps shrinking and the search never terminates.
    while (F1 < 0.4 and accuracy < 0.6):
        i += 1
        gamma = 5 / 10**i
        w, l = implementations.get_w_loss(y_train, x_train, 1, 0, 6000, gamma)
        print("w={}, loss={} ".format(w, l))

        y_pred = predict_labels(w, x_test)
        # NOTE(review): labels were remapped to 0/1 above; confirm that
        # predict_labels emits the same encoding, otherwise the negative
        # class can never match here.
        # Distinct comprehension names keep the trial counter ``i`` untouched.
        matches = [pred for pred, truth in zip(y_pred, y_test) if pred == truth]
        accuracy = len(matches) / len(y_test)
        print(accuracy)
        F1 = implementations.f1_score(y_test, y_pred)
        print(F1)

    print("found model {}".format(w))
    y_pred = predict_labels(w, tX_test_completed)
    return y_pred
def Feature_Completion_Benchmark(k_fold=10):
    """Compare models trained with and without feature completion.

    Feature completion replaces each -999 value with a least-squares
    estimate.  Both the raw and the completed matrices (module-level ``tX``
    and ``tX_completed``) are expanded to degree 4, split on fold 0 of a
    ``k_fold`` partition, fitted with method 1 of
    ``implementations.get_w_loss`` and scored on the held-out fold.

    Args:
        k_fold: number of folds used to build the split indices.

    Returns:
        tuple: ``(F1, accuracy, F1_completed, accuracy_completed, w,
        w_completed)``.
    """
    method = 1
    fit_options = dict(gamma=0.05, max_iters=2000)

    indices = implementations.build_k_indices(len(y), k_fold)
    enhanced_completed_tX = implementations.build_poly(tX_completed, 4)
    enhanced_tX = implementations.build_poly(tX, 4)

    print("1 : Train test split")
    x_train, y_train, x_test, y_test = implementations.cross_validation_split(
        y, enhanced_tX, indices, 0)
    x_train_completed, _, x_test_completed, _ = implementations.cross_validation_split(
        y, enhanced_completed_tX, indices, 0)
    # Held-out labels equal to 0 are rewritten to -1 before scoring.
    y_test[y_test == 0] = -1

    print("2 : Compute gradient descent")
    w, _ = implementations.get_w_loss(y_train, x_train, method, **fit_options)
    w_completed, _ = implementations.get_w_loss(
        y_train, x_train_completed, method, **fit_options)
    y_pred = predict_labels(w, x_test)
    y_pred_completed = predict_labels(w_completed, x_test_completed)

    def hit_rate(predicted):
        """Fraction of held-out labels that ``predicted`` gets right."""
        hits = sum(1 for guess, truth in zip(predicted, y_test) if guess == truth)
        return hits / len(y_test)

    print("3 : Compute stats")
    accuracy = hit_rate(y_pred)
    accuracy_completed = hit_rate(y_pred_completed)
    F1 = implementations.f1_score(y_test, y_pred)
    F1_completed = implementations.f1_score(y_test, y_pred_completed)

    summary = "no completion : accuracy = {}, F1 = {}, with completion : accuracy = {}, F1 = {}"
    print(summary.format(accuracy, F1, accuracy_completed, F1_completed))
    return F1, accuracy, F1_completed, accuracy_completed, w, w_completed
def I_do_it_all_and_I_try_to_do_it_good_REG_LOG_REG(degree, lambdas_, k_fold=10):
    """Sweep ``lambdas_`` for regularized logistic regression and submit the best.

    The module-level ``tX`` is expanded to ``degree``, split on fold 0 of a
    ``k_fold`` partition, and one model is fitted per candidate lambda with
    method 6 of ``implementations.get_w_loss``.  Candidates are ranked by
    ``2 * accuracy + F1`` on the held-out fold, and the winner is handed to
    ``implementations.submit``.

    Args:
        degree: polynomial degree passed to ``implementations.build_poly``.
        lambdas_: iterable of regularisation strengths to try.
        k_fold: number of folds used to build the split indices.

    Returns:
        tuple: ``(best_w, loss_tr_best, loss_te_best, best_lambda,
        best_accuracy, best_F1, best_TP, best_FP, best_FN)``.  Every entry
        keeps its initial value (``[]`` for the weights, ``0`` otherwise)
        when no candidate scores above zero.
    """
    method = 6
    indices = implementations.build_k_indices(len(y), k_fold)
    enhanced_tX = implementations.build_poly(tX, degree)

    # Every reported value gets a default so the return below is always valid.
    best_heuristique = best_accuracy = best_F1 = 0
    best_TP = best_FP = best_FN = 0
    best_lambda = 0
    loss_tr_best = loss_te_best = 0
    best_w = []

    print("1 : Train test split")
    x_train, y_train, x_test, y_test = implementations.cross_validation_split(
        y, enhanced_tX, indices, 0)

    # Fix: these two steps used to run inside the sweep, so the held-out
    # matrix gained one extra column of ones for every lambda tried.
    # NOTE(review): presumably get_w_loss adds the same bias column to
    # x_train internally for this method -- confirm.
    x_test = np.c_[np.ones((y_test.shape[0], 1)), x_test]
    y_test[y_test == 0] = -1

    print("2 : Compute regularized logistic regression")
    for lambda_ in lambdas_:
        # Fix: the candidate lambda is now actually used (it was hard-coded
        # to 0.001, which made every iteration fit the same model).
        w, loss_tr = implementations.get_w_loss(y_train,
                                                x_train,
                                                method,
                                                gamma=0.00005,
                                                max_iters=100,
                                                lambda_=lambda_)

        print("3 : Predict using generated model with {}".format(lambda_))
        y_pred = predict_labels(w, x_test)

        print("4 : Compute stats with {}".format(lambda_))
        matches = [pred for pred, truth in zip(y_pred, y_test) if pred == truth]
        accuracy = len(matches) / len(y_test)
        F1 = implementations.f1_score(y_test, y_pred)

        # As the set seems not too unbalanced, accuracy weighs more than F1.
        heuristique = 2 * accuracy + F1
        if heuristique > best_heuristique:
            # Fix: remember the winning score; it was never updated, so any
            # candidate scoring above zero used to replace the current best.
            best_heuristique = heuristique
            best_w = w
            best_TP, best_FP, best_FN = implementations.stats(y_test, y_pred)
            best_lambda = lambda_
            best_F1 = F1
            best_accuracy = accuracy
            loss_te_best = implementations.calculate_loss(
                y_test, x_test, w, lambda_)
            loss_tr_best = loss_tr

    print("5 : Generate the submission")
    # NOTE(review): ``_``, ``tX_test`` and ``ids_test`` are not defined in
    # this function and must exist at module level -- confirm what submit
    # expects as its first argument.
    implementations.submit(_, tX_test, best_w, ids_test, method, degree)
    return best_w, loss_tr_best, loss_te_best, best_lambda, best_accuracy, best_F1, best_TP, best_FP, best_FN
# Iterate over each subset and build a model # The predictions of every single model are combined for i in range(num_subsets): # Extract the train/test subsets y_train_subset, X_train_subset, ids_train_subset = train_subsets[i] y_test_subset, X_test_subset, ids_test_subset = test_subsets[i] # Map the categorical output labels into [0, 1] y_train_subset = map_0_1(y_train_subset) # Standardize the data X_train_subset, X_test_subset = standardize(X_train_subset, X_test_subset) print( f"Train shape before feature expansion: {str(X_train_subset.shape):>12} Test shape: {str(X_test_subset.shape):>12}" ) # Build the polynomial features and expand the data X_train_subset, X_test_subset = build_poly(X_train_subset, max_degree[i]), build_poly( X_test_subset, max_degree[i]) print( f"Train shape after feature expansion: {str(X_train_subset.shape):>12} Test shape: {str(X_test_subset.shape):>12}" ) # Set n_best_features to X_train_subset.shape[1] if you don't want feature selection n_best_features = round(fs_perc[i] * X_train_subset.shape[1]) D = n_best_features N, _ = X_train_subset.shape # Accuracy by predicting the majority class in the training dataset CA_one = y_train_subset.sum() / N CA_zero = 1 - CA_one CA_baseline = max(CA_zero, CA_one)
w, loss_tr = implementations.get_w_loss(y_train, x_train, method, gamma=0.05, max_iters=2000) w_completed, loss_tr_completed = implementations.get_w_loss( y_train, x_train_completed, method, gamma=0.05, max_iters=2000) y_pred = predict_labels(w, x_test) y_pred_completed = predict_labels(w_completed, x_test_completed) print("3 : Compute stats") matches = [i for i, j in zip(y_pred, y_test) if i == j] accuracy = len(matches) / len(y_test) matches = [i for i, j in zip(y_pred_completed, y_test) if i == j] accuracy_completed = len(matches) / len(y_test) F1 = implementations.f1_score(y_test, y_pred) F1_completed = implementations.f1_score(y_test, y_pred_completed) print( "no completion : accuracy = {}, F1 = {}, with completion : accuracy = {}, F1 = {}" .format(accuracy, F1, accuracy_completed, F1_completed)) return F1, accuracy, F1_completed, accuracy_completed, w, w_completed F1s, accuracies, F1s_completed, accuracies_completed, w, w_completed = Feature_Completion_Benchmark( ) ids = np.array(range(350000, 918938)) y_pred = predict_labels(w_completed, implementations.build_poly(tX_test_completed, 4)) print(y_pred.shape) create_csv_submission(ids, y_pred, "../data/submission.csv")
def main():
    """Train one ridge model per PRI_jet_num category and write a submission.

    The training and test sets are each split into four categories using
    the feature at index -8.  For every category, the train and test rows
    are stacked so that constant columns can be dropped and missing values
    filled consistently, then split again, standardized, expanded to the
    category's polynomial degree and fitted with ridge regression.  The
    four prediction vectors are put back in the original test order and
    written to a CSV file.
    """
    # Loading the data
    # Training dataset
    DATA_TRAIN_PATH = '../data/train.csv'
    y, X, _ = load_csv_data(DATA_TRAIN_PATH)

    # Testing dataset
    DATA_TEST_PATH = '../data/test.csv'
    _, X_t, ids_t = load_csv_data(DATA_TEST_PATH)

    # Separate training and testing sets into 4 different categories
    # depending on the PRI_jet_num feature with index -8
    feature = -8
    X_cat = preproc.get_categories(X, feature=feature)
    X_t_cat = preproc.get_categories(X_t, feature=feature)

    # Loop over the 4 categories to obtain the 4 predictions,
    # then concatenate them and create the submission file
    y_pred_all = []

    # Found using cross-validation: best hyperparameters (the degree and
    # the corresponding lambda) for each category
    degrees = [10, 10, 9, 9]
    lambdas = [0.00047508101621, 7.05480231072e-07, 0.000343046928631, 5.72236765935e-05]

    for v in range(4):
        # Extract category (test, train and labels)
        Xv = X[X_cat[v]]
        Xv_t = X_t[X_t_cat[v]]
        y_v = y[X_cat[v]]

        # Concatenate the train and testing set
        all_Xv = np.concatenate((Xv, Xv_t), axis=0)

        # Find the features (bad_features) that hold a single unique value
        # across the stacked rows; they carry no information here.
        bad_features = [
            column
            for column, values in enumerate(all_Xv.T)
            if len(np.unique(values)) == 1
        ]

        # Delete bad_features and fill missing values
        all_Xv_c = np.delete(all_Xv, bad_features, axis=1)
        all_Xv_filled = preproc.fill_missing_values(all_Xv_c, tresh=1)

        # Separate train and test again (train rows were stacked first)
        n_train = len(Xv)
        Xv_f = all_Xv_filled[:n_train]
        Xv_t_f = all_Xv_filled[n_train:]

        # Standardize the dataset; the returned mean/std are not needed.
        # NOTE(review): the test rows are standardized with their own
        # statistics rather than the training ones -- confirm this is intended.
        tXv, _, _ = preproc.standardize(Xv_f)
        tXv_t, _, _ = preproc.standardize(Xv_t_f)

        # Generate model
        final_degree = degrees[v]
        best_lambda = lambdas[v]

        # Build the polynomial basis, perform ridge regression
        final_X = impl.build_poly(tXv, final_degree)
        final_Xt = impl.build_poly(tXv_t, final_degree)

        # Generate the model (using ridge regression)
        final_w, _ = impl.ridge_regression(y_v, final_X, best_lambda)

        # Generate prediction for this category
        y_predv = predict_labels(final_w, final_Xt)
        y_pred_all.append(y_predv)

    # Concatenate all predictions, and sort them by indices: the argsort of
    # the flattened category indices maps each prediction back to its
    # position in the original test set.
    Xt_cat_all = [idx for sublist in X_t_cat for idx in sublist]
    y_pred = [yi for sublist in y_pred_all for yi in sublist]
    final_ypred = np.asarray(y_pred)[np.argsort(Xt_cat_all)]

    # Create submission file
    OUTPUT_PATH = '../submissions/results__4categories_fillByCat_run.csv'
    create_csv_submission(ids_t, final_ypred, OUTPUT_PATH)
    print('Congratulations ........ Submission file created ::: ', OUTPUT_PATH)
# for ridge: for every models test different lambdas and degrees D = len(degrees) L = len(lambdas) #averages of the f1/accuracy over the kfold for each cell metrics_tot = [] #higher level: we keep the k_metrics_train,k_metrics_test and optcutoffs in a similar table save_metrics = [] for idx_subset, (x_train, y_train) in enumerate(clean_data_trains): print('##### START SUBSET {} #####'.format(idx_subset)) save_metric = [] for idx_deg, deg in enumerate(degrees): x_poly = imp.build_poly(x_train, deg) temp1 = [] print("{d}/{D} row".format(d=idx_deg, D=D)) for idx_lambda, lambda_ in enumerate(lambdas): ridge = lambda y, x: imp.ridge_regression(y, x, lambda_) start = datetime.datetime.now() k_metrics_train, k_metrics_test, _ = imp.k_fold_cv(y_train, x_poly, KFOLD, ridge, METRIC, verbose=False) end = datetime.datetime.now()
# Select the test rows of each remaining jet category (column 22 holds the
# category value) and drop that column, which is constant within a subset.
tX1 = tX_test[index1, :]
tX1 = np.delete(tX1, 22, 1)

index2 = tX_test[:, 22] == 2
tX2 = tX_test[index2, :]
tX2 = np.delete(tX2, 22, 1)

index3 = tX_test[:, 22] == 3
tX3 = tX_test[index3, :]
tX3 = np.delete(tX3, 22, 1)

# Apply the project's formatting step to every subset.
tX0_final, index_final_0 = im.formating(tX0)
tX1_final, index_final_1 = im.formating(tX1)
tX2_final, index_final_2 = im.formating(tX2)
tX3_final, index_final_3 = im.formating(tX3)

# Building the polynomial basis for the test data and predicting results
tX0_final_test = im.build_poly(tX0_final, degree0)
ypred0 = predict_labels(weight0, tX0_final_test)

tX1_final_test = im.build_poly(tX1_final, degree1)
ypred1 = predict_labels(weight1, tX1_final_test)

tX2_final_test = im.build_poly(tX2_final, degree2)
ypred2 = predict_labels(weight2, tX2_final_test)

tX3_final_test = im.build_poly(tX3_final, degree3)
ypred3 = predict_labels(weight3, tX3_final_test)

# Assembling the predicted y
# y_pred is one-dimensional, so each subset's predictions are flattened
# before being written back.  Fix: they used to be reshaped to (n, 1), and
# NumPy does not accept a 2-D value when assigning through a 1-D index.
y_pred = np.zeros((tX_test.shape[0]))
y_pred[index0] = ypred0.reshape(-1)
y_pred[index1] = ypred1.reshape(-1)
y_pred[index2] = ypred2.reshape(-1)
y_pred[index3] = ypred3.reshape(-1)
# Clean the loaded data; the result is indexed per jet (0..3) below.
xs, ys = clean_input_data(x_loaded.copy(), y_loaded.copy(), corr=1, dimension_expansion=5, bool_col=True)
for jet in range(4):
    # set -1 to 0
    ys[jet][ys[jet] == -1] = 0
# NOTE(review): presumably this appends log-transformed columns and returns
# the statistics used for them -- confirm in concatenate_log().
xs, mean_log, std_log = concatenate_log(xs.copy())
print("Train data cleaned")

# 3. Build the polynomials (one for each one of the 4 datasets)
degree = 2
txs = [None] * 4
for jet in range(4):
    txs[jet] = build_poly(xs[jet], degree)
print("The train polynomials have been built.")

# 4. Set the array of gammas for the logistic regression
gamma_constants = [1e-5, 1e-6]  # one for the degree 1 and one for the degree 2
gammas = [None] * 4
for jet in range(4):
    ncolumns = xs[jet].shape[1]
    # One leading entry (gamma_constants[0]) followed by ncolumns entries
    # per degree, reshaped into a column vector.
    # NOTE(review): presumably this mirrors the column layout produced by
    # build_poly (constant term, then one group per degree) -- confirm.
    gammas[jet] = np.concatenate([[gamma_constants[0]]] + [ncolumns*[g] for g in gamma_constants[:degree]])\
        .reshape((-1,1))

# 5. run the logistic regression on the four datasets
def logistic_regression_on_jet(jet):
    # Labels and polynomial features of the requested jet subset.
    y = ys[jet]
    tx = txs[jet]