def submission(x_test, w, i):
    """Preprocess the test features, predict labels and write a submission CSV.

    The features are passed, in order, through ``remove_columns``,
    ``replace_outliers_with_mean``, ``standardize`` and ``addones``. The
    output of ``predict_labels`` is then passed through ``predict_reverse``,
    flattened to one dimension and written to
    ``data/sample-submission.csv``.

    Args:
        x_test: Raw test feature matrix.
        w: Weights handed to ``predict_labels``.
        i: Sample ids written next to the predictions.
    """
    x_test = remove_columns(x_test)
    x_test = replace_outliers_with_mean(x_test)
    x_test = standardize(x_test)
    #x_test = build_poly(x_test,3)
    x_test = addones(x_test)

    y_predictions = predict_labels(w, x_test)
    y_predictions = predict_reverse(y_predictions)
    # ndarray.reshape returns a new array instead of reshaping in place, so
    # the result must be rebound; otherwise the flattening is silently lost.
    y_predictions = y_predictions.reshape(y_predictions.shape[0])

    create_csv_submission(i, y_predictions, 'data/sample-submission.csv')
def create_submission(self, inputs, name):
    """Predict on ``inputs`` and write the result to a submission csv file.

    Args:
        inputs: Test data to run the predictions on
        name: The name of the submission file

    Returns:
        The ``name`` argument, unchanged.
    """
    labels = self.predict(inputs)

    if self.use_logistic:
        # Zero predictions are rewritten to -1 before being saved.
        labels[labels == 0] = -1

    # Ids are generated as a consecutive range starting at 350000.
    first_id = 350000
    sample_ids = list(range(first_id, first_id + len(labels)))

    create_csv_submission(sample_ids, labels, name)
    return name
def predict_test(self, x=None, ids=None):
    """Predict labels for a test set and write them to ``self._output_path``.

    When both ``x`` and ``ids`` are given, copies of them are prepared and
    used. Otherwise the data set at ``self._DATA_TEST_PATH`` is used; it is
    only (re)loaded and prepared when ``self._orig_test`` is ``False``,
    after which the flag is set to ``True``.

    Args:
        x: Optional test features; copied before preparation.
        ids: Optional ids matching ``x``; copied before use.
    """
    if x is not None and ids is not None:
        # Caller-supplied data: prepare it and mark the cached default
        # test set as no longer current.
        _, self._tX_test = self.prepare_all_data(None, x.copy())
        self._tX_orig = self._tX_test.copy()
        self._ids_test = ids.copy()
        _, self._tX_test = self._prepare_model_data(None, self._tX_test)
        self._orig_test = False
    elif self._orig_test is False:
        # Default test set is not loaded yet: read it from disk once.
        _, raw_features, self._ids_test = load_csv_data(
            self._DATA_TEST_PATH)
        _, self._tX_test = self.prepare_all_data(None, raw_features)
        self._tX_orig = self._tX_test.copy()
        _, self._tX_test = self._prepare_model_data(None, self._tX_test)
        self._orig_test = True

    predicted = self._predict(self._tX_test)
    create_csv_submission(self._ids_test, predicted, self._output_path)
def main():
    """
    Train the final model of every jet group, merge the per-jet predictions
    and write them to the submission file.
    """
    # Load train and test datasets
    data_obj = DataLoader()

    # Polynomial degree passed to best_model_predictions, indexed by jet
    degrees_per_jet = (6, 10, 4, 6)

    # Train model for each jet and get predictions
    ids_per_jet = []
    preds_per_jet = []
    for jet, degrees in enumerate(degrees_per_jet):
        print("Jet", jet)
        ids_jet, preds_jet = best_model_predictions(data_obj=data_obj,
                                                    jet=jet,
                                                    degrees=degrees)
        ids_per_jet.append(ids_jet)
        preds_per_jet.append(preds_jet)

    # Concatenate all the predictions with their label, in jet order
    ids_all = np.concatenate(ids_per_jet, axis=0)
    preds_all = np.concatenate(preds_per_jet, axis=0)

    # Change 0 label to -1
    preds_all = np.where(preds_all == 0, -1, preds_all)

    # Create submission
    OUTPUT_PATH = './../results/predictions/best_model_predictions.csv'
    create_csv_submission(ids_all, preds_all, OUTPUT_PATH)
    print("Predictions have been created.")
def generate_submission(ids_te, Y_te):
    """
    Generate submission in submissions path.

    The file is named ``HB_SUBMISSION_<START_TIME>.csv`` and is handed to
    ``create_csv_submission`` together with ``SUBMISSION_PATH``. The
    submission directory is created first if it does not exist.

    Args:
        ids_te (ndarray): array with IDs of samples
        Y_te (ndarray): array with class labels of samples

    Returns:
        None
    """
    # generate submission
    print("[!] Generating Submission...")
    date_time = START_TIME
    # TODO replace whitespaces in function names
    csv_name = f"HB_SUBMISSION_{date_time}.csv"

    # parents=True so that a nested SUBMISSION_PATH whose parent directories
    # are missing is created instead of raising FileNotFoundError.
    Path(SUBMISSION_PATH).mkdir(parents=True, exist_ok=True)

    create_csv_submission(ids_te, Y_te, csv_name, SUBMISSION_PATH)
    print(f"[+] Submission {csv_name} was generated!")
def combine_and_create_submission(predictions, ids_predicted, submission_name):
    """Merge per-jet predictions into id order and write the submission file.

    The per-jet id and prediction arrays are concatenated and then sorted
    by ascending id, so the result does not depend on which jet holds the
    smallest id, on ids being consecutive, or on the number of jets. The
    input sequences are left untouched.

    Args:
        predictions: Sequence of 1-D arrays, one per jet number, holding
            the predicted labels.
        ids_predicted: Sequence of 1-D arrays, one per jet number, holding
            the sample ids matching ``predictions`` element-wise.
        submission_name: Name of the file passed to
            ``create_csv_submission``.

    Raises:
        ValueError: If the total number of ids and predictions differ.
    """
    print('\nGathering ids and predictions for each jet number together...')

    if len(ids_predicted) > 0 and len(predictions) > 0:
        all_ids = np.concatenate([np.ravel(ids) for ids in ids_predicted])
        all_predictions = np.concatenate(
            [np.ravel(prediction) for prediction in predictions])
    else:
        # np.concatenate rejects an empty sequence; nothing to merge here.
        all_ids = np.array([])
        all_predictions = np.array([])

    if all_ids.shape[0] != all_predictions.shape[0]:
        raise ValueError(
            'ids and predictions must contain the same number of elements')

    # A stable sort keeps the relative order of equal ids.
    order = np.argsort(all_ids, kind='stable')
    ids_gathered = all_ids[order]
    predictions_gathered = all_predictions[order]

    print('\n... ids and predictions for each jet number were gathered.')
    print('\n Creating submission file with name ', str(submission_name), ' ...')
    create_csv_submission(ids_gathered, predictions_gathered, submission_name)
    print('\n... ', str(submission_name), ' is created. Ready to submit :) !')
def create_prediction():
    """Create predictions for kaggle.

    Fits one regularised logistic regression per jet group on
    ``Data/train.csv``, predicts the matching group of ``Data/test.csv``
    with the same polynomial degree, and writes all predictions to
    ``true_prediction.csv``.
    """
    y, X, dict_mask_jets_train, ids = helpers_us.process_data(
        'Data/train.csv', inv_log=True)

    # [degree, lambda_] per jet group, found with the function
    # best_model_logistic
    best_param = [[2, 0.0072], [2, 0.1389], [2, 0.1389]]

    best_w = []
    for i in range(len(dict_mask_jets_train)):
        degree, lambda_ = best_param[i]
        _, _, w = cross_reg_logistic_regression(
            y[dict_mask_jets_train[i]], X[i],
            degree=degree, k_fold=6, lambda_=lambda_,
            max_iters=500, gamma=-1, batch=35)
        best_w.append(w)

    y, X, dict_mask_jets_test, ids = helpers_us.process_data(
        'Data/test.csv', inv_log=True)

    y_pred = np.zeros(y.shape[0])
    for i in range(len(dict_mask_jets_test)):
        # Expand with the degree used to train this group rather than a
        # separate literal, so the two cannot drift apart.
        xi = modselection.build_poly(X[i], best_param[i][0])
        y_pred[dict_mask_jets_test[i]] = modselection.predict_labels_logistic(
            best_w[i], xi)

    helpers.create_csv_submission(ids, y_pred, "true_prediction.csv")
corr=1, dimension_expansion=5, bool_col=True) x_te, _, _ = concatenate_log(x_te.copy(), mean_log=mean_log, std_log=std_log) print("Test data cleaned.") # 7. Build the polynomials tx_te = [] for jet in range(4): tx_te.append(build_poly(x_te[jet], degree)) print("The test polynomials have been built.") # 8. Predict and concatenate the predicitions y_te_pred = [] for jet in range(4): y_te_pred.append(predict_labels(weigths[jet], tx_te[jet])) for jet in range(4): ids_te[jet] = ids_te[jet].reshape((-1, 1)) y_pred = np.row_stack([y_te_pred[0], y_te_pred[1], y_te_pred[2], y_te_pred[3]]) ids = np.row_stack([ids_te[0], ids_te[1], ids_te[2], ids_te[3]]) print("I predicted ", str((y_pred == -1).sum()), "-1s and ", str((y_pred == 1).sum()), "1s") # 9. Store the predictions sub_file_name = "predictions" create_csv_submission(ids, y_pred, sub_file_name) print("Prediction stored in file '" + sub_file_name + "'")
# Use ridge_regression to compute our model weights, pred_score = k_fold_cross_validation(y_train, processed_tx_train, k, imp.ridge_regression, [lambda_]) print("Got predictions score = " + str(pred_score) + "\n") if pred_score > best_pred_score: # Update best results best_weights = np.copy(weights) best_pred_score = pred_score # Update best parameters best_degree = degree best_lambda = lambda_ best_k = k print("Best score on training data is " + str(best_pred_score)) print("Best parameters are (degree, lambda, k) = (" + str(best_degree) + ", " + str(best_lambda) + ", " + str(best_k) + ")") # Create the predictions processed_tx_test = preprocess.build_poly(tx_test, best_degree) y_pred = helper.predict_labels(best_weights, processed_tx_test) # Save the predictions program_path = os.path.dirname(os.path.realpath(__file__)) filename = program_path + '/results/run_ridge.csv' helper.create_csv_submission(ids, y_pred, filename) # Best score on training data is 0.817712 # Best parameters are (degree, lambda, k) = (12, 0.0001, 5)
# Training features and labels of the three subsets, loaded from .npy files
X_train_pri_0, X_train_pri_1, X_train_pri_23 = (
    np.load(fname)
    for fname in ('X_pri_0.npy', 'X_pri_1.npy', 'X_pri_23.npy'))
y_train_pri_0, y_train_pri_1, y_train_pri_23 = (
    np.load(fname)
    for fname in ('y_pri_0.npy', 'y_pri_1.npy', 'y_pri_23.npy'))

if USE_PRETRAINED_WEIGHTS == True:
    # Reuse weights saved by an earlier run
    w0, w1, w23 = (
        np.load(fname) for fname in ('w0.npy', 'w1.npy', 'w23.npy'))
else:
    # Model trained here
    w0, loss0 = ridge_regression(
        y_train_pri_0, build_poly(X_train_pri_0, 12), 1e-14)
    w1, loss1 = ridge_regression(
        y_train_pri_1, build_poly(X_train_pri_1, 12), 1e-3)
    w23, loss23 = ridge_regression(
        y_train_pri_23, build_poly(X_train_pri_23, 11), 1e-5)

# Predict each subset with the same polynomial degree used for training
pri_0_y = predict_labels(w0, build_poly(X_pri_0, 12))
pri_1_y = predict_labels(w1, build_poly(X_pri_1, 12))
pri_23_y = predict_labels(w23, build_poly(X_pri_23, 11))

# Scatter the per-subset predictions into the full prediction vector
predictions[ids_pri_0] = pri_0_y
predictions[ids_pri_1] = pri_1_y
predictions[ids_pri_23] = pri_23_y

create_csv_submission(ids, predictions, 'output.csv')
"""
Load the datasets, train a model, and create a Kaggle submission for the
first Machine Learning project

Authors: Kirill IVANOV, Matthias RAMIREZ, Nicolas TALABOT
"""

### Import modules and datasets
from proj1_helpers import load_csv_data, predict_labels, create_csv_submission
from implementations import least_squares
from utilities import split_data, preprocess_data

y_train, x_train, ids_train = load_csv_data("train.csv")
y_test, x_test, ids_test = load_csv_data("test.csv")

# Parameters
seed, degree, ratio = 3, 11, 0.66

# Learn the model: preprocess, split off a training part, fit least squares
tx, x_mean, x_std = preprocess_data(x_train, degree)
x_tr, y_tr, x_te, y_te = split_data(tx, y_train, ratio, seed)
w, loss_tr = least_squares(y_tr, x_tr)

# Create a Kaggle submission, reusing the mean and std returned for the
# training data instead of recomputing them
x_kaggle, _, _ = preprocess_data(
    x_test,
    degree,
    compute_mean_std=False,
    x_mean=x_mean,
    x_std=x_std,
)
y_pred = predict_labels(w, x_kaggle)
create_csv_submission(ids_test, y_pred, "run_submission.csv")
tX_improved = f_e.feature_engineer(tX_preprocessed)

# In case we want to test our model locally by splitting our data
if params.LOCAL_PREDICTION:
    pred.locally_predict(tX_improved, y_preprocessed, counts)
else:
    print('Test set:')
    y_test, tX_test, ids_test = helpers.load_csv_data(
        params.DATA_TEST_PATH)
    (y_test_preprocessed, tX_test_preprocessed, ids_test_preprocessed,
     masks_test, counts_test) = prep.preprocess(y_test, tX_test, ids_test)
    tX_test_improved = f_e.feature_engineer(tX_test_preprocessed)

    # One all-zero initial weight vector per group, sized to that group's
    # number of feature columns.
    log_initial_ws = [
        np.repeat(0, group.shape[1]) for group in tX_test_improved
    ]

    optimal_ws = pred.find_optimal_ws_grouped(
        tX_improved, y_preprocessed, params.IMPLEMENTATION, log_initial_ws,
        params.MAX_ITERS, params.GAMMA, params.DECREASING_GAMMA,
        params.LOG_LAMBDA, params.RIDGE_LAMBDA)

    # Element [1] of the predict_labels result is kept for each group.
    y_preds = [
        helpers.predict_labels(group_w, group_tx, params.IMPLEMENTATION)[1]
        for group_w, group_tx in zip(optimal_ws, tX_test_improved)
    ]

    flat_y_preds = helpers.flatten_list(y_preds)
    flat_ids = helpers.flatten_list(ids_test_preprocessed)

    # Put ids and predictions in the same ascending-id order. Writing the
    # reordered ids (rather than ids_test in file order) keeps every
    # prediction paired with the id it was computed for, even when the
    # test file is not already sorted by id.
    ids_indices = np.argsort(flat_ids)
    y_preds_sorted = np.array(flat_y_preds)[ids_indices]
    ids_sorted = np.array(flat_ids)[ids_indices]

    helpers.create_csv_submission(ids_sorted, y_preds_sorted,
                                  params.OUTPUT_PATH)
from data_processing import process_data, build_poly

print("Loading data\n")

# Read the train and test sets from their csv files
y_tr, tx_tr, ids_tr = load_csv_data("data/train.csv")
y_te, tx_te, ids_te = load_csv_data("data/test.csv")

# Hyper-parameters definitions
degree, lambda_ = 7, 0.00025

# Preprocessing data: cleaning, standardizing and adding constant column
tx_tr, tx_te = process_data(tx_tr, tx_te, y_tr, y_te)

# Feature augmentation through polynomials (same degree for both sets)
tx_tr = build_poly(tx_tr, degree)
tx_te = build_poly(tx_te, degree)

# Training with ridge regression
print("Training the model\n")
weights, _ = ridge_regression(y_tr, tx_tr, lambda_)

# Compute the prediction vector and write the submission file
y_pred = predict_labels(weights, tx_te)
create_csv_submission(ids_te, y_pred, "prediction.csv")

print("Done")
# Initialise training with an all-ones weight vector, one entry per column
w_initial = np.ones(tx_train.shape[1])

# Run gradient descent
w, loss = logistic_regression_mean(
    y_train, tx_train, w_initial, MAX_ITERS, GAMMA, verbose=True)
print(f'Training loss: {loss}')

acc = eval_model(y_train, tx_train, w, thresh=0.5)
print(f'Training accuracy: {acc}')

# Load test data
y_test, x_test, ids_test = load_csv_data(path.join(DATA_PATH, 'test.csv'))

# Transform the features, then standardise to mean and s.d. of training data
fx_test = feature_transform(x_test)
fx_test = standardise_to_fixed(fx_test, mu_train, sigma_train)

# Add offset term as a leading column of ones
offset_column = np.ones(fx_test.shape[0])
tx_test = np.c_[offset_column, fx_test]

# Get predictions on test set and write them next to the input data
y_pred = predict_labels(w, tx_test, thresh=0.5)
create_csv_submission(
    ids_test, y_pred, path.join(DATA_PATH, 'final_submission.csv'))
def create_submission(name, tx_test):
    """Run the model on ``tx_test`` and write the labels to the file ``name``.

    Zero predictions are rewritten to -1, and ids are generated as a
    consecutive range starting at 350000.
    """
    labels = run_on_test_data(tx_test)
    labels[labels == 0] = -1

    first_id = 350000
    sample_ids = list(range(first_id, first_id + len(labels)))
    create_csv_submission(sample_ids, labels, name)
ws = [] for k in range(k_fold): loss_tr, loss_te, w = cross_validation(y, x, k_indices, k, lambda_, degree) losses_te.append(loss_te) losses_tr.append(loss_tr) ws.append(w) return np.mean(losses_te, axis=0), np.mean(losses_tr, axis=0), np.mean(ws, axis=0) losses_te, losses_tr, w = cross_validation_ridge() print( f'Average Missclassification proportion on test folds was {losses_te}. On Train folds it was {losses_tr}.' ) test_y, test_x, test_ids = load_csv_data(DATA_PATH + 'test.csv') # replace missing values with means determined from training data test_x, _, _, _ = normalize(test_x, col_mean, xmin, xmax) # create final predictions on testing data and submission csv y_pred = predict_labels(w, build_poly(test_x, degree)) create_csv_submission(test_ids, y_pred, DATA_PATH + 'inferred.csv') print( 'Your final submission has been created and is called /data/inferred.csv')
for i_22 in range(4):
    # Standardize the full (training) and test data of this category
    full_std, _, _ = standardize(only_good_data_full[i_22])
    test_std, _, _ = standardize(only_good_data_test[i_22])

    # bests[i_22] holds (polynomial degree, ridge lambda) for this category
    phi_full = build_poly(full_std, bests[i_22][0])
    phi_test = build_poly(test_std, bests[i_22][0])
    w, _ = ridge_regression(yb_full_by_22[i_22], phi_full, bests[i_22][1])

    # Get predictions by nearest value
    yb_test_by_22[i_22] = predict(w, phi_test)

    preds[i_22] = prediction(w, phi_full, yb_full_by_22[i_22])
    print('Ratio of good predictions for jet', i_22, ':', preds[i_22])

# Weighted average for predictions: each category's ratio is weighted by
# its number of samples
overall = (
    sum(preds[cat] * yb_full_by_22[cat].shape[0] for cat in range(4))
    / sum(yb_full_by_22[cat].shape[0] for cat in range(4))
)
print('Overall prediction', overall)

print('Creating submission')
yb_submit = np.concatenate(yb_test_by_22)
ids_submit = np.concatenate(ids_test_by_22)
create_csv_submission(ids_submit, yb_submit, 'submission_by_cat.csv')
print('Done')
## Prediction ############################
print('prediction started')

# load the test set
print('loading the testing dataset...')
y_test, tx_test, ids_test = load_csv_data(dat_dir + "test.csv")
print('data loaded...')

# combine all the selected features for testing set
# Note we used the same means and stds from training set
test_log, _, _ = compute_log(tx_test, index_log, mean_log, std_log)
test_theta, _, _ = compute_theta(
    tx_test, index_theta, mean_theta, std_theta)
test_physics, _, _ = compute_physics(
    tx_test,
    index_physics_A,
    index_physics_B,
    index_physics_C,
    mean_physics,
    std_physics,
)
test_new = np.c_[test_log, test_theta, test_physics]

# reconstruct all the features of test set using the best degrees from
# training set
test_best_degree = build_poly_by_feature(test_new, best_degrees)

# Normalization
X_test = sigmoid(test_best_degree)

# predict
y_pred = predict_regression_labels(best_weights, X_test, threshold=0)
print('prediction ended')

# generate submission
create_csv_submission(ids_test, y_pred, 'submission.csv')
print('submission generated')
else: dict_lambda_weight[lambda_] = [loss_te, weight] rmse_tr_tmp.append(loss_tr) rmse_tr.append(np.mean(rmse_tr_tmp)) rmse_te.append(np.mean(rmse_te_tmp)) print("lambda={l:.3f}, Training RMSE={tr:.3f}, Testing RMSE={te:.3f}". format(l=lambda_, tr=rmse_tr[ind], te=rmse_te[ind])) ind_lambda_opt = np.argmin(rmse_te) best_lambda = lambdas[ind_lambda_opt] best_rmse = rmse_te[ind_lambda_opt] best_weight = dict_lambda_weight[best_lambda][1] return best_weight, best_rmse, best_lambda print("training") optimal_weight, best_rmse, best_lambda = cross_validation_demo() x_train2 = build_poly(x_train, degree) y_pred = predict_labels(optimal_weight, x_train2) output = accuracy(y_pred, y_train) print("done, training accuracy:") print(output) x_test2 = build_poly(x_test, degree) y_pred2 = predict_labels(optimal_weight, x_test2) print("creating submission") create_csv_submission(ids_test, y_pred2, 'ridge_regression_final.csv')
run.py is used to launch the application of weights on a test dataset and serialize the results. """ def load_npy(*npy_paths): """ Returns numpy arrays serialized at npy_paths. Args: npy_paths : a sequence of serialized np.arrays files paths. Returns: Deserialized numpy arrays """ return (np.load(p) for p in npy_paths) # Load the test dataset _, test_data, test_ids, _ = load_csv_data('all/test.csv') # Load the weights, feature masks and parameters (mean, std_dev) weights, clean_features, parameters = load_npy('all/weights.npy', 'all/clean_features.npy', 'all/parameters.npy') # Runs the weights against the test dataset pri_jet_num_idx = 22 polynomial_degree = 3 predictions = model_predictions(test_data, weights, pri_jet_num_idx, clean_features, parameters, polynomial_degree) create_csv_submission(test_ids, predictions, 'all/predictions.csv')
g, l, avg_test_accuracy_RLR = cross_validation_RLR(
    X_train, y_train, k_fold=4, seed=1)

#%% Testing functions
#np.random.seed(42)
gamma = 0.2
lambda_ = 4E-5

# Only one of the fits below is active at a time; the alternatives are
# kept commented out.
w, loss = least_squares(y=y_train, tx=X_train)
# w, loss = least_squares_SGD(y = y_train, tx = X_train, initial_w = np.random.random(size=num_features)*0.01, max_iters = 200000, gamma = gamma)
# w, loss = ridge_regression(y = y_train, tx = X_train, lambda_ = lambda_)
# w, loss = logistic_regression(y = y_train, tx = X_train, initial_w = np.random.random(size=num_features)*10, max_iters = 125000, gamma = gamma)
# w, loss = reg_logistic_regression(y = y_train, tx = X_train, lambda_ = lambda_, initial_w = np.random.random(size=num_features)*0.01, max_iters = 200000, gamma = gamma)

plt.plot(w)

#%% Predictive step
# Raw linear outputs, plotted as a histogram
y_test = X_test @ w
plt.hist(y_test, bins=200)

y_pred = predict_labels(w, X_test)

#%% Create submission
create_csv_submission(test.Id, y_pred, 'submission.csv')
print("Computing optimal weights") w, _ = ridge_regression(y_correspond, x_train_aug, LAMBDAS[i]) del x_train_aug # features engineering test set print("Augmenting testing set") x_test_aug_fname = "cache/x_test_augmented_jet{}_{}dim.np".format( i, COMBINED_DEGREES[i]) try: with open(x_test_aug_fname, "rb") as f: x_test_aug = np.load(f) except FileNotFoundError: # not existing, recomputing x_test_aug = augment(XS_TEST[i], COMBINED_DEGREES[i], SIMPLE_DEGREES[i], TAN_HYP_DEGREES[i], INVERSE_LOG_DEGREES[i], ROOT_DEGREES[i]) if CACHE: with open(x_test_aug_fname, "wb") as f: np.save(f, x_test_aug) # compute predictions and store print("Predicting labels for subset") y_submission[MASKS_TEST[i]] = predict_labels(w, x_test_aug) del x_test_aug del w # all predictions completed, create CSV print("Creating submission") create_csv_submission(ids_test, y_submission, OUTPUT_PATH)
N = y_train.size

# TRAIN test accuracy for sanity: count the training samples whose 0/1
# prediction (threshold 0.5) differs from the label
n_err = len(
    np.where(
        y_train != predict_01_labels(w, tX_train, 0.5).reshape(y_train.shape)
    )[0]
)
print("train accuracy :", 1 - n_err / N)

##########################################################################
#### Generate a prediction on the test set
##########################################################################

# load data from test set
_, _tX_test, ids_test = load_csv_data("test.csv")

# Do the corresponding steps for tX_test (we do not need to remove outliers
# or oversample, even if we did it on the training set)
tX_test = replace_data(_tX_test)
one_hot_columns = one_hot_encode(tX_test, 22)
tX_test = normalize_data(tX_test)
tX_test = polynomial_expansion(
    np.delete(tX_test, 22, axis=1), polynomial_deg)
# NOTE(review): column 22 is deleted a second time here, after the
# expansion already ran on data without it - confirm this matches how
# tX_train was built.
tX_test = np.c_[np.delete(tX_test, 22, axis=1), one_hot_columns]

# predict the labels with 0 threshold
y_pred = predict_labels(w, tX_test, 0.0)

# save predictions
name = "my_submission.csv"
create_csv_submission(ids_test, y_pred, name)
3.57813748e-01, 3.09738153e-03, -3.00165489e-02, -1.96892274e-02, -6.63026620e-03, -6.16628770e-03, 3.24687388e-02, -3.66001378e-03, 1.59398191e-02, 4.65051845e-03, 1.01299540e-02, -3.51035034e-02, 1.52764982e-02, 5.76623633e-03, 6.11417966e-03, 3.15704313e-02, -6.83057630e-03, -4.28346753e-03, -1.17045931e-02, -1.18246783e-01, -1.67412873e-03, 4.92261691e-03, -4.41274760e-03, -3.21693847e-02, -2.74392089e-02, 3.35931046e-02, -1.09660753e-01, 2.52705139e-01, 2.77765221e-03, 1.91389879e-03, 3.25497546e-02, 2.85366822e-02, -1.02954086e-05, 8.91734653e-04, -3.61480432e-03, -1.42486539e-02, 3.27415717e-02, 2.83215314e-02, -4.36361344e-03, 2.04638731e-03, 6.34341119e-02, 4.53017769e-04, 4.98469992e-02, -6.59353018e-02, -5.19988600e-02, -3.28215812e-02, 1.50462194e-04, -5.62069645e-04, -7.85663783e-04, -2.79324395e-02, -5.99052349e-04, 4.93552796e-04, 1.69177167e-02, 3.83051056e-04, 7.79895566e-02 ]) # Training accuracy (correct_count, total_count) = helpers.prediction_accuracy(y, X, w_star) correct_ratio = correct_count / total_count ################################################################################ # analysis of results # ################################################################################ print("classification precision: {cp}".format(cp=correct_ratio)) ################################################################################ # store data for submission # ################################################################################ y_pred = proj1_helpers.predict_labels(w_star, XTest) proj1_helpers.create_csv_submission(idTest, y_pred, DATA_PREDICTIONS_PATH)