Exemplo n.º 1
0
def solve(tX, y):
    """Fit a ridge regression on an 80/20 split and report both accuracies.

    Parameters
    ----------
    tX
        Feature matrix, forwarded unchanged to ``split_data``.
    y
        Labels, forwarded unchanged to ``split_data``.

    Returns
    -------
    tuple
        ``(acc_tr, acc_te)``: the values ``compute_accuracy`` yields on the
        training part and on the held-out part of the split.
    """
    train_x, train_y, test_x, test_y = split_data(
        tX, y, ratio=0.8, seed=2019)

    # Fixed regularisation strength; the second value returned by
    # ridge_regression is not needed here.
    reg_strength = 1
    weights, _ = ridge_regression(train_y, train_x, reg_strength)

    train_accuracy = compute_accuracy(train_y, predict_labels(weights, train_x))
    test_accuracy = compute_accuracy(test_y, predict_labels(weights, test_x))
    return train_accuracy, test_accuracy
Exemplo n.º 2
0
def main():
    """Train one ridge model per data group and write the test submission.

    Steps:
    1. Load the train and test CSV files.
    2. Preprocess train and test stacked together, so both get the same
       shifts when log/root features are created, then split them back.
    3. Divide both sets into groups with ``divide_data`` (by PRI_jet_num
       value) and pick a lambda per group with ``lambda_cv``.
    4. Fit a ridge regression per group, predict, print the overall training
       accuracy and the predicted class distribution, and write the
       submission file to ``OUTPUT_PATH``.
    """
    y_train, tX_train, _ = load_csv_data(DATA_TRAIN_PATH)
    _, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

    np.random.seed(2019)

    # Preprocess data together to have the same shifts while creating log or root features
    tX_stacked = np.vstack((tX_train, tX_test))
    prep_param = {
        "bias": True,
        "fill": True,
        "standardize": False,
        "degree": 8,
        "log": True,
        "root": True
    }
    tX_stacked_prep, *_ = preprocess_data(tX_stacked, None, prep_param)
    tX_train_prep, tX_test_prep = np.split(tX_stacked_prep, [len(tX_train)])

    # Split data according to PRI_jet_num value
    tX_tr_splitted, indices_tr = divide_data(tX_train_prep)
    tX_te_splitted, indices_te = divide_data(tX_test_prep)
    n_models = len(indices_tr)

    y_tr_splitted = [y_train[indices_tr[i]] for i in range(n_models)]

    # Train: one lambda search and one ridge fit per group
    weights = []
    for i in range(n_models):
        lambda_ = lambda_cv(tX_tr_splitted[i], y_tr_splitted[i])
        print(f"Class {i}, lambda: {lambda_}")
        weights.append(
            ridge_regression(y_tr_splitted[i], tX_tr_splitted[i], lambda_)[0])

    # Predict: write each group's predictions back to its original rows
    y_pr_tr = np.zeros(tX_train.shape[0])
    y_pr_te = np.zeros(tX_test.shape[0])
    for i in range(n_models):
        y_pr_tr[indices_tr[i]] = predict_labels(weights[i], tX_tr_splitted[i])
        y_pr_te[indices_te[i]] = predict_labels(weights[i], tX_te_splitted[i])

    acc_tr = compute_accuracy(y_train, y_pr_tr)
    print(f"Total accuracy train: {acc_tr}")

    # Count each label explicitly. Indexing the result of np.unique by
    # position breaks (IndexError / swapped labels) when only one class is
    # predicted. Assumes predict_labels emits -1/+1, as the message implies.
    n_neg = np.count_nonzero(y_pr_te == -1)
    n_pos = np.count_nonzero(y_pr_te == 1)
    print(
        f"Distribution on test data class -1: {n_neg}, class +1: {n_pos}"
    )

    create_csv_submission(ids_test, y_pr_te, OUTPUT_PATH)
Exemplo n.º 3
0
def train_3models(tX, y):
    """Train one ridge model per ``divide_data`` group and print accuracies.

    The data is preprocessed, split 80/20 with a fixed seed, and both parts
    are divided into groups (by PRI_jet_num value). For each group a lambda
    is chosen with ``lambda_cv`` and a ridge regression is fitted. Overall
    and per-group accuracies are printed; nothing is returned.

    Parameters
    ----------
    tX
        Raw feature matrix handed to ``preprocess_data``.
    y
        Labels handed to ``preprocess_data``.
    """
    # Preprocess everything at once so train and validation parts share the
    # same shifts when log or root features are created.
    prep_param = dict(
        bias=True,
        fill=True,
        standardize=False,
        degree=8,
        log=True,
        root=True,
    )
    features, labels, _ = preprocess_data(tX, y, prep_param)

    x_train, y_train, x_test, y_test = split_data(
        features, labels, ratio=0.8, seed=2019)

    # Group rows according to PRI_jet_num value
    train_groups, train_rows = divide_data(x_train)
    test_groups, test_rows = divide_data(x_test)
    n_models = len(train_groups)

    y_groups = []
    for pos, rows in enumerate(train_rows):
        y_groups.append(y_train[rows])
        print(train_groups[pos].shape)

    # Fit one model per group
    weights = []
    for i in range(n_models):
        group_x = train_groups[i]
        group_y = y_groups[i]
        lambda_ = lambda_cv(group_x, group_y)
        print(f"Class {i}, lambda: {lambda_}")
        weights.append(ridge_regression(group_y, group_x, lambda_)[0])
        print(len(weights[-1]))

    # Scatter each group's predictions back to the rows they came from
    pred_train = np.zeros(y_train.shape)
    pred_test = np.zeros(y_test.shape)
    for i, group_w in enumerate(weights):
        pred_train[train_rows[i]] = predict_labels(group_w, train_groups[i])
        pred_test[test_rows[i]] = predict_labels(group_w, test_groups[i])

    # Overall accuracy
    acc_tr = compute_accuracy(y_train, pred_train)
    acc_te = compute_accuracy(y_test, pred_test)
    print(f"Total accuracy tr: {acc_tr}, te: {acc_te}")

    # Per-group accuracy
    for i in range(n_models):
        tr_rows = train_rows[i]
        te_rows = test_rows[i]
        acc_tr = compute_accuracy(y_train[tr_rows], pred_train[tr_rows])
        acc_te = compute_accuracy(y_test[te_rows], pred_test[te_rows])
        print(f"Class {i}, Accuracy tr: {acc_tr}, te: {acc_te}")
Exemplo n.º 4
0
def least_squares_demo(y, x, k):
    """Run k-fold least squares and print the best fold's error and accuracy.

    Each fold is evaluated with ``cross_validation_ls``. The weights of the
    fold with the smallest test MSE are then used to predict labels on the
    full feature matrix, and that MSE plus the resulting accuracy are
    printed. Nothing is returned.

    Parameters
    ----------
    y
        Labels.
    x
        Raw features, expanded with ``helpers.build_poly(x, 1)``.
    k
        Number of cross-validation folds.
    """
    seed = 1
    tx = helpers.build_poly(x, 1)

    # Allocated as in the other demos, although cross_validation_ls does not
    # take an initial weight vector.
    w_initial = np.zeros(tx.shape[1])

    # Partition the samples into k folds
    k_indices = helpers.build_k_indices(y, k, seed)

    fold_mse = []
    fold_weights = []
    for fold in range(k):
        fold_error, fold_w = cross_validation_ls(y, tx, k_indices, fold)
        fold_mse.append(fold_error)
        # Wrapped in a list so that the transpose below yields a column.
        fold_weights.append([fold_w])

    mse = np.min(fold_mse)
    best_w = fold_weights[np.argmin(fold_mse)]
    y_model = helpers.predict_labels(np.array(best_w).T, tx)

    # Report error and accuracy
    print("   mse={mse}".format(mse = mse))
    hits = list(y_model.flatten() == y)
    accuracy = hits.count(True) / len(y_model)
    print("   accuracy={acc:.3f}".format(acc=accuracy))
Exemplo n.º 5
0
def experiment_for_submitting():
    """Record the predicted class distribution on the test set per option.

    For every entry of ``preprocessing_options`` the train and test features
    are stacked, preprocessed together, split back, a lambda is selected with
    ``lambda_cv``, a ridge regression is fitted on the training part, and the
    number of test predictions per class is appended to a results table. The
    table is written to ``Submitting experiment.csv``.
    """
    y_train, tX_train, ids = load_csv_data(DATA_TRAIN_PATH)
    _, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

    np.random.seed(2019)
    results = pd.DataFrame(
        columns=["Preprocessing", "Class -1 count", "Class +1 count"])

    for preprocessing_param in preprocessing_options:
        # Stack inside the loop so every iteration starts from a fresh copy
        # (preprocess_data may modify its input - not visible from here).
        tX_stacked = np.vstack((tX_train, tX_test))
        # NOTE(review): the loop variable is only printed below; the
        # preprocessing actually applied is this fixed dictionary, so every
        # iteration runs the same configuration. Confirm whether
        # `preprocessing_param` was meant to be passed to preprocess_data.
        prep_param = {
            "bias": True,
            "fill": True,
            "standardize": False,
            "degree": 11,
            "log": True,
            "root": True
        }
        tX_stacked_prep, _, desc_prep = preprocess_data(
            tX_stacked, None, prep_param)
        tX_train_prep, tX_test_prep = np.split(tX_stacked_prep,
                                               [len(tX_train)])

        lambda_ = lambda_cv(tX_train_prep, y_train)
        print(f"Best lambda: {lambda_}")
        w, _ = ridge_regression(y_train, tX_train_prep, lambda_)

        y_pred = predict_labels(w, tX_test_prep)

        # Count each label explicitly. Indexing np.unique's counts by
        # position raises IndexError (or mislabels the count) when only one
        # class is predicted. Assumes predict_labels emits -1/+1, as the
        # column names imply.
        n_neg = np.count_nonzero(y_pred == -1)
        n_pos = np.count_nonzero(y_pred == 1)

        print(preprocessing_param,
              f"Class -1: {n_neg}, Class +1: {n_pos}")
        results.loc[len(results)] = (desc_prep, n_neg, n_pos)

    results.to_csv("Submitting experiment.csv", sep=";")
Exemplo n.º 6
0
def accuracy(y, tx, w):
    """Return the fraction of samples whose predicted label equals ``y``.

    Parameters
    ----------
    y: ndarray
        The true labels; every value must be 0 or 1.
    tx: ndarray
        The feature matrix.
    w: ndarray
        The learned weights.

    Returns
    -------
    float
        Number of matching labels divided by ``y.shape[0]``.
    """
    predicted = map_0_1(predict_labels(w, tx))

    # Sanity checks: same shape, and both label vectors restricted to {0, 1}.
    allowed = [0, 1]
    assert predicted.shape == y.shape
    assert y.min() in allowed
    assert y.max() in allowed
    assert predicted.min() in allowed
    assert predicted.max() in allowed

    matches = np.equal(y, predicted).astype(int)
    n_samples = y.shape[0]
    return matches.sum() / n_samples
Exemplo n.º 7
0
def lrr_demo(y, x, k):
    """Cross-validate regularized logistic regression and print the outcome.

    Every (gamma, lambda) pair of the search grid is scored with k-fold
    cross-validation via ``cross_validation_lrr``. The pair with the smallest
    mean validation loss is reported, and the fold-averaged weights of that
    pair are used to print the accuracy on the full data. Nothing is
    returned.

    Parameters
    ----------
    y
        Labels.
    x
        Raw features, expanded with ``helpers.build_poly(x, 4)``.
    k
        Number of cross-validation folds.
    """
    tx = helpers.build_poly(x, 4)

    seed = 1
    max_iters = 50
    # With num=1 each grid holds a single value.
    lambdas = np.logspace(-4, -3, 1)
    gammas = np.logspace(-4, -3, 1)

    hyperparams = []
    for gamma in gammas:
        for lambda_ in lambdas:
            hyperparams.append((gamma, lambda_))

    w_initial = np.zeros(tx.shape[1])

    # Partition the samples into k folds
    k_indices = helpers.build_k_indices(y, k, seed)

    result_loss = []
    result_opt_w = []
    for step_size, reg in hyperparams:
        fold_losses = []
        fold_weights = []
        for fold in range(k):
            fold_loss, fold_w = cross_validation_lrr(
                y, tx, k_indices, fold, reg, step_size, max_iters, w_initial)
            fold_losses.append(fold_loss)
            # Wrapped in a list so the fold average keeps a leading axis.
            fold_weights.append([fold_w])

        result_loss.append(np.mean(fold_losses))
        result_opt_w.append(np.mean(fold_weights, axis=0))

    mse = np.min(result_loss)
    best = np.argmin(result_loss)
    hyper_opt = hyperparams[best]
    print("   gamma={g:.3f}, mse={mse:.3f} lambda{l:.3f}".format(mse = mse, g=hyper_opt[0], l=hyper_opt[1]))

    opt_w = result_opt_w[best]

    # Accuracy on the full data set
    y_predicted = helpers.predict_labels(opt_w.T, tx)
    hits = list(y_predicted.flatten() == y)
    accuracy = hits.count(True) / len(y)
    print("   accuracy={acc:.3f}".format(acc=accuracy))
Exemplo n.º 8
0
def lr_demo(y, x, k):
    """Cross-validate logistic regression over gamma and print the outcome.

    Every gamma of the search grid is scored with k-fold cross-validation via
    ``cross_validation_lr``. The gamma with the smallest mean validation loss
    (ignoring NaN scores) is reported, and the fold-averaged weights of that
    gamma are used to print the accuracy on the full data. Nothing is
    returned.

    Parameters
    ----------
    y
        Labels.
    x
        Raw features, expanded with ``helpers.build_poly(x, 1)``.
    k
        Number of cross-validation folds.
    """
    max_iters = 100
    # With num=1 the grid holds a single value.
    gammas = np.logspace(-4, -3, 1)
    seed = 1

    # adding constant term
    tx = helpers.build_poly(x, 1)

    # Initialization
    w_initial = np.zeros(tx.shape[1])

    # split data in k fold
    k_indices = helpers.build_k_indices(y, k, seed)

    gen_opt_w = []
    gen_loss = []

    # gamma selection
    for gamma in gammas:
        weights = []
        loss_errors = []

        for i in range(k):
            loss_te, opt_w = cross_validation_lr(y, tx, k_indices, i, gamma, max_iters, w_initial)
            loss_errors.append(loss_te)
            # Wrapped in a list so the fold average keeps a leading axis.
            weights.append([opt_w])

        gen_loss.append(np.mean(loss_errors))
        gen_opt_w.append(np.mean(weights, axis=0))

    # Pick the best gamma once, skipping NaN scores, and derive the reported
    # loss from the same index. Previously the loss was printed with np.min,
    # which yields nan as soon as any gamma produced a NaN score.
    best = np.nanargmin(gen_loss)
    opt_gamma = gammas[best]
    opt_w = gen_opt_w[best]
    print("   gamma={l:.3f},loss={loss:.3f}".format(loss = np.nanmin(gen_loss), l = opt_gamma))

    # Training Accuracy
    y_predicted = helpers.predict_labels(opt_w.T, tx)
    accuracy = (list(y_predicted.flatten() == y).count(True))/len(y)
    print("   accuracy={acc:.3f}".format(acc = accuracy))
Exemplo n.º 9
0
def LS_SGD_demo(y, x, k):
    """Cross-validate least squares SGD and print the best configuration.

    Every (batch_size, gamma) pair of the search grid is scored with k-fold
    cross-validation via ``cross_validation_ls_SGD``. The pair with the
    smallest mean validation MSE (ignoring NaN scores) is reported, and the
    fold-averaged weights of that pair are used to print the accuracy on the
    full data. Nothing is returned.

    Parameters
    ----------
    y
        Labels.
    x
        Raw features, expanded with ``helpers.build_poly(x, 1)``.
    k
        Number of cross-validation folds.
    """
    # Adding constant term
    tx = helpers.build_poly(x, 1)

    seed = 1
    max_iters = 50
    gammas = np.logspace(-3, 0, 10)
    batch_sizes = np.array([1])

    # Initialization
    w_initial = np.zeros(tx.shape[1])

    # split data in k fold
    k_indices = helpers.build_k_indices(y, k, seed)

    temp_mse = []
    temp_opt_w = []

    hyperparams = [(batch_size, gamma) for batch_size in batch_sizes for gamma in gammas]

    for batch_size, gamma in hyperparams:
        mse_errors = []
        weights = []

        for i in range(k):
            mse_te, opt_w = cross_validation_ls_SGD(y, tx, k_indices, i, gamma, max_iters, w_initial, batch_size)
            mse_errors.append(mse_te)
            # Wrapped in a list so the fold average keeps a leading axis.
            weights.append([opt_w])

        temp_mse.append(np.mean(mse_errors))
        temp_opt_w.append(np.mean(weights, axis=0))

    # Select the best configuration once, skipping NaN scores, and use that
    # single index for the reported mse, the hyper-parameters and the
    # weights. Previously mse/hyper-parameters came from np.min/np.argmin
    # (which return the NaN entry when one exists) while the weights came
    # from np.nanargmin, so the report could describe a different
    # configuration than the one evaluated.
    best = np.nanargmin(temp_mse)
    mse = temp_mse[best]
    hyper_opt = hyperparams[best]
    print("   gamma={g:.3f}, batch={b:.2f}, mse={mse:.3f}".format(mse = mse, g = hyper_opt[1], b = hyper_opt[0]))

    opt_w = temp_opt_w[best]

    # Training Accuracy
    y_predicted = helpers.predict_labels(opt_w.T, tx)
    accuracy = (list(y == y_predicted.flatten()).count(True))/len(y)
    print("   accuracy={acc:.3f}".format(acc = accuracy))
Exemplo n.º 10
0
def LS_GD_demo(y, x, k):
    """Cross-validate least squares GD over gamma and print the outcome.

    Every gamma of the search grid is scored with k-fold cross-validation via
    ``cross_validation_ls_GD``. The gamma with the smallest mean validation
    MSE (ignoring NaN scores) is reported, and the fold-averaged weights of
    that gamma are used to print the accuracy on the full data. Nothing is
    returned.

    Parameters
    ----------
    y
        Labels.
    x
        Raw features, expanded with ``helpers.build_poly(x, 1)``.
    k
        Number of cross-validation folds.
    """
    seed = 1
    max_iters = 50
    gammas = np.logspace(-3, 0, 10)

    tx = helpers.build_poly(x, 1)

    # Starting point handed to every fold
    w_initial = np.zeros(tx.shape[1])

    # Partition the samples into k folds
    k_indices = helpers.build_k_indices(y, k, seed)

    mean_weights = []
    mean_mse = []

    # Score every candidate step size
    for gamma in gammas:
        fold_weights = []
        fold_mse = []
        for fold in range(k):
            fold_error, fold_w = cross_validation_ls_GD(
                y, tx, k_indices, fold, gamma, max_iters, w_initial)
            fold_mse.append(fold_error)
            # Wrapped in a list so the fold average keeps a leading axis.
            fold_weights.append([fold_w])

        mean_mse.append(np.mean(fold_mse))
        mean_weights.append(np.mean(fold_weights, axis=0))

    best = np.nanargmin(mean_mse)
    opt_gamma = gammas[best]
    opt_w = mean_weights[best]
    mse_LS_GD = np.nanmin(mean_mse)

    print("   gamma={l:.3f}, mse={mse:.3f}".format(mse = mse_LS_GD, l = opt_gamma))

    # Accuracy on the full data set
    y_predicted = helpers.predict_labels(opt_w.T, tx)
    hits = list(y == y_predicted.flatten())
    accuracy = hits.count(True) / len(y)
    print("   accuracy={acc:.3f}".format(acc=accuracy))
Exemplo n.º 11
0
def ridge_regression_demo(y, x, degree, k_fold):
    """Cross-validate ridge regression over lambda and print the outcome.

    For each lambda of the grid the per-fold errors returned by
    ``cross_validation_rr`` are accumulated and turned into train/test RMSE
    values, which are plotted with ``helpers.visualization``. The lambda with
    the smallest test RMSE is then used to fit a model on the full data; its
    mse and accuracy are printed. Nothing is returned.

    Parameters
    ----------
    y
        Labels.
    x
        Raw features.
    degree
        Passed to ``cross_validation_rr`` and ``helpers.build_poly``.
    k_fold
        Number of cross-validation folds.
    """
    seed = 1
    lambdas = np.logspace(-1.1, -0.8, 20)

    # Partition the samples into k_fold folds
    k_indices = helpers.build_k_indices(y, k_fold, seed)

    # One train and one test RMSE per lambda
    rmse_tr, rmse_te = [], []
    for lam in lambdas:
        total_tr = 0
        total_te = 0
        for fold in range(k_fold):
            fold_err = cross_validation_rr(y, x, k_indices, fold, lam, degree)
            total_tr += fold_err[0]
            total_te += fold_err[1]
        rmse_tr.append(np.sqrt(2 * total_tr / k_fold))
        rmse_te.append(np.sqrt(2 * total_te / k_fold))
    helpers.visualization(lambdas, rmse_tr, rmse_te)

    # First index with the strictly smallest test RMSE
    best = min(range(len(rmse_te)), key=rmse_te.__getitem__)
    lambda_opt = lambdas[best]

    # Refit on the full data with the selected lambda
    x_poly = helpers.build_poly(x, degree)
    w_opt, mse = imp.ridge_regression(y, x_poly, lambda_opt)

    print("   lambda={l:.3f}, mse={mse:.3f}".format(mse = mse, l = lambda_opt))

    # Accuracy on the full data set
    y_predicted = helpers.predict_labels(w_opt.T, x_poly)
    hits = list(y == y_predicted.flatten())
    accuracy = hits.count(True) / len(y)
    print("   accuracy={acc:.3f}".format(acc = accuracy))
Exemplo n.º 12
0
    # Build the model
    initial_w = np.random.randn(D)
    optimal_gamma, optimal_lambda_, measure_tr, measure_te = \
        gamma_lambda_selection_cv(y_train_subset, X_train_subset, k_fold, initial_w, max_iters, gammas[i], lambdas[i],
                                  seed = seed, batch_size = batch_size, metric = metric, model = model)
    print('CA_bs:', CA_baseline)
    print('Iter:', i, ' Best gamma:', optimal_gamma, ' Best lambda:',
          optimal_lambda_, '\n')

    # Update the expected training error
    exp_measure_tr += measure_tr * X_train_subset.shape[0] / X_train.shape[0]
    exp_measure_te += measure_te * X_test_subset.shape[0] / X_test.shape[0]

    # Build the model with the best hyperparameters
    w = get_model(model, y_train_subset, X_train_subset, initial_w, max_iters,
                  optimal_gamma, optimal_lambda_, batch_size)

    # Get predictions
    y_pred_test = np.array(map_minus_1_1(predict_labels(w, X_test_subset)))

    # Insert the ids and predictions to the ids and y_pred arrays
    ids = np.concatenate((ids, ids_test_subset))
    y_pred = np.concatenate((y_pred, y_pred_test))

# Sort the ids and y_pred arrays
ids, y_pred = sort_arr(ids, y_pred)
# Create the submission CSV file
create_csv_submission(ids, y_pred, sumbission_fname)

print("Expected training accuracy / loss:", exp_measure_tr)
print("Expected test accuracy / loss:", exp_measure_te)
Exemplo n.º 13
0
def cross_validation(y, tX, gamma, method='logistic_regression'):
    """Estimate the accuracy of a model with 4-fold cross-validation.

    Parameters
    ----------
    y
        Labels.
    tX
        Feature matrix of shape (N, D).
    gamma
        Learning rate, used by the iterative methods only.
    method
        One of 'logistic_regression', 'reg_logistic_regression',
        'least_squares_GD', 'least_squares_SGD', 'least_squares',
        'ridge_regression'.

    Returns
    -------
    tuple
        ``(mean_accuracy, per_fold_accuracies)``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    N, D = tX.shape

    # Iterative solver parameters
    max_iters = 100

    # Cross validation parameters
    seed = 1
    k_fold = 4
    k_indices = build_k_indices(y, k_fold, seed)

    acc = []

    for k in range(k_fold):
        # Fold k is held out; the remaining folds are concatenated for training.
        yTr = np.array([])
        xTr = np.zeros((0, D))
        for i in range(k_fold):
            if i == k:
                yTe = y[k_indices[i]]
                xTe = tX[k_indices[i]]
            else:
                yTr = np.append(yTr, y[k_indices[i]], axis=0)
                xTr = np.append(xTr, tX[k_indices[i]], axis=0)

        initial_w = np.zeros(tX.shape[1])
        if method == 'logistic_regression':
            # The logistic variants are given a column vector as start point.
            initial_w = np.zeros((tX.shape[1], 1))
            w, loss = logistic_regression(yTr, xTr, initial_w, max_iters,
                                          gamma)
            y_est = sigmoid(np.dot(xTe, w))
            y_label = [0 if i < 0.5 else 1 for i in y_est]
        elif method == 'reg_logistic_regression':
            initial_w = np.zeros((tX.shape[1], 1))
            lambda_ = 0.1
            w, loss = reg_logistic_regression(yTr, xTr, lambda_, initial_w,
                                              max_iters, gamma)
            y_est = sigmoid(np.dot(xTe, w))
            y_label = [0 if i < 0.5 else 1 for i in y_est]
        elif method == 'least_squares_GD':
            w, loss = least_squares_GD(yTr, xTr, initial_w, max_iters, gamma)
            y_label = predict_labels(w, xTe)
        elif method == 'least_squares_SGD':
            w, loss = least_squares_SGD(yTr, xTr, initial_w, max_iters, gamma)
            y_label = predict_labels(w, xTe)
        elif method == 'least_squares':
            w, loss = least_squares(yTr, xTr)
            y_label = predict_labels(w, xTe)
        elif method == 'ridge_regression':
            w, loss = ridge_regression(yTr, xTr, 0.1)
            y_label = predict_labels(w, xTe)
        else:
            raise ValueError('Invalid method')

        n_correct = sum(
            1 for ind, label in enumerate(y_label) if label == yTe[ind])
        # Divide by the real size of the held-out fold. The former N / k_fold
        # only matches it when the folds partition all N samples evenly.
        acc.append(n_correct / len(yTe))
    return (sum(acc) / k_fold), acc
Exemplo n.º 14
0
        k_indices = k_fold_indices(train_data_split.shape[0], 5, SEED)
        for i, deg in enumerate(POSSIBLE_DEGREES):
            train_data, _ = preprocessing_pipeline(train_data_split,
                                                   degree=deg)
            train_set_folds = k_fold_cross_split_data(train_classes_split,
                                                      train_data, k_indices)

            for j, lambda_ in enumerate(POSSIBLE_LAMBDA_VALUES):
                folds_train_accuracy = []
                folds_validation_accuracy = []

                # Train a Ridge Regression model on each fold
                for x_train, y_train, x_test, y_test in train_set_folds:
                    w, train_loss = ridge_regression(y_train, x_train, lambda_)
                    folds_train_accuracy.append(
                        compute_accuracy(predict_labels(w, x_train), y_train))
                    folds_validation_accuracy.append(
                        compute_accuracy(predict_labels(w, x_test), y_test))
                train_accuracy_matrix[jet_num, 0, i, j] = \
                    (np.mean(folds_train_accuracy), np.std(folds_train_accuracy))
                validation_accuracy_matrix[jet_num, 0, i, j] = \
                    (np.mean(folds_validation_accuracy), np.std(folds_validation_accuracy))

            train_data_log_svm = preprocessing_pipeline(train_data_split,
                                                        degree=deg,
                                                        norm_first=False)
            train_set_folds = k_fold_cross_split_data(train_classes_split,
                                                      train_data_log_svm,
                                                      k_indices)

            for j, lambda_ in enumerate(POSSIBLE_LAMBDA_LOG):
Exemplo n.º 15
0
    y_train_subset = map_0_1(y_train_subset)
    # Standardize the data
    X_train_subset, X_test_subset = standardize(X_train_subset, X_test_subset)

    # Build the polynomial features and expand the data
    print(f"Train shape before feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}")
    X_train_subset, X_test_subset = build_poly(X_train_subset, max_degree[i]), build_poly(X_test_subset, max_degree[i])
    print(f"Train shape after  feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}")
    
    # Set the maximum number of iterations for building the model
    max_iters = 440
    # Set batch size to 1 to enforce SGD
    batch_size = 1
    # Set the initial coefficients randomly
    initial_w = np.random.rand(X_train_subset.shape[1])

    # Get the coefficients of the optimal regularized logistic regression model
    w = get_model("LOG_REG_GD", y_train_subset, X_train_subset, initial_w, max_iters, gammas_opt[i], lambdas_opt[i], batch_size)

    # Get the predictions
    y_pred_test = np.array(predict_labels(w, X_test_subset))

    # Insert the ids and predictions to the ids and y_pred arrays
    ids = np.concatenate((ids, ids_test_subset))
    y_pred = np.concatenate((y_pred, y_pred_test))

# Sort the ids and y_pred arrays
ids, y_pred = sort_arr(ids, y_pred)

# Create the submission CSV file
create_csv_submission(ids, y_pred, sumbission_fname)
Exemplo n.º 16
0
                                         PRI_JET_NUM_INDEX)

    # We achieved our best results using Regularized Logistic Regression,
    # so we load only those previously computed optimal params to generate the submission
    logistic_best_params = np.load("results/logistic_best_params.npy", allow_pickle=True)
    logistic_best_models = []

    for (lambda_, deg, gamma), train_classes_split, train_data_split in \
            zip(logistic_best_params, train_classes_jet_num_splits, train_data_jet_num_splits):
        data_split, columns_to_remove, mean, std = preprocessing_pipeline(train_data_split, degree=np.int(deg),
                                                                          cross_term=True, norm_first=False)
        initial_w = np.zeros((data_split.shape[1],))
        w, loss = reg_logistic_regression(train_classes_split, data_split, lambda_, initial_w, 500, gamma, 1)
        print(f'Loss: {loss:.3f} Accuracy : {compute_accuracy(predict_labels(w, data_split), train_classes_split)}')
        logistic_best_models.append((w, loss, columns_to_remove, mean, std))

    # Calculate the predictions for each of the 4 subsets using the weights and then combine them
    results = None
    for (w, _, col_to_rm, mean, std), (_, deg, _), test_classes_split, test_data_split, test_ids_split in \
            zip(logistic_best_models, logistic_best_params,
                test_classes_jet_num_splits, test_data_jet_num_splits, test_ids_jet_num_splits):
        test_data_split, _, _, _ = preprocessing_pipeline(test_data_split, degree=np.int(deg),
                                                          columns_to_remove=col_to_rm,
                                                          cross_term=True, norm_first=False, mean=mean, std=std)
        pred = predict_labels(w, test_data_split)
        out = np.stack((test_ids_split, pred), axis=-1)
        results = out if results is None else np.vstack((results, out))

    # Create the submission
    create_csv_submission(results[:, 0], results[:, 1], 'results/logistic_submission.csv')