Пример #1
0
def logistic_trials(y, tx, tx_sub, degree_range, partitions=2):
    """Fit logistic regression once per polynomial degree and score each fit.

    The data is split a single time with ``split_data(tx, y, 0.8)``.  For
    every degree in ``range(degree_range[0], degree_range[1])`` the features
    are expanded with ``expand``, a model is trained from an all-ones initial
    weight vector, and it is scored on the second part of the split.

    ``partitions`` is accepted but not used by this implementation.

    Returns four parallel lists with one entry per degree:
    ``models`` (tuples ``("logistic_SGD", degree, w)``), ``losses`` and
    ``accuracies`` measured on the second part of the split, and
    ``predictions`` (labels predicted for ``tx_sub``).
    """
    train_x, holdout_x, train_y, holdout_y = split_data(tx, y, 0.8)

    models, losses, accuracies, predictions = [], [], [], []

    for degree in range(degree_range[0], degree_range[1]):
        print("Trying degree", degree, ":")

        phi_train, phi_holdout, phi_sub = expand(degree, train_x, holdout_x,
                                                 tx_sub)
        start_w = np.ones(phi_train.shape[1])

        w, train_loss = logistic_regression(train_y, phi_train, start_w,
                                            MAX_ITERS, GAMMA)
        print("\tTraining Loss = ", train_loss)

        holdout_labels = predict_labels(w, phi_holdout)
        holdout_loss = compute_loss(holdout_y, phi_holdout, w,
                                    func="logistic")
        # Predicted labels are rescaled with (label + 1) / 2 before scoring.
        holdout_accuracy = compute_accuracy((holdout_labels + 1) / 2,
                                            holdout_y)
        sub_labels = predict_labels(w, phi_sub)

        print("\tTest Loss = ", holdout_loss, " Test Accuracy = ",
              holdout_accuracy)
        models.append(("logistic_SGD", degree, w))
        losses.append(holdout_loss)
        accuracies.append(holdout_accuracy)
        predictions.append(sub_labels)

    return models, losses, accuracies, predictions
Пример #2
0
def cross_validation(y, x, k_indices, k, regression_method, **args):
    """
    Run one fold of k-fold cross-validation with a pluggable regression method.

    Row ``k`` of ``k_indices`` selects the test samples; the remaining rows
    are flattened into the training selection.  ``regression_method`` is
    invoked as ``regression_method(y=..., tx=..., **args)`` and must return a
    ``(weights, loss)`` pair.

    Returns ``(acc_train, acc_test)`` as computed by ``compute_accuracy``.
    """
    test_rows = k_indices[k]
    train_rows = np.delete(k_indices, (k), axis=0).ravel()

    features_train, labels_train = x[train_rows, :], y[train_rows]
    features_test, labels_test = x[test_rows, :], y[test_rows]

    # Both feature matrices go through the shared pre-processing step.
    features_train, features_test = process_data(features_train,
                                                 features_test, True)

    # Only the weights are needed; the reported loss is discarded.
    weights, _ = regression_method(y=labels_train, tx=features_train, **args)

    predicted_train = predict_labels(weights, features_train)
    predicted_test = predict_labels(weights, features_test)

    acc_train = compute_accuracy(predicted_train, labels_train)
    acc_test = compute_accuracy(predicted_test, labels_test)
    return acc_train, acc_test
Пример #3
0
def generate_prediction(x_tr_0, y_tr_0, x_tr_1, y_tr_1, x_tr_2, y_tr_2, x_tr_3,
                        y_tr_3, x_te_0, x_te_1, x_te_2, x_te_3, jet_num_te):
    """Predict labels for a test set that was pre-split into four groups.

    One least-squares model is fitted per group on polynomial features of a
    fixed degree (9, 15, 13 and 12 for groups 0 to 3).  The four prediction
    vectors are then interleaved following ``jet_num_te``: values 0, 1 and 2
    consume the next prediction of the matching group, any other value
    consumes the next prediction of group 3.

    Returns a Python list with one predicted label per entry of
    ``jet_num_te``.
    """
    degrees = (9, 15, 13, 12)
    train_groups = ((x_tr_0, y_tr_0), (x_tr_1, y_tr_1),
                    (x_tr_2, y_tr_2), (x_tr_3, y_tr_3))
    test_groups = (x_te_0, x_te_1, x_te_2, x_te_3)

    # Fit all four models first, then predict, group by group.
    group_weights = []
    for (x_tr, y_tr), degree in zip(train_groups, degrees):
        w, _ = least_squares(y_tr, build_poly(x_tr, degree))
        group_weights.append(w)

    group_predictions = [
        predict_labels(w, build_poly(x_te, degree))
        for w, x_te, degree in zip(group_weights, test_groups, degrees)
    ]

    # cursors[g] is the position of the next unused prediction in group g.
    cursors = [0, 0, 0, 0]
    predicted_y_te = []
    for jet_num in jet_num_te:
        group = next((g for g in (0, 1, 2) if jet_num == g), 3)
        predicted_y_te.append(group_predictions[group][cursors[group]])
        cursors[group] += 1

    return predicted_y_te
def cross_validation_lr(y, x, k_indices, k, gamma, lambda_, max_iters, degree):
    """Run one fold of cross-validation for regularized logistic regression.

    @param y : raw output variable
    @param x : raw input variable
    @param k_indices : index groups, one per fold
    @param k : 1-based number of the fold used for testing (the code reads
               ``k_indices[k - 1]``)
    @param gamma : step size forwarded to the training routine
    @param lambda_ : penalization parameter forwarded to the training routine
    @param max_iters : iteration budget forwarded to the training routine
    @param degree : degree passed to ``build_poly`` for both feature sets
    @return loss_tr : error computed on the training folds
    @return loss_te : error computed on the test fold

    Both errors are ``sum(|(2 * y - 1) - predicted|) / (2 * len(y))``.
    """
    held_out = k - 1

    # 1. Split: the held-out fold is the test set, the others are stacked.
    x_test = np.array(x[k_indices[held_out]])
    y_test = np.array(y[k_indices[held_out]])
    x_train = np.empty((0, x.shape[1]))
    y_train = np.empty((0, 1))
    for fold_number, fold_rows in enumerate(k_indices):
        if fold_number == held_out:
            continue
        x_train = np.append(x_train, x[fold_rows], axis=0)
        # No axis given: np.append flattens, so y_train ends up 1-D.
        y_train = np.append(y_train, y[fold_rows])

    # 2. Format: the median / mean / std returned for the training data are
    #    handed to the matching call on the test data.
    x_train = count_NaN(x_train)
    x_test = count_NaN(x_test)

    x_train, median_train = sanitize_NaN(x_train)
    x_test, _ = sanitize_NaN(x_test, median_train)

    x_train, mean_tr, std_tr = standardize(x_train)
    x_test, _, _ = standardize(x_test, mean_tr, std_tr)

    x_train_poly = build_poly(x_train, degree)
    x_test_poly = build_poly(x_test, degree)

    # 3. Train and score.
    w_rlr = regularized_logistic_regression(y_train, x_train_poly, gamma,
                                            lambda_, max_iters)

    def _error(labels, features):
        # Labels are rescaled with 2 * y - 1 before the comparison.
        gaps = abs((2 * labels - 1) - predict_labels(w_rlr, features))
        return sum(gaps) / (2 * len(labels))

    loss_tr = _error(y_train, x_train_poly)
    loss_te = _error(y_test, x_test_poly)

    return loss_tr, loss_te
Пример #5
0
def cross_validation_ridge_regression(y, x, k_indices, k, lambdas, degrees):
    """
    Run one fold of k-fold cross-validation with one ridge model per subset.

    Row ``k`` of ``k_indices`` is the test selection, the other rows form the
    training selection.  Both parts are partitioned with ``get_jet_masks``.
    Subset ``idx`` is pre-processed, expanded with
    ``build_poly(..., degrees[idx])``, passed through ``add_constant_column``
    and fitted with penalty ``lambdas[idx]``.

    Returns ``(acc_train, acc_test)`` from ``compute_accuracy`` over the
    predictions of all subsets written back into place.
    """
    test_rows = k_indices[k]
    train_rows = np.delete(k_indices, (k), axis=0).ravel()

    x_train_all_jets, y_train_all_jets = x[train_rows, :], y[train_rows]
    x_test_all_jets, y_test_all_jets = x[test_rows, :], y[test_rows]

    msk_jets_train = get_jet_masks(x_train_all_jets)
    msk_jets_test = get_jet_masks(x_test_all_jets)

    # Output buffers, filled subset by subset through the masks.
    y_train_pred = np.zeros(len(y_train_all_jets))
    y_test_pred = np.zeros(len(y_test_all_jets))

    for idx in range(len(msk_jets_train)):
        train_mask = msk_jets_train[idx]
        test_mask = msk_jets_test[idx]

        x_train, x_test = process_data(x_train_all_jets[train_mask],
                                       x_test_all_jets[test_mask], False)
        y_train = y_train_all_jets[train_mask]

        phi_train = build_poly(x_train, degrees[idx])
        phi_test = build_poly(x_test, degrees[idx])
        phi_train = add_constant_column(phi_train)
        phi_test = add_constant_column(phi_test)

        weights, _ = ridge_regression(y=y_train, tx=phi_train,
                                      lambda_=lambdas[idx])

        y_train_pred[train_mask] = predict_labels(weights, phi_train)
        y_test_pred[test_mask] = predict_labels(weights, phi_test)

    acc_train = compute_accuracy(y_train_pred, y_train_all_jets)
    acc_test = compute_accuracy(y_test_pred, y_test_all_jets)
    return acc_train, acc_test
Пример #6
0
def calculate_f1(x, y, w):
    """Compute a class-weighted F1 score, in percent, for the model ``w``.

    Labels equal to -1 in both ``y`` and the predictions are mapped to 0, so
    the score is computed over the two classes 0 and 1.  For every true class
    ``i`` the best F1 over the predicted classes ``j`` is taken and weighted
    by the share of samples belonging to class ``i``.

    Args:
        x: feature matrix handed to ``predict_labels``.
        y: reference labels; not modified (a copy is used).
        w: model weights handed to ``predict_labels``.

    Returns:
        The weighted F1 score multiplied by 100, or ``0.0`` when ``y`` is
        empty.
    """
    # Flatten both vectors so that column vectors compare element-wise
    # instead of broadcasting against each other.
    y_sol = np.ravel(np.copy(y))
    y_pred = np.ravel(predict_labels(w, x))
    y_sol[y_sol == -1] = 0
    y_pred[y_pred == -1] = 0

    n_classes = 2
    total = len(y_sol)
    if total == 0:
        # No samples: avoid dividing by zero below.
        return 0.0

    f1_overall = 0.0
    for i in range(n_classes):
        is_class_i = y_sol == i
        ci = np.count_nonzero(is_class_i)
        best_f1 = 0.0
        for j in range(n_classes):
            is_pred_j = y_pred == j
            kj = np.count_nonzero(is_pred_j)
            # Vectorized count of samples with true class i predicted as j.
            hits = np.count_nonzero(is_class_i & is_pred_j)
            precision = hits / kj if kj != 0 else 0.0
            recall = hits / ci if ci != 0 else 0.0
            if recall + precision != 0:
                f1 = (2 * recall * precision) / (recall + precision)
                best_f1 = max(best_f1, f1)
        f1_overall = f1_overall + ci / total * best_f1
    return f1_overall * 100
Пример #7
0
def cross_validation_SGD(X, y, k_fold=4, seed=1):
    """Grid-search the SGD learning rate with k-fold cross-validation.

    Ten learning rates from ``np.logspace(-6, -3, 10)`` are tried on every
    fold produced by ``build_k_indices(y, k_fold, seed)``.  Each run starts
    from small random weights and uses ``len(y_train)`` iterations.

    Returns a two-column array: learning rate, mean test accuracy in percent
    over the folds.
    """
    folds = build_k_indices(y, k_fold, seed)
    gamma = np.logspace(-6, -3, 10)

    # accuracy[fold, learning-rate index]
    accuracy = np.zeros((k_fold, len(gamma)))

    for fold_no in range(k_fold):
        held_out = folds[fold_no]
        kept = np.setdiff1d(range(len(y)), held_out)

        y_test, X_test = y[held_out], X[held_out]
        y_train, X_train = y[kept], X[kept]

        for col, step in enumerate(gamma):
            print(step)
            start_w = np.random.random(size=X_train.shape[1]) * 0.01
            w, _ = least_squares_SGD(y=y_train,
                                     tx=X_train,
                                     initial_w=start_w,
                                     max_iters=len(y_train),
                                     gamma=step,
                                     verbose=False)

            prediction = predict_labels(w, X_test, 0.0)
            n_correct = len(np.where(y_test == prediction)[0])
            accuracy[fold_no, col] = n_correct / len(y_test) * 100

    mean_accuracy = np.mean(accuracy, axis=0)
    return np.hstack((gamma.reshape(-1, 1), mean_accuracy.reshape(-1, 1)))
Пример #8
0
def cross_validation_RLR(X, y, k_fold=4, seed=1):
    """Grid-search learning rate and penalty for regularized logistic regression.

    Two learning rates (``np.logspace(-6, -3, 2)``) and five penalties
    (``np.logspace(-6, -3, 5)``) are tried on every fold produced by
    ``build_k_indices(y, k_fold, seed)``.  Each run starts from small random
    weights and uses ``len(y_train)`` iterations.

    Returns ``(gamma, lambda_, mean_accuracy)`` where ``mean_accuracy`` has
    one row per learning rate and one column per penalty, in percent,
    averaged over the folds.
    """
    folds = build_k_indices(y, k_fold, seed)

    gamma = np.logspace(-6, -3, 2)
    lambda_ = np.logspace(-6, -3, 5)

    # accuracy[fold, learning-rate index, penalty index]
    accuracy = np.zeros((k_fold, len(gamma), len(lambda_)))

    for fold_no in range(k_fold):
        held_out = folds[fold_no]
        kept = np.setdiff1d(range(len(y)), held_out)

        y_test, X_test = y[held_out], X[held_out]
        y_train, X_train = y[kept], X[kept]

        for row, step in enumerate(gamma):
            print(step)
            for col, penalty in enumerate(lambda_):
                start_w = np.random.random(size=X_train.shape[1]) * 0.01
                w, _ = reg_logistic_regression(y=y_train,
                                               tx=X_train,
                                               lambda_=penalty,
                                               initial_w=start_w,
                                               max_iters=len(y_train),
                                               gamma=step,
                                               verbose=False)

                prediction = predict_labels(w, X_test, 0.0)
                n_correct = len(np.where(y_test == prediction)[0])
                accuracy[fold_no, row, col] = n_correct / len(y_test) * 100

    return gamma, lambda_, np.mean(accuracy, axis=0)
Пример #9
0
def cross_validation_RR(X, y, k_fold=4, seed=1):
    """Grid-search the ridge penalty with k-fold cross-validation.

    Twenty penalties from ``np.logspace(-6, -3, 20)`` are tried on every fold
    produced by ``build_k_indices(y, k_fold, seed)``.

    Returns a two-column array: penalty, mean test accuracy in percent over
    the folds.
    """
    folds = build_k_indices(y, k_fold, seed)
    lambda_ = np.logspace(-6, -3, 20)

    # accuracy[fold, penalty index]
    accuracy = np.zeros((k_fold, len(lambda_)))

    for fold_no in range(k_fold):
        held_out = folds[fold_no]
        kept = np.setdiff1d(range(len(y)), held_out)

        y_test, X_test = y[held_out], X[held_out]
        y_train, X_train = y[kept], X[kept]

        for col, penalty in enumerate(lambda_):
            print(penalty)

            w, _ = ridge_regression(y=y_train, tx=X_train, lambda_=penalty)

            prediction = predict_labels(w, X_test, 0.0)
            n_correct = len(np.where(y_test == prediction)[0])
            accuracy[fold_no, col] = n_correct / len(y_test) * 100

    mean_accuracy = np.mean(accuracy, axis=0)
    return np.hstack((lambda_.reshape(-1, 1), mean_accuracy.reshape(-1, 1)))
Пример #10
0
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Run one fold of k-fold cross-validation for polynomial ridge regression.

    Args:
        y: label vector.
        x: feature matrix.
        k_indices: 2-D array of sample indices, one row per fold.
        k: index of the row of ``k_indices`` used as the test fold.
        lambda_: ridge penalty.
        degree: degree passed to ``build_poly``.

    Returns:
        ``(loss_te, accuracy)`` where ``loss_te`` is the half mean squared
        error of ``tx_te @ w`` against ``y_te`` on the test fold and
        ``accuracy`` comes from ``compute_accuracy``.
    """
    # Dividing in subgroups
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)].reshape(-1)

    y_te, y_tr = y[te_indice], y[tr_indice]
    tx_te, tx_tr = x[te_indice], x[tr_indice]

    # Shared pre-processing of the training and test features
    tx_tr, tx_te = process_data(tx_tr, tx_te, y_tr, y_te)

    # Feature augmentation through polynomials
    tx_tr = build_poly(tx_tr, degree)
    tx_te = build_poly(tx_te, degree)

    # Printing degree and lambda tested
    print("Test: d = ", degree, "; l = ", lambda_)

    # Training with ridge regression (`loss` is the value reported by the fit)
    w, loss = ridge_regression(y_tr, tx_tr, lambda_)

    # Prediction and accuracy on the test fold
    y_pred = predict_labels(w, tx_te)
    accuracy = compute_accuracy(y_te, y_pred)

    # Test loss: half mean squared error on the held-out fold.  Both vectors
    # are flattened so a column-shaped y cannot broadcast into a matrix.
    residual = np.ravel(y_te) - np.ravel(tx_te.dot(w))
    loss_te = np.mean(residual ** 2) / 2

    # Log informations
    print("Accuracy = ", accuracy, "; loss = ", loss, "\n")

    return loss_te, accuracy
Пример #11
0
def pipeline(tx_train, y_train, tx_val, y_val, degrees, gamma, lambda_, epochs,
             verbose):
    """Clean, expand, train and evaluate with one set of hyper-parameters.

    The cleaning and feature-engineering objects are fitted on the training
    features and then applied to the validation features.  A regularized
    logistic regression is trained from a zero weight vector and evaluated
    on the validation set.

    Returns ``(f1, accuracy)`` as reported by ``Evaluation``.
    """
    # Fit the cleaner on the training data, reuse it for validation.
    cleaner = DataCleaning()
    cleaned_train = cleaner.fit_transform(tx_train)
    cleaned_val = cleaner.transform(tx_val)

    # Same fit-then-apply pattern for the feature generator.
    engineer = FeatureEngineering()
    x_train = engineer.fit_transform(tx=cleaned_train, degree=degrees)
    x_val = engineer.transform(tx=cleaned_val)

    start_w = np.zeros(x_train.shape[1])
    w, _ = reg_logistic_regression(y_train, x_train, lambda_, start_w, epochs,
                                   gamma, verbose)

    pred = predict_labels(weights=w, data=x_val, logistic=True)

    evaluator = Evaluation(y_val, pred)
    f1 = evaluator.get_f1()
    accuracy = evaluator.get_accuracy()
    return f1, accuracy
Пример #12
0
def run_stochastic_gradient_descent(tx_train, y_train, tx_val, y_val):
    """Train least squares with SGD and report validation accuracy and F1.

    Training starts from a zero weight vector with step size 0.005 and 3000
    iterations.  Returns ``(acc, f1)``.
    """
    print('\nTraining with Stochastic Gradient Descent')

    sgd_settings = dict(
        initial_w=np.zeros((tx_train.shape[1])),
        max_iters=3000,
        gamma=0.005,
        verbose=False,
    )
    w, _ = least_squares_SGD(y=y_train, tx=tx_train, **sgd_settings)

    y_pred = predict_labels(weights=w, data=tx_val, logistic=False)

    evaluation = Evaluation(y_actual=y_val, y_pred=y_pred)
    acc = evaluation.get_accuracy()
    f1 = evaluation.get_f1()
    report = 'Accuracy: {acc}, F1: {f1}'.format(acc=acc, f1=f1)
    print(report)

    return acc, f1
Пример #13
0
def run_regularized_logistic_regression(tx_train, y_train, tx_val, y_val):
    """Train regularized logistic regression and report validation metrics.

    Training starts from a zero weight vector with step size 1e-6, 1000
    iterations and penalty 0.00001.  Returns ``(acc, f1)``.
    """
    print('\nTraining with regularized logistic regression ')

    fit_settings = dict(
        initial_w=np.zeros((tx_train.shape[1])),
        max_iters=1000,
        gamma=1e-6,
        lambda_=0.00001,
    )
    w, _ = reg_logistic_regression(y=y_train, tx=tx_train, **fit_settings)

    y_pred = predict_labels(weights=w, data=tx_val, logistic=True)

    evaluation = Evaluation(y_actual=y_val, y_pred=y_pred)
    acc = evaluation.get_accuracy()
    f1 = evaluation.get_f1()
    report = 'Accuracy: {acc}, F1: {f1}'.format(acc=acc, f1=f1)
    print(report)

    return acc, f1
Пример #14
0
def compute_model_accuracy(x, y, w):
    """Return the share of correctly predicted labels, in percent.

    A prediction counts as wrong when its difference to the reference label
    is non-zero.
    """
    y_pred = predict_labels(w, x)
    n_samples = y.shape[0]
    # Both vectors are forced into column shape so the subtraction stays
    # element-wise.
    predicted_col = y_pred.reshape(n_samples, 1)
    reference_col = y.reshape(n_samples, 1)
    n_wrong = np.count_nonzero(predicted_col - reference_col)
    error_rate = n_wrong / n_samples
    return 100 * (1 - error_rate)
Пример #15
0
def cross_validation_ridge(y_train, x_train, num_folds, lambda_, seed=1):
    """Score ridge regression on every split yielded by ``k_fold_splits``.

    NumPy's global random generator is seeded with ``seed`` first.  For each
    split a model is fitted on the training part and the fraction of matching
    predicted labels on the validation part is recorded.

    Returns an array with one score per split.
    """
    np.random.seed(seed)

    def _score(fit_x, val_x, fit_y, val_y):
        w, _ = ridge_regression(fit_y, fit_x, lambda_)
        return np.mean(predict_labels(w, val_x) == val_y)

    return np.array([
        _score(fit_x, val_x, fit_y, val_y)
        for fit_x, val_x, fit_y, val_y in k_fold_splits(
            y_train, x_train, num_folds)
    ])
Пример #16
0
def multi_models_splitter_experimental(y_train, tx_train, tx_test,
                                       feature_column_index, k, fun_model,
                                       fun_model_args):
    """Train one model per value of a categorizing feature and predict.

    The distinct values of column ``feature_column_index`` in ``tx_train``
    define the categories.  For each category the matching training rows are
    passed to ``k_fold_cross_validation`` and the resulting weights are used
    to predict the test rows that carry the same value.

    Args:
        y_train: Training labels vector.
        tx_train: Training features matrix.
        tx_test: Test features matrix.
        feature_column_index (int): Categorizing feature's column index;
            negative values are rejected.
        k (int): Number of folds forwarded to the cross-validation.
        fun_model: Model function forwarded to the cross-validation.
        fun_model_args: Arguments forwarded with ``fun_model``.
    Returns:
        Three lists with one entry per category: the ``np.where`` index
        tuples of the matching test rows, the predictions for those rows,
        and the score returned by the cross-validation.
    Raises:
        ValueError: if ``feature_column_index`` is negative.
    """
    if (feature_column_index < 0):
        raise ValueError("Parameter feature_column_index must be positive")

    train_categories = tx_train[:, feature_column_index]
    test_categories = tx_test[:, feature_column_index]

    idx_array, y_pred_array, pred_scores_array = [], [], []

    for category in np.unique(train_categories):
        # Restrict the training data to the current category.
        train_rows = np.where(train_categories == category)
        weights, avg_pred_score = k_fold_cross_validation(
            y_train[train_rows], tx_train[train_rows], k, fun_model,
            fun_model_args)

        # Predict the test rows of the same category.
        test_rows = np.where(test_categories == category)
        category_predictions = predict_labels(weights, tx_test[test_rows])

        idx_array.append(test_rows)
        y_pred_array.append(category_predictions)
        pred_scores_array.append(avg_pred_score)

    return idx_array, y_pred_array, pred_scores_array
Пример #17
0
def generate_best(param_dict=None, log_param_dict_path="../data/logs/best.json"):
    """
    Generate a submission from the best function-parameters combination.

    The parameters are either given through ``param_dict`` or, when that is
    empty or missing, loaded from the JSON log at ``log_param_dict_path``.
    One model is trained per data group (keys ``"1"`` to ``"6"`` of the
    parameter dictionary) and its predictions are written into the matching
    positions of the submission vector.

    Args:
        param_dict (dict): dictionary with function and its parameters
        log_param_dict_path (string): path to logs with best parameters

    Returns:
        None

    Raises:
        SystemExit: with status 1 when the log file cannot be opened or read.
    """
    # if no parameters are given manually, look for a log dictionary
    if not param_dict:
        try:
            with open(log_param_dict_path, "r") as f:
                log_dict = json.load(f)
        except OSError:
            print(f"Could not open/read file: {log_param_dict_path}")
            # Non-zero status so callers and shells can detect the failure.
            sys.exit(1)
        param_dict = transform_log_dict_to_param_dict(log_dict)

    group_keys = [str(group_indx) for group_indx in range(1, 7)]
    M_list = [param_dict[key]["M"] for key in group_keys]
    class_equalizer_list = [param_dict[key]["class_eq"] for key in group_keys]
    z_outlier_list = [param_dict[key]["z_outlier"] for key in group_keys]
    corr_anal_list = [param_dict[key]["corr_anal"] for key in group_keys]

    # divide the dataset into the multiple groups and preprocess it
    groups_tr_X, groups_tr_Y, _, groups_te_X, _, indc_list_te, ids_te = get_data(
        use_preexisting=False, save_preprocessed=False, z_outlier=z_outlier_list, feature_expansion=True,
        correlation_analysis=corr_anal_list, class_equalizer=class_equalizer_list, M=M_list)
    # numpy array for submission
    # NOTE(review): hard-coded length, presumably the number of test rows;
    # it has to match len(ids_te) - confirm.
    Y_te = np.zeros(shape=(568238,))

    # for each group...
    for group_indx, (X_tr, Y_tr, X_te, Y_te_indx) in enumerate(
            zip(groups_tr_X, groups_tr_Y, groups_te_X, indc_list_te)):
        # parameter keys are 1-based
        group_params = param_dict[str(group_indx + 1)]

        # get shape and create initial parameters
        _, D = X_tr.shape
        W_init = np.random.rand(D, )
        max_iters, gamma, lambda_ = group_params["params"][:3]
        best_params_train = {
            "tx": X_tr, "y": Y_tr, "initial_w": W_init,
            "max_iters": max_iters,
            "gamma": gamma,
            "lambda_": lambda_
        }

        # train it on all available training data
        train_function = IMPLEMENTATIONS[group_params["function_name"]]["function"]
        W_best, _ = train_function(**best_params_train)
        # write into the corresponding indexes of this group
        Y_te[Y_te_indx] = predict_labels(W_best, X_te)

    generate_submission(ids_te, Y_te)
Пример #18
0
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Run one fold of cross-validation for polynomial ridge regression.

    Row ``k`` of ``k_indices`` is the test fold, all other rows are flattened
    into the training selection.  Features are expanded with
    ``build_poly(..., degree)`` before fitting.

    Returns ``(loss_tr, loss_te, w)`` where both losses are the fraction of
    predicted labels that differ from the reference labels.
    """
    keep_row = ~(np.arange(k_indices.shape[0]) == k)
    test_rows = k_indices[k]
    train_rows = k_indices[keep_row].reshape(-1)

    y_te, y_tr = y[test_rows], y[train_rows]
    phi_tr = build_poly(x[train_rows], degree)
    phi_te = build_poly(x[test_rows], degree)

    w, _ = ridge_regression(y_tr, phi_tr, lambda_)

    predicted_tr = predict_labels(w, phi_tr)
    predicted_te = predict_labels(w, phi_te)

    # Misclassification rate = number of mismatches / number of samples.
    loss_tr = sum(predicted_tr != y_tr) / len(y_tr)
    loss_te = sum(predicted_te != y_te) / len(y_te)

    return loss_tr, loss_te, w
Пример #19
0
def submission(x_test, w, i):
    """Pre-process the test features, predict labels and write the CSV.

    Args:
        x_test: raw test feature matrix.
        w: model weights handed to ``predict_labels``.
        i: passed as first argument to ``create_csv_submission``
           (presumably the sample ids - confirm against the helper).

    The predictions are written to ``data/sample-submission.csv``.
    """
    x_test = remove_columns(x_test)
    x_test = replace_outliers_with_mean(x_test)
    x_test = standardize(x_test)
    #x_test = build_poly(x_test,3)
    x_test = addones(x_test)

    y_predictions = predict_labels(w, x_test)
    y_predictions = predict_reverse(y_predictions)
    # reshape returns a new array; the result must be kept, otherwise the
    # flattening to one dimension has no effect.
    y_predictions = y_predictions.reshape(y_predictions.shape[0], )
    create_csv_submission(i, y_predictions, 'data/sample-submission.csv')
def compute_predictions_score(y_ref, weights, data):
    """Return the share of labels that a weights vector predicts correctly.

    Args:
        y_ref: Reference labels vector.
        weights: Weights vector handed to ``proj1_helpers.predict_labels``.
        data: Features matrix handed to ``proj1_helpers.predict_labels``.
    Returns:
        float: number of predictions equal to ``y_ref`` divided by the
        number of reference labels.
    """
    predicted = proj1_helpers.predict_labels(weights, data)
    n_correct = float(np.sum(predicted == y_ref))
    n_total = float(y_ref.shape[0])
    return n_correct / n_total
Пример #21
0
def ridge_trials(y, tx, tx_sub, degree_range, lambda_range, partitions=2):
    """Fit ridge regression over a grid of degrees and penalties.

    The data is split a single time with ``split_data(tx, y, 0.8)``.  Degrees
    come from ``range(degree_range[0], degree_range[1])``; penalties are
    log-spaced powers of ten from ``10**lambda_range[0]`` to
    ``10**lambda_range[1]``, one per integer exponent.

    ``partitions`` is accepted but not used by this implementation.

    Returns four parallel lists with one entry per (degree, penalty) pair:
    ``models`` (tuples ``("ridge_regression", degree, lambda_, w)``),
    ``losses`` and ``accuracies`` measured on the second part of the split,
    and ``predictions`` (labels predicted for ``tx_sub``).
    """
    train_x, holdout_x, train_y, holdout_y = split_data(tx, y, 0.8)

    models, losses, accuracies, predictions = [], [], [], []

    low_exp, high_exp = lambda_range[0], lambda_range[1]
    lambdas = np.logspace(low_exp, high_exp, num=1 + (high_exp - low_exp))

    for degree in range(degree_range[0], degree_range[1]):
        for lambda_ in lambdas:
            print("Trying degree", degree, "with lambda =", lambda_, ":")

            phi_train, phi_holdout, phi_sub = expand(degree, train_x,
                                                     holdout_x, tx_sub)

            w, train_loss = ridge_regression(train_y, phi_train, lambda_)
            print("\tTraining Loss = ", train_loss)

            holdout_labels = predict_labels(w, phi_holdout)
            holdout_loss = compute_loss(holdout_y, phi_holdout, w)
            # Predicted labels are rescaled with (label + 1) / 2 before
            # scoring.
            holdout_accuracy = compute_accuracy((holdout_labels + 1) / 2,
                                                holdout_y)
            sub_labels = predict_labels(w, phi_sub)

            print("\tTest Loss = ", holdout_loss, " Test Accuracy = ",
                  holdout_accuracy)
            models.append(("ridge_regression", degree, lambda_, w))
            losses.append(holdout_loss)
            accuracies.append(holdout_accuracy)
            predictions.append(sub_labels)

    return models, losses, accuracies, predictions
Пример #22
0
def best_model_predictions(data_obj, jet, degrees):
    """
    Train on the rows of one jet value and predict the matching test rows.

    :param data_obj: object exposing ``y``, ``tx``, ``ids_test`` and ``test``
    :param jet: jet value handed to ``get_jet_data_split``
    :param degrees: degree handed to the feature generator
    :return:
        ids_test: first value returned by ``get_jet_data_split`` for the
                  test data
        pred: labels predicted for the selected test rows
    """
    print('Training for Jet {jet}'.format(jet=jet))

    # Select the training and test rows for this jet value.
    y, tx = get_jet_data_split(data_obj.y, data_obj.tx, jet)
    ids_test, tx_test = get_jet_data_split(data_obj.ids_test, data_obj.test,
                                           jet)

    # Cleaner and feature generator are fitted on the training rows and then
    # applied to the test rows.
    cleaner = DataCleaning()
    tx = cleaner.fit_transform(tx)
    tx_test = cleaner.transform(tx_test)

    engineer = FeatureEngineering()
    tx = engineer.fit_transform(tx, degrees)
    tx_test = engineer.transform(tx_test)

    # Fixed hyper-parameters: penalty 1e-06, step size 1e-06, 1000
    # iterations, zero initial weights.
    start_w = np.zeros((tx.shape[1]))
    w, _ = reg_logistic_regression(y,
                                   tx,
                                   1e-06,
                                   start_w,
                                   1000,
                                   1e-06,
                                   verbose=True)

    pred = predict_labels(w, tx_test, True)

    return ids_test, pred
def compute_score(y, tx, w):
    """
        Compute the fraction of correctly predicted labels.

            INPUT:
                y           - Labels vector
                tx          - Samples
                w           - Weights

            OUTPUT:
                score       - Number of predictions equal to y, divided by
                              len(y) (a value between 0 and 1, not scaled
                              by 100)
    """
    matches = predict_labels(w, tx) == y
    return np.sum(matches) / len(y)
Пример #24
0
def run_least_squares(tx_train, y_train, tx_val, y_val):
    """Train least squares and report validation accuracy and F1.

    Returns ``(acc, f1)`` as reported by ``Evaluation``.
    """
    print('\nTraining with least squares')

    w, _ = least_squares(y=y_train, tx=tx_train)
    y_pred = predict_labels(weights=w, data=tx_val, logistic=False)

    evaluation = Evaluation(y_actual=y_val, y_pred=y_pred)
    acc = evaluation.get_accuracy()
    f1 = evaluation.get_f1()
    report = 'Accuracy: {acc}, F1: {f1}'.format(acc=acc, f1=f1)
    print(report)

    return acc, f1
Пример #25
0
def crossvalidation(y, x, k, n, param):
    """
    Trains and evaluates the model on one fold of an n-fold cross-validation.

    The validation fold is the contiguous block of ``len(x) // n`` rows that
    starts at row ``k``; every other row is used for training.  For example,
    with 1000 rows, ``k=300`` and ``n=10`` the validation data is
    ``x[300:400]`` (labels ``y[300:400]``) and the training data is the
    concatenation of ``x[0:300]`` and ``x[400:1000]``.

    Positional parameters:
    y ------ All labels available for training.
    x ------ Input training data matrix; should be shuffled prior to passing
             to this function.
    k ------ Row at which the validation fold starts.
    n ------ Total number of folds in cross-validation.
    param -- list of parameters used in train() function. See implementation
             of train() for more details.

    Returns (accuracy, y_predictions, w) for the validation fold.
    """
    stop_x = k + x.shape[0] // n
    stop_y = k + y.shape[0] // n

    x_validate = x[k:stop_x]
    y_validate = y[k:stop_y]

    # Train on everything before and after the validation fold, so that no
    # validation row leaks into the training set.
    x_train = np.concatenate((x[:k], x[stop_x:]), axis=0)
    y_train = np.concatenate((y[:k], y[stop_y:]), axis=0)

    x_validate = replace_outliers_with_mean(x_validate)
    x_train = replace_outliers_with_mean(x_train)

    x_train = standardize(x_train)
    x_validate = standardize(x_validate)

    #x_train = build_poly(x_train, 3)
    #x_validate = build_poly(x_validate, 3)

    x_train = addones(x_train)
    x_validate = addones(x_validate)

    print(x_train.shape)
    w = train(y_train, x_train, param)
    y_predictions = predict_labels(w, x_validate)
    accuracy = calculate_prediction_accuracy(y_predictions, y_validate)
    return accuracy, y_predictions, w
Пример #26
0
def run_ridge_regression(tx_train, y_train, tx_val, y_val):
    """Train ridge regression (penalty 1e-06) and report validation metrics.

    Returns ``(acc, f1)`` as reported by ``Evaluation``.
    """
    print('\nTraining with ridge regression')

    w, _ = ridge_regression(y=y_train, tx=tx_train, lambda_=1e-06)
    y_pred = predict_labels(weights=w, data=tx_val, logistic=False)

    evaluation = Evaluation(y_actual=y_val, y_pred=y_pred)
    acc = evaluation.get_accuracy()
    f1 = evaluation.get_f1()
    report = 'Accuracy: {acc}, F1: {f1}'.format(acc=acc, f1=f1)
    print(report)

    return acc, f1
Пример #27
0
def learn(predictions, ids_predicted, y_train_jets, tx_train_jets,
          tx_test_jets, ids_test_jets, lambda_best_jets, degree_best_jets):
    """Train one ridge model per jet number (0 to 3) and collect predictions.

    For each jet number the training and test features go through
    ``feature_engineering`` with that jet's degree and the flag
    ``jet_num > 1``.  The predicted test labels are appended to
    ``predictions`` and the matching ids to ``ids_predicted``; both lists are
    modified in place and nothing is returned.
    """
    print('\nLearning by ridge regression...')
    for jet_num in range(4):
        jet_label = str(jet_num)
        print('\nLearning from training set with jet number ', jet_label,
              ' using optimal hyperparameters...')

        # Fit on this jet's training data.
        y_train = y_train_jets[jet_num]
        tx_train = feature_engineering(tx_train_jets[jet_num],
                                       degree_best_jets[jet_num],
                                       jet_num > 1)
        w_best, _ = ridge_regression(y_train, tx_train,
                                     lambda_best_jets[jet_num])

        # Predict this jet's test data with the same feature settings.
        ids_test = ids_test_jets[jet_num]
        tx_test = feature_engineering(tx_test_jets[jet_num],
                                      degree_best_jets[jet_num],
                                      jet_num > 1)
        predictions.append(predict_labels(w_best, tx_test))
        ids_predicted.append(ids_test)

        print('\nReporting prediction accuracy for the training set... \n')
        report_prediction_accuracy(y_train, tx_train, w_best)
        print('\n... this gives a rough idea about the training success.')
        print('\n... predicted labels for test set with jet number ',
              jet_label)
    print('\n... ,predicted labels for each test set.')
Пример #28
0
def cross_validation(y, x, k_indices, k, lambda_):
    """Run one fold of cross-validation for ridge regression.

    Args:
        y: label vector.
        x: feature matrix.
        k_indices: either the indices of the test samples (1-D) or the
            indices of all folds with one row per fold (2-D); in the latter
            case row ``k`` is used as the test fold.
        k: index of the test fold; only used when ``k_indices`` is 2-D.
        lambda_: ridge penalty.

    Returns:
        ``(loss_tr, loss_te, accuracy)``: the training loss reported by
        ``ridge_regression``, the penalized half mean squared error on the
        test fold, and the test accuracy in percent.
    """
    test_indices = np.asarray(k_indices)
    if test_indices.ndim > 1:
        # All folds were passed in: pick the k'th one as the test fold.
        test_indices = test_indices[k]
    other_indices = np.setdiff1d(range(len(y)), test_indices)

    y_test = y[test_indices]
    tx_test = x[test_indices]

    y_train = y[other_indices]
    tx_train = x[other_indices]

    w, loss_tr = ridge_regression(y_train, tx_train, lambda_)

    # Normalize by the size of the test fold, not of the whole dataset.
    n_test = len(y_test)
    loss_te = (1. / (2 * n_test) * np.sum((y_test - tx_test @ w) ** 2)
               + lambda_ * np.linalg.norm(w) ** 2)

    prediction = predict_labels(w, tx_test, 0.0)

    accuracy = len(np.where(y_test - prediction == 0)[0]) / n_test * 100

    return loss_tr, loss_te, accuracy
def predict(initial_y, tx, tx_test, indices_test_group, indices_train_group,
            best_weights, best_degrees, logistic):
    """Predict test labels group by group with per-group weights and degrees.

    For group ``i`` the test rows ``indices_test_group[i]`` are pre-processed
    together with the training rows ``indices_train_group[i]``, expanded
    with ``build_poly(..., best_degrees[i])`` and labelled with
    ``best_weights[i]``.

    The labels are written into ``initial_y`` itself (no copy is made) and
    that same object is returned.
    """
    y_pred = initial_y

    for group_no, test_rows in enumerate(indices_test_group):
        train_rows = indices_train_group[group_no]

        # Pre-processing takes both subsets; only the processed test subset
        # is kept.
        _, prepared_test = preprocess_data(tx[train_rows], tx_test[test_rows])

        group_features = build_poly(prepared_test, best_degrees[group_no])
        y_pred[test_rows] = predict_labels(best_weights[group_no],
                                           group_features, logistic)

    return y_pred
Пример #30
0
def cross_validation_OLS(X, y, k_fold=4, seed=1):
    """Return the mean least-squares test accuracy over k folds, in percent.

    Folds come from ``build_k_indices(y, k_fold, seed)``; for each fold a
    model is fitted on all other samples and scored on the fold itself.
    """
    folds = build_k_indices(y, k_fold, seed)
    accuracy = np.zeros(k_fold)

    for fold_no in range(k_fold):
        held_out = folds[fold_no]
        kept = np.setdiff1d(range(len(y)), held_out)

        y_test, X_test = y[held_out], X[held_out]
        y_train, X_train = y[kept], X[kept]

        w, _ = least_squares(y_train, X_train)
        prediction = predict_labels(w, X_test, 0.0)

        n_correct = len(np.where(y_test == prediction)[0])
        accuracy[fold_no] = n_correct / len(y_test) * 100

    return np.mean(accuracy)