예제 #1
0
def cross_validation(y, tX, degrees, gammas, max_iters, k_fold, seed):
    """
        Grid-searches the polynomial degree and gamma with k-fold cross validation.
        For every (degree, gamma) pair, cross_validation_lr is called once per fold and the
        returned training / test errors are averaged over the k_fold folds. For each degree the
        gamma with the smallest averaged test error is kept; the degree whose best test error is
        the smallest overall wins.
        Note that we give the RAW data to the cross_validation, without any transformation on them.
        @param y : raw output variable
        @param tX : raw input variable, might be a polynomial basis obtained from the input x
        @param degrees : a non-empty sequence of polynomial degrees to test
        @param gammas : a non-empty sequence of gammas to test
        @param max_iters : the maximum number of iterations, forwarded to cross_validation_lr
        @param k_fold : the number of groups in which we partition the data for the cross validation
        @param seed : the seed for the random number generation, forwarded to build_k_indices
        @return best_degree_final : the degree with the smallest averaged test error
        @return best_gamma_final : the gamma that minimizes the averaged test error for best_degree_final
        @return best_error_final : the averaged test error obtained with (best_degree_final, best_gamma_final)
        @raise ValueError : if degrees or gammas is empty
    """
    # len() rather than truthiness: the grids may be numpy arrays, whose
    # truth value is ambiguous.
    if len(degrees) == 0 or len(gammas) == 0:
        raise ValueError("degrees and gammas must both be non-empty")

    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)

    # Best gamma / best averaged test error found for each degree.
    best_gamma = np.zeros(len(degrees))
    best_error = np.zeros(len(degrees))
    for j, degree in enumerate(degrees):

        print('\n Testing for a polynomial of degree ', degree)
        # Training and testing errors for each gamma, so we are able to visualize them afterwards.
        class_error_tr = np.zeros(len(gammas))
        class_error_te = np.zeros(len(gammas))

        for i, gamma in enumerate(gammas):
            print('gamma=', round(gamma, 6), end=", ")

            # This is where the k-fold cross-validation is computed: one run
            # per fold index 0..k_fold-1, then the errors are averaged. The
            # number of runs must match the divisor below.
            loss_tr_sum = 0
            loss_te_sum = 0
            for k in range(k_fold):
                loss_tr_tmp, loss_te_tmp = cross_validation_lr(
                    y, tX, k_indices, k, gamma, max_iters, degree)
                loss_tr_sum += loss_tr_tmp
                loss_te_sum += loss_te_tmp

            class_error_tr[i] = loss_tr_sum / k_fold
            class_error_te[i] = loss_te_sum / k_fold
            print('Percentage of classification error : ', class_error_te[i])

        # Pick error and gamma from the same index so they always belong together.
        best_gamma_idx = int(np.argmin(class_error_te))
        best_error[j] = class_error_te[best_gamma_idx]
        best_gamma[j] = gammas[best_gamma_idx]
        cross_validation_visualization(gammas, class_error_tr, class_error_te,
                                       degree)

    best_degree_idx = int(np.argmin(best_error))
    best_error_final = best_error[best_degree_idx]
    best_gamma_final = best_gamma[best_degree_idx]
    best_degree_final = degrees[best_degree_idx]

    print('\nBest degree :', best_degree_final)
    print('Best error :', best_error_final)
    print('Best gamma :', best_gamma_final)
    return best_degree_final, best_gamma_final, best_error_final
예제 #2
0
def cross_validation_demo(x, y):
    """Sweep a log-spaced lambda grid and plot the train / test RMSE curves.

    Uses a fixed seed (1), polynomial degree (7) and fold count (4). For
    each of the 30 lambdas in ``np.logspace(-4, 0, 30)`` the two losses
    returned by ``cross_validation`` are square-rooted and collected; both
    curves are then handed to ``cross_validation_visualization``.
    Nothing is returned.
    """
    seed, degree, k_fold = 1, 7, 4
    lambdas = np.logspace(-4, 0, 30)

    # Partition the samples into k folds once; the split is shared by every lambda.
    k_indices = build_k_indices(y, k_fold, seed)

    # Start as empty lists; np.append turns them into flat arrays as they grow.
    rmse_tr, rmse_te = [], []
    for lambda_ in lambdas:
        # NOTE(review): k_fold (the number of folds) is passed in the fourth
        # positional slot -- confirm this matches cross_validation's signature.
        loss_tr, loss_te = cross_validation(y, x, k_indices, k_fold, lambda_,
                                            degree)
        rmse_tr = np.append(rmse_tr, np.sqrt(loss_tr))
        rmse_te = np.append(rmse_te, np.sqrt(loss_te))

    cross_validation_visualization(lambdas, rmse_tr, rmse_te)
예제 #3
0
def optimize_model(y,
                   x,
                   degree_min,
                   degree_max,
                   lambdas=np.logspace(-4, 2, 30),
                   k_fold=4,
                   max_iter=200,
                   alpha=10**-6):
    """Grid-search polynomial degree and lambda with k-fold cross validation.

    For every degree in ``[degree_min, degree_max]`` and every lambda, the
    per-fold values ``sqrt(2 * mse)`` are averaged over the folds. For each
    degree the lambda with the smallest mean test RMSE is recorded together
    with the variance of the per-fold test RMSEs *of that lambda*. The
    degree whose best mean test RMSE is smallest overall is returned.

    :param y: output variable, forwarded to the helpers
    :param x: input variable, forwarded to ``cross_validation``
    :param degree_min: smallest polynomial degree tested (inclusive)
    :param degree_max: largest polynomial degree tested (inclusive)
    :param lambdas: non-empty sequence of penalization strengths for the
        norm of w
    :param k_fold: number of folds
    :param max_iter: number of iterations used when computing the gradient
    :param alpha: step size
    :return: ``(best_degree, best_lambda)``
    :raises ValueError: if the degree range or ``lambdas`` is empty
    """
    deg_range = range(degree_min, degree_max + 1)
    # len() rather than truthiness: lambdas is normally a numpy array.
    if len(deg_range) == 0 or len(lambdas) == 0:
        raise ValueError(
            "the degree range and lambdas must both be non-empty")

    seed = 1
    k_fold_multiplier = 1  # -3*k_fold+35 #rule of thumb according to stackexchange
    print("Number of degrees tested: ", len(deg_range))
    print("Number of lambdas tested: ", len(lambdas))
    print("Lambdas tested: ", lambdas)

    min_lambdas = []
    print("List of best lambda per degree tuples (degree, lambda, RMSE, var):")
    for degree in deg_range:
        # Per-lambda mean train / test RMSE and per-lambda test-RMSE variance.
        rmse_tr = []
        rmse_te = []
        var_te = []
        print("### DEGREE: ", degree)
        for lambda_ in lambdas:
            rmse_tr_lamb = []
            rmse_te_lamb = []
            for km in range(k_fold_multiplier):
                # Offset the seed so each repetition gets a different split;
                # with a multiplier of 1 this is exactly the original seed.
                k_indices = build_k_indices(y, k_fold, seed + km)
                for k_idx in range(k_fold):
                    mse_tr, mse_te, _ = cross_validation(
                        y, x, k_indices, k_idx, lambda_, degree, max_iter,
                        alpha)
                    # NOTE(review): the factor 2 presumably undoes a 1/2 in
                    # the MSE definition -- confirm against cross_validation.
                    rmse_tr_lamb.append(np.sqrt(2 * mse_tr))
                    rmse_te_lamb.append(np.sqrt(2 * mse_te))
            mean_tr = np.mean(rmse_tr_lamb)
            mean_te = np.mean(rmse_te_lamb)
            rmse_tr.append(mean_tr)
            rmse_te.append(mean_te)
            var_te.append(np.var(rmse_te_lamb))
            print("# Degree: ", degree, " lambda: ", lambda_, " mean tr: ",
                  mean_tr)
            print("# Degree: ", degree, " lambda: ", lambda_, " mean te: ",
                  mean_te)

        # Take lambda, RMSE and variance from the same index so the reported
        # variance belongs to the selected lambda, not the last one tested.
        best_idx = int(np.argmin(rmse_te))
        lamb_tuple = (degree, lambdas[best_idx], rmse_te[best_idx],
                      var_te[best_idx])
        min_lambdas.append(lamb_tuple)
        print(lamb_tuple)
        # NOTE(review): lambda_ here is the last lambda of the sweep, as in
        # the original call -- confirm what the plot helper expects.
        cross_validation_visualization(degree, lambda_, lambdas, rmse_tr,
                                       rmse_te)

    best_rmse = min(min_lambdas, key=lambda t: t[2])
    best_degree = best_rmse[0]
    best_lambda = best_rmse[1]
    print("Best degree: ", best_degree)
    print("Best lambda: ", best_lambda)
    print("Best RMSE: ", best_rmse[2])
    print("Confidence variance: ", best_rmse[3])
    return best_degree, best_lambda
예제 #4
0
def cross_validation_accross_degrees(x, y, regression, degrees = range(1,10), k_fold = 4, lambda_ = 0):
    """Average the k-fold train / test losses for each polynomial degree and plot them.

    For every degree in ``degrees``, ``cross_validation`` is called once per
    fold with the fixed ``lambda_`` and ``regression``; the two returned
    losses are summed over the folds and divided by ``k_fold``. The split
    comes from ``build_k_indices`` with a fixed seed of 2.

    Returns the pair ``(mean_losses_tr, mean_losses_te)``: two lists with one
    averaged loss per degree, in the order of ``degrees``.
    """
    seed = 2
    # One split of the samples, shared by every degree.
    k_indices = build_k_indices(y, k_fold, seed)

    mean_losses_tr, mean_losses_te = [], []
    for degree in degrees:
        total_tr, total_te = 0, 0
        for fold in range(k_fold):
            fold_tr, fold_te = cross_validation(
                y, x, k_indices, fold, lambda_, degree, regression)
            total_tr += fold_tr
            total_te += fold_te
        # Store the per-degree mean over the folds.
        mean_losses_tr.append(total_tr / k_fold)
        mean_losses_te.append(total_te / k_fold)

    cross_validation_visualization(xs=degrees, mse_tr= mean_losses_tr, mse_te=mean_losses_te, x_name='degree', xscale='lin')
    return mean_losses_tr, mean_losses_te
예제 #5
0
def cross_validation_accross_lambdas(x, y, regression, degree = 3, k_fold = 4, lambdas = np.logspace(-5, 3, 8)):
    """Average the k-fold train / test losses for each lambda and plot them.

    For every value in ``lambdas``, ``cross_validation`` is called once per
    fold with the fixed ``degree`` and ``regression``; the two returned
    losses are summed over the folds and divided by ``k_fold``. The split
    comes from ``build_k_indices`` with a fixed seed of 2.

    Returns the pair ``(mean_losses_tr, mean_losses_te)``: two lists with one
    averaged loss per lambda, in the order of ``lambdas``.
    """
    seed = 2
    # One split of the samples, shared by every lambda.
    k_indices = build_k_indices(y, k_fold, seed)

    mean_losses_tr, mean_losses_te = [], []
    for lambda_ in lambdas:
        total_tr, total_te = 0, 0
        for fold in range(k_fold):
            fold_tr, fold_te = cross_validation(
                y, x, k_indices, fold, lambda_, degree, regression)
            total_tr += fold_tr
            total_te += fold_te
        # Store the per-lambda mean over the folds.
        mean_losses_tr.append(total_tr / k_fold)
        mean_losses_te.append(total_te / k_fold)

    cross_validation_visualization(xs=lambdas, mse_tr= mean_losses_tr, mse_te=mean_losses_te, x_name='lambda')
    return mean_losses_tr, mean_losses_te
예제 #6
0
    def cross_validation(self,
                         cv,
                         lambdas,
                         lambda_name,
                         seed=1,
                         skip=False,
                         plot=False,
                         **kwargs):
        """
        Run K-fold cross validation over a list of candidate lambdas.

        The folds are built from self.train_y; every (lambda, fold) pair is
        evaluated through self._loop_cross_validation on self.train_x /
        self.train_y. The per-fold errors are averaged per lambda and the
        lambda with the smallest averaged validation error is selected.
        As a side effect the numpy print precision is set to 4, and the
        error curves are either plotted or saved with save_numpy_array.

        :param cv:              number of folds
        :param lambdas:         array of lambdas to be validated
        :param lambda_name:     the lambda name tag, forwarded to
                                _loop_cross_validation and used as plot title
        :param seed:            random seed passed to build_k_indices
        :param skip:            if true, stop after the first fold of every
                                lambda
        :param plot:            plot the cross-validation curves; if the
                                machine does not support matplotlib.pyplot,
                                set to false and the curves are saved instead
        :param kwargs:          other parameters passed through to
                                _loop_cross_validation
        :return: (list of per-fold weights for the best lambda, best lambda,
                  (training errors, validation errors)), where both error
                 lists hold one averaged value per lambda
        """
        np.set_printoptions(precision=4)
        k_indices = build_k_indices(self.train_y, cv, seed)
        print("K-fold ({}) cross validation to examine [{}]".format(
            cv, lambdas))

        # One entry per lambda: averaged errors and the per-fold weights.
        err_tr, err_te, weights = [], [], []
        for lamb in lambdas:
            print("For lambda: {}".format(lamb))
            fold_tr, fold_te, fold_weights = [], [], []
            for fold in range(cv):
                print('Cross valid iteration {}'.format(fold))
                weight, loss_tr, loss_te = self._loop_cross_validation(
                    self.train_y, self.train_x, k_indices, fold, lamb,
                    lambda_name, **kwargs)
                fold_tr.append(loss_tr)
                fold_te.append(loss_te)
                fold_weights.append(weight)
                if skip:
                    # Only the first fold is evaluated when skipping.
                    break
            avg_tr = np.average(fold_tr)
            avg_te = np.average(fold_te)
            err_tr.append(avg_tr)
            err_te.append(avg_te)
            weights.append(fold_weights)
            print("\t train error {}, \t valid error {}".format(
                avg_tr, avg_te))

        print('K-fold cross validation result: \n {} \n {}'.format(
            err_tr, err_te))

        # The winner is the lambda with the smallest averaged validation error.
        best_idx = np.argmin(err_te)
        print('Best err_te result {}, lambda {}'.format(
            err_te[best_idx], lambdas[best_idx]))

        if not plot:
            save_numpy_array(lambdas,
                             err_tr,
                             err_te,
                             names=['lambda', 'err_tr', 'err_te'],
                             title=self.regularizer.name)
        else:
            # Imported lazily so machines without plotting support still work.
            from plots import cross_validation_visualization
            cross_validation_visualization(lambdas,
                                           err_tr,
                                           err_te,
                                           title=lambda_name,
                                           error_name=self.loss_function_name)

        return weights[best_idx], lambdas[best_idx], (err_tr, err_te)