def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Return the training and test RMSE of ridge regression for the k-th fold."""
    # get k'th subgroup in test, others in train
    x_test = x[k_indices[k]]
    y_test = y[k_indices[k]]
    ind = np.arange(k_indices.shape[0])
    ind = np.delete(ind, k)
    new_ind = np.ravel(k_indices[ind])
    x_train = x[new_ind]
    y_train = y[new_ind]

    # form data with polynomial degree
    x_test_poly = build_poly(x_test, degree).T
    x_train_poly = build_poly(x_train, degree).T

    # find weights
    w = ridge_regression(y_train, x_train_poly, lambda_)

    # calculate the loss for train and test data
    loss_tr = np.sqrt(2 * compute_mse(y_train, x_train_poly, w))
    loss_te = np.sqrt(2 * compute_mse(y_test, x_test_poly, w))
    return loss_tr, loss_te
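# The fold index matrix `k_indices` passed to the routine above is expected to hold one
# row of shuffled row indices per fold. The helper that builds it is not shown here, so
# the following is only a sketch: `build_k_indices` is a hypothetical reconstruction of
# such a helper, and `demo_ridge_cross_validation` illustrates how the fold routine
# above would typically be called and averaged over the folds.
def build_k_indices(y, k_fold, seed):
    """Hypothetical helper: split the row indices of y into k_fold shuffled groups."""
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    return np.array([indices[i * interval: (i + 1) * interval] for i in range(k_fold)])


def demo_ridge_cross_validation(y, x, lambda_, degree, k_fold=4, seed=1):
    """Sketch: average the train/test RMSE returned by cross_validation over all folds."""
    k_indices = build_k_indices(y, k_fold, seed)
    rmse_tr, rmse_te = zip(*[cross_validation(y, x, k_indices, k, lambda_, degree)
                             for k in range(k_fold)])
    return np.mean(rmse_tr), np.mean(rmse_te)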
def cross_validation(y, x, k_indices, k, lambda_, degree, cross_features_degree,
                     compute_weightsFunction, compute_lossFunction):
    """
    Select the k-th group of indices as the test set and the rest as the training set,
    build the polynomial features up to the given degree,
    compute the weights on the training set with the specified function,
    and return the training and test losses computed with the specified loss function.
    """
    # determine the indices in the training set and those in the test set
    tr_indices = np.concatenate((k_indices[:k].ravel(), k_indices[k + 1:].ravel()))
    te_indices = k_indices[k]

    # select training and testing x and y
    x_tr = x[tr_indices]
    y_tr = y[tr_indices]
    x_te = x[te_indices]
    y_te = y[te_indices]

    # build polynomial features
    x_poly_tr = build_poly(x_tr, degree, cross_features_degree)
    x_poly_te = build_poly(x_te, degree, cross_features_degree)

    # find weights using the training data only
    weights_tr = compute_weightsFunction(y_tr, x_poly_tr, lambda_)

    # compute the losses for cross validation (without the regularization term)
    loss_tr = compute_lossFunction(y_tr, x_poly_tr, weights_tr)
    loss_te = compute_lossFunction(y_te, x_poly_te, weights_tr)
    return loss_tr, loss_te
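# Because the variant above takes the weight and loss functions as parameters, the same
# fold routine can drive different models. A minimal sketch of running one ridge
# regression fold through it, assuming the `ridge_regression` and `compute_mse` helpers
# referenced elsewhere in this collection are available with the same signatures:
def demo_generic_cross_validation(y, x, k_indices, k):
    """Sketch: run one ridge-regression fold through the generic cross_validation above."""
    return cross_validation(y, x, k_indices, k, lambda_=0.01, degree=3,
                            cross_features_degree=2,
                            compute_weightsFunction=ridge_regression,
                            compute_lossFunction=compute_mse)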
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Return the average training and test MSE of ridge regression over the k folds."""
    loss_tr = []
    loss_te = []
    for j in range(k):
        # j-th group is the test set, the remaining groups form the training set
        index_te = k_indices[j]
        ind = np.ones(k_indices.shape[0], bool)
        ind[j] = False
        index_tr = k_indices[ind].flatten()
        x_tr = x[index_tr]
        x_te = x[index_te]
        y_tr = y[index_tr]
        y_te = y[index_te]

        # build polynomial features and fit ridge regression on the training fold
        xpoly_tr = build_poly(x_tr, degree)
        xpoly_te = build_poly(x_te, degree)
        w_s = ridge_regression(y_tr, xpoly_tr, lambda_)

        loss_tr.append(compute_mse(y_tr, xpoly_tr, w_s))
        loss_te.append(compute_mse(y_te, xpoly_te, w_s))
    return np.mean(loss_tr), np.mean(loss_te)
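# The variant above already averages over all k folds internally, so hyperparameter
# selection reduces to one call per candidate value. A sketch of a lambda sweep under
# that assumption, picking the value with the lowest averaged test MSE:
def demo_select_lambda(y, x, k_indices, degree, lambdas=np.logspace(-4, 0, 30)):
    """Sketch: pick the lambda minimizing the averaged test loss of cross_validation above."""
    k_fold = k_indices.shape[0]
    losses_te = [cross_validation(y, x, k_indices, k_fold, lambda_, degree)[1]
                 for lambda_ in lambdas]
    return lambdas[np.argmin(losses_te)]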
def cross_validation_lr(y, x, k_indices, k, gamma, lambda_, max_iters, degree):
    """
    Return the classification error of regularized logistic regression for one step
    of the k-fold cross validation.

    @param y : raw output variable
    @param x : raw input variable, possibly a polynomial basis obtained from the input x
    @param k_indices : the indices of the data that belong to each of the K groups of the cross validation
    @param k : the index of the group used for testing
    @param gamma : the step size used for the logistic regression
    @param lambda_ : the penalization parameter
    @param max_iters : the maximum number of iterations of the logistic regression
    @param degree : the degree of the polynomial basis used for the cross validation
    @return loss_tr : the classification error made on the training data
    @return loss_te : the classification error made on the testing data
    """
    # 1. DIVIDE THE DATA INTO SUBGROUPS
    # get k'th subgroup in test, others in train:
    x_test = np.array(x[k_indices[k - 1]])
    y_test = np.array(y[k_indices[k - 1]])
    x_train = np.empty((0, x.shape[1]))
    y_train = np.empty((0, 1))
    # this loop gathers the remaining groups into the training set
    for k_iter, validation_points in enumerate(k_indices):
        if k_iter != k - 1:
            x_train = np.append(x_train, x[validation_points], axis=0)
            y_train = np.append(y_train, y[validation_points])

    # 2. FORMAT THE DATA
    # sanitize and standardize the training data, then apply the same median,
    # mean and variance to the testing data
    x_train = count_NaN(x_train)
    x_test = count_NaN(x_test)
    x_train, median_train = sanitize_NaN(x_train)
    x_test, median_test = sanitize_NaN(x_test, median_train)
    x_train, mean_tr, std_tr = standardize(x_train)
    x_test, mean_te, std_te = standardize(x_test, mean_tr, std_tr)

    # form data with polynomial degree:
    x_train_poly = build_poly(x_train, degree)
    x_test_poly = build_poly(x_test, degree)

    # 3. RUN THE MODEL AND COMPUTE THE ERROR
    # regularized logistic regression:
    w_rlr = regularized_logistic_regression(y_train, x_train_poly, gamma, lambda_,
                                            max_iters)

    # classification error for train and test data: the {0, 1} labels are mapped to
    # {-1, 1} and compared with the predicted labels
    loss_tr = sum(abs((2 * y_train - 1)
                      - predict_labels(w_rlr, x_train_poly))) / (2 * len(y_train))
    loss_te = sum(abs((2 * y_test - 1)
                      - predict_labels(w_rlr, x_test_poly))) / (2 * len(y_test))
    return loss_tr, loss_te
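# The error expression at the end of cross_validation_lr maps the {0, 1} labels to
# {-1, 1} via 2*y - 1 and compares them with the {-1, 1} predictions: each mismatch
# contributes |±2| = 2, so dividing by 2*N yields the misclassification rate. A small
# self-contained check of that identity on hypothetical label vectors:
def _check_classification_error_formula():
    y_true = np.array([1, 0, 1, 1, 0])    # labels in {0, 1}
    y_pred = np.array([1, -1, -1, 1, 1])  # predictions in {-1, 1}
    error = sum(abs((2 * y_true - 1) - y_pred)) / (2 * len(y_true))
    # same as the plain misclassification rate (0.4: two of the five points are wrong)
    assert error == np.mean((2 * y_true - 1) != y_pred)
    return error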
def plot_fitted_curve(y, x, weights, degree, ax):
    """Plot the data points and the fitted polynomial curve on the given axes."""
    ax.scatter(x, y, color='b', s=12, facecolors='none', edgecolors='r')
    xvals = np.arange(min(x) - 0.1, max(x) + 0.1, 0.1)
    tx = build_poly(xvals, degree)
    f = tx.dot(weights)
    ax.plot(xvals, f)
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_title("Polynomial degree " + str(degree))
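# A sketch of how plot_fitted_curve could be used, assuming matplotlib is available and
# that `weights` come from a fit of the same polynomial degree on (x, y):
def demo_plot_fitted_curve(y, x, weights, degree):
    """Sketch: draw the fitted curve on a fresh figure and show it."""
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    plot_fitted_curve(y, x, weights, degree, ax)
    plt.show()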
def cross_validation_log_reg(y, x, k_indices, k, lambda_, degree, method='penalized'):
    """Return the classification accuracy of logistic regression on the k-th fold
    (test score first, then train score)."""
    # get k'th subgroup in test, others in train
    x_test = x[k_indices[k]]
    y_test = y[k_indices[k]]
    ind = np.arange(k_indices.shape[0])
    ind = np.delete(ind, k)
    new_ind = np.ravel(k_indices[ind])
    x_train = x[new_ind]
    y_train = y[new_ind]

    # form data with polynomial degree
    x_test_poly = build_poly(x_test, degree)
    x_train_poly = build_poly(x_train, degree)

    # initialize weights
    initial_w = np.zeros(x_train_poly.shape[1])

    # find weights with (penalized) logistic regression gradient descent
    w = running_gradient(y_train, x_train_poly, initial_w, lambda_, method)

    # calculate the classification accuracy for train and test data
    y_pred = predict_labels(w, x_test_poly)
    test_score = calculate_classification_accuracy(y_test, y_pred)
    y_pred_train = predict_labels(w, x_train_poly)
    train_score = calculate_classification_accuracy(y_train, y_pred_train)
    return test_score, train_score
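# To choose a degree and penalization strength with the routine above, the per-fold
# accuracies can be averaged inside a small grid search. A sketch under the assumption
# that the first returned value (the test accuracy) is the selection criterion:
def demo_grid_search_log_reg(y, x, k_indices, degrees=(1, 2, 3),
                             lambdas=np.logspace(-5, -1, 5)):
    """Sketch: grid search over degree and lambda using cross_validation_log_reg above."""
    k_fold = k_indices.shape[0]
    best_degree, best_lambda, best_acc = None, None, 0.0
    for degree in degrees:
        for lambda_ in lambdas:
            scores = [cross_validation_log_reg(y, x, k_indices, k, lambda_, degree)[0]
                      for k in range(k_fold)]
            if np.mean(scores) > best_acc:
                best_degree, best_lambda, best_acc = degree, lambda_, np.mean(scores)
    return best_degree, best_lambda, best_acc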