예제 #1
0
def run():
    """Run the spam-classification demo end to end.

    Loads a sample e-mail and the vocabulary, prints the e-mail and its
    feature-vector statistics, trains a linear SVM on ``spamTrain.mat``,
    prints training and test accuracy, and finally prints the prediction
    for ``spamSample1.txt``.
    """
    file_path = FileUtils.get_abs_path(__file__, "./data/emailSample1.txt")
    vocab_path = FileUtils.get_abs_path(__file__, "./data/vocab.txt")

    # Context managers guarantee the file handles are closed.
    with open(file_path, "r") as email_file:
        file_contents = email_file.read()
    with open(vocab_path, "r") as vocab_file:
        # The last element after splitting on "\n" is dropped, as before.
        vocab_lines = vocab_file.read().split("\n")[:-1]

    # Each vocabulary line is "<value>\t<key>"; the lookup maps key -> value.
    vocabList_d = {}
    for line in vocab_lines:
        value, key = line.split("\t")
        vocabList_d[key] = value

    print(file_contents)

    word_indices = process_email(file_contents, vocabList_d)
    features = email_features(word_indices, vocabList_d)
    print("Length of feature vector: ", len(features))
    print("Number of non-zero entries: ", np.sum(features))

    spam_mat_path = FileUtils.get_abs_path(__file__, "./data/spamTrain.mat")
    spam_mat = loadmat(spam_mat_path)
    X_train = spam_mat["X"]
    y_train = spam_mat["y"]

    # Use the named regularisation constant instead of repeating the literal.
    C = 0.1
    spam_svc = SVC(C=C, kernel="linear")
    spam_svc.fit(X_train, y_train.ravel())
    print("Training Accuracy:",
          (spam_svc.score(X_train, y_train.ravel())) * 100, "%")

    spam_mat_test_path = FileUtils.get_abs_path(__file__,
                                                "./data/spamTest.mat")
    spam_mat_test = loadmat(spam_mat_test_path)
    X_test = spam_mat_test["Xtest"]
    y_test = spam_mat_test["ytest"]

    print("Test Accuracy:", (spam_svc.score(X_test, y_test.ravel())) * 100,
          "%")

    file_path = FileUtils.get_abs_path(__file__, "./data/spamSample1.txt")
    with open(file_path, "r") as sample_file:
        file_contents = sample_file.read()

    word_indices = process_email(file_contents, vocabList_d)
    features = email_features(word_indices, vocabList_d)
    # A single sample must be a 1 x n_features row; -1 avoids hard-coding
    # the vocabulary size (previously the literal 1899).
    features = features.reshape(1, -1)

    print(spam_svc.predict(features))
    print('1 is spam, 0 is not spam')
예제 #2
0
def run():
    """Print the training-set accuracy obtained with the stored weights.

    Reads ``Theta1``/``Theta2`` from ``ex3weights.mat`` and ``X``/``y`` from
    ``ex3data1.mat``, passes them to ``predict_nn`` and prints the percentage
    of predictions that equal the labels.
    """
    weights = loadmat(
        FileUtils.get_abs_path(__file__, "./data/ex3weights.mat"))
    Theta1, Theta2 = weights['Theta1'], weights['Theta2']

    # Print floats without scientific notation.
    np.set_printoptions(suppress=True)

    samples = loadmat(FileUtils.get_abs_path(__file__, "./data/ex3data1.mat"))
    X, y = samples["X"], samples["y"]

    predictions = predict_nn(Theta1, Theta2, X)
    accuracy = np.mean(predictions == y) * 100
    print("Accuracy on training set with Neural Network:", accuracy)
예제 #3
0
def run():
    """Run the one-vs-all logistic-regression demo.

    Shows a 10 x 10 grid of randomly chosen samples from ``ex3data1.mat``,
    checks ``cost_function_regularized`` against the expected values printed
    alongside, then trains with ``one_vs_all`` and prints the training-set
    accuracy.
    """
    np.set_printoptions(suppress=True)

    data_path = FileUtils.get_abs_path(__file__, "./data/ex3data1.mat")
    mat = loadmat(data_path)
    X = mat["X"]
    y = mat["y"]

    num_samples = X.shape[0]
    _, axis = plt.subplots(10, 10, figsize=(12, 12))
    for i in range(10):
        for j in range(10):
            # randint's upper bound is exclusive, so bounding by the row
            # count always yields a valid index (the former literal 5001
            # could produce 5000, one past the end of a 5000-row matrix).
            sample = X[np.random.randint(0, num_samples), :]
            # Reshape the flat row back to a 20 x 20 pixel image.
            axis[i, j].imshow(sample.reshape(20, 20, order="F"), cmap="hot")
            axis[i, j].axis("off")
    plt.show()

    # Small hand-built problem used to sanity-check the cost function.
    theta_t = str2arr('[-2; -1; 1; 2]')
    X_t = np.array([np.linspace(0.1, 1.5, 15)]).reshape(3, 5).T
    X_t = np.hstack((np.ones((5, 1)), X_t))
    y_t = str2arr('[1;0;1;0;1]')
    lambda_t = 3
    cost, grad = cost_function_regularized(theta_t, X_t, y_t, lambda_t)

    print("Cost:", cost, "Expected cost: 2.534819")
    print(
        "Gradients:\n", grad,
        "\nExpected gradients:\n 0.146561\n -0.548558\n 0.724722\n 1.398003")

    lambda_value = 0.1
    num_labels = 10
    all_theta = one_vs_all(X, y, num_labels, lambda_value)
    res = predict_one_vs_all(all_theta, X)
    print("Accuracy on training set with OneVsAll:", np.mean((res == y)) * 100)
예제 #4
0
def run():
    """Run the logistic-regression demo on ``ex2data1.txt``.

    Evaluates ``cost_function`` at a zero theta and at a fixed test theta,
    minimises it with SciPy's TNC method, then prints the training accuracy
    and draws the decision boundary.
    """
    data_path = FileUtils.get_abs_path(__file__, "./data/ex2data1.txt")
    data = np.loadtxt(data_path, delimiter=',')
    n = np.size(data, 1)
    m = np.size(data, 0)

    # All columns but the last are features; the last column is the label.
    features = data[:, :n - 1]
    y = data[:, n - 1].reshape(m, 1)
    # Prepend the intercept column of ones.
    x = np.hstack([np.ones([m, 1]), features])

    theta = np.zeros([n, 1])
    cost, grad = cost_function(theta, x, y)

    print("Cost with theta [0;0;0]: ", cost)
    print('Theta Result with [0;0;0]:\n', grad)

    test_theta = str2arr('[-24; 0.2; 0.2]')
    cost, grad = cost_function(test_theta, x, y)

    print("Cost with theta [-24; 0.2; 0.2]: ", cost)
    print('Theta Result with [-24; 0.2; 0.2]:\n', grad)

    # scipy.optimize.minimize documents x0 as a 1-D array of shape (n,);
    # pass a flattened start point rather than the (n, 1) column vector.
    Result = op.minimize(fun=cost_function,
                         x0=theta.ravel(),
                         args=(x, y),
                         method='TNC',
                         jac=True)
    optimal_theta = Result.x
    print('Optimal theta: ', optimal_theta)

    res = predict(optimal_theta, x)
    print("Accuracy:", np.mean(((res == y).flatten())) * 100)
    plot_decision_boundary(optimal_theta, x, y)
예제 #5
0
def plot_data():
    """Scatter-plot the two columns of ``ex1data1.txt`` on the current axes.

    The first column is drawn on the x axis and the second on the y axis,
    using red ``x`` markers. The figure is not shown here; the caller is
    expected to call ``plt.show()``.
    """
    data_path = FileUtils.get_abs_path(__file__, "./data/ex1data1.txt")
    data = np.loadtxt(data_path, delimiter=',')
    x = data[:, 0]
    y = data[:, 1]
    # 'red' is a colour, not a colormap name: it belongs in `color`, not
    # `cmap` (which is only used together with per-point values in `c`).
    plt.scatter(x, y, marker='x', color='red')
    plt.xlabel("Population of City in 10,000s")
    plt.ylabel('Profit in $10,000s')
예제 #6
0
def run():
    """Run the SVM demo on the three ``ex6data*.mat`` datasets.

    Dataset 1 is fitted with a linear kernel, dataset 2 with an RBF kernel
    (``gamma=30``), and dataset 3 with the ``C``/``gamma`` pair returned by
    ``dataset_3_params``. Each dataset and fitted classifier is plotted, and
    ``gaussian_kernel`` is printed for one fixed pair of points.
    """
    # --- Dataset 1: linear kernel -------------------------------------
    first = loadmat(FileUtils.get_abs_path(__file__, "./data/ex6data1.mat"))
    X, y = first["X"], first["y"]
    plot_data(X, y)

    linear_clf = SVC(C=1, kernel="linear")
    linear_clf.fit(X, np.ravel(y))
    plot_svc(linear_clf, X)

    # --- Gaussian kernel check on a fixed pair of points ----------------
    point_a = np.array([1, 2, 1])
    point_b = np.array([0, 4, -1])
    sigma = 2
    print(gaussian_kernel(point_a, point_b, sigma))

    # --- Dataset 2: RBF kernel ------------------------------------------
    second = loadmat(FileUtils.get_abs_path(__file__, "./data/ex6data2.mat"))
    y2, X2 = second['y'], second['X']
    plot_data(X2, y2)

    rbf_clf = SVC(kernel='rbf', gamma=30)
    rbf_clf.fit(X2, y2.ravel())
    plot_svc(rbf_clf, X2)

    # --- Dataset 3: parameters chosen with the validation split ---------
    third = loadmat(FileUtils.get_abs_path(__file__, "./data/ex6data3.mat"))
    X3, y3 = third["X"], third["y"]
    Xval, yval = third["Xval"], third["yval"]
    plot_data(X3, y3)

    C, gamma = dataset_3_params(X3, y3, Xval, yval)
    tuned_clf = SVC(C=C, gamma=gamma)
    tuned_clf.fit(X3, y3.ravel())
    plot_svc(tuned_clf, X3)
예제 #7
0
def run():
    """Run the k-means demo: 2-D clustering, then image colour compression.

    Part one assigns the ``ex7data2.mat`` points to three fixed centroids,
    prints the first assignments and the recomputed centroids, then plots
    k-means from random centroids. Part two clusters the pixels of
    ``bird_small.png`` into 16 colours and shows the original and the
    compressed image side by side.
    """
    # ---- Part 1: clustering the 2-D points ------------------------------
    points = loadmat(FileUtils.get_abs_path(__file__, "./data/ex7data2.mat"))
    X = points["X"]
    K = 3

    start_centroids = np.array([[3, 3], [6, 2], [8, 5]])
    assignments = find_closest_centroids(X, start_centroids)
    print("Closest centroids for the first 3 examples:\n", assignments[0:3])

    centroids = compute_centroids(X, assignments, K)
    print("Centroids computed after initial finding of closest centroids:\n",
          centroids)

    # Read for parity with the original code; not used below.
    m, n = X.shape[0], X.shape[1]

    start_centroids = init_random_centroid(X, K)
    assignments = find_closest_centroids(X, start_centroids)
    plot_kmeans(X, start_centroids, assignments, K, 10)
    plt.show()

    # ---- Part 2: compressing the image ----------------------------------
    image_path = FileUtils.get_abs_path(__file__, "./data/bird_small.png")
    A = plt.imread(image_path)
    A /= 255
    height, width, _ = A.shape
    # One row per pixel, one column per colour channel.
    X2 = A.reshape(height * width, 3)

    K2 = 16
    num_iters = 10
    start_centroids2 = init_random_centroid(X2, K2)
    centroids2, idx2 = run_kmeans(X2, start_centroids2, num_iters, K2)

    # Replace every pixel by its centroid colour and restore the image shape.
    X2_recovered = centroids2[idx2, :].reshape(A.shape)

    _, panels = plt.subplots(1, 2, figsize=(8, 4))
    left, right = panels[0], panels[1]

    # Both images are scaled back by 255 before display.
    left.imshow(A * 255)
    left.set_title('Original')
    left.grid(False)

    right.imshow(X2_recovered * 255)
    right.set_title('Compressed, with %d colors' % K2)
    right.grid(False)

    plt.show()
예제 #8
0
def run():
    """Run the univariate linear-regression demo on ``ex1data1.txt``.

    Prints the cost at a zero theta, runs ``gradient_descent``, prints the
    cost at ``[-1; 2]``, then plots the fitted line over the data, a contour
    plot of the cost over a theta grid, and the matching 3-D surface.
    """
    data_path = FileUtils.get_abs_path(__file__, "./data/ex1data1.txt")
    data = np.loadtxt(data_path, delimiter=',')
    n = np.size(data, 1)
    m = np.size(data, 0)

    # All columns but the last are features; the last column is the target.
    features = data[:, :n - 1]
    y = data[:, n - 1].reshape(m, 1)
    # Prepend the intercept column of ones.
    x = np.hstack([np.ones([m, 1]), features])

    theta = np.zeros([n, 1])
    alpha = 0.01
    iterations = 1500
    cost = cost_function_j(x, y, theta)
    print('Cost', cost)

    thetaRes, j_hist = gradient_descent(x, y, theta, alpha, iterations)
    print(thetaRes)

    cost = cost_function_j(x, y, str2arr('[-1;2]'))
    print(cost)

    # Evaluate the cost on a grid: J_vals[i, j] <-> (theta0[i], theta1[j]).
    theta0_vals = np.linspace(-10, 10, 100)
    theta1_vals = np.linspace(-1, 4, 100)
    J_vals = np.zeros([len(theta0_vals), len(theta1_vals)])
    for i, theta0 in enumerate(theta0_vals):
        for j, theta1 in enumerate(theta1_vals):
            t = np.vstack([theta0, theta1])
            J_vals[i, j] = cost_function_j(x, y, t)

    pltData.plot_data()
    plt.plot(x[:, 1], x @ thetaRes, '-', color='red')

    # matplotlib expects Z indexed as [y, x]; with theta0 on the x axis the
    # grid must be transposed for both the contour and the surface plot.
    J_vals_yx = np.transpose(J_vals)

    fig1 = plt.figure()
    ax = fig1.add_subplot(111)
    ax.contour(theta0_vals, theta1_vals, J_vals_yx, np.logspace(-2, 3, 20))

    fig2 = plt.figure()
    ax2 = fig2.add_subplot(111, projection='3d')
    theta0_grid, theta1_grid = np.meshgrid(theta0_vals, theta1_vals)
    ax2.plot_surface(theta0_grid, theta1_grid, J_vals_yx)
    plt.show()
예제 #9
0
def run():
    """Run the anomaly-detection demo on ``ex8data1.mat``.

    Plots the data, estimates Gaussian parameters with ``estimate_gaussian``,
    visualises the fit, picks a threshold from the validation set with
    ``select_threshold``, prints it with its F1 score, and circles the
    training points whose density falls below the threshold.
    """

    def decorate_axes(upper):
        # Shared axis limits and labels for both scatter plots.
        plt.xlim(0, upper)
        plt.ylim(0, upper)
        plt.xlabel("Latency (ms)")
        plt.ylabel("Throughput (mb/s)")

    dataset = loadmat(FileUtils.get_abs_path(__file__, "./data/ex8data1.mat"))
    X, Xval, yval = dataset["X"], dataset["Xval"], dataset["yval"]

    plt.scatter(X[:, 0], X[:, 1], marker="x")
    decorate_axes(30)
    plt.show()

    mu, sigma2 = estimate_gaussian(X)
    p = multivariate_gaussian(X, mu, sigma2)
    visualize_fit(X, mu, sigma2)

    pval = multivariate_gaussian(Xval, mu, sigma2)
    epsilon, F1 = select_threshold(yval, pval)
    print("Best epsilon found using cross-validation:", epsilon)
    print("Best F1 on Cross Validation Set:", F1)

    # Row indices of the training points flagged as anomalies.
    outliers = np.nonzero(p < epsilon)[0]
    flagged = X[outliers]
    plt.scatter(flagged[:, 0],
                flagged[:, 1],
                marker="o",
                facecolor="none",
                edgecolor="r",
                s=70)
    decorate_axes(35)

    plt.show()
예제 #10
0
def run():
    """Run the multivariate linear-regression demo on ``ex1data2.txt``.

    Normalises the features, fits theta by gradient descent, prints a price
    prediction for a 1650 sq ft, 3 bedroom house, plots the cost history,
    then repeats the prediction using ``normal_equation`` on the
    un-normalised features.
    """
    data_path = FileUtils.get_abs_path(__file__, "./data/ex1data2.txt")
    data = np.loadtxt(data_path, delimiter=',')
    n = np.size(data, 1)
    m = np.size(data, 0)

    # All columns but the last are features; the last column is the target.
    x = data[:, :n - 1]
    y = data[:, n - 1].reshape(m, 1)
    ones = np.ones([m, 1])

    # X holds the normalised features (for gradient descent); x keeps the
    # raw features (for the normal equation). Both get an intercept column.
    X, mu, sigma = feature_normalize(x)
    x = np.hstack([ones, x])
    X = np.hstack([ones, X])

    theta = np.zeros([n, 1])
    alpha = 0.01
    iterations = 400

    cost = cost_function_j(X, y, theta)
    print('Cost', cost)

    thetaRes, j_hist = gradient_descent(X, y, theta, alpha, iterations)
    print('Theta using gradient descent:\n', thetaRes)

    # The query must match the message and the normal-equation check below
    # (1650 sq ft, 3 bedrooms); the previous value 1653 was a typo.
    print('Price of 1650 sq ft and 3 bedroom house: ',
          predict([[1650, 3]], thetaRes, mu, sigma))

    # Tie the x axis to the iteration count instead of repeating the literal.
    plt.plot(range(iterations), j_hist)
    plt.xlabel("No of iterations")
    plt.ylabel("Cost")
    plt.title("Gradient Descent")
    plt.show()

    thetaRes = normal_equation(x, y)
    print('Theta using normal equation: \n', thetaRes)

    cost = thetaRes.T @ np.array([[1], [1650], [3]])
    print('Price of 1650 sq ft and 3 bedroom house: ', cost[0][0])
예제 #11
0
def run():
    """Run the collaborative-filtering movie-recommender demo.

    Prints the average rating of the first movie, evaluates
    ``cofi_cost_function`` on a small slice of the loaded parameters (with
    lambda 0 and 1.5), adds a hand-written rating column for a new user,
    trains the model with SciPy's TNC method, and prints the ten highest
    predicted ratings for that user.
    """
    data_path = FileUtils.get_abs_path(__file__, "./data/ex8_movies.mat")
    mat3 = loadmat(data_path)

    data_path = FileUtils.get_abs_path(__file__, "./data/ex8_movieParams.mat")
    mat4 = loadmat(data_path)

    # 1682 x 943 matrix of ratings (1-5) of 1682 movies by 943 users.
    Y = mat3["Y"]
    # 1682 x 943 matrix; R[i, j] = 1 iff user j rated movie i.
    R = mat3["R"]
    # 1682 x 10 matrix (num_movies x num_features) of movie features.
    X = mat4["X"]
    # 943 x 10 matrix (num_users x num_features) of user features.
    Theta = mat4["Theta"]

    # Average over the users that actually rated the first movie.
    print("Average rating for movie 1 (Toy Story):",
          np.sum(Y[0, :] * R[0, :]) / np.sum(R[0, :]), "/5")

    # Reduce the data set size so the cost check runs faster.
    num_users, num_movies, num_features = 4, 5, 3
    X_test = X[:num_movies, :num_features]
    Theta_test = Theta[:num_users, :num_features]
    Y_test = Y[:num_movies, :num_users]
    R_test = R[:num_movies, :num_users]
    params = np.append(X_test.flatten(), Theta_test.flatten())

    # Evaluate the cost function without and with regularisation.
    J, grad = cofi_cost_function(params, Y_test, R_test, num_users, num_movies,
                                 num_features, 0)
    print("Cost at loaded parameters:", J)
    J2, grad2 = cofi_cost_function(params, Y_test, R_test, num_users,
                                   num_movies, num_features, 1.5)
    print("Cost at loaded parameters (lambda = 1.5):", J2)

    # Load the movie list; the context manager closes the file handle.
    data_path = FileUtils.get_abs_path(__file__, "./data/movie_ids.txt")
    with open(data_path, "r") as movie_file:
        movieList = movie_file.read().split("\n")[:-1]

    # Ratings of the new user; 0 means "not rated".
    my_ratings = np.zeros((1682, 1))
    my_ratings[0] = 4
    my_ratings[97] = 2
    my_ratings[6] = 3
    my_ratings[11] = 5
    my_ratings[53] = 4
    my_ratings[63] = 5
    my_ratings[65] = 3
    my_ratings[68] = 5
    my_ratings[82] = 4
    my_ratings[225] = 5
    my_ratings[354] = 5
    print("New user ratings:\n")
    # Iterate over scalars so int() is not applied to a 1-element array.
    for i, rating in enumerate(my_ratings[:, 0]):
        if rating > 0:
            print("Rated", int(rating), "for index", movieList[i])

    # The new user becomes column 0 of both matrices.
    Y = np.hstack((my_ratings, Y))
    R = np.hstack((my_ratings != 0, R))
    # Normalize ratings.
    Ynorm, Ymean = normalize_ratings(Y, R)

    num_users = Y.shape[1]
    num_movies = Y.shape[0]
    num_features = 10
    # Random initial parameters (X, Theta), flattened into one vector.
    X = np.random.randn(num_movies, num_features)
    Theta = np.random.randn(num_users, num_features)
    initial_parameters = np.append(X.flatten(), Theta.flatten())
    Lambda = 10

    options = {'maxiter': 100}
    result = op.minimize(fun=cofi_cost_function,
                         x0=initial_parameters,
                         args=(Ynorm, R, num_users, num_movies, num_features,
                               Lambda),
                         method='TNC',
                         jac=True,
                         options=options)
    paramsFinal = result.x

    # Unpack the flat solution back into the two parameter matrices.
    X = paramsFinal[0:num_movies * num_features].reshape(
        num_movies, num_features)
    Theta = paramsFinal[num_movies * num_features:].reshape(
        num_users, num_features)

    p = X @ Theta.T
    # Column 0 is the new user; add the mean back that normalisation removed.
    my_predictions = p[:, 0][:, np.newaxis] + Ymean

    # Keep the predictions numeric. Stacking them with the titles in one
    # array would turn them into strings, and the sort below would then be
    # lexicographic instead of numeric.
    df = pd.DataFrame({0: my_predictions[:, 0], 1: movieList})
    df.sort_values(by=[0], ascending=False, inplace=True)
    df.reset_index(drop=True, inplace=True)
    print("Top recommendations for you:\n")
    for i in range(10):
        print("Predicting rating", round(float(df[0][i]), 1), " for index",
              df[1][i])
예제 #12
0
def run():
    """Run the neural-network training demo on ``ex4data1.mat``.

    Prints ``nn_cost_function`` for the stored weights with lambda 0 and 1,
    then trains twice from the same random initial weights (lambda 1, then
    lambda 2) with SciPy's TNC method, printing the weight shapes and the
    training-set accuracy after each run.
    """
    np.set_printoptions(suppress=True)

    samples = loadmat(FileUtils.get_abs_path(__file__, "./data/ex4data1.mat"))
    X = np.array(samples["X"])
    y = np.array(samples["y"])

    stored = loadmat(FileUtils.get_abs_path(__file__, "./data/ex4weights.mat"))
    Theta1 = np.array(stored['Theta1'])
    Theta2 = np.array(stored['Theta2'])
    nn_params = np.append(Theta1.flatten(), Theta2.flatten())

    input_layer_size = 400
    hidden_layer_size = 25
    num_labels = 10

    # Cost of the stored weights, first without and then with regularisation.
    for lambda_value in (0, 1):
        cost = nn_cost_function(nn_params, input_layer_size,
                                hidden_layer_size, num_labels, X, y,
                                lambda_value)[0]
        print(cost)

    initial_Theta1 = random_init_weights(input_layer_size, hidden_layer_size)
    initial_Theta2 = random_init_weights(hidden_layer_size, num_labels)
    nn_params_rand = np.append(initial_Theta1.flatten(),
                               initial_Theta2.flatten())

    # Number of entries of the flat vector that belong to Theta1.
    theta1_size = hidden_layer_size * (input_layer_size + 1)

    # Both runs start from the same random parameters.
    for lambda_value in (1, 2):
        options = {'maxiter': 100}
        result = op.minimize(fun=nn_cost_function,
                             x0=nn_params_rand,
                             args=(input_layer_size, hidden_layer_size,
                                   num_labels, X, y, lambda_value),
                             method='TNC',
                             jac=True,
                             options=options)
        optimal_theta = result.x

        # Unpack the flat solution into the two weight matrices.
        Theta1 = np.reshape(optimal_theta[0:theta1_size],
                            [hidden_layer_size, (input_layer_size + 1)])
        Theta2 = np.reshape(optimal_theta[theta1_size:],
                            [num_labels, (hidden_layer_size + 1)])

        print(Theta1.shape)
        print(Theta2.shape)

        res = predict_nn(Theta1, Theta2, X)
        print("Accuracy on training set with Neural Network:",
              np.mean((res == y)) * 100)
예제 #13
0
def run():
    """Run the regularised linear-regression / bias-variance demo.

    Using ``ex5data1.mat``: plots the data, checks ``linear_reg_cost`` at
    theta = [1; 1], fits and plots a straight line, plots its learning
    curve, then repeats the fit and learning curve with degree-8 polynomial
    features and finally plots training/validation error against lambda.
    """
    data_path = FileUtils.get_abs_path(__file__, "./data/ex5data1.mat")
    mat = loadmat(data_path)
    X = mat["X"]
    y = mat["y"]
    Xval = mat["Xval"]
    yval = mat["yval"]
    m = X.shape[0]

    plt.scatter(X, y)
    plt.xlabel("Change in water level (x)")
    plt.ylabel("Water flowing out of dam (y)")
    plt.show()

    # Prepend the intercept column to the training and validation inputs.
    x = np.hstack([np.ones([m, 1]), X])
    Xval_ones = np.hstack([np.ones([np.size(Xval, 0), 1]), Xval])

    theta = np.array([[1], [1]])
    J, grad = linear_reg_cost(theta, x, y, 1)
    # The stray '%' (left over from a printf-style format) was removed.
    print('Cost at theta = [1 ; 1]: ', J,
          '\n(this value should be about 303.993192)\n')

    print('Gradient at theta = [1 ; 1]: ', grad,
          '\n(this value should be about [-15.303016; 598.250744])\n')

    lambda_value = 0
    theta = train_linear_reg(x, y, lambda_value)
    plt.scatter(X, y)
    plt.xlabel("Change in water level (x)")
    plt.ylabel("Water flowing out of dam (y)")
    plt.plot(x[:, 1], x @ theta, '-', color='red')
    plt.show()

    lambda_value = 0
    error_train, error_val = learning_curve(x, y, Xval_ones, yval,
                                            lambda_value)
    # Plot against the sample count 1..m so the x axis matches its label
    # and the polynomial learning curve further down.
    plt.plot(range(1, m + 1), error_val, '-', color='red')
    plt.plot(range(1, m + 1), error_train, '-', color='blue')
    plt.title("Learning Curve")
    plt.legend(['error_val', 'error_train'])
    plt.ylabel("Error")
    plt.xlabel("No of samples")
    plt.show()

    # Polynomial features, normalised with the training-set statistics.
    p = 8
    x_poly = poly_features(X, p)
    x_poly, mu, sigma = feature_normalize(x_poly)
    x_poly = np.hstack([np.ones([m, 1]), x_poly])

    # The validation features reuse the training mu/sigma.
    X_poly_val = poly_features(Xval, p)
    X_poly_val = (X_poly_val - mu) / sigma
    m_poly_val = np.size(X_poly_val, 0)
    X_poly_val = np.hstack([np.ones([m_poly_val, 1]), X_poly_val])

    print('Normalized Training Example 1:\n')
    print(x_poly[0, :])

    lambda_value = 0
    theta = train_linear_reg(x_poly, y, lambda_value)
    plt.scatter(X, y, color='red')

    plot_fit(min(x[:, 1]), max(x[:, 1]), mu, sigma, theta, p)
    plt.title("Polynomial Features Fitting")
    plt.show()

    error_train, error_val = learning_curve(x_poly, y, X_poly_val, yval,
                                            lambda_value)

    plt.plot(range(1, m + 1), error_val, '-', color='red')
    plt.plot(range(1, m + 1), error_train, '-', color='blue')
    plt.title("Learning Curve for Polynomial Features")
    plt.legend(['error_val', 'error_train'])
    plt.ylabel("Error")
    plt.xlabel("No of samples")
    plt.show()

    lambda_vec, error_train, error_val = validation_curve(
        x_poly, y, X_poly_val, yval)

    plt.plot(lambda_vec, error_val, '-', color='red')
    plt.plot(lambda_vec, error_train, '-', color='blue')
    plt.title("Lambda vs Error for Polynomial Features")
    plt.legend(['error_val', 'error_train'])
    plt.ylabel("Error")
    plt.xlabel("Lambda")
    plt.show()
예제 #14
0
def run():
    """Run the PCA demo on a 2-D dataset and on the faces dataset.

    Part one (``ex7data1.mat``) draws the two principal directions over the
    data, projects the normalised data onto one component and plots the
    reconstruction. Part two (``ex7faces.mat``) shows the first 100 faces,
    the first 36 principal components as images, and the faces recovered
    from the projection returned by ``project_data_optimal_K``.
    """
    data_path = FileUtils.get_abs_path(__file__, "./data/ex7data1.mat")
    mat3 = loadmat(data_path)
    X3 = mat3["X"]

    X_norm, mu, std = feature_normalize(X3)
    U, S = pca(X_norm)[:2]

    # The data is scattered once; the original drew the same points twice
    # on the same axes.
    plt.scatter(X3[:, 0], X3[:, 1], marker="o", facecolors="none",
                edgecolors="b")
    # One segment per principal direction, starting at the mean.
    for k in range(2):
        tip = mu + 1.5 * S[k] * U[:, k].T
        plt.plot([mu[0], tip[0]], [mu[1], tip[1]], color="black",
                 linewidth=3)
    plt.xlim(-1, 7)
    plt.ylim(2, 8)
    plt.show()

    print("Top eigenvector U(:,1) =:", U[:, 0])

    K = 1
    Z = project_data(X_norm, U, K)
    print("Projection of the first example:", Z[0][0])

    X_rec = recover_data(Z, U, K)
    print("Approximation of the first example:", X_rec[0, :])

    plt.scatter(X_norm[:, 0], X_norm[:, 1], marker="o", label="Original",
                facecolors="none", edgecolors="b", s=15)
    plt.scatter(X_rec[:, 0], X_rec[:, 1], marker="o", label="Approximation",
                facecolors="none", edgecolors="r", s=15)
    plt.title("The Normalized and Projected Data after PCA")
    plt.legend()
    plt.show()

    data_path = FileUtils.get_abs_path(__file__, "./data/ex7faces.mat")
    mat4 = loadmat(data_path)
    X4 = mat4["X"]
    m, n = X4.shape
    print(n)

    # First 100 rows, each reshaped to a 32 x 32 image, on a 10 x 10 grid.
    fig, ax = plt.subplots(nrows=10, ncols=10, figsize=(30, 30))
    for k in range(100):
        cell = ax[k // 10, k % 10]
        cell.imshow(X4[k, :].reshape(32, 32, order="F"), cmap="gray")
        cell.axis("off")
    plt.show()

    X_norm2 = feature_normalize(X4)[0]
    # Run PCA; slice like the first call so both calls accept the same
    # return shape (a bare 2-tuple unpack fails if pca returns more values).
    U2, S = pca(X_norm2)[:2]

    # First 36 principal components, one per row, shown as images.
    U_reduced = U2[:, :36].T
    fig2, ax2 = plt.subplots(6, 6, figsize=(12, 12))
    for k in range(36):
        cell = ax2[k // 6, k % 6]
        cell.imshow(U_reduced[k, :].reshape(32, 32, order="F"), cmap="gray")
        cell.axis("off")
    plt.show()

    Z2, K2 = project_data_optimal_K(X_norm2, U2, S)
    print("The projected data Z has a size of:", Z2.shape)
    X_rec2 = recover_data(Z2, U2, K2)

    # First 100 recovered faces on a 10 x 10 grid.
    fig3, ax3 = plt.subplots(10, 10, figsize=(20, 20))
    for k in range(100):
        cell = ax3[k // 10, k % 10]
        cell.imshow(X_rec2[k, :].reshape(32, 32, order="F"), cmap="gray")
        cell.axis("off")
    plt.show()