Example #1
def global_average():
    # allocate memory for results:
    err_train = np.zeros(nfolds)
    err_test = np.zeros(nfolds)
    mae_train = np.zeros(nfolds)
    mae_test = np.zeros(nfolds)

    print("Naiv Approach_1_:_Global_Average")
    print("_________________________________")
    print("\n")
    start = t.start()

    # for each fold:
    for fold in range(nfolds):
        train_set = np.array([x != fold for x in seqs])
        test_set = np.array([x == fold for x in seqs])

        train = ratings[train_set]
        test = ratings[test_set]

        # First naive approach: predict every rating with the training fold's global mean.
        # calculate model parameters: mean rating over the training set:
        gmr = np.mean(train[:, 2])

        # apply the model to the train set:
        err_train[fold] = np.sqrt(np.mean((train[:, 2] - gmr) ** 2))

        # apply the model to the test set:
        err_test[fold] = np.sqrt(np.mean((test[:, 2] - gmr) ** 2))

        mae_train[fold] = np.mean(np.abs(train[:, 2] - gmr))
        mae_test[fold] = np.mean(np.abs(test[:, 2] - gmr))

        # print errors:
        print("Fold " + str(fold) + ": RMSE_train=" + str(err_train[fold]) + "; RMSE_test=" + str(err_test[fold]))

        elapsed = t.start() - start

        mem_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

    # print the final conclusion:
    print("\n")
    print("Mean error on TRAIN: " + str(np.mean(err_train)))
    print("Mean error on  TEST: " + str(np.mean(err_test)))
    print("MAE on TRAIN: " + str(np.mean(mae_train)))
    print("MAE on  TEST: " + str(np.mean(mae_test)))
    print("Time: " + str(elapsed % 60) + " seconds")
    print("Memory: " + str(mem_usage) + " kilobytes")
    print("\n")
    print("Global Average :" + str(gmr))
    print("=============================================================")
    print("=============================================================")
    print("\n")
Example #2
def user_item_average():

    start = t.start()
    # shuffle once, before any fold is drawn, so the nfolds train/test splits stay disjoint:
    np.random.shuffle(ratings)
    # for each fold:
    for fold in range(nfolds):

        train_set = np.array([x != fold for x in seqs])
        test_set = np.array([x == fold for x in seqs])

        train = ratings[train_set]
        test = ratings[test_set]

        # Feature matrices: column 0 is meant to hold each record's user-average
        # rating and column 1 its item-average rating, both computed on the
        # training fold. The snippet only allocates them; see the sketch after
        # this example for one way to fill them in before fitting the regression.
        train_avg_rating = np.zeros((len(train), 2))
        test_avg_rating = np.zeros((len(test), 2))

        regr = linear_model.LinearRegression()

        regr.fit(train_avg_rating, train[:, 2])

        train_reg_pre = rounder.rounder(regr.coef_[0] * train_avg_rating[:, 0] + regr.coef_[1] * train_avg_rating[:, 1] + regr.intercept_)
        test_reg_pre = rounder.rounder(regr.coef_[0] * test_avg_rating[:, 0] + regr.coef_[1] * test_avg_rating[:, 1] + regr.intercept_)

        regr_rmse_error_train = np.sqrt(np.mean((train[:, 2] - train_reg_pre) ** 2))
        regr_rmse_error_test = np.sqrt(np.mean((test[:, 2] - test_reg_pre) ** 2))

        regr_mae_error_train = np.mean(np.absolute(train[:, 2] - train_reg_pre))
        regr_mae_error_test = np.mean(np.absolute(test[:, 2] - test_reg_pre))

        print("Coefficients:", regr.coef_, regr.intercept_)
        elapsed = t.start() - start
        mem_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # print errors:
    print("Mean error on TRAIN (RMSE):", regr_rmse_error_train)
    print("Mean error on  TEST (RMSE):", regr_rmse_error_test)
    print("Mean error on TRAIN (MAE):", regr_mae_error_train)
    print("Mean error on  TEST (MAE):", egr_mae_error_test)

    print("Time: " + str(elapsed % 60) + " seconds")
    print("Memory: " + str(mem_usage) + " kilobytes")
Example #3
def user_average():
    ratings_df = pd.DataFrame(ratings, columns=['user_id', 'movie_id', 'rating'], dtype=int)

    # overall mean of the per-user average ratings (over the whole data set):
    mean_user_all = np.mean(ratings_df.groupby(['user_id'])['rating'].mean())

    # allocate memory for results:
    err_train = np.zeros(nfolds)
    err_test = np.zeros(nfolds)
    mae_train = np.zeros(nfolds)
    mae_test = np.zeros(nfolds)

    print("Naiv Approach_2_:_User_Average")
    print("_________________________________")
    print("\n")

    start = t.start()
    # for each fold:
    for fold in range(nfolds):
        train_sel = np.array([x != fold for x in seqs])
        test_sel = np.array([x == fold for x in seqs])
        train = ratings[train_sel]
        test = ratings[test_sel]

        # make DataFrames for train and test
        train_df = pd.DataFrame(ratings_df.iloc[train_sel],
                                columns=['user_id', 'movie_id', 'rating'],
                                dtype=int)  # .iloc accepts the boolean numpy mask directly

        test_df = pd.DataFrame(ratings_df.iloc[test_sel],
                               columns=['user_id', 'movie_id', 'rating'],
                               dtype=int)

        # Count how many ratings each user has in the train and test folds.
        times_u_train = np.bincount(train_df['user_id'])
        times_u_test = np.bincount(test_df['user_id'])

        # Per-user mean ratings computed on the training fold.
        mean_u_train = np.array(train_df.groupby(['user_id'])['rating'].mean())

        # Repeat each user's mean so the vector lines up row by row with the
        # train/test records (assumes each fold is sorted by user_id and that
        # every user appears in it).
        m_utrain_rep = np.repeat(mean_u_train, times_u_train[1:len(times_u_train)])
        m_utest_rep = np.repeat(mean_u_train, times_u_test[1:len(times_u_test)])

        # apply the model to the train set:
        err_train[fold] = np.sqrt(np.mean((train_df.iloc[:, 2] - m_utrain_rep) ** 2))
        mae_train[fold] = np.mean(np.absolute(train_df.iloc[:, 2] - m_utrain_rep))

        # apply the model to the test set:
        err_test[fold] = np.sqrt(np.mean((test_df.iloc[:, 2] - m_utest_rep) ** 2))
        mae_test[fold] = np.mean(np.absolute(test_df.iloc[:, 2] - m_utest_rep))

        # print errors for each fold:
        print("Fold " + str(fold) + ": RMSE_train=" + str(err_train[fold]) + "; RMSE_test=" + str(err_test[fold]))

        elapsed = t.start() - start
        mem_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # print the final conclusion:

    print("\n")
    print("Mean error on TRAIN: " + str(np.mean(err_train)))
    print("Mean error on  TEST: " + str(np.mean(err_test)))
    print("Mean error on TRAIN (MAE): " + str(np.mean(mae_train)))
    print("Mean error on Test (MAE): " + str(np.mean(mae_test)))
    print("Time: " + str(elapsed % 60) + " seconds")
    print("Memory: " + str(mem_usage) + " kilobytes")
    print("\n")
    print("Mean of all user ratings is : " + str(mean_user_all))
    print("=============================================================")
    print("=============================================================")
    print("\n")
Example #4
def mf_gradient_descent():
    """
    Matrix factorization with gradient descent
    :param data:
    :param users:
    :param movies:
    :return:
    """
    num_factors = 10
    steps = 75
    learn_rate = 0.005
    regularization = 0.05  # lambda

    users = np.max(ratings[:, 0])
    movies = np.max(ratings[:, 1])

    start = t.start()

    for fold in range(nfolds):
        print("fold", fold)

        train_set = np.array([ratings[x] for x in np.arange(len(ratings)) if (x % nfolds) != fold])
        test_set = np.array([ratings[x] for x in np.arange(len(ratings)) if (x % nfolds) == fold])

        # Convert the data set to the IxJ matrix  
        x_data = splitmatrix.split_matrix(train_set, users, movies)

        x_hat = np.zeros((users, movies))  # the matrix of predicted ratings

        E = np.zeros((users, movies))  # the error values

        # initialize to random matrices
        U = np.random.rand(users, num_factors)
        M = np.random.rand(num_factors, movies)

        elapsed = 0

        for step in np.arange(steps):
            start = t.start()

            for idx in np.arange(len(train_set)):

                user_id = train_set[idx, 0] - 1
                item_id = train_set[idx, 1] - 1
                actual = train_set[idx, 2]

                error = actual - np.sum(U[user_id, :] * M[:, item_id])

                # Update U and M
                for k in np.arange(num_factors):
                    U[user_id, k] += learn_rate * (2 * error * M[k, item_id] - regularization * U[user_id, k])
                    M[k, item_id] += learn_rate * (2 * error * U[user_id, k] - regularization * M[k, item_id])

            elapsed += t.start() - start

            x_hat = np.dot(U, M)
            E = x_data - x_hat
            intermediate_error = np.sqrt(np.mean(E[~np.isnan(E)] ** 2))

            print("Iteration", step, "out of", steps, "done. Error:", intermediate_error)

            mem_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

    # Apply the learned U and M one last time (on the last fold's split) and report the result:
    x_hat = np.dot(U, M)

    x_train = splitmatrix.split_matrix(train_set, users, movies)
    x_test = splitmatrix.split_matrix(test_set, users, movies)

    e_train = x_train - x_hat
    e = x_test - x_hat

    MF_error_train = np.sqrt(np.mean(e_train[~np.isnan(e_train)] ** 2))
    MF_error_test = np.sqrt(np.mean(e[~np.isnan(e)] ** 2))

    print('Error on MF-GD training set :', MF_error_train)
    print('Error on MF-GD test set:', MF_error_test)
    print("Time: " + str(elapsed % 60) + " seconds")
    print("Memory: " + str(mem_usage) + " kilobytes")