Exemplo n.º 1
0
def main():
    """Fit a low-rank model on the training ratings and print both RMSEs.

    Reads the training and testing rating CSV files, pivots each one into a
    matrix with one row per ``user_id`` and one column per ``movie_id``,
    factorizes the training matrix and prints the RMSE of the reconstructed
    ratings against the training matrix and against the testing matrix.
    """
    # Read both running lists of ratings up front.
    training_rows = pd.read_csv('movie_ratings_data_set_training.csv')
    testing_rows = pd.read_csv('movie_ratings_data_set_testing.csv')

    # Both lists are pivoted the same way; duplicated entries keep the maximum.
    pivot_options = {
        'index': 'user_id',
        'columns': 'movie_id',
        'aggfunc': np.max,
    }
    training_matrix_df = pd.pivot_table(training_rows, **pivot_options)
    testing_matrix_df = pd.pivot_table(testing_rows, **pivot_options)

    # Latent features are learned from the training matrix only.
    factorize = matrix_factorization_utilities.low_rank_matrix_factorization
    user_features, movie_features = factorize(training_matrix_df.values,
                                              num_features=11,
                                              regularization_amount=1.1)

    # Multiplying the two factors yields every predicted rating.
    reconstructed = np.matmul(user_features, movie_features)

    # Score the same reconstruction against both matrices before printing.
    compute_rmse = matrix_factorization_utilities.RMSE
    rmse_training = compute_rmse(training_matrix_df.values, reconstructed)
    rmse_testing = compute_rmse(testing_matrix_df.values, reconstructed)

    print("Training RMSE: {}".format(rmse_training))
    print("Testing RMSE: {}".format(rmse_testing))
Exemplo n.º 2
0
    def stratLearn(self):
        """
        Learn purchase predictions for every user by matrix factorization.

        First builds a pivot table of users, products and (normalized)
        purchases, then factorizes it into two matrices:
        U - user features
        P - product features
        The predicted purchases are obtained by multiplying U and P.

        Side effects: sets ``self.user_product_purch_df_normalized`` and
        pickles U, the transposed P and the predicted purchases to
        ``user_features.dat``, ``product_features.dat`` and
        ``predicted_purchases.dat`` in the current working directory.
        """
        self.setDataFrames()

        # Normalize quantity.
        self.user_product_purch_df_normalized = self.normalize_quantity()

        purchases = pd.pivot_table(self.user_product_purch_df_normalized,
                                   index='user_num',
                                   columns='product_num',
                                   aggfunc=np.max)
        # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
        # same ndarray on old and new pandas versions.
        purchases_matrix = purchases.values

        U, P = matrix_factorization_utilities.low_rank_matrix_factorization(
            purchases_matrix, num_features=15, regularization_amount=3.6)
        predicted_purchases = np.matmul(U, P)
        # Transposed only after the multiplication above, so the saved product
        # features have one row per product.
        P = np.transpose(P)

        # Context managers make sure every file is flushed and closed; the
        # original left the handles to the garbage collector.
        with open("user_features.dat", "wb") as features_file:
            pickle.dump(U, features_file)
        with open("product_features.dat", "wb") as features_file:
            pickle.dump(P, features_file)
        with open("predicted_purchases.dat", "wb") as predictions_file:
            pickle.dump(predicted_purchases, predictions_file)

        # NOTE(review): the RMSE is computed but not used in the visible code;
        # kept in case the remainder of the method is outside this view.
        rmse = matrix_factorization_utilities.RMSE(purchases_matrix,
                                                   predicted_purchases)
Exemplo n.º 3
0
def recommendations(user_id_to_search):
    """Recommend unreviewed movies to a user from factorized review ratings.

    Args:
        user_id_to_search: id of the user, matched against ``user_id_id`` of
            the ``recommendation_system_reviews`` table.

    Returns:
        ``(0, 0)`` when the user has not reviewed any movie. Otherwise a
        tuple ``(reviewed_titles, recommended)`` where ``reviewed_titles`` is
        the list of ``title`` values of the movies the user reviewed and
        ``recommended`` is the list of ``content`` values of the (at most)
        two unreviewed movies with the highest predicted rating.
    """
    # A single connection serves every query and is always closed, even when
    # a query fails (the original opened two connections and closed neither).
    conn = sqlite3.connect("db.sqlite3")
    try:
        # Load user ratings.
        raw_dataset_df = pd.read_sql(
            "select user_id_id as userid,movie_id_id as movieid,rating from recommendation_system_reviews;",
            conn)
        # Load movie titles.
        movies_df = pd.read_sql("select * from blog_movie;",
                                conn,
                                index_col="id")
    finally:
        conn.close()

    # Users without any review cannot be factorized: report "nothing".
    reviewed_movies_df = raw_dataset_df[raw_dataset_df['userid'] ==
                                        user_id_to_search]
    if reviewed_movies_df.empty:
        return 0, 0

    # Convert user ratings into a matrix.
    ratings_df = pd.pivot_table(raw_dataset_df,
                                index='userid',
                                columns='movieid',
                                aggfunc=np.max)

    # Apply matrix factorization to find the latent features.
    # DataFrame.as_matrix() was removed in pandas 1.0; use .values instead.
    U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
        ratings_df.values, num_features=15, regularization_amount=0.1)
    # Find all predicted ratings.
    predicted_ratings = np.matmul(U, M)

    # Joined before the 'ratings' column is added below, so the reviewed
    # frame only carries the stored movie columns.
    reviewed_movies_df = reviewed_movies_df.join(movies_df, on="movieid")

    # Locate the user's row through the pivot index instead of a hard-coded
    # offset (the original used row 0 for user 1 and ``user_id - 6`` for
    # everyone else, which only holds for one particular set of user ids).
    user_row = ratings_df.index.get_loc(user_id_to_search)

    # Attach the predictions by movie id rather than by position, so movies
    # that have no rating yet get NaN instead of shifting every other value.
    movie_ids = ratings_df.columns.get_level_values('movieid')
    movies_df['ratings'] = pd.Series(predicted_ratings[user_row],
                                     index=movie_ids)

    # Movies the user has not reviewed, best predicted rating first
    # (sort_values keeps NaN predictions at the end).
    not_reviewed = ~movies_df.index.isin(reviewed_movies_df['movieid'])
    recommended_df = movies_df[not_reviewed].sort_values(by=['ratings'],
                                                         ascending=False)
    movierecommendation = list(recommended_df["content"].head(2))

    return list(reviewed_movies_df['title']), movierecommendation
Exemplo n.º 4
0
def main():
    """Print the five movies whose latent features are closest to one movie.

    Factorizes the user/movie rating matrix, takes the feature vector of the
    selected movie and ranks every movie by the sum of absolute feature
    differences to it (smaller means more similar).
    """
    # Load the data.
    raw_dataset_df = pd.read_csv("movie_ratings_data_set.csv")

    # Load movie titles, indexed by movie_id so that label lookups use the
    # real movie id. With the default RangeIndex ``.loc[5]`` returned the
    # sixth row, which did not match the features taken from ``M`` below.
    movies_df = pd.read_csv("movies.csv", index_col='movie_id')

    # Convert running list of users into a matrix.
    ratings_df = pd.pivot_table(raw_dataset_df,
                                index='user_id',
                                columns='movie_id',
                                aggfunc=np.max)

    # Apply matrix factorization to get main features.
    U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
        ratings_df.values, num_features=15, regularization_amount=.1)

    # Swap rows and columns of product features: one row per movie.
    M = np.transpose(M)

    # Choose a single movie to find similar movies to. In this case, movie
    # number 5.
    movie_id = 5

    # Get the selected movie's name and genre.
    movie_info = movies_df.loc[movie_id]

    print("We are finding movies similar to this movie")
    print(f"Movie title: {movie_info.title}")
    print(f"Genre: {movie_info.genre}")

    # Rows of M follow the pivot table's movie columns, so the row of the
    # selected movie is looked up there instead of assuming ``movie_id - 1``.
    movie_ids = ratings_df.columns.get_level_values('movie_id')
    current_movie_features = M[movie_ids.get_loc(movie_id)]

    print("The attributes for this movie are:")
    print(current_movie_features)

    # ---------------- Logic for finding similar movies ----------------

    # Subtract current movie features from every other movie's features and
    # take the absolute value of each difference.
    ab_difference = np.abs(M - current_movie_features)

    # Each movie has 15 features; sum them to get one "difference score" per
    # movie.
    total_diff = np.sum(ab_difference, axis=1)

    # Attach the scores by movie id (not by position) to the movie list.
    movies_df['difference_score'] = pd.Series(total_diff, index=movie_ids)

    # Sort movie list by difference score from least to most different.
    sorted_movie_list = movies_df.sort_values('difference_score')

    # Print the 5 most similar movies to the selected movie.
    print("The 5 most similar movies are:")
    print(sorted_movie_list[['title', 'difference_score']][0:5])
Exemplo n.º 5
0
def main():
    """Interactively recommend five unreviewed movies to a user.

    Factorizes the user/movie rating matrix, asks for a ``user_id`` on
    standard input, prints the movies that user already reviewed and then
    the five unreviewed movies with the highest predicted rating.
    """
    # Load the data.
    raw_dataset_df = pd.read_csv("movie_ratings_data_set.csv")

    # Load movie titles (movie_id stays a column: the merge below needs it).
    movies_df = pd.read_csv("movies.csv")

    # Convert running list of users into a matrix.
    ratings_df = pd.pivot_table(raw_dataset_df,
                                index='user_id',
                                columns='movie_id',
                                aggfunc=np.max)

    # Apply matrix factorization to get main features.
    U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
        ratings_df.values, num_features=15, regularization_amount=.1)

    # Find all predicted ratings by multiplying U and M matrices.
    predicted_ratings = np.matmul(U, M)

    print("Enter a user_id between 1 and 100 to get a recommendation:")
    user_id_to_search = int(input())

    # ``predicted_ratings[user_id - 1]`` silently wrapped around for 0 or
    # negative ids; unknown users are now rejected explicitly.
    if user_id_to_search not in ratings_df.index:
        print(f"No ratings found for user_id {user_id_to_search}.")
        return

    print(f"Movies previously reviewed by user_id {user_id_to_search}")

    reviewed_movies_df = raw_dataset_df[raw_dataset_df['user_id'] ==
                                        user_id_to_search]
    reviewed_movies_df = reviewed_movies_df.merge(movies_df, on='movie_id')

    print(reviewed_movies_df[['title', 'genre', 'value']])

    input("Press enter to continue.")

    print("Movies we will recommend:")

    # Attach this user's predictions to the movie list by movie id.
    user_row = ratings_df.index.get_loc(user_id_to_search)
    movie_ids = ratings_df.columns.get_level_values('movie_id')
    predicted_by_movie = pd.Series(predicted_ratings[user_row],
                                   index=movie_ids)
    movies_df['rating'] = movies_df['movie_id'].map(predicted_by_movie)

    # Compare movie ids with movie ids. The original tested the 0-based row
    # index of movies_df against the reviewed movie ids and therefore
    # filtered out the wrong rows.
    already_reviewed = reviewed_movies_df['movie_id']
    recommended_df = movies_df[~movies_df['movie_id'].isin(already_reviewed)]
    recommended_df = recommended_df.sort_values(by=['rating'], ascending=False)

    print(recommended_df[['title', 'genre', 'rating']].head(5))
Exemplo n.º 6
0
def main():
    """Factorize the rating matrix and write all predicted ratings to CSV.

    Reads ``movie_ratings_data_set.csv``, pivots it into a matrix with one
    row per ``user_id`` and one column per ``movie_id``, factorizes that
    matrix and saves the product of the two factors, labelled like the
    pivot table, as ``predicted_ratings.csv``.
    """
    rating_rows = pd.read_csv("movie_ratings_data_set.csv")

    # One row per user, one column per movie; duplicates keep the maximum.
    user_by_movie_df = pd.pivot_table(rating_rows,
                                      index='user_id',
                                      columns='movie_id',
                                      aggfunc=np.max)

    # Factor the matrix into user features and movie features.
    factorize = matrix_factorization_utilities.low_rank_matrix_factorization
    user_features, movie_features = factorize(user_by_movie_df.values,
                                              num_features=15,
                                              regularization_amount=.1)

    # Their product holds a predicted rating for every user/movie pair.
    all_predictions = np.matmul(user_features, movie_features)

    # Re-use the pivot table's labels so the CSV keeps user and movie ids.
    pd.DataFrame(data=all_predictions,
                 index=user_by_movie_df.index,
                 columns=user_by_movie_df.columns).to_csv(
                     "predicted_ratings.csv")
Exemplo n.º 7
0
# Load user ratings
raw_training_dataset_df = pd.read_csv(
    "data/movie_ratings_data_set_training.csv")
raw_testing_dataset_df = pd.read_csv("data/movie_ratings_data_set_testing.csv")

# Convert the running list of user ratings into a matrix
# (one row per user_id, one column per movie_id)
ratings_training_df = pd.pivot_table(raw_training_dataset_df,
                                     index='user_id',
                                     columns='movie_id',
                                     aggfunc=np.max)
ratings_testing_df = pd.pivot_table(raw_testing_dataset_df,
                                    index='user_id',
                                    columns='movie_id',
                                    aggfunc=np.max)

# Apply matrix factorization to find latent features.
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray on both old and new pandas versions.
U, M = mfu.low_rank_matrix_factorization(ratings_training_df.values,
                                         num_features=11,
                                         regularization_amount=1.1)

# Find all predicted ratings by multiplying U and M
predicted_ratings = np.matmul(U, M)

# Measure RMSE of the same predictions against both matrices
rmse_training = mfu.RMSE(ratings_training_df.values, predicted_ratings)
rmse_testing = mfu.RMSE(ratings_testing_df.values, predicted_ratings)

print("Training RMSE: {}".format(rmse_training))
print("Testing RMSE: {}".format(rmse_testing))
import pandas
import matrix_factorization_utilities
import pandas as pd
import numpy as np
# Load the sample data and keep only the rows whose status is "Finished".
df = pd.read_csv("sample_New - sample.csv")
df = df.loc[df['status'] == "Finished"]

# One row per (product_id, status, unit_price), one column per location.
# NOTE: no ``values=`` argument (e.g. values='Quantity') is passed, so every
# remaining column is summed.
popularity_product = pd.pivot_table(
    df,
    index=['product_id', 'status', 'unit_price'],
    columns='location',
    aggfunc=np.sum)

# DataFrame.as_matrix() was removed in pandas 1.0; use .values instead.
P, L = matrix_factorization_utilities.low_rank_matrix_factorization(
    popularity_product.values, num_features=15, regularization_amount=0.1)

# The product of the two factors gives a prediction for every cell.
predicted_ratings = np.matmul(P, L)

# Label the predictions like the pivot table before saving them.
predicted_ratings_df = pd.DataFrame(index=popularity_product.index,
                                    columns=popularity_product.columns,
                                    data=predicted_ratings)

predicted_ratings_df.to_csv("predicted_ratings.csv")

# Also save the observed table, writing 0 for missing cells.
popularity_product.to_csv("product_popularity.csv", na_rep=0)
Exemplo n.º 9
0
import sys
sys.path.insert(0, '../chapter-5')

import pandas
import numpy
import pickle

import matrix_factorization_utilities


users_movie_ratings_list = pandas.read_csv('../chapter-4/movie_ratings_data_set.csv')

# Pivot the running list into a matrix: one row per user_id, one column per
# movie_id.
users_movie_ratings_pivot_table = pandas.pivot_table(users_movie_ratings_list, index='user_id', columns='movie_id',
                                                     aggfunc=numpy.max)

# Normalise ratings around their mean.
# DataFrame.as_matrix() was removed in pandas 1.0; use .values instead.
normalise_ratings, means = matrix_factorization_utilities.normalize_ratings(
    users_movie_ratings_pivot_table.values)

U, M = matrix_factorization_utilities.low_rank_matrix_factorization(normalise_ratings,
                                                                   num_features=11,
                                                                   regularization_amount=1.1)

predicted_ratings = numpy.matmul(U, M)

# Add the means back so the predictions are no longer normalised.
predicted_ratings = predicted_ratings + means

# Context managers make sure every file is flushed and closed; the original
# left the handles to the garbage collector.
with open('user_features.dat', 'wb') as output_file:
    pickle.dump(U, output_file)
with open('product_features.dat', 'wb') as output_file:
    pickle.dump(M, output_file)
with open('predicted_ratings.dat', 'wb') as output_file:
    pickle.dump(predicted_ratings, output_file)
with open('means.dat', 'wb') as output_file:
    pickle.dump(means, output_file)
Exemplo n.º 10
0

def best_movies_by_genre(genre, top_n):
    """Return the titles of the best-scored movies of one genre.

    Reads the module-level ``movie_score`` frame.

    Args:
        genre: name of the ``movie_score`` column that is 1 for movies
            belonging to the genre.
        top_n: maximum number of titles to return.

    Returns:
        The ``title`` Series of the selected rows, ordered by descending
        ``weighted_score``.
    """
    # Keep only the movies flagged for this genre.
    in_genre = movie_score[genre] == 1
    ranked = movie_score.loc[in_genre].sort_values(by=['weighted_score'],
                                                   ascending=False)

    # Same column subset as the summary table, cut to the first top_n rows.
    summary_columns = ['title', 'count', 'mean', 'weighted_score']
    best = pd.DataFrame(ranked[summary_columns][:top_n])
    return best["title"]


# One row per userId, one column per movieId; duplicates keep the maximum.
ratings_df = pd.pivot_table(
    ratings, index='userId', columns='movieId', aggfunc=np.max)

# Remove the rows labelled 6 through 7120 before factorizing.
temp = ratings_df.drop(list(range(6, 7121)), axis=0)

# Factor the reduced matrix into five latent features.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    temp.to_numpy(),
    num_features=5,
    regularization_amount=1.0,
)

# Ratings enriched with the movie columns, matched on movieId.
ratings_movies = pd.merge(ratings, movie_list, on='movieId')


def get_other_movies(movie_name):
    """Count, per title, the users who also rated ``movie_name``.

    Reads the module-level ``ratings_movies`` frame, collects the users
    that rated ``movie_name``, counts how many of them rated every title
    and expresses each count as a percentage of the largest count.

    NOTE(review): the body visible here ends after initialising ``titles``
    and returns nothing; the rest of the function appears to be missing.
    """
    df_movie_users_series = ratings_movies.loc[ratings_movies['title'] ==
                                               movie_name]['userId']
    df_movie_users = pd.DataFrame(df_movie_users_series, columns=['userId'])
    other_movies = pd.merge(df_movie_users, ratings_movies, on='userId')
    other_users_watched = pd.DataFrame(
        other_movies.groupby('title')['userId'].count()).sort_values(
            'userId', ascending=False)
    # The frame is sorted in descending order, so the first row holds the
    # largest count. ``.iloc[0]`` makes the positional access explicit:
    # ``Series[0]`` on a title-labelled index relies on a deprecated
    # integer fallback.
    top_count = other_users_watched['userId'].iloc[0]
    other_users_watched['perc_who_watched'] = round(
        other_users_watched['userId'] * 100 / top_count, 1)
    titles = []
Exemplo n.º 11
0
import matrix_factorization_utilities

users_movie_ratings_list = pandas.read_csv(
    '../chapter-4/movie_ratings_data_set.csv')

movies_list = pandas.read_csv('../chapter-4/movies.csv', index_col='movie_id')

# convert user movie ratings list into a matrix / pivot table
users_movie_ratings_pivot_table = pandas.pivot_table(users_movie_ratings_list,
                                                     index='user_id',
                                                     columns='movie_id',
                                                     aggfunc=numpy.max)

# apply matrix factorisation to find latent (hidden) features
# (DataFrame.as_matrix() was removed in pandas 1.0; use .values instead)
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    users_movie_ratings_pivot_table.values,
    num_features=15,
    regularization_amount=1.0)

# find predicted ratings by multiplying U and M using numpy.matmul
predicted_ratings = numpy.matmul(U, M)

print('Enter a user_id to get recommendations (Between 1 and 100):')
user_id_to_search = int(input())

print('Movies previously reviewed by user_id {}:'.format(user_id_to_search))

# keep this user's ratings and attach the movie columns via the movie_id index
reviewed_movies_list = users_movie_ratings_list[
    users_movie_ratings_list['user_id'] == user_id_to_search]
reviewed_movies_list = reviewed_movies_list.join(movies_list, on='movie_id')

print(reviewed_movies_list[['title', 'genre', 'value']])
Exemplo n.º 12
0
### SETUP ###

# Import libraries
import numpy as np
import pandas as pd
import matrix_factorization_utilities

# Load the ratings list and the movie catalogue (indexed by movie_id)
movie_ratings_df = pd.read_csv(
    r'C:\Users\Terence\Desktop\Rec_system\movie_ratings_data_set.csv')
movies_df = pd.read_csv(
    r'C:\Users\Terence\Desktop\Ex_Files_ML_EssT_Recommendations\Ex_Files_ML_EssT_Recommendations\Exercise Files\Chapter 5\movies.csv',
    index_col='movie_id')

# Pivot the ratings into a sparse matrix: one row per user_id, one column
# per movie_id
user_ratings_df = movie_ratings_df.pivot_table(index='user_id',
                                               columns='movie_id',
                                               aggfunc=np.max)

# Factor the matrix into U (users) and M (movies)
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    user_ratings_df.values,
    num_features=11,
    regularization_amount=1.1,
)

# U x M holds a predicted rating for every user/movie pair
predicted_ratings = np.matmul(U, M)

####################################################################################################################################################################################

## All Time Most Popular ###
def allTime(top_n=15):
    """Print the title and genre of the most frequently rated movies.

    Counts the ``value`` entries per ``movie_id`` in the module-level
    ``movie_ratings_df``, joins the counts with ``movies_df`` and prints the
    movies with the highest counts.

    Args:
        top_n: number of movies to print. Defaults to 15, the count the
            function previously hard-coded.
    """
    print('Here are the {} most popular movies:'.format(top_n))
    # Number of ratings per movie, indexed by movie_id.
    most_popular = pd.DataFrame(movie_ratings_df.groupby('movie_id')['value'].count())
    most_popular = most_popular.join(movies_df, on='movie_id')
    # Most-rated movies first.
    most_popular = most_popular.sort_values('value', ascending=False)
    most_popular = most_popular[['title', 'genre']]
    print(most_popular.head(top_n))
Exemplo n.º 13
0
#!/usr/bin/env python
"""Make recommendation of movie according to the rating"""

import pandas as pd
import numpy as np
import matrix_factorization_utilities

# Load user ratings
RAW_DATASET_DF = pd.read_csv('movie_ratings_data_set.csv')

# Convert the running list of user ratings into a matrix
# (one row per user_id, one column per movie_id)
RATINGS_DF = pd.pivot_table(RAW_DATASET_DF, index='user_id', columns='movie_id', aggfunc=np.max)

# Apply matrix factorization to find the latent features.
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray on both old and new pandas versions.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    RATINGS_DF.values,
    num_features=15,
    regularization_amount=0.1
)

# Find all predicted ratings by multiplying the U by M
PREDICTED_RATINGS = np.matmul(U, M)

# Save all the ratings to a csv file, labelled like the ratings matrix
PREDICTED_RATINGS_DF = pd.DataFrame(
    index=RATINGS_DF.index,
    columns=RATINGS_DF.columns,
    data=PREDICTED_RATINGS
)
PREDICTED_RATINGS_DF.to_csv("predicted_ratings.csv")
# Load user ratings
user_ratings_df = pd.read_csv("movie_ratings.csv")

# Load movie titles
movie_titles_df = pd.read_csv("movie_index.csv")

# Create a matrix of user ratings (one row per user_id, one column per
# movie_id)
user_ratings_matrix = pd.pivot_table(user_ratings_df,
                                     index='user_id',
                                     columns='movie_id',
                                     aggfunc=np.max)

# Apply matrix factorization to find the latent features.
# DataFrame.as_matrix() was removed in pandas 1.0; use .values instead.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    user_ratings_matrix.values,
    num_features=15,
    regularization_amount=1.0)
# Transpose M
M = np.transpose(M)

# Choose the movie to look up: movie #5
movie_id = 5

# Get the selected movie's title and genre.
# NOTE(review): movie_index.csv is read without index_col, so .loc looks the
# value up in the default row index - confirm that this matches the movie id.
movie_information = movie_titles_df.loc[movie_id]

# Print title and genre. movie_id is an int, so it is formatted into the
# message instead of being concatenated to a str (which raised TypeError).
print("Title of movie {} is: {}".format(movie_id, movie_information.title))
movie_title = movie_information.title
print("The genre of {} is: {}".format(movie_title, movie_information.genre))
Exemplo n.º 15
0
import numpy as np
import pandas as pd
import matrix_factorization_utilities

# Load the training and testing ratings
raw_training_dataset_df = pd.read_csv('movie_ratings_data_set_training.csv')
raw_testing_dataset_df = pd.read_csv('movie_ratings_data_set_testing.csv')

# Convert each running list of ratings into a matrix
# (one row per user_id, one column per movie_id)
ratings_training_df = pd.pivot_table(raw_training_dataset_df,
                                     index='user_id',
                                     columns='movie_id',
                                     aggfunc=np.max)
ratings_testing_df = pd.pivot_table(raw_testing_dataset_df,
                                    index='user_id',
                                    columns='movie_id',
                                    aggfunc=np.max)

# Apply matrix factorization to find the latent features. The plain ndarray
# is passed, matching the RMSE calls below, instead of the DataFrame itself.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    ratings_training_df.values, num_features=11, regularization_amount=1.1)

# Find all predicted ratings by multiplying U and M
predicted_ratings = np.matmul(U, M)

# Measure RMSE. DataFrame.as_matrix() was removed in pandas 1.0; .values is
# the equivalent that works on old and new pandas versions.
rmse_training = matrix_factorization_utilities.RMSE(
    ratings_training_df.values, predicted_ratings)
rmse_testing = matrix_factorization_utilities.RMSE(
    ratings_testing_df.values, predicted_ratings)

print("Training RMSE: {}".format(rmse_training))
print("Testing RMSE: {}".format(rmse_testing))
import numpy as np
import pandas as pd
import matrix_factorization_utilities

# Load user ratings
df = pd.read_csv('movie_ratings_data_set.csv')

# Load movie titles
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

# Convert the running list of user ratings into a matrix
ratings_df = pd.pivot_table(df, index='user_id', columns='movie_id', aggfunc=np.max)

# Apply matrix factorization to find the latent features
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(ratings_df.to_numpy(),
                                                                    num_features=15,
                                                                    regularization_amount=1.0)

# Swap the rows and columns of product_features just so it's easier to work with
# (the three assignments below were left blank, which is a syntax error)
M = np.transpose(M)

# Choose a movie to find similar movies to. Let's find movies similar to movie #5:
movie_id = 5

# Get the selected movie's name and genre (movies_df is indexed by movie_id)
movie_information = movies_df.loc[movie_id]

print("We are finding movies similar to this movie:")
print("Movie title: {}".format(movie_information.title))
print("Genre: {}".format(movie_information.genre))
import numpy as np
import pandas as pd
import matrix_factorization_utilities
import scipy

# Load user ratings
raw_dataset_df = pd.read_csv('movie_ratings_data_set.csv')

# Set up the pivot table: one row per user_id, one column per movie_id
ratings_df = pd.pivot_table(raw_dataset_df,
                            index='user_id',
                            columns='movie_id',
                            aggfunc=np.max)

# Apply matrix factorization to find the latent features.
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray on both old and new pandas versions.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    ratings_df.values, num_features=15, regularization_amount=0.1)

# Find all predicted ratings by multiplying the U by M
predicted_ratings = np.matmul(U, M)

# Label the predictions like the ratings matrix
predicted_ratings_df = pd.DataFrame(index=ratings_df.index,
                                    columns=ratings_df.columns,
                                    data=predicted_ratings)

# Save to file
predicted_ratings_df.to_csv("predicted_ratings.csv")
import numpy as np
import pandas as pd
import matrix_factorization_utilities

# Load pitcher batter matchups
raw_matchups_df = pd.read_csv('matchups.csv')

# Convert the running list of matchups into a matrix
# (one row per BatterID, one column per PitcherID)
matchups_df = pd.pivot_table(raw_matchups_df,
                             index='BatterID',
                             columns='PitcherID',
                             aggfunc=np.max)

# Create a csv of the raw matchups data, just for comparison:
matchups_df.to_csv("initial_matchups.csv")

# Apply matrix factorization to find the latent features.
# DataFrame.as_matrix() was removed in pandas 1.0; use .values instead.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    matchups_df.values, num_features=15, regularization_amount=.5)

# Find all predicted matchups by multiplying the U by M
predicted_matchups = np.matmul(U, M)

# Save all the predictions to a csv file, labelled like the matchup matrix
predicted_matchups_df = pd.DataFrame(index=matchups_df.index,
                                     columns=matchups_df.columns,
                                     data=predicted_matchups)
predicted_matchups_df.to_csv("predicted_matchups.csv")
# To convert the generated csv file to an sqlite table:
# FIRST $ pip3 install csv-to-sqlite
# THEN $ csv-to-sqlite -f predicted_matchups.csv -o mlb_stats.db