def main():
    """Fit a factorization on the training ratings and print both RMSEs.

    Two rating lists are read from CSV and pivoted into user-by-movie
    matrices. Only the training matrix is factorized; the product of the two
    resulting factors is then scored against the training matrix and against
    the testing matrix, and both scores are printed.
    """
    # Running lists of ratings, training file first.
    training_list = pd.read_csv('movie_ratings_data_set_training.csv')
    testing_list = pd.read_csv('movie_ratings_data_set_testing.csv')

    # One row per user_id, one column per movie_id; duplicates collapse to
    # their maximum.
    training_pivot = pd.pivot_table(
        training_list, index='user_id', columns='movie_id', aggfunc=np.max)
    testing_pivot = pd.pivot_table(
        testing_list, index='user_id', columns='movie_id', aggfunc=np.max)

    # Latent features are learned from the training matrix only.
    user_features, movie_features = (
        matrix_factorization_utilities.low_rank_matrix_factorization(
            training_pivot.values,
            num_features=11,
            regularization_amount=1.1,
        )
    )

    # Full matrix of predicted ratings.
    reconstructed = np.matmul(user_features, movie_features)

    # Score the same predictions against both matrices before printing.
    training_error = matrix_factorization_utilities.RMSE(
        training_pivot.values, reconstructed)
    testing_error = matrix_factorization_utilities.RMSE(
        testing_pivot.values, reconstructed)

    print("Training RMSE: {}".format(training_error))
    print("Testing RMSE: {}".format(testing_error))
def stratLearn(self):
    """Learn user preferences from other users via matrix factorization.

    A user-by-product pivot table is built from the normalized purchase data
    and factorized into two matrices (U: user features, P: product features).
    Multiplying U by P gives the predicted purchases.

    Side effects: sets ``self.user_product_purch_df_normalized`` and writes
    three pickle files to the current working directory:
    ``user_features.dat`` (U), ``product_features.dat`` (P transposed) and
    ``predicted_purchases.dat`` (U x P).
    """
    self.setDataFrames()

    # Normalize quantity.
    self.user_product_purch_df_normalized = self.normalize_quantity()

    purchases = pd.pivot_table(self.user_product_purch_df_normalized,
                               index='user_num',
                               columns='product_num',
                               aggfunc=np.max)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the
    # same array and works on old and new pandas alike.
    purchases_matrix = purchases.values

    U, P = matrix_factorization_utilities.low_rank_matrix_factorization(
        purchases_matrix, num_features=15, regularization_amount=3.6)

    predicted_purchases = np.matmul(U, P)

    # Product features are stored transposed relative to what the
    # factorization returned.
    P = np.transpose(P)

    # Context managers make sure every file is flushed and closed.
    for filename, payload in (("user_features.dat", U),
                              ("product_features.dat", P),
                              ("predicted_purchases.dat", predicted_purchases)):
        with open(filename, "wb") as handle:
            pickle.dump(payload, handle)

    # NOTE(review): the RMSE is computed but not used in the visible part of
    # this method -- confirm whether it should be logged or returned.
    rmse = matrix_factorization_utilities.RMSE(purchases_matrix,
                                               predicted_purchases)
def recommendations(user_id_to_search):
    """Recommend unreviewed movies to a user from factorized review ratings.

    Args:
        user_id_to_search: id of the user (``auth_user.id``) to recommend for.

    Returns:
        ``(0, 0)`` when the user has no reviews to learn from. Otherwise a
        tuple ``(reviewed_titles, recommended)``: ``reviewed_titles`` is the
        list of ``title`` values of the movies the user reviewed, and
        ``recommended`` is the list of ``content`` values of the two
        unreviewed movies with the highest predicted rating.
    """
    # A single connection serves all three queries and is always closed.
    conn = sqlite3.connect("db.sqlite3")
    try:
        cursor = conn.execute(
            "select id from auth_user where id not in (select user_id_id as id from recommendation_system_reviews);"
        )
        users_without_reviews = {row[0] for row in cursor}
        if user_id_to_search in users_without_reviews:
            return 0, 0

        # Load user ratings.
        raw_dataset_df = pd.read_sql(
            "select user_id_id as userid,movie_id_id as movieid,rating from recommendation_system_reviews;",
            conn)
        # Load movie titles, indexed by movie id.
        movies_df = pd.read_sql("select * from blog_movie;", conn,
                                index_col="id")
    finally:
        conn.close()

    # Convert user ratings into a user x movie matrix. Naming the value
    # column keeps the column index flat (plain movie ids), which is what
    # allows predictions to be aligned with movies_df by id further down.
    ratings_df = pd.pivot_table(raw_dataset_df,
                                values='rating',
                                index='userid',
                                columns='movieid',
                                aggfunc=np.max)

    # A user absent from the ratings matrix has no row of predictions.
    if user_id_to_search not in ratings_df.index:
        return 0, 0

    # Apply matrix factorization to find the latent features.
    # (.values replaces DataFrame.as_matrix(), removed in pandas 1.0.)
    U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
        ratings_df.values, num_features=15, regularization_amount=0.1)

    # Find all predicted ratings.
    predicted_ratings = np.matmul(U, M)

    reviewed_movies_df = raw_dataset_df[raw_dataset_df['userid'] ==
                                        user_id_to_search]
    reviewed_movies_df = reviewed_movies_df.join(movies_df, on="movieid")
    reviewed = list(reviewed_movies_df['movieid'])

    # Locate the user's row by label rather than by a hard-coded id offset.
    user_row = ratings_df.index.get_loc(user_id_to_search)

    # Attach the predictions by movie id. Movies that nobody has rated get
    # NaN and therefore sort to the end of the ranking.
    movies_df['ratings'] = pd.Series(predicted_ratings[user_row],
                                     index=ratings_df.columns)

    # Keep only movies the user has not reviewed, best prediction first.
    recommended_df = movies_df[~movies_df.index.isin(reviewed)]
    recommended_df = recommended_df.sort_values(by=['ratings'],
                                                ascending=False)
    movierecommendation = list(recommended_df["content"].head(2))

    return list(reviewed_movies_df['title']), movierecommendation
def main(movie_id=5):
    """Print the five movies whose latent features are closest to one movie.

    The ratings list is pivoted into a user-by-movie matrix and factorized.
    Each movie's feature vector is then compared with the reference movie's
    vector using the sum of absolute differences; the smallest scores are the
    most similar movies.

    Args:
        movie_id: id of the reference movie. Defaults to 5, the movie this
            script was originally hard-coded to.
    """
    # Load the data.
    raw_dataset_df = pd.read_csv("movie_ratings_data_set.csv")

    # Load movie titles, indexed by movie_id so that .loc[movie_id] looks a
    # movie up by its id. With the default 0-based index, .loc[5] would
    # return a different movie than the one whose features are read below.
    movies_df = pd.read_csv("movies.csv", index_col='movie_id')

    # Convert running list of users into a matrix.
    ratings_df = pd.pivot_table(raw_dataset_df,
                                index='user_id',
                                columns='movie_id',
                                aggfunc=np.max)

    # Apply matrix factorization to get main features.
    U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
        ratings_df.values, num_features=15, regularization_amount=.1)

    # Swap rows and columns of product features: one row per movie.
    M = np.transpose(M)

    # Get the selected movie's name and genre.
    movie_info = movies_df.loc[movie_id]

    print("We are finding movies similar to this movie")
    print(f"Movie title: {movie_info.title}")
    print(f"Genre: {movie_info.genre}")

    # Get movie features.
    # NOTE(review): assumes movie ids are contiguous and start at 1, so that
    # movie N sits in row N - 1 -- confirm against the data set.
    current_movie_features = M[movie_id - 1]

    print("The attributes for this movie are:")
    print(current_movie_features)

    # --- Logic for finding similar movies ---

    # Subtract current movie features from every other movie's features and
    # sum the absolute differences into one "difference score" per movie.
    difference = M - current_movie_features
    total_diff = np.sum(np.abs(difference), axis=1)

    # Create new column in movie list with each movie's difference score.
    # NOTE(review): assigned positionally; assumes movies.csv lists movies
    # in the same order as the pivot table's columns.
    movies_df['difference_score'] = total_diff

    # Sort movie list by difference score from least to most different.
    sorted_movie_list = movies_df.sort_values('difference_score')

    # Print the 5 most similar movies to the selected movie.
    print("The 5 most similar movies are:")
    print(sorted_movie_list[['title', 'difference_score']][0:5])
def main():
    """Interactively recommend five unreviewed movies to a user.

    The ratings list is pivoted into a user-by-movie matrix and factorized;
    the product of the two factors gives a predicted rating for every
    user/movie pair. The user id is read from standard input, the movies that
    user already reviewed are printed, and after a pause the five unreviewed
    movies with the highest predicted rating are printed.
    """
    # Load the data.
    raw_dataset_df = pd.read_csv("movie_ratings_data_set.csv")

    # Load movie titles.
    movies_df = pd.read_csv("movies.csv")

    # Convert running list of users into a matrix.
    ratings_df = pd.pivot_table(raw_dataset_df,
                                index='user_id',
                                columns='movie_id',
                                aggfunc=np.max)

    # Apply matrix factorization to get main features.
    U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
        ratings_df.values, num_features=15, regularization_amount=.1)

    # Find all predicted ratings by multiplying U and M matrices.
    predicted_ratings = np.matmul(U, M)

    print("Enter a user_id between 1 and 100 to get a recommendation:")
    user_id_to_search = int(input())

    # An id that is not in the matrix has no row of predictions; without
    # this check, 0 or a negative id would silently pick another user's row.
    if user_id_to_search not in ratings_df.index:
        print(f"No ratings found for user_id {user_id_to_search}.")
        return

    print(f"Movies previously reviewed by user_id {user_id_to_search}")

    reviewed_movies_df = raw_dataset_df[raw_dataset_df['user_id'] ==
                                        user_id_to_search]
    reviewed_movies_df = reviewed_movies_df.merge(movies_df, on='movie_id')

    print(reviewed_movies_df[['title', 'genre', 'value']])

    input("Press enter to continue.")

    print("Movies we will recommend:")

    # Look the user's row up by label instead of assuming id - 1.
    user_row = ratings_df.index.get_loc(user_id_to_search)
    user_ratings = predicted_ratings[user_row]

    # NOTE(review): assigned positionally; assumes movies.csv lists movies
    # in the same order as the pivot table's columns.
    movies_df['rating'] = user_ratings

    # movies_df keeps the default 0-based index here, so reviewed movies
    # must be matched on the movie_id column, not on the index.
    already_reviewed = reviewed_movies_df['movie_id']
    recommended_df = movies_df[~movies_df['movie_id'].isin(already_reviewed)]
    recommended_df = recommended_df.sort_values(by=['rating'],
                                                ascending=False)

    print(recommended_df[['title', 'genre', 'rating']].head(5))
def main():
    """Write the predicted rating of every user/movie pair to a CSV file.

    The ratings list is pivoted into a user-by-movie matrix, factorized, and
    the product of the two factors is saved as ``predicted_ratings.csv``
    using the pivot table's own row and column labels.
    """
    rating_list = pd.read_csv("movie_ratings_data_set.csv")

    # One row per user_id, one column per movie_id.
    rating_pivot = pd.pivot_table(
        rating_list, index='user_id', columns='movie_id', aggfunc=np.max)

    user_features, movie_features = (
        matrix_factorization_utilities.low_rank_matrix_factorization(
            rating_pivot.values,
            num_features=15,
            regularization_amount=.1,
        )
    )

    # Multiply the factors and label the result like the input matrix.
    prediction_table = pd.DataFrame(
        data=np.matmul(user_features, movie_features),
        index=rating_pivot.index,
        columns=rating_pivot.columns,
    )
    prediction_table.to_csv("predicted_ratings.csv")
# Measure how well a factorization fitted on the training ratings predicts
# the training and the testing ratings (RMSE of each).

# Load user ratings
raw_training_dataset_df = pd.read_csv(
    "data/movie_ratings_data_set_training.csv")
raw_testing_dataset_df = pd.read_csv(
    "data/movie_ratings_data_set_testing.csv")

# Convert the running list of user ratings into a matrix
ratings_training_df = pd.pivot_table(raw_training_dataset_df,
                                     index='user_id',
                                     columns='movie_id',
                                     aggfunc=np.max)
ratings_testing_df = pd.pivot_table(raw_testing_dataset_df,
                                    index='user_id',
                                    columns='movie_id',
                                    aggfunc=np.max)

# Apply matrix factorization to find latent features.
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# array on old and new pandas alike.
U, M = mfu.low_rank_matrix_factorization(ratings_training_df.values,
                                         num_features=11,
                                         regularization_amount=1.1)

# Find all predicted ratings by multiplying U and M
predicted_ratings = np.matmul(U, M)

# Measure RMSE
rmse_training = mfu.RMSE(ratings_training_df.values, predicted_ratings)
rmse_testing = mfu.RMSE(ratings_testing_df.values, predicted_ratings)

print("Training RMSE: {}".format(rmse_training))
print("Testing RMSE: {}".format(rmse_testing))
import pandas
import matrix_factorization_utilities
import pandas as pd
import numpy as np

# Predict product popularity per location by factorizing a
# product-by-location pivot table of finished orders.

df = pd.read_csv("sample_New - sample.csv")

# Only finished orders are taken into account.
df = df.loc[df['status'] == "Finished"]

# Rows: (product_id, status, unit_price); columns: location. Every remaining
# column is summed (a `values='Quantity'` restriction was left disabled).
popularity_product = pd.pivot_table(
    df,
    index=['product_id', 'status', 'unit_price'],
    columns='location',
    aggfunc=np.sum)

# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# array on old and new pandas alike.
P, L = matrix_factorization_utilities.low_rank_matrix_factorization(
    popularity_product.values, num_features=15, regularization_amount=0.1)

# The predictions must be computed here: the DataFrame below reads them.
predicted_ratings = np.matmul(P, L)

predicted_ratings_df = pd.DataFrame(index=popularity_product.index,
                                    columns=popularity_product.columns,
                                    data=predicted_ratings)
predicted_ratings_df.to_csv("predicted_ratings.csv")

# Also save the observed table, writing 0 for missing cells.
popularity_product.to_csv("product_popularity.csv", na_rep=0)
import sys
# The helper module is imported from a sibling directory, so the path entry
# must be added before the import below.
sys.path.insert(0, '../chapter-5')

import pandas
import numpy
import pickle
import matrix_factorization_utilities

# Factorize mean-normalized ratings and pickle the features, the predicted
# ratings and the means.

users_movie_ratings_list = pandas.read_csv(
    '../chapter-4/movie_ratings_data_set.csv')

users_movie_ratings_pivot_table = pandas.pivot_table(
    users_movie_ratings_list,
    index='user_id',
    columns='movie_id',
    aggfunc=numpy.max)

# Normalise ratings around their mean.
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# array on old and new pandas alike.
normalise_ratings, means = matrix_factorization_utilities.normalize_ratings(
    users_movie_ratings_pivot_table.values)

U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    normalise_ratings, num_features=11, regularization_amount=1.1)

# Predictions are made on the normalized scale; adding the means back
# undoes the normalization.
predicted_ratings = numpy.matmul(U, M)
predicted_ratings = predicted_ratings + means

# Context managers make sure every file is flushed and closed.
with open('user_features.dat', 'wb') as handle:
    pickle.dump(U, handle)
with open('product_features.dat', 'wb') as handle:
    pickle.dump(M, handle)
with open('predicted_ratings.dat', 'wb') as handle:
    pickle.dump(predicted_ratings, handle)
with open('means.dat', 'wb') as handle:
    pickle.dump(means, handle)
def best_movies_by_genre(genre, top_n):
    """Return the titles of the best-scored movies of one genre.

    Args:
        genre: name of a column of ``movie_score`` that equals 1 for movies
            belonging to that genre.
        top_n: number of titles to return.

    Returns:
        A Series with the ``title`` of the ``top_n`` movies of the genre,
        ordered by descending ``weighted_score``.
    """
    in_genre = movie_score.loc[movie_score[genre] == 1]
    ranked = in_genre.sort_values(['weighted_score'], ascending=False)
    return ranked[['title', 'count', 'mean', 'weighted_score']].iloc[:top_n]["title"]


# User x movie ratings matrix.
ratings_df = pd.pivot_table(ratings,
                            index='userId',
                            columns='movieId',
                            aggfunc=np.max)

# Drop the rows labelled 6 .. 7120 before factorizing.
temp = ratings_df.drop(list(range(6, 7121)), axis=0)

U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    temp.to_numpy(), num_features=5, regularization_amount=1.0)

ratings_movies = pd.merge(ratings, movie_list, on='movieId')


def get_other_movies(movie_name):
    """Count what else was rated by the users who rated ``movie_name``.

    NOTE(review): the function appears to be truncated in the visible source
    (it ends right after initialising ``titles``) -- the rest of the body and
    its return value are not shown here.
    """
    # Users who rated the given movie.
    df_movie_users_series = ratings_movies.loc[
        ratings_movies['title'] == movie_name]['userId']
    df_movie_users = pd.DataFrame(df_movie_users_series, columns=['userId'])

    # Every rating made by those users.
    other_movies = pd.merge(df_movie_users, ratings_movies, on='userId')

    # Number of those users per title, most-watched first.
    other_users_watched = pd.DataFrame(
        other_movies.groupby('title')['userId'].count()).sort_values(
            'userId', ascending=False)

    # Percentage relative to the top entry. The index holds titles, so the
    # first row has to be read positionally with .iloc; integer keys on a
    # label-indexed Series are deprecated positional access.
    top_count = other_users_watched['userId'].iloc[0]
    other_users_watched['perc_who_watched'] = (
        other_users_watched['userId'] * 100 / top_count).round(1)

    titles = []
import matrix_factorization_utilities

# Factorize the user ratings and show which movies a given user has already
# reviewed.

users_movie_ratings_list = pandas.read_csv(
    '../chapter-4/movie_ratings_data_set.csv')
movies_list = pandas.read_csv('../chapter-4/movies.csv',
                              index_col='movie_id')

# Convert user movie ratings list into a matrix / pivot table.
users_movie_ratings_pivot_table = pandas.pivot_table(
    users_movie_ratings_list,
    index='user_id',
    columns='movie_id',
    aggfunc=numpy.max)

# Apply matrix factorisation to find latent (hidden) features.
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# array on old and new pandas alike.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    users_movie_ratings_pivot_table.values,
    num_features=15,
    regularization_amount=1.0)

# Find predicted ratings by multiplying U and M using numpy.matmul.
predicted_ratings = numpy.matmul(U, M)

print('Enter a user_id to get recommendations (Between 1 and 100):')
user_id_to_search = int(input())

print('Movies previously reviewed by user_id {}:'.format(user_id_to_search))

# Ratings made by that user, enriched with the movie details.
reviewed_movies_list = users_movie_ratings_list[
    users_movie_ratings_list['user_id'] == user_id_to_search]
reviewed_movies_list = reviewed_movies_list.join(movies_list, on='movie_id')

print(reviewed_movies_list[['title', 'genre', 'value']])
# ---------------------------------------------------------------------------
# Setup
# ---------------------------------------------------------------------------

import numpy as np
import pandas as pd
import matrix_factorization_utilities

# Source data: the running list of ratings and the movie catalogue
# (the catalogue is indexed by movie_id).
movie_ratings_df = pd.read_csv(
    r'C:\Users\Terence\Desktop\Rec_system\movie_ratings_data_set.csv')
movies_df = pd.read_csv(
    r'C:\Users\Terence\Desktop\Ex_Files_ML_EssT_Recommendations\Ex_Files_ML_EssT_Recommendations\Exercise Files\Chapter 5\movies.csv',
    index_col='movie_id')

# Sparse user x movie matrix.
user_ratings_df = pd.pivot_table(
    movie_ratings_df, index='user_id', columns='movie_id', aggfunc=np.max)

# Factorize the matrix into user features (U) and movie features (M).
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    user_ratings_df.values,
    num_features=11,
    regularization_amount=1.1,
)

# Predicted rating for every user/movie pair.
predicted_ratings = np.matmul(U, M)


# ---------------------------------------------------------------------------
# All-time most popular
# ---------------------------------------------------------------------------

def allTime():
    """Print title and genre of the 15 movies with the most ratings."""
    print('Here are the 15 most popular movies:')

    # Number of ratings each movie received.
    rating_counts = movie_ratings_df.groupby('movie_id')['value'].count()

    # Attach the movie details, then put the most-rated movies first.
    ranked = (
        pd.DataFrame(rating_counts)
        .join(movies_df, on='movie_id')
        .sort_values('value', ascending=False)
    )

    print(ranked[['title', 'genre']].head(15))
#!/usr/bin/env python
"""Predict every user's rating for every movie and save them to a CSV file."""

import pandas as pd
import numpy as np
import matrix_factorization_utilities

# Load user ratings
RAW_DATASET_DF = pd.read_csv('movie_ratings_data_set.csv')

# Convert the running list of user ratings into a matrix
RATINGS_DF = pd.pivot_table(RAW_DATASET_DF,
                            index='user_id',
                            columns='movie_id',
                            aggfunc=np.max)

# Apply matrix factorization to find the latent features.
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# array on old and new pandas alike.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    RATINGS_DF.values,
    num_features=15,
    regularization_amount=0.1
)

# Find all predicted ratings by multiplying the U by M
PREDICTED_RATINGS = np.matmul(U, M)

# Save all the ratings to a csv file, labelled like the input matrix
PREDICTED_RATINGS_DF = pd.DataFrame(
    index=RATINGS_DF.index,
    columns=RATINGS_DF.columns,
    data=PREDICTED_RATINGS
)
PREDICTED_RATINGS_DF.to_csv("predicted_ratings.csv")
# Factorize the user ratings and print the title and genre of one movie.

# Load user ratings
user_ratings_df = pd.read_csv("movie_ratings.csv")

# Load movie titles
movie_titles_df = pd.read_csv("movie_index.csv")

# Create a matrix of user ratings
user_ratings_matrix = pd.pivot_table(user_ratings_df,
                                     index='user_id',
                                     columns='movie_id',
                                     aggfunc=np.max)

# Apply matrix factorization to find the latent features.
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# array on old and new pandas alike.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    user_ratings_matrix.values, num_features=15, regularization_amount=1.0)

# Transpose M so that each row holds one movie's features
M = np.transpose(M)

# Choose a movie similar to movie # 5
movie_id = 5

# Get the movie's title and genre.
# NOTE(review): movie_index.csv is read with the default 0-based index, so
# .loc[movie_id] selects the row labelled 5, which may not be the movie whose
# id is 5 -- confirm the file layout (an index_col may be needed).
movie_information = movie_titles_df.loc[movie_id]

# Print title and genre. movie_id is an int, so it has to be formatted
# rather than concatenated to a string.
print("Title of movie {} is: {}".format(movie_id, movie_information.title))

movie_title = movie_information.title

print("The genre of " + movie_title + " is: {}".format(movie_information.genre))
import numpy as np
import pandas as pd
import matrix_factorization_utilities

# Measure how well a factorization fitted on the training ratings predicts
# the training and the testing ratings (RMSE of each).

raw_training_dataset_df = pd.read_csv('movie_ratings_data_set_training.csv')
raw_testing_dataset_df = pd.read_csv('movie_ratings_data_set_testing.csv')

# Convert each running list of ratings into a user x movie matrix.
ratings_training_df = pd.pivot_table(raw_training_dataset_df,
                                     index='user_id',
                                     columns='movie_id',
                                     aggfunc=np.max)
ratings_testing_df = pd.pivot_table(raw_testing_dataset_df,
                                    index='user_id',
                                    columns='movie_id',
                                    aggfunc=np.max)

# Hand the factorization a plain array, like the RMSE calls below, rather
# than the DataFrame itself.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    ratings_training_df.values, num_features=11, regularization_amount=1.1)

predicted_ratings = np.matmul(U, M)

# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# array on old and new pandas alike.
rmse_training = matrix_factorization_utilities.RMSE(
    ratings_training_df.values, predicted_ratings)
rmse_testing = matrix_factorization_utilities.RMSE(
    ratings_testing_df.values, predicted_ratings)

print("Training RMSE: {}".format(rmse_training))
print("Testing RMSE: {}".format(rmse_testing))
import numpy as np
import pandas as pd
import matrix_factorization_utilities

# Load user ratings
df = pd.read_csv('movie_ratings_data_set.csv')

# Load movie titles (indexed by movie_id, so .loc looks a movie up by id)
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

# Convert the running list of user ratings into a matrix
ratings_df = pd.pivot_table(df,
                            index='user_id',
                            columns='movie_id',
                            aggfunc=np.max)

# Apply matrix factorization to find the latent features
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    ratings_df.to_numpy(), num_features=15, regularization_amount=1.0)

# Swap the rows and columns of product_features just so it's easier to work
# with: after the transpose each row holds one movie's features.
M = np.transpose(M)

# Choose a movie to find similar movies to. Let's find movies similar to
# movie #5:
movie_id = 5

# Get the chosen movie's name and genre
movie_information = movies_df.loc[movie_id]

print("We are finding movies similar to this movie:")
print("Movie title: {}".format(movie_information.title))
print("Genre: {}".format(movie_information.genre))
import numpy as np
import pandas as pd
import matrix_factorization_utilities
import scipy

# Predict every user's rating for every movie and save them to a CSV file.

# Load user ratings
raw_dataset_df = pd.read_csv('movie_ratings_data_set.csv')

# Set up pivot table like before: one row per user, one column per movie
ratings_df = pd.pivot_table(raw_dataset_df,
                            index='user_id',
                            columns='movie_id',
                            aggfunc=np.max)

# Apply matrix factorization to find the latent features.
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# array on old and new pandas alike.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    ratings_df.values, num_features=15, regularization_amount=0.1)

# Find all predicted ratings by multiplying the U by M
predicted_ratings = np.matmul(U, M)

# Label the predictions like the input matrix
predicted_ratings_df = pd.DataFrame(index=ratings_df.index,
                                    columns=ratings_df.columns,
                                    data=predicted_ratings)

# Save to file
predicted_ratings_df.to_csv("predicted_ratings.csv")
import numpy as np  # needed for np.max and np.matmul below
import pandas as pd
import matrix_factorization_utilities

# Predict every batter/pitcher matchup value and save them to a CSV file.

# Load pitcher batter matchups
raw_matchups_df = pd.read_csv('matchups.csv')

# Convert the running list of matchups into a matrix
matchups_df = pd.pivot_table(raw_matchups_df,
                             index='BatterID',
                             columns='PitcherID',
                             aggfunc=np.max)

# Create a csv of the raw matchups data, just for comparison:
matchups_df.to_csv("initial_matchups.csv")

# Apply matrix factorization to find the latent features.
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# array on old and new pandas alike.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    matchups_df.values, num_features=15, regularization_amount=.5)

# Find all predicted matchups by multiplying the U by M
predicted_matchups = np.matmul(U, M)

# Save all the predictions to a csv file, labelled like the input matrix
predicted_matchups_df = pd.DataFrame(index=matchups_df.index,
                                     columns=matchups_df.columns,
                                     data=predicted_matchups)
predicted_matchups_df.to_csv("predicted_matchups.csv")

# To convert the generated csv file to an sqlite table:
# FIRST $ pip3 install csv-to-sqlite
# THEN $ csv-to-sqlite -f predicted_matchups.csv -o mlb_stats.db