def fit(self, reviews_pth, movies_pth, latent_features=12, learning_rate=0.0001, iters=100):
    '''
    Fit the recommender: factorize the user-item rating matrix with a plain
    FunkSVD (no regularization) and build the ranked-movies table.

    INPUT:
    reviews_pth - path to a csv holding at least the columns
                  'user_id', 'movie_id', 'rating', 'timestamp'
    movies_pth - path to the movies csv
    latent_features - (int) number of latent features to learn
    learning_rate - (float) step size of each gradient update
    iters - (int) number of passes over the rating matrix

    OUTPUT:
    None - everything is kept on the instance:
    reviews, movies - dataframes read from the two csv files
    user_item_df - user-by-movie dataframe (max rating per user/movie pair)
    user_item_mat - (np array) user_item_df as an array, nan where unrated
    latent_features, learning_rate, iters - the hyper-parameters passed in
    n_users, n_movies - row and column counts of user_item_mat
    num_ratings - (int) number of non-nan cells in user_item_mat
    user_ids_series, movie_ids_series - (np arrays) row / column labels
    user_mat - (np array) n_users x latent_features factor matrix
    movie_mat - (np array) latent_features x n_movies factor matrix
    ranked_movies - the value returned by rf.create_ranked_df(movies, reviews)
    '''
    # Load the raw data
    self.reviews = pd.read_csv(reviews_pth)
    self.movies = pd.read_csv(movies_pth)

    # Pivot the reviews into a user x movie table of ratings
    rating_cols = self.reviews[['user_id', 'movie_id', 'rating', 'timestamp']]
    best_rating = rating_cols.groupby(['user_id', 'movie_id'])['rating'].max()
    self.user_item_df = best_rating.unstack()
    self.user_item_mat = np.array(self.user_item_df)

    # Remember the hyper-parameters
    self.latent_features = latent_features
    self.learning_rate = learning_rate
    self.iters = iters

    # Dimensions and labels reused below
    self.n_users, self.n_movies = self.user_item_mat.shape
    self.num_ratings = np.count_nonzero(~np.isnan(self.user_item_mat))
    self.user_ids_series = np.array(self.user_item_df.index)
    self.movie_ids_series = np.array(self.user_item_df.columns)

    # Random starting point for both factor matrices
    user_mat = np.random.rand(self.n_users, self.latent_features)
    movie_mat = np.random.rand(self.latent_features, self.n_movies)

    # Progress table header
    print("Optimizaiton Statistics")
    print("Iterations | Mean Squared Error ")

    for epoch in range(1, self.iters + 1):
        # sum of squared errors accumulated during this pass
        sse_accum = 0

        for row in range(self.n_users):
            for col in range(self.n_movies):
                actual = self.user_item_mat[row, col]

                # nan (unrated) and non-positive cells are not trained on
                if not actual > 0:
                    continue

                # error = actual rating minus the current prediction
                err = actual - np.dot(user_mat[row, :], movie_mat[:, col])
                sse_accum += err ** 2

                # Gradient step on the whole user row, then on the movie
                # column; the movie step sees the already-updated user row.
                user_mat[row, :] += self.learning_rate * (2 * err * movie_mat[:, col])
                movie_mat[:, col] += self.learning_rate * (2 * err * user_mat[row, :])

        # report the mean squared error of this pass
        print("%d \t\t %f" % (epoch, sse_accum / self.num_ratings))

    # SVD based fit: keep the learned factors
    self.user_mat = user_mat
    self.movie_mat = movie_mat

    # Knowledge based fit
    self.ranked_movies = rf.create_ranked_df(self.movies, self.reviews)
def fit(self, reviews_loc, movies_loc, latent_features=15, n_iter=100, learning_rate=0.001):
    '''
    Fit the recommender to the dataset and keep the results on the instance
    so they can be used later when making predictions.

    :param reviews_loc: path to the reviews csv (str); needs the columns
        'user_id', 'movie_id', 'rating', 'timestamp'
    :param movies_loc: path to the movies csv (str)
    :param latent_features: number of latent features to keep (int)
    :param n_iter: number of iterations (int)
    :param learning_rate: the learning rate (float)
    :returns None: the following attributes are stored instead:
        movies, reviews - dataframes read from the csv files, without the
            'Unnamed: 0' column when one was present
        train_df - the four review columns used for training
        user_item_df / user_item_matrix - user-by-movie ratings as a
            dataframe / numpy array (nan where there is no rating)
        latent_features, learning_rate, iter - the hyper-parameters
        n_users, n_movies, n_ratings - matrix dimensions and number of
            non-nan ratings
        movie_ids, user_ids - numpy arrays of the column / row labels
        user_mat, movie_mat - the learned FunkSVD factor matrices
        ranked_movies - result of rf.create_ranked_df(movies, reviews)
    '''
    # Read in the datasets
    self.movies = pd.read_csv(movies_loc)
    self.reviews = pd.read_csv(reviews_loc)

    # Drop the 'Unnamed: 0' column (presumably a saved index) when it is
    # there; errors='ignore' keeps files written without it from raising
    # KeyError.
    self.movies = self.movies.drop(columns=['Unnamed: 0'], errors='ignore')
    self.reviews = self.reviews.drop(columns=['Unnamed: 0'], errors='ignore')

    # Create user-by-item matrix
    self.train_df = self.reviews[['user_id', 'movie_id', 'rating', 'timestamp']]
    self.user_item_df = self.train_df.groupby(
        ['user_id', 'movie_id'])['rating'].max().unstack()
    self.user_item_matrix = np.array(self.user_item_df)

    self.latent_features = latent_features
    self.learning_rate = learning_rate
    self.iter = n_iter

    # Set up useful values to be used through the rest of the function
    self.n_users = self.user_item_matrix.shape[0]
    self.n_movies = self.user_item_matrix.shape[1]
    self.n_ratings = np.count_nonzero(~np.isnan(self.user_item_matrix))
    self.movie_ids = np.array(self.user_item_df.columns)
    self.user_ids = np.array(self.user_item_df.index)

    # initialize the user and movie matrices with random values
    user_mat = np.random.rand(self.n_users, self.latent_features)
    movie_mat = np.random.rand(self.latent_features, self.n_movies)

    # The rating matrix does not change while training, so find the rated
    # cells once instead of scanning every user/movie pair per iteration.
    # argwhere walks the matrix row by row, i.e. in the same order as a
    # nested user/movie loop; nan > 0 is False, so missing ratings drop out.
    rated_cells = np.argwhere(self.user_item_matrix > 0).tolist()

    # header for running results
    print("Optimization Statistics")
    print("Iterations | Mean Squared Error ")

    for i in range(n_iter):
        # sum of squared errors for this iteration
        sse_accum = 0

        for user, movie in rated_cells:
            # error = actual rating minus the dot product of the user and
            # movie latent features
            prediction = np.dot(user_mat[user], movie_mat[:, movie])
            diff = self.user_item_matrix[user, movie] - prediction
            sse_accum += diff**2

            # update both matrices in the direction of the gradient; the
            # movie update uses the freshly updated user row
            user_mat[user] += learning_rate * 2 * diff * movie_mat[:, movie]
            movie_mat[:, movie] += learning_rate * 2 * diff * user_mat[user]

        # print results for iteration
        print("%d \t\t %f" % (i + 1, sse_accum / self.n_ratings))

    # FunkSVD solution: store the user and movie matrices
    self.user_mat = user_mat
    self.movie_mat = movie_mat

    # Knowledge based solution
    self.ranked_movies = rf.create_ranked_df(self.movies, self.reviews)
def fit(self, trainpath, moviepath, latent_features=15, learning_rate=0.005, iters=100):
    '''
    Fit the recommender to the training data and keep the results on the
    instance for later predictions.

    INPUT:
    trainpath - path to the training reviews csv; needs the columns
                'user_id', 'movie_id', 'rating', 'timestamp'
    moviepath - path to the movies csv
    latent_features - (int) number of latent features (default 15)
    learning_rate - (float) the learning rate (default 0.005)
    iters - (int) number of iterations (default 100)

    OUTPUT:
    None

    attributes set:
    train_df - reviews dataframe
    movies - movies dataframe
    train_data_df - user-by-movie dataframe of ratings (unstacked)
    ratings_mat - train_data_df as a numpy array, nan where unrated
    n_users - number of rows of ratings_mat
    n_movies - number of columns of ratings_mat
    num_ratings - number of non-nan entries of ratings_mat
    user_mat - (numpy array) a user by latent feature matrix
    movie_mat - (numpy array) a latent feature by movie matrix
    ranked_movies - result of rf.create_ranked_df(movies, train_df)
    '''
    self.train_df = pd.read_csv(trainpath)
    self.movies = pd.read_csv(moviepath)

    # Build the user-by-item table: one row per user, one column per movie
    review_cols = self.train_df[['user_id', 'movie_id', 'rating', 'timestamp']]
    per_pair_rating = review_cols.groupby(['user_id', 'movie_id'])['rating'].max()
    self.train_data_df = per_pair_rating.unstack()
    self.ratings_mat = np.array(self.train_data_df)

    # Sizes reused below
    self.n_users, self.n_movies = self.ratings_mat.shape
    self.num_ratings = np.count_nonzero(~np.isnan(self.ratings_mat))

    # Random initial factors
    user_mat = np.random.rand(self.n_users, latent_features)
    movie_mat = np.random.rand(latent_features, self.n_movies)

    # Progress table header
    print("Optimizaiton Statistics")
    print("Iterations | Mean Squared Error ")

    for iteration in range(iters):
        # squared error collected during this iteration
        sse_accum = 0

        # ndenumerate visits every cell row by row, like a nested i/j loop
        for (i, j), rating in np.ndenumerate(self.ratings_mat):
            # only cells holding a positive rating take part (nan > 0 is False)
            if rating > 0:
                # actual rating minus the predicted one
                diff = rating - np.dot(user_mat[i, :], movie_mat[:, j])
                sse_accum += diff**2

                # step both factor vectors along the gradient; the movie
                # update already sees the new user vector
                user_mat[i, :] += learning_rate * (2 * diff * movie_mat[:, j])
                movie_mat[:, j] += learning_rate * (2 * diff * user_mat[i, :])

        # print results
        print("%d \t\t %f" % (iteration + 1, sse_accum / self.num_ratings))

    self.user_mat = user_mat
    self.movie_mat = movie_mat
    self.ranked_movies = rf.create_ranked_df(self.movies, self.train_df)
def fit(self, movies_path, reviews_path, latent_features=15, learning_rate=0.001, iters=50):
    """
    Fit the recommender to the dataset, using FunkSVD and the
    knowledge-based approach.

    Args:
        movies_path: Path of the CSV file with the movies data
        reviews_path: Path of the CSV file with the reviews (ratings) data;
            must contain the columns 'user_id', 'movie_id', 'rating',
            'timestamp'
        latent_features: Number of latent features (for FunkSVD)
        learning_rate: Learning rate for FunkSVD
        iters: Number of FunkSVD iterations

    Returns:
        None - stores the following attributes
        movies - DataFrame of movies
        reviews - DataFrame of reviews
        train_data_df - user-by-movie DataFrame of ratings
        train_data_np - (np array) train_data_df as an array, with nans
            where there is no rating
        n_users - the number of users (int)
        n_movies - the number of movies (int)
        num_ratings - the number of ratings made (int)
        latent_features, learning_rate, iters - the values passed in
        user_mat - matrix with users as rows and latent features as columns
        movie_mat - matrix with latent features as rows and movies as columns
        ranked_movies - result of rf.create_ranked_df(movies, reviews)
    """
    # Read in the data
    self.movies = pd.read_csv(movies_path)
    self.reviews = pd.read_csv(reviews_path)

    # Create user-by-item matrix
    train_user_item = self.reviews[['user_id', 'movie_id', 'rating', 'timestamp']]
    self.train_data_df = train_user_item.groupby(
        ['user_id', 'movie_id'])['rating'].max().unstack()
    self.train_data_np = np.array(self.train_data_df)

    # Set up useful values to be used through the rest of the function
    self.n_users = self.train_data_np.shape[0]
    self.n_movies = self.train_data_np.shape[1]
    self.num_ratings = np.count_nonzero(~np.isnan(self.train_data_np))

    # Store more inputs
    self.latent_features = latent_features
    self.learning_rate = learning_rate
    self.iters = iters

    # initialize the user and movie matrices with random values
    user_mat = np.random.rand(self.n_users, self.latent_features)
    movie_mat = np.random.rand(self.latent_features, self.n_movies)

    # The rating matrix is fixed during training, so the rated cells are
    # located once rather than re-scanning all user/movie pairs on every
    # iteration. argwhere returns them row by row, the same order a nested
    # user/movie loop would visit them; nan > 0 is False, so unrated cells
    # are left out.
    rated_cells = np.argwhere(self.train_data_np > 0).tolist()

    # keep track of iteration and MSE
    print("Optimization Statistics")
    print("Iterations | Mean Squared Error ")

    for iteration in range(iters):
        # sum of squared errors for this iteration
        sse_accum = 0

        for i, j in rated_cells:
            # error = actual rating minus the dot product of the user and
            # movie latent features
            diff = self.train_data_np[i, j] - np.dot(user_mat[i, :], movie_mat[:, j])
            sse_accum += diff**2

            # update every latent feature in the direction of the gradient;
            # the movie update uses the already-updated user row, exactly as
            # a feature-by-feature loop would
            user_mat[i, :] += learning_rate * (2 * diff * movie_mat[:, j])
            movie_mat[:, j] += learning_rate * (2 * diff * user_mat[i, :])

        # print results
        print("%d \t\t %f" % (iteration + 1, sse_accum / self.num_ratings))

    # SVD approach:
    self.user_mat = user_mat
    self.movie_mat = movie_mat

    # Knowledge-based approach:
    self.ranked_movies = rf.create_ranked_df(self.movies, self.reviews)
def fit(self, reviews_pth, movies_pth, latent_features=5, iters=100, learning_rate=0.001):
    '''
    Fit the recommender to the dataset (FunkSVD and knowledge based) and
    keep the results on the instance for later predictions.

    The matrix factorization is a FunkSVD without regularization.

    INPUT:
    reviews_pth - path to csv with at least the four columns:
                  'user_id', 'movie_id', 'rating', 'timestamp'
    movies_pth - path to the movies csv
    latent_features - (int) the number of latent features used
    iters - (int) the number of iterations
    learning_rate - (float) the learning rate

    OUTPUT:
    None - stores the following attributes:
    reviews - dataframe read from reviews_pth
    movies - dataframe read from movies_pth
    user_item_df - user-by-movie dataframe of ratings
    user_item_mat - (np array) user_item_df as an array, with nans where
                    there is no rating
    latent_features - the number of latent features used
    learning_rate - the learning rate
    iters - the number of iterations
    n_users - the number of users (int)
    n_movies - the number of movies (int)
    num_ratings - the number of ratings made (int)
    user_ids_series, movie_ids_series - (np arrays) row / column labels
    user_mat - (np array) n_users x latent_features
    movie_mat - (np array) latent_features x n_movies
    ranked_movies - result of rf.create_ranked_df(movies, reviews)
    '''
    # Store inputs as attributes
    self.reviews = pd.read_csv(reviews_pth)
    self.movies = pd.read_csv(movies_pth)

    # Create user-item matrix
    usr_itm = self.reviews[['user_id', 'movie_id', 'rating', 'timestamp']]
    self.user_item_df = usr_itm.groupby(['user_id', 'movie_id'])['rating'].max().unstack()
    self.user_item_mat = np.array(self.user_item_df)

    # Store more inputs
    self.latent_features = latent_features
    self.learning_rate = learning_rate
    self.iters = iters

    # Set up useful values to be used through the rest of the function.
    # Movies are the COLUMNS of the matrix, so n_movies is shape[1]
    # (shape[0] would size movie_mat by the number of users).
    self.n_users = self.user_item_mat.shape[0]
    self.n_movies = self.user_item_mat.shape[1]
    self.num_ratings = np.count_nonzero(~np.isnan(self.user_item_mat))
    self.user_ids_series = np.array(self.user_item_df.index)
    self.movie_ids_series = np.array(self.user_item_df.columns)

    # initialize the user and movie matrices with random values
    user_mat = np.random.rand(self.n_users, self.latent_features)
    movie_mat = np.random.rand(self.latent_features, self.n_movies)

    # The rating matrix is fixed during training: find the rated cells once.
    # argwhere lists them row by row (same order as a nested user/movie
    # loop); nan > 0 is False, so unrated cells are skipped.
    rated_cells = np.argwhere(self.user_item_mat > 0).tolist()

    # keep track of the iteration and MSE
    print('Optimization Statistics')
    print('Iterations | Mean Squared Error')

    for iteration in range(self.iters):
        # reset the sum of squared errors for this iteration
        sse_accum = 0

        for i, j in rated_cells:
            # error = actual rating minus the dot product of the user and
            # movie latent features
            diff = self.user_item_mat[i, j] - np.dot(user_mat[i, :], movie_mat[:, j])
            sse_accum += diff**2

            # update the values in each matrix in the direction of the
            # gradient; the movie update uses the already-updated user row
            user_mat[i, :] += self.learning_rate * (2 * diff * movie_mat[:, j])
            movie_mat[:, j] += self.learning_rate * (2 * diff * user_mat[i, :])

        # print results
        print("%d \t\t %f" % (iteration + 1, sse_accum / self.num_ratings))

    # SVD based fit: keep user_mat and movie_mat for safe keeping
    self.user_mat = user_mat
    self.movie_mat = movie_mat

    # Knowledge based fit
    self.ranked_movies = rf.create_ranked_df(self.movies, self.reviews)
def fit(self, reviews_path, movies_path, latent_features=15, learning_rate=0.0001, iterations=250):
    """
    Fit the recommender: run a basic FunkSVD (no regularization) on the
    user-item rating matrix and build the ranked-movies table.

    INPUT:
    reviews_path - (string) path to the reviews csv; needs the columns
                   'user_id', 'movie_id', 'rating', 'timestamp'
    movies_path - (string) path to the movies csv
    latent_features - (int) the number of latent features used
    learning_rate - (float) the learning rate
    iterations - (int) the number of iterations

    OUTPUT:
    None - the following attributes are set:
    reviews, movies - dataframes read from the csv files
    latent_features, learning_rate, iterations - the values passed in
    user_item_df - user-by-movie dataframe of ratings
    user_item_matrix - user_item_df as a numpy array, nan where unrated
    amount_users, amount_movies - rows / columns of user_item_matrix
    amount_ratings - number of non-nan entries of user_item_matrix
    user_ids_series, movie_ids_series - numpy arrays of row / column labels
    user_matrix - amount_users x latent_features factor matrix
    movie_matrix - latent_features x amount_movies factor matrix
    ranked_movies - result of rf.create_ranked_df(movies, reviews)
    """
    self.reviews = pd.read_csv(reviews_path)
    self.movies = pd.read_csv(movies_path)

    self.latent_features = latent_features
    self.learning_rate = learning_rate
    self.iterations = iterations

    # user-item matrix for collaborative filtering
    review_columns = self.reviews[['user_id', 'movie_id', 'rating', 'timestamp']]
    grouped_ratings = review_columns.groupby(['user_id', 'movie_id'])['rating']
    self.user_item_df = grouped_ratings.max().unstack()
    self.user_item_matrix = np.array(self.user_item_df)

    self.amount_users, self.amount_movies = self.user_item_matrix.shape
    self.amount_ratings = np.count_nonzero(~np.isnan(self.user_item_matrix))
    self.user_ids_series = np.array(self.user_item_df.index)
    self.movie_ids_series = np.array(self.user_item_df.columns)

    # random starting factors for FunkSVD
    user_matrix = np.random.rand(self.amount_users, self.latent_features)
    movie_matrix = np.random.rand(self.latent_features, self.amount_movies)

    # progress table header
    print("Optimizaiton Statistics")
    print("Iterations | Mean Squared Error ")

    for iteration in range(self.iterations):
        # squared error summed over this iteration
        squared_error_total = 0

        # walk the matrix user by user, movie by movie
        for user_idx, user_ratings in enumerate(self.user_item_matrix):
            for movie_idx, rating in enumerate(user_ratings):
                # unrated cells are nan, and nan > 0 is False
                if not rating > 0:
                    continue

                # actual rating minus the predicted rating
                predicted = np.dot(user_matrix[user_idx, :], movie_matrix[:, movie_idx])
                difference = rating - predicted
                squared_error_total += difference**2

                # gradient step on all latent features at once; the movie
                # step uses the user vector that was just updated
                user_matrix[user_idx, :] += self.learning_rate * (
                    2 * difference * movie_matrix[:, movie_idx])
                movie_matrix[:, movie_idx] += self.learning_rate * (
                    2 * difference * user_matrix[user_idx, :])

        print("%d \t\t %f" % (iteration + 1, squared_error_total / self.amount_ratings))

    # SVD based fit
    self.user_matrix = user_matrix
    self.movie_matrix = movie_matrix

    # Knowledge based fit
    self.ranked_movies = rf.create_ranked_df(self.movies, self.reviews)
def fit(self, movies_pth, reviews_pth, latent_features=None, learning_rate=None,
        iters=None, max_reviews=60000):
    '''
    This function performs matrix factorization using a basic form of
    FunkSVD with no regularization, then builds the ranked-movies table.

    Params:
    --------
    movies_pth : path to csv with each movie and movie information in
        each row
    reviews_pth : path to csv with at least the columns 'user_id',
        'movie_id', 'rating'
    latent_features : (int, optional) the number of latent features used
    learning_rate : (float, optional) the learning rate
    iters : (int, optional) the number of iterations
    max_reviews : (int or None) only the first max_reviews rows of the
        reviews csv are used; None keeps every row (default 60000)

    latent_features, learning_rate and iters are stored on the instance
    when given. When left as None the attribute of the same name must
    already exist (presumably set by the constructor - not visible from
    this method), otherwise an AttributeError is raised.

    Returns:
    --------
    None

    Stores the following as attributes:
    reviews : dataframe of the (truncated) reviews
    movies : dataframe of movies
    user_item_df : user-by-movie dataframe of ratings
    user_item_mat : (np array) a user by item numpy array with ratings
        and nans for values
    n_users : the number of users (int)
    n_movies : the number of movies (int)
    num_ratings : the number of ratings made (int)
    user_mat : (np array) n_users x latent_features
    movie_mat : (np array) latent_features x n_movies
    ranked_movies : result of rf.create_ranked_df(movies, reviews)
    '''
    # Hyper-parameters passed explicitly override the stored ones
    if latent_features is not None:
        self.latent_features = latent_features
    if learning_rate is not None:
        self.learning_rate = learning_rate
    if iters is not None:
        self.iters = iters

    # Store inputs as attributes (a None slice bound keeps all rows)
    self.reviews = pd.read_csv(reviews_pth)[:max_reviews]
    self.movies = pd.read_csv(movies_pth)

    # Create user-item matrix
    usr_itm = self.reviews[['user_id', 'movie_id', 'rating']]
    self.user_item_df = usr_itm.groupby(['user_id', 'movie_id'])['rating'].max().unstack()
    self.user_item_mat = self.user_item_df.values

    # the column subset is no longer needed; release it before training
    del usr_itm
    gc.collect()

    # Set up useful values to be used through the rest of the function
    self.n_users = self.user_item_mat.shape[0]
    self.n_movies = self.user_item_mat.shape[1]
    self.num_ratings = np.count_nonzero(~np.isnan(self.user_item_mat))

    # TODO: store the user ids (user_item_df.index) and movie ids
    # (user_item_df.columns) as attributes

    # initialize the user and movie matrices with random values
    user_mat = np.random.rand(self.n_users, self.latent_features)
    movie_mat = np.random.rand(self.latent_features, self.n_movies)

    # The rating matrix is fixed during training: find the rated cells once
    # instead of scanning every user/movie pair on every iteration.
    # argwhere lists them row by row (same order as a nested user/movie
    # loop); nan > 0 is False, so unrated cells are skipped.
    rated_cells = np.argwhere(self.user_item_mat > 0).tolist()

    # keep track of iteration and MSE
    print("Optimization Statistics")
    print("Iterations | Mean Squared Error ")

    start_time = time.perf_counter()

    for iteration in range(self.iters):
        # reset the sum of squared errors for this iteration
        sse_accum = 0

        for i, j in rated_cells:
            # error = actual rating minus the dot product of the user and
            # movie latent features
            diff = self.user_item_mat[i, j] - np.dot(user_mat[i, :], movie_mat[:, j])
            sse_accum += diff ** 2

            # update the values in each matrix in the direction of the
            # gradient; the movie update uses the already-updated user row
            user_mat[i, :] += self.learning_rate * (2 * diff * movie_mat[:, j])
            movie_mat[:, j] += self.learning_rate * (2 * diff * user_mat[i, :])

        # print results; the time is the total elapsed since training began
        print("%d \t\t %f" % (iteration + 1, sse_accum / self.num_ratings))
        print('Update time:', time.perf_counter() - start_time)

    # SVD based fit: keep user_mat and movie_mat for safe keeping
    self.user_mat = user_mat
    self.movie_mat = movie_mat

    # Knowledge based fit
    self.ranked_movies = rf.create_ranked_df(self.movies, self.reviews)