def generate_prediction(training_file, testing_file, all_songs):
    """Queue one recommendation job per test user on a 4-process pool.

    A predictor is built from ``training_file``, wrapped in a
    ``recommender.Recommender`` over ``all_songs`` (with ``_k=500``), and
    ``parallel_rec_worker`` is scheduled asynchronously for every user found
    in ``testing_file``. Each finished job is handed to ``log_result``.
    The function blocks until every queued job has completed and returns
    nothing.
    """
    # Item-based prediction is the active variant. To switch to user-based
    # prediction, comment out the next two statements and enable the two
    # commented ones below them.
    song_users = utilities.song_to_users(training_file)  # dict songs:{users}
    predictor = prediction.ItemBasedPrediction(song_users, _sim=0)
    # user_songs = utilities.user_to_songs(training_file)
    # predictor = prediction.UserBasedPrediction(user_songs)

    # the recommender
    rec = recommender.Recommender(all_songs, predictor, _k=500)
    test_user_songs = utilities.user_to_songs(testing_file)

    workers = Pool(4)
    # One asynchronous job per test user: songs they would like to listen to
    # according to our recommender.
    for uid in test_user_songs.keys():
        job_args = (uid, rec, test_user_songs)
        workers.apply_async(parallel_rec_worker, args=job_args,
                            callback=log_result)
    print('finished applying')

    # No more jobs will be submitted; wait for the queued ones to drain.
    workers.close()
    print('will join when finished evaluating all users...')
    workers.join()
    print('finished jobs...')
def test_all():
    """Print ``recommend_me`` rankings and the top shared interests.

    Ranks eight sample preference profiles against a reference profile,
    once with ``intersection=True`` and once with ``intersection=False``,
    then prints the best match together with its three strongest shared
    interests.
    """
    engine = recommender.Recommender()
    candidates = [
        {},
        {"vegan": 5, "halal": 1},
        {"american": 4, "mexican": 3, "japanese": 2},
        {"thai": 4, "mexican": 1, "japanese": 4},
        {},
        {"thai": -3, "japanese": -2, "american": -1},
        {"thai": 113, "japanese": 112, "american": 111},
        {"thai": 12, "japanese": 8, "american": 4},
    ]
    target = {"thai": 5, "japanese": 2, "american": 1}

    for use_intersection in (True, False):
        ranking = engine.recommend_me(target, candidates, 8,
                                      intersection=use_intersection,
                                      with_keys=False)
        print(ranking)

    # most similar user with strongest shared interests
    best = engine.recommend_me(target, candidates, 1,
                               intersection=True, with_keys=False)[0]
    print(best, engine.find_most_shared_interests(target, candidates[best], 3))
def test_find_similarity():
    """Print ``find_similarity`` for an overlapping pair and for an empty profile."""
    engine = recommender.Recommender()

    # Two profiles that share the "vegan" key.
    overlapping = engine.find_similarity({"vegan": 1},
                                         {"vegan": 5, "halal": 1})
    print(overlapping)

    # Second profile has no entries at all.
    against_empty = engine.find_similarity({"vegan": 1}, {})
    print("empty case:", against_empty)
def test_array_average(self):
    """Check ``Recommender.array_average`` on a small hand-built array.

    The expectations below treat each input row as an ``(id, value)`` pair:
    id 23 appears with values 1 and 4 (average 2.5), id 2 appears once with
    value 4, and ids that never appear (0, 3, 100) must come back as NaN.
    The result is expected to have ``total_entries + 1`` slots.
    """
    pairs = np.array([[2, 4], [1, 5], [67, 2], [23, 1], [23, 4]])
    total_entries = 100

    rec = recommender.Recommender(None)
    result = rec.array_average(pairs, total_entries)

    self.assertEqual(len(result), total_entries + 1)
    self.assertEqual(result[23], 2.5)
    self.assertEqual(result[2], 4)
    # Ids with no rows in ``pairs`` have no average.
    for absent_id in (3, 0, 100):
        self.assertTrue(np.isnan(result[absent_id]))
def test_find_most_shared_interests():
    """Print ``find_most_shared_interests`` for three sample profile pairs.

    Each case is ``(first profile, second profile, count argument)``.
    """
    engine = recommender.Recommender()
    cases = (
        # partial overlap, count 1
        ({"thai": 3, "japanese": 2, "american": 1},
         {"vegan": 1, "japanese": 4},
         1),
        # second profile is empty
        ({"thai": 3, "japanese": 2, "american": 1},
         {},
         3),
        # several shared keys
        ({"american": 4, "mexican": 3, "japanese": 2},
         {"thai": 4, "mexican": 1, "japanese": 4, "american": 4},
         3),
    )
    for mine, theirs, count in cases:
        print(engine.find_most_shared_interests(mine, theirs, count))
import time

import numpy as np

import recommender

start_time = time.time()


def _elapsed():
    """Return the number of seconds since ``start_time`` was recorded."""
    return time.time() - start_time


# initializing
print('[%.2fs] Initializing...' % _elapsed())
rec_system = recommender.Recommender()

# training: summarise the training ratings before fitting
print('\nTraining data:')
train_ratings = rec_system.ratings_train
print('-> Number of ratings: %s' % len(train_ratings.data))
distinct_users = np.unique(train_ratings.row)
print('-> Number of distinct users: %s' % len(distinct_users))
distinct_items = np.unique(train_ratings.col)
print('-> Number of distinct items: %s' % len(distinct_items))
print('-> Number of latent factors: %d' % rec_system.num_user_factors)

print('\n[%.2fs] Training...' % _elapsed())
rec_system.train()

# dump the factors produced by train()
print('\nLearned values:')
for heading, attr_name in (('\n-> User factors:', 'user_factors'),
                           ('\n-> Item factors:', 'item_factors')):
    print(heading)
    print(getattr(rec_system, attr_name))

# testing
from flask import Flask, request, render_template, session
app = Flask(__name__)
import pandas as pd
import numpy as np
import graphlab as gl
from pymongo import MongoClient
import time
from pprint import pprint
from flask import Flask
import recommender as rec
import info as info
from dispatcher import add_job

# NOTE(review): the session signing key is hard-coded here — consider loading
# it from configuration instead.
app.secret_key = 'datascience'
# Single module-level recommender instance, created at import time.
model = rec.Recommender()

# Post Data Request:
# @app.route("/")
@app.route('/')
@app.route('/index')
def index():
    """Clear the per-visitor session state and render the home page.

    Sets the ``user_id``, ``profile`` and ``rated`` session entries to
    ``None`` and returns the rendered ``home.html`` template.
    """
    session["user_id"] = None
    session["profile"] = None
    session["rated"] = None
    return render_template('home.html')

@app.route('/recs', methods=['GET'])
def show_five():
    if session["user_id"] == None:
from flask import Flask, render_template, request
import recommender

app = Flask(__name__)
# Single module-level recommender instance, created at import time.
rcmdr = recommender.Recommender()

@app.route('/')
def main():
    """Render ``index.html`` with the reader list as ``option_list``.

    The list comes from ``rcmdr.getListOfReaders()``.
    """
    reader_list = rcmdr.getListOfReaders()
    # reader_list = ["Hello" , "World"]
    # print(reader_list)
    return render_template('index.html', option_list=reader_list)

@app.route("/get_purchased", methods=['POST'])
def get_input():
    """Return the purchased items for the customer id sent in the POST body.

    The raw request body is used as the customer id and passed to
    ``rcmdr.getPurchased_Items``; whatever that call returns is sent back
    as the response.
    """
    customerID = request.data
    # NOTE(review): on Python 3, Flask's request.data is bytes, so comparing
    # it with '' is presumably always true — confirm the intended check.
    if (customerID != ''):
        actualPurchased_list = rcmdr.getPurchased_Items(customerID)
    # NOTE(review): if the branch above is skipped, actualPurchased_list is
    # never assigned and this return raises UnboundLocalError — confirm.
    return actualPurchased_list

@app.route("/get_recommender", methods=['POST'])
def get_recommender():
    customerID = request.data
    # print("customer ID is {} type is {}".format(customerID, type(customerID)))
    if (customerID != ''):
        recommended_list = rcmdr.getRec_Items(customerID)
        # actualPurchased_list = rcmdr.getPurchased_Items(customerID)
b = recommendObject.addClub('Not in it club', 'something', 3,"") c = recommendObject.addClub("Random Other Club", 'something', 4,"") u101.addClub("Common Club", recommendObject) u101.addClub("Random Other Club", recommendObject) u102.addClub("Common Club", recommendObject) u102.addClub("Not in it club", recommendObject) u102.addClub("Random Other Club", recommendObject) c = recommendObject.createClubRecommendation(101) assert(c.getDestination().getClubName() == "Not in it club") print("The assertion passed") return None def tryAddingExcelClubs(): recommendObject.addExcelClubs() print("Added the clubs from excel") return None def clubBasedTests(): miniDataSet() smallDataSet() largeDataSet() caseForNoRelated() checkNotReturningClubAlreadyIn() return 0 recommendObject = recommender.Recommender() clubBasedTests()
# Read the parsed command-line options into local names. Each option below
# gates one section of the script; a falsy value skips that section.
options = parser.parse_args()
do_ingest = options.ingest
user_id = options.user_id
top_rated_songs = options.top_rated_songs
top_played_songs = options.top_played_songs
get_added_songs = options.get_added_songs
recommend_songs = options.recommend_songs

# Data ingestion: print a banner, then run ingesta.ingest().
if do_ingest:
    print '\n' * 2 + '*' * 10 + ' DATA INGESTION ' + '*' * 10
    ingesta.ingest()

# Top rated songs: the option value is passed straight to top_rated()
# and every returned result is printed on its own line.
if top_rated_songs:
    print '\n' * 2 + '*' * 10 + ' TOP RATED SONGS ' + '*' * 10
    r = recommender.Recommender()
    results = r.top_rated(top_rated_songs)
    for result in results:
        print result

# Top played songs: same pattern, using top_listened().
if top_played_songs:
    print '\n' * 2 + '*' * 10 + ' TOP PLAYED SONGS ' + '*' * 10
    r = recommender.Recommender()
    results = r.top_listened(top_played_songs)
    for result in results:
        print result

# Songs added by a user: requires both the flag and a user id. The
# 'playlists' entry is read from the first element returned by user_based().
if get_added_songs and user_id:
    print '\n' * 2 + '*' * 10 + ' SONGS ADDED BY USER ' + '*' * 10
    r = recommender.Recommender()
    playlists = r.user_based(user_id)[0].get('playlists')
class Recommender:
    """
    This Recommender uses FunkSVD to make predictions of exact ratings,
    and uses either FunkSVD or a Knowledge Based recommendation (highest
    ranked) to make recommendations for users. Finally, if given a movie,
    the recommender will provide movies that are most similar as a
    Content Based Recommender.
    """

    def __init__(self):
        pass

    def fit(self, reviews_pth, movies_pth, latent_features=12,
            learning_rate=0.0001, iters=100):
        """
        Perform matrix factorization using a basic form of FunkSVD with
        no regularization, and build the ranked-movies table used as the
        knowledge based fallback.

        INPUT:
        reviews_pth - path to csv with at least the four columns:
                      'user_id', 'movie_id', 'rating', 'timestamp'
        movies_pth - path to csv with each movie and movie information
                     in each row
        latent_features - (int) the number of latent features used
        learning_rate - (float) the learning rate
        iters - (int) the number of iterations

        OUTPUT:
        None - stores the following as attributes:
        reviews - dataframe read from reviews_pth
        movies - dataframe read from movies_pth
        user_item_df - user by movie dataframe of ratings (max rating per
                       user-movie pair), nan where there is no rating
        user_item_mat - (np array) the same data as a numpy array
        n_users - the number of users (int)
        n_movies - the number of movies (int)
        num_ratings - the number of non-nan ratings (int)
        user_ids_series - (np array) user ids in row order
        movie_ids_series - (np array) movie ids in column order
        latent_features - (int) the number of latent features used
        learning_rate - (float) the learning rate
        iters - (int) the number of iterations
        user_mat - (np array) learned user by latent-feature matrix
        movie_mat - (np array) learned latent-feature by movie matrix
        ranked_movies - result of rf.create_ranked_df(movies, reviews)
        """
        # Store inputs as attributes
        self.reviews = pd.read_csv(reviews_pth)
        self.movies = pd.read_csv(movies_pth)

        # Create user-item matrix
        usr_itm = self.reviews[['user_id', 'movie_id', 'rating', 'timestamp']]
        self.user_item_df = usr_itm.groupby(
            ['user_id', 'movie_id'])['rating'].max().unstack()
        self.user_item_mat = np.array(self.user_item_df)

        # Store more inputs
        self.latent_features = latent_features
        self.learning_rate = learning_rate
        self.iters = iters

        # Set up useful values to be used through the rest of the function
        self.n_users = self.user_item_mat.shape[0]
        self.n_movies = self.user_item_mat.shape[1]
        self.num_ratings = np.count_nonzero(~np.isnan(self.user_item_mat))
        self.user_ids_series = np.array(self.user_item_df.index)
        self.movie_ids_series = np.array(self.user_item_df.columns)

        # initialize the user and movie matrices with random values
        user_mat = np.random.rand(self.n_users, self.latent_features)
        movie_mat = np.random.rand(self.latent_features, self.n_movies)

        # Locate the cells that hold a rating once, instead of re-scanning
        # the whole (mostly nan) matrix on every iteration. np.argwhere
        # yields indices in row-major order, i.e. the same visiting order as
        # a nested "for user / for movie" loop. nan > 0 is False, so missing
        # ratings are skipped; errstate silences the nan-comparison warning
        # some numpy versions emit.
        with np.errstate(invalid='ignore'):
            rated_cells = np.argwhere(self.user_item_mat > 0)

        # keep track of iteration and MSE
        print("Optimizaiton Statistics")
        print("Iterations | Mean Squared Error ")

        # for each iteration
        for iteration in range(self.iters):
            # sum of squared errors accumulated over this pass
            sse_accum = 0

            # For each user-movie pair where the rating exists
            for i, j in rated_cells:
                # compute the error as the actual minus the dot product of
                # the user and movie latent features
                diff = self.user_item_mat[i, j] - np.dot(
                    user_mat[i, :], movie_mat[:, j])

                # Keep track of the sum of squared errors for the matrix
                sse_accum += diff**2

                # update the values in each matrix in the direction of the
                # gradient. The user row is updated first and the movie
                # column then uses the already-updated user row, exactly as
                # a per-factor loop doing both updates for each k would.
                user_mat[i, :] += self.learning_rate * (
                    2 * diff * movie_mat[:, j])
                movie_mat[:, j] += self.learning_rate * (
                    2 * diff * user_mat[i, :])

            # print results
            print("%d \t\t %f" %
                  (iteration + 1, sse_accum / self.num_ratings))

        # SVD based fit
        # Keep user_mat and movie_mat for safe keeping
        self.user_mat = user_mat
        self.movie_mat = movie_mat

        # Knowledge based fit
        self.ranked_movies = rf.create_ranked_df(self.movies, self.reviews)

    def predict_rating(self, user_id, movie_id):
        """
        Predict the rating a fitted user would give a fitted movie.

        INPUT:
        user_id - the user_id from the reviews df
        movie_id - the movie_id according the movies df

        OUTPUT:
        pred - the predicted rating for user_id-movie_id according to
               FunkSVD, or None (after printing an apology) when either id
               is not part of the fitted user-item matrix
        """
        try:
            # User row and Movie Column; [0][0] raises IndexError when the
            # id is not present in the fitted matrix.
            user_row = np.where(self.user_ids_series == user_id)[0][0]
            movie_col = np.where(self.movie_ids_series == movie_id)[0][0]
        except IndexError:
            print(
                "I'm sorry, but a prediction cannot be made for this user-movie pair.  It looks like one of these "
                "items does not exist in our current database.")
            return None

        # Take dot product of that row and column in U and V to make
        # prediction
        pred = np.dot(self.user_mat[user_row, :], self.movie_mat[:, movie_col])

        # Read the title straight from the frame; fall back to the id when
        # the movies table has no row for it.
        titles = self.movies.loc[self.movies['movie_id'] == movie_id, 'movie']
        movie_name = str(titles.iloc[0]) if len(titles) else str(movie_id)
        print(
            "For user {} we predict a {} rating for the movie {}.".format(
                user_id, round(pred, 2), movie_name))

        return pred

    def make_recommendations(self, _id, _id_type='movie', rec_num=5):
        """
        Recommend movies for a user, or movies similar to a given movie.

        INPUT:
        _id - either a user or movie id (int)
        _id_type - "movie" or "user" (str); any value other than "user" is
                   treated as a movie id
        rec_num - number of recommendations to return (int)

        OUTPUT:
        (rec_ids, rec_names) - tuple; rec_ids is an array of recommended
                   movie ids and is only filled in for a user found in the
                   fitted matrix (None otherwise); rec_names holds the
                   recommended movie names, or None when a movie id is
                   not in the fitted matrix
        """
        rec_ids, rec_names = None, None

        if _id_type == 'user':
            # if the user is available from the matrix factorization data,
            # rank movies based on the predicted values
            if _id in self.user_ids_series:
                # Get the index of which row the user is in for use in U
                # matrix
                idx = np.where(self.user_ids_series == _id)[0][0]

                # take the dot product of that row and the V matrix
                preds = np.dot(self.user_mat[idx, :], self.movie_mat)

                # pull the top movies according to the prediction,
                # highest predicted rating first
                indices = preds.argsort()[-rec_num:][::-1]
                rec_ids = self.movie_ids_series[indices]
                rec_names = rf.get_movie_names(rec_ids, self.movies)
            else:
                # if we don't have this user, give just top ratings back
                rec_names = rf.popular_recommendations(_id, rec_num,
                                                       self.ranked_movies)
                print(
                    "Because this user wasn't in our database, we are giving back the top movie recommendations for "
                    "all users.")

        # Find similar movies if it is a movie that is passed
        else:
            if _id in self.movie_ids_series:
                rec_names = list(rf.find_similar_movies(
                    _id, self.movies))[:rec_num]
            else:
                print(
                    "That movie doesn't exist in our database.  Sorry, we don't have any recommendations for you."
                )

        return rec_ids, rec_names


if __name__ == '__main__':
    import recommender as r

    # instantiate recommender
    rec = r.Recommender()

    # fit recommender
    rec.fit(reviews_pth='data/train_data.csv',
            movies_pth='data/movies_clean.csv',
            learning_rate=.01,
            iters=1)

    # predict
    rec.predict_rating(user_id=8, movie_id=2844)

    # make recommendations
    print(rec.make_recommendations(8, 'user'))  # user in the dataset
    print(rec.make_recommendations(1, 'user'))  # user not in dataset
    print(rec.make_recommendations(1853728))  # movie in the dataset
    print(rec.make_recommendations(1))  # movie not in dataset
    print(rec.n_users)
    print(rec.n_movies)
    print(rec.num_ratings)
# File-name suffixes; each is prefixed with 'train_' or 'test_' when loaded.
__ACTIVITY = "activity_v2.csv"
__DEAL_ITEMS = "dealitems.csv"
__DEAL_DETAILS = "deal_details.csv"

# Params
N_dealitems = 10


def _read_split(prefix):
    """Read the activity, deal-items and deal-details CSVs for one prefix.

    Files are read in that order and returned as a 3-tuple of frames.
    """
    return tuple(
        pd.read_csv(prefix + suffix)
        for suffix in (__ACTIVITY, __DEAL_ITEMS, __DEAL_DETAILS)
    )


# load raw data
activity_train, deal_items_train, deal_details_train = _read_split('train_')
activity_test, deal_items_test, deal_details_test = _read_split('test_')

# Run the same preprocessing on both splits; only the two groupings are kept
# from the test split.
processed_train = processing.get_proceed_data(
    activity_train, deal_items_train, deal_details_train)
full_data, grouped_by_users_train, grouped_by_dealitem_id_train = processed_train

processed_test = processing.get_proceed_data(
    activity_test, deal_items_test, deal_details_test)
_, grouped_by_users_test, grouped_by_dealitem_id_test = processed_test

# Latest 'create_time' seen in the training activity.
actual_time = activity_train['create_time'].max()
# acctual_time = 1406852020

model = r.Recommender(actual_time)
model.fit(
    full_data,
    grouped_by_users_train,
    grouped_by_dealitem_id_train,
    deal_items_train,
    deal_details_train,
    top_N_items=N_dealitems,
)
# NOTE(review): predict receives the *training* activity frame together with
# the test-split user grouping — confirm this pairing is intended.
model.predict(activity_train, grouped_by_users_test, distance_treshold=0.4)