import time

import pandas as pd
from surprise import Dataset, KNNWithMeans, Reader, accuracy, dump
from surprise.model_selection import cross_validate, train_test_split

# Note: import_reduced_reviews is a project-specific helper assumed to be
# defined or imported elsewhere in this module.


def train_surprise_model():
    # Import reduced dataset:
    df = import_reduced_reviews('C:/Users/lukas/OneDrive/Desktop/Reviews_Reduced.csv')
    df = df[['user_key', 'game_key', 'rating']]

    # Drop duplicate (user, game) ratings:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])

    ### Modelling part with Surprise:
    # Get the data into a format Surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Build the trainset from the whole dataset:
    trainsetfull = data.build_full_trainset()
    print('Number of users: ', trainsetfull.n_users, '\n')
    print('Number of items: ', trainsetfull.n_items, '\n')

    # Parameters:
    sim_option = {'name': 'cosine', 'user_based': False}
    k = 10
    min_k = 5

    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)

    # Run fit and time it:
    start_time = time.time()
    algo.fit(trainsetfull)
    print("--- %s seconds ---" % (time.time() - start_time))

    ### Test: is it possible to exchange the similarity matrix?
    # Note: KNNWithMeans indexes algo.sim by the trainset's inner item ids,
    # so the rows/columns of the imported matrix must follow that ordering
    # for the predictions to be meaningful.
    sim_matrix_imported = pd.read_csv(
        '../Data/Recommender/selfmade_item-item-similarity-matrix.csv',
        index_col=0)
    sim_matrix_imported.columns = sim_matrix_imported.columns.astype(int)
    sim_matrix_imported = sim_matrix_imported.to_numpy()

    # Predict once before and once after the swap to compare the estimates:
    a = algo.predict(93681, 100007)
    algo.sim = sim_matrix_imported
    b = algo.predict(93681, 100007)

    # We now need to save the similarity matrix somewhere:
    sim_matrix = algo.sim
    pd.DataFrame(sim_matrix).to_csv(
        '../Data/Recommender/sim_matrix-myKNNWithMeans_item_based_model')

    # Save the precomputed model:
    dump.dump('../Data/Recommender/myKNNWithMeans_item_based_model', algo)
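
# A minimal sketch (this helper is an assumption, not part of the original
# module): reload the model dumped above so it can be reused without
# refitting. surprise.dump.load returns a (predictions, algo) tuple.
def load_surprise_model(path='../Data/Recommender/myKNNWithMeans_item_based_model'):
    _, algo = dump.load(path)
    return algo
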
def collaborative_filtering_using_surprise():
    """
    https://towardsdatascience.com/how-to-build-a-memory-based-recommendation-system-using-python-surprise-55f3257b2cf4
    Predict games for the user with user_key = 93681.
    """
    target_user_key = 93681

    # Import reduced dataset:
    df = import_reduced_reviews()

    # Check for and drop duplicate (user, game) ratings:
    duplicates = len(df) - len(df.drop_duplicates(subset=['game_key', 'user_key']))
    df = df.drop_duplicates(subset=['game_key', 'user_key'])
    print('duplicates removed: ' + str(duplicates))

    # Check out our target user:
    df_target_user = df[df['user_key'] == target_user_key]

    # Build the utility matrix and calculate its sparsity:
    # data_pivot = df.pivot(index='user_key', columns='game_key', values='rating')
    # sparsity = data_pivot.isnull().sum().sum() / data_pivot.size
    # print('Sparsity of utility matrix: ' + str(sparsity))

    ### Modelling part with Surprise:
    # Get the data into a format Surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Split into trainset and testset:
    trainset, testset = train_test_split(data, test_size=0.2)
    print('Number of users: ', trainset.n_users, '\n')
    print('Number of items: ', trainset.n_items, '\n')

    # When Surprise creates a Trainset or Testset object, it takes the raw ids
    # (the ones used in the imported file) and converts them to so-called
    # inner ids (basically a series of integers starting from 0). You might
    # need to trace back to the original names. Using items as an example
    # (the same approach works for users, just swap iids for uids): to get
    # the list of inner iids, use the all_items method; to_inner_iid converts
    # a raw id to an inner id, and to_raw_iid converts back. An example of
    # saving a list of inner and raw item ids:
    trainset_iids = list(trainset.all_items())
    iid_converter = lambda x: trainset.to_raw_iid(x)
    trainset_raw_iids = list(map(iid_converter, trainset_iids))

    ## Model parameters of kNN:
    # Two hyperparameters we can tune:
    # 1. the k parameter
    # 2. the similarity options:
    #    a) user-user vs. item-item
    #    b) the similarity function (cosine, pearson, msd)
    sim_option = {'name': 'pearson', 'user_based': False}

    # Three different KNN models: KNNBasic, KNNWithMeans, KNNWithZScore.
    k = 40
    min_k = 5

    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)
    algo.fit(trainset)

    ## Testing:
    predictions = algo.test(testset)
    accuracy.rmse(predictions)

    # Swap in our own similarity matrix and evaluate again:
    sim_matrix_imported = pd.read_csv(
        '../Data/Recommender/selfmade_item-item-similarity-matrix.csv',
        index_col=0)
    sim_matrix_imported.columns = sim_matrix_imported.columns.astype(int)
    sim_matrix_imported = sim_matrix_imported.to_numpy()

    algo.sim = sim_matrix_imported

    predictions = algo.test(testset)
    accuracy.rmse(predictions)

    # Cross-validation:
    skip = True
    if not skip:
        results = cross_validate(algo=algo, data=data, measures=['RMSE'],
                                 cv=5, return_train_measures=True)
        results_mean = results['test_rmse'].mean()
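    # Optional hyperparameter search (a sketch, not part of the original
    # pipeline; it mirrors the skip pattern above). GridSearchCV from
    # surprise.model_selection can tune k and the similarity options jointly:
    skip_grid_search = True
    if not skip_grid_search:
        from surprise.model_selection import GridSearchCV
        param_grid = {'k': [10, 20, 40],
                      'min_k': [1, 5],
                      'sim_options': {'name': ['pearson', 'cosine'],
                                      'user_based': [False]}}
        gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=5)
        gs.fit(data)
        print('Best RMSE: ', gs.best_score['rmse'])
        print('Best params: ', gs.best_params['rmse'])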
    ## Predictions
    # Let's assume we are happy with the method and now want to apply it to
    # the entire dataset.

    # Estimate a specific item for a specific user:
    single_item_single_user_prediction = algo.predict(
        uid=target_user_key, iid=100010, verbose=True)

    # Estimate all items for a specific user:
    list_of_all_items = trainset_raw_iids
    target_predictions = []

    for item in list_of_all_items:
        single_prediction = algo.predict(uid=target_user_key, iid=item)
        target_predictions.append(
            (single_prediction.uid, single_prediction.iid, single_prediction.est))

    # Then sort the predictions for the user and retrieve the n highest ones:
    target_predictions.sort(key=lambda x: x[2], reverse=True)
    n = 20
    top_n = target_predictions[:n]
    top_n = [row[1] for row in top_n]

    print('end')
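
# A minimal sketch (assumption: this helper does not exist in the original
# repo): the ranking loop above, generalised so it can be reused for any
# user. Returns the n raw item ids with the highest estimated rating,
# together with the estimates.
def top_n_for_user(algo, trainset, uid, n=20):
    raw_iids = [trainset.to_raw_iid(inner_iid) for inner_iid in trainset.all_items()]
    predictions = [algo.predict(uid=uid, iid=iid) for iid in raw_iids]
    predictions.sort(key=lambda p: p.est, reverse=True)
    return [(p.iid, p.est) for p in predictions[:n]]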