def KNNPred(data):
    """Train two KNNWithMeans models (with and without custom sim_options)
    on the full trainset and predict ratings for the global test pairs.

    Cold-start items (those in ``cold_itm``) fall back to the user's average
    rating instead of a model estimate.

    Parameters:
        data: surprise Dataset from which the full trainset is built.

    Returns:
        [y_pred_w_m, y_pred_wo_m, knnModel, knnModel_1] — predictions with the
        configured similarity options, predictions with default options, and
        the two fitted models.
    """
    print("\nTraining KNN Means model..\n")
    global x_test, y_test, testlen, trainlen, y_train, model_params, X, Y, avg_rat, cold_itm
    options = model_params[0]
    knnModel = KNNWithMeans(sim_options=options)  # tuned similarity options
    knnModel_1 = KNNWithMeans()                   # library defaults, for comparison
    train = data.build_full_trainset()
    knnModel.fit(train)
    print("\nTraining done..\nPrediction started..")
    knnModel_1.fit(train)
    y_pred_w_m = [0] * testlen
    y_pred_wo_m = [0] * testlen
    # enumerate replaces the original hand-maintained `kk` counter
    for kk, rec in enumerate(x_test):
        uid, iid = rec[0], rec[1]
        if iid - 1 in cold_itm:
            # cold-start item: use the user's mean rating for both models
            y_pred_w_m[kk] = avg_rat[uid - 1]
            y_pred_wo_m[kk] = avg_rat[uid - 1]
        else:
            y_pred_w_m[kk] = knnModel.predict(uid, iid).est
            y_pred_wo_m[kk] = knnModel_1.predict(uid, iid).est
    print("\nPrediction done..\n")
    return [y_pred_w_m, y_pred_wo_m, knnModel, knnModel_1]
def train_surprise_model():
    """Fit an item-based KNNWithMeans model on the reduced reviews dataset,
    demonstrate swapping in a precomputed similarity matrix, then persist
    both the similarity matrix and the fitted model to disk."""
    # Load the reduced review export and keep only the rating triples.
    reviews = import_reduced_reviews(
        'C:/Users/lukas/OneDrive/Desktop/Reviews_Reduced.csv')
    reviews = reviews[['user_key', 'game_key', 'rating']]
    reviews = reviews.drop_duplicates(subset=['game_key', 'user_key'])

    # Wrap the dataframe for surprise and train on every rating available.
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(reviews[['user_key', 'game_key', 'rating']],
                                reader)
    trainsetfull = data.build_full_trainset()
    print('Number of users: ', trainsetfull.n_users, '\n')
    print('Number of items: ', trainsetfull.n_items, '\n')

    # Item-item cosine similarity with a small neighbourhood.
    k, min_k = 10, 5
    algo = KNNWithMeans(k=k, min_k=min_k,
                        sim_options={'name': 'cosine', 'user_based': False})

    # Time the fit.
    start_time = time.time()
    algo.fit(trainsetfull)
    print("--- %s seconds ---" % (time.time() - start_time))

    # Experiment: replace the fitted similarity matrix with a self-made one
    # and compare a single prediction before and after the swap.
    sim_matrix_imported = pd.read_csv(
        '../Data/Recommender/selfmade_item-item-similarity-matrix.csv',
        index_col=0)
    sim_matrix_imported.columns = sim_matrix_imported.columns.astype(int)
    sim_matrix_imported = sim_matrix_imported.to_numpy()
    a = algo.predict(93681, 100007)
    algo.sim = sim_matrix_imported
    b = algo.predict(93681, 100007)

    # Persist the (now imported) similarity matrix and the model itself.
    pd.DataFrame(algo.sim).to_csv(
        '../Data/Recommender/sim_matrix-myKNNWithMeans_item_based_model')
    dump.dump('../Data/Recommender/myKNNWithMeans_item_based_model', algo)
def CFM(self):
    """User-based collaborative filtering with KNNWithMeans.

    For every user in ``self.list``, predicts each of that user's rated items
    (except the last row — bound preserved from the original), collects the
    predictions into ``self.df_est`` and computes NDCG into ``self.CFWM_ndcg_``.
    """
    u_id = []
    I_id = []
    r_ui_ = np.array([])
    _est = np.array([])
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)
    algo.fit(self.trainset)
    for uid in self.list:
        # FIX: the original filtered the dataframe twice into two identical
        # frames (`lids` and `a`); one filter is enough.
        lids = self.data[self.data.uid == uid]
        # NOTE(review): range(1, len(lids)) visits rows 0..len-2, skipping the
        # user's last rating row — preserved as-is; confirm this is intended.
        for i in range(1, len(lids)):
            lid = lids[i - 1:i].lid.values[0]
            r_ui = lids[i - 1:i].rate.values[0]
            pred = algo.predict(uid, lid, r_ui, verbose=True)
            u_id.append(int(pred.uid))
            I_id.append(int(pred.iid))
            r_ui_ = np.append(r_ui_, pred.r_ui)
            _est = np.append(_est, pred.est)
    self.df_est = pd.DataFrame({
        'uid': u_id,
        'Iid': I_id,
        'r_ui': r_ui_,
        'est': _est
    })
    self.arr = self.df_est['uid'].unique()
    self.CFWM_ndcg_ = self.Calculate_NDCG()
class Rater:
    """Item-based (cosine) KNN-with-means rating predictor over a ratings list."""

    def __init__(self, ratings):
        self.classifier = KNNWithMeans(sim_options={"name": "cosine",
                                                    "user_based": False})
        self.training_set = None
        self.ratings_dict = None
        self._prepare_data_(ratings)
        self._train_()

    def _prepare_data_(self, ratings):
        # Unzip the rating records into parallel columns.
        user_ids, movie_ids, marks = [], [], []
        for entry in ratings:
            user_ids.append(entry.user_id)
            movie_ids.append(entry.movie_id)
            marks.append(entry.mark)
        self.ratings_dict = {
            "user_id": user_ids,
            "movie_id": movie_ids,
            "mark": marks
        }
        # Build the full surprise trainset from the dataframe.
        frame = pd.DataFrame(self.ratings_dict)
        dataset = Dataset.load_from_df(
            frame[["user_id", "movie_id", "mark"]],
            Reader(rating_scale=Constants.RATING_SCALE))
        self.training_set = dataset.build_full_trainset()

    def _train_(self):
        self.classifier.fit(self.training_set)

    def get_ratings(self, user_id):
        """Return {movie_id: estimated rating} for every known movie id."""
        return {
            movie_id: self.classifier.predict(user_id, movie_id).est
            for movie_id in self.ratings_dict["movie_id"]
        }
def run(self):
    """Recommend the 5 best places for ``self.user``.

    Fits a user-based KNNWithMeans model under 5-fold CV, averages each
    place's estimated rating across the folds, and returns the names of the
    top five places (best first). Also prints the mean CV RMSE.
    """
    ratings = pd.read_csv('rating_final.csv')
    ratings_dict = {"userID": list(ratings.userID),
                    "placeID": list(ratings.placeID),
                    "rating": list(ratings.rating)}
    df = pd.DataFrame(ratings_dict)
    reader = Reader(rating_scale=(0, 2))
    data = Dataset.load_from_df(df[["userID", "placeID", "rating"]], reader)

    # User-based cosine similarity. (The original comment claimed item-based,
    # but user_based=True computes similarities between users.)
    sim_options = {
        "name": "cosine",
        "user_based": True,
        "min_support": 9
    }

    # define a cross-validation iterator
    kf = KFold(n_splits=5)
    algo = KNNWithMeans(sim_options=sim_options)
    places = list(df['placeID'].unique())

    # PERF FIX: the original re-fit the model 5 times for *every* place
    # (5 × len(places) fits). Fit once per fold and score all places per fold
    # instead — same averaged estimates, 5 fits total.
    totals = {place_id: 0 for place_id in places}
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        for place_id in places:
            prediction = algo.predict(self.user, place_id, verbose=False)
            totals[place_id] += prediction.est

    ordered = ArrayList()
    for place_id in places:
        ordered.append(place_id, totals[place_id] / 5)  # average over the 5 folds
    ordered.sort()
    highest = ordered.inArray[ordered.count - 5:ordered.count]

    place = pd.read_csv('geoplaces2.csv')
    count = 0
    finalRec = ArrayList()
    for i in range(len(highest) - 1, -1, -1):
        count += 1
        # NOTE(review): comparing place["placeID"].unique() (an array of
        # uniques) against a scalar only aligns with the frame's rows if
        # placeID has no duplicates in geoplaces2.csv — verify.
        name = list(place[place["placeID"].unique() == highest[i].id]['name'])
        finalRec.append(count, name[0])

    # printing accuracy score
    out = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    print(mean_rmse)
    return finalRec.inArray
def CFM(self):
    """Fit a user-based KNNWithMeans model on ``self.trainset`` and return the
    prediction for the *last* (user, item, rating) row visited.

    Intermediate predictions are printed (verbose=True) and then discarded —
    only the final ``pred`` escapes the loops, as in the original.
    """
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)
    algo.fit(self.trainset)
    for uid in self.list:
        # FIX: the original filtered the dataframe twice into two identical
        # frames (`lids` and `a`); one filter is enough.
        lids = self.data[self.data.uid == uid]
        # range(1, len(lids)) visits rows 0..len-2, skipping the last row —
        # bound preserved from the original.
        for i in range(1, len(lids)):
            lid = lids[i - 1:i].lid.values[0]
            r_ui = lids[i - 1:i].rate.values[0]
            pred = algo.predict(uid, lid, r_ui, verbose=True)
    return pred
def computeKNNMeansMovie(data, test_np):
    """Compute the k-NN with mean item based method and return the predictions on the test
    The method is on all the data and got the following settings:
    - Similarity function : Pearson baseline, item based
    - Number of closest neighbors : 108

    data : data frame which represent the train set
    test_np : data frame on which the prediction will be returned

    return : test_np with a column of prediction named 'knnmeans_item_rating'"""
    trainset, test = dataTrainSurprise(data, test_np)

    # Item-based Pearson-baseline similarity, 108 neighbours.
    options = {'name': 'pearson_baseline', 'user_based': False}
    model = KNNWithMeans(k=108, sim_options=options)
    model.fit(trainset)

    def _estimate(row):
        # Prediction.est — the estimated rating for this (user, movie) pair.
        return model.predict(row['user_id'], row['movie_id']).est

    test['knnmeans_item_rating'] = test[['user_id', 'movie_id']].apply(_estimate,
                                                                       axis=1)
    return test
def knn_centered_user(train, test, ids, Xtest, Xids):
    """
    kNN approach taking into account the mean ratings of each user
    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('Centered kNN User')

    # BUG FIX: the similarity parameters (name, min_support, user_based,
    # shrinkage) were passed as bare keyword arguments, which surprise's
    # AlgoBase silently swallows — so the model actually ran with default MSD
    # similarity. They must be supplied through sim_options to take effect.
    algo = KNNWithMeans(k=200,
                        sim_options={'name': 'pearson_baseline',
                                     'min_support': 5,
                                     'user_based': True,
                                     'shrinkage': 120})

    # Train algorithm on training set
    algo.fit(train)

    # Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print(' Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    # Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print(' Test RMSE: ', rmse)

    # Collect the test-set estimates for blending.
    preds_test = np.array([pred.est for pred in predictions])

    # Predict unknown ratings
    preds_ids = [algo.predict(str(ids[0][i]), str(ids[1][i])).est
                 for i in range(len(ids[0]))]

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
def user_collaborative_filtering(trainset, testset):
    """User-based KNN-with-means CF: print one sample prediction and the
    test-set RMSE. Flip user_based to switch to item-based filtering."""
    options = {'name': 'pearson_baseline', 'user_based': True}
    model = KNNWithMeans(k=50, sim_options=options)
    model.fit(trainset)

    # Query one specific (user, item) pair by raw id, with the known true
    # rating supplied so the printout shows the error as well.
    sample_user = str(196)  # raw user id
    sample_item = str(302)  # raw item id
    model.predict(sample_user, sample_item, r_ui=4, verbose=True)

    # Evaluate the trained model on the held-out testset.
    test_predictions = model.test(testset)
    print("User-based Model : Test Set")
    accuracy.rmse(test_predictions, verbose=True)
startTime = time.time()

# Load ratings: user item rating timestamp, comma separated, header skipped.
reader = Reader(line_format='user item rating timestamp', sep=',',
                skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)

# Item-based CF scoring with the k=50 most similar items.
# BUG FIX: 'verbose' is not a sim_options key (it was silently ignored there);
# it is a constructor argument of the algorithm itself. The dead
# build_full_trainset() call was removed — `trainset` is rebound by the
# KFold loop below before it is ever used.
algo = KNNWithMeans(k=50, sim_options={'user_based': False}, verbose=True)

# 3-fold cross-validation: fit, predict, and report RMSE/MAE per fold.
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)

# Single prediction with the model from the last fold.
uid = str(196)
iid = str(302)
pred = algo.predict(uid, iid)
print(pred)

endTime = time.time()
print("程序运行的时间:{}".format(endTime - startTime))
# In[ ]: trainset, testset = train_test_split(data, test_size=.15) # In[ ]: algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True}) algo.fit(trainset) # In[ ]: test_pred = algo.test(testset) # In[ ]: accuracy.rmse(test_pred, verbose=True) # In[ ]: algo.predict(uid=2, iid='Fight Club (1999)').est
# Known true ratings for the four probe (user, item) pairs defined earlier;
# passing r_ui lets surprise print the prediction error next to the estimate.
r_ui1 = 4
r_ui2 = 4
r_ui3 = 1
r_ui4 = 3
verboseFlag = True

# get a prediction for specific users and items with each of the four
# pre-fitted KNN variants, to compare their estimates side by side.
print("KNNBaseLine:")
predBaseLine1 = algoBaseLine.predict(uid1, iid1, r_ui = r_ui1, verbose = verboseFlag)
predBaseLine2 = algoBaseLine.predict(uid2, iid2, r_ui = r_ui2, verbose = verboseFlag)
predBaseLine3 = algoBaseLine.predict(uid3, iid3, r_ui = r_ui3, verbose = verboseFlag)
predBaseLine4 = algoBaseLine.predict(uid4, iid4, r_ui = r_ui4, verbose = verboseFlag)

print("\nKNNBasic:")
predBasic1 = algoBasic.predict(uid1, iid1, r_ui = r_ui1, verbose = verboseFlag)
predBasic2 = algoBasic.predict(uid2, iid2, r_ui = r_ui2, verbose = verboseFlag)
predBasic3 = algoBasic.predict(uid3, iid3, r_ui = r_ui3, verbose = verboseFlag)
predBasic4 = algoBasic.predict(uid4, iid4, r_ui = r_ui4, verbose = verboseFlag)

print("\nKNNWithMeans:")
predWithMeans1 = algoWithMeans.predict(uid1, iid1, r_ui = r_ui1, verbose = verboseFlag)
predWithMeans2 = algoWithMeans.predict(uid2, iid2, r_ui = r_ui2, verbose = verboseFlag)
predWithMeans3 = algoWithMeans.predict(uid3, iid3, r_ui = r_ui3, verbose = verboseFlag)
predWithMeans4 = algoWithMeans.predict(uid4, iid4, r_ui = r_ui4, verbose = verboseFlag)

print("\nKNNWithZScore:")
predWithZScore1 = algoWithZScore.predict(uid1, iid1, r_ui = r_ui1, verbose = verboseFlag)
predWithZScore2 = algoWithZScore.predict(uid2, iid2, r_ui = r_ui2, verbose = verboseFlag)
predWithZScore3 = algoWithZScore.predict(uid3, iid3, r_ui = r_ui3, verbose = verboseFlag)
predWithZScore4 = algoWithZScore.predict(uid4, iid4, r_ui = r_ui4, verbose = verboseFlag)
r_ui=r_ui3, verbose=verboseFlag) predBaseLine4 = algoBaseLine.predict(uid4, iid4, r_ui=r_ui4, verbose=verboseFlag) print("\nKNNBasic:") predBasic1 = algoBasic.predict(uid1, iid1, r_ui=r_ui1, verbose=verboseFlag) predBasic2 = algoBasic.predict(uid2, iid2, r_ui=r_ui2, verbose=verboseFlag) predBasic3 = algoBasic.predict(uid3, iid3, r_ui=r_ui3, verbose=verboseFlag) predBasic4 = algoBasic.predict(uid4, iid4, r_ui=r_ui4, verbose=verboseFlag) print("\nKNNWithMeans:") predWithMeans1 = algoWithMeans.predict(uid1, iid1, r_ui=r_ui1, verbose=verboseFlag) predWithMeans2 = algoWithMeans.predict(uid2, iid2, r_ui=r_ui2, verbose=verboseFlag) predWithMeans3 = algoWithMeans.predict(uid3, iid3, r_ui=r_ui3, verbose=verboseFlag) predWithMeans4 = algoWithMeans.predict(uid4, iid4, r_ui=r_ui4, verbose=verboseFlag) print("\nKNNWithZScore:")
# Per-fold evaluation: RMSE/MAE plus top-n and precision/recall@5.
# Assumes `kf`, `algo`, `data`, `get_top_n`, `precision_recall_at_k`, and
# `restaurants_and_food` are defined earlier in the file.
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    top_n = get_top_n(predictions, n=5)
    # threshold=4: only ratings >= 4 count as relevant items.
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)
    # Precision and recall can then be averaged over all users
    print('precision', sum(prec for prec in precisions.values()) / len(precisions))
    print('recall', sum(rec for rec in recalls.values()) / len(recalls))

# Retrieve inner id of the movie Toy Story
# trainset = data.build_full_trainset()
# toy_story_raw_id = 'eSQ3z93DlzkpXK_H6MFEMw'
# toy_story_inner_id = algo.trainset.to_inner_uid(toy_story_raw_id)
# Retrieve inner ids of the nearest neighbors of Toy Story.

# Score every business for one fixed user (raw id below) with the model from
# the last CV fold, and show the 10 highest estimates.
r = restaurants_and_food.copy()
r['Estimate_Score'] = r['business_id'].apply(
    lambda x: algo.predict('gRdBkmXdRqUzDMkcMtt7rQ', x).est)
r = r.sort_values(by=['Estimate_Score'], ascending=False)
print(r[['business_id', 'name', 'categories', 'stars', 'Estimate_Score']].head(10))
# Compare SVD and item-based KNNWithMeans on the MovieLens 100k dataset.
data = Dataset.load_builtin('ml-100k')
train, test = train_test_split(data, test_size=0.25, random_state=10)

# FIX: pass hyper-parameters through the constructor instead of mutating
# attributes after construction — same effect (fit() reads the attributes),
# but the supported, self-documenting API.
algo = SVD(n_epochs=20, random_state=15)
algo.fit(train)
predictions = algo.test(test)
accuracy.rmse(predictions)

uid = str(196)  # raw user id
iid = str(302)  # raw item id
r_ui = 4  # already know the true rating is 4, so we can make a comparison
pred = algo.predict(uid, iid, r_ui=r_ui, verbose=True)
print(pred.est)

# Item-based KNN with mean-centering for comparison.
knn = KNNWithMeans(
    sim_options={
        "name": "msd",  # cosine / msd / pearson / pearson_baseline
        "min_support": 2,
        "user_based": False
    })
knn.fit(train)
predictions = knn.test(test)
accuracy.rmse(predictions)
pred = knn.predict(uid, iid, r_ui=r_ui, verbose=True)
print(pred.est)
def collaborative_filtering_using_surprise():
    """
    https://towardsdatascience.com/how-to-build-a-memory-based-recommendation-system-using-python-surprise-55f3257b2cf4
    Predict games for user with user_key = 93681
    """
    target_user_key = 93681

    # import reduced dataset:
    df = import_reduced_reviews()

    # check for duplicates:
    duplicates = len(df) - len(
        df.drop_duplicates(subset=['game_key', 'user_key']))
    # drop duplicates:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])
    print('duplicates removed: ' + str(duplicates))

    # check out our user:
    df_target_user = df[df['user_key'] == target_user_key]

    # build utility matrix:
    # data_pivot = df.pivot(index='user_key', columns='game_key', values='rating')
    # calculate sparsity
    # sparsity = data_pivot.isnull().sum().sum() / data_pivot.size
    # print('Sparcity of utility matrix: ' + str(sparsity))

    ### Modelling part with Surprise:
    # get data in a format surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Split in trainset and testset
    trainset, testset = train_test_split(data, test_size=0.2)
    print('Number of users: ', trainset.n_users, '\n')
    print('Number of items: ', trainset.n_items, '\n')

    # When surprise creates a Trainset or Testset object, it takes the raw_id's
    # (the ones that you used in the file you imported), and converts them to
    # so-called inner_id's (basically a series of integers, starting from 0).
    # You might need to trace back to the original names. Using the items as an
    # example (you can do the same approach with users, just swap iid's with
    # uid's in the code), to get the list of inner_iids, you can use the
    # all_items method. To convert from raw to inner id you can use the
    # to_inner_iid method, and the to_raw_iid to convert back.
    # An example on how to save a list of inner and raw item id's:
    trainset_iids = list(trainset.all_items())
    iid_converter = lambda x: trainset.to_raw_iid(x)
    trainset_raw_iids = list(map(iid_converter, trainset_iids))

    ## Model parameters: of kNN:
    # Two hyperparameters we can tune:
    # 1. k parameter
    # 2. similarity option
    #    a) user-user vs item-item
    #    b) similarity function (cosine, pearson, msd)
    sim_option = {'name': 'pearson', 'user_based': False}

    # 3 different KNN Models: KNNBasic, KNNWithMeans, KNNWithZScore
    k = 40
    min_k = 5
    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)
    algo.fit(trainset)

    ## Testing:
    predictions = algo.test(testset)
    accuracy.rmse(predictions)

    # Own similarity matrix:
    sim_matrix_imported = pd.read_csv(
        '../Data/Recommender/selfmade_item-item-similarity-matrix.csv',
        index_col=0)
    sim_matrix_imported.columns = sim_matrix_imported.columns.astype(int)
    sim_matrix_imported = sim_matrix_imported.to_numpy()

    # NOTE(review): swapping algo.sim after fit assumes the imported matrix is
    # ordered by surprise's *inner* item ids — verify before trusting the RMSE.
    algo.sim = sim_matrix_imported
    predictions = algo.test(testset)
    accuracy.rmse(predictions)

    # Cross validation (disabled by default — flip `skip` to run):
    skip = True
    if not skip:
        results = cross_validate(algo=algo,
                                 data=data,
                                 measures=['RMSE'],
                                 cv=5,
                                 return_train_measures=True)
        results_mean = results['test_rmse'].mean()

    ## Predictions
    # Lets assume we are happy with the method and now want to apply it to the
    # entire data set.
    # Estimate for a specific user a specific item:
    single_item_single_user_prediction = algo.predict(uid=target_user_key,
                                                      iid=100010,
                                                      verbose=True)

    # Estimate all items for a specific user:
    list_of_all_items = trainset_raw_iids
    target_predictions = []
    for item in list_of_all_items:
        single_prediction = algo.predict(uid=target_user_key, iid=item)
        target_predictions.append(
            (single_prediction.uid, single_prediction.iid,
             single_prediction.est))

    # Then sort the predictions for each user and retrieve the k highest ones:
    target_predictions.sort(key=lambda x: x[2], reverse=True)
    n = 20
    top_n = target_predictions[:n]
    top_n = [row[1] for row in top_n]

    print('end')
# df = pd.DataFrame(ratings_dict) # df.to_csv('csv_example') df = pd.read_csv('csv_example') print(df) reader = Reader(rating_scale=(0, 5)) data = Dataset.load_from_df(df[["user", "item", "rating"]], reader) # movielens = Dataset.load_builtin('ml-100k') from surprise import KNNWithMeans # To use item-based cosine similarity sim_options = { "name": "cosine", "user_based": False, # Compute similarities between items } algo = KNNWithMeans(sim_options=sim_options) trainingSet = data.build_full_trainset() algo.fit(trainingSet) prediction = algo.predict('E', fifa) print (prediction.est)
from collections import defaultdict
import pprint

# Load the sample ratings (first three columns: user, item, rating).
path = './movielens_sample.txt'
df = pd.read_csv(path, usecols=[0, 1, 2], skiprows=1)
df.columns = ['user', 'item', 'rating']
reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_df(df, reader=reader)

# Item-based CF scoring with the k=50 most similar items.
# BUG FIX: 'verbose' is not a sim_options key (it was silently ignored there);
# it is a constructor argument of the algorithm itself. The dead
# build_full_trainset() call was removed — `trainset` is rebound by the
# KFold loop below before it is ever used.
kf = KFold(n_splits=5)
algo = KNNWithMeans(k=50, sim_options={'user_based': False}, verbose=True)
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=True)
    print(rmse, rmse * rmse)

# Predict every (user, item) pair in the dataframe with the model from the
# last fold (attribute access replaces the redundant getattr calls).
predictions = [[row.user, row.item, algo.predict(row.user, row.item).est]
               for row in df.itertuples()]
print("*" * 100)
print("user\titem\tpredict\n")
pprint.pprint(predictions)
def selfmade_approach():
    """Rebuild surprise's KNNWithMeans prediction by hand (MyKnnWithMeans),
    compare it against the library model, then export the raw-id similarity
    matrix and the per-item means for later standalone use."""
    # import reduced dataset:
    df = import_reduced_reviews(
        'C:/Users/lukas/OneDrive/Desktop/Reviews_Reduced.csv')
    df = df[['user_key', 'game_key', 'rating']]
    # drop duplicates:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])

    ### Modelling part with Surprise:
    # get data in a format surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Build trainset from the whole dataset:
    trainsetfull = data.build_full_trainset()
    print('Number of users: ', trainsetfull.n_users, '\n')
    print('Number of items: ', trainsetfull.n_items, '\n')

    # Parameters:
    sim_option = {'name': 'cosine', 'user_based': False}
    k = 10
    min_k = 5
    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)

    # Run fit:
    start_time = time.time()
    algo.fit(trainsetfull)
    print("--- %s seconds ---" % (time.time() - start_time))

    # 1st approach: Calculate for a single user contained in dataset:
    target_user_key = 286189
    target_user_info = df[df['user_key'] == target_user_key]

    # Estimate single game:
    target_game_key = 100098

    # Internal surprise data structures (sizes observed for this dataset):
    # sim_matrix = ndarray(312,312)
    # xr = defaultdict: 312
    # yr = defaultdict 8787
    # later on replace these by self-written structures
    xr = algo.xr
    yr = algo.yr
    sim_matrix = algo.sim
    item_means = algo.means

    inner_target_uid = algo.trainset.to_inner_uid(target_user_key)
    inner_target_iid = algo.trainset.to_inner_iid(target_game_key)

    # switch: uid and idd:
    x = inner_target_uid
    y = inner_target_iid

    # pred2:
    inner_2_raw_item_ids = algo.trainset._raw2inner_id_items
    # swap keys and values:
    inner_2_raw_item_ids = dict(
        (v, k) for k, v in inner_2_raw_item_ids.items())

    # similarity matrix with raw ids instead of inner surprise ids:
    sim_matrix_df = pd.DataFrame(sim_matrix)
    sim_matrix_df = sim_matrix_df.rename(
        columns=lambda x: inner_2_raw_item_ids[x])
    sim_matrix_df = sim_matrix_df.rename(
        index=lambda x: inner_2_raw_item_ids[x])

    target_user_ratings = yr[x]

    # convert from inner to raw:
    target_user_ratings2 = []
    for (inner_iid, rating) in target_user_ratings:
        target_user_ratings2.append((inner_2_raw_item_ids[inner_iid], rating))

    # convert item means from inner to raw:
    item_means2 = {}
    for i, mean in enumerate(item_means):
        item_means2[inner_2_raw_item_ids[i]] = mean

    myKNN = MyKnnWithMeans(sim_matrix=sim_matrix_df,
                           target_user_ratings=target_user_ratings2,
                           item_means=item_means2,
                           k=k,
                           min_k=min_k)
    pred = myKNN.predict_single_game(user_key=target_user_key,
                                     game_key=target_game_key)
    # NOTE(review): AlgoBase.predict expects *raw* ids and converts to inner
    # ids internally — passing inner ids here likely produces a wrong or
    # "impossible" prediction; verify (the raw keys would be
    # target_user_key / target_game_key).
    pred_surprise = algo.predict(uid=inner_target_uid, iid=inner_target_iid)

    estimate = pred
    print("Estimate for user %s for game %s is %s" %
          (target_user_key, target_game_key, estimate))

    # Estimate for user not contained in dataset:
    target_user_key = 123456789
    target_game_key = 100098
    user_ratings = [
        (100284, 7),
        (100311, 8),
        (105154, 2),
        (100020, 4),
        (100001, 9),
        (100277, 7),
    ]
    myKNN2 = MyKnnWithMeans(sim_matrix_df, user_ratings, item_means2, k, min_k)
    prediction = myKNN2.predict_single_game(target_user_key, target_game_key)

    # export similarity matrix:
    sim_matrix_df.to_csv(
        '../Data/Recommender/item-item-sim-matrix-surprise.csv')

    # export item means:
    export_path = '../Data/Recommender/item-means.json'
    with open(export_path, 'w') as fp:
        json.dump(item_means2, fp, sort_keys=False, indent=4)

    # Spot-check one similarity value by raw ids.
    test = sim_matrix_df.loc[100516, 100284]
    pass
# We'll use the famous SVD algorithm.
# Sweep the SGD regularisation term for the baseline estimates (reg = 0..9),
# writing a submission file and evaluating each configuration.
for i in range(10):
    bsl_options = {'method': 'sgd',
                   'reg': i,
                   'learning_rate': 0.005,
                   }
    # BUG FIX: user_based was the *string* 'True', which is always truthy and
    # would also mask an attempt to set it to 'False' — use the boolean.
    sim_options = {'name': 'msd',
                   'shrinkage': 0,
                   'user_based': True,
                   'min_support': 10,
                   }
    algo = KNNWithMeans(bsl_options=bsl_options, sim_options=sim_options)
    # algo = SVD(n_factors=20, n_epochs=30, lr_all=0.005, reg=0.04)

    # BUG FIX: AlgoBase.train() was removed in surprise >= 1.1 — use fit(),
    # consistent with the rest of this file.
    algo.fit(trainset)

    # Parse the (row, column) ids out of the sample submission file.
    indices = []
    with open("data/sampleSubmission.csv", 'r') as sample:
        samples = sample.read().splitlines()[1:]
        indices = [
            re.match(r'r(\d+?)_c(\d+?),.*?', line, re.DOTALL).groups()
            for line in samples
        ]

    # Write one prediction per (item, user) pair.
    with open("data/pred_svd.csv", 'w') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for item, user in indices:
            pred = algo.predict(user, item)
            writer.writerow({'Id': "r" + item + "_c" + user,
                             'Prediction': pred.est})

    # NOTE(review): evaluate() is deprecated/removed in newer surprise
    # releases — consider model_selection.cross_validate (output format differs).
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print(perf)
class recommender:
    """Unified wrapper around several recommendation backends.

    Surprise-based algorithms ('svd', 'nmf', 'knnbasic', 'knnmeans') are
    fitted from a ratings dataframe; sequence-based algorithms from
    Devooght & Bersini's framework ('fism') run through its preprocessing
    pipeline. Predictions for known and unknown (user, item) pairs are
    cached in dataframes, with top-n sequences derived on demand.
    """

    def __init__(self, algorithm):
        # Always call base method before doing anything.
        self.name = algorithm.lower()  # SVD, NMF, SAE, LSTM
        # Algorithm families: handled via the surprise library vs. Devooght's code.
        self.surprise_algorithms = ['svd', 'nmf', 'knnbasic', 'knnmeans']
        self.devooght_algorithms = ['fism']
        '''
        To implement with surprise:
        - Matrix-Factorization Based:
            SVDpp: The SVD++ algorithm, an extension of SVD taking into account implicit ratings.
        - Neighbourhood-based:
            Coclustering
            KNNWithZScore: A basic collaborative filtering algorithm, taking into account the z-score normalization of each user.
            KNNBaseline: A basic collaborative filtering algorithm taking into account a baseline rating.
        - Random Predictor
            NormalPredictor: Algorithm predicting a random rating based on the distribution of the training set, which is assumed to be normal.
        - Baseline
            BaselineOnly: Algorithm predicting the baseline estimate for given user and item.
        - Slope One
            SlopeOne: A simple yet accurate collaborative filtering algorithm.
        To implement using RNN:
        - LSTM
        - GRU (Devooght, Bersini)
        - GRU with clustering (Devooght, Bersini)
        To extract latent factors:
        - Stacked Autoencoders
        - CNN
        - CNN with Stacked Autoencoders
        '''
        # Lazily-filled caches for predictions and top-n sequences.
        self.df_known_predictions = None
        self.df_unknown_predictions = None
        self.known_sequence_dict = None
        self.unknown_sequence_dict = None
        self.k = None
        self.k_min = None
        self.metrics = None

    def get_name(self, verbose=False):
        """Return the (lower-cased) algorithm name."""
        return self.name

    def fit(self,
            df_ratings=None,
            columns=['userId', 'itemId', 'rating'],
            verbose=False,
            **kwargs):
        # NOTE(review): `columns` is a mutable default argument — safe only
        # because it is never mutated here; consider a tuple or None-sentinel.
        self.columns = np.array(columns)
        # If Surprise lib is the base package to fit, then df_ratings must be used.
        # Algorithms that use Surprise Lib: NMF, SVD, KNN, SVDpp
        if (df_ratings is not None):
            self.df_ratings = df_ratings.copy()
            ###########################################
            # Convert Utility Matrix to df_ratings if utility matrix is passed #
            #  #
            ###########################################
            if self.name in self.surprise_algorithms:
                # Surprise-based recommenders
                from surprise import Dataset
                from surprise import Reader

                # A reader is still needed but only the rating_scale param is required.
                # The Reader class is used to parse a file containing ratings.
                reader = Reader(rating_scale=(0.5, 5.0))

                # Separating timestamp column
                if ('timestamp' in columns):
                    self.df_timestamp = self.df_ratings['timestamp'].copy()
                    self.df_ratings.drop(labels='timestamp',
                                         inplace=True,
                                         axis=1)

                # The columns must correspond to user id, item id and ratings (in that order).
                data = Dataset.load_from_df(
                    self.df_ratings[self.columns[np.where(
                        self.columns != 'timestamp')]], reader)

                # Creating trainset variable to be used in prediction functions of Surprise
                self.trainset = data.build_full_trainset()

                # Creating Model
                if self.name == 'svd':
                    from surprise import SVD

                    # Setting Number of Factors in Matrix Factorization
                    if ('n_factors' in kwargs):
                        self.n_factors = kwargs['n_factors']
                    else:
                        self.n_factors = 100
                        if (verbose):
                            print("Using default number of factors: {}".format(
                                self.n_factors))

                    # Setting number of epochs in stocastic gradient descent
                    if ('n_epochs' in kwargs):
                        self.n_epochs = kwargs['n_epochs']
                    else:
                        self.n_epochs = 20
                        if (verbose):
                            print("Using default number of epochs: {}".format(
                                self.n_epochs))

                    self.model = SVD(n_factors=self.n_factors,
                                     n_epochs=self.n_epochs,
                                     verbose=verbose)

                elif self.name == 'nmf':
                    from surprise import NMF

                    # Setting Number of Factors in Matrix Factorization
                    if ('n_factors' in kwargs):
                        self.n_factors = kwargs['n_factors']
                    else:
                        self.n_factors = 15
                        if (verbose):
                            print("Using default number of factors: {}".format(
                                self.n_factors))

                    # Setting number of epochs in stocastic gradient descent
                    if ('n_epochs' in kwargs):
                        self.n_epochs = kwargs['n_epochs']
                    else:
                        self.n_epochs = 50
                        if (verbose):
                            print("Using default number of epochs: {}".format(
                                self.n_epochs))

                    self.model = NMF(n_factors=self.n_factors,
                                     n_epochs=self.n_epochs,
                                     verbose=verbose)

                elif self.name == 'knnbasic':
                    from surprise import KNNBasic

                    # Setting number of neighbours
                    if ('k' in kwargs):
                        self.k = kwargs['k']
                    else:
                        self.k = 40
                        if (verbose):
                            print("Using default k: {}".format(self.k))

                    # Setting minimum number of neighbours
                    if ('k_min' in kwargs):
                        self.k_min = kwargs['k_min']
                    else:
                        self.k_min = 1
                        if (verbose):
                            print("Using default k_min: {}".format(1))

                    self.model = KNNBasic(k=self.k,
                                          min_k=self.k_min,
                                          verbose=verbose)

                elif self.name == 'kmeans':
                    # NOTE(review): this branch tests 'kmeans' but
                    # surprise_algorithms lists 'knnmeans' — as written, a
                    # recommender('knnmeans') enters the surprise path yet
                    # falls through to the "not configured" else below, and
                    # 'kmeans' never enters the surprise path at all. Verify
                    # which spelling is intended.
                    from surprise import KNNWithMeans

                    # Setting number of neighbours
                    if ('k' in kwargs):
                        self.k = kwargs['k']
                    else:
                        self.k = 40
                        if (verbose):
                            print("Using default k: {}".format(40))

                    # Setting minimum number of neighbours
                    if ('k_min' in kwargs):
                        self.k_min = kwargs['k_min']
                    else:
                        self.k_min = 1
                        if (verbose):
                            print("Using default k_min: {}".format(1))

                    self.model = KNNWithMeans(k=self.k,
                                              min_k=self.k_min,
                                              verbose=verbose)

                else:
                    if (verbose):
                        print("Algorithm not configured: {}".format(self.name))
                    return -1

                # Train the algorithm on the trainset, and predict ratings for the testset
                # NOTE(review): AlgoBase.train() was removed in surprise >= 1.1;
                # newer versions require fit().
                self.model.train(self.trainset)
                return 0

            elif (self.name in self.devooght_algorithms):
                # Arguments
                directory_path = os.path.join(
                    '.', 'Sequence_based_recommendation_files', self.name)

                preprocess.create_dirs(dirname=directory_path, verbose=verbose)
                data = preprocess.remove_rare_elements(data=df_ratings,
                                                       min_user_activity=1,
                                                       min_item_popularity=1,
                                                       verbose=verbose)
                data = preprocess.save_index_mapping(data=data,
                                                     dirname=directory_path,
                                                     separator=',')
                train_set, val_set, test_set = preprocess.split_data(
                    data=data,
                    nb_val_users=0.1,  # val_size
                    nb_test_users=0.1,  # test_size
                    dirname=directory_path,
                    verbose=verbose)
                preprocess.make_sequence_format(train_set=train_set,
                                                val_set=val_set,
                                                test_set=test_set,
                                                dirname=directory_path,
                                                verbose=verbose)
                preprocess.save_data_stats(data=data,
                                           train_set=train_set,
                                           val_set=val_set,
                                           test_set=test_set,
                                           dirname=directory_path,
                                           verbose=verbose)

                # Training Algorithm
                parser = parse.command_parser(parse.predictor_command_parser,
                                              train.training_command_parser,
                                              parse.early_stopping_command_parser)

                if self.name == 'fism':
                    args = parser.parse_args([
                        '--dir',
                        os.path.join(directory_path, 'models'),
                        '-d',
                        directory_path,  #directory_path + '/',
                        '-b',
                        '20',  # Batch size: the number of training examples present in a single blatch
                        '--max_iter',
                        '50',  # Maximum number of iterations: the number of batches needed to complete one epoch
                        '--progress',
                        '10',  # when progress information should be printed during training
                        '-m',
                        self.name.upper(),  # Method
                        #'-i', '-1', # Number of batches - only on test parser
                        '--loss',
                        'RMSE',
                        '--save',
                        'Best'
                    ])

                    self.model = parse.get_predictor(args)
                    dataset = handler.DataHandler(
                        dirname=args.dataset,
                        extended_training_set=args.extended_set,
                        shuffle_training=args.tshuffle)
                    self.model.prepare_model(dataset)
                    self.metrics = self.model.train(
                        dataset,
                        save_dir=args.dir,
                        time_based_progress=args.time_based_progress,
                        progress=float(args.progress),
                        autosave=args.save,
                        max_progress_interval=args.mpi,
                        max_iter=args.max_iter,
                        min_iterations=args.min_iter,
                        max_time=args.max_time,
                        early_stopping=parse.get_early_stopper(args),
                        load_last_model=args.load_last_model,
                        validation_metrics=args.metrics.split(','))
                else:
                    if (verbose):
                        print("Algorithm not configured: {}".format(self.name))
                    return -1
                return 0

        else:  # if self.name not in self.surprise_algorithms
            if (verbose):
                print("Invalid algorithm: {}".format(self.name))

    def get_model(self):
        """Return the underlying fitted model object."""
        return self.model

    def get_metrics(self):
        """Return training metrics (Devooght path only; None otherwise)."""
        return self.metrics

    def calculate_known_predictions(self):
        # Calculating all predictions for known items
        if self.name in self.surprise_algorithms:
            # Calculating predictions dataframe as userId, itemId, rating, prediction
            # predictions return raw uid and iid
            known_predictions = self.model.test(self.trainset.build_testset(
            ))  # Brings all predictions of existing ratings

            for prediction in known_predictions:
                arr = np.array([
                    int(prediction.uid),
                    int(prediction.iid), prediction.r_ui, prediction.est
                ])
                # First iteration seeds the array; later rows are appended.
                if prediction == known_predictions[0]:
                    predictions = np.array([arr])
                else:
                    predictions = np.append(predictions, [arr], axis=0)

            self.df_known_predictions = pd.DataFrame({
                'userId': predictions[:, 0],
                'itemId': predictions[:, 1],
                'rating': predictions[:, 2],
                'prediction': predictions[:, 3]
            })

            if ('timestamp' in self.columns):
                # NOTE(review): bare `df_ratings` is undefined in this scope
                # (only self.df_ratings exists) — this branch would raise
                # NameError; verify intended variable.
                self.df_known_predictions = self.df_known_predictions.set_index(
                    keys=['userId', 'itemId']).join(
                        df_ratings.drop('rating', axis=1).set_index(
                            keys=['userId', 'itemId'])).reset_index()

            self.df_known_predictions['userId'] = self.df_known_predictions[
                'userId'].astype(int)
            self.df_known_predictions['itemId'] = self.df_known_predictions[
                'itemId'].astype(int)

    def get_known_predictions(self, calculate_predictions=False):
        """Return (computing lazily) the known-ratings prediction dataframe."""
        if self.df_known_predictions is None or calculate_predictions == True:
            self.calculate_known_predictions()
        return self.df_known_predictions

    def calculate_unknown_predictions(self):
        # Calculating all predictions for known items
        # predictions return raw uid and iid
        if self.name in self.surprise_algorithms:
            unknown_predictions = self.model.test(
                self.trainset.build_anti_testset(
                ))  # => Brings all predictions of non-existing ratings

            for prediction in unknown_predictions:
                # rating column is 0 — there is no true rating for these pairs.
                arr = np.array([
                    int(prediction.uid),
                    int(prediction.iid), 0, prediction.est
                ])
                if prediction == unknown_predictions[0]:
                    predictions = np.array([arr])
                else:
                    predictions = np.append(predictions, [arr], axis=0)

            self.df_unknown_predictions = pd.DataFrame({
                'userId': predictions[:, 0],
                'itemId': predictions[:, 1],
                'rating': predictions[:, 2],
                'prediction': predictions[:, 3]
            })

    def get_unknown_predictions(self, calculate_predictions=False):
        """Return (computing lazily) the unknown-ratings prediction dataframe."""
        if self.df_unknown_predictions is None or calculate_predictions == True:
            self.calculate_unknown_predictions()
        return self.df_unknown_predictions

    def predict(self, userId, itemId, verbose=False):
        """Predict a single rating for raw (userId, itemId); 0 if impossible."""
        if self.name in self.surprise_algorithms:
            prediction = self.model.predict(
                uid=int(userId),
                iid=int(itemId))  # Take as input the raw user id and item id
            #ref: http://surprise.readthedocs.io/en/stable/algobase.html#surprise.prediction_algorithms.algo_base.AlgoBase.predict
            if prediction.details['was_impossible'] == True:
                if (verbose):
                    print(
                        "Impossible to predict item {} rating for user {} (one of them may not have been in training step)"
                        .format(itemId, userId))
                return 0
            else:
                return prediction.est

    def get_top_n(self, n=10, source='unknown', calculate_sequence=False):
        '''Return the top-N recommendation for each user from a set of predictions.
        Args:
            n(int): The number of recommendation to output for each user. Default is 10.
        Returns:
            A dict where keys are user (raw) ids and values are lists of tuples: [(raw item id, rating estimation), ...] of size n.
        '''
        if (source.lower() == 'known'):
            # Checking if known predictions are calculated
            if (self.df_known_predictions is None):
                # NOTE(review): this calls get_unknown_predictions although the
                # 'known' cache was checked — likely should be
                # get_known_predictions; verify.
                self.get_unknown_predictions(calculate_predictions=True)
            if (calculate_sequence == True or self.known_sequence_dict is None):
                self.known_sequence_dict = dict()
                for userId in self.df_known_predictions['userId'].unique():
                    # Selecting single user
                    df_user = self.df_known_predictions[
                        self.df_known_predictions['userId'] == userId].copy()
                    # Sorting values by prediction
                    df_user.sort_values(by=['prediction'],
                                        ascending=False,
                                        inplace=True)
                    # Saving the first K in sequence dict
                    self.known_sequence_dict[userId] = np.array(
                        df_user['itemId'].head(n))
            return self.known_sequence_dict
} df = pd.DataFrame(ratings_dict) reader = Reader(rating_scale=(1, 5)) # Loads Pandas dataframe data = Dataset.load_from_df(df[["user", "item", "rating"]], reader) # recommender.py from surprise import KNNWithMeans # To use user-based cosine similarity sim_options = { "name": "cosine", "user_based": True, # Compute similarities between user } algo = KNNWithMeans(sim_options=sim_options) trainingSet = data.build_full_trainset() algo.fit(trainingSet) #predicting on the basis of user that how much E would rate to the second product prediction = algo.predict('E', 2) prediction.est if(prediction.est>3.5): print('The product is recommended.') print('Here, the prediction estimate is : '+str(prediction.est))
playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10) # 把歌曲id转成歌曲名字 # to_raw_uid映射回去 playlist_neighbors = (algo.trainset.to_raw_uid(inner_id) for inner_id in playlist_neighbors) playlist_neighbors = (id_name_dic[playlist_id] for playlist_id in playlist_neighbors) print "和歌单 《", current_playlist, "》 最接近的10个歌单为:\n" for playlist in playlist_neighbors: print playlist, algo.trainset.to_inner_uid(name_id_dic[playlist]) # 重建歌曲id到歌曲名的映射字典 song_id_name_dic = pickle.load(open("data/song.pkl", "rb"), encoding='utf-8') print "加载歌曲id到歌曲名的映射字典完成..." # 重建歌曲名到歌曲id的映射字典 song_name_id_dic = {} for song_id in song_id_name_dic: song_name_id_dic[song_id_name_dic[song_id]] = song_id print "加载歌曲名到歌曲id的映射字典完成..." #内部编码的4号用户 user_inner_id = 4 user_rating = trainset.ur[user_inner_id] items = map(lambda x: x[0], user_rating) for song in items: print algo.predict( user_inner_id, song, r_ui=1), song_id_name_dic[algo.trainset.to_raw_iid(song)]
def make_prediction(test_data_imdb):
    """Predict ratings for each row of test_data_imdb by blending three models.

    Combines an item-based KNNWithMeans, an SVD and a content-based predictor
    via weighted_prediction, and returns the test frame with predicted ratings.

    Args:
        test_data_imdb: DataFrame of test rows; itertuples positions 1 and 3
            are used as (user, item) — assumes user_id is the 1st and imdbID
            the 3rd column; TODO confirm column order against the caller.

    Returns:
        DataFrame with columns ['user_id', 'movieID', 'rating'].
        NOTE(review): 'movieID' here vs 'imdbID' elsewhere — confirm naming.
    """
    # NOTE(review): hard-coded relative paths; the function only works when run
    # from the expected working directory.
    train_data = pd.read_csv('../data/modeling/train/ratings_clean_std_0.csv',
                             sep=',').drop(columns={'Unnamed: 0'})
    omdb = pd.read_csv('../data/modeling/train/omdb_cleaned.csv')
    # build a reader, define the rating scale (minimum and maximum value)
    reader = Reader(rating_scale=(0.5, 5))
    # convert data to surprise format
    train_surprise = Dataset.load_from_df(train_data,
                                          reader).build_full_trainset()
    # Collaborative Filtering Models
    knn_collaborative = KNNWithMeans(k=115,
                                     min_k=5,
                                     sim_options={
                                         'name': 'msd',
                                         'user_based': False
                                     })
    knn_collaborative.fit(train_surprise)
    svd = SVD(lr_all=0.01, reg_all=0.05, n_epochs=23)
    svd.fit(train_surprise)
    # Collaborative predictions: [KNN estimates, SVD estimates] per test row.
    preds = [[
        knn_collaborative.predict(test[1], test[3]).est
        for test in test_data_imdb.itertuples()
    ], [
        svd.predict(test[1], test[3]).est
        for test in test_data_imdb.itertuples()
    ]]
    # Content-Based Models
    # define features for content-based models
    params_features = {
        'threshold_actors': 0,
        'ts_languages': 0,
        'year': True,
        'runtime': True,
        'imdbvotes': True,
        'series': False,
        'awards': False,
        'genres': True,
        'imdb_rating': True,
        'roto_rating': True,
        'pg_rating': True,
        'threshold_newkeywords': 0,
        'threshold_plots': 0,
        'threshold_directors': 0
    }
    # load features
    features, names = preprocessing.features(**params_features)
    # add imdbID and set as index
    features = omdb[['imdbID'
                     ]].join(pd.DataFrame(features)).set_index('imdbID')
    # predict ratings
    pred_content = []
    no_of_ratings = []
    # 'tt0720339' is an outlier movie with no feature information; drop it from
    # the training ratings and fall back to SVD when it appears in the test set.
    train_data = train_data[train_data['imdbID'] != 'tt0720339']
    for row in test_data_imdb.itertuples():
        # select user and movie
        imdbID = row.imdbID
        userID = row.user_id
        # compute predictions
        if imdbID == 'tt0720339':
            # exclude outlier movie without information
            pred_content.append(svd.predict(userID, imdbID).est)
        else:
            # select ratings of the user
            ratings_user = train_data.loc[train_data['user_id'] == userID]
            ratings_user.reset_index(inplace=True, drop=True)
            # select features of corresponding movies and convert to array
            features_user = np.array(features.loc[ratings_user['imdbID']])
            features_movie = np.array(features.loc[imdbID])
            pred_content.append(
                predict_movie_rating(ratings_user, features_user,
                                     features_movie))
        # store the number of predictions of a user:
        no_of_ratings.append(ratings_user.shape[0])
    # blend collaborative and content-based predictions
    predictions = weighted_prediction(preds[0], preds[1], pred_content,
                                      no_of_ratings)
    test_data_with_rating = test_data_imdb.join(predictions)
    return test_data_with_rating[['user_id', 'movieID', 'rating']]
from surprise import Dataset, print_perf, Reader from surprise.model_selection import cross_validate import os # 指定文件所在路径 file_path = os.path.expanduser('mydata.csv') # 告诉文本阅读器,文本的格式是怎么样的 reader = Reader(line_format='user item rating', sep=',') # 加载数据 data = Dataset.load_from_file(file_path, reader=reader) trainset = data.build_full_trainset() # Use user_based true/false to switch between user-based or item-based collaborative filtering algo = KNNWithMeans(k=50, sim_options={'user_based': False})#取最相似的用户进行计算时,只取最相似的k个 algo.fit(trainset) # we can now query for specific predicions uid = str(5) # raw user id iid = str(1) # raw item id # get a prediction for specific users and items. pred = algo.predict(uid, iid) print('rating of user-{0} to item-{1} is '.format(uid, iid), pred.est)# rating of user-5 to item-1 #---------------------------- uid = str(5) # raw user id iid = str(5) # raw item id # get a prediction for specific users and items. pred = algo.predict(uid, iid) print('rating of user-{0} to item-{1} is '.format(uid, iid), pred.est)
trainset, testset = train_test_split(data, test_size=.15) # Use user_based true/false to switch between user-based or item-based collaborative filtering algo = KNNWithMeans(k=50, sim_options={ 'name': 'pearson_baseline', 'user_based': True }) algo.fit(trainset) # we can now query for specific predicions uid = str(196) # raw user id iid = str(302) # raw item id # get a prediction for specific users and items. pred = algo.predict(uid, iid, r_ui=4, verbose=True) # run the trained model against the testset test_pred = algo.test(testset) # get RMSE print("User-based Model : Test Set") accuracy.rmse(test_pred, verbose=True) # if you wanted to evaluate on the trainset print("User-based Model : Training Set") train_pred = algo.test(trainset.build_testset()) accuracy.rmse(train_pred) # Use user_based true/false to switch between user-based or item-based collaborative filtering algo = KNNWithMeans(k=50,
algo3.fit(data_train.build_full_trainset()) pred1 = [] pred_f1 = [] pred2 = [] pred_f2 = [] pred3 = [] pred_f3 = [] with open("./data/testing.dat", "r", encoding='utf-8') as f: for line in f.readlines(): line_data = line.strip().split(",") a = algo1.predict(str(line_data[0]), str(line_data[1]), None, True, True)[3] b = algo2.predict(str(line_data[0]), str(line_data[1]), None, True, True)[3] c = algo3.predict(str(line_data[0]), str(line_data[1]), None, True, True)[3] pred1.append(int(round(a))) pred_f1.append(a) pred2.append(int(round(b))) pred_f2.append(b) pred3.append(int(round(c))) pred_f3.append(c) with open("./雷雨轩_PB18111791_4.txt", "w") as f: for ratings in pred1: f.write(str(ratings) + "\n") with open("./4_float.txt", "w") as f: for ratings in pred_f1: f.write(str(ratings) + "\n")
list_reviews = read_datafile(data_file) df = pd.DataFrame(list_reviews, columns=['UserId', 'ItemId', 'Playtime']) #filter_dataset(df) #normalize_playtime(df) reader = Reader(rating_scale=(0, max(df.Playtime))) sim_options = { "name": "cosine", "user_based": False, # Compute similarities between items } algo = KNNWithMeans(sim_options=sim_options) if cross_validate: data = Dataset.load_from_df(df, reader) cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True) else: train_df, test_df = train_test_split(df, test_size=0.2) train_data = Dataset.load_from_df(train_df, reader) training_set = train_data.build_full_trainset() algo.fit(training_set) for index, row in test_df.iterrows(): user = row['UserId'] item = row['ItemId'] playtime = row['Playtime'] prediction = algo.predict(user, item) print('{}:{} - {} / {}'.format(user, item, prediction, playtime))
print(movie_vector['Toy Story (1995)']) dataset = pd.DataFrame({ 'uid': movies_with_ratings.userId, 'iid': movies_with_ratings.title, 'rating': movies_with_ratings.rating }) print('dataset', dataset) ratings.rating.min() ratings.rating.max() reader = Reader(rating_scale=(0.5, 5.0)) data = Dataset.load_from_df(dataset, reader) trainset, testset = train_test_split(data, test_size=.15) algo = KNNWithMeans(k=50, sim_options={ 'name': 'pearson_baseline', 'user_based': True }) algo.fit(trainset) test_pred = algo.test(testset) print('accuracy', accuracy.rmse(test_pred, verbose=True)) print('predict', algo.predict(uid=2, iid='Fight Club (1999)').est)
class KNNMean: def __init__(self, data, rating_scale, k=50, min_k=1, sim_options=None): self.data = data self.rating_scale = rating_scale self.k = k self.min_k = min_k self.reader = Reader(rating_scale=self.rating_scale) if not sim_options: sim_options = { "name": "cosine", 'min_support': 3, "user_based": False } # Compute similarities between items self.model_data = Dataset.load_from_df( data.loc[:, ["userId", "movieId", "rating"]], self.reader) self.trainset = self.model_data.build_full_trainset() self.model = KNNWithMeans(self.k, self.min_k, sim_options=sim_options) print('fitting KNNWithMeans model...') self.model.fit(self.trainset) self.grid_search_ = None def set_model_params(self, model_params): print('updating model parameters...') self.model = KNNWithMeans(model_params) print('fitting KNNWithMeans model...') self.model.fit(self.trainset) def update_grid_search(self, gs): self.grid_search_ = gs def fit(self, data): self.data = data self.model_data = Dataset.load_from_df( data.loc[:, ["userId", "movieId", "rating"]], self.reader) self.trainset = self.model_data.build_full_trainset() self.model.fit(self.trainset) def grid_search(self): print('grid search...') sim_options = { "name": ["msd", "cosine"], "min_support": [3, 4], "user_based": [False] } param_grid = { "sim_options": sim_options, "k": [50, 100, 200], "min_k": [1] } gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3) gs.fit(self.model_data) best_params, best_score = gs.best_params["rmse"], gs.best_score["rmse"] print(f'Best score (RMSE): {best_score}') print(f'Best params (RMSE): {best_params}') print(f'Best score (MAE): {gs.best_score["mae"]}') print(f'Best params (RMSE): {gs.best_params["mae"]}') self.set_model_params(best_params) return best_params def predict(self, test_data): ratings = test_data.apply( lambda x: self.model.predict(x['userId'], x['movieId']).est, axis=1) return ratings
kf = KFold(n_splits=2) algo = KNNWithMeans(k=30, min_k=1, verbose=True) count = 0 for trainset, testset in kf.split(data): count = count + 1 print('Training round: ' + str(count)) # 训练并测试算法 print('Training') algo.fit(trainset) print('Train completed!') #if count is 1: #break print('Variating') predictions = algo.test(testset) print('Variation completed!') # 计算并打印 RMSE(均方根误差,Root Mean Squared Error) accuracy.rmse(predictions, verbose=True) # 保存模型 dump.dump('saved_model_knnm.model', algo=algo, verbose=1) # 生成结果 fo = open('submission.txt', 'w', encoding='utf-8') with open('test.txt', 'r', encoding='utf-8') as f: for line in f: row = line.split(',') fo.write(str(algo.predict(int(row[0]), int(row[1])).est) + '\n') fo.close()