def knn_item(trainset, testset, predset):
    """Fit an item-based KNNBaseline model and save its predictions.

    The model is scored on ``testset`` (RMSE) and the estimated ratings for
    both ``testset`` and ``predset`` are handed to ``save_predictions``.
    Nothing is done when ``is_already_predicted`` reports that predictions
    for this model already exist.

    Args:
        trainset: Surprise trainset used to fit the model.
        testset: Test ratings used to compute the RMSE.
        predset: Ratings to predict for the final output.

    Returns:
        None.
    """
    modelname = 'knnitem'

    # Skip the whole run when the predictions are already available.
    if is_already_predicted(modelname):
        return

    model = KNNBaseline(
        k=60,
        sim_options={
            'name': 'pearson_baseline',
            'shrinkage': 100,
            'user_based': False
        },
        bsl_options={
            'method': 'als',
            'reg_i': 1.e-5,
            'reg_u': 14.6,
            'n_epochs': 10
        })

    print('KNN item based Model')
    # NOTE(review): `train` is the older Surprise spelling; other code in this
    # project calls `fit` -- confirm which library version is targeted.
    model.train(trainset)

    test_predictions = model.test(testset)
    rmse = accuracy.rmse(test_predictions, verbose=False)
    print(' RMSE on Test: ', rmse)
    test_estimates = np.array([p.est for p in test_predictions], dtype=float)
    save_predictions(modelname, rmse, test_estimates, 'test')

    print(' Evaluate predicted ratings...')
    final_predictions = model.test(predset)
    final_estimates = np.array([p.est for p in final_predictions], dtype=float)
    save_predictions(modelname, rmse, final_estimates)
def knn_baseline_movie(train, test, ids, Xtest, Xids):
    """Item-based nearest-neighbour model with a Pearson-baseline similarity.

    Args:
        train: the trainset.
        test: the testset.
        ids: unknown ratings; ``ids[0]`` holds user ids and ``ids[1]`` the
            matching item ids.
        Xtest: list collecting test-set predictions of every model (used for
            the final blending); this model's predictions are appended.
        Xids: list collecting unknown-rating predictions of every model
            (used for the final blending); this model's are appended.

    Returns:
        Tuple ``(rmse, Xtest, Xids, preds_test, preds_ids)`` where ``rmse``
        is the test RMSE, ``preds_test`` a numpy array of test estimates and
        ``preds_ids`` a list of estimates for the unknown ratings.
    """
    print('kNN Baseline Movie')

    model = KNNBaseline(
        k=100,
        bsl_options={
            'method': 'als',
            'n_epochs': 100,
            'reg_u': 15,
            'reg_i': 0.01
        },
        sim_options={
            'name': 'pearson_baseline',
            'min_support': 1,
            'user_based': False
        },
        verbose=False)
    model.fit(train)

    # Training error: predict the ratings the model was fitted on.
    train_predictions = model.test(train.build_testset())
    print(' Training RMSE: ', accuracy.rmse(train_predictions, verbose=False))

    # Test error.
    test_predictions = model.test(test)
    rmse = accuracy.rmse(test_predictions, verbose=False)
    print(' Test RMSE: ', rmse)
    preds_test = np.array([p.est for p in test_predictions], dtype=float)

    # Unknown ratings: raw ids are passed to the model as strings.
    preds_ids = [
        model.predict(str(ids[0][row]), str(ids[1][row])).est
        for row in range(len(ids[0]))
    ]

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
def surprise_knn_ub(train_file, test_file):
    """User-based KNN predictions with the Surprise library.

    Trains a ``KNNBaseline`` model (k=300, ``pearson_baseline`` similarity
    computed between users) on ``train_file`` and predicts the ratings
    listed in ``test_file``.

    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file

    Returns:
        numpy array: estimated rating for every line of the test file
    """
    print("knnUB")

    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds([(train_file, test_file)], reader=reader)
    model = KNNBaseline(k=300,
                        sim_options={
                            'name': 'pearson_baseline',
                            'user_based': True
                        })

    # A single predefined fold is supplied, so this loop body runs once.
    for trainset, testset in PredefinedKFold().split(data):
        model.fit(trainset)
        fold_predictions = model.test(testset)
        pred = np.array([p.est for p in fold_predictions], dtype=float)

    return pred
def get_top_n_for_user(target_user_id, recom_alg, recom_size):
    """Return the best-rated unseen items for one user.

    The ratings in ``static/CRdata.csv`` are used to fit the requested
    algorithm on the full dataset; every (user, item) pair missing from the
    data is then predicted.

    Args:
        target_user_id: user whose recommendations are wanted (compared as
            ``str(target_user_id)`` against the raw user ids).
        recom_alg: ``'KNNBaseline'`` or ``'CoClustering'``; any other value
            selects ``SVD``.
        recom_size: maximum number of recommendations to return.

    Returns:
        List of ``(item id, estimated rating)`` tuples, best first; empty
        when there are no predictions for the user.
    """
    ratings_path = os.path.expanduser('static/CRdata.csv')
    reader = Reader(line_format='user item rating timestamp',
                    sep=',',
                    rating_scale=(0,100))
    trainset = Dataset.load_from_file(ratings_path,
                                      reader=reader).build_full_trainset()
    unseen_pairs = trainset.build_anti_testset()

    if recom_alg == 'KNNBaseline':
        # Similarities are computed between users.
        model = KNNBaseline(sim_options={'name': 'cosine', 'user_based': True})
    elif recom_alg == 'CoClustering':
        model = CoClustering()
    else:
        model = SVD()
    model.fit(trainset)

    # Group the estimates by user.
    per_user = defaultdict(list)
    for uid, iid, _true_r, est, _details in model.test(unseen_pairs):
        per_user[uid].append((iid, est))

    # Keep only the highest estimates for each user.
    ranked = {
        uid: sorted(pairs, key=lambda pair: pair[1],
                    reverse=True)[:recom_size]
        for uid, pairs in per_user.items()
    }
    return ranked.get(str(target_user_id), [])
def main():
    """Cross-validate a user-based KNNBaseline model and print precision/recall.

    Loads and cleans the rating data, keeps the first 1000 ratings, and runs
    a 5-fold cross-validation. For every fold the mean precision and mean
    recall over all users (as returned by ``precision_recall`` with k=20)
    are printed.
    """
    # Load, clean and slice the ratings.
    ratings = load_dataset(TRAINING_SET_PATH)
    ratings = remove_missing_values(ratings)
    drop_movie_list, ratings = slice_data(ratings)

    # Load the movie file (the result is not used further in this function).
    movies = load_movies_file(drop_movie_list, MOVIES_FILE_PATH)

    data = Dataset.load_from_df(
        ratings[['CustomerID', 'MovieID', 'Rating']][:1000], Reader())
    model = KNNBaseline(k=N,
                        sim_options={
                            'name': 'cosine',
                            'min_support': 2,
                            'shrinkage': 100,
                            'user_based': True
                        },
                        bsl_options={'method': 'sgd'})

    for fold_index, (trainset, testset) in enumerate(
            KFold(n_splits=5).split(data)):
        print("Running fold: ", fold_index)
        model.fit(trainset)
        precisions, recalls = precision_recall(model.test(testset), 20)

        # Average precision and recall over all users of this fold.
        print(sum(precisions.values()) / len(precisions))
        print(sum(recalls.values()) / len(recalls))
# Read the item file into a dictionary.
def read():
    """Map each raw item id of MovieLens-100k to its first two metadata fields.

    Returns:
        dict: ``{raw item id: (field 1, field 2)}`` taken from the
        ``|``-separated lines of ``u.item``.
    """
    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_name = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as file:
        for line in file:
            fields = line.split('|')
            rid_name[fields[0]] = (fields[1], fields[2])
    return rid_name


# Predict with the best parameters: only the pairs of the selected user that
# are missing from the trainset are evaluated.
test = train.build_anti_testset()
test = [entry for entry in test if entry[0] == user]
pr = algorithm.test(test)
name = read()

# Build the list of (item, estimate) pairs for every user (uid).
top = collections.defaultdict(list)
for uid, iid, _, est, _ in pr:
    top[uid].append((iid, round(est, 3)))

# Sort by estimate and keep only the first top_n entries.
for uid, ratings in top.items():
    ratings.sort(key=lambda x: x[1], reverse=True)
    top[uid] = ratings[:top_n]

# The placeholder has to be filled in explicitly; a plain string literal
# would print the text "{user}" verbatim.
print('User {user}:'.format(user=user))
for movie, rating in top[user]:
    print(movie, str(name[movie]), rating)
def collaborative_filtering(raw_uid):
    """Recommend songs for ``raw_uid`` by letting three Surprise models vote.

    All rows of the ``user_rating`` table are dumped to a tab-separated file,
    which is loaded as a Surprise dataset. ``BaselineOnly``, ``KNNBasic`` and
    ``KNNBaseline`` are fitted on the full dataset and each produces a top-N
    list for the user (via the project helpers ``user_build_anti_testset``,
    ``PredictionSet`` and ``get_top_n``). Every item then gets one vote per
    model that recommended it.

    Args:
        raw_uid: raw id of the user to recommend for.

    Returns:
        dict mapping the (at most 10) most-voted item ids to their vote
        count, or the set of BaselineOnly item ids when no item was
        recommended by more than one model.

    Raises:
        IOError: if the dataset file does not exist after being written.
    """
    # =============== data preparation ===============
    # Dump every rating stored in the database to a file.
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    if not os.path.exists(dir_data):
        os.makedirs(dir_data)

    # Keyword arguments are accepted by every PyMySQL release, whereas
    # positional ones are rejected by newer releases.
    db = pymysql.connect(host="localhost",
                         user="root",
                         password="password",
                         database="music_recommender",
                         charset='utf8')
    try:
        cursor = db.cursor()
        try:
            sql = """SELECT uid, song_id, rating FROM user_rating WHERE 1"""
            cursor.execute(sql)
            results = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the query fails.
        db.close()

    # Ratings grouped by song, handed to `evaluation` further down.
    songData = defaultdict(list)
    with open(file_path, "w+") as data_f:
        for uid, song_id, rating in results:
            songData[song_id].append(rating)
            data_f.write("{}\t{}\t{}\n".format(uid, song_id, rating))

    if not os.path.exists(file_path):
        raise IOError("Dataset file is not exists!")

    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)
    # The whole dataset is used for training.
    trainset = data.build_full_trainset()

    # ================= BaselineOnly =================
    bsl_options = {
        'method': 'sgd',
        'learning_rate': 0.0005,
    }
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)
    rset = user_build_anti_testset(trainset, raw_uid)
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=10)

    # ================= KNNBasic =================
    algo_KNNBasic = KNNBasic(sim_options={'name': 'pearson', 'user_based': True})
    algo_KNNBasic.fit(trainset)
    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
    predictions = algo_KNNBasic.test(predictor.user_build_anti_testset())
    top_n_knnbasic = get_top_n(predictions, n=1000)

    # ================= KNNBaseline =================
    algo_KNNBaseline = KNNBaseline(
        sim_options={'name': 'pearson_baseline', 'user_based': True})
    algo_KNNBaseline.fit(trainset)
    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    predictions = algo_KNNBaseline.test(predictor.user_build_anti_testset())
    top_n_knnbaseline = get_top_n(predictions, n=1000)

    all_top_n = [top_n_baselineonly, top_n_knnbasic, top_n_knnbaseline]
    evaluationMSEResult = evaluationMSE(all_top_n, raw_uid)

    # =============== voting ===============
    # Item ids recommended by each model; every recommendation is an
    # (item id, estimated rating, true score) triple.
    item_sets = [{
        iid
        for recommendations in top_n.values()
        for iid, _rating, _true_score in recommendations
    } for top_n in all_top_n]
    items_baselineonly = item_sets[0]

    # One vote per model that recommended the item.
    rank = {}
    for items in item_sets:
        for iid in items:
            rank[iid] = rank.get(iid, 0) + 1

    # Highest number of votes any item received (0 when nothing was
    # recommended). This must be the vote count, not the item id.
    max_rank = max(rank.values()) if rank else 0
    if max_rank <= 1:
        # The models do not agree on any item: fall back to BaselineOnly.
        return items_baselineonly

    resultAll = {k: rank[k] for k in nlargest(10, rank, key=lambda s: rank[s])}
    evaluation(songData, resultAll)

    # Final evaluation, restricted to the items that are actually returned.
    evaluationMSEResult1 = {
        key: evaluationMSEResult[key]
        for key in evaluationMSEResult if key in resultAll
    }
    print(evaluationMSEResult1,'evaluationMSEResult1==')
    return resultAll
def recommend(self, params):
    """Build the top-N recommendation list for one user.

    ``params`` selects the algorithm (``params["algorithm"]``: ``"svd"``,
    ``"knnItemBaseline"``, ``"weightedHybrid"``, ``"userCollaborative"`` or
    ``"bpr"``) and the user (``params["user"]``, a raw user id). For every
    algorithm except the hybrid one, a stored model is loaded when
    ``params["models"]`` is present; otherwise a model is trained from the
    hyper-parameters in ``params`` and, when ``params["name"]`` is given,
    saved under ``models/<algorithm>/<name>`` and registered in
    ``self.models``.

    Returns:
        List of ``(additional item data, score)`` tuples; the score is the
        rounded estimate for svd / knnItemBaseline / weightedHybrid and an
        empty string for userCollaborative / bpr.
    """
    user = self.fullTrainSet.to_inner_uid(params["user"])
    antiTestSet = self._buildAntiTestSetForUser(user)
    algo = params["algorithm"]
    path = "models/" + algo

    if algo == "svd":
        if "models" in params:
            # Re-use a stored model.
            manager = ModelManager()
            svd, _ = manager.loadModel(path + "/" + params["models"])
            predictions = svd.test(antiTestSet)
        else:
            svd = SVD(random_state=0,
                      reg_all=float(params["rr"]),
                      lr_all=float(params["lr"]),
                      n_epochs=int(params["ne"]),
                      n_factors=int(params["factors"]))
            svd = svd.fit(self.fullTrainSet)
            predictions = svd.test(antiTestSet)
            if "name" in params:
                manager = ModelManager()
                name = params["name"]
                manager.saveModel(svd, path + "/" + name)
                self.models[algo].append(name)
        topN = [(self.getAdditionalData(movieId), int(round(estimated, 0)))
                for movieId, estimated in self._getTopNForUser(predictions)]

    elif algo == "knnItemBaseline":
        if "models" in params:
            manager = ModelManager()
            knn, _ = manager.loadModel(path + "/" + params["models"])
            predictions = knn.test(antiTestSet)
        else:
            knn = KNNBaseline(sim_options={
                'name': 'cosine',
                'user_based': False
            },
                              k=int(params["k"]))
            knn = knn.fit(self.fullTrainSet)
            predictions = knn.test(antiTestSet)
            if "name" in params:
                manager = ModelManager()
                name = params["name"]
                manager.saveModel(knn, path + "/" + name)
                self.models[algo].append(name)
        topN = [(self.getAdditionalData(movieId), int(round(estimated, 0)))
                for movieId, estimated in self._getTopNForUser(predictions)]

    elif algo == "weightedHybrid":
        # Fixed hyper-parameters; this variant is never loaded or saved.
        svd = SVD(random_state=0,
                  reg_all=0.1,
                  lr_all=0.003,
                  n_factors=30,
                  verbose=False)
        knn = KNNBaseline(sim_options={
            'name': 'cosine',
            'user_based': False
        },
                          k=150)
        weightedHybrid = WeightedHybridAlgorithm(svd, knn, weights=[0.6, 0.4])
        weightedHybrid.fit(self.fullTrainSet)
        predictions = weightedHybrid.test(antiTestSet)
        topN = [(self.getAdditionalData(movieId), int(round(estimated, 0)))
                for movieId, estimated in self._getTopNForUser(predictions)]

    elif algo == "userCollaborative":
        if "models" in params:
            manager = ModelManager()
            knn, _ = manager.loadModel(path + "/" + params["models"])
            predictions = knn.test(antiTestSet)
        else:
            knn = knnRecAlgorithm(k=int(params["k"]),
                                  sim_options={
                                      'name': 'cosine',
                                      'user_based': True
                                  })
            knn = knn.fit(self.fullTrainSet)
            predictions = knn.test(antiTestSet)
            if "name" in params:
                manager = ModelManager()
                name = params["name"]
                manager.saveModel(knn, path + "/" + name)
                self.models[algo].append(name)
        # The estimate is not shown for this algorithm.
        topN = [(self.getAdditionalData(movieId), "")
                for movieId, estimated in self._getTopNForUser(
                    predictions, minimumRating=0.0)]

    elif algo == "bpr":
        if "models" in params:
            manager = ModelManager()
            bpr = manager.loadBprModel(path + "/" + params["models"])
        else:
            bpr = BPRecommender({
                "reg": float(params["rr"]),
                'learning_rate': float(params["lr"]),
                'n_iters': int(params["ni"]),
                'n_factors': int(params["factors"]),
                'batch_size': 100
            })
            bpr = bpr.fit()
            if "name" in params:
                manager = ModelManager()
                name = params["name"]
                manager.saveBprModel(bpr, path + "/" + name)
                self.models[algo].append(name)
        topN = [(self.getAdditionalData(movieId), "")
                for movieId in bpr.recommend(user)]

    return topN
def run_knn_baseline(sparse_data):
    """Train an item-based KNNBaseline model and evaluate its top-10 lists.

    ``preprocess`` writes the train/test files and returns the raw data plus
    the per-user purchased and held-out item sets. After fitting, the 10
    highest-scored not-yet-purchased items are selected for every user and
    compared with the user's held-out items. Per-user precision, recall,
    F-measure, hit and nDCG are written to ``knn_baseline_result.txt``,
    followed by their averages over all rows.

    Args:
        sparse_data: input forwarded unchanged to ``preprocess``.

    Returns:
        None.
    """
    prefix = "knn_baseline_"
    trainFile = prefix + "train.txt"
    testFile = prefix + "test.txt"
    top_k = 10

    raw_data, userPurchasedSet, userTrueTestSet = preprocess(
        sparse_data, trainFile, testFile)

    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_folds([(trainFile, testFile)], reader=reader)
    pkf = PredefinedKFold()

    bsl_options = {
        'method': 'sgd',
        'n_epochs': 20,
        'learning_rate': 0.005,
    }
    # sim name: cosine, msd, pearson, pearson_baseline
    # user_based: True  -> similarities are computed between users
    #             False -> similarities are computed between items
    sim_options = {'name': 'pearson_baseline', 'user_based': False}

    top_n = {}
    total_precisions = 0.0
    total_recalls = 0.0
    total_hit = 0.0
    total_nDCG = 0.0
    total_ffeature = 0.0

    # The context manager closes the result file even if evaluation fails.
    with open(prefix + "result.txt", "w") as result_f:
        for trainset, testset in pkf.split(data):
            algo = KNNBaseline(bsl_options=bsl_options,
                               sim_options=sim_options)
            algo.fit(trainset)
            pre = algo.test(testset)
            accuracy.rmse(pre)
            accuracy.mae(pre)

        rowNum = raw_data.get_row_size()
        colNum = raw_data.get_col_size()
        cur_time = time.time()

        for i in range(rowNum):
            user = raw_data.get_userID(i)

            # Min-heap holding the best `top_k` (score, item) pairs so far.
            pq = []
            if user in userPurchasedSet:
                purchased = userPurchasedSet[user]
                for j in range(colNum):
                    item = raw_data.get_itemID(j)
                    if item in purchased:
                        continue
                    score = algo.predict(user, item, r_ui=0,
                                         verbose=False).est
                    if len(pq) < top_k:
                        heapq.heappush(pq, (score, item))
                    else:
                        # Push first, then drop the smallest entry, so a
                        # low-scored newcomer can never evict a better item.
                        heapq.heappushpop(pq, (score, item))
            top_n[user] = {item for _, item in pq}

            if user in userTrueTestSet:
                truth = userTrueTestSet[user]
                curPrecisions = calculate_precision(top_n[user], truth)
                curRecalls = calculate_recall(top_n[user], truth)
                ffeature = calculate_f_feature(curPrecisions, curRecalls)
                curHit = isHit(top_n[user], truth)
                cur_nDCG = calculate_NDCG(top_n[user], truth)

                total_precisions += curPrecisions
                total_recalls += curRecalls
                total_hit += curHit
                total_nDCG += cur_nDCG
                total_ffeature += ffeature
                result_f.write(user + "\t" + str(curPrecisions) + "\t" +
                               str(curRecalls) + "\t" + str(ffeature) + "\t" +
                               str(curHit) + '\t' + str(cur_nDCG) + "\n")

            # Progress report every 1000 users. A single pre-joined string
            # prints the same text under Python 2 and Python 3.
            if i != 0 and i % 1000 == 0:
                duration = (time.time() - cur_time) / 60
                remaining_time = ((rowNum - i) / 1000) * duration
                cur_time = time.time()
                print(" ".join(
                    str(part)
                    for part in ('i:', i, "/", rowNum, 'remaining time:',
                                 remaining_time, 'min')))
                print(" ".join(
                    str(part)
                    for part in ('precicions', total_precisions, ' recalls',
                                 total_recalls, ' hit', total_hit, 'nDCG:',
                                 total_nDCG)))

        # Averages are taken over all rows, including users without
        # held-out items.
        rowNum = raw_data.get_row_size()
        print(" ".join(
            str(part)
            for part in ('avg_precisions:', total_precisions / rowNum,
                         'avg_recalls:', total_recalls / rowNum,
                         'avg_ffeature', str(total_ffeature / rowNum),
                         'avg_hit:', total_hit / rowNum, 'avg_nDCG:',
                         total_nDCG / rowNum)))
        result_f.write("avg:\t" + str(total_precisions / rowNum) + "\t" +
                       str(total_recalls / rowNum) + "\t" +
                       str(total_ffeature / rowNum) + "\t" +
                       str(total_hit / rowNum) + '\t' +
                       str(total_nDCG / rowNum) + "\n")
# Name the rating columns, then join ratings with the user table on 'user_id'
# (inner join: only users that have ratings are kept).
rating.columns = ['route_id', 'user_id', 'rating']
df = pd.merge(user, rating, on='user_id', how='inner')
#df.drop(['user_id', 'Age'], axis=1, inplace=True)
# The value of this expression is discarded when run as a script
# (presumably left over from interactive use).
df.head()

# Build a Surprise dataset from (user, item, rating) with ratings in [0, 5],
# holding out 20% of the ratings for testing.
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df[['user_id', 'route_id', 'rating']], reader)
train, test = train_test_split(data, test_size=.2)

# First candidate: user-based KNNBaseline, MSD similarity, up to 30 neighbours.
sim_options = {'name': 'msd', 'min_support': 5, 'user_based': True}
base1 = KNNBaseline(k=30,sim_options=sim_options)
base1.fit(train)
base1_preds = base1.test(test)
accuracy.rmse(base1_preds)

# Second candidate: user-based KNNBaseline, cosine similarity, up to 2
# neighbours.
sim_options1 = {'name': 'cosine', 'min_support': 5, 'user_based': True}
base13 = KNNBaseline(k=2,sim_options=sim_options1)
base13.fit(train)
base13_preds = base13.test(test)
acc = accuracy.rmse(base13_preds)

# Persist the second model together with its test predictions.
dump.dump('KNNFinal_Model',algo=base13,predictions=base13_preds)
def collaborative_fitlering(raw_uid):
    """Recommend songs for ``raw_uid`` by letting three Surprise models vote.

    All rows of the ``user_rating`` table are dumped to a tab-separated file,
    which is loaded as a Surprise dataset. ``BaselineOnly``, ``KNNBasic`` and
    ``KNNBaseline`` are fitted on the full dataset and each produces a top-5
    list for the user (via the project helpers ``user_build_anti_testset``,
    ``PredictionSet`` and ``get_top_n``). Every item then gets one vote per
    model that recommended it.

    Args:
        raw_uid: raw id of the user to recommend for.

    Returns:
        list of the (at most 5) most-voted item ids, or the set of
        BaselineOnly item ids when no item was recommended by more than one
        model.

    Raises:
        IOError: if the dataset file does not exist after being written.
    """
    # =============== data preparation ===============
    # Dump every rating stored in the database to a file.
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    if not os.path.exists(dir_data):
        os.makedirs(dir_data)

    # Keyword arguments are accepted by every PyMySQL release, whereas
    # positional ones are rejected by newer releases.
    db = pymysql.connect(host="localhost",
                         user="music_system",
                         password="music_system",
                         database="music_recommender",
                         charset='utf8')
    try:
        cursor = db.cursor()
        try:
            sql = """SELECT uid, song_id, rating FROM user_rating WHERE 1"""
            cursor.execute(sql)
            results = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the query fails.
        db.close()

    with open(file_path, "w+") as data_f:
        for uid, song_id, rating in results:
            data_f.write("{}\t{}\t{}\n".format(uid, song_id, rating))

    if not os.path.exists(file_path):
        raise IOError("Dataset file is not exists!")

    # =============== cf recommend ===============
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)
    # The whole dataset is used for training.
    trainset = data.build_full_trainset()

    # ================= BaselineOnly =================
    bsl_options = {
        'method': 'sgd',
        'learning_rate': 0.0005,
    }
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)
    rset = user_build_anti_testset(trainset, raw_uid)
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=5)

    # ================= KNNBasic =================
    # Only the pairs produced by PredictionSet for this user are predicted.
    algo_KNNBasic = KNNBasic(sim_options={'name': 'pearson', 'user_based': True})
    algo_KNNBasic.fit(trainset)
    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
    predictions = algo_KNNBasic.test(predictor.user_build_anti_testset())
    top_n_knnbasic = get_top_n(predictions, n=5)

    # ================= KNNBaseline =================
    algo_KNNBaseline = KNNBaseline(
        sim_options={'name': 'pearson_baseline', 'user_based': True})
    algo_KNNBaseline.fit(trainset)
    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    predictions = algo_KNNBaseline.test(predictor.user_build_anti_testset())
    top_n_knnbaseline = get_top_n(predictions, n=5)

    # =============== voting ===============
    # Item ids recommended by each model; every recommendation is an
    # (item id, estimated rating) pair.
    item_sets = [{
        iid
        for recommendations in top_n.values()
        for iid, _rating in recommendations
    } for top_n in (top_n_baselineonly, top_n_knnbasic, top_n_knnbaseline)]
    items_baselineonly = item_sets[0]

    # One vote per model that recommended the item.
    rank = {}
    for items in item_sets:
        for iid in items:
            rank[iid] = rank.get(iid, 0) + 1

    # Highest number of votes any item received (0 when nothing was
    # recommended). This must be the vote count, not the item id.
    max_rank = max(rank.values()) if rank else 0
    if max_rank <= 1:
        # The models do not agree on any item: fall back to BaselineOnly.
        return items_baselineonly

    return nlargest(5, rank, key=lambda s: rank[s])
class recsysBase:
    """Convenience wrapper around a Surprise dataset and a single algorithm.

    The constructor splits the data, instantiates the requested algorithm
    and fits it. The remaining methods evaluate, tune, persist and query the
    fitted model.
    """

    # Class-level placeholders: the attributes exist even when __init__
    # returns early because no data was supplied.
    data = ''
    trainset = ''
    testset = ''
    algorithm = ''
    algo = ''
    predictions = ''

    def __init__(self,
                 data,
                 algorithm='svd',
                 algo_options={},
                 testset_percent=0):
        """Split ``data``, create the algorithm and fit it.

        Args:
            data: Surprise dataset; when falsy nothing is initialised.
            algorithm: ``'svd'``, ``'knn_basic'`` or ``'knn_baseline'``.
            algo_options: similarity options for ``'knn_baseline'``; when
                empty, an item-based ``pearson_baseline`` similarity is used.
                The passed dict is never modified.
            testset_percent: share of ratings held out for testing. With 0
                the model is trained on everything and the testset consists
                of all (user, item) pairs missing from the trainset.

        Raises:
            ValueError: if ``algorithm`` is not one of the supported names.
        """
        if not data:
            return
        self.data = data
        self.algorithm = algorithm

        if testset_percent == 0:
            self.trainset = self.data.build_full_trainset()
            self.testset = self.trainset.build_anti_testset()
        else:
            self.trainset, self.testset = train_test_split(
                self.data, test_size=testset_percent)

        if self.algorithm == 'svd':
            self.algo = SVD()
        elif self.algorithm == 'knn_basic':
            self.algo = KNNBasic()
        elif self.algorithm == 'knn_baseline':
            if not algo_options:
                algo_options = {
                    'name': 'pearson_baseline',
                    'user_based': False
                }
            self.algo = KNNBaseline(sim_options=algo_options)
        else:
            # Fail with a clear message instead of an AttributeError on the
            # placeholder string below.
            raise ValueError('Unknown algorithm: {!r}'.format(algorithm))
        self.algo.fit(self.trainset)

    def exec(self):
        """Run the three (overridable) pipeline steps in order."""
        self.step1()
        self.step2()
        self.step3()

    def step1(self):
        """Hook for subclasses; does nothing here."""
        pass

    def step2(self):
        """Hook for subclasses; does nothing here."""
        pass

    def step3(self):
        """Hook for subclasses; does nothing here."""
        pass

    def _has_predictions(self):
        """Tell whether ``self.predictions`` holds at least one entry.

        ``len`` is used instead of truthiness so that a DataFrame loaded by
        ``load_from_file`` does not raise an ambiguous-truth-value error.
        """
        return self.predictions is not None and len(self.predictions) > 0

    def compute_rmse(self):
        """Print the RMSE of the stored predictions, computing them if needed."""
        if not self._has_predictions():
            # Predict directly instead of calling test(): test() reports the
            # RMSE itself, which used to print it twice and to recurse
            # endlessly when the prediction list stayed empty.
            self.predictions = self.algo.test(self.testset)
        accuracy.rmse(self.predictions)

    def load_from_file(self, file_path='predictions.csv'):
        """Load predictions from a CSV file into ``self.predictions``."""
        self.predictions = pd.read_csv(file_path)

    def save_to_file(self, file_path='predictions.csv'):
        """Write ``self.predictions`` to a CSV file (without the index)."""
        pd.DataFrame(self.predictions).to_csv(file_path, index=False)

    def benchmark(self):
        """Print a 5-fold cross-validation (RMSE, MAE) of the algorithm."""
        cross_validate(self.algo,
                       self.data,
                       measures=['RMSE', 'MAE'],
                       cv=5,
                       verbose=True)

    def tune(self,
             opt_field='rmse',
             param_grid={
                 'n_epochs': [5, 10],
                 'lr_all': [0.002, 0.005],
                 'reg_all': [0.4, 0.6]
             },
             SHOW_RESULT=False):
        """Grid-search SVD hyper-parameters and refit the best estimator.

        Only has an effect when the instance was created with
        ``algorithm='svd'``.

        Args:
            opt_field: measure whose best estimator is kept.
            param_grid: parameter grid handed to ``GridSearchCV``.
            SHOW_RESULT: print the best RMSE score and its parameters.

        Returns:
            self, to allow chaining.
        """
        if self.algorithm == 'svd':
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
            gs.fit(self.data)

            # Keep the best estimator and fit it on the training split.
            self.algo = gs.best_estimator[opt_field]
            self.algo.fit(self.trainset)

            if SHOW_RESULT:
                # Best RMSE score
                print(gs.best_score['rmse'])
                # Combination of parameters that gave the best RMSE score
                print(gs.best_params['rmse'])
        return self

    def tune_and_test(self,
                      unbiased_percent=0.1,
                      opt_field='rmse',
                      param_grid={
                          'n_epochs': [5, 10],
                          'lr_all': [0.001, 0.01]
                      }):
        """Tune SVD on a part A of the ratings and test on the unseen part B.

        The raw ratings are shuffled in place and ``self.data.raw_ratings``
        is replaced by part A, so ``self.data`` holds only part A afterwards.

        Args:
            unbiased_percent: share of the ratings put aside as part B.
            opt_field: measure whose best estimator is kept.
            param_grid: parameter grid handed to ``GridSearchCV``.

        Returns:
            self, to allow chaining.
        """
        raw_ratings = self.data.raw_ratings
        random.shuffle(raw_ratings)

        # A = first (1 - unbiased_percent) of the ratings, B = the rest.
        threshold = int((1 - unbiased_percent) * len(raw_ratings))
        A_raw_ratings = raw_ratings[:threshold]
        B_raw_ratings = raw_ratings[threshold:]

        data = self.data
        data.raw_ratings = A_raw_ratings

        # Select the best algorithm with a grid search on A.
        grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
        grid_search.fit(data)
        self.algo = grid_search.best_estimator[opt_field]

        # Retrain on the whole set A.
        trainset = data.build_full_trainset()
        self.algo.fit(trainset)

        # Biased accuracy: the model has seen these ratings.
        predictions = self.algo.test(trainset.build_testset())
        print('Biased accuracy on A,', end=' ')
        accuracy.rmse(predictions)

        # Unbiased accuracy: B was never used for tuning or training.
        testset = data.construct_testset(B_raw_ratings)
        predictions = self.algo.test(testset)
        print('Unbiased accuracy on B,', end=' ')
        accuracy.rmse(predictions)
        return self

    def test(self):
        """Predict the testset, store the predictions and print their RMSE."""
        self.predictions = self.algo.test(self.testset)
        accuracy.rmse(self.predictions)

    def get_top_n(self, target_uid=None, n=10, SHOW_RESULT=False):
        '''Return the top-N recommendations from the stored predictions.

        Predictions are computed from the testset when none are stored yet.

        Args:
            target_uid: raw user id (compared as a string). When falsy, the
                recommendations of every user are returned.
            n(int): The number of recommendations per user. Default is 10.
            SHOW_RESULT: print the result before returning it.

        Returns:
            With ``target_uid``: a list ``[(raw item id, rating estimation),
            ...]`` of size at most n, empty when there is no prediction for
            that user. Otherwise a dict where keys are user (raw) ids and
            values are such lists.
        '''
        if target_uid:
            target_uid = str(target_uid)

        if not self._has_predictions():
            self.predictions = self.algo.test(self.testset)

        # First map the predictions to each user.
        by_user = defaultdict(list)
        for uid, iid, true_r, est, _ in self.predictions:
            by_user[uid].append((iid, est))

        def best(user_ratings):
            # Highest estimates first, truncated to n entries.
            return sorted(user_ratings, key=lambda x: x[1], reverse=True)[:n]

        if target_uid:
            top_n = best(by_user.get(target_uid, []))
        else:
            top_n = defaultdict(list)
            for uid, user_ratings in by_user.items():
                top_n[uid] = best(user_ratings)

        if SHOW_RESULT:
            if isinstance(top_n, dict):
                for uid, user_ratings in top_n.items():
                    print(uid, [iid for (iid, _) in user_ratings])
            else:
                print(top_n)
        return top_n

    def precision_recall_at_k(self,
                              target_uid=1,
                              threshold=3.5,
                              k=10,
                              num_of_testset=5,
                              SHOW_RESULT=True):
        """Compute precision@k and recall@k over a K-fold cross-validation.

        ``self.algo`` is refitted on every fold, so it is left trained on the
        last fold's trainset.

        Args:
            target_uid: user whose metrics are reported (compared as a
                string). When falsy, the mean over all users is reported.
            threshold: minimum rating for an item to count as relevant (true
                rating) or recommended (estimated rating).
            k: number of top-estimated items considered per user.
            num_of_testset: number of folds.
            SHOW_RESULT: print per-fold means and the final lists.

        Returns:
            Tuple ``(precisions, recalls)``: one value per fold. Folds with
            no predictions, or without test ratings of ``target_uid``, are
            skipped.
        """
        if target_uid:
            target_uid = str(target_uid)

        kf = KFold(n_splits=num_of_testset)
        final_precision = []
        final_recalls = []

        for trainset, testset in kf.split(self.data):
            self.algo.fit(trainset)
            predictions = self.algo.test(testset)

            # First map the predictions to each user.
            user_est_true = defaultdict(list)
            for uid, _, true_r, est, _ in predictions:
                user_est_true[uid].append((est, true_r))

            precisions = dict()
            recalls = dict()
            for uid, user_ratings in user_est_true.items():
                # Sort user ratings by estimated value
                user_ratings.sort(key=lambda x: x[0], reverse=True)
                top_k = user_ratings[:k]

                # Number of relevant items
                n_rel = sum(
                    (true_r >= threshold) for (_, true_r) in user_ratings)
                # Number of recommended items in top k
                n_rec_k = sum((est >= threshold) for (est, _) in top_k)
                # Number of relevant and recommended items in top k
                n_rel_and_rec_k = sum(
                    ((true_r >= threshold) and (est >= threshold))
                    for (est, true_r) in top_k)

                # Precision@K: proportion of recommended items that are
                # relevant
                precisions[uid] = (n_rel_and_rec_k / n_rec_k
                                   if n_rec_k != 0 else 1)
                # Recall@K: proportion of relevant items that are recommended
                recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

            if not precisions:
                # Nothing was predicted in this fold.
                continue

            mean_precision = sum(precisions.values()) / len(precisions)
            mean_recall = sum(recalls.values()) / len(recalls)
            if SHOW_RESULT:
                print('Relevant: ' + str(mean_precision))
                print('Recommended: ' + str(mean_recall))

            if not target_uid:
                final_precision.append(mean_precision)
                final_recalls.append(mean_recall)
            elif target_uid in precisions:
                # Report the requested user rather than whichever user
                # happened to be iterated last.
                final_precision.append(precisions[target_uid])
                final_recalls.append(recalls[target_uid])

        if SHOW_RESULT:
            print(final_precision, final_recalls)
        return final_precision, final_recalls

    def read_item_names(self,
                        file_name=get_dataset_dir() +
                        '/ml-100k/ml-100k/u.item'):
        """Read the u.item file from MovieLens 100-k dataset and return two
        mappings to convert raw ids into movie names and movie names into
        raw ids.
        """
        rid_to_name = {}
        name_to_rid = {}
        with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
            for line in f:
                fields = line.split('|')
                rid_to_name[fields[0]] = fields[1]
                name_to_rid[fields[1]] = fields[0]
        return rid_to_name, name_to_rid

    def get_k_neighbors(self,
                        name='Toy Story (1995)',
                        k=10,
                        SHOW_RESULT=True):
        """Return the names of the ``k`` nearest neighbours of a movie.

        The instance is re-initialised with ``algorithm='knn_baseline'`` on
        the full dataset when it was created with another algorithm.

        Args:
            name: movie title as found in the item file.
            k: number of neighbours.
            SHOW_RESULT: print the neighbours.

        Returns:
            list of movie names, nearest first.

        Raises:
            KeyError: if ``name`` is not present in the item file.
        """
        # Neighbours require the KNN model.
        if self.algorithm != 'knn_baseline':
            self.__init__(data=self.data,
                          algorithm='knn_baseline',
                          testset_percent=0)

        # Read the mappings raw id <-> movie name
        rid_to_name, name_to_rid = self.read_item_names()

        input_raw_id = name_to_rid[name]
        input_inner_id = self.algo.trainset.to_inner_iid(input_raw_id)

        # Inner ids of the nearest neighbours, converted to raw ids and then
        # to names. A list is built so that printing does not consume the
        # result handed back to the caller.
        neighbor_inner_ids = self.algo.get_neighbors(input_inner_id, k=k)
        input_neighbors = [
            rid_to_name[self.algo.trainset.to_raw_iid(inner_id)]
            for inner_id in neighbor_inner_ids
        ]

        if SHOW_RESULT:
            print('\nThe ' + str(k) + ' nearest neighbors of "' + name +
                  '" are:')
            for neighbor in input_neighbors:
                print(neighbor)
        return input_neighbors
We are setting minimum number of neighbous (min_k) 1 and maximum number of neighbours (k) = 40 We train the model on train set ''' algo2 = KNNBasic(sim_options=sim_options, k=40, min_k=1) algo2.fit(trainset) predictions2 = algo2.test(testset) print("RMSE for KNNBasic:", accuracy.rmse(predictions2, verbose=True)) # In[ ]: ''' We build the model by making use of KNNBasic which is collaborative filtering based algorithm. We are setting minimum number of neighbous (min_k) 1 and maximum number of neighbours (k) = 40 We train the model on train set ''' algo3 = KNNBaseline(sim_options=sim_options, k=40, min_k=1) algo3.fit(trainset) predictions3 = algo3.test(testset) print("RMSE for KNNBaseline:", accuracy.rmse(predictions3, verbose=True)) # In[ ]: ''' We build the model by making use of KNNBasic which is collaborative filtering based algorithm. We are setting minimum number of neighbous (min_k) 1 and maximum number of neighbours (k) = 40 We train the model on train set ''' algo4 = KNNWithZScore(sim_options=sim_options, k=40, min_k=1) algo4.fit(trainset) predictions4 = algo4.test(testset) print("RMSE for KNNBasic:", accuracy.rmse(predictions4, verbose=True))
import pandas as pd
import numpy as np
from tqdm import tqdm
from surprise import KNNWithMeans, KNNBasic, KNNWithZScore, KNNBaseline
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from scipy.spatial.distance import cityblock, cosine, euclidean, hamming, jaccard, rogerstanimoto

# Load the built-in MovieLens-1M dataset and hold out 15% of the ratings.
data = Dataset.load_builtin('ml-1m')
trainset, testset = train_test_split(data, test_size=.15)

# User-based KNN with baseline ratings and Pearson-baseline similarity.
algo = KNNBaseline(k=50,
                   min_k=1,
                   sim_options={
                       'name': 'pearson_baseline',
                       'user_based': True
                   })
algo.fit(trainset)

test_pred = algo.test(testset)
print('accuracy', accuracy.rmse(test_pred, verbose=True))

# predict() takes *raw* ids, which are strings for datasets read from file.
# The previous call used an int user id and the movie title, so neither id was
# known to the trainset and the returned estimate was only a fallback value.
# '2959' is the MovieLens id of "Fight Club (1999)".
print('predict', algo.predict(uid='2', iid='2959').est)
# Evaluate the SVD model (``sv``) on the test set.
pred = sv.test(testset)
odf = pd.DataFrame(pred, columns=['uid', 'iid', 'rui', 'est', 'details'])
odf['err'] = abs(odf.est - odf.rui)
print("\n***** SVD Model Prediction Result via model file for Two record*****")
accuracy.rmse(pred, verbose=True)
# Fixed: MAE must be computed on the SVD predictions evaluated above (`pred`),
# not on `test_pred`, which belongs to a different model.
accuracy.mae(pred, verbose=True)
print(odf.head())

# Part-4 - Train the model using KNNBaseline item-item similarity
sim_options = {'name': 'pearson_baseline', 'user_based': False}
simsAlgo = KNNBaseline(sim_options=sim_options)
simsAlgo.fit(trainset)
test_pred = simsAlgo.test(testset)
df = pd.DataFrame(test_pred, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['err'] = abs(df.est - df.rui)
print(
    "****************KNNBaseline item-item similarity: Accuracy Score *****************"
)
accuracy.rmse(test_pred, verbose=True)
accuracy.mae(test_pred, verbose=True)
print(df.head())

# Part-5 - Train the model using KNNBaseline User-User similarity and get the
# Top-10 movies predictions for each user
sim_options = {'name': 'pearson_baseline', 'user_based': True}
simsAlgo = KNNBaseline(sim_options=sim_options)
simsAlgo.fit(trainset)
# Load the custom ratings file and train on every rating it contains.
data = Dataset.load_from_file(file_path=custom_dataset_path, reader=reader)
trainingSet = data.build_full_trainset()

# Pearson-baseline similarity with shrinkage disabled.
sim_options = dict(name='pearson_baseline', shrinkage=0)

knn = KNNBaseline(sim_options=sim_options)
knn.fit(trainingSet)

# Predict every (user, item) pair that is absent from the training set.
testSet = trainingSet.build_anti_testset()
predictions = knn.test(testSet)

from collections import defaultdict


def get_top3_recommendations(predictions, topN = 10):
    """Group predictions by user and keep the ``topN`` best-rated items each.

    ``predictions`` is an iterable of 5-tuples
    ``(uid, iid, true_rating, estimate, details)``. The result maps each uid
    to a list of ``(iid, estimate)`` pairs sorted by estimate, highest first.
    Despite the function name, the default cut-off is 10.
    """
    per_user = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        per_user[uid].append((iid, est))

    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:topN]

    return per_user


import os
import io
# Then sort the predictions for each user and retrieve the k highest ones. for uid, user_ratings in top_n.items(): user_ratings.sort(key=lambda x: x[1], reverse=True) top_n[uid] = user_ratings[:n] return top_n reader = Reader(line_format='user item rating', sep=',', skip_lines=1, rating_scale=(0, 5)) data = Dataset.load_from_file('base.csv', reader=reader) trainset = data.build_full_trainset() #Train the algoritihm to compute the similarities between users sim_options = {'name': 'pearson_baseline'} algo = KNNBaseline(sim_options=sim_options) algo.fit(trainset) # Than predict ratings for all pairs (u, i) that are NOT in the training set. testset = trainset.build_anti_testset() predictions = algo.test(testset) top_n = get_top_n(predictions, n=5) # Print the recommended items for each user for uid, user_ratings in top_n.items(): print(uid, [iid for (iid, _) in user_ratings])
def collab_recommendations(user_id,
                           df1,
                           ratings,
                           movieIds,
                           movies_ratings,
                           keep_movies1,
                           df2,
                           keep_movies2,
                           content_recommedation_system=False,
                           collab_recommendation_system=False,
                           top_n=10,
                           precision=False):
    """Return collaborative-filtering recommendations for one user.

    With ``precision=True`` a ``KNNBaseline`` model is fitted on ``ratings``
    and used to predict the (userId, movieId, rating) rows of ``df2``.
    Otherwise ``df2`` is taken as an already computed prediction table with
    ``userId``, ``movieId`` and ``prediction`` columns.

    The predictions of ``user_id`` are merged with ``movies_ratings`` on
    ``movieId`` and sorted by ``prediction`` (descending), then by
    ``weighted_avg`` (ascending).

    ``df1``, ``movieIds``, ``keep_movies1``, ``keep_movies2``, ``top_n`` and
    the two ``*_recommendation_system`` flags are accepted but not used here.
    """
    if not precision:
        # Use the precomputed predictions as they are.
        collab_predictions = df2.copy()
    else:
        rating_cols = ['userId', 'movieId', 'rating']
        held_out = df2.copy()

        # KNN similarity settings: Pearson-baseline, no shrinkage.
        knn_sim_options = {'name': 'pearson_baseline', 'shrinkage': 0}

        # The rating scale spans the observed minimum and maximum rating.
        collab_ratings = ratings[rating_cols]
        reader = Reader(rating_scale=(collab_ratings.rating.min(),
                                      collab_ratings.rating.max()))

        # Fit on all available training ratings.
        trainset = Dataset.load_from_df(collab_ratings, reader).build_full_trainset()
        algo = KNNBaseline(sim_options=knn_sim_options)
        algo.fit(trainset)

        # Predict every held-out (user, movie, rating) row.
        testset = [tuple(row) for row in held_out[rating_cols].to_numpy()]
        predictions = algo.test(testset)

        # Keep only the ids and the estimate, under the project's column names.
        id_cols = ['userId', 'movieId']
        collab_predictions = pd.DataFrame(predictions)[['uid', 'iid', 'est']]
        collab_predictions = collab_predictions.rename(
            columns={'est': 'prediction', 'uid': 'userId', 'iid': 'movieId'}
        )[id_cols + ['prediction']]
        collab_predictions[id_cols] = collab_predictions[id_cols].astype(int)

    # Restrict to the requested user, attach the movie statistics and rank.
    user_rows = collab_predictions[collab_predictions.userId == user_id]
    collab_rec = pd.merge(user_rows, movies_ratings, on='movieId')
    return collab_rec.sort_values(['prediction', 'weighted_avg'],
                                  ascending=[False, True])
def three_ensemble_predict(m1_preds, m2_preds, m3_preds, w_1, w_2, w_3):
    """Blend three aligned prediction lists into one weighted ensemble.

    The three lists are walked in lockstep; each triple must refer to the
    same (user, item) pair (fields 0 and 1), which is checked with ``assert``.
    The blended estimate is ``w_1 * est1 + w_2 * est2 + w_3 * est3`` (field 3).
    The true rating (field 2) and details (field 4) are copied from the first
    model's prediction.
    """
    blended = []
    for first, second, third in zip(m1_preds, m2_preds, m3_preds):
        # All three models must have predicted the same user / item.
        assert first[0] == second[0] == third[0]
        assert first[1] == second[1] == third[1]
        weighted_est = (w_1 * first[3]) + (w_2 * second[3]) + (w_3 * third[3])
        blended.append(
            Prediction(first[0], first[1], first[2], weighted_est, first[4]))
    return blended


# Compute biased accuracy on A (the data the models were trained on).
train_svd_preds = final_svd_algo.test(trainset.build_testset())
train_knn_preds = item_knn_algo.test(trainset.build_testset())
train_preds = two_ensemble_predict(
    train_svd_preds,
    train_knn_preds,
    opt_w_svd,
    opt_w_knn,
)
train_rmse = accuracy.rmse(train_preds)
train_mae = accuracy.mae(train_preds)
print(f'Biased RMSE on training set: {train_rmse}')
print(f'Biased MAE on training set: {train_mae}')

# Compute unbiased accuracy on B (the held-out raw ratings).
testset = data.construct_testset(test_raw_ratings)  # testset is now the set B
test_svd_preds = final_svd_algo.test(testset)
test_knn_preds = item_knn_algo.test(testset)
def collaborative_filtering(raw_uid):
    """Train three recommenders on the stored ratings and print top-5 picks.

    All rows of the ``user_rating`` table are dumped to a tab-separated file,
    loaded as a Surprise dataset, and used to fit ``BaselineOnly``,
    ``KNNBasic`` and ``KNNBaseline``. For each model the items not yet rated
    by ``raw_uid`` are scored and the five best are printed.

    Args:
        raw_uid: Raw user id to recommend for; it is used as the key into the
            per-user top-n mappings.

    Raises:
        IOError: If the dataset file does not exist after being written.
    """
    # =============== Data preprocessing ===========================
    # Dump every rating stored in the database into a text file.
    # TODO: To modify the file path of the data set
    # (deployment path: '/www/wwwroot/music_recommender/page/cf_recommendation/cf_data')
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    os.makedirs(dir_data, exist_ok=True)

    # Open the database connection. Keyword arguments are required by
    # PyMySQL >= 1.0, which no longer accepts positional connection arguments.
    db = pymysql.connect(host="localhost",
                         user="root",
                         password="password",
                         database="music_recommender",
                         charset='utf8')
    try:
        # Create a cursor object and fetch all ratings.
        cursor = db.cursor()
        sql = """SELECT uid, song_id, rating FROM user_rating WHERE 1"""
        cursor.execute(sql)
        results = cursor.fetchall()
    finally:
        # Always release the connection, even if the query fails.
        db.close()

    with open(file_path, "w+") as data_f:
        for uid, song_id, rating in results:
            data_f.write("{}\t{}\t{}\n".format(uid, song_id, rating))

    if not os.path.exists(file_path):
        raise IOError("Dataset file is not exists!")

    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)

    # Build the training set from every available rating.
    trainset = data.build_full_trainset()

    # BaselineOnly
    bsl_options = {
        'method': 'sgd',
        'learning_rate': 0.0005,
    }
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)

    rset = user_build_anti_testset(trainset, raw_uid)
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=5)

    # KNNBasic
    sim_options = {'name': 'pearson', 'user_based': True}
    algo_KNNBasic = KNNBasic(sim_options=sim_options)
    algo_KNNBasic.fit(trainset)

    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBasic.test(knn_anti_set)
    top_n_knnbasic = get_top_n(predictions, n=5)

    # KNNBaseline
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo_KNNBaseline = KNNBaseline(sim_options=sim_options)
    algo_KNNBaseline.fit(trainset)

    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBaseline.test(knn_anti_set)
    top_n_knnbaseline = get_top_n(predictions, n=5)

    print({raw_uid: top_n_baselineonly[raw_uid]})
    print({raw_uid: top_n_knnbasic[raw_uid]})
    # Fixed: the third line used to repeat the KNNBasic result.
    print({raw_uid: top_n_knnbaseline[raw_uid]})