def baseline(trainset, testset, predset):
    """Train a BaselineOnly (ALS) model, report train/test RMSE and save
    predictions for the test set and the final prediction set.

    Args:
        trainset: Surprise Trainset used for fitting.
        testset: list of (user, item, rating) tuples for evaluation.
        predset: list of (user, item, rating) tuples to predict and save.
    """
    modelname = 'baseline'
    # Check if predictions already exist; skip the whole run if so.
    if is_already_predicted(modelname):
        return

    bsl_options = {'method': 'als',
                   'reg_i': 1.e-5,
                   'reg_u': 14.6,
                   'n_epochs': 10}
    algo = BaselineOnly(bsl_options=bsl_options)
    print('Baseline Model')
    # fit() replaces train(), which was removed from the Surprise API.
    algo.fit(trainset)

    predictions = algo.test(trainset.build_testset())
    print(' RMSE on Train: ', accuracy.rmse(predictions, verbose=False))

    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print(' RMSE on Test: ', rmse)

    # Collect estimated ratings in testset order.
    preds = np.array([pred.est for pred in predictions])
    save_predictions(modelname, rmse, preds, 'test')

    print(' Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.array([pred.est for pred in predictions])
    save_predictions(modelname, rmse, preds)
def base_running_time(data):
    '''
    Calculates the running times for training and predictions for Baseline
    algorithm

    Args:
        data(Dataset): a list of datasets with different numbers of users

    Returns:
        elapsedtime_Basetrain: running time for training
        elapsedtime_Basetest: running time for predictions on testset
    '''
    elapsedtime_Basetrain = []
    elapsedtime_Basetest = []

    # calculate running times
    for dataset in data:
        # training running time
        training_start = time.time()
        training = dataset.build_full_trainset()
        testing = training.build_anti_testset()
        baseline = BaselineOnly()
        baseline.train(training)
        elapsedtime_Basetrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        baseline.test(testing)
        elapsedtime_Basetest.append(time.time() - test_start)

    # BUG FIX: the function previously returned the training times twice,
    # so callers never saw the prediction timings.
    return elapsedtime_Basetrain, elapsedtime_Basetest
def normalize_affinity_scores_by_user_item_bs(user_item_affinities: List[Tuple[str, str, float]],
                                              rating_scale=(1, 5)) \
        -> Tuple[float, Dict[str, float], Dict[str, float], float, List[Tuple[str, str, float]]]:
    """Fit a Surprise SGD baseline on the affinities and decompose ratings
    into global mean, per-user bias, per-item bias and residuals.

    Args:
        user_item_affinities: (user, item, rating) triples.
        rating_scale: (min, max) rating bounds passed to the Reader.

    Returns:
        Tuple of (global mean, user->bias dict, item->bias dict,
        max absolute residual, list of (user, item, residual) triples).
    """
    train = pd.DataFrame(user_item_affinities)
    reader = Reader(rating_scale=rating_scale)
    trainset = Dataset.load_from_df(train, reader).build_full_trainset()
    trainset_for_testing = trainset.build_testset()

    algo = BaselineOnly(bsl_options={'method': 'sgd'})
    algo.fit(trainset)
    predictions = algo.test(trainset_for_testing)

    mean = algo.trainset.global_mean
    # Map raw ids back to the learned bias terms via the inner-id tables.
    bu = {u: algo.bu[algo.trainset.to_inner_uid(u)]
          for u in {u for u, i, r in user_item_affinities}}
    bi = {i: algo.bi[algo.trainset.to_inner_iid(i)]
          for i in {i for u, i, r in user_item_affinities}}

    # Residual = actual rating minus the baseline estimate.
    # (Removed dead debug locals that recomputed the estimates by hand.)
    uid = pd.DataFrame([[p.uid, p.iid, p.r_ui - p.est] for p in predictions],
                       columns=["user", "item", "rating"])
    spread = max(uid["rating"].max(), np.abs(uid["rating"].min()))
    uid = list(zip(uid['user'], uid['item'], uid['rating']))

    # Unknown users/items fall back to a zero bias.
    bu = defaultdict(float, bu)
    bi = defaultdict(float, bi)
    return mean, bu, bi, spread, uid
def use_als():
    """Fit BaselineOnly with ALS options on ml-100k and return
    [rmse, mae, elapsed seconds] over the anti-testset."""
    start = time.time()
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using ALS')
    bsl_options = {'method': 'als', 'n_epochs': 20, 'reg_u': 12, 'reg_i': 5}
    algo_ALS = BaselineOnly(bsl_options=bsl_options)
    algo_ALS.fit(trainset)

    # Score every (user, item) pair that is absent from the training data.
    testset = trainset.build_anti_testset()
    predictions_ALS = algo_ALS.test(testset)

    performance = [accuracy.rmse(predictions_ALS),
                   accuracy.mae(predictions_ALS)]
    performance.append(time.time() - start)
    return performance
def use_sgd():
    """Fit BaselineOnly with SGD options on ml-100k and return
    [rmse, mae, elapsed seconds] over the anti-testset."""
    start = time.time()
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using SGD')
    bsl_options = {'method': 'sgd', 'learning_rate': .005, }
    algo_SGD = BaselineOnly(bsl_options=bsl_options)
    algo_SGD.fit(trainset)

    # Score every (user, item) pair that is absent from the training data.
    testset = trainset.build_anti_testset()
    predictions_SGD = algo_SGD.test(testset)

    performance = [accuracy.rmse(predictions_SGD),
                   accuracy.mae(predictions_SGD)]
    performance.append(time.time() - start)
    return performance
def surprise_baseline(train_file, test_file):
    """
    Baseline with Surprise library.
    Compute the predictions on a test_set after training on a train_set
    using the method Baseline from Surprise.

    Args:
        train_file (string): path to created train file
        test_file (string): path to created test file
    Hyperparameters:
        -
    Returns:
        numpy array: predictions
    """
    # DOC FIX: the train/test file descriptions were swapped above.
    print("baseline")
    algo = BaselineOnly()
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)
        # Predict: collect the estimated ratings in testset order.
        predictions = algo.test(testset)
        pred = np.array([p.est for p in predictions])
    return pred
def test_dump():
    """Train an algorithm, compute its predictions then dump them.
    Ensure that the predictions that are loaded back are the correct
    ones, and that the predictions of the dumped algorithm are also equal
    to the other ones."""
    random.seed(0)

    base_dir = os.path.dirname(__file__)
    folds = [(os.path.join(base_dir, './u1_ml100k_train'),
              os.path.join(base_dir, './u1_ml100k_test'))]
    data = Dataset.load_from_folds(folds, Reader('ml-100k'))
    trainset, testset = next(PredefinedKFold().split(data))

    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        # Round-trip both the predictions and the fitted model.
        dump.dump(tmp_file.name, predictions, algo)
        predictions_dumped, algo_dumped = dump.load(tmp_file.name)
        predictions_algo_dumped = algo_dumped.test(testset)
        assert predictions == predictions_dumped
        assert predictions == predictions_algo_dumped
def baseline_bias_model(df):
    """Shows the performance of model based on just bias."""
    # Drop the free-text columns; Surprise only needs (user, item, rating).
    ratings_pandas_df = df.drop(columns=['date', 'text'])
    reader = Reader(rating_scale=(1, 5))
    data = surprise.dataset.Dataset.load_from_df(df=ratings_pandas_df, reader=reader)

    # Raw-id -> inner-id lookup tables from the full trainset, returned
    # so callers can translate ids later.
    ts = data.build_full_trainset()
    dusers = ts._raw2inner_id_users
    ditems = ts._raw2inner_id_items

    trainset, testset = train_test_split(data)
    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)
    print('\n')
    return (trainset, testset, predictions, dusers, ditems)
def baseline(trainset, testset):
    """Fit a plain BaselineOnly model, print RMSE/MAE on the testset and
    return the predictions."""
    algo = BaselineOnly()
    algo.fit(trainset)

    print("Predictions")
    predictions = algo.test(testset)
    # Both accuracy helpers print their metric by default.
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    return predictions
def predict(path):
    """Fit BaselineOnly and SVD on the full ratings file and pickle the
    top-100 recommendations of each model.

    Args:
        path (string): ratings CSV with a header row (user, item, rating).

    Side effects:
        Writes 'baseline_predictions.pickle' in the working directory.
    """
    # Read data and transform it into a full trainset.
    reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
    data = Dataset.load_from_file("{}".format(path), reader=reader)
    all_train = data.build_full_trainset()

    bsl = BaselineOnly()
    svd = SVD()
    bsl.fit(all_train)
    svd.fit(all_train)

    # Predict all (user, item) pairs absent from the training data.
    all_test = all_train.build_anti_testset()
    bsl_predictions = bsl.test(all_test)
    bsl_pred = get_top_n(bsl_predictions, 100)

    # BUG FIX: the SVD predictions previously came from `bsl`, so both
    # pickled result sets were the baseline model's output.
    svd_predictions = svd.test(all_test)
    svd_pred = get_top_n(svd_predictions, 100)

    # Protocol 2 kept for Python 2 compatibility of the original file.
    with open("baseline_predictions.pickle", "wb") as f:
        pickle.dump([bsl_pred, svd_pred], f, protocol=2)
    print("Done recommending using baseline model and SVD model.")
def baseline_only(train, test, ids, Xtest, Xids):
    """
    Combines user and item mean with user and item biases

    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('Baseline Only')
    bsl_options = {'method': 'als',
                   'n_epochs': 100,
                   'reg_u': 15,
                   'reg_i': 0.01}
    algo = BaselineOnly(bsl_options=bsl_options, verbose=False)

    # Train algorithm on training set
    algo.fit(train)

    # Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print(' Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    # Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print(' Test RMSE: ', rmse)

    preds_test = np.array([p.est for p in predictions])

    # Predict unknown ratings; ids[0] holds users, ids[1] holds items.
    preds_ids = [algo.predict(str(u), str(i)).est
                 for u, i in zip(ids[0], ids[1])]

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
def baseline(trainset, testset):
    """Run the Surprise BaselineOnly algorithm and return
    (rmse, mae, predictions)."""
    banner = "-" * 5
    print("\n" + banner + " Baseline algorithm using surprise package " + banner)

    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)

    # accuracy helpers print their metric and return it.
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
class BaseLineRecommender(object):
    """Wrap Surprise's BaselineOnly algorithm as the prediction baseline."""

    def __init__(self):
        # Underlying Surprise model; created lazily in fit().
        self.model = None

    def fit(self, train):
        """Fit the model on a Surprise trainset."""
        options = {'method': 'sgd',
                   'n_epochs': 30,
                   'reg': 0.01,
                   'learning_rate': 0.01}
        self.model = BaselineOnly(bsl_options=options)
        self.model.fit(train)

    def predict(self, user_id, item_id):
        """Predict the rating of item_id by user_id."""
        return self.model.predict(user_id, item_id)

    def rmse(self, test):
        """Calculate RMSE for the predicted ratings."""
        return accuracy.rmse(self.model.test(test))

    def mae(self, test):
        """Calculate MAE for the predicted ratings."""
        return accuracy.mae(self.model.test(test))
def test_dump(u1_ml100k):
    """Train an algorithm, compute its predictions then dump them.
    Ensure that the predictions that are loaded back are the correct
    ones, and that the predictions of the dumped algorithm are also equal
    to the other ones."""
    random.seed(0)

    trainset, testset = next(PredefinedKFold().split(u1_ml100k))
    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        # Round-trip predictions and fitted model through a dump file.
        dump.dump(tmp_file.name, predictions, algo)
        predictions_dumped, algo_dumped = dump.load(tmp_file.name)
        assert predictions == predictions_dumped
        # The reloaded model must reproduce the original predictions too.
        predictions_algo_dumped = algo_dumped.test(testset)
        assert predictions == predictions_algo_dumped
def baseline(training, testing):
    '''
    Calculates RMSE, coverage and running time of Baseline model

    Args:
        training(Dataset): training dataset
        testing(Dataset): test dataset

    Returns:
        rmse: RMSE of Baseline with optimized parameters
        top_n: number of unique predictions for top n items
    '''
    # fit model -- fit() supersedes the train() method removed from Surprise
    baseline = BaselineOnly()
    baseline.fit(training)

    # evaluate the model using test data
    predictions = baseline.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)
    return rmse, top_n
def test_trainset_testset():
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    for trainset, testset in data.folds():
        pass  # just need trainset and testset to be set

    # test ur: inner user id -> list of (inner item id, rating) pairs
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir: inner item id -> list of (inner user id, rating) pairs
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner: known raw ids map to consecutive inner ids,
    # unknown raw ids raise ValueError
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unkown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unkown_item')

    # test inner2raw: reverse mappings are built lazily on first use
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method: it must contain exactly the rated
    # (user, item, rating) triples, in raw-id form.
    algo = BaselineOnly()
    algo.train(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method: unrated pairs only, with the
    # global mean as the filler rating.
    algo = BaselineOnly()
    algo.train(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset
nmf_results = [] reader = Reader(rating_scale=(0, np.inf)) data = Dataset.load_from_df( usergroups_df[["user_id", "item_id", "rating"]], reader) folds_it = KFold(n_splits=5).split(data) i = 1 pl_fit = [] for trainset, testset in folds_it: print("Fold: %d" % i) i += 1 print("Baseline") baseline = BaselineOnly() baseline.fit(trainset) baseline_predictions = baseline.test(testset) results = get_group_measures(preds_all=baseline_predictions, U1=U1_users, U2=U2_users, U3=U3_users, U4=U4_users) baseline_results.append(results) print("KNN") knn = KNNBasic(sim_options={"name": "pearson"}) #knn = KNNBasic(sim_options={"name": "cosine"}) knn.fit(trainset) knn_predictions = knn.test(testset) results = get_group_measures(preds_all=knn_predictions, U1=U1_users, U2=U2_users,
from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly
from surprise import accuracy
from surprise.model_selection import KFold

# Read the ratings file: user, item, rating, timestamp; comma-separated,
# skipping the header row.
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)
train_set = data.build_full_trainset()

# Baselines optimised with ALS (could also use 'sgd'); separate
# regularisation terms for users (reg_u) and items (reg_i).
bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
model = BaselineOnly(bsl_options=bsl_options)

# K-fold cross-validation, K=5: fit on each training fold and report RMSE
# on the corresponding test fold.
kf = KFold(n_splits=5)
for trainset, testset in kf.split(data):
    model.fit(trainset)
    pred = model.test(testset)
    accuracy.rmse(pred)

# Predict user 300's rating of item 180 (true rating 4) with the model
# fitted on the last fold.
uid = str(300)
iid = str(180)
pred = model.predict(uid, iid, r_ui=4, verbose=True)
def collaborative_filtering(raw_uid):
    """Build hybrid top-N song recommendations for `raw_uid`.

    Dumps all ratings from MySQL to a tab-separated file, trains three
    Surprise models (BaselineOnly, KNNBasic, KNNBaseline) on it, then
    merges their top-N lists by counting how many models recommended
    each item.

    Returns either the BaselineOnly item set (fallback branch) or a dict
    mapping the 10 highest-voted item ids to their vote counts.
    """
    # To read the data from a txt file
    # =============== data preprocessing ===========================
    # Export every rating row from the database into a flat file.
    # dir_data = '/www/wwwroot/music_recommender/page/cf_recommendation/cf_data'
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    if not os.path.exists(dir_data):
        os.makedirs(dir_data)

    # Database operations: open a connection to MySQL.
    db = pymysql.connect("localhost", "root", "password", "music_recommender", charset='utf8')
    # Create a cursor object with the cursor() method.
    cursor = db.cursor()
    songData = defaultdict(list)  # song_id -> list of all its ratings
    sql = """SELECT uid, song_id, rating FROM user_rating WHERE 1"""
    cursor.execute(sql)
    results = cursor.fetchall()
    with open(file_path, "w+") as data_f:
        a = 0
        for result in results:
            uid, song_id, rating = result
            if song_id in songData:
                songData[song_id].append(rating)
            else:
                songData[song_id] = [rating]
            data_f.writelines("{}\t{}\t{}\n".format(uid, song_id, rating))
            a += 1

    if not os.path.exists(file_path):
        raise IOError("Dataset file is not exists!")

    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)

    # Build the training set
    trainset = data.build_full_trainset()

    # BaselineOnly with SGD-optimised baseline estimates.
    bsl_options = {'method': 'sgd', 'learning_rate': 0.0005, }
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)  # train the model

    # Score every item `raw_uid` has not rated yet.
    rset = user_build_anti_testset(trainset, raw_uid)
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=10)
    # print(predictions)
    # uid      raw user id
    # iid      raw item id
    # r_ui     true rating as a float
    # est      estimated rating as a float
    # details  extra details about the prediction
    # print(top_n_baselineonly, 'top_n_baselineonly')

    # KNNBasic: user-based Pearson similarity.
    sim_options = {'name': 'pearson', 'user_based': True}
    algo_KNNBasic = KNNBasic(sim_options=sim_options)
    algo_KNNBasic.fit(trainset)
    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBasic.test(knn_anti_set)
    top_n_knnbasic = get_top_n(predictions, n=1000)
    # print(predictions, 'top_n_knnbasic')

    # KNNBaseline: user-based Pearson-baseline similarity.
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo_KNNBaseline = KNNBaseline(sim_options=sim_options)
    algo_KNNBaseline.fit(trainset)
    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBaseline.test(knn_anti_set)
    top_n_knnbaseline = get_top_n(predictions, n=1000)

    evaluationMSEResult = evaluationMSE([top_n_baselineonly, top_n_knnbasic, top_n_knnbaseline], raw_uid)

    # Union of every item recommended by any of the three models.
    recommendset = set()
    for results in [top_n_baselineonly, top_n_knnbasic, top_n_knnbaseline]:
        for key in results.keys():
            for recommendations in results[key]:
                iid, rating, true_score = recommendations
                recommendset.add(iid)

    items_baselineonly = set()
    for key in top_n_baselineonly.keys():
        for recommendations in top_n_baselineonly[key]:
            iid, rating, true_score = recommendations
            items_baselineonly.add(iid)

    items_knnbasic = set()
    for key in top_n_knnbasic.keys():
        for recommendations in top_n_knnbasic[key]:
            iid, rating, true_score = recommendations
            items_knnbasic.add(iid)

    items_knnbaseline = set()
    for key in top_n_knnbaseline.keys():
        for recommendations in top_n_knnbaseline[key]:
            iid, rating, true_score = recommendations
            items_knnbaseline.add(iid)

    # Vote: one point per model that recommended the item.
    rank = dict()
    for recommendation in recommendset:
        if recommendation not in rank:
            rank[recommendation] = 0
        if recommendation in items_baselineonly:
            rank[recommendation] += 1
        if recommendation in items_knnbasic:
            rank[recommendation] += 1
        if recommendation in items_knnbaseline:
            rank[recommendation] += 1

    # NOTE(review): max_rank is the *item id* with the highest vote count,
    # so `max_rank == 1` compares an id to 1, not the vote count itself.
    # `rank[max_rank] == 1` looks intended -- confirm before changing.
    max_rank = max(rank, key=lambda s: rank[s])
    evaluationMSEResult1 = {}
    if max_rank == 1:
        return items_baselineonly
    else:
        resultAll = dict()
        result = nlargest(10, rank, key=lambda s: rank[s])
        for k in result:
            resultAll[k] = rank[k]
        # print("ranking result: {}".format(resultAll))
        evaluation(songData, resultAll)
        # Keep only the MSE entries for items that made the final list.
        for key in evaluationMSEResult:
            if key in resultAll:
                evaluationMSEResult1[key] = evaluationMSEResult[key]
        print(evaluationMSEResult1,'evaluationMSEResult1==')  # final evaluation
        return resultAll
def collaborative_fitlering(raw_uid):
    """Build hybrid top-N song recommendations for `raw_uid`.

    (Name keeps the original 'fitlering' typo -- callers depend on it.)

    Dumps all ratings from MySQL to a tab-separated file, trains three
    Surprise models (BaselineOnly, KNNBasic, KNNBaseline) on it, then
    merges their top-5 lists by counting how many models recommended
    each item.

    Returns:
        Either the BaselineOnly item set (no-consensus fallback) or the
        list of the 5 highest-voted item ids.
    """
    # =============== data preprocessing ===========================
    # Export every rating row from the database into a flat file.
    # dir_data = '/www/wwwroot/music_recommender/page/cf_recommendation/cf_data'
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    if not os.path.exists(dir_data):
        os.makedirs(dir_data)

    # Database operations: open a connection to MySQL.
    db = pymysql.connect("localhost", "music_system", "music_system", "music_recommender", charset='utf8')
    cursor = db.cursor()
    sql = """SELECT uid, song_id, rating FROM user_rating WHERE 1"""
    cursor.execute(sql)
    results = cursor.fetchall()
    with open(file_path, "w+") as data_f:
        for result in results:
            uid, song_id, rating = result
            data_f.writelines("{}\t{}\t{}\n".format(uid, song_id, rating))

    if not os.path.exists(file_path):
        raise IOError("Dataset file is not exists!")

    # =========== cf recommend ==================
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)
    # Train on the full data set.
    trainset = data.build_full_trainset()

    # ================= BaselineOnly ==================
    bsl_options = {'method': 'sgd', 'learning_rate': 0.0005, }
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)
    # Score every item `raw_uid` has not rated yet.
    rset = user_build_anti_testset(trainset, raw_uid)
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=5)

    # ================= KNNBasic ==================
    sim_options = {'name': 'pearson', 'user_based': True}
    algo_KNNBasic = KNNBasic(sim_options=sim_options)
    algo_KNNBasic.fit(trainset)
    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBasic.test(knn_anti_set)
    top_n_knnbasic = get_top_n(predictions, n=5)

    # ================= KNNBaseline ==================
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo_KNNBaseline = KNNBaseline(sim_options=sim_options)
    algo_KNNBaseline.fit(trainset)
    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBaseline.test(knn_anti_set)
    top_n_knnbaseline = get_top_n(predictions, n=5)

    # =============== merge the three result lists ==================
    def _item_set(top_n):
        # Flatten a get_top_n dict into the set of recommended item ids.
        return {iid for recs in top_n.values() for iid, _ in recs}

    items_baselineonly = _item_set(top_n_baselineonly)
    items_knnbasic = _item_set(top_n_knnbasic)
    items_knnbaseline = _item_set(top_n_knnbaseline)
    recommendset = items_baselineonly | items_knnbasic | items_knnbaseline

    # Vote: one point per model that recommended the item.
    rank = {iid: ((iid in items_baselineonly)
                  + (iid in items_knnbasic)
                  + (iid in items_knnbaseline))
            for iid in recommendset}

    max_rank = max(rank, key=lambda s: rank[s])
    # BUG FIX: the original compared the *item id* (`max_rank`) to 1, which
    # never matched string ids, so the no-consensus fallback never fired.
    # Compare the best vote count instead.
    if rank[max_rank] == 1:
        return items_baselineonly
    else:
        result = nlargest(5, rank, key=lambda s: rank[s])
        return result
def test_trainset_testset(toy_data_reader):
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader, rating_scale=(1, 5))

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur: inner user id -> list of (inner item id, rating) pairs
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir: inner item id -> list of (inner user id, rating) pairs
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner: known raw ids map to consecutive inner ids,
    # unknown raw ids raise ValueError
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unkown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unkown_item')

    # test inner2raw: reverse mappings are built lazily on first use
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method: it must contain exactly the rated
    # (user, item, rating) triples, in raw-id form.
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method: unrated pairs only, with the
    # global mean as the filler rating.
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset
def test_trainset_testset():
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur: entries are (inner item id, rating, None) triples -- the
    # third slot is presumably an absent timestamp; confirm in Trainset.
    ur = trainset.ur
    assert ur[0] == [(0, 4, None)]
    assert ur[1] == [(0, 4, None), (1, 2, None)]
    assert ur[40] == []  # not in the trainset

    # test ir: same triples keyed by inner item id
    ir = trainset.ir
    assert ir[0] == [(0, 4, None), (1, 4, None), (2, 1, None)]
    assert ir[1] == [(1, 2, None), (2, 1, None), (3, 5, None)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test user features: empty everywhere since no u_features_df was given
    u_features = trainset.u_features
    assert u_features[0] == []  # no u_features_df added
    assert u_features[1] == []  # no u_features_df added
    assert u_features[3] == []  # no u_features_df added
    assert u_features[40] == []  # not in trainset and no u_features_df
    assert trainset.user_features_labels == []
    assert trainset.n_user_features == 0

    # test item features: likewise empty without an i_features_df
    i_features = trainset.i_features
    assert i_features[0] == []  # no i_features_df added
    assert i_features[1] == []  # no i_features_df added
    assert i_features[20000] == []  # not in trainset and no i_features_df
    assert trainset.item_features_labels == []
    assert trainset.n_item_features == 0

    # test raw2inner: known raw ids map to consecutive inner ids,
    # unknown raw ids raise ValueError
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw: reverse mappings are built lazily on first use
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method: entries carry empty user/item
    # feature lists alongside the rating.
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [], [], 4) in testset
    assert ('user3', 'item1', [], [], 5) in testset
    assert ('user3', 'item1', [], [], 0) not in testset

    # Test the build_anti_testset() method: unrated pairs only, with the
    # global mean as the filler rating.
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [], [], trainset.global_mean) not in testset
    assert ('user3', 'item1', [], [], trainset.global_mean) not in testset
    assert ('user0', 'item1', [], [], trainset.global_mean) in testset
    assert ('user3', 'item0', [], [], trainset.global_mean) in testset
################# predictions using BaselineOnly #################
print('')
print('Making recommendations...')
print('')

algo4 = BaselineOnly()
algo4.fit(trainset)
predictions4 = algo4.test(testset)

# Top-N lists per user; 672 is the user being recommended to.
dictMovies4 = get_top_n(predictions4)
topMovies4 = dictMovies4.get(672)

print('')
print('Here are the top 5 recommendations based on Baseline algorithm! ')
for i in range(5):
    movieRecc4 = topMovies4[i]
    movieRawID4 = movieRecc4[0]
    movieName4 = movie[movieRawID4]
    print(str(i+1) + '. ' + movieName4)

################# predictions using Matrix-Factorization #################
from surprise import Dataset, Reader, BaselineOnly
import pandas as pd

# Load the training ratings and the test pairs; the test file gets a dummy
# rating column because Surprise requires one.
train_rating_df = pd.read_csv("train_rating.txt", header=0, index_col=0)
test = pd.read_csv('test_rating.txt', header=0, index_col=0)
test['dummy_rating'] = '-1'

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_rating_df[['user_id', 'business_id', 'rating']], reader)
trainset = data.build_full_trainset()

# SGD-optimised baseline estimates.
bsl_options = {'method': 'sgd', 'reg': 0.08, 'n_epochs': 50,
               'learning_rate': 0.0035, 'reg_u': 0.07, 'reg_i': 0.2}
algo = BaselineOnly(bsl_options=bsl_options)
algo.train(trainset)

# Wrap the test pairs as a Surprise testset and predict every rating.
testdata = Dataset.load_from_df(test[['user_id', 'business_id', 'dummy_rating']], reader)
predictions = algo.test(testdata.construct_testset(raw_testset=testdata.raw_ratings))

# Write the estimated ratings out as the submission file.
df = pd.DataFrame(predictions)
newdf = df['est']
newdf.rename('rating', inplace=True)
newdf.to_csv('submission.csv', header='rating', index_label='test_id')
#import pandas as pd

# Read the ratings file: user, item, rating, timestamp; comma-separated,
# skipping the header row.
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)
train_set = data.build_full_trainset()

# Baseline options: SGD optimisation (ALS variant kept for reference).
#bsl_options = {'method': 'als','n_epochs': 5,'reg_u': 12,'reg_i': 5}
bsl_options = {'method': 'sgd', 'n_epochs': 5}
algo = BaselineOnly(bsl_options=bsl_options)
#algo = BaselineOnly()
#algo = NormalPredictor()

# K-fold cross-validation iterator, K=3.
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # Fit, predict, then report RMSE for this fold.
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)

# Predict user 196's rating of item 302 (true rating 4).
uid = str(196)
iid = str(302)
pred = algo.predict(uid, iid, r_ui=4, verbose=True)
def make_predictions(user_id):
    """Train SVD, KNNBasic and an ALS baseline on ml-100k, keep the model
    with the lowest anti-testset RMSE, and return recommended movie titles
    for ``user_id``.

    Args:
        user_id: raw user id (compared against the raw uids in the top-n map).

    Returns:
        list[str]: string representations of the recommended movie titles
        (pandas Series reprs, matching the original behavior).
    """
    performance = []
    algorithms = ['SVD', 'KNN', 'ALS']

    # First train an SVD algorithm on the movielens dataset.
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    algo_SVD = SVD()
    algo_SVD.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the
    # training set.
    testset = trainset.build_anti_testset()
    predictions_SVD = algo_SVD.test(testset)
    accuracy_SVD = accuracy.rmse(predictions_SVD)
    performance.append(accuracy_SVD)

    algo_KNN = KNNBasic()
    algo_KNN.fit(trainset)
    # BUG FIX: the original called algo_SVD.test() here, so the "KNN" RMSE
    # duplicated the SVD one and KNN could never be selected.
    predictions_KNN = algo_KNN.test(testset)
    accuracy_KNN = accuracy.rmse(predictions_KNN)
    performance.append(accuracy_KNN)

    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    algo_ALS = BaselineOnly(bsl_options=bsl_options)
    algo_ALS.fit(trainset)
    predictions_ALS = algo_ALS.test(testset)
    accuracy_ALS = accuracy.rmse(predictions_ALS)
    performance.append(accuracy_ALS)

    # Comparing algorithms by performance: lowest RMSE wins.
    best_performance_index = performance.index(min(performance))
    best_algorithm = algorithms[best_performance_index]
    if best_algorithm == 'SVD':
        top_n = get_top_n(predictions_SVD, n=10)
    elif best_algorithm == 'KNN':
        top_n = get_top_n(predictions_KNN, n=10)
    elif best_algorithm == 'ALS':
        top_n = get_top_n(predictions_ALS, n=10)

    i_cols = [
        'movie_id', 'movie_title', 'release_date', 'video_release_date',
        'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
        'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
        'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
        'Thriller', 'War', 'Western'
    ]
    items = pd.read_csv('../../ml-100k/u.item', sep='|', names=i_cols,
                        encoding='latin-1')

    predictions = []
    # Print the recommended items for the user.
    for uid, user_ratings in top_n.items():
        # (original compared int(uid)+1 == int(user_id)+1, which is the same)
        if int(uid) == int(user_id):
            for (iid, _) in user_ratings:
                # u.item movie ids are 1-based while the raw iids here are
                # 0-based, hence the +1 shift.
                title = items[items['movie_id'] == int(iid) + 1]['movie_title']
                title_t = str(title)
                title_split = title_t.split()
                print(title_split)
                predictions.append(title_t)
    return predictions
# Record metrics and the top-10 lists for the NormalPredictor run above.
avg_precision = sum(precisions.values()) / len(precisions)
avg_recall = sum(recalls.values()) / len(recalls)
metrics = {'rmse': rmse,
           'avg_precision': avg_precision,
           'avg_recall': avg_recall}
results['NormalPredictor'] = metrics
top_n['NormalPredictor'] = get_top_n(norm_pred, n=10)

# Grid-search BaselineOnly over its two estimation methods (ALS vs SGD),
# then refit the best configuration on the trainset and evaluate it.
param_grid = {'bsl_options': {'method': ['als', 'sgd']}}
gs = GridSearchCV(BaselineOnly, param_grid, measures=['rmse'], cv=5)
gs.fit(data)
params = gs.best_params['rmse']

algo = BaselineOnly(bsl_options=params['bsl_options'])
algo.fit(trainset)
base_pred = algo.test(testset)

rmse = accuracy.rmse(base_pred)
precisions, recalls = precision_recall_at_k(base_pred, k=10, threshold=4)
avg_precision = sum(precisions.values()) / len(precisions)
avg_recall = sum(recalls.values()) / len(recalls)
metrics = {'rmse': rmse,
           'avg_precision': avg_precision,
           'avg_recall': avg_recall,
           'best_parameters': params}
results['BaselineOnly'] = metrics
top_n['BaselineOnly'] = get_top_n(base_pred, n=10)
# Linear-regression baseline: build one [user, item, predicted_rating] row
# per test example.
clf = LinearRegression().fit(new_train_only_data, new_train_label)
y_pre = clf.predict(new_test_only_data)
linear_prediction = []
for i in range(len(y_pre)):
    # assumes test_data_get rows are (user_id, item_id, rating) -- TODO confirm
    all_info = [test_data_get[i][0]] + [test_data_get[i][1]] + [y_pre[i]]
    linear_prediction.append(all_info)

####################################surprise######################################
surprise_reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
surprise_train = Dataset.load_from_file(input_file, reader=surprise_reader)
surprise_train = surprise_train.build_full_trainset()
# Build the raw testset as (user, item, float_rating) tuples via Spark.
surprise_test_data = sc.parallelize(test_data_get).map(
    lambda s: (s[0], s[1], float(s[2]))).collect()

# ALS baseline model.
params = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
surprise_formula = BaselineOnly(bsl_options=params)
surprise_formula.fit(surprise_train)
surprise_predict = surprise_formula.test(surprise_test_data)
surprise_prediction = []
for i in range(len(surprise_predict)):
    # Prediction tuple fields: [0]=uid, [1]=iid, [3]=est (estimated rating)
    surprise_prediction.append([
        surprise_predict[i][0], surprise_predict[i][1], surprise_predict[i][3]
    ])

################################SVD########################################
from surprise import SVD

svd_surprise = SVD(n_epochs=30, lr_all=0.008, reg_all=0.2)
svd_surprise.fit(surprise_train)
surprise_svd_prediction = svd_surprise.test(surprise_test_data)
svd_prediction = []
for i in range(len(surprise_svd_prediction)):
    svd_prediction.append([
        surprise_svd_prediction[i][0],
        surprise_svd_prediction[i][1],
        surprise_svd_prediction[i][3]
        # NOTE(review): snippet is truncated here in this view; the closing
        # "])" of this append continues beyond the visible source.
def test_trainset_testset_ui_features():
    """Test the construct_trainset and construct_testset methods with user
    and item features."""
    # NOTE(review): load_features_df / u_features are not part of mainline
    # surprise -- this targets a features-enabled fork; confirm the dependency.
    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    # Attach boolean user features keyed by raw user id.
    u_features_df = pd.DataFrame(
        {
            'urid': ['user0', 'user2', 'user3', 'user1'],
            'isMale': [False, True, False, True]
        },
        columns=['urid', 'isMale'])
    data = data.load_features_df(u_features_df, user_features=True)

    # Attach mixed-type item features keyed by raw item id.
    i_features_df = pd.DataFrame(
        {
            'irid': ['item0', 'item1'],
            'isNew': [False, True],
            'webRating': [4, 3],
            'isComedy': [True, False]
        },
        columns=['irid', 'isNew', 'webRating', 'isComedy'])
    data = data.load_features_df(i_features_df, user_features=False)

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur: inner-uid -> list of (inner_iid, rating, weight) tuples
    ur = trainset.ur
    assert ur[0] == [(0, 4, None)]
    assert ur[1] == [(0, 4, None), (1, 2, None)]
    assert ur[40] == []  # not in the trainset

    # test ir: inner-iid -> list of (inner_uid, rating, weight) tuples
    ir = trainset.ir
    assert ir[0] == [(0, 4, None), (1, 4, None), (2, 1, None)]
    assert ir[1] == [(1, 2, None), (2, 1, None), (3, 5, None)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test user features
    u_features = trainset.u_features
    assert u_features[0] == [False]
    assert u_features[40] == []  # not in trainset and u_features_df
    assert trainset.user_features_labels == ['isMale']
    assert trainset.n_user_features == 1

    # test item features
    i_features = trainset.i_features
    assert i_features[0] == [False, 4, True]
    assert i_features[20000] == []  # not in trainset and i_features_df
    assert trainset.item_features_labels == ['isNew', 'webRating', 'isComedy']
    assert trainset.n_item_features == 3

    # test raw2inner: unknown raw ids must raise
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw: the reverse maps are built lazily on first use
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None

    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)

    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method: entries carry the feature vectors
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [False], [False, 4, True], 4) in testset
    assert ('user2', 'item1', [True], [True, 3, False], 1) in testset
    assert ('user3', 'item1', [False], [True, 3, False], 5) in testset
    assert ('user3', 'item1', [False], [True, 3, False], 0) not in testset

    # Test the build_anti_testset() method: unseen pairs get the global mean
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert (('user0', 'item0', [False], [False, 4, True],
             trainset.global_mean) not in testset)
    assert (('user3', 'item1', [False], [True, 3, False],
             trainset.global_mean) not in testset)
    assert (('user0', 'item1', [False], [True, 3, False],
             trainset.global_mean) in testset)
    assert (('user3', 'item0', [False], [False, 4, True],
             trainset.global_mean) in testset)
def collaborative_filtering(raw_uid):
    """Export user ratings from MySQL to a tab-separated file, train three
    surprise models on them, and print the top-5 recommendations for
    ``raw_uid`` from each model.

    Args:
        raw_uid: raw user id, as stored in the user_rating table.

    Raises:
        IOError: if the dataset file is missing after the export step.
    """
    # ---- Data preparation: dump all DB ratings to a text file ----
    # dir_data = '/www/wwwroot/music_recommender/page/cf_recommendation/cf_data'
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    if not os.path.exists(dir_data):
        os.makedirs(dir_data)

    # NOTE(review): hard-coded DB credentials; consider moving to config/env.
    db = pymysql.connect("localhost", "root", "password",
                         "music_recommender", charset='utf8')
    cursor = db.cursor()
    sql = """SELECT uid, song_id, rating FROM user_rating WHERE 1"""
    cursor.execute(sql)
    results = cursor.fetchall()
    with open(file_path, "w+") as data_f:
        for result in results:
            uid, song_id, rating = result
            data_f.writelines("{}\t{}\t{}\n".format(uid, song_id, rating))

    if not os.path.exists(file_path):
        raise IOError("Dataset file is not exists!")

    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)
    # Build the training set from every available rating.
    trainset = data.build_full_trainset()

    # ---- BaselineOnly ----
    bsl_options = {
        'method': 'sgd',
        'learning_rate': 0.0005,
    }
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)
    rset = user_build_anti_testset(trainset, raw_uid)
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=5)

    # ---- KNNBasic ----
    sim_options = {'name': 'pearson', 'user_based': True}
    algo_KNNBasic = KNNBasic(sim_options=sim_options)
    algo_KNNBasic.fit(trainset)
    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBasic.test(knn_anti_set)
    top_n_knnbasic = get_top_n(predictions, n=5)

    # ---- KNNBaseline ----
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo_KNNBaseline = KNNBaseline(sim_options=sim_options)
    algo_KNNBaseline.fit(trainset)
    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBaseline.test(knn_anti_set)
    top_n_knnbaseline = get_top_n(predictions, n=5)

    print({raw_uid: top_n_baselineonly[raw_uid]})
    print({raw_uid: top_n_knnbasic[raw_uid]})
    # BUG FIX: the original printed the KNNBasic results twice and never
    # showed the KNNBaseline recommendations.
    print({raw_uid: top_n_knnbaseline[raw_uid]})