class ALSModelSurprise(ALSModel):
    '''
    ALS baseline model backed by surprise's ``BaselineOnly`` algorithm.

    ``self.params`` (managed by the parent class) is handed to surprise as
    the ``bsl_options`` mapping.
    '''

    def __init__(self, params):
        super().__init__(params)
        self.algo = BaselineOnly(bsl_options=self.params)

    def parse_data(self, ratings):
        '''
        Wrap a ratings DataFrame into a surprise Dataset (rating scale 1-5)
        and keep it in ``self.data``.
        '''
        reader = Reader(rating_scale=(1, 5))
        self.data = Dataset.load_from_df(ratings, reader)

    def update_parameters(self):
        '''
        Push the current ``self.params`` into the underlying algorithm.
        '''
        self.algo.bsl_options = self.params

    def fit(self):
        '''
        Train on every rating loaded by ``parse_data``.
        The trainset is kept in ``self.train`` for ``top_n_recommendations``.
        '''
        self.train = self.data.build_full_trainset()
        self.algo.fit(self.train)

    def predict(self, uid, iid):
        '''
        Return the estimated rating of user ``uid`` for item ``iid``.
        uid, iid should be consistent with ratings['UID','IID']
        '''
        return self.algo.predict(uid, iid).est

    def top_n_recommendations(self, uid, n=5):
        '''
        Obtain the top n recommendation for any user.
        Method for the surprise library

        Every item of the trainset is scored (items the user already rated
        are not filtered out). Returns ``(item_ids, estimates)``, both
        ordered by decreasing estimate.
        '''
        trainset = self.train
        scores = [
            (iid, self.predict(uid, iid))
            for iid in (trainset.to_raw_iid(i) for i in range(trainset.n_items))
        ]
        scores.sort(key=lambda pair: pair[1], reverse=True)
        best = scores[:n]
        top_n_iid = [iid for iid, _ in best]
        pred = [est for _, est in best]
        return top_n_iid, pred

    def cross_validate(self, cv=5, verbose=False):
        '''
        Run ``cv``-fold cross validation and return the mean test RMSE.
        '''
        # Inside a method body the bare name resolves to the module-level
        # ``cross_validate`` function, not to this method.
        cv_result = cross_validate(self.algo, self.data,
                                   cv=cv, verbose=verbose)
        return cv_result['test_rmse'].mean()

    def grid_search(self):
        '''
        Exhaustively try a fixed grid of (n_epochs, reg_u, reg_i) values.

        Each combination is scored with 5-fold cross validation and printed.
        The lowest RMSE and its parameters end up in ``self._best_rmse`` and
        ``self._best_params``. The model itself is left configured by the
        last ``set_params`` call, i.e. the last combination of the grid.
        '''
        # Store snapshots rather than references: if ``set_params`` (defined
        # in the parent class) mutates ``self.params`` in place, a plain
        # reference would silently follow every later combination.
        self._best_params = dict(self.params)
        self._best_rmse = self.cross_validate(cv=5)
        for n_epochs in [5, 10, 15, 20, 25]:
            for reg_u in [5, 10, 15, 20]:
                for reg_i in [5, 10, 15]:
                    self.set_params(n_epochs=n_epochs, reg_u=reg_u,
                                    reg_i=reg_i)
                    rmse = self.cross_validate(cv=5)
                    print(n_epochs, reg_u, reg_i, rmse)
                    if rmse < self._best_rmse:
                        self._best_rmse = rmse
                        self._best_params = dict(self.params)
def baseline_only(train, test, ids, Xtest, Xids):
    """
    Fit an ALS BaselineOnly model (user/item means plus user/item biases)
    and collect its predictions for the final blending.

    Argument :
        train, the trainset
        test, the testset
        ids, unknown ratings: ids[0] holds the users, ids[1] the items
        Xtest, list of test-set predictions per model; this model's
            predictions are appended in place
        Xids, list of unknown-rating predictions per model; this model's
            predictions are appended in place

    Returns :
        rmse on the testset, Xtest, Xids, and this model's predictions for
        the testset (numpy array) and for the unknown ratings (list)
    """
    print('Baseline Only')

    algo = BaselineOnly(
        bsl_options={
            'method': 'als',
            'n_epochs': 100,
            'reg_u': 15,
            'reg_i': 0.01
        },
        verbose=False)

    # Train algorithm on training set
    algo.fit(train)

    # Predict on train and compute RMSE
    train_predictions = algo.test(train.build_testset())
    print(' Training RMSE: ', accuracy.rmse(train_predictions, verbose=False))

    # Predict on test and compute RMSE
    test_predictions = algo.test(test)
    rmse = accuracy.rmse(test_predictions, verbose=False)
    print(' Test RMSE: ', rmse)

    preds_test = np.array([p.est for p in test_predictions], dtype=float)

    # Predict unknown ratings (raw ids are passed to surprise as strings)
    preds_ids = [
        algo.predict(str(ids[0][k]), str(ids[1][k])).est
        for k in range(len(ids[0]))
    ]

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
class BaseLineRecommender(object):
    """
    Baseline predictor built on surprise's BaselineOnly algorithm
    """

    def __init__(self):
        # Stays None until fit() has been called
        self.model = None

    def fit(self, train):
        """
        Create an SGD-optimised BaselineOnly model and train it on `train`
        """
        sgd_options = {
            'method': 'sgd',
            'n_epochs': 30,
            'reg': 0.01,
            'learning_rate': 0.01
        }
        self.model = BaselineOnly(bsl_options=sgd_options)
        self.model.fit(train)

    def predict(self, user_id, item_id):
        """
        Return the model's prediction for the given user and item
        """
        prediction = self.model.predict(user_id, item_id)
        return prediction

    def rmse(self, test):
        """
        Predict the ratings of `test` and return their RMSE
        """
        return accuracy.rmse(self.model.test(test))

    def mae(self, test):
        """
        Predict the ratings of `test` and return their MAE
        """
        return accuracy.mae(self.model.test(test))
def _wide_top_row(top_rows, column, prefix, user_id):
    """Pivot `column` of the ten top rows into one `id, <prefix>_1..10` row."""
    names = ['{}_{}'.format(prefix, rank) for rank in range(1, 11)]
    row = top_rows[[column]].T
    # pandas raises ValueError (length mismatch) here when fewer than ten
    # candidate items are available for the user.
    row.columns = names
    row.insert(0, 'id', int(user_id))
    return row


def compute_recommendations(user_id, prediction_table, numeric_prediction_table):
    """Train a BaselineOnly model on all ratings and store a user's top items.

    Ratings are read from the `ratings` database table and from
    `data/ratings.csv`, cleaned separately (NaN rows and duplicates dropped)
    and concatenated. Every item of the `items` table that the user has not
    rated is scored, then three appends are made to the database:

    * `prediction_table`: one row `id, pred_1..pred_10` with the ten best
      item ids;
    * `numeric_predictions` (fixed name): the twenty best rows in long form
      (`user_id, item_id, predicted_rating, algorithm`);
    * `numeric_prediction_table`: one row `id, num_1..num_10` with the
      estimates of the ten best items.

    Args:
        user_id: raw id of the user; must be convertible with `int()`.
        prediction_table: name of the table receiving the item-id row.
        numeric_prediction_table: name of the table receiving the
            estimate row.

    Raises:
        ValueError: if fewer than ten unrated items exist for the user
            (raised by pandas before anything is written).
    """
    algo = 'Baseline'
    algorithm = BaselineOnly()
    rating_cols = ['user_id', 'item_id', 'rating']

    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(
        sessionmaker(bind=engine, autocommit=False, autoflush=False))
    try:
        # Ratings stored in the database
        db_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
        db_ratings = db_ratings[rating_cols].dropna().drop_duplicates()

        # Ratings shipped as a CSV file (its item column is named movie_id)
        csv_ratings = pd.read_csv('data/ratings.csv', low_memory=False)
        csv_ratings = csv_ratings.rename(columns={'movie_id': 'item_id'})
        csv_ratings = csv_ratings[rating_cols].dropna().drop_duplicates()

        df_ratings = pd.concat([db_ratings, csv_ratings], axis=0)

        reader = Reader(line_format='user item rating', sep=',',
                        rating_scale=(1, 10))
        data = Dataset.load_from_df(df_ratings, reader=reader)
        trainset = data.build_full_trainset()

        # `train` is the legacy name of `fit` and no longer exists in recent
        # surprise releases; prefer `fit`, fall back for old installations.
        train_model = getattr(algorithm, 'fit', None) or algorithm.train
        train_model(trainset)

        items = pd.read_sql('SELECT distinct id FROM items;', con=engine)
        # A set gives O(1) membership tests instead of scanning an array
        # once per candidate item.
        rated_items = set(
            df_ratings.loc[df_ratings['user_id'] == user_id, 'item_id'])
        prediction_items = [
            item for item in items.id.unique() if item not in rated_items
        ]
        predicted_ratings = [
            algorithm.predict(user_id, item).est for item in prediction_items
        ]

        predictions = pd.DataFrame({
            'item_id': prediction_items,
            'prediction': predicted_ratings,
        })
        predictions.insert(0, 'user_id', user_id)
        ranked = predictions.sort_values('prediction', ascending=False)
        top_ten = ranked.head(n=10)

        # Ten best item ids, one wide row
        _wide_top_row(top_ten, 'item_id', 'pred', user_id).to_sql(
            prediction_table, engine, if_exists='append', index=False)
        session.commit()

        # Twenty best predictions in long form. `rename` returns a new
        # frame, so adding a column does not write into a slice of `ranked`.
        df_num_ratings = ranked.head(n=20).rename(
            columns={'prediction': 'predicted_rating'})
        df_num_ratings['algorithm'] = algo
        df_num_ratings.to_sql('numeric_predictions', engine,
                              if_exists='append', index=False)
        session.commit()

        # Estimates of the ten best items, one wide row
        _wide_top_row(top_ten, 'prediction', 'num', user_id).to_sql(
            numeric_prediction_table, engine, if_exists='append', index=False)
        session.commit()
    finally:
        # The engine and session are created per call; release them so
        # repeated calls do not accumulate pooled connections.
        session.remove()
        engine.dispose()
# Train the selected algorithm on the whole training data and report the
# time elapsed since `start`.
alg.fit(data_train.build_full_trainset())
end = time.time()
print("***********************************************")
print("Exe time:")
print(end - start)

# %% Loading Test Data
file_path = "Data/sample_submission.csv"
data_test = utils.load_data_desired(file_path)

# %% Prediction
# For each row, element 1 is passed as the user id and element 0 as the
# item id; both are converted to strings before being given to surprise.
Predict_Test = [
    alg.predict(str(line[1]), str(line[0])).est for line in data_test
]

# %% Save Prediction
# Summary of the search held in `Train_CV`. The `with` block guarantees the
# file is flushed and closed even if one of the writes fails.
with open("Details.txt", "w") as file:
    file.write("+ Best Score: \n \n")
    file.write(str(Train_CV.best_score) + "\n \n")
    file.write("************************************************************ \n")
    file.write("+ Best Param: \n \n")
    file.write(str(Train_CV.best_params) + "\n \n")
    file.write("************************************************************ \n")
    file.write("+ CV Summary: \n \n")
    file.write(str(Train_CV.cv_results) + "\n \n")
    file.write("************************************************************ \n")
#import pandas as pd

# Load the data (comma-separated, header line skipped)
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)
train_set = data.build_full_trainset()

# ALS optimisation
#bsl_options = {'method': 'als','n_epochs': 5,'reg_u': 12,'reg_i': 5}
# SGD optimisation
bsl_options = dict(method='sgd', n_epochs=5)
algo = BaselineOnly(bsl_options=bsl_options)
#algo = BaselineOnly()
#algo = NormalPredictor()

# Define the K-fold cross-validation iterator, K=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # Train on the fold, then predict its held-out part
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute (and print) the RMSE of the fold
    accuracy.rmse(predictions, verbose=True)

uid = '196'
iid = '302'
# Print the prediction of uid for iid, made by the model of the last fold
pred = algo.predict(uid, iid, r_ui=4, verbose=True)
from surprise import Reader
from surprise import BaselineOnly
from surprise import accuracy
from surprise.model_selection import KFold

# Load the data (comma-separated, header line skipped)
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)
train_set = data.build_full_trainset()

# ALS optimisation; another optimisation method ('SGD') can be chosen instead
# Set the regularisation terms for users and items
bsl_options = dict(method='als', n_epochs=5, reg_u=12, reg_i=5)
model = BaselineOnly(bsl_options=bsl_options)

# K-fold cross validation
kf = KFold(n_splits=5)
for trainset, testset in kf.split(data):
    model.fit(trainset)
    pred = model.test(testset)
    # Compute (and print) the RMSE of the fold
    accuracy.rmse(pred)

uid = '300'
iid = '180'
# Print the prediction of uid for iid, made by the model of the last fold
pred = model.predict(uid, iid, r_ui=4, verbose=True)
from surprise.model_selection import train_test_split

# Importing built in MovieLens 100K dataset
data = Dataset.load_builtin('ml-100k')

# Select the Baseline algorithm
algo = BaselineOnly()

# cv=4 splits the data into 4 folds; each fold is used once as the test set
# while the other three are used for training
result = cross_validate(algo,
                        data,
                        measures=['RMSE', 'MAE'],
                        cv=4,
                        verbose=True)

# Build the full training set, train on it, then predict.
# Without the explicit fit the prediction would come from whatever state
# cross validation left behind instead of a model trained on all the data.
trainset = data.build_full_trainset()
algo.fit(trainset)
# Arguments: user_id, item_id; r_ui is the known true rating of the pair
pred = algo.predict('1', '2', r_ui=3, verbose=True)

# Load the same kind of data from a csv file
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('C:/RecoSys/Data/u.data',
                      names=r_cols,
                      sep='\t',
                      encoding='latin-1')
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']],
                            reader)
result = cross_validate(algo,
                        data,
                        measures=['RMSE', 'MAE'],
                        cv=4,
                        verbose=True)
# Collect the unclipped estimate of every model for every test row.
start = time.time()
for line in data_test:
    # Element 1 of a row is passed as the user id, element 0 as the item id
    user_key, item_key = str(line[1]), str(line[0])
    Pred_Test_KNN.append(alg_KNN.predict(user_key, item_key, clip=False).est)
    Pred_Test_SVD.append(alg_SVD.predict(user_key, item_key, clip=False).est)
    Pred_Test_NMF.append(alg_NMF.predict(user_key, item_key, clip=False).est)
    Pred_Test_SL1.append(alg_SL1.predict(user_key, item_key, clip=False).est)
    Pred_Test_BSL.append(alg_BSL.predict(user_key, item_key, clip=False).est)
end = time.time()

print("***********************************************")
print("Exe time:")
print(end - start)

# One row per test sample, one column per model (BSL included)
X_Test = np.matrix([
    Pred_Test_SVD,
    Pred_Test_NMF,
    Pred_Test_SL1,
    Pred_Test_KNN,
    Pred_Test_BSL,
]).T

# %% Prior Based
# Same layout without the BSL column; when both cells run, this assignment
# replaces the matrix built above.
X_Test = np.matrix([
    Pred_Test_SVD,
    Pred_Test_NMF,
    Pred_Test_SL1,
    Pred_Test_KNN,
]).T
# Define the K-fold cross-validation iterator, K=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(suprise_data):
    # Train and predict
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute RMSE
    accuracy.rmse(predictions, verbose=True)

# Read the data to predict and preprocess it
# NOTE(review): sep='/t' looks like a typo for '\t'; it is left unchanged
# because process_probe() may depend on how the file is currently parsed.
probe = pd.read_table('probe.txt', sep='/t', header=None)
processed_probe = process_probe(probe)
# Only part of the training data was loaded, so keep only the
# (user_id, movie_id) pairs that also appear in the training data
pre = pd.merge(data, processed_probe, how='inner', on=['user_id', 'movie_id'])
print('start predict')
############ final result: 0.989714596450271 ################
count = 0
error = 0
# Walk the three columns in lockstep: every row already carries its own
# rating, so there is no need to re-filter the whole frame per row (which
# made the loop quadratic).
# NOTE(review): assumes each (user_id, movie_id) pair occurs once in `pre`;
# with duplicates the old lookup reused the first matching rating.
for user, movie, rating in zip(pre['user_id'], pre['movie_id'], pre['rating']):
    count += 1
    rui_value = int(rating)
    prediction = algo.predict(str(user),
                              str(movie),
                              r_ui=rui_value,
                              verbose=True)
    error += np.square(prediction.est - rui_value)

if count:
    print("RMSE:{}".format(np.sqrt(error / count)))
else:
    # An empty merge would otherwise end in a division by zero
    print("RMSE: no probe rows matched the training data")
print("Predicted Rating:")
# Index 3 of a surprise Prediction is the estimated rating. A bare
# expression prints nothing when run as a script, so print it explicitly.
print(pred[3])

# print('Using ALS')
bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
algo_2 = BaselineOnly(bsl_options=bsl_options)
trainset = data.build_full_trainset()
# `fit` replaces the older `train` method, which recent surprise releases
# no longer provide.
algo_2.fit(trainset)
pred = algo_2.predict('374', '500')
print("Prediction Object:")
print(pred)
print("Predicted Rating:")
print(pred[3])

# Predicting all missing entries
# First lets start by visualising our matrix of all observed entries.
# This matrix is quite sparse.
import numpy as np

n_users = trainset.n_users
n_items = trainset.n_items
# Build the trainset, then drop the reference to the raw dataset
trainset = data.build_full_trainset()
del data
print(time.asctime(), 'training set built, now training')

# algo = SlopeOne()

# # # MODEL DEFINITION
algo = BaselineOnly(verbose=True)
# # #

algo.fit(trainset)
print(time.asctime(), 'training complete, now loading prediction data')

# Space-delimited file without header; only the first two columns
# (user, item) are used
to_predict = pd.read_csv(file_path_test, delimiter=' ', header=None)
to_predict = to_predict.values[:, :2]
predicted = np.zeros(len(to_predict))
print(time.asctime(), 'prediction data loaded, now predicting')

for i, (user, item) in enumerate(to_predict):
    predicted[i] = algo.predict(uid=user, iid=item, verbose=0).est
    # Progress report every 500000 predictions
    if i % 500000 == 0:
        print(i, 'of', len(predicted), 'predicted')

print(time.asctime(), 'now saving predictions')
np.savetxt('../custom_data/' + title + '.dta', predicted, fmt='%.3f')
print(time.asctime(), 'done')
# Map each user id to the comma-separated ids of its related users.
# Expected line format: "<user>:<member>,<member>,..."
relation_dict = {}
with open("../relation.txt", "r") as relation_file:
    for line in relation_file:
        if not line.strip():
            continue  # tolerate blank lines
        temp0 = line.split(":")
        relation_dict[temp0[0].strip()] = temp0[1].strip()
#print(relation_dict)
#print(relation_dict[str(2159)])

# Context managers make sure all files are flushed and closed, even when a
# prediction fails half-way. full_result is still opened (and therefore
# created/truncated) although nothing is currently written to it.
with open("../test1.csv", "r") as testset, \
        open("./result/result_BaselineOnly.txt", "w") as result, \
        open("./full_result/result_BaselineOnly.txt", "w") as full_result:
    for line in testset:
        if not line.strip():
            continue  # tolerate blank lines
        # Strip first so that a trailing newline never ends up inside the
        # item id when a row has only two columns.
        temp = line.strip().split(",")
        user, item = temp[0], temp[1]
        pred = algo.predict(user, item, verbose=False)
        #score = round(pred[3])
        score = pred[3]  # index 3 of a Prediction is the estimate

        members = relation_dict.get(user)
        if members is None:
            # NOTE(review): users without relations previously hit an
            # undefined or stale final_score; their own rounded prediction
            # is assumed to be the intended fallback. Please confirm.
            final_score = round(score)
        else:
            member_list = members.split(",")
            sum_score = 0
            for member in member_list:
                sum_score += algo.predict(member, item, verbose=False)[3]
            # Blend: 90% own prediction, 10% mean prediction of the
            # related users
            final_score = round(0.9 * score +
                                0.1 * (sum_score / len(member_list)))

        #result.write(str(score) + "\n")
        result.write(str(final_score) + "\n")
        #full_result.write(str(pred[3]) + "\n")