# All Surprise prediction algorithms benchmarked below, with a matching
# human-readable label list (labels[i] describes algorithms[i]).
from surprise import SVD, KNNBasic, KNNWithMeans, KNNBaseline, NMF, SlopeOne, CoClustering, BaselineOnly, NormalPredictor

'''
"SVD" -- https://en.wikipedia.org/wiki/Singular_value_decomposition
"KNN" -- https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
"Centered KNN" -- KNN with mean user ratings considered
"KNN with Baseline" -- KNN with baseline considered
"NMF" -- https://en.wikipedia.org/wiki/Non-negative_matrix_factorization
"SlopeOne" -- https://en.wikipedia.org/wiki/Slope_One
"CoClustering" -- https://en.wikipedia.org/wiki/Biclustering
"BaselineOnly" -- baseline predicted for specific user/item
"NormalPredictor" -- predict random rating from normal distribution
https://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly
'''

# Display names for each algorithm, index-aligned with `algorithms`.
labels = [
    "SVD", "KNN", "Centered KNN", "KNN with Baseline", "NMF",
    "SlopeOne", "CoClustering", "BaselineOnly", "NormalPredictor"
]
# Default-parameter instances of every algorithm to benchmark.
algorithms = [
    SVD(), KNNBasic(), KNNWithMeans(), KNNBaseline(), NMF(),
    SlopeOne(), CoClustering(), BaselineOnly(), NormalPredictor()
]
# Per-customer rating summary: number of ratings and mean rating per customer.
df_cust_summary = df.groupby('Cust_id')['Rating'].agg(["count","mean"])
df_cust_summary.index = df_cust_summary.index.map(int)
# Keep only customers in the top 30% by rating count (70th-percentile cutoff).
cust_benchmark = round(df_cust_summary['count'].quantile(0.7),0)
drop_cust_list = df_cust_summary[df_cust_summary['count'] < cust_benchmark].index
# NOTE(review): `drop_movie_list` is not defined in this chunk — presumably an
# analogous low-rating-count movie filter built earlier in the file; confirm.
df = df[~df['Movie_id'].isin(drop_movie_list)]
df = df[~df['Cust_id'].isin(drop_cust_list)]

#Pivot data
df_p = pd.pivot_table(df, index="Cust_id", columns="Movie_id", values="Rating")

#See which algorithm gives the lowest RMSE value
reader = Reader()
# Only the first 100k rows are benchmarked to keep runtime manageable.
data = Dataset.load_from_df(df[['Cust_id', 'Movie_id', 'Rating']][:100000], reader)
benchmark = []
for algo in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), BaselineOnly(), CoClustering()]:
    # NOTE(review): data.split() and evaluate() are the pre-1.0 Surprise API,
    # removed in Surprise >= 1.1 — migrate to model_selection.cross_validate.
    data.split(n_folds=3)
    results = evaluate(algo, data, measures = ["RMSE"])
    # Mean metric per fold, then tag the row with the algorithm's class name.
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    # NOTE(review): Series.append was removed in pandas 2.0; use pd.concat.
    tmp = tmp.append(pd.Series([str(algo).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
print(pd.DataFrame(benchmark).set_index('Algorithm').sort_values('rmse'))

##Train and Test split
#reader = Reader()
#data = Dataset.load_from_df(df[['Cust_id', 'Movie_id', 'Rating']], reader)
#trainset, testset = train_test_split(data, test_size = 0.25)
#blo = BaselineOnly()
#blo.fit(trainset)
def set_algo(self):
    """Configure the recommender to use CoClustering with 10 user clusters."""
    chosen = CoClustering(n_cltr_u=10)
    self.algo = chosen
def run(algorithm, min_nr_movies_train, min_nr_movies_test, netflix=True):
    """Train the named Surprise algorithm and report exact-rating accuracy.

    BUG FIX: in the original, `model` was only built in the `netflix=False`
    branch while `trainset`/`testset` were only built in the `netflix=True`
    branch, so every call raised NameError. Algorithm selection now always
    happens, and the missing non-Netflix loader is reported explicitly.

    Parameters:
        algorithm: one of 'SVD', 'KNNBasic', 'KNNWithZScore', 'NMF',
            'CoClustering', 'SlopeOne'.
        min_nr_movies_train / min_nr_movies_test: reported in the result
            string (the data loader presumably applies them; confirm).
        netflix: use the Netflix loader `getNetflix()` when True.

    Returns: a formatted summary string with the model name and accuracy.
    """
    # Dispatch table instead of the original elif chain.
    factories = {
        'SVD': SVD,
        'KNNBasic': KNNBasic,
        'KNNWithZScore': KNNWithZScore,
        'NMF': NMF,
        'CoClustering': CoClustering,
        'SlopeOne': SlopeOne,
    }
    try:
        model = factories[algorithm]()
    except KeyError:
        raise ValueError('Unknown algorithm: {}'.format(algorithm))

    if netflix:
        trainset, testset = getNetflix()
    else:
        # No alternative data loader is visible in this file; the original
        # silently fell through with trainset/testset undefined.
        raise ValueError('No non-Netflix data loader is available')

    model.fit(trainset)
    preds = model.test(testset)
    predsDF = pd.DataFrame(preds)
    predsDF['userID'] = predsDF['uid']
    predsDF['itemID'] = predsDF['iid']
    predsDF['pred'] = predsDF['est']
    # NOTE(review): `testData` is a module-level frame defined elsewhere.
    full = pd.merge(predsDF, testData, on=['userID', 'itemID'], how='inner')
    # Exact-match accuracy between rounded prediction and true rating.
    hits = [1 if pred == rating else 0
            for pred, rating in zip(round(full['pred']), full['rating'])]
    acc = sum(hits) / len(hits)
    return 'Model:{}, Accuracy: {}, Num Movies: {}/{}'.format(
        algorithm, round(acc, 3) * 100, min_nr_movies_train, min_nr_movies_test)


if __name__ == "__main__":
    # Same run matrix (and print order) as the original explicit call list.
    for name in ['CoClustering', 'KNNBasic', 'KNNWithZScore', 'NMF', 'SlopeOne', 'SVD']:
        for tr, te in [(10, 5), (20, 5), (20, 7), (20, 10)]:
            print(run(name, tr, te))
def best_pred():
    """Benchmark Surprise algorithms on place reviews, train NMF, and return
    the 100 best predictions renamed for insertion into the `rec` table.

    BUG FIXES vs. original:
    - `KNNWithZScore` was listed without parentheses (the class object, not an
      instance), which breaks `cross_validate`.
    - The benchmark loop cross-validated a fresh `NMF()` instead of the
      `algorithm` being iterated, so every benchmark row measured NMF.
    - `Series.append` (removed in pandas 2.0) replaced with `pd.concat`.
    """
    # Build a unique place key: "<place name>*<address>".
    review['새주소'] = review['장소'] + "*" + review['주소']
    review2 = review.drop([
        '장소', '주소', '위도', '경도', '분류', '대분류', '주소1', '주소2',
        '방문횟수', '년도', '월', '계절'
    ], axis=1)
    review2 = review2[['이름', '새주소', '별점']]

    # Reduce dataset dimensionality: drop places and users with few ratings.
    min_ratings = 50
    filter_review = review2['새주소'].value_counts() > min_ratings
    filter_review = filter_review[filter_review].index.tolist()
    min_user_ratings = 50
    filter_users = review2['이름'].value_counts() > min_user_ratings
    filter_users = filter_users[filter_users].index.tolist()
    review_new = review2[(review2['새주소'].isin(filter_review))
                         & (review2['이름'].isin(filter_users))]

    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(review_new[['이름', '새주소', '별점']], reader)

    benchmark = []
    # Iterate over all algorithms and cross-validate each one (RMSE, 3 folds).
    for algorithm in [
            SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(),
            KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(),
            BaselineOnly(), CoClustering()
    ]:
        results = cross_validate(algorithm, data, measures=['RMSE'], cv=3,
                                 verbose=False)
        # Average over folds, then tag the row with the algorithm class name.
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = pd.concat([
            tmp,
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm'])
        ])
        benchmark.append(tmp)
    surprise_results = pd.DataFrame(benchmark).set_index(
        'Algorithm').sort_values('test_rmse')

    # Train and predict.
    # NOTE(review): the original comment claimed CoClustering had the best
    # RMSE, yet the code trains NMF — behavior kept; confirm the intent.
    algo = NMF()
    cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)
    # Sample train/validation sets, fit on the trainset, predict the testset.
    trainset, testset = train_test_split(data, test_size=0.25)
    # algo = BaselineOnly(bsl_options=bsl_options)
    algo = NMF()
    predictions = algo.fit(trainset).test(testset)
    # dump.dump('./dump_file',predictions, algo)
    # predictions, algo = dump.load('./dump_file')
    trainset = algo.trainset

    # Build a DataFrame over all predictions to inspect them closely.
    def get_Iu(uid):
        """Number of items rated by the user (0 if unseen in trainset)."""
        try:
            return len(trainset.ur[trainset.to_inner_uid(uid)])
        except ValueError:  # user was not part of the trainset
            return 0

    def get_Ui(iid):
        """Number of users that rated the item (0 if unseen in trainset)."""
        try:
            return len(trainset.ir[trainset.to_inner_iid(iid)])
        except ValueError:
            return 0

    df = pd.DataFrame(predictions,
                      columns=['uid', 'iid', 'rui', 'est', 'details'])
    df['Iu'] = df.uid.apply(get_Iu)
    df['Ui'] = df.iid.apply(get_Ui)
    df['err'] = abs(df.est - df.rui)
    predictions = df.sort_values(by='err').drop_duplicates('iid')
    best_predictions = predictions[:100]
    worst_predictions = predictions[-10:]

    # Strip the "*<address>" suffix back off the place key.
    best_predictions['iid'] = best_predictions.iid.str.split('*').str[0]
    sql = "insert into rec(rec_uid, rec_iid, rec_rui, rec_est) values(:rec_uid, :rec_iid, :rec_rui, :rec_est)"
    data = best_predictions[['uid', 'iid', 'rui', 'est']]
    data.columns = ['rec_uid', 'rec_iid', 'rec_rui', 'rec_est']
    cursor.close()
    conn.close()
    return data
class Surprise():
    """Collaborative-filtering recommender wrapping the Surprise library.

    `train()` loads preprocessed recipe/review data, selects an algorithm by
    name, evaluates it, and fits it; `predict()` scores unseen recipes for a
    user.
    """

    def train(self, algo='SVD', like=True, test='cv', local=False):
        """Load data, select the algorithm named by `algo`, evaluate and fit.

        BUG FIXES vs. original:
        - The selection chain tested bare string literals
          (`if 'NormalPredictor':`), which are always truthy, so
          NormalPredictor was chosen regardless of `algo`. It now compares
          against the `algo` argument.
        - `SVD(params)` passed the params dict positionally as `n_factors`;
          it is now unpacked as keyword arguments.

        Parameters:
            algo: algorithm name ('SVD', 'KNNBasic', ..., 'CoClustering').
            like: model binary 'liked' (0-1) if True, else 'rating' (1-5).
            test: 'cv' for cross-validation, 'svd_grid' for grid search,
                anything else for a hold-out split.
            local: read CSVs from disk instead of the storage backend.

        Returns: (rmse, mae) rounded to 3 decimals.
        """
        if local:
            csv_path = os.path.join(os.path.dirname(__file__), "data/preprocessed")
            self.recipes = pd.read_csv(f"{csv_path}/recipe_pp.csv")
            self.reviews = pd.read_csv(f"{csv_path}/review_pp.csv")
        else:
            self.recipes = storage.import_file('data/preprocessed', 'recipe_pp.csv')
            self.reviews = storage.import_file('data/preprocessed', 'review_pp.csv')

        # Target column and its rating scale.
        if like:
            self.target = 'liked'
            self.s_min = 0
            self.s_max = 1
        else:
            self.target = 'rating'
            self.s_min = 1
            self.s_max = 5

        reader = Reader(rating_scale=(self.s_min, self.s_max))
        self.relevant_data = self.reviews[['user_id', 'recipe_id', self.target]]
        model_data = Dataset.load_from_df(self.relevant_data, reader)

        # Algorithm selection (compare against `algo`, see docstring).
        if algo == 'NormalPredictor':
            self.algorithm = NormalPredictor()
        elif algo == 'BaselineOnly':
            self.algorithm = BaselineOnly()
        elif algo == 'KNNBasic':
            self.algorithm = KNNBasic()
        elif algo == 'KNNWithMeans':
            self.algorithm = KNNWithMeans()
        elif algo == 'KNNWithZScore':
            self.algorithm = KNNWithZScore()
        elif algo == 'KNNBaseline':
            self.algorithm = KNNBaseline()
        elif algo == 'SVD':
            params = {
                'n_epochs': 20,
                'n_factors': 100,
                'lr_all': 0.002,
                'reg_all': 0.02
            }
            self.algorithm = SVD(**params)  # Tuned with svd_grid
        elif algo == 'SVDpp':
            self.algorithm = SVDpp()
        elif algo == 'NMF':
            self.algorithm = NMF()
        elif algo == 'SlopeOne':
            self.algorithm = SlopeOne()
        elif algo == 'CoClustering':
            self.algorithm = CoClustering()

        if test == 'cv':
            # 5-fold cross-validation, then refit on the full trainset.
            cv_results = cross_validate(self.algorithm, model_data,
                                        measures=['RMSE', 'MAE'], cv=5,
                                        verbose=True)
            rmse = np.round(cv_results['test_rmse'].mean(), 3)
            mae = np.round(cv_results['test_mae'].mean(), 3)
            train_data = model_data.build_full_trainset()
            self.algorithm.fit(train_data)
        elif test == 'svd_grid':
            # Grid-search SVD hyperparameters; keep the best RMSE estimator.
            param_grid = {
                'n_epochs': [10, 20],
                'n_factors': [100, 200],
                'lr_all': [0.001, 0.002],
                'reg_all': [0.01, 0.02]
            }
            train_data = model_data.build_full_trainset()
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
            gs.fit(model_data)
            rmse = gs.best_score['rmse']
            mae = gs.best_score['mae']
            print(gs.best_params['rmse'], gs.best_params['mae'])
            self.algorithm = gs.best_estimator['rmse']
            train_data = model_data.build_full_trainset()
            self.algorithm.fit(train_data)
        else:
            # Simple hold-out evaluation.
            train, test = train_test_split(model_data, test_size=0.3,
                                           random_state=42)
            self.algorithm.fit(train)
            predictions = self.algorithm.test(test)
            rmse = np.round(accuracy.rmse(predictions), 3)
            mae = np.round(accuracy.mae(predictions), 3)
        return rmse, mae

    def predict(self, user_id):
        """Score every recipe the user has not rated and return a ranked frame.

        Returns a DataFrame with recipe_id, name, the raw predicted target and
        a 0-1 normalized `rec_score`; also displays the user's known ratings
        and the top-10 recommendations.
        """
        inputs = self.relevant_data[self.relevant_data['user_id'] == user_id] \
            .merge(self.recipes, on="recipe_id", how="left")[['recipe_id', 'name', self.target]]
        display(inputs)
        # Candidate recipes: rated by others but not by this user.
        user_recipes = self.relevant_data[
            self.relevant_data['user_id'] == user_id].recipe_id.unique()
        recipe_list = self.relevant_data[
            self.relevant_data['user_id'] != user_id].recipe_id.unique()
        predictions = [
            self.algorithm.predict(user_id, rec) for rec in recipe_list
            if rec not in list(user_recipes)
        ]
        pdf = pd.DataFrame(predictions, columns=[
            'user_id', 'recipe_id', self.target, f'rec_{self.target}', 'details'
        ])
        pdf = pdf.drop(columns=[self.target, 'details'])
        pdf = pdf.sort_values(f'rec_{self.target}', ascending=False)
        # Normalize the predicted value to a 0-1 score over the rating scale.
        rec_target = pdf[f'rec_{self.target}']
        pdf['rec_score'] = (rec_target - self.s_min) / (self.s_max - self.s_min)
        outputs = pdf.merge(self.recipes, on="recipe_id", how="left")[[
            'recipe_id', 'name', f'rec_{self.target}', 'rec_score'
        ]]
        display(outputs.head(10))
        return outputs
# Print the five highest-ranked Slope-One recommendations built earlier.
print('Here are the top 5 recommendations based on Slope-One! ')
for i in range(5):
    movieRecc2 = topMovies2[i]
    movieRawID2 = movieRecc2[0]  # (raw movie id, estimated rating) pair
    movieName2 = movie[movieRawID2]
    print(str(i+1) + '. ' + movieName2 )

#############predictions using Co-Clustering
print('')
print('Making more recommendations...')
algo3 = CoClustering()
algo3.fit(trainset)
predictions3 = algo3.test(testset)
dictMovies3 = get_top_n(predictions3)
# NOTE(review): 672 is a hard-coded raw user id — presumably the demo user.
topMovies3 = dictMovies3.get(672)
print('')
print('Here are the top 5 recommendations based on Co-Clustering! ')
for i in range(5):
    movieRecc3 = topMovies3[i]
    movieRawID3 = movieRecc3[0]
    movieName3 = movie[movieRawID3]
    print(str(i+1) + '. ' + movieName3 )
# Recommender models plus numeric/plotting utilities used by this script.
from surprise import SVD, CoClustering, NMF
from surprise import KNNBasic, KNNWithMeans
from numpy import tensordot
from numpy.linalg import norm
from itertools import product
from PIL import Image

sns.set()
pd.set_option('display.expand_frame_repr', False)
# Label encoders for product and user ids.
labelencoder_PID = LabelEncoder()
labelencoder_UID = LabelEncoder()
# Model instances with hand-picked hyperparameters.
svdModel = SVD(n_factors = 20, n_epochs = 10, biased=True)
sim_options = {'name': 'msd', 'user_based': False}  # item-based MSD similarity
knnBasicModel = KNNBasic(k =10, sim_options=sim_options)
coCluster = CoClustering(n_cltr_u = 10, n_cltr_i = 10)
nmfModel = NMF(n_factors = 10, n_epochs = 40, biased=True)
# Accumulators filled later in the script.
predictionsEn = []
listOfBooksReadByTop10 = []
recBooksEn = []

def getDataFromFile():
    # Load ratings, keep ratings >= 3, and rank books by reader count then
    # average rating to select the 500 most popular ones.
    df = pd.read_csv("goodbooks-10k/ratings.csv")
    threeabvrating = df[df["rating"]>=3]
    books = threeabvrating.groupby("book_id").agg({"user_id":"count", "rating" : "mean"}).reset_index().rename(columns = {"user_id" : "count_users","rating": "avg_rating"})
    sorted_bks = books.sort_values(by=['count_users', 'avg_rating'], ascending=False).reset_index()
    top500famousBookIds = sorted_bks.book_id.unique()[:500]
    famousBooks = df[df['book_id'].isin(top500famousBookIds)]
    # Among those books, select the 500 most active users.
    countUsers = famousBooks.groupby("user_id").agg({"rating" : "count"}).reset_index().rename(columns = {"rating":"count"})
    top500users = countUsers.sort_values(by = "count", ascending=False).reset_index().user_id.unique()[:500]
    # NOTE(review): `df` is local and nothing is returned here — the function
    # presumably continues (or returns df) beyond this chunk; confirm.
    df = famousBooks[famousBooks['user_id'].isin(top500users)].reset_index()
models.append('KNN with cosine')

# ## CoClustering Implementation
# In[88]:

# CoClustering benchmark (the original comment wrongly said "SVD").
from surprise import CoClustering

df_CoClustering = df_final_user_repo_star_v3.copy(deep=True)
dataCoClustering = Dataset.load_from_df(df_CoClustering, reader)
coClustering = CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=20)

# Train on the full trainset and predict every unrated (user, item) pair.
trainsetcoClustering = dataCoClustering.build_full_trainset()
coClustering.fit(trainsetcoClustering)
testcoClustering = trainsetcoClustering.build_anti_testset()
predictionscoClustering = coClustering.test(testcoClustering)

# FIX: the original called accuracy.rmse() twice, computing (and printing)
# the same RMSE over the whole anti-testset two times; compute it once.
rmse_coclustering = accuracy.rmse(predictionscoClustering)
listOfRMSE.append(rmse_coclustering)
models.append('CoClustering')
#%% # Use the new parameters with the sampled training data algo_svdpp = SVDpp(lr_all = 0.01, reg_all = 0.15) fit_rmse(algo_svdpp, tr_dat) algo_svdpp.fit(tr_dat.build_full_trainset()) #%% algo_svdpp_new = SVDpp(lr_all = 0.01, reg_all = 0.1) fit_rmse(algo_svdpp_new, tr_dat) #%% output(algo_svdpp, "SVDpp_lr0.01_reg0.15.csv") output(algo_svdpp_new, "SVDpp_lr0.01_reg0.10.csv") #%% # CoClustering algo_cc = CoClustering() fit_rmse(algo_cc, tr_dat) output(algo_cc, "CoClustering.csv") #%% # KNNWithMeans algo_knnwm = KNNWithMeans(k = 40, sim_options = {'name': 'cosine', 'user_based': False}) fit_rmse(algo_knnwm, samp_dat) # Gridsearch param_grid_knnwm = {'k': [30, 40, 50], 'sim_options': {'name':['cosine', 'pearson'], 'user_based':[False]}} gs_knnwm = GridSearchCV(KNNWithMeans, param_grid_knnwm, measures = ['rmse','mae'], cv = 3) gs_knnwm.fit(samp_dat)
def getRecommendations(self, IDUser, method=9, similarityMeasure=1, isUserBased="Yes"):
    """Return up to 50 glass IDs whose predicted rating for `IDUser` exceeds 3.

    Parameters:
        IDUser: raw user id to recommend for.
        method: 1=SVD, 2=SlopeOne, 3=NMF, 4=NormalPredictor, 5=KNNBaseline,
            6=KNNBasic, 7=KNNWithMeans, 8=KNNWithZScore, 9=BaselineOnly,
            anything else=CoClustering.
        similarityMeasure: 1=cosine, 2=pearson, else pearson_baseline
            (used by the KNN variants).
        isUserBased: "Yes" for user-based similarity, otherwise item-based.

    FIXES vs. original:
    - The prediction loop grew a DataFrame via DataFrame.append (quadratic,
      removed in pandas 2.0); rows are now collected in a list and the frame
      is built once.
    - The bare `except:` is narrowed to KeyError (missing pivot label).
    - Dropped the redundant list comprehension around `.tolist()`.
    """
    conn = sqlite3.connect(DATABASE_NAME)
    df = pd.read_sql_query(
        "SELECT userID, glassID, relativeRating FROM ratings", conn)
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        df[['userID', 'glassID', 'relativeRating']], reader)
    trainset = data.build_full_trainset()

    isUserBased = True if (isUserBased == "Yes") else False
    if similarityMeasure == 1:
        similarityMeasure = "cosine"
    elif similarityMeasure == 2:
        similarityMeasure = "pearson"
    else:
        similarityMeasure = "pearson_baseline"
    sim_options = {'name': similarityMeasure, 'user_based': isUserBased}

    # Algorithm selection by numeric code.
    if method == 1:
        algo = SVD()
    elif method == 2:
        algo = SlopeOne()
    elif method == 3:
        algo = NMF()
    elif method == 4:
        algo = NormalPredictor()
    elif method == 5:
        algo = KNNBaseline(sim_options=sim_options)
    elif method == 6:
        algo = KNNBasic(sim_options=sim_options)
    elif method == 7:
        algo = KNNWithMeans(sim_options=sim_options)
    elif method == 8:
        algo = KNNWithZScore(sim_options=sim_options)
    elif method == 9:
        algo = BaselineOnly()
    else:
        algo = CoClustering()
    algo.fit(trainset)

    totalGlass = df['glassID'].max()
    glassPivot = df.pivot_table(index='glassID', columns='userID',
                                values='relativeRating')
    # Predict a rating for every glass the user has not rated yet.
    rows = []
    for iid in range(1, totalGlass + 1):
        try:
            unrated = pd.isna(glassPivot.loc[iid, IDUser])
        except KeyError:
            # Glass or user absent from the pivot — nothing to predict.
            continue
        if unrated:
            prediction = algo.predict(IDUser, iid, verbose=False)
            rows.append((iid, prediction[3]))  # prediction[3] == .est
    predictions = pd.DataFrame(rows, columns=['glassID', 'estimatedRating'])

    predictions = predictions.sort_values('estimatedRating', ascending=False)
    recommendationList = predictions[
        predictions['estimatedRating'] > 3]['glassID'].head(50).tolist()
    conn.close()
    return recommendationList
from surprise import KNNBasic

# Item-based KNN with ALS-estimated baselines feeding pearson_baseline similarity.
bsl_options = {
    'method': 'als',
    'n_epochs': 10,
}
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBasic(k=5, bsl_options=bsl_options, sim_options=sim_options)

# In[26]:
# Same model, but baselines estimated with SGD instead of ALS.
bsl_options = {'method': 'sgd', 'lr': 0.01}
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBasic(k=5, bsl_options=bsl_options, sim_options=sim_options)

# In[22]: SlopeOne Algo
from surprise import SVD, SlopeOne

algo = SlopeOne()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

# In[6]: CoClustering
from surprise import CoClustering
from surprise.model_selection import cross_validate

algo = CoClustering(n_cltr_u=3, n_cltr_i=3)
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)
def train_surprise(self, model_type, trainset, testset, k_recommend, sql_db, k_fold, knowledge, model_name, result_name, system_eval=False):
    """Build, train and evaluate a Surprise recommender named by `model_type`.

    Hyperparameters are read from `self.config`, then overridden by tuned
    values stored in per-algorithm SQL tables (e.g. `svd_params`) when
    available for the given `knowledge`. The trained model's predictions on
    `testset` are written to CSV for later hybridization, and (unless
    `system_eval` is True) the predictions are evaluated.

    Returns a {"status": bool, "result": ...} dict.
    """
    # Shared KNN similarity configuration (used by all KNN* variants below).
    knn_user_based = self.config['SURPRISE_KNN'].getboolean('knn_user_based')
    knn_similarity = self.config['SURPRISE_KNN']['knn_similarity']
    sim_options = {'name': knn_similarity, 'user_based': knn_user_based}
    verbose_switch = self.config['DEFAULT'].getboolean('verbose_switch')
    # Model selection
    if (model_type == "svd"):
        # Read configuration values
        svd_grid_search = self.config['SURPRISE_SVD'].getboolean('svd_grid_search')
        svd_grid_metric = self.config['SURPRISE_SVD']['svd_grid_metric']
        svd_n_factors = int(self.config['SURPRISE_SVD']['svd_n_factors'])
        svd_n_epochs = int(self.config['SURPRISE_SVD']['svd_n_epochs'])
        svd_biased = self.config['SURPRISE_SVD'].getboolean('svd_biased')
        svd_init_mean = float(self.config['SURPRISE_SVD']['svd_init_mean'])
        svd_init_std_dev = float(self.config['SURPRISE_SVD']['svd_init_std_dev'])
        svd_lr_all = float(self.config['SURPRISE_SVD']['svd_lr_all'])
        svd_reg_all = float(self.config['SURPRISE_SVD']['svd_reg_all'])
        # Override config defaults with tuned parameters stored in SQL, if any.
        if (self.common_functions.validate_available_sql_data('svd_params', sql_db) == True):
            results = pd.read_sql_query('select * from svd_params;', sql_db, index_col='index')
            real_results = results[(results["knowledge"] == knowledge) & (results["algorithm"] == "svd")]
            if (real_results.empty == False):
                svd_n_factors = int(real_results.iloc[0]['svd_n_factors'])
                svd_n_epochs = int(real_results.iloc[0]['svd_n_epochs'])
                svd_init_std_dev = float(real_results.iloc[0]['svd_init_std_dev'])
                svd_lr_all = float(real_results.iloc[0]['svd_lr_all'])
                svd_reg_all = float(real_results.iloc[0]['svd_reg_all'])
        algo = SVD(n_factors=svd_n_factors, n_epochs=svd_n_epochs, biased=svd_biased, init_mean=svd_init_mean, init_std_dev=svd_init_std_dev, lr_all=svd_lr_all, reg_all=svd_reg_all, verbose=verbose_switch)
    elif (model_type == "SVDpp"):
        # Read configuration values
        svdpp_grid_search = self.config['SURPRISE_SVDPP'].getboolean('svdpp_grid_search')
        svdpp_grid_metric = self.config['SURPRISE_SVDPP']['svdpp_grid_metric']
        svdpp_n_factors = int(self.config['SURPRISE_SVDPP']['svdpp_n_factors'])
        svdpp_n_epochs = int(self.config['SURPRISE_SVDPP']['svdpp_n_epochs'])
        svdpp_init_mean = float(self.config['SURPRISE_SVDPP']['svdpp_init_mean'])
        svdpp_init_std_dev = float(self.config['SURPRISE_SVDPP']['svdpp_init_std_dev'])
        svdpp_lr_all = float(self.config['SURPRISE_SVDPP']['svdpp_lr_all'])
        svdpp_reg_all = float(self.config['SURPRISE_SVDPP']['svdpp_reg_all'])
        # Override config defaults with tuned parameters stored in SQL, if any.
        if (self.common_functions.validate_available_sql_data('svdpp_params', sql_db) == True):
            results = pd.read_sql_query('select * from svdpp_params;', sql_db, index_col='index')
            real_results = results[(results["knowledge"] == knowledge) & (results["algorithm"] == "svdpp")]
            if (real_results.empty == False):
                svdpp_n_factors = int(real_results.iloc[0]['svdpp_n_factors'])
                svdpp_n_epochs = int(real_results.iloc[0]['svdpp_n_epochs'])
                svdpp_init_std_dev = float(real_results.iloc[0]['svdpp_init_std_dev'])
                svdpp_lr_all = float(real_results.iloc[0]['svdpp_lr_all'])
                svdpp_reg_all = float(real_results.iloc[0]['svdpp_reg_all'])
        algo = SVDpp(n_factors=svdpp_n_factors, n_epochs=svdpp_n_epochs, init_mean=svdpp_init_mean, init_std_dev=svdpp_init_std_dev, lr_all=svdpp_lr_all, reg_all=svdpp_reg_all, verbose=verbose_switch)
    elif (model_type == "NMF"):
        # Read configuration values
        nmf_grid_search = self.config['SURPRISE_NMF'].getboolean('nmf_grid_search')
        nmf_grid_metric = self.config['SURPRISE_NMF']['nmf_grid_metric']
        nmf_n_factors = int(self.config['SURPRISE_NMF']['nmf_n_factors'])
        nmf_n_epochs = int(self.config['SURPRISE_NMF']['nmf_n_epochs'])
        nmf_biased = self.config['SURPRISE_NMF'].getboolean('nmf_biased')
        nmf_reg_pu = float(self.config['SURPRISE_NMF']['nmf_reg_pu'])
        nmf_reg_qi = float(self.config['SURPRISE_NMF']['nmf_reg_qi'])
        nmf_reg_bu = float(self.config['SURPRISE_NMF']['nmf_reg_bu'])
        nmf_reg_bi = float(self.config['SURPRISE_NMF']['nmf_reg_bi'])
        nmf_lr_bu = float(self.config['SURPRISE_NMF']['nmf_lr_bu'])
        nmf_lr_bi = float(self.config['SURPRISE_NMF']['nmf_lr_bi'])
        nmf_init_low = float(self.config['SURPRISE_NMF']['nmf_init_low'])
        nmf_init_high = int(self.config['SURPRISE_NMF']['nmf_init_high'])
        # Override config defaults with tuned parameters stored in SQL, if any.
        if (self.common_functions.validate_available_sql_data('nmf_params', sql_db) == True):
            results = pd.read_sql_query('select * from nmf_params;', sql_db, index_col='index')
            real_results = results[(results["knowledge"] == knowledge) & (results["algorithm"] == "nmf")]
            if (real_results.empty == False):
                nmf_n_factors = int(real_results.iloc[0]['nmf_n_factors'])
                nmf_n_epochs = int(real_results.iloc[0]['nmf_n_epochs'])
                nmf_reg_pu = float(real_results.iloc[0]['nmf_reg_pu'])
                nmf_reg_qi = float(real_results.iloc[0]['nmf_reg_qi'])
                nmf_init_low = float(real_results.iloc[0]['nmf_init_low'])
        algo = NMF(n_factors=nmf_n_factors, n_epochs=nmf_n_epochs, biased=nmf_biased, reg_pu=nmf_reg_pu, reg_qi=nmf_reg_qi, reg_bu=nmf_reg_bu, reg_bi=nmf_reg_bi, lr_bu=nmf_lr_bu, lr_bi=nmf_lr_bi, init_low=nmf_init_low, init_high=nmf_init_high, verbose=verbose_switch)
    elif (model_type == "NormalPredictor"):
        algo = NormalPredictor()
    elif (model_type == "BaselineOnly"):
        algo = BaselineOnly(verbose=verbose_switch)
    elif (model_type == "KNNBasic"):
        # Read configuration values
        knn_k = int(self.config['SURPRISE_KNN']['knn_k'])
        knn_min_k = int(self.config['SURPRISE_KNN']['knn_min_k'])
        knn_grid_search = self.config['SURPRISE_KNN'].getboolean('knn_grid_search')
        knn_grid_metric = self.config['SURPRISE_KNN']['knn_grid_metric']
        # Override config defaults with tuned parameters stored in SQL, if any.
        if (self.common_functions.validate_available_sql_data('knnbasic_params', sql_db) == True):
            results = pd.read_sql_query('select * from knnbasic_params;', sql_db, index_col='index')
            real_results = results[(results["knowledge"] == knowledge) & (results["algorithm"] == "knnbasic")]
            if (real_results.empty == False):
                knn_k = int(real_results.iloc[0]['knn_k'])
                knn_min_k = int(real_results.iloc[0]['knn_min_k'])
        algo = KNNBasic(k=knn_k, min_k=knn_min_k, sim_options=sim_options, verbose=verbose_switch)
    elif (model_type == "KNNWithMeans"):
        # Read configuration values
        knn_k = int(self.config['SURPRISE_KNN']['knn_k'])
        knn_min_k = int(self.config['SURPRISE_KNN']['knn_min_k'])
        knn_grid_search = self.config['SURPRISE_KNN'].getboolean('knn_grid_search')
        knn_grid_metric = self.config['SURPRISE_KNN']['knn_grid_metric']
        # Override config defaults with tuned parameters stored in SQL, if any.
        if (self.common_functions.validate_available_sql_data('knnwithmeans_params', sql_db) == True):
            results = pd.read_sql_query('select * from knnwithmeans_params;', sql_db, index_col='index')
            real_results = results[(results["knowledge"] == knowledge) & (results["algorithm"] == "knnwithmeans")]
            if (real_results.empty == False):
                knn_k = int(real_results.iloc[0]['knn_k'])
                knn_min_k = int(real_results.iloc[0]['knn_min_k'])
        algo = KNNWithMeans(k=knn_k, min_k=knn_min_k, sim_options=sim_options, verbose=verbose_switch)
    elif (model_type == "KNNWithZScore"):
        # Read configuration values
        knn_k = int(self.config['SURPRISE_KNN']['knn_k'])
        knn_min_k = int(self.config['SURPRISE_KNN']['knn_min_k'])
        knn_grid_search = self.config['SURPRISE_KNN'].getboolean('knn_grid_search')
        knn_grid_metric = self.config['SURPRISE_KNN']['knn_grid_metric']
        # Override config defaults with tuned parameters stored in SQL, if any.
        if (self.common_functions.validate_available_sql_data('knnwithzscore_params', sql_db) == True):
            results = pd.read_sql_query('select * from knnwithzscore_params;', sql_db, index_col='index')
            real_results = results[(results["knowledge"] == knowledge) & (results["algorithm"] == "knnwithzscore")]
            if (real_results.empty == False):
                knn_k = int(real_results.iloc[0]['knn_k'])
                knn_min_k = int(real_results.iloc[0]['knn_min_k'])
        algo = KNNWithZScore(k=knn_k, min_k=knn_min_k, sim_options=sim_options, verbose=verbose_switch)
    elif (model_type == "KNNBaseline"):
        # Read configuration values
        knn_k = int(self.config['SURPRISE_KNN']['knn_k'])
        knn_min_k = int(self.config['SURPRISE_KNN']['knn_min_k'])
        knn_grid_search = self.config['SURPRISE_KNN'].getboolean('knn_grid_search')
        knn_grid_metric = self.config['SURPRISE_KNN']['knn_grid_metric']
        # Override config defaults with tuned parameters stored in SQL, if any.
        if (self.common_functions.validate_available_sql_data('knnbaseline_params', sql_db) == True):
            results = pd.read_sql_query('select * from knnbaseline_params;', sql_db, index_col='index')
            real_results = results[(results["knowledge"] == knowledge) & (results["algorithm"] == "knnbaseline")]
            if (real_results.empty == False):
                knn_k = int(real_results.iloc[0]['knn_k'])
                knn_min_k = int(real_results.iloc[0]['knn_min_k'])
        algo = KNNBaseline(k=knn_k, min_k=knn_min_k, sim_options=sim_options, verbose=verbose_switch)
    elif (model_type == "SlopeOne"):
        algo = SlopeOne()
    elif (model_type == "CoClustering"):
        # Read configuration values
        cc_grid_search = self.config['SURPRISE_COCLUSTERING'].getboolean('cc_grid_search')
        cc_grid_metric = self.config['SURPRISE_COCLUSTERING']['cc_grid_metric']
        cc_n_cltr_u = int(self.config['SURPRISE_COCLUSTERING']['cc_n_cltr_u'])
        cc_n_cltr_i = int(self.config['SURPRISE_COCLUSTERING']['cc_n_cltr_i'])
        cc_n_epochs = int(self.config['SURPRISE_COCLUSTERING']['cc_n_epochs'])
        # Override config defaults with tuned parameters stored in SQL, if any.
        if (self.common_functions.validate_available_sql_data('coclustering_params', sql_db) == True):
            results = pd.read_sql_query('select * from coclustering_params;', sql_db, index_col='index')
            real_results = results[(results["knowledge"] == knowledge) & (results["algorithm"] == "coclustering")]
            if (real_results.empty == False):
                cc_n_cltr_u = int(real_results.iloc[0]['cc_n_cltr_u'])
                cc_n_cltr_i = int(real_results.iloc[0]['cc_n_cltr_i'])
                cc_n_epochs = int(real_results.iloc[0]['cc_n_epochs'])
        algo = CoClustering(n_cltr_u=cc_n_cltr_u, n_cltr_i=cc_n_cltr_i, n_epochs=cc_n_epochs, verbose=verbose_switch)
    else:
        return {
            "status": False,
            "result": "Defined model_type does not exist"
        }
    # Train the selected model and record the wall-clock training time.
    st = default_timer()
    print("STARTING to train model: " + str(model_name))
    algo.fit(trainset)
    train_model_runtime = default_timer() - st
    # Store processing time in the database
    self.common_functions.save_process_time(
        st,
        event=str(model_name) + "_training",
        description="Time for model to be trained on dataset")
    # Save the model
    # Create the directory if it does not exist
    if (os.path.isdir(self.models_path + model_name) == False):
        try:
            os.makedirs(self.models_path + model_name)
        except OSError as e:
            if e.errno != errno.EEXIST:
                return {"status": False, "result": e}
    # Store the model on the file system
    #file_name = self.models_path+model_name+"/model"
    #dump.dump(file_name, algo=algo)
    st = default_timer()
    print("STARTING to generate predictions with the trained model: " + str(model_name))
    predictions = algo.test(testset)
    runtime = default_timer() - st
    print(
        "Tiempo de ejecucion total de la generacion de predicciones para Surprise Time:",
        round(runtime, 2))
    self.common_functions.save_process_time(
        st,
        event=str(model_name) + "_generate_recommendations",
        description="Time for predictions to be generated using the model")
    # Save predictions for hybridization
    # Create the per-fold directory if it does not exist
    if (os.path.isdir(self.models_path + model_name + "/predictions/" + str(k_fold)) == False):
        try:
            os.makedirs(self.models_path + model_name + "/predictions/" + str(k_fold))
        except OSError as e:
            if e.errno != errno.EEXIST:
                return {"status": False, "result": e}
    # Store predictions for hybridization
    eval_result = pd.DataFrame(
        columns=['user_id', 'item_id', 'r_ui', 'est'])
    for uid, iid, true_r, est, _ in predictions:
        # NOTE(review): DataFrame.append was removed in pandas 2.0 and is
        # quadratic here — collect rows in a list and build the frame once.
        eval_result = eval_result.append(
            {
                'user_id': uid,
                'item_id': iid,
                'r_ui': true_r,
                'est': est
            },
            ignore_index=True)
    eval_result.to_csv(path_or_buf=self.models_path + model_name +
                       "/predictions/" + str(k_fold) + "/predictions.csv",
                       encoding='latin1',
                       sep=str(u';').encode('utf-8'),
                       index=False)
    # ---------------------------
    if (system_eval == False):
        # Process and evaluate the recommendations for the model
        st = default_timer()
        print("STARTING to evaluate recommendations with model: " + str(model_name))
        process_evaluate_result = self.evaluation.surprise_process_evaluate(
            predictions, knowledge, model_name, result_name,
            train_model_runtime, k_recommend, sql_db, k_fold,
            is_surprise=True)
        # Store processing time in the database
        self.common_functions.save_process_time(
            st,
            event=str(model_name) + "_evaluate_model",
            description="Time for model to be evaluated in test dataset")
        if (process_evaluate_result["status"] == True):
            del (process_evaluate_result)
            return {"status": True, "result": ""}
        else:
            del (process_evaluate_result)
            return {
                "status": False,
                "result": "no se pudo ejecutar correctamente content_explicit"
            }
    else:
        print("decide what to do")
        #result_model.save(self.models_path+model)
        return {"status": True, "result": ""}
# Load the 6-month processed ratings file (user,item,rating CSV).
file_path6 = os.path.expanduser('~/Downloads/CS5344 Project/surprise/surprise/data/ratingsProcessed6m.csv')
reader = Reader(line_format='user item rating', sep=',')
data6 = Dataset.load_from_file(file_path6, reader=reader)

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset6, testset6 = train_test_split(data6, test_size=.25)

# Choose the algo to use to compute RMSE
# NOTE(review): each assignment below overwrites the previous one, so only
# the final algorithm (NormalPredictor) is actually trained and timed —
# comment out all but the one under test.
algo = SVD()
algo = BaselineOnly()
algo = KNNBasic()
algo = SlopeOne()
algo = CoClustering()
algo = SVDpp()
algo = NMF()
algo = NormalPredictor()

# Train the algorithm on the trainset, and predict ratings for the testset
start = time.time()
algo.fit(trainset6)
predictions = algo.test(testset6)
accuracy.rmse(predictions)
end = time.time()
elapsed = end - start  # wall-clock seconds for fit + test + RMSE
print(elapsed)
# Then compute RMSE
gs.fit(data) # * making the essential prints of what just happened print("Best Score\n", gs.best_score) print("Best Params\n", gs.best_params) print("Best Estimators\n", gs.best_estimator) print("Best Index\n", gs.best_index) print("Results Dicts: \n") results_df = pd.DataFrame.from_dict(gs.cv_results) print(results_df) # * define a cross-validation iterator kf = KFold(n_splits=5) # * Choosing Co-Clustering as algorithm algo = CoClustering() # * Train the algorithm on the trainset, and predict ratings for the testset for trainset, testset in kf.split(data): predictions = algo.fit(trainset).test(testset) precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4) accuracy.rmse(predictions) accuracy.mae(predictions) accuracy.mse(predictions) accuracy.fcp(predictions) print("Precision: ", sum(prec for prec in precisions.values()) / len(precisions)) print("Recall: ", sum(rec for rec in recalls.values()) / len(recalls)) df = pd.DataFrame(predictions, columns=["uid", "iid", "rui", "est", "details"]) df["err"] = abs(df.est - df.rui)
def __init__(self, module_type, baseline_type, cf_type, similar, sim_type, params):
    """Configure and build a surprise recommendation model.

    Args:
        module_type: algorithm selector -- one of "KNNmeans", "KNNzscore",
            "KNNbase", "KNNbasic", "SVD", "SVDpp", "NMF", "SlopeOne",
            "cc" (CoClustering), "BaselineOnly" or "Np" (NormalPredictor).
        baseline_type: baseline estimation method: "ALS", "SGD" or "default".
        cf_type: None, "base_user" (user-based CF) or "base_item"
            (item-based CF).
        similar: similarity measure name, upper- or lower-case, or None
            (falls back to "msd").
        sim_type: "default" to use the library's default sim_options,
            otherwise None.
        params: per-algorithm hyper-parameter dict; may carry "bsl_options"
            and is updated in place with a "sim_options" entry.
    """
    assert baseline_type in {"ALS", "SGD", "default"}
    assert cf_type in {None, "base_user", "base_item"}
    assert similar in {None, "COSINE", "cosine", "MSD", "msd", "PEARSON", "pearson",
                       "PEARSON_BASELINE", "pearson_baseline", "JACCARD", "jaccard",
                       "EUCLIDEAN", "euclidean"}
    assert sim_type in {None, "default"}
    self.module_type = module_type
    self.baseline_type = baseline_type
    self.cf_type = cf_type
    self.similar = similar
    self.sim_type = sim_type
    self.bu = None   # user baselines, filled after fitting
    self.bi = None   # item baselines, filled after fitting
    self.sim = None  # similarity matrix, filled after fitting

    # Baseline-estimate options (consumed by KNNBaseline / BaselineOnly).
    if self.baseline_type == "ALS":
        bsl_options = {'method': params["bsl_options"].get("method", 'als'),
                       'n_epochs': params["bsl_options"].get("n_epochs", 10),
                       'reg_u': params["bsl_options"].get("reg_u", 15),
                       'reg_i': params["bsl_options"].get("reg_i", 10)
                       }
    elif self.baseline_type == "SGD":
        bsl_options = {'method': params["bsl_options"].get("method", 'sgd'),
                       'n_epochs': params["bsl_options"].get("n_epochs", 20),
                       'reg': params["bsl_options"].get("reg", 0.02),
                       'learning_rate': params["bsl_options"].get("learning_rate", 0.005)
                       }
    else:
        bsl_options = {}  # library defaults

    # Similarity options accumulate in params["sim_options"].
    params["sim_options"] = {}
    if self.cf_type == "base_user":
        params["sim_options"]["user_based"] = True
    elif self.cf_type == "base_item":
        # BUG FIX: surprise has no "item_based" key -- item-based CF is
        # expressed as user_based=False.  The original set "item_based",
        # which was silently ignored, so "base_item" still ran user-based.
        params["sim_options"]["user_based"] = False
    else:
        params["sim_options"]["user_based"] = True

    if self.similar is None:
        params["sim_options"]["name"] = "msd"
    else:
        # Accept both upper- and lower-case spellings of the measure name.
        # NOTE(review): "jaccard" and "euclidean" are not similarities
        # shipped with surprise; selecting them will fail at fit() time
        # unless a custom similarity module is registered -- confirm.
        params["sim_options"]["name"] = self.similar.lower()

    if self.sim_type == "default":
        sim_options = {}
    else:
        # min_support: minimum number of common items (user_based=True) or
        # common users (user_based=False) required for a non-zero similarity.
        # shrinkage: only used by the pearson_baseline similarity.
        sim_options = {"name": params["sim_options"].get("name", "MSD"),
                       "user_based": params["sim_options"].get("user_based", True),
                       "min_support": params["sim_options"].get("min_support", 5),
                       "shrinkage": params["sim_options"].get("shrinkage", 100)
                       }

    if self.module_type == "KNNmeans":
        # KNNBasic extended with the mean rating of each user/item.
        self.model = KNNWithMeans(k=params.get("k", 40),
                                  min_k=params.get("min_k", 1),
                                  sim_options=sim_options,
                                  verbose=params.get("verbose", True))
    elif self.module_type == "KNNzscore":
        # KNN with per-user z-score normalisation.
        self.model = KNNWithZScore(k=params.get("k", 40),
                                   min_k=params.get("min_k", 1),
                                   sim_options=sim_options,
                                   verbose=params.get("verbose", True))
    elif self.module_type == "KNNbase":
        # Like KNNWithMeans, but aggregates baseline estimates (biases)
        # instead of raw means.
        self.model = KNNBaseline(k=params.get("k", 40),
                                 min_k=params.get("min_k", 1),  # minimum number of neighbours
                                 sim_options=sim_options,
                                 bsl_options=bsl_options,
                                 verbose=params.get("verbose", True))
    elif self.module_type == "KNNbasic":
        # Plain KNN; user-based or item-based depending on sim_options.
        self.model = KNNBasic(k=params.get("k", 40),
                              min_k=params.get("min_k", 1),
                              sim_options=sim_options,
                              verbose=params.get("verbose", True))
    elif self.module_type == "SVD":
        # SGD-trained matrix factorisation.  The per-parameter lr_* / reg_*
        # values override lr_all / reg_all when not None.
        self.model = SVD(n_factors=params.get("n_factors", 100),
                         n_epochs=params.get("n_epochs", 20),
                         init_mean=params.get("init_mean", 0),
                         init_std_dev=params.get("init_std_dev", 0.1),
                         lr_all=params.get("lr_all", 0.005),
                         reg_all=params.get("reg_all", 0.02),
                         lr_bu=params.get("lr_bu", None),
                         lr_bi=params.get("lr_bi", None),
                         lr_pu=params.get("lr_pu", None),
                         lr_qi=params.get("lr_qi", None),
                         reg_bu=params.get("reg_bu", None),
                         reg_bi=params.get("reg_bi", None),
                         reg_pu=params.get("reg_pu", None),
                         reg_qi=params.get("reg_qi", None),
                         random_state=params.get("random_state", None),
                         verbose=params.get("verbose", False)
                         )
    elif self.module_type == "SVDpp":
        # SVD++ -- SVD extended with implicit feedback terms (yj).
        self.model = SVDpp(n_factors=params.get("n_factors", 100),
                           n_epochs=params.get("n_epochs", 20),
                           init_mean=params.get("init_mean", 0),
                           init_std_dev=params.get("init_std_dev", 0.1),
                           lr_all=params.get("lr_all", 0.005),
                           reg_all=params.get("reg_all", 0.02),
                           lr_bu=params.get("lr_bu", None),
                           lr_bi=params.get("lr_bi", None),
                           lr_pu=params.get("lr_pu", None),
                           lr_qi=params.get("lr_qi", None),
                           reg_bu=params.get("reg_bu", None),
                           reg_bi=params.get("reg_bi", None),
                           reg_pu=params.get("reg_pu", None),
                           reg_qi=params.get("reg_qi", None),
                           random_state=params.get("random_state", None),
                           verbose=params.get("verbose", False))
    elif self.module_type == "NMF":
        # Non-negative matrix factorisation (both P and Q constrained >= 0).
        # BUG FIX: the original passed SVD-style kwargs (init_mean,
        # init_std_dev, lr_all, reg_all, lr_pu, lr_qi) that NMF.__init__
        # does not accept, so building an NMF model always raised
        # TypeError.  Use NMF's real signature and documented defaults.
        self.model = NMF(n_factors=params.get("n_factors", 15),
                         n_epochs=params.get("n_epochs", 50),
                         biased=params.get("biased", False),
                         reg_pu=params.get("reg_pu", 0.06),
                         reg_qi=params.get("reg_qi", 0.06),
                         reg_bu=params.get("reg_bu", 0.02),
                         reg_bi=params.get("reg_bi", 0.02),
                         lr_bu=params.get("lr_bu", 0.005),
                         lr_bi=params.get("lr_bi", 0.005),
                         init_low=params.get("init_low", 0),
                         init_high=params.get("init_high", 1),
                         random_state=params.get("random_state", None),
                         verbose=params.get("verbose", False))
    elif self.module_type == "SlopeOne":
        # SlopeOne takes no tunable hyper-parameters beyond **params.
        self.model = SlopeOne(**params)
    elif self.module_type == "cc":
        # Clustering-based collaborative filtering (co-clustering of
        # users and items).
        self.model = CoClustering(n_cltr_u=params.get("n_cltr_u", 3),
                                  n_cltr_i=params.get("n_cltr_i", 3),
                                  n_epochs=params.get("n_epochs", 20),
                                  random_state=params.get("random_state", None),
                                  verbose=params.get("verbose", False)
                                  )
    elif self.module_type == "BaselineOnly":
        # Baseline estimate only; ignores individual user preferences.
        self.model = BaselineOnly(bsl_options=bsl_options, verbose=True)
    elif self.module_type == "Np":
        # Random-prediction baseline: assumes ratings follow a normal
        # distribution and draws predictions from it.
        self.model = NormalPredictor()
# 'n_epochs': [10,20,30,40,50,60,70,80,90,100]} # Evaluate the model with 5-fold cross validation #data.split(5) #grid_search = GridSearch(CoClustering, param_grid, measures=['RMSE']) #grid_search.evaluate(data) #print ("after grid_search.evaluate(data)") #print_perf(perf) #results_df = pd.DataFrame.from_dict(grid_search.cv_results) #print(results_df) """ # create a co-clustering algorithm algo = CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=100) algo.train(trainset) # use the trained algorithm to predict ratings for every user in the test set f = open('testOutput.csv', 'w') f.write("test_id,rating\n") for i in range(len(dftest)): prediction = algo.predict(dftest.at[i, 'user_id'], dftest.at[i, 'business_id'], r_ui=4, verbose=True) predRating = prediction.est f.write(str(i) + "," + str(predRating) + '\n') f.close()
# NOTE(review): this is an alias, not a copy -- data_cv.split() mutates the
# same object that the evaluate(..., data, ...) calls below use.  Confirm
# the shared folds are intended.
data_cv = data
data_cv.split(n_folds=5)

# SVD test
svd = SVD()
perf = evaluate(svd, data, measures=['RMSE'])
print_perf(perf)
# MSE 0.052
param_svd = {'n_factors': [50, 100], 'lr_all': [0.003, 0.005], 'reg_all': [0.05, 0.1, 0.5]}
gs = GridSearch(SVD, param_svd, measures=['RMSE'])
gs.evaluate(data_cv)
# RMSE 0.2272 ~ 0.2284, after many tests notice 0.2272 is a benchmark, 100, 0.003, 0.1

# Co-clustering test
coc = CoClustering()
perf = evaluate(coc, data, measures=['RMSE'])
print_perf(perf)
# MSE 0.053
# NOTE(review): `param_svd` is reused here (and below) for CoClustering and
# NMF grids -- the name is misleading but the dict is rebuilt each time.
param_svd = {'n_cltr_u': [3, 5, 7], 'n_cltr_i': [3, 5, 7], 'n_epochs': [10, 20]}
gs = GridSearch(CoClustering, param_svd, measures=['RMSE'])
gs.evaluate(data_cv)
# generally worse than SVD here, especially for larger cluster numbers

# Non-negative Matrix Factorization
nmf = NMF()
perf = evaluate(nmf, data, measures=['RMSE'])
print_perf(perf)
# MSE 0.053
param_svd = {'n_factors': [5, 15], 'reg_qi': [0.06, 0.1], 'biased': [True], 'reg_pu': [0.06, 0.1], 'n_epochs': [20, 50]}
gs = GridSearch(NMF, param_svd, measures=['RMSE'])
def train(self, algo='SVD', like=True, test='cv', local=False):
    """Train a surprise algorithm on the preprocessed review data.

    Args:
        algo: name of the surprise algorithm to use ('SVD', 'SVDpp',
            'KNNBasic', 'KNNWithMeans', 'KNNWithZScore', 'KNNBaseline',
            'BaselineOnly', 'NormalPredictor', 'NMF', 'SlopeOne',
            'CoClustering').
        like: if True, predict the binary 'liked' target on a 0-1 scale;
            otherwise predict 'rating' on a 1-5 scale.
        test: 'cv' for 5-fold cross-validation, 'svd_grid' for an SVD
            grid search, anything else for a single 70/30 hold-out split.
        local: if True, read the preprocessed CSVs from the local data
            directory instead of remote storage.

    Returns:
        Tuple (rmse, mae) for the chosen evaluation strategy.

    Raises:
        ValueError: if `algo` is not a recognised algorithm name.
    """
    if local:
        csv_path = os.path.join(os.path.dirname(__file__), "data/preprocessed")
        self.recipes = pd.read_csv(f"{csv_path}/recipe_pp.csv")
        self.reviews = pd.read_csv(f"{csv_path}/review_pp.csv")
    else:
        self.recipes = storage.import_file('data/preprocessed', 'recipe_pp.csv')
        self.reviews = storage.import_file('data/preprocessed', 'review_pp.csv')

    if like:
        self.target = 'liked'
        self.s_min = 0
        self.s_max = 1
    else:
        self.target = 'rating'
        self.s_min = 1
        self.s_max = 5

    reader = Reader(rating_scale=(self.s_min, self.s_max))
    self.relevant_data = self.reviews[['user_id', 'recipe_id', self.target]]
    model_data = Dataset.load_from_df(self.relevant_data, reader)

    # Algos
    # BUG FIX: the original tested bare string literals
    # (`if 'NormalPredictor':` ...), which are always truthy, so every call
    # used NormalPredictor regardless of `algo`.  Compare against the
    # `algo` argument instead.
    if algo == 'NormalPredictor':
        self.algorithm = NormalPredictor()
    elif algo == 'BaselineOnly':
        self.algorithm = BaselineOnly()
    elif algo == 'KNNBasic':
        self.algorithm = KNNBasic()
    elif algo == 'KNNWithMeans':
        self.algorithm = KNNWithMeans()
    elif algo == 'KNNWithZScore':
        self.algorithm = KNNWithZScore()
    elif algo == 'KNNBaseline':
        self.algorithm = KNNBaseline()
    elif algo == 'SVD':
        params = {
            'n_epochs': 20,
            'n_factors': 100,
            'lr_all': 0.002,
            'reg_all': 0.02
        }
        # BUG FIX: SVD(params) passed the whole dict as the positional
        # n_factors argument; unpack it as keyword arguments instead.
        self.algorithm = SVD(**params)  # Tuned with svd_grid
    elif algo == 'SVDpp':
        self.algorithm = SVDpp()
    elif algo == 'NMF':
        self.algorithm = NMF()
    elif algo == 'SlopeOne':
        self.algorithm = SlopeOne()
    elif algo == 'CoClustering':
        self.algorithm = CoClustering()
    else:
        raise ValueError(f"Unknown algorithm name: {algo!r}")

    if test == 'cv':
        # 5-fold cross-validation, then refit on the full training set.
        cv_results = cross_validate(self.algorithm,
                                    model_data,
                                    measures=['RMSE', 'MAE'],
                                    cv=5,
                                    verbose=True)
        rmse = np.round(cv_results['test_rmse'].mean(), 3)
        mae = np.round(cv_results['test_mae'].mean(), 3)
        train_data = model_data.build_full_trainset()
        self.algorithm.fit(train_data)
    elif test == 'svd_grid':
        param_grid = {
            'n_epochs': [10, 20],
            'n_factors': [100, 200],
            'lr_all': [0.001, 0.002],
            'reg_all': [0.01, 0.02]
        }
        # (the original built an unused full trainset here before gs.fit;
        # that dead work is removed -- the trainset is built once below.)
        gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
        gs.fit(model_data)
        rmse = gs.best_score['rmse']
        mae = gs.best_score['mae']
        print(gs.best_params['rmse'], gs.best_params['mae'])
        # Keep the best-RMSE estimator and refit it on all of the data.
        self.algorithm = gs.best_estimator['rmse']
        train_data = model_data.build_full_trainset()
        self.algorithm.fit(train_data)
    else:
        # Single hold-out evaluation on a 70/30 split.
        train, test = train_test_split(model_data, test_size=0.3, random_state=42)
        self.algorithm.fit(train)
        predictions = self.algorithm.test(test)
        rmse = np.round(accuracy.rmse(predictions), 3)
        mae = np.round(accuracy.mae(predictions), 3)

    return rmse, mae
# Unbiased evaluation of the previously trained `algo` on held-out set B.
predictions = algo.test(testset)
print('Unbiased accuracy on B,', end=' ')
accuracy.rmse(predictions)

# comparing models:
# Spot Check Algorithms
# BUG FIX: the original paired each label with the NEXT algorithm
# (('AlgoBase', BaselineOnly()), ('BaselineOnly', KNNBasic()), ...), so
# every printed name described the wrong model.  Pair each name with its
# own class.
models = []
models.append(('BaselineOnly', BaselineOnly()))
models.append(('KNNBasic', KNNBasic()))
models.append(('KNNWithMeans', KNNWithMeans()))
models.append(('KNNWithZScore', KNNWithZScore()))
models.append(('SVD', SVD()))
models.append(('NMF', NMF()))
models.append(('SlopeOne', SlopeOne()))
models.append(('CoClustering', CoClustering()))

# evaluate each model in turn
results = []
names = []
for name, model in models:
    # define a cross-validation iterator
    kf = KFold(n_splits=3)
    algo = model
    for trainset, testset in kf.split(data):
        # train and test algorithm.
        algo.fit(trainset)
        predictions = algo.test(testset)
        # Compute and print Root Mean Squared Error
        accuracy.rmse(predictions, verbose=True)
def surprise_algorithms_print_perf():
    """Evaluate a battery of surprise algorithms (RMSE/MAE over 5 folds) on
    the Android or WordPress dataset and print one result table per algorithm.
    """
    print('Surprise Algorithms (Tabla de resultados finales)...')
    print('Que data desea utilizar?')
    print('(1) Android')
    print('(2) WordPress')
    data_utilizar = input()

    # Encoding setup so the data file can be read without decode errors
    # (Python 2 only: reload/setdefaultencoding do not exist in Python 3).
    reload(sys)
    sys.setdefaultencoding('utf8')

    # NOTE(review): `== 1` relies on Python 2's input() evaluating the typed
    # text to an int; under Python 3 input() returns a string and this branch
    # would never match -- confirm the target interpreter version.
    if data_utilizar == 1:
        file_path = configuration.FILE_PATH_ANDROID
        reader = Reader(line_format='user item rating', sep='\t')
    else:
        file_path = configuration.FILE_PATH_WORDPRESS
        reader = Reader(line_format='user item rating', sep=',')

    # Dataset, evaluated with 5-fold cross-validation.
    data = Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=5)

    # NormalPredictor (random ratings drawn from a normal distribution).
    algo_normal_predictor = NormalPredictor()
    perf_normal_predictor = evaluate(algo_normal_predictor, data, measures=['RMSE', 'MAE'], verbose=False)

    # SVD
    algo_svd = SVD()
    perf_svd = evaluate(algo_svd, data, measures=['RMSE', 'MAE'], verbose=False)

    # BaselineOnly
    algo_baseline_only = BaselineOnly()
    perf_baseline_only = evaluate(algo_baseline_only, data, measures=['RMSE', 'MAE'], verbose=False)

    # SVDpp
    algo_svdpp = SVDpp()
    perf_svdpp = evaluate(algo_svdpp, data, measures=['RMSE', 'MAE'], verbose=False)

    # NMF
    algo_nmf = NMF()
    perf_nmf = evaluate(algo_nmf, data, measures=['RMSE', 'MAE'], verbose=False)

    # SlopeOne
    algo_slope_one = SlopeOne()
    perf_slope_one = evaluate(algo_slope_one, data, measures=['RMSE', 'MAE'], verbose=False)

    # CoClustering
    algo_coclustering = CoClustering()
    perf_coclustering = evaluate(algo_coclustering, data, measures=['RMSE', 'MAE'], verbose=False)

    # KNN-based analysis:
    #   'k'           maximum number of neighbours used for aggregation
    #   'min_k'       minimum number of neighbours; with fewer neighbours the
    #                 prediction falls back to the global mean of all ratings
    #   'sim_options' similarity configuration used by the KNN variants
    #   'bsl_options' configuration of the baseline estimates
    k = 40
    min_k = 1
    sim_options = {
        'name': 'pearson_baseline',
        'user_based': 0  # item-based; no shrinkage
    }
    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}

    # BUG FIX: the original passed min_k=k (i.e. 40), so the min_k = 1
    # defined above was dead and any item with fewer than 40 neighbours
    # fell back to the global mean.  Pass min_k=min_k instead.
    algo_knn_basic = KNNBasic(k=k, min_k=min_k, sim_options=sim_options)
    perf_knn_basic = evaluate(algo_knn_basic, data, measures=['RMSE', 'MAE'], verbose=False)
    algo_knn_with_means = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_options)
    perf_knn_with_means = evaluate(algo_knn_with_means, data, measures=['RMSE', 'MAE'], verbose=False)
    algo_knn_base_line = KNNBaseline(k=k, min_k=min_k, sim_options=sim_options, bsl_options=bsl_options)
    perf_knn_base_line = evaluate(algo_knn_base_line, data, measures=['RMSE', 'MAE'], verbose=False)

    # Print the per-algorithm results.
    print('')
    print('Printing results from algorithms...')
    print('- Normal predictor')
    print_perf(perf_normal_predictor)
    print('')
    print('- Normal SVD')
    print_perf(perf_svd)
    print('')
    print('- Normal Baseline Only')
    print_perf(perf_baseline_only)
    print('')
    print('- Normal SVD++')
    print_perf(perf_svdpp)
    print('')
    print('- Normal NMF')
    print_perf(perf_nmf)
    print('')
    print('- Normal Slope One')
    print_perf(perf_slope_one)
    print('')
    print('- Normal Co-Clustering')
    print_perf(perf_coclustering)
    print('')
    print('- Normal KNN Basic')
    print_perf(perf_knn_basic)
    print('')
    print('- Normal KNN With Means')
    print_perf(perf_knn_with_means)
    print('')
    print('- Normal KNN Base Line')
    print_perf(perf_knn_base_line)
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise import accuracy
from surprise import SVD, SVDpp, SlopeOne, NMF, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, NormalPredictor, CoClustering

# Load the ratings, shuffle them, and keep a 200k-row sample for benchmarking.
ratings_test = pd.read_csv('/Users/chrisjohanson/Desktop/Capstone 2/ratings.csv').set_index('rating_id')
ratings_test = ratings_test.sample(frac=1)[:200000]
print('----------Data is ready----------')

# Load the dataset, save raw ratings to variable
reader = Reader(rating_scale=(1.0, 5.0))
dataset_test = Dataset.load_from_df(ratings_test, reader)

# put together list of algorithms to test out (1 out of 3 lists total)
algorithms3 = [KNNWithZScore(), BaselineOnly(), CoClustering()]

# create empty list to store results data
benchmark = []

# iterate through each algorithm and save results info to benchmark
for algo in algorithms3:
    results = cross_validate(algo, dataset_test, measures=['RMSE', 'MAE'], cv=3, verbose=False)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    # BUG FIX: Series.append was deprecated in pandas 1.4 and removed in
    # pandas 2.0 -- build the algorithm-name row with pd.concat instead.
    # type(algo).__name__ yields the same class name the original extracted
    # by parsing str(algo), without the fragile string splitting.
    tmp = pd.concat([tmp, pd.Series([type(algo).__name__], index=['Algorithm'])])
    benchmark.append(tmp)
    print(f"{algo} complete")

# create df with the results
results_df = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
# save the results as a csv