def surprise_baseline(trainset, finalset):
    """Fit Surprise's BaselineOnly model on trainset and predict on finalset."""
    algo = spr.BaselineOnly()
    algo.fit(trainset)
    predictions_final = algo.test(finalset)
    return spr_estimate_to_vect(predictions_final)
def baseline(train, test):
    """
    Run the Baseline model from the Surprise library.

    @param train: the training set in the Surprise format.
    @param test: the test set in the Surprise format.
    @return: the predictions in a numpy array.
    """
    algo = spr.BaselineOnly()
    algo.fit(train)
    predictions = algo.test(test)
    return get_predictions(predictions)
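# Usage sketch for the two helpers above (assumes a ratings DataFrame `df`
# with columns ['user', 'item', 'rating']; the column names and the 1-5
# rating scale are illustrative assumptions, not from the source):
import surprise as spr
from surprise.model_selection import train_test_split

reader = spr.Reader(rating_scale=(1, 5))
data = spr.Dataset.load_from_df(df[['user', 'item', 'rating']], reader)
train, test = train_test_split(data, test_size=0.2)
preds = baseline(train, test)  # numpy array of estimated ratings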
def main(args):
    user_item_based = 'item_based' if args.item_based else 'user_based'
    filename = '_'.join([
        args.exp_name, args.algorithm, args.sim_name, user_item_based,
        str(args.num_rows)
    ]) + '.pkl'
    output_file = Path(filename)
    if output_file.exists():
        print(f'ERROR! Output file {output_file} already exists. Exiting!')
        sys.exit(1)
    print(f'Saving scores in {output_file}\n')

    reader = surprise.Reader(rating_scale=(1, 5))
    df = pq.read_table('all_ratings_with_indices.parquet',
                       columns=['user_idx', 'movie_idx', 'rating']).to_pandas()
    # Downcast to save memory before handing the frame to Surprise.
    df.user_idx = df.user_idx.astype(np.uint32)
    df.movie_idx = df.movie_idx.astype(np.uint16)
    df.rating = df.rating.astype(np.uint8)
    print(df.dtypes)
    data = surprise.Dataset.load_from_df(df[:args.num_rows], reader=reader)
    del df

    sim_options = {
        'name': args.sim_name,
        'user_based': not args.item_based
    }
    if args.algorithm == 'knn':
        algo = surprise.KNNBasic(sim_options=sim_options)
    elif args.algorithm == 'baseline':
        algo = surprise.BaselineOnly()
    elif args.algorithm == 'normal':
        algo = surprise.NormalPredictor()
    elif args.algorithm == 'knn_zscore':
        algo = surprise.KNNWithZScore(sim_options=sim_options)
    elif args.algorithm == 'svd':
        algo = surprise.SVD()
    elif args.algorithm == 'nmf':
        algo = surprise.NMF()
    else:
        print(f'Algorithm {args.algorithm} is not a valid choice.')
        sys.exit(1)  # without this, cross_validate would run with an undefined algo

    scores = surprise.model_selection.cross_validate(algo, data,
                                                     cv=args.cv_folds,
                                                     verbose=True, n_jobs=-1)
    with open(output_file, 'wb') as f:
        pickle.dump(scores, f)
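# Hypothetical invocation (flag names inferred from the attributes read in
# main(); the actual argparse setup is outside this excerpt):
#   python run_experiment.py --exp_name trial1 --algorithm baseline \
#       --sim_name cosine --num_rows 1000000 --cv_folds 5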
def get_surprise_model(train_df):
    reader = surprise.Reader(rating_scale=(train_df["rating"].min(),
                                           train_df["rating"].max()))
    surprise_inp = surprise.Dataset.load_from_df(train_df, reader).build_full_trainset()
    model = surprise.BaselineOnly(bsl_options={
        'method': 'als',
        'n_epochs': 50,
        'reg_u': 12,
        'reg_i': 5
    })
    model.fit(surprise_inp)
    return model
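# Usage sketch for get_surprise_model: load_from_df consumes the dataframe's
# first three columns as user, item, rating, so the frame must be ordered
# accordingly and the rating column must literally be named "rating"
# (the user/item column names here are assumptions):
model = get_surprise_model(train_df[['user_id', 'item_id', 'rating']])
# predict() takes raw user/item ids; .est is the estimated rating
print(model.predict(uid='some_user', iid='some_item').est)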
def algo_tester(data_object):
    '''
    Produces a dataframe comparing the RMSE, fit time, and test time of the
    different Surprise algorithms.

    ---Parameters---
    data_object(variable) created from the read_data_surprise function

    ---Returns---
    a dataframe where you can compare the performance of different algorithms
    '''
    benchmark = []
    algos = [
        sp.SVDpp(), sp.SVD(), sp.SlopeOne(), sp.NMF(), sp.NormalPredictor(),
        sp.KNNBaseline(), sp.KNNBasic(), sp.KNNWithMeans(), sp.KNNWithZScore(),
        sp.BaselineOnly(), sp.CoClustering()
    ]

    # Iterate over all algorithms
    for algorithm in algos:
        # Perform cross validation
        results = cross_validate(algorithm, data_object, measures=['RMSE'],
                                 cv=3, verbose=False)
        # Get results & append algorithm name
        # (Series.append was removed in pandas 2.0; use pd.concat instead)
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        name = pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                         index=['Algorithm'])
        tmp = pd.concat([tmp, name])
        benchmark.append(tmp)

    benchmark = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
    return benchmark
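# Usage sketch for algo_tester: the read_data_surprise helper mentioned in the
# docstring is not shown, so this builds an equivalent data object directly
# (the dataframe and its column names are illustrative assumptions):
reader = sp.Reader(rating_scale=(1, 5))
data_object = sp.Dataset.load_from_df(ratings_df[['user', 'item', 'rating']], reader)
print(algo_tester(data_object))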
def dic_to_train(data):
    from surprise.model_selection import KFold

    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    algo = surprise.BaselineOnly(bsl_options)  # note: not used within this function
    ax1 = data.plot.scatter(x='store', y='user', s=1, c='score')
    print(ax1)

    df_to_dict = recur_dictify(data)

    store_list = []   # list that will hold the stores
    user_set = set()  # set that will hold the users

    # Iterate once per store
    for store_key in df_to_dict:
        store_list.append(store_key)
        for user_key in df_to_dict[store_key]:
            user_set.add(user_key)

    user_list = list(user_set)
    pd.to_pickle(pd.Series(user_list).to_frame(), "../data/Item_based_user_list.pkl")
    pd.to_pickle(pd.Series(store_list).to_frame(), "../data/Item_based_store_list.pkl")

    rating_dic = {"store_id": [], "user_id": [], "score": []}

    # Iterate once per store, mapping raw keys to integer indices
    for store_key in df_to_dict:
        for name_key in df_to_dict[store_key]:
            a1 = store_list.index(store_key)
            a2 = user_list.index(name_key)
            a3 = df_to_dict[store_key][name_key]
            rating_dic["store_id"].append(a1)
            rating_dic["user_id"].append(a2)
            rating_dic["score"].append(a3)

    df = pd.DataFrame(rating_dic)
    return df.sort_values(by=['store_id'])
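# `recur_dictify` is referenced above but defined elsewhere; a plausible
# sketch (an assumption, following a common pandas recipe) that nests a
# ['store', 'user', 'score'] frame into {store: {user: score}}:
def recur_dictify(frame):
    # Base case: one column left, return the scalar value(s)
    if len(frame.columns) == 1:
        if frame.values.size == 1:
            return frame.values[0][0]
        return frame.values.squeeze()
    # Recursive case: group on the first column and recurse on the rest
    grouped = frame.groupby(frame.columns[0])
    return {k: recur_dictify(g.iloc[:, 1:]) for k, g in grouped}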
def model_fit(self):
    '''
    Train the recommender using the configured Surprise algorithm
    (SVD, BaselineOnly, SlopeOne, CoClustering, or KNNBasic as fallback).
    '''
    self.build_trainset()
    algo = self._algo_choise
    if algo == 'SVD':
        self.algorithm = surprise.SVD()
    elif algo == 'Baseline':
        self.algorithm = surprise.BaselineOnly()
    elif algo == 'SlopeOne':
        self.algorithm = surprise.SlopeOne()
    elif algo == 'CoClustering':
        self.algorithm = surprise.CoClustering()
    else:
        self.algorithm = surprise.KNNBasic()
    print('Training Recommender System using %s...' % algo)
    self.algorithm.fit(self.trainset)
    self.ratings_changed = False
    print('Done')
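# The method above assumes a surrounding class that provides build_trainset(),
# self._algo_choise, self.trainset, and self.ratings_changed. A minimal sketch
# of that context (an assumption, not from the source; column names and the
# 1-5 scale are illustrative):
class Recommender:
    def __init__(self, ratings_df, algo_choice='SVD'):
        self._algo_choise = algo_choice  # attribute name kept as spelled above
        self.ratings_df = ratings_df
        self.ratings_changed = True

    def build_trainset(self):
        reader = surprise.Reader(rating_scale=(1, 5))
        data = surprise.Dataset.load_from_df(
            self.ratings_df[['user', 'item', 'rating']], reader)
        self.trainset = data.build_full_trainset()

Recommender.model_fit = model_fit  # attach the function defined above as a method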
          epochs=2,
          validation_split=0.1,
          shuffle=True)

y_pred = model.predict([df_hybrid_test['User'], df_hybrid_test['Movie'], test_tfidf])
y_true = df_hybrid_test['Rating'].values

rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse))

# Load the dataset into the Surprise-specific data structure
data = sp.Dataset.load_from_df(df_filterd[['User', 'Movie', 'Rating']].sample(20000),
                               sp.Reader())

benchmark = []
# Iterate over all algorithms
for algorithm in [sp.SVD(), sp.SVDpp(), sp.SlopeOne(), sp.NMF(),
                  sp.NormalPredictor(), sp.KNNBaseline(), sp.KNNBasic(),
                  sp.KNNWithMeans(), sp.KNNWithZScore(), sp.BaselineOnly(),
                  sp.CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'],
                             cv=3, verbose=False)

    # Get results & append algorithm name
    # (Series.append was removed in pandas 2.0; use pd.concat instead)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    name = pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                     index=['Algorithm'])
    tmp = pd.concat([tmp, name])

    # Store data
    benchmark.append(tmp)

# Store results (ascending=False lists the worst RMSE first)
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
    'test_rmse', ascending=False)

# Get data
data = surprise_results[['test_rmse', 'test_mae']]
sgd_gs = sp.model_selection.GridSearchCV(sp.BaselineOnly, sgd_param_grid,
                                         measures=['rmse'], cv=3,
                                         joblib_verbose=0)

reader = sp.Reader(rating_scale=(0, 10))
data = sp.Dataset.load_from_df(
    ScoresDFHotStart[['username', 'anime_id', 'my_score']], reader)

als_gs.fit(data)
sgd_gs.fit(data)

trainset = data.build_full_trainset()
algo = sp.BaselineOnly()
algo.fit(trainset)
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

last_predictions = pd.DataFrame(predictions,
                                columns=['uid', 'iid', 'rui', 'est', 'details'])
last_predictions.drop('rui', inplace=True, axis=1)

sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo_items = sp.KNNBaseline(sim_options=sim_options)
algo_items.fit(trainset)


def get_item_recommendations(anime_title, anime_id=100000, k=10):
    if anime_id == 100000:
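# `als_gs` and `sgd_param_grid` are referenced above without being defined in
# this excerpt; a plausible sketch of those definitions (the grid values are
# assumptions, not from the source):
als_param_grid = {'bsl_options': {'method': ['als'],
                                  'n_epochs': [5, 10],
                                  'reg_u': [10, 15],
                                  'reg_i': [5, 10]}}
sgd_param_grid = {'bsl_options': {'method': ['sgd'],
                                  'learning_rate': [0.002, 0.005],
                                  'n_epochs': [20, 30]}}
als_gs = sp.model_selection.GridSearchCV(sp.BaselineOnly, als_param_grid,
                                         measures=['rmse'], cv=3)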
reader = surprise.Reader(rating_scale=(1, 5))
rating_data = surprise.Dataset.load_from_df(rating_df, reader=reader)

svd = surprise.SVD()
svd_temp = surprise.model_selection.cross_validate(
    svd, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('SVD--------------')
print(svd_temp)

normalPredictor = surprise.NormalPredictor()
normalPredictor_temp = surprise.model_selection.cross_validate(
    normalPredictor, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('normalPredictor--------------')
print(normalPredictor_temp)

baselineOnly = surprise.BaselineOnly()
baselineOnly_temp = surprise.model_selection.cross_validate(
    baselineOnly, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('baselineOnly-----------------')
print(baselineOnly_temp)

knnBasic = surprise.KNNBasic()
knnBasic_temp = surprise.model_selection.cross_validate(
    knnBasic, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBasic-----------------')
print(knnBasic_temp)

knnWithMeans = surprise.KNNWithMeans()
knnWithMeans_temp = surprise.model_selection.cross_validate(
    knnWithMeans, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnWithMeans-----------------')
print(knnWithMeans_temp)

knnBaseline = surprise.KNNBaseline()
from surprise import dump
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import KFold
import matplotlib.pyplot as plt
import numpy as np

data = surprise.Dataset.load_builtin('ml-100k')
df = pd.DataFrame(data.raw_ratings, columns=["user", "item", "rate", "id"])
del df["id"]

df_table = df.set_index(["user", "item"]).unstack()
data_array = df_table.values
# Unrated cells come out as NaN; fill them with zeros so np.linalg.svd can run.
matA = np.nan_to_num(np.array(data_array).astype(np.float64))
print(data_array)

# full_matrices=False keeps U, s, V with compatible shapes for reconstruction
U, s, V = np.linalg.svd(matA, full_matrices=False)
print(U @ np.diag(s) @ V)

bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
algo = surprise.BaselineOnly(bsl_options)

np.random.seed(0)
acc = np.zeros(3)
cv = KFold(3)
for i, (trainset, testset) in enumerate(cv.split(data)):
    algo.fit(trainset)
    predictions = algo.test(testset)
    acc[i] = surprise.accuracy.rmse(predictions, verbose=True)
acc.mean()
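# For reference, the manual KFold loop above can be replaced by Surprise's
# cross_validate helper (already imported above), with the same algorithm
# and fold count:
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=True)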
def main(train_df, target_df, cache_name="test", force_recompute=[]):
    """Train multiple models on train_df and predict target_df

    Predictions are cached. If the indices don't match the indices of
    target_df, the cache is discarded.

    By default, if a method was already computed it is not recomputed again
    (except if the method name is listed in force_recompute).

    cache_name is the name to use to read and write the cache.

    Arguments:
        train_df {dataframe} -- Training dataframe
        target_df {dataframe} -- Testing dataframe

    Keyword Arguments:
        cache_name {str} -- Name to use for caching (default: {"test"})
        force_recompute {list} -- Name(s) of methods to recompute, whether or
            not they were already computed. Useful to recompute single methods
            without discarding the rest. (default: {[]})

    Returns:
        Dataframe -- Dataframe with predictions for each method as columns,
            IDs as indices
    """
    global algo_in_use
    CACHED_DF_FILENAME = (os.path.dirname(os.path.abspath(__file__)) +
                          "/cache/cached_predictions_{}.pkl".format(cache_name))
    train_df = preprocess_df(train_df)
    trainset = pandas_to_data(train_df)
    ids_to_predict = target_df["Id"].to_list()

    # Try to retrieve the backup dataframe
    try:
        print("Retrieving cached predictions")
        all_algos_preds_df = pd.read_pickle(CACHED_DF_FILENAME)
        print("Ensuring cached IDs match given IDs")
        assert sorted(ids_to_predict) == sorted(all_algos_preds_df.index.values)
        print("Indices match, continuing")
    except (FileNotFoundError, AssertionError):
        print("No valid cached predictions found")
        all_algos_preds_df = pd.DataFrame(ids_to_predict, columns=["Id"])
        all_algos_preds_df.set_index("Id", inplace=True)

    all_algos = {
        "SVD": spr.SVD(n_factors=200, n_epochs=100),
        "Baseline": spr.BaselineOnly(),
        "NMF": spr.NMF(n_factors=30, n_epochs=100),
        "Slope One": spr.SlopeOne(),
        "KNN Basic": spr.KNNBasic(k=60),
        "KNN Means": spr.KNNWithMeans(k=60),
        "KNN Baseline": spr.KNNBaseline(),
        "KNN Zscore": spr.KNNWithZScore(k=60),
        "SVD ++": spr.SVDpp(n_factors=40, n_epochs=100),
        "Co Clustering": spr.CoClustering()
    }

    for name in all_algos:
        print("##### {} ####".format(name))
        if name in force_recompute and name in all_algos_preds_df.columns:
            all_algos_preds_df.drop(name, axis=1, inplace=True)
        if name in all_algos_preds_df.columns:
            print("Already computed {}, skipping".format(name))
            continue
        algo = all_algos[name]
        time.sleep(1)
        algo.fit(trainset)
        time.sleep(1)
        algo_in_use = algo
        print("Generating predictions...")
        predictions = parallelize_predictions(ids_to_predict, 80)
        print("Done. Merging with previous results")
        this_algo_preds_df = pd.DataFrame(predictions, columns=["Id", name])
        this_algo_preds_df.set_index("Id", inplace=True)
        all_algos_preds_df = pd.merge(all_algos_preds_df, this_algo_preds_df,
                                      left_index=True, right_index=True)
        all_algos_preds_df.to_pickle(CACHED_DF_FILENAME)

    print("DONE computing surprise")
    return all_algos_preds_df
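# `pandas_to_data` is used above but defined elsewhere; a plausible sketch,
# assuming the preprocessed frame has user/item/rating columns in that order
# and a 1-5 rating scale (both assumptions):
def pandas_to_data(df):
    reader = spr.Reader(rating_scale=(1, 5))
    data = spr.Dataset.load_from_df(df, reader)
    return data.build_full_trainset()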
# Path where the data files live
root_dir = os.getcwd()
dfs_path = os.path.join(root_dir, 'Data/datasets/')
ratings_df = pd.read_csv(os.path.join(dfs_path, 'ratings_expl.csv'),
                         sep=';', encoding='latin-1', low_memory=False)

# Load the data
reader = surprise.Reader(rating_scale=(1, 10))
data = surprise.Dataset.load_from_df(ratings_df[['user_id', 'isbn', 'rating']], reader)
kf = KFold(n_splits=5)
# data = Dataset.load_builtin('jester')

# K-fold evaluation
algo1 = SVD()
algo2 = surprise.BaselineOnly()
algo3 = surprise.KNNBasic()
algo4 = surprise.CoClustering()

for trainset, testset in kf.split(data):
    # SVD
    algo1.fit(trainset)
    pSVD = algo1.test(testset)
    # Compute and print the RMSE
    print("SVD-")
    accuracy.rmse(pSVD, verbose=True)

    # Baseline
    algo2.fit(trainset)
    pBase = algo2.test(testset)
    print("BaseLine-")
    accuracy.rmse(pBase)
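    # The loop presumably continues with the two remaining algorithms,
    # following the same fit/test/rmse pattern (a sketch, not from the source):
    algo3.fit(trainset)
    print("KNNBasic-")
    accuracy.rmse(algo3.test(testset), verbose=True)

    algo4.fit(trainset)
    print("CoClustering-")
    accuracy.rmse(algo4.test(testset), verbose=True)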
    n_recall = 0
    n_precision = 0
    for uid in est_top_n.keys():
        hit += len(set(est_top_n[uid]) & set(true_top_n[uid]))
        n_precision += len(est_top_n[uid])
        n_recall += len(true_top_n[uid])
    return hit / n_precision, hit / n_recall


# define reader and load data
file_path = os.path.expanduser('./rates.csv')
reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 5))
data = Dataset.load_from_file(file_path, reader=reader)

kf = KFold(n_splits=5, random_state=1)

algo = surprise.BaselineOnly()
# algo = surprise.KNNBasic()
# algo = surprise.KNNWithMeans()
# algo = surprise.KNNBaseline()
# algo = surprise.SVD()

precisions = []
recalls = []
rmse = []
mae = []

# GridList = [(5, 5), (10, 10), (5, 10), (5, 20)]
# with open('result.txt', 'w') as file:
#     for est_n, true_n in GridList:
#         precisions = []
#         recalls = []
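# A plausible evaluation loop tying the pieces above together (the helper that
# builds est_top_n/true_top_n is outside this excerpt, so only RMSE/MAE are
# collected here; the accuracy import is an assumption):
from surprise import accuracy

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse.append(accuracy.rmse(predictions, verbose=False))
    mae.append(accuracy.mae(predictions, verbose=False))
print('mean RMSE: {:.4f}, mean MAE: {:.4f}'.format(
    sum(rmse) / len(rmse), sum(mae) / len(mae)))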
data = sp.Dataset.load_from_file(file_name, reader=reader)
print("Done.")

# Define the number of folds (5)
print("Performing splits...")
kf = sp.model_selection.KFold(n_splits=5, random_state=0)
print("Done.")

###
### PART 1.1
###
'''
Application of all the recommendation algorithms made available by the
Surprise library, each in its default configuration.
'''
algorithms = [sp.NormalPredictor(), sp.BaselineOnly(), sp.KNNBasic(),
              sp.KNNWithMeans(), sp.KNNWithZScore(), sp.KNNBaseline(),
              sp.SVD(), sp.SVDpp(), sp.NMF(), sp.SlopeOne(), sp.CoClustering()]

for elem in algorithms:
    start_time = time.time()
    algo = elem
    sp.model_selection.cross_validate(algo, data, measures=['RMSE'],
                                      cv=kf, n_jobs=2, verbose=True)
    print("--- %s seconds ---" % (time.time() - start_time))
    print()

###
### PART 1.2
###
'''
Improvement of the quality of both the KNNBaseline and SVD methods,
    'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering'
])
args = parser.parse_args()

train_path = path + '/Data/train_format.txt'
train_reader = Reader(line_format='user item rating timestamp', sep=',',
                      rating_scale=(0, 5))
trainset = Dataset.load_from_file(train_path, reader=train_reader)
trainset = trainset.build_full_trainset()

if args.model == 'NormalPredictor':
    model = surprise.NormalPredictor()
elif args.model == 'BaselineOnly':
    model = surprise.BaselineOnly()
elif args.model == 'KNNBasic':
    model = surprise.KNNBasic()
elif args.model == 'KNNWithMeans':
    model = surprise.KNNWithMeans()
elif args.model == 'KNNWithZScore':
    model = surprise.KNNWithZScore()
elif args.model == 'KNNBaseline':
    model = surprise.KNNBaseline()
elif args.model == 'SVD':
    model = surprise.SVD()
elif args.model == 'SVDpp':
    model = surprise.SVDpp(verbose=True)
elif args.model == 'NMF':
    model = surprise.NMF()
elif args.model == 'SlopeOne':