class Appraisal:
    def __init__(self, data, popularityRankings):
        self.rankings = popularityRankings

        # Build a full training set for evaluating overall properties
        self.fullTrainSet = data.build_full_trainset()
        self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()

        # Build a 75/25 train/test split for measuring accuracy
        self.trainSet, self.testSet = train_test_split(data, test_size=.25,
                                                       random_state=1)

        # Build a "leave one out" train/test split for evaluating top-N recommenders,
        # and build an anti-test-set for building predictions
        LOOCV = LeaveOneOut(n_splits=1, random_state=1)
        for train, test in LOOCV.split(data):
            self.LOOCVTrain = train
            self.LOOCVTest = test
        self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()

        # Compute similarity matrix between items so we can measure diversity
        sim_options = {'name': 'cosine', 'user_based': False}
        self.simsAlgo = KNNBaseline(sim_options=sim_options)
        self.simsAlgo.fit(self.fullTrainSet)

    def EntireTraining(self):
        return self.fullTrainSet

    def EntireTesting(self):
        return self.fullAntiTestSet

    def OppositeTesting(self, USER):
        trainset = self.fullTrainSet
        fill = trainset.global_mean
        anti_testset = []
        u = trainset.to_inner_uid(str(USER))
        user_items = set([j for (j, _) in trainset.ur[u]])
        anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill)
                         for i in trainset.all_items() if i not in user_items]
        return anti_testset

    def Training(self):
        return self.trainSet

    def Testing(self):
        return self.testSet

    def LeaveOneOutTraining(self):
        return self.LOOCVTrain

    def LeaveOneOutTesting(self):
        return self.LOOCVTest

    def LeaveOneOutOppositeTesting(self):
        return self.LOOCVAntiTestSet

    def Closeness(self):
        return self.simsAlgo

    def Celebrity(self):
        return self.rankings
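# A hypothetical usage sketch for the Appraisal class above; `data` (a loaded
# Surprise Dataset) and `rankings` (a popularity dict) are assumed to be built
# elsewhere, e.g. by a MovieLens-style loader:
#
# appraisal = Appraisal(data, rankings)
# appraisal.Training()            # 75/25 train split for accuracy metrics
# appraisal.LeaveOneOutTesting()  # held-out ratings for top-N hit-rate metrics
# appraisal.Closeness()           # fitted item-item KNNBaseline for diversity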
from MovieLens import MovieLens
from surprise import SVD, KNNBaseline
from surprise.model_selection import train_test_split, LeaveOneOut
from RecommenderMetrics import RecommenderMetrics

ml = MovieLens()
print("Loading movie ratings...")
data = ml.loadMovieLensLatestSmall()

print("\nComputing movie popularity ranks so we can measure novelty later...")
rankings = ml.getPopularityRanks()

print("\nComputing item similarities so we can measure diversity later...")
fullTrainSet = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
simsAlgo = KNNBaseline(sim_options=sim_options)
simsAlgo.fit(fullTrainSet)

print("\nBuilding recommendation model...")
trainSet, testSet = train_test_split(data, test_size=.25, random_state=1)
algo = SVD(random_state=10)
algo.fit(trainSet)

print("\nComputing recommendations...")
predictions = algo.test(testSet)

print("\nEvaluating accuracy of model...")
print("RMSE: ", RecommenderMetrics.RMSE(predictions))
print("MAE: ", RecommenderMetrics.MAE(predictions))
from surprise import (SVD, KNNBasic, KNNWithMeans, KNNBaseline, NMF,
                      SlopeOne, CoClustering, BaselineOnly, NormalPredictor)

'''
"SVD"               -- https://en.wikipedia.org/wiki/Singular_value_decomposition
"KNN"               -- https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
"Centered KNN"      -- KNN with mean user ratings taken into account
"KNN with Baseline" -- KNN with baseline ratings taken into account
"NMF"               -- https://en.wikipedia.org/wiki/Non-negative_matrix_factorization
"SlopeOne"          -- https://en.wikipedia.org/wiki/Slope_One
"CoClustering"      -- https://en.wikipedia.org/wiki/Biclustering
"BaselineOnly"      -- baseline prediction for a specific user/item
"NormalPredictor"   -- predicts a random rating drawn from a normal distribution
https://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly
'''

labels = [
    "SVD", "KNN", "Centered KNN", "KNN with Baseline", "NMF",
    "SlopeOne", "CoClustering", "BaselineOnly", "NormalPredictor"
]
algorithms = [
    SVD(), KNNBasic(), KNNWithMeans(), KNNBaseline(), NMF(),
    SlopeOne(), CoClustering(), BaselineOnly(), NormalPredictor()
]
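# A minimal benchmarking sketch (not part of the original snippet): pair each
# label with its algorithm and cross-validate. The ml-100k built-in dataset is
# an assumed stand-in for whatever Dataset the surrounding project loads.
from surprise import Dataset
from surprise.model_selection import cross_validate

data = Dataset.load_builtin('ml-100k')  # illustrative dataset
for label, algorithm in zip(labels, algorithms):
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'],
                             cv=3, verbose=False)
    print(label, results['test_rmse'].mean(), results['test_mae'].mean())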
def train_knn(self, df, userId, user_m_ids, movies_watched):
    """
    param:  df             - movies pandas DataFrame
            userId         - user ID to predict movies for
            user_m_ids     - list of movie IDs to base recommendations on (as seen in the TMDB dataset)
            movies_watched - list of movie titles already watched (as seen in the TMDB dataset)
    return: pandas DataFrame of the recommended movies with attributes:
            title, id, vote_average, vote_count, popularity, release_date

    Collaborative filtering is done with KNNBaseline using the pearson_baseline
    similarity. The technique is item-item based.
    """
    reader = Reader(rating_scale=(1, 5))
    movie_ids = self.get_movie_ids()
    rec_result = dict()
    sim_options = {"name": "pearson_baseline", "user_based": False}
    data = Dataset.load_from_df(df[RATING_ATTR], reader)

    if isfile(PATH_COLL_FILTERING_CACHE):
        model = joblib.load(PATH_COLL_FILTERING_CACHE)
    else:
        trainset = data.build_full_trainset()
        model = KNNBaseline(sim_options=sim_options)
        model.fit(trainset)
        joblib.dump(model, PATH_COLL_FILTERING_CACHE)

    inn_id = model.trainset.to_inner_iid(user_m_ids[0])
    # print(self.get_movie_title(self.get_tmdb_id(user_m_ids[0])))
    inn_id_neigh = model.get_neighbors(inn_id, k=10)
    # print(inn_id_neigh)

    df_pref = pd.DataFrame(columns=[
        "title", "id", "vote_average", "vote_count",
        "popularity", "release_date",
    ])
    index = 0
    for m_id in inn_id_neigh:
        title_df = self.get_movie_title(
            self.get_tmdb_id(model.trainset.to_raw_iid(m_id)))
        try:
            if title_df[0] not in movies_watched:
                df_pref.loc[index] = array([
                    title_df[0], title_df[1], title_df[2],
                    title_df[3], title_df[4], title_df[5],
                ])
                index += 1
        except Exception:
            # Skip neighbours whose metadata lookup failed
            pass
    return df_pref
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid


# First, train the algorithm to compute the similarities between items
# data = Dataset.load_builtin('ml-100k')
file_path = 'ratings_android.dat'
reader = Reader(line_format='user item rating', rating_scale=(1, 5), sep=' ')
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': True}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)  # train() was removed from Surprise; fit() is the current API

# Read the mappings raw id <-> movie name
# rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie Toy Story
# toy_story_raw_id = name_to_rid['Toy Story (1995)']
# toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(1, k=10)

# Convert inner ids of the neighbors into names.
# toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
#                        for inner_id in toy_story_neighbors)
def best_pred():
    review['새주소'] = review['장소'] + "*" + review['주소']
    review2 = review.drop([
        '장소', '주소', '위도', '경도', '분류', '대분류', '주소1', '주소2',
        '방문횟수', '년도', '월', '계절'
    ], axis=1)
    review2 = review2[['이름', '새주소', '별점']]

    # Reduce the dimensionality of the dataset:
    # exclude places and users with too few ratings
    min_ratings = 50
    filter_review = review2['새주소'].value_counts() > min_ratings
    filter_review = filter_review[filter_review].index.tolist()

    min_user_ratings = 50
    filter_users = review2['이름'].value_counts() > min_user_ratings
    filter_users = filter_users[filter_users].index.tolist()

    review_new = review2[(review2['새주소'].isin(filter_review))
                         & (review2['이름'].isin(filter_users))]

    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(review_new[['이름', '새주소', '별점']], reader)

    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(),
            KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(),
            BaselineOnly(), CoClustering()
    ]:
        # Perform cross validation on the algorithm being iterated over
        results = cross_validate(algorithm, data, measures=['RMSE'], cv=3,
                                 verbose=False)
        trainset, testset = train_test_split(data, test_size=0.25)
        predictions = algorithm.fit(trainset).test(testset)
        # accuracy.rmse(predictions)

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm']))
        benchmark.append(tmp)

    surprise_results = pd.DataFrame(benchmark).set_index(
        'Algorithm').sort_values('test_rmse')

    # Train and Predict
    # CoClustering showed the best RMSE, so we train and predict with
    # CoClustering and will use alternating least squares (ALS)
    algo = NMF()
    cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

    # Use train_test_split() to sample a train set and a test set,
    # with RMSE as the accuracy measure.
    # fit() trains the algorithm on the train set, and test() returns the
    # predictions generated from the test set.
    trainset, testset = train_test_split(data, test_size=0.25)
    # algo = BaselineOnly(bsl_options=bsl_options)
    algo = NMF()
    predictions = algo.fit(trainset).test(testset)
    # dump.dump('./dump_file', predictions, algo)
    # predictions, algo = dump.load('./dump_file')

    trainset = algo.trainset

    # To inspect the predictions closely, build a DataFrame of all predictions
    def get_Iu(uid):
        try:
            return len(trainset.ur[trainset.to_inner_uid(uid)])
        except ValueError:  # user was not part of the trainset
            return 0

    def get_Ui(iid):
        try:
            return len(trainset.ir[trainset.to_inner_iid(iid)])
        except ValueError:  # item was not part of the trainset
            return 0

    df = pd.DataFrame(predictions,
                      columns=['uid', 'iid', 'rui', 'est', 'details'])
    df['Iu'] = df.uid.apply(get_Iu)
    df['Ui'] = df.iid.apply(get_Ui)
    df['err'] = abs(df.est - df.rui)

    predictions = df.sort_values(by='err').drop_duplicates('iid')
    best_predictions = predictions[:100]
    worst_predictions = predictions[-10:]
    # tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))

    best_predictions['iid'] = best_predictions.iid.str.split('*').str[0]
    sql = ("insert into rec(rec_uid, rec_iid, rec_rui, rec_est) "
           "values(:rec_uid, :rec_iid, :rec_rui, :rec_est)")
    data = best_predictions[['uid', 'iid', 'rui', 'est']]
    data.columns = ['rec_uid', 'rec_iid', 'rec_rui', 'rec_est']
    cursor.close()
    conn.close()
    return data
# sim_options = {'name': 'cosine', 'user_based': False}
# algo = KNNBaseline(k=10, sim_options=sim_options)
# cross_validate(algo, data, cv=5, verbose=True)

# Use GridSearchCV to find the best parameters
param_grid_knnbl = {'k': list(range(10, 50, 10)),
                    'sim_options': {'name': ['cosine', 'pearson'],
                                    'user_based': [False]}}
gs_knnbl = GridSearchCV(KNNBaseline, param_grid_knnbl,
                        measures=['rmse', 'mae'], cv=5)
gs_knnbl.fit(samp_dat)
algo_knnbl = gs_knnbl.best_estimator['rmse']
print(gs_knnbl.best_score['rmse'])
print(gs_knnbl.best_params['rmse'])

# Use the best parameters with the sampled training data
algo_knnbl = KNNBaseline(k=20, sim_options={'name': 'pearson',
                                            'user_based': False})
fit_rmse(algo_knnbl, samp_dat)
output(algo_knnbl, "KNNBaseline_k20_pearson_ii.csv")

#%%
# SVD
algo_svd = SVD()
fit_rmse(algo_svd, tr_dat)
output(algo_svd, "SVD.csv")

#%%
# SVDpp
algo_svdpp = SVDpp()
fit_rmse(algo_svdpp, tr_dat)
output(algo_svdpp, "SVDPP.csv")
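# `fit_rmse` and `output` are helpers the snippet above assumes but does not
# define; this is one plausible sketch of them written against the standard
# Surprise API (an assumption, not the original author's code).
from surprise import accuracy
from surprise.model_selection import train_test_split
import pandas as pd


def fit_rmse(algo, data):
    # Split the data, fit the algorithm, and report hold-out RMSE
    trainset, testset = train_test_split(data, test_size=0.25, random_state=1)
    algo.fit(trainset)
    accuracy.rmse(algo.test(testset), verbose=True)


def output(algo, filename):
    # Hypothetical: dump the fitted model's predictions on its own trainset
    # ratings to a CSV for later comparison
    preds = algo.test(algo.trainset.build_testset())
    pd.DataFrame(preds, columns=['uid', 'iid', 'rui', 'est', 'details']) \
        .to_csv(filename, index=False)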
# evaluate() was removed from Surprise; cross_validate is the replacement
from surprise import KNNBasic
from surprise.model_selection import cross_validate

algo = KNNBasic()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'])

### Collaborative filtering with means
from surprise import KNNWithMeans

algo = KNNWithMeans()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'])

### Collaborative filtering with baselines
from surprise import KNNBaseline

algo = KNNBaseline()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'])

### SVD
from surprise import SVD

algo = SVD()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'])

### SVD++
from surprise import SVDpp

algo = SVDpp()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'])

### NMF
pred1 = algo1.predict(uid, iid, verbose=True)

# KNNWithMeans
algo2 = KNNWithMeans(k=30, sim_options={
    'name': 'cosine',
    'user_based': False
}, verbose=True)
algo2.fit(trainset)
pred2 = algo2.predict(uid, iid, verbose=True)

# KNNWithZScore
algo3 = KNNWithZScore(k=30, sim_options={
    'name': 'MSD',
    'user_based': True
}, verbose=True)
algo3.fit(trainset)
pred3 = algo3.predict(uid, iid, verbose=True)

# KNNBaseline
algo4 = KNNBaseline(k=30, sim_options={
    'name': 'MSD',
    'user_based': True
}, verbose=True)
algo4.fit(trainset)
pred4 = algo4.predict(uid, iid, verbose=True)
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid


# First, train the algorithm to compute the similarities between items
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie
movie_raw_id = name_to_rid['Clockwork Orange, A (1971)']
movie_inner_id = algo.trainset.to_inner_iid(movie_raw_id)
movie_neighbors = algo.get_neighbors(movie_inner_id, k=10)

# Convert inner ids of the neighbors into names.
movie_neighbors = (algo.trainset.to_raw_iid(inner_id)
                   for inner_id in movie_neighbors)
movie_neighbors = (rid_to_name[rid] for rid in movie_neighbors)
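# The two generator expressions above are lazy; a short usage sketch (not in
# the original fragment) that materializes and prints them:
print('The 10 nearest neighbors of Clockwork Orange are:')
for movie in movie_neighbors:
    print(movie)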
class recsysBase:
    data = ''
    trainset = ''
    testset = ''
    algorithm = ''
    algo = ''
    predictions = ''

    def __init__(self, data, algorithm='svd', algo_options={}, testset_percent=0):
        if not data:
            return
        self.data = data
        self.algorithm = algorithm

        if testset_percent == 0:
            self.trainset = self.data.build_full_trainset()
            self.testset = self.trainset.build_anti_testset()
        else:
            self.trainset, self.testset = train_test_split(
                self.data, test_size=testset_percent)

        if self.algorithm == 'svd':
            self.algo = SVD()
        elif self.algorithm == 'knn_basic':
            self.algo = KNNBasic()
        elif self.algorithm == 'knn_baseline':
            if not algo_options:
                algo_options = {
                    'name': 'pearson_baseline',
                    'user_based': False
                }
            self.algo = KNNBaseline(sim_options=algo_options)

        self.algo.fit(self.trainset)

    def exec(self):
        self.step1()
        self.step2()
        self.step3()

    def step1(self):
        pass

    def step2(self):
        pass

    def step3(self):
        pass

    def compute_rmse(self):
        if not self.predictions:
            self.test()
        accuracy.rmse(self.predictions)

    def load_from_file(self, file_path='predictions.csv'):
        # Bug fix: the original referenced an undefined name `filepath`
        self.predictions = pd.read_csv(file_path)

    def save_to_file(self, file_path='predictions.csv'):
        # Bug fix: the original referenced an undefined name `algo`
        pd.DataFrame(self.predictions).to_csv(file_path, index=False)

    def benchmark(self):
        cross_validate(self.algo, self.data, measures=['RMSE', 'MAE'],
                       cv=5, verbose=True)

    def tune(self, opt_field='rmse',
             param_grid={
                 'n_epochs': [5, 10],
                 'lr_all': [0.002, 0.005],
                 'reg_all': [0.4, 0.6]
             },
             SHOW_RESULT=False):
        if self.algorithm == 'svd':
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

            # Start tuning
            gs.fit(self.data)

            # Save the best estimator to self.algo
            self.algo = gs.best_estimator[opt_field]
            self.algo.fit(self.trainset)

            if SHOW_RESULT:
                # best RMSE score
                print(gs.best_score['rmse'])
                # combination of parameters that gave the best RMSE score
                print(gs.best_params['rmse'])

        return self

    def tune_and_test(self, unbiased_percent=0.1, opt_field='rmse',
                      param_grid={
                          'n_epochs': [5, 10],
                          'lr_all': [0.001, 0.01]
                      }):
        # Get the raw ratings
        raw_ratings = self.data.raw_ratings

        # Shuffle the ratings
        random.shuffle(raw_ratings)

        # Split the raw ratings into a biased set A and a held-out set B
        threshold = int((1 - unbiased_percent) * len(raw_ratings))
        A_raw_ratings = raw_ratings[:threshold]
        B_raw_ratings = raw_ratings[threshold:]

        data = self.data
        data.raw_ratings = A_raw_ratings

        # Select your best algo with grid search.
        grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
        grid_search.fit(data)
        self.algo = grid_search.best_estimator[opt_field]

        # Retrain on the whole set A
        trainset = data.build_full_trainset()
        self.algo.fit(trainset)

        # Compute biased accuracy on A
        predictions = self.algo.test(trainset.build_testset())
        print('Biased accuracy on A,', end=' ')
        accuracy.rmse(predictions)

        # Compute unbiased accuracy on B
        testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
        predictions = self.algo.test(testset)
        print('Unbiased accuracy on B,', end=' ')
        accuracy.rmse(predictions)

        return self

    def test(self):
        self.predictions = self.algo.test(self.testset)
        self.compute_rmse()

    def get_top_n(self, target_uid=None, n=10, SHOW_RESULT=False):
        '''Return the top-N recommendation for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions,
                as returned by the test method of an algorithm.
            n(int): The number of recommendations to output for each user.
                Default is 10.

        Returns:
            A dict where keys are user (raw) ids and values are lists of
            tuples: [(raw item id, rating estimation), ...] of size n.
        '''
        if target_uid:
            target_uid = str(target_uid)

        # Compute the predictions if they do not exist yet
        if not self.predictions:
            self.predictions = self.algo.test(self.testset)

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in self.predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the n highest ones.
        for uid, user_ratings in top_n.items():
            if target_uid and target_uid != uid:
                continue
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            if target_uid:
                top_n = user_ratings[:n]
                break
            else:
                top_n[uid] = user_ratings[:n]

        # Print the recommended items for each user
        if SHOW_RESULT:
            try:
                for uid, user_ratings in top_n.items():
                    print(uid, [iid for (iid, _) in user_ratings])
            except AttributeError:
                # top_n is a plain list when target_uid was given
                print(top_n)

        return top_n

    def precision_recall_at_k(self, target_uid=1, threshold=3.5, k=10,
                              num_of_testset=5, SHOW_RESULT=True):
        # target_uid: user ID to report results for
        # threshold: the lower bound a rating must reach to count as relevant
        # k: number of top-ranked items considered recommended
        if target_uid:
            target_uid = str(target_uid)

        kf = KFold(n_splits=num_of_testset)
        final_precision = []
        final_recalls = []

        for trainset, testset in kf.split(self.data):
            self.algo.fit(trainset)
            predictions = self.algo.test(testset)

            # Return precision and recall at k metrics for each user.
            # First map the predictions to each user.
            user_est_true = defaultdict(list)
            for uid, _, true_r, est, _ in predictions:
                user_est_true[uid].append((est, true_r))

            precisions = dict()
            recalls = dict()
            for uid, user_ratings in user_est_true.items():
                # Sort user ratings by estimated value
                user_ratings.sort(key=lambda x: x[0], reverse=True)

                # Number of relevant items
                n_rel = sum((true_r >= threshold)
                            for (_, true_r) in user_ratings)

                # Number of recommended items in top k
                n_rec_k = sum((est >= threshold)
                              for (est, _) in user_ratings[:k])

                # Number of relevant and recommended items in top k
                n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                                      for (est, true_r) in user_ratings[:k])

                # Precision@K: proportion of recommended items that are relevant
                precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

                # Recall@K: proportion of relevant items that are recommended
                recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

            if SHOW_RESULT:
                print('Precision: ' + str(
                    sum(prec for prec in precisions.values()) / len(precisions)))
                print('Recall: ' +
                      str(sum(rec for rec in recalls.values()) / len(recalls)))

            # Bug fix: the original indexed with the stale loop variable `uid`
            final_precision.append(precisions[target_uid])
            final_recalls.append(recalls[target_uid])

        if SHOW_RESULT:
            print(final_precision, final_recalls)

        return final_precision, final_recalls

    def read_item_names(self, file_name=get_dataset_dir() + '/ml-100k/ml-100k/u.item'):
        """Read the u.item file from the MovieLens 100-k dataset and return two
        mappings to convert raw ids into movie names and movie names into raw ids.
""" rid_to_name = {} name_to_rid = {} with io.open(file_name, 'r', encoding='ISO-8859-1') as f: for line in f: line = line.split('|') rid_to_name[line[0]] = line[1] name_to_rid[line[1]] = line[0] return rid_to_name, name_to_rid def get_k_neighbors(self, name='Toy Story (1995)', k=10, SHOW_RESULT=True): ########################################### ## You need to use algorithm='knn_baseline' at the beginning ########################################### if self.algorithm != 'knn_baseline': self.__init__(data=self.data, algorithm='knn_baseline', testset_percent=0) ########################################### ########################################### ## Read the mappings raw id <-> movie name rid_to_name, name_to_rid = self.read_item_names() ## input_raw_id = name_to_rid[name] input_inner_id = self.algo.trainset.to_inner_iid(input_raw_id) ## Retrieve inner ids of the nearest neighbors of Toy Story. input_neighbors = self.algo.get_neighbors(input_inner_id, k=k) ## Convert inner ids of the neighbors into names. input_neighbors = (self.algo.trainset.to_raw_iid(inner_id) for inner_id in input_neighbors) input_neighbors = (rid_to_name[rid] for rid in input_neighbors) ## Show result if SHOW_RESULT: print('\nThe ' + str(k) + ' nearest neighbors of "' + name + '" are:') for neighbor in input_neighbors: print(neighbor) return input_neighbors
print("KNNWithZScore Results:", perf) print("-" * 118) ### 使用协同过滤正态分布 Item based from surprise import KNNWithZScore algo = KNNWithZScore(k=50, sim_options={'user_based': True, 'verbose': 'True'}) perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3) print("KNNWithZScore Results:", perf) print("-" * 118) ### 使用基础版协同过滤 from surprise import KNNBasic algo = KNNBasic(k=50, sim_options={'user_based': False, 'verbose': 'True'}) perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3) print("KNNBasic Results:", perf) print("-" * 118) ### 使用均值协同过滤 from surprise import KNNWithMeans algo = KNNWithMeans(k=50, sim_options={'user_based': False, 'verbose': 'True'}) perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3) print("KNNWithMeans Results:", perf) print("-" * 118) ### 使用协同过滤baseline from surprise import KNNBaseline algo = KNNBaseline(k=50, sim_options={'user_based': False, 'verbose': 'True'}) perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3) print("KNNBaseline Results:", perf) print("-" * 118)
'''
We build the model using KNNBasic, a collaborative filtering algorithm.
We set the minimum number of neighbours (min_k) to 1 and the maximum
number of neighbours (k) to 40, and train the model on the train set.
'''
algo2 = KNNBasic(sim_options=sim_options, k=40, min_k=1)
algo2.fit(trainset)
predictions2 = algo2.test(testset)
print("RMSE for KNNBasic:", accuracy.rmse(predictions2, verbose=True))

# In[ ]:

'''
We build the model using KNNBaseline, a collaborative filtering algorithm
that takes a baseline rating into account, with the same neighbourhood
settings as above, and train it on the train set.
'''
algo3 = KNNBaseline(sim_options=sim_options, k=40, min_k=1)
algo3.fit(trainset)
predictions3 = algo3.test(testset)
print("RMSE for KNNBaseline:", accuracy.rmse(predictions3, verbose=True))

# In[ ]:

'''
We build the model using KNNWithZScore, a collaborative filtering algorithm
with z-score normalization of each user's ratings, with the same
neighbourhood settings as above, and train it on the train set.
'''
algo4 = KNNWithZScore(sim_options=sim_options, k=40, min_k=1)
algo4.fit(trainset)
predictions4 = algo4.test(testset)
print("RMSE for KNNWithZScore:", accuracy.rmse(predictions4, verbose=True))
# 2. Set up cross-validation (data.split() and evaluate() were removed from
#    Surprise; cross_validate from surprise.model_selection replaces them)

# 3. Build the model object
bsl_options = {
    'method': 'als',   # solver for the baselines, either 'als' or 'sgd'
    'n_epochs': 10,    # number of iterations
    'reg_i': 20,       # item regularization term
    'reg_u': 10        # user regularization term
}
"""
k=40:     number of neighbours used when producing a prediction
min_k=1:  minimum number of neighbouring users/items required for a prediction
sim_options={}: how the similarity matrix is computed
"""
sim_options = {
    'name': 'pearson_baseline',  # similarity measure: pearson / msd / cosine / pearson_baseline
    'user_based': True           # user-based vs. item-based collaborative filtering
}
algo = KNNBaseline(k=40, min_k=1, sim_options=sim_options,
                   bsl_options=bsl_options)
# algo = KNNBasic(sim_options=sim_options)

# 4. Evaluate the model
# RMSE, MAE, and FCP (fraction of concordant pairs)
cross_validate(algo, data, measures=['rmse', 'mae', 'fcp'], cv=5, verbose=True)
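# A short follow-on sketch (an assumption, not part of the original snippet):
# after cross-validation, fit on the full trainset and score one user/item
# pair. The raw ids '196' and '302' are illustrative placeholders.
trainset = data.build_full_trainset()
algo.fit(trainset)
pred = algo.predict('196', '302', verbose=True)  # returns a Prediction namedtuple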
def getRecommendations(self, IDUser, method=9, similarityMeasure=1, isUserBased="Yes"):
    conn = sqlite3.connect(DATABASE_NAME)
    df = pd.read_sql_query(
        "SELECT userID, glassID, relativeRating FROM ratings", conn)

    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        df[['userID', 'glassID', 'relativeRating']], reader)
    trainset = data.build_full_trainset()

    isUserBased = True if (isUserBased == "Yes") else False
    if similarityMeasure == 1:
        similarityMeasure = "cosine"
    elif similarityMeasure == 2:
        similarityMeasure = "pearson"
    else:
        similarityMeasure = "pearson_baseline"

    sim_options = {'name': similarityMeasure, 'user_based': isUserBased}

    if method == 1:
        algo = SVD()
    elif method == 2:
        algo = SlopeOne()
    elif method == 3:
        algo = NMF()
    elif method == 4:
        algo = NormalPredictor()
    elif method == 5:
        algo = KNNBaseline(sim_options=sim_options)
    elif method == 6:
        algo = KNNBasic(sim_options=sim_options)
    elif method == 7:
        algo = KNNWithMeans(sim_options=sim_options)
    elif method == 8:
        algo = KNNWithZScore(sim_options=sim_options)
    elif method == 9:
        algo = BaselineOnly()
    else:
        algo = CoClustering()

    algo.fit(trainset)

    predictions = pd.DataFrame(columns=['glassID', 'estimatedRating'])
    totalGlass = df['glassID'].max()
    glassPivot = df.pivot_table(index='glassID', columns='userID',
                                values='relativeRating')

    # Predict a rating for every glass the user has not rated yet
    for iid in range(1, totalGlass + 1):
        isNan = True
        try:
            isNan = pd.isna(glassPivot.loc[iid, IDUser])
        except KeyError:
            # glass id not present in the pivot table
            continue
        if isNan:
            prediction = algo.predict(IDUser, iid, verbose=False)
            predictions = predictions.append(
                pd.DataFrame([[iid, prediction[3]]],
                             columns=predictions.columns))

    predictions = predictions.sort_values('estimatedRating', ascending=False)
    recommendationList = predictions[
        predictions['estimatedRating'] > 3]['glassID'].head(50).tolist()

    conn.close()
    return recommendationList
# Search for the neighbourhood size with the lowest mean RMSE
# (min_mean and optimal_k are assumed to start unset before this fragment)
min_mean = float('inf')
optimal_k = None
for k in (10, 20, 30, 40, 50, 60, 70, 80, 90, 100):
    algo = KNNWithMeans(sim_options=sim_options, k=k)
    x = cross_validate(algo, data, verbose=True)
    cur_mean = np.mean(x['test_rmse'])
    if cur_mean < min_mean:
        min_mean = cur_mean
        optimal_k = k
    print("current optimal K={} min mean={}".format(optimal_k, min_mean))

data = Dataset.load_from_file(file_path, reader=reader)

benchmark = []
for algorithm in [SVD(),
                  KNNBaseline(k=60, sim_options={'name': 'cosine', 'user_based': True}),
                  KNNBasic(),
                  KNNWithMeans()]:
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                               index=['Algorithm']))
    benchmark.append(tmp)
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

benchmark = []
for algorithm in [SVD(),
                  KNNBaseline(sim_options={'name': 'pearson', 'user_based': True}),
                  KNNBasic(k=30, sim_options={'name': 'pearson', 'user_based': True}),
                  KNNWithMeans(k=60, sim_options={'name': 'pearson', 'user_based': True})]:
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                               index=['Algorithm']))
    benchmark.append(tmp)
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
if not args.train:
    args.train = 'datasets/' + args.dataset + '/train.dat'
if not args.test:
    args.test = 'datasets/' + args.dataset + '/test.dat'
if not args.validation:
    args.validation = 'datasets/' + args.dataset + '/val.dat'

sim_options = {
    'name': 'cosine',
    'user_based': False  # compute similarities between items
}

if rec == 'ItemKNN':
    algorithm = [KNNBaseline(sim_options=sim_options)]
    name = ['ItemKNN']
elif rec == 'SVD':
    algorithm = [SVD()]
    name = ['SVD']
elif rec == 'NMF':
    algorithm = [NMF()]
    name = ['NMF']
else:
    algorithm = [KNNBaseline(sim_options=sim_options), SVD(), NMF()]
    name = ['ItemKNN', 'SVD', 'NMF']

# initialize evaluator
        if int(line[1]) in uid:
            uid[int(line[1])].update({int(line[0]): int(line[2])})
        else:
            uid[int(line[1])] = {int(line[0]): int(line[2])}
    # print "done!"
    return uid


file_path = os.path.expanduser('~') + "/Downloads/CSC522/toy/sample.data"
reader = Reader(line_format='item user rating timestamp', sep=',')
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()

sim_options = {'name': 'pearson_baseline', 'user_based': True}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)  # train() was removed from Surprise; use fit()

user_id = '911'
user_inner_id = algo.trainset.to_inner_uid(user_id)
user_neighbors = algo.get_neighbors(user_inner_id, k=22)
user_neighbors = (algo.trainset.to_raw_uid(inner_id)
                  for inner_id in user_neighbors)
print()
print('The 22 nearest neighbors of the userid %s are:' % user_id)
for userid in user_neighbors:
    print(userid)

# evaluate() and data.split() were removed from Surprise;
# cross_validate over 60 folds is the current equivalent
perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=60, verbose=True)
print(perf)
def collaborative_filtering(raw_uid):
    # =============== Data preprocessing ===============
    # Read all ratings from the database and write them to a file
    # dir_data = '/www/wwwroot/music_recommender/page/cf_recommendation/cf_data'
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    if not os.path.exists(dir_data):
        os.makedirs(dir_data)

    # Open the database connection
    db = pymysql.connect("localhost", "music_system", "music_system",
                         "music_recommender", charset='utf8')
    # Create a cursor object with cursor()
    cursor = db.cursor()
    sql = """SELECT uid, song_id, rating FROM user_rating WHERE 1"""
    cursor.execute(sql)
    results = cursor.fetchall()
    with open(file_path, "w+") as data_f:
        for result in results:
            uid, song_id, rating = result
            data_f.writelines("{}\t{}\t{}\n".format(uid, song_id, rating))

    if not os.path.exists(file_path):
        raise IOError("Dataset file does not exist!")

    # =========== CF recommendation ==================
    # Load the data
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)
    # Build the training set from all the data
    trainset = data.build_full_trainset()

    # ================= BaselineOnly ==================
    # start = time.clock()
    bsl_options = {
        'method': 'sgd',
        'learning_rate': 0.0005,
    }
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)

    # Get recommendation results
    rset = user_build_anti_testset(trainset, raw_uid)
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=5)
    # end = time.clock()
    # print("user-50NN --- BaselineOnly elapsed: %.2fs\n" % (end - start))
    # print("BaselineOnly recommendations: {}\n".format(top_n_baselineonly))

    # ================= KNNBasic ==================
    sim_options = {'name': 'pearson', 'user_based': True}
    algo_KNNBasic = KNNBasic(sim_options=sim_options)
    algo_KNNBasic.fit(trainset)

    # Get recommendation results --- only considering the knn users
    # start = time.clock()
    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBasic.test(knn_anti_set)
    top_n_knnbasic = get_top_n(predictions, n=5)
    # end = time.clock()
    # print("user-50NN --- KNNBasic elapsed: %.2fs\n" % (end - start))
    # print("KNNBasic recommendations: {}\n".format(top_n_knnbasic))

    # ================= KNNBaseline ==================
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo_KNNBaseline = KNNBaseline(sim_options=sim_options)
    algo_KNNBaseline.fit(trainset)

    # Get recommendation results --- only considering the knn users
    # start = time.clock()
    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBaseline.test(knn_anti_set)
    top_n_knnbaseline = get_top_n(predictions, n=5)
    # end = time.clock()
    # print("user-50NN --- KNNBaseline elapsed: %.2fs\n" % (end - start))
    # print("KNNBaseline recommendations: {}\n".format(top_n_knnbaseline))

    # =============== Merge the recommendation results ==================
    recommendset = set()
    for results in [top_n_baselineonly, top_n_knnbasic, top_n_knnbaseline]:
        for key in results.keys():
            for recommendations in results[key]:
                iid, rating = recommendations
                recommendset.add(iid)

    items_baselineonly = set()
    for key in top_n_baselineonly.keys():
        for recommendations in top_n_baselineonly[key]:
            iid, rating = recommendations
            items_baselineonly.add(iid)

    items_knnbasic = set()
    for key in top_n_knnbasic.keys():
        for recommendations in top_n_knnbasic[key]:
            iid, rating = recommendations
            items_knnbasic.add(iid)

    items_knnbaseline = set()
    for key in top_n_knnbaseline.keys():
        for recommendations in top_n_knnbaseline[key]:
            iid, rating = recommendations
            items_knnbaseline.add(iid)

    rank = dict()
    for recommendation in recommendset:
        if recommendation not in rank:
            rank[recommendation] = 0
        if recommendation in items_baselineonly:
            rank[recommendation] += 1
        if recommendation in items_knnbasic:
            rank[recommendation] += 1
        if recommendation in items_knnbaseline:
            rank[recommendation] += 1

    # Bug fix: max(rank, key=...) returns the item id with the highest vote
    # count, not the count itself, so compare its count to 1
    best_item = max(rank, key=lambda s: rank[s])
    if rank[best_item] == 1:
        # No item was recommended by more than one algorithm
        # print(items_baselineonly)
        return items_baselineonly
    else:
        result = nlargest(5, rank, key=lambda s: rank[s])
        # print(result)
        return result
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File  : KNN_baseline_movie.py
# Author: WangYu
# Date  : 2020-04-12
from surprise import KNNWithMeans
from surprise import Dataset, Reader
from surprise import KNNBaseline
from surprise.model_selection import KFold
from surprise import accuracy

# Read the data
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)

KF = KFold(n_splits=3)
# Note: verbose is a constructor argument, not a similarity option
algo = KNNBaseline(k=50, sim_options={'user_based': False}, verbose=True)
for train, test in KF.split(data):
    algo.fit(train)
    predictions = algo.test(test)
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)
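# A small extension sketch (not in the original script): accumulate the
# per-fold scores returned by accuracy.rmse()/accuracy.mae() and report
# their means over the same KFold splits.
import numpy as np

rmses, maes = [], []
for train, test in KF.split(data):
    algo.fit(train)
    predictions = algo.test(test)
    rmses.append(accuracy.rmse(predictions, verbose=False))
    maes.append(accuracy.mae(predictions, verbose=False))
print("mean RMSE over %d folds: %.4f" % (len(rmses), np.mean(rmses)))
print("mean MAE  over %d folds: %.4f" % (len(maes), np.mean(maes)))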
class EvaluationData:
    def __init__(self, data, popularityRankings, doTopN: bool):
        self.rankings = popularityRankings
        self.data = data

        self.fullTrainSet = data.build_full_trainset()
        # And build an anti-test-set for building predictions
        self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()

        # Build a 75/25 train/test split for measuring accuracy
        self.trainSet, self.testSet = train_test_split(data, test_size=.25,
                                                       random_state=1)

        if doTopN:
            # Build a full training set for evaluating overall properties
            # Build a "leave one out" train/test split for evaluating top-N recommenders
            LOOCV = LeaveOneOut(n_splits=1, random_state=1)
            for train, test in LOOCV.split(data):
                self.LOOCVTrain = train
                self.LOOCVTest = test
            self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()

            # Compute similarity matrix between items so we can measure diversity
            sim_options = {'name': 'cosine', 'user_based': False}
            self.simsAlgo = KNNBaseline(sim_options=sim_options)
            self.simsAlgo.fit(self.fullTrainSet)

    def GetFullTrainSet(self):
        return self.fullTrainSet

    def GetFullAntiTestSet(self):
        return self.fullAntiTestSet

    def GetAntiTestSetForUser(self, testSubject):
        trainset = self.fullTrainSet
        fill = trainset.global_mean
        anti_testset = []
        u = trainset.to_inner_uid(str(testSubject))
        user_items = set([j for (j, _) in trainset.ur[u]])
        anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill)
                         for i in trainset.all_items() if i not in user_items]
        return anti_testset

    def GetTrainSet(self):
        return self.trainSet

    def GetTestSet(self):
        return self.testSet

    def GetLOOCVTrainSet(self):
        return self.LOOCVTrain

    def GetLOOCVTestSet(self):
        return self.LOOCVTest

    def GetLOOCVAntiTestSet(self):
        return self.LOOCVAntiTestSet

    def GetSimilarities(self):
        return self.simsAlgo

    def GetPopularityRankings(self):
        return self.rankings
class MovieRecommender:
    def __init__(self):
        self._knn = None
        self._nmf = None
        self._trainset = None
        self._predictions = None
        self.initialized = False

    def initialize(self, data_filepath):
        self._data = Dataset.load_from_file(data_filepath,
                                            reader=Reader('ml-100k'))
        self._trainset = self._data.build_full_trainset()
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        self._knn = KNNBaseline(sim_options=sim_options)
        self._nmf = NMF()
        start_new_thread(self._train)

    def get_similar_movies(self, movie_id, k=10):
        if not self.initialized:
            return []
        model = self._knn
        movie_inner_id = model.trainset.to_inner_iid(movie_id)
        similar_movie_inner_ids = model.get_neighbors(movie_inner_id, k=k)
        to_raw_iid = model.trainset.to_raw_iid
        similar_movie_ids = (to_raw_iid(inner_id)
                             for inner_id in similar_movie_inner_ids)
        movie_ids = [
            similar_movie_id.encode('ascii')
            for similar_movie_id in similar_movie_ids
        ]
        return movie_dataset.get_movies(movie_ids)

    def get_similar_movies_for_user(self, user_id, num_movies=10):
        if not self.initialized:
            return []
        user_id = str(user_id)
        user_predictions = [
            prediction for prediction in self._predictions
            if prediction[0] == user_id
        ]
        sorted_predictions = sorted(user_predictions, key=lambda x: x.est,
                                    reverse=True)
        top_n_predictions = sorted_predictions[:num_movies]
        similar_movie_ids = (prediction.iid
                             for prediction in top_n_predictions)
        movie_ids = [
            similar_movie_id.encode('ascii')
            for similar_movie_id in similar_movie_ids
        ]
        return movie_dataset.get_movies(movie_ids)

    def update_user_ratings(self, user_id, movie_id, rating):
        if not self.initialized:
            return
        rating = float(rating)
        has_previous_rating = False
        if self._trainset.knows_user(user_id):
            trainset_dict = dict(self._trainset.ur[user_id])
            has_previous_rating = movie_id in trainset_dict
        user_id = str(user_id)
        movie_id = str(movie_id)
        new_rating = (user_id, movie_id, rating, time())
        if has_previous_rating:
            for i, rating in enumerate(self._data.raw_ratings):
                if rating[0] == user_id and rating[1] == movie_id:
                    self._data.raw_ratings[i] = new_rating
                    break
        else:
            self._data.raw_ratings.append(new_rating)
        self._trainset = self._data.build_full_trainset()
        self._train()

    def _train(self):
        # fit() replaces the train() method removed from current Surprise
        self._nmf.fit(self._trainset)
        self._knn.fit(self._trainset)
        self._predictions = self._nmf.test(self._trainset.build_anti_testset())
        self.initialized = True
    file_name = df
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid


# First, train the algorithm to compute the similarities between items
trainset = df.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# ********************* Movies recommended for a movie **********
# Retrieve inner id of the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
def get_top_n(predictions, n=5):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, round(est, 3)))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n


user_id = input('User ID: ')
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'cosine', 'user_based': True, 'min_support': 5}
algo = KNNBaseline(k=4, sim_options=sim_options)
algo.fit(trainset)

# Predict only the items the given user has not rated yet
testset = trainset.build_anti_testset()
testset = filter(lambda x: x[0] == user_id, testset)
predictions = algo.test(testset)
top_n = get_top_n(predictions)

rid_to_name = read_item_names()
print('User ' + user_id)
for movie_rid, rating in top_n[user_id]:
    print('{:4s} {:70s} {}'.format(movie_rid, str(rid_to_name[movie_rid]), rating))
class KnnRecom:
    def __init__(self):
        pass

    def load_data(self):
        # Load the raw .csv ratings data
        reader = Reader(line_format='user item rating timestamp', sep=',',
                        skip_lines=1)
        data = Dataset.load_from_file(
            file_path='../' + Config().config['DATAPATH']['ratings_path'],
            reader=reader)
        # Build the training set
        self.trainset = data.build_full_trainset()
        logger.info('Dataset built successfully')

    def get_neighbors(self, movie_id_raw, n=10):
        movie_id_inner = self.trainset.to_inner_iid(movie_id_raw)
        movies_inner_id = self.algo.get_neighbors(movie_id_inner, k=n)
        movies_raw_id = [
            self.trainset.to_raw_iid(inner_id)
            for inner_id in movies_inner_id
        ]
        return movies_raw_id

    def fit(self):
        # Train the model
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        self.algo = KNNBaseline(sim_options=sim_options)
        self.algo.fit(trainset=self.trainset)

    def save_model(self, file_path):
        dump.dump(file_path, algo=self.algo)

    def tosql(self, n, database, tablename):
        '''
        Table name:    knn_predictions
        Table columns: ['movieId', 'k_nearest_neighbors']

        Args:
            n: number of nearest neighbors
            database: name of the database stored in MySQL
            tablename: name of the table stored in MySQL

        Returns:
            None
        '''
        data = []
        for iid in self.trainset.all_items():
            movie_rawid = self.trainset.to_raw_iid(iid)
            movies_inner_id = self.algo.get_neighbors(iid, k=n)
            data.append([
                movie_rawid,
                ','.join([(self.trainset.to_raw_iid(inner_id))
                          for inner_id in movies_inner_id])
            ])
        df = pd.DataFrame(data=data)
        df.columns = ['movieId', 'k_nearest_neighbors']

        BASE_DIR = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        knn_predictions = os.path.join(BASE_DIR, 'data/knn_predictions.csv')
        df.to_csv(knn_predictions, index=False)
        dftosql(DataFrame=df, database=database, table_name=tablename,
                if_exists='replace')

    def load_model(self, file_path):
        predictions, loaded_algo = dump.load(file_path)
        return predictions, loaded_algo
    return precisions, recalls, f1scores


# Load the dataset
data_path = abspath("../../../resources/ml-100k/i.data")

# Set the rating range when loading in the dataset
reader = Reader(line_format='user item rating timestamp', sep='\t',
                rating_scale=(0, 1))

# Load the dataset
data = Dataset.load_from_file(data_path, reader=reader)

# Calculate RMSE and MAE
for algo in [KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore()]:
    # Run 5-fold cross-validation and print results.
    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

for algo in [KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore()]:
    # Calculate precision, recall, and F1 score
    kf = KFold(n_splits=5)
    fold_count = 1
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        precisions, recalls, f1scores = precision_recall_at_k(predictions,
                                                              k=5,
                                                              threshold=4)
        # Precision and recall can then be averaged over all users,
        # as sketched below
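        # A minimal averaging sketch (an assumption, not the original code):
        # it presumes precisions/recalls/f1scores are dicts keyed by user id,
        # as returned by the precision_recall_at_k() helper above.
        avg_p = sum(precisions.values()) / len(precisions)
        avg_r = sum(recalls.values()) / len(recalls)
        avg_f1 = sum(f1scores.values()) / len(f1scores)
        print("fold %d: P@5=%.3f R@5=%.3f F1=%.3f"
              % (fold_count, avg_p, avg_r, avg_f1))
        fold_count += 1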
    def fit(self):
        # Train the model
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        self.algo = KNNBaseline(sim_options=sim_options)
        self.algo.fit(trainset=self.trainset)
from surprise import accuracy
import os

reader = Reader(line_format='user item rating', sep=',', skip_lines=3,
                rating_scale=(1, 5))
custom_dataset_path = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), 'ratings.csv')
print("Using: " + custom_dataset_path)

data = Dataset.load_from_file(file_path=custom_dataset_path, reader=reader)
trainingSet = data.build_full_trainset()

sim_options = {
    'name': 'pearson_baseline',
    'shrinkage': 0  # no shrinkage
}
knn = KNNBaseline(sim_options=sim_options)
knn.fit(trainingSet)

# Note: this evaluates on the same ratings the model was trained on,
# so the RMSE/MAE below are biased (training) accuracy
testSet = trainingSet.build_testset()
predictions = knn.test(testSet)
accuracy.rmse(predictions, verbose=True)
accuracy.mae(predictions, verbose=True)
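# For an unbiased estimate (a sketch, not part of the original script), hold
# out 25% of the ratings and evaluate on the unseen split instead:
from surprise.model_selection import train_test_split

trainSet, holdOutSet = train_test_split(data, test_size=.25, random_state=1)
knn_eval = KNNBaseline(sim_options=sim_options)
knn_eval.fit(trainSet)
accuracy.rmse(knn_eval.test(holdOutSet), verbose=True)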
    def __init__(self, module_type, baseline_type, cf_type, similar, sim_type, params):
        assert baseline_type in {"ALS", "SGD", "default"}
        assert cf_type in {None, "base_user", "base_item"}
        assert similar in {None, "COSINE", "cosine", "MSD", "msd", "PEARSON",
                           "pearson", "PEARSON_BASELINE", "pearson_baseline",
                           "JACCARD", "jaccard", "EUCLIDEAN", "euclidean"}
        assert sim_type in {None, "default"}

        self.module_type = module_type
        self.baseline_type = baseline_type
        self.cf_type = cf_type
        self.similar = similar
        self.sim_type = sim_type
        self.bu = None
        self.bi = None
        self.sim = None

        if self.baseline_type == "ALS":
            bsl_options = {'method': params["bsl_options"].get("method", 'als'),
                           'n_epochs': params["bsl_options"].get("n_epochs", 10),
                           'reg_u': params["bsl_options"].get("reg_u", 15),
                           'reg_i': params["bsl_options"].get("reg_i", 10)
                           }
        elif self.baseline_type == "SGD":
            bsl_options = {'method': params["bsl_options"].get("method", 'sgd'),
                           'n_epochs': params["bsl_options"].get("n_epochs", 20),
                           'reg': params["bsl_options"].get("reg", 0.02),
                           'learning_rate': params["bsl_options"].get("learning_rate", 0.005)
                           }
        else:
            # Use the defaults
            bsl_options = {}

        params["sim_options"] = {}
        if self.cf_type == "base_user":
            params["sim_options"]["user_based"] = True
        elif self.cf_type == "base_item":
            # Bug fix: the original set a nonexistent "item_based" key here;
            # item-based CF is expressed as user_based=False in Surprise
            params["sim_options"]["user_based"] = False
        else:
            params["sim_options"]["user_based"] = True

        # Note: Surprise only ships cosine, msd, pearson, and pearson_baseline
        # similarities; "jaccard" and "euclidean" would require custom code
        if self.similar == "COSINE" or self.similar == "cosine":
            params["sim_options"]["name"] = "cosine"
        elif self.similar == "MSD" or self.similar == "msd":
            params["sim_options"]["name"] = "msd"
        elif self.similar == "PEARSON" or self.similar == "pearson":
            params["sim_options"]["name"] = "pearson"
        elif self.similar == "PEARSON_BASELINE" or self.similar == "pearson_baseline":
            params["sim_options"]["name"] = "pearson_baseline"
        elif self.similar == "JACCARD" or self.similar == "jaccard":
            params["sim_options"]["name"] = "jaccard"
        elif self.similar == "EUCLIDEAN" or self.similar == "euclidean":
            params["sim_options"]["name"] = "euclidean"
        else:
            params["sim_options"]["name"] = "msd"

        if self.sim_type == "default":
            sim_options = {}
        else:
            sim_options = {"name": params["sim_options"].get("name", "MSD"),
                           "user_based": params["sim_options"].get("user_based", True),
                           "min_support": params["sim_options"].get("min_support", 5),
                           "shrinkage": params["sim_options"].get("shrinkage", 100)
                           }
        """
        'name':        which similarity to use, as defined in the similarities
                       module. Default is 'MSD'.
        'user_based':  whether similarities are computed between users or
                       between items. This has a huge impact on the performance
                       of a prediction algorithm. Default is True.
        'min_support': the minimum number of common items (when 'user_based'
                       is True) or common users (when 'user_based' is False)
                       for the similarity not to be zero. Put simply, if
                       |Iuv| < min_support then sim(u, v) = 0. The same goes
                       for items.
        'shrinkage':   shrinkage parameter for the pearson_baseline similarity.
        """

        if self.module_type == "KNNmeans":
            # Like KNNBasic, but takes the mean rating of each user or item into account
            self.model = KNNWithMeans(k=params.get("k", 40),
                                      min_k=params.get("min_k", 1),
                                      sim_options=sim_options,
                                      verbose=params.get("verbose", True))
        elif self.module_type == "KNNzscore":
            # Adds z-score normalization on top of KNNWithMeans
            self.model = KNNWithZScore(k=params.get("k", 40),
                                       min_k=params.get("min_k", 1),
                                       sim_options=sim_options,
                                       verbose=params.get("verbose", True))
        elif self.module_type == "KNNbase":
            # Differs from KNNWithMeans in that it uses baselines rather than means
            self.model = KNNBaseline(k=params.get("k", 40),
                                     min_k=params.get("min_k", 1),  # minimum number of neighbours
                                     sim_options=sim_options,
                                     bsl_options=bsl_options,
                                     verbose=params.get("verbose", True))
        elif self.module_type == "KNNbasic":
            # The most basic KNN algorithm; can be user-based or item-based
            self.model = KNNBasic(k=params.get("k", 40),
                                  min_k=params.get("min_k", 1),
                                  sim_options=sim_options,
                                  verbose=params.get("verbose", True))
        elif self.module_type == "SVD":
            self.model = SVD(
                n_factors=params.get("n_factors", 100),
                n_epochs=params.get("n_epochs", 20),
                init_mean=params.get("init_mean", 0),
                init_std_dev=params.get("init_std_dev", 0.1),
                lr_all=params.get("lr_all", 0.005),
                reg_all=params.get("reg_all", 0.02),
                lr_bu=params.get("lr_bu", None),
                lr_bi=params.get("lr_bi", None),
                lr_pu=params.get("lr_pu", None),
                lr_qi=params.get("lr_qi", None),
                reg_bu=params.get("reg_bu", None),
                reg_bi=params.get("reg_bi", None),
                reg_pu=params.get("reg_pu", None),
                reg_qi=params.get("reg_qi", None),
                random_state=params.get("random_state", None),
                verbose=params.get("verbose", False))
            """
            n_factors     – number of factors. Default is 100.
            n_epochs      – number of iterations of the SGD procedure. Default is 20.
            biased (bool) – whether to use baselines (biases). See the note above. Default is True.
            init_mean     – mean of the normal distribution used to initialize the factor vectors. Default is 0.
            init_std_dev  – standard deviation of that normal distribution. Default is 0.1.
            lr_all        – learning rate for all parameters. Default is 0.005.
            reg_all       – regularization term for all parameters. Default is 0.02.
            lr_bu/lr_bi/lr_pu/lr_qi     – per-parameter learning rates; take precedence
                                          over lr_all if set. Default is None.
            reg_bu/reg_bi/reg_pu/reg_qi – per-parameter regularization terms; take
                                          precedence over reg_all if set. Default is None.
            random_state  – int seed, numpy RandomState instance, or None; determines
                            the RNG used for initialization, which makes repeated
                            fit() calls reproducible. Default is None.
            verbose       – if True, print the current epoch. Default is False.
            """
        elif self.module_type == "SVDpp":
            self.model = SVDpp(n_factors=params.get("n_factors", 100),
                               n_epochs=params.get("n_epochs", 20),
                               init_mean=params.get("init_mean", 0),
                               init_std_dev=params.get("init_std_dev", 0.1),
                               lr_all=params.get("lr_all", 0.005),
                               reg_all=params.get("reg_all", 0.02),
                               lr_bu=params.get("lr_bu", None),
                               lr_bi=params.get("lr_bi", None),
                               lr_pu=params.get("lr_pu", None),
                               lr_qi=params.get("lr_qi", None),
                               reg_bu=params.get("reg_bu", None),
                               reg_bi=params.get("reg_bi", None),
                               reg_pu=params.get("reg_pu", None),
                               reg_qi=params.get("reg_qi", None),
                               random_state=params.get("random_state", None),
                               verbose=params.get("verbose", False))
            """
            Same parameters as SVD, except:
            n_factors – number of factors. Default is 20.
            lr_all    – learning rate for all parameters. Default is 0.007.
            lr_yj     – learning rate for yj; takes precedence over lr_all if set. Default is None.
            reg_yj    – regularization term for yj; takes precedence over reg_all if set. Default is None.
            """
        elif self.module_type == "NMF":
            # Non-negative matrix factorization: both the p and q matrices
            # are constrained to be non-negative.
            # Bug fix: the original passed SVD-style arguments (init_mean,
            # lr_all, reg_all, ...) that NMF() does not accept.
            self.model = NMF(n_factors=params.get("n_factors", 15),
                             n_epochs=params.get("n_epochs", 50),
                             biased=params.get("biased", False),
                             reg_pu=params.get("reg_pu", 0.06),
                             reg_qi=params.get("reg_qi", 0.06),
                             reg_bu=params.get("reg_bu", 0.02),
                             reg_bi=params.get("reg_bi", 0.02),
                             lr_bu=params.get("lr_bu", 0.005),
                             lr_bi=params.get("lr_bi", 0.005),
                             init_low=params.get("init_low", 0),
                             init_high=params.get("init_high", 1),
                             random_state=params.get("random_state", None),
                             verbose=params.get("verbose", False))
            """
            n_factors     – number of factors. Default is 15.
            n_epochs      – number of iterations of the SGD procedure. Default is 50.
            biased (bool) – whether to use baselines (biases). Default is False.
            reg_pu        – regularization term for users, λu. Default is 0.06.
            reg_qi        – regularization term for items, λi. Default is 0.06.
            reg_bu        – regularization term for bu; only relevant for the biased version. Default is 0.02.
            reg_bi        – regularization term for bi; only relevant for the biased version. Default is 0.02.
            lr_bu         – learning rate for bu; only relevant for the biased version. Default is 0.005.
            lr_bi         – learning rate for bi; only relevant for the biased version. Default is 0.005.
            init_low      – lower bound for the random initialization of the factors;
                            must be greater than or equal to 0 to ensure non-negativity. Default is 0.
            init_high     – upper bound for the random initialization of the factors. Default is 1.
            random_state  – int seed, numpy RandomState instance, or None. Default is None.
            verbose       – if True, print the current epoch. Default is False.
            """
        elif self.module_type == "SlopeOne":
            self.model = SlopeOne(**params)
        elif self.module_type == "cc":
            # Clustering-based collaborative filtering
            self.model = CoClustering(n_cltr_u=params.get("n_cltr_u", 3),
                                      n_cltr_i=params.get("n_cltr_i", 3),
                                      n_epochs=params.get("n_epochs", 20),
                                      random_state=params.get("random_state", None),
                                      verbose=params.get("verbose", False))
            """
            n_cltr_u (int) – number of user clusters. Default is 3.
            n_cltr_i (int) – number of item clusters. Default is 3.
            n_epochs (int) – number of iterations of the optimization loop. Default is 20.
            random_state   – int seed, numpy RandomState instance, or None. Default is None.
            verbose (bool) – if True, print the current epoch. Default is False.
            """
        elif self.module_type == "BaselineOnly":
            # Predicts the baseline estimate, ignoring the user's personal preferences
            self.model = BaselineOnly(bsl_options=bsl_options, verbose=True)
        elif self.module_type == "Np":
            # Random prediction: assumes the test ratings follow a normal
            # distribution and draws random predictions from it
            self.model = NormalPredictor()
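# A hypothetical usage sketch; the enclosing class name is not shown in the
# fragment above, so `SurpriseWrapper` is an assumed stand-in:
#
# wrapper = SurpriseWrapper(module_type="KNNbase", baseline_type="ALS",
#                           cf_type="base_item", similar="pearson_baseline",
#                           sim_type=None, params={"bsl_options": {}, "k": 40})
# wrapper.model.fit(trainset)  # wrapper.model is a ready-to-fit Surprise algorithm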
    for j in result:
        if 'film' in j['description']:
            q.append((query, j['id']))
            break
    return q


k = 4
top_n = 5
user = input('Enter user ID:')

# Load the built-in dataset
data = Dataset.load_builtin('ml-100k')
# Build the full training set
train = data.build_full_trainset()

# Use the algorithm for prediction
algorithm = KNNBaseline(k, sim_options={'name': 'cosine', 'min_support': 5})
algorithm.fit(train)


# Read the item file into a dictionary
def read():
    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_name = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as file:
        for line in file:
            line = line.split('|')
            rid_name[line[0]] = (line[1], line[2])
    return rid_name


# Evaluation with the best parameters (test)
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid


# First, train the algorithm to compute the similarities between items
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
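# A closing usage sketch mirroring the Surprise FAQ example this fragment is
# based on; the final name-mapping line is an assumption completing the chain:
toy_story_neighbors = (rid_to_name[rid] for rid in toy_story_neighbors)
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)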