def knn_cosine(dataset_path, target, value):
    """Look up the nearest user neighbours of ``target`` using cosine similarity.

    A user-based ``KNNBaseline`` model is trained on every rating found in
    ``dataset_path`` (tab-separated ``user item rating`` lines, ratings on a
    1-7 scale).

    Args:
        dataset_path: Path of the tab-separated ratings file.
        target: Identifier passed straight to ``algo.get_neighbors``.
            # presumably an *inner* user id - confirm against callers.
        value: Number of neighbours to request (``k``).

    Returns:
        A lazy generator; each neighbour's raw user id is looked up in a
        mapping built from the second column to the first column of the file.
        A ``KeyError`` surfaces during iteration if a raw id is missing from
        that mapping.
    """
    reader = Reader(line_format='user item rating', rating_scale=(1, 7), sep='\t')
    data = Dataset.load_from_file(dataset_path, reader)

    # Train on the entire dataset (no splitting into folds).
    trainset = data.build_full_trainset()

    # Similarities are computed between users.
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNBaseline(sim_options=sim_options)
    algo.train(trainset)

    # Build the second-column -> first-column mapping. The context manager
    # guarantees the file handle is released (the handle used to leak).
    item_to_user = {}
    with open(dataset_path, "r") as ratings_file:
        for line in ratings_file:
            fields = line.split('\t')
            item_to_user[fields[1]] = fields[0]

    neighbor_inner_ids = algo.get_neighbors(target, k=value)
    neighbor_raw_ids = (
        algo.trainset.to_raw_uid(inner_id) for inner_id in neighbor_inner_ids
    )
    # NOTE(review): raw *user* ids are used as keys of an item -> user mapping
    # here; this is kept as-is from the original but looks unintended - confirm.
    return (item_to_user[rid] for rid in neighbor_raw_ids)
def knn_item(trainset, testset, predset):
    """Fit an item-based KNNBaseline model and store its predictions.

    Nothing happens when ``is_already_predicted('knnitem')`` is truthy.
    Otherwise the model is trained on ``trainset``, its RMSE on ``testset``
    is printed, and the estimated ratings for ``testset`` and ``predset`` are
    handed to ``save_predictions``.

    Args:
        trainset: Training data passed to ``algo.train``.
        testset: Data passed to ``algo.test`` for the RMSE evaluation.
        predset: Data passed to ``algo.test`` for the final predictions.
    """
    modelname = 'knnitem'

    # Skip the whole run if predictions for this model already exist.
    if is_already_predicted(modelname):
        return

    def _estimates(prediction_list):
        # Gather the ``est`` field of every prediction into a float array.
        return np.fromiter(
            (p.est for p in prediction_list),
            dtype=float,
            count=len(prediction_list),
        )

    algo = KNNBaseline(
        k=60,
        sim_options={
            'name': 'pearson_baseline',
            'shrinkage': 100,
            'user_based': False,
        },
        bsl_options={
            'method': 'als',
            'reg_i': 1e-5,
            'reg_u': 14.6,
            'n_epochs': 10,
        },
    )

    print('KNN item based Model')
    algo.train(trainset)

    test_predictions = algo.test(testset)
    rmse = accuracy.rmse(test_predictions, verbose=False)
    print(' RMSE on Test: ', rmse)
    save_predictions(modelname, rmse, _estimates(test_predictions), 'test')

    print(' Evaluate predicted ratings...')
    final_predictions = algo.test(predset)
    save_predictions(modelname, rmse, _estimates(final_predictions))
def getSimModle():
    """Train and return an item-based KNNBaseline model on built-in ml-100k.

    Returns:
        The ``KNNBaseline`` instance after training on the full dataset.
    """
    # Load the built-in MovieLens dataset and use every rating for training.
    full_trainset = Dataset.load_builtin('ml-100k').build_full_trainset()

    # pearson_baseline similarity; user_based=False means similarities are
    # computed between items (movies, in this example).
    model = KNNBaseline(
        sim_options=dict(name='pearson_baseline', user_based=False)
    )

    # Train the model.
    model.train(full_trainset)
    return model
def getSimModle():
    """Train and return an item-based KNNBaseline model from the ``data`` frame.

    Reads the module-level ``data`` object and uses its ``user``, ``product``
    and ``rating`` columns, with ratings on a 1-5 scale.
    # assumes ``data`` is a pandas DataFrame - confirm where it is defined.

    Returns:
        The ``KNNBaseline`` instance after training on the full dataset.
    """
    # Load the dataset.
    reader = Reader(rating_scale=(1, 5))
    # Select the three columns with a *list*: ``data['user', 'product',
    # 'rating']`` is a single tuple key and raises KeyError on a frame with
    # ordinary (non-MultiIndex) columns.
    new_data = Dataset.load_from_df(data[['user', 'product', 'rating']], reader)
    trainset = new_data.build_full_trainset()

    # pearson_baseline similarity, computed between products (items).
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    algo = KNNBaseline(sim_options=sim_options)

    # Train the model.
    algo.train(trainset)
    return algo
def get_model(re_train=False):
    """Return the item-based KNNBaseline model, reusing a saved copy if allowed.

    Args:
        re_train: When true, ignore any saved model file and train again.

    Returns:
        The algorithm loaded from the saved file, or a freshly trained one
        (which is also written back to that file).
    """
    model_file = os.path.join("Practice 3", "Movie Len Code", "kNNBaseline.algo")

    # Reuse the persisted model unless a retrain was explicitly requested.
    use_saved = not re_train and os.path.exists(model_file)
    if use_saved:
        logging.debug("Retrieving existed Model")
        loaded = dump.load(model_file)
        # Index 1 of the loaded pair is the algorithm object.
        return loaded[1]

    ratings_file = os.path.join("Practice 3", "ml-latest-small", "ratings.csv")
    csv_reader = Reader(
        line_format="user item rating timestamp",
        sep=",",
        skip_lines=1,
    )
    full_trainset = Dataset.load_from_file(
        ratings_file, reader=csv_reader
    ).build_full_trainset()

    model = KNNBaseline(
        sim_options={'name': 'pearson_baseline', 'user_based': False}
    )
    model.train(full_trainset)

    # Persist the trained model for later calls.
    dump.dump(model_file, algo=model, verbose=1)
    return model
def start(goods='951'):
    """Print the 10 nearest item neighbours of ``goods``.

    An item-based KNNBaseline model is trained on the comma-separated
    ``user item rating timestamp`` file ``SampleData`` and the raw ids of the
    neighbours are printed one per line.

    Args:
        goods: Raw item id whose neighbours are looked up.
    """
    data_file = os.path.expanduser('SampleData')

    # Describe the file format, then read the ratings from the file.
    line_reader = Reader(line_format='user item rating timestamp', sep=',')
    full_trainset = Dataset.load_from_file(
        data_file, reader=line_reader
    ).build_full_trainset()

    # Item-based KNNBaseline with pearson_baseline similarity.
    model = KNNBaseline(
        sim_options={'name': 'pearson_baseline', 'user_based': False}
    )
    model.train(full_trainset)

    # raw id -> inner id, neighbour lookup, then inner ids -> raw ids.
    target_inner_id = model.trainset.to_inner_iid(goods)
    neighbor_inner_ids = model.get_neighbors(target_inner_id, k=10)

    print('The 10 nearest neighbors of %s:' % goods)
    for inner_id in neighbor_inner_ids:
        print(model.trainset.to_raw_iid(inner_id))
from surprise import Dataset, KNNBaseline, Reader, dump

# First, train the algorithm to compute the similarities between items.
# Ratings come from the comma-separated 'ratings.csv', on a 1-10 scale.
data = Dataset.load_from_file(
    'ratings.csv',
    reader=Reader(sep=',', rating_scale=(1, 10)),
)
trainset = data.build_full_trainset()
sim_options = dict(name='pearson_baseline', user_based=False)
algo = KNNBaseline(sim_options=sim_options)
algo.train(trainset)

# Example session recorded with the trained model:
#
# >>> algo.predict('425', '0338564')
# Prediction(uid='425', iid='0338564', r_ui=None, est=8.8268148604314725,
#            details={u'actual_k': 40, u'was_impossible': False})
# >>> algo.predict('732', '1219827')
# Prediction(uid='732', iid='1219827', r_ui=None, est=1.3944813261280586,
#            details={u'actual_k': 8, u'was_impossible': False})

# Persist the trained model.
dump.dump('knn.algo', algo=algo)
class MovieRecommender:
    """Movie recommender built on a KNNBaseline model and an NMF model.

    The KNN model (item-based, pearson_baseline similarity) serves
    "similar movies" queries; the NMF model's predictions on the anti-testset
    serve per-user recommendations. Every query method returns early until
    ``initialized`` is true.
    """

    def __init__(self):
        self._data = None
        self._knn = None
        self._nmf = None
        self._trainset = None
        self._predictions = None
        # Set to True by _train() once both models and predictions are ready.
        self.initialized = False

    def initialize(self, data_filepath):
        """Load ratings from ``data_filepath`` and start training.

        Args:
            data_filepath: Ratings file, read with ``Reader('ml-100k')``.
        """
        self._data = Dataset.load_from_file(data_filepath, reader=Reader('ml-100k'))
        self._trainset = self._data.build_full_trainset()
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        self._knn = KNNBaseline(sim_options=sim_options)
        self._nmf = NMF()
        # NOTE(review): if this is the stdlib ``thread``/``_thread`` function,
        # it requires an args tuple as second argument - confirm which
        # ``start_new_thread`` is imported by the file.
        start_new_thread(self._train)

    def get_similar_movies(self, movie_id, k=10):
        """Return the ``k`` movies most similar to ``movie_id``.

        Args:
            movie_id: Raw item id of the reference movie.
            k: Number of neighbours to request from the KNN model.

        Returns:
            ``[]`` while the recommender is not initialized, otherwise the
            result of ``movie_dataset.get_movies`` for the ASCII-encoded raw
            ids of the neighbours.
        """
        if not self.initialized:
            return []
        model = self._knn
        movie_inner_id = model.trainset.to_inner_iid(movie_id)
        similar_movie_inner_ids = model.get_neighbors(movie_inner_id, k=k)
        to_raw_iid = model.trainset.to_raw_iid
        movie_ids = [
            to_raw_iid(inner_id).encode('ascii')
            for inner_id in similar_movie_inner_ids
        ]
        return movie_dataset.get_movies(movie_ids)

    def get_similar_movies_for_user(self, user_id, num_movies=10):
        """Return the ``num_movies`` best-estimated movies for ``user_id``.

        Args:
            user_id: User id; converted with ``str`` before matching.
            num_movies: Maximum number of movies to return.

        Returns:
            ``[]`` while the recommender is not initialized, otherwise the
            result of ``movie_dataset.get_movies`` for the ASCII-encoded item
            ids of the user's highest-``est`` predictions.
        """
        if not self.initialized:
            return []
        user_id = str(user_id)
        user_predictions = [
            prediction for prediction in self._predictions
            if prediction.uid == user_id
        ]
        # Highest estimated rating first.
        user_predictions.sort(key=lambda prediction: prediction.est, reverse=True)
        movie_ids = [
            prediction.iid.encode('ascii')
            for prediction in user_predictions[:num_movies]
        ]
        return movie_dataset.get_movies(movie_ids)

    def update_user_ratings(self, user_id, movie_id, rating):
        """Record a rating (replacing any earlier one) and retrain the models.

        Does nothing while the recommender is not initialized. Retraining
        runs synchronously in the calling thread.

        Args:
            user_id: User id; stored as ``str(user_id)``.
            movie_id: Movie id; stored as ``str(movie_id)``.
            rating: Rating value; stored as ``float(rating)``.
        """
        if not self.initialized:
            return
        rating = float(rating)
        new_rating = (str(user_id), str(movie_id), rating, time())
        # Look for an earlier rating directly in raw_ratings, which holds raw
        # ids. The trainset's ``knows_user``/``ur`` work on inner ids, so
        # probing them with raw ids mis-detected existing ratings.
        self._upsert_rating(self._data.raw_ratings, new_rating)
        self._trainset = self._data.build_full_trainset()
        self._train()

    @staticmethod
    def _upsert_rating(raw_ratings, new_rating):
        """Replace the entry with the same (user, movie) pair, or append."""
        user_id, movie_id = new_rating[0], new_rating[1]
        for index, existing in enumerate(raw_ratings):
            if existing[0] == user_id and existing[1] == movie_id:
                raw_ratings[index] = new_rating
                return
        raw_ratings.append(new_rating)

    def _train(self):
        """Train both models, cache NMF predictions, and mark as initialized."""
        self._nmf.train(self._trainset)
        self._knn.train(self._trainset)
        self._predictions = self._nmf.test(self._trainset.build_anti_testset())
        self.initialized = True
def getModel(train_data):
    """Train an item-based KNNBaseline model on ``train_data`` and return it.

    Args:
        train_data: Training set passed to ``algo.train``.

    Returns:
        The trained ``KNNBaseline`` instance (pearson_baseline similarity,
        ``user_based`` set to False).
    """
    model = KNNBaseline(
        sim_options=dict(name='pearson_baseline', user_based=False)
    )
    model.train(train_data)
    return model
with io.open(item_file, 'r', encoding='ISO-8859-1') as f: for line in f: line = line.split('|') rid_2_name[line[0]] = line[1] name_2_rid[line[1]] = line[0] return rid_2_name, name_2_rid # u.data数据格式为 user item rating timestamp; reader = Reader(line_format='user item rating timestamp', sep='\t') file_path = 'your path + /ml-100k' data = Dataset.load_from_file(file_path=file_path + '/u.data', reader=reader) train_set = data.build_full_trainset() sim_options = {'name': 'pearson_baseline', 'user_based': False} algo = KNNBaseline(sim_options=sim_options) algo.train(train_set) # 获取id对应的电影名列表,由于中途涉及一个id转换,所以要双向 rid_2_name, name_2_rid = read_iter_names() # print(rid_2_name['1']) # print(name_2_rid['Toy Story (1995)']) # raw-id映射到内部id toy_story_raw_id = name_2_rid['Toy Story (1995)'] toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id) # 获取toy story对应的内部id 并由此取得其对应的k个近邻 k个近邻对应的也是内部id toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10) # 近邻内部id转换为对应的名字 toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)