def hook_user_based_recommend(df):
    df_shop = prep_shop_model(df)
    df_shop = transform_data(df_shop)
    data, train, test = prep_surprise_dataset(df_shop, 'shop_id')
    # Similarity metric options: cosine, msd, pearson, pearson_baseline.
    # user_based defaults to True, so this computes user-user similarities.
    option = {'name': 'cosine'}
    algo = KNNBaseline(sim_options=option)
    algo.fit(train)
    return algo, df_shop
def getSimModle():
    # Load the built-in MovieLens 100k dataset by default
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    # Compute similarities with pearson_baseline; user_based=False computes
    # item-item similarities (here: similarities between movies)
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    # Use the KNNBaseline algorithm
    algo = KNNBaseline(sim_options=sim_options)
    # Train the model
    algo.fit(trainset)
    return algo
def get_trained_model(dataset):
    # Use item-based MSD similarity
    sim_options = {
        "name": "msd",
        "user_based": False,  # Compute similarities between items
    }
    model = KNNBaseline(sim_options=sim_options)
    training_set = dataset.build_full_trainset()
    model.fit(training_set)
    return model
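# Usage sketch (not from the original source): querying the returned model for
# item neighbors on the built-in ml-100k dataset, where raw ids are strings and
# item '1' is Toy Story (1995).
from surprise import Dataset

dataset = Dataset.load_builtin('ml-100k')
model = get_trained_model(dataset)
inner_id = model.trainset.to_inner_iid('1')
neighbor_inner_ids = model.get_neighbors(inner_id, k=5)
print([model.trainset.to_raw_iid(i) for i in neighbor_inner_ids])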
class EvaluationData:

    def __init__(self, data, popularityRankings):
        self.rankings = popularityRankings
        self.fullTrainSet = data.build_full_trainset()
        self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()
        self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)
        LOOCV = LeaveOneOut(n_splits=1, random_state=1)
        for train, test in LOOCV.split(data):
            self.LOOCVTrain = train
            self.LOOCVTest = test
        self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()
        sim_options = {'name': 'cosine', 'user_based': False}
        self.simsAlgo = KNNBaseline(sim_options=sim_options)
        self.simsAlgo.fit(self.fullTrainSet)

    def GetFullTrainSet(self):
        return self.fullTrainSet

    def GetFullAntiTestSet(self):
        return self.fullAntiTestSet

    def GetAntiTestsetForUser(self, testSubject):
        trainset = self.fullTrainSet
        fill = trainset.global_mean
        anti_testset = []
        u = trainset.to_inner_uid(str(testSubject))
        user_items = set([j for (j, _) in trainset.ur[u]])
        anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill)
                         for i in trainset.all_items() if i not in user_items]
        return anti_testset

    def GetTrainSet(self):
        return self.trainSet

    def GetTestSet(self):
        return self.testSet

    def GetLOOCVTrainSet(self):
        return self.LOOCVTrain

    def GetLOOCVTestSet(self):
        return self.LOOCVTest

    def GetPopularityRankings(self):
        return self.rankings

    def GetSimilarities(self):
        return self.simsAlgo
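# Usage sketch (not from the original source): constructing EvaluationData from
# the built-in ml-100k dataset; `rankings` is a placeholder for the caller's
# popularity rankings.
from surprise import Dataset

data = Dataset.load_builtin('ml-100k')
rankings = {}  # hypothetical popularity rankings
ed = EvaluationData(data, rankings)
print(len(ed.GetAntiTestsetForUser(85)))  # unrated candidate items for user '85'
print(ed.GetSimilarities())  # the fitted item-item cosine KNNBaseline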
def get_similar_items(iid, n=10):
    trainset = data.build_full_trainset()
    algo = KNNBaseline(sim_options=item_based_sim_option)
    algo.fit(trainset)
    inner_id = algo.trainset.to_inner_iid(iid)
    # Use get_neighbors to fetch the n most similar movies
    neighbors = algo.get_neighbors(inner_id, k=n)
    neighbors_iid = (algo.trainset.to_raw_iid(x) for x in neighbors)
    recommendations = [item_dict[x] for x in neighbors_iid]
    # print('\nten movies most similar to the', item_dict[iid])
    # for i in recommendations:
    #     print(i)
    return recommendations
def browse(uID):
    # dataset
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    # create and fit the KNN classifier
    algo = KNNBaseline()
    algo.fit(trainset)
    # movie lists
    # 0: movie id. 1: our predicted rating. 2: title. 3: actual rating
    top5_definite = get_best_recs_(uID, algo)
    top5_somewhat = get_med_recs_(uID, algo)
    # these are fixed length: range 5 because we want movies 0-4, and range 4
    # because each has attributes 0-3
    definite_like = [[0 for x in range(4)] for y in range(5)]
    somewhat_like = [[0 for x in range(4)] for y in range(5)]
    # filling the final 2d list of definite likes
    for i in range(len(definite_like)):
        # first value is the movie ID
        definite_like[i][0] = top5_definite[i][0]
        # second value is our predicted rating
        definite_like[i][1] = top5_definite[i][1]
        # third is the title
        definite_like[i][2] = id_to_title_(definite_like[i][0])
        # fourth is the average rating
        definite_like[i][3] = get_rating_(top5_definite[i][0])
    # same as the last loop, but for the somewhat-like list
    for i in range(len(somewhat_like)):
        somewhat_like[i][0] = top5_somewhat[i][0]
        somewhat_like[i][1] = top5_somewhat[i][1]
        somewhat_like[i][2] = id_to_title_(somewhat_like[i][0])
        somewhat_like[i][3] = get_rating_(top5_somewhat[i][0])
    print("\n\nDefinite like:\n")
    for item in definite_like:
        print("Movie ID: " + str(item[0]))
        print("Our rating prediction: " + str(item[1]))
        print("Title: " + str(item[2]))
        print("Average rating: " + str(item[3]))
    print("\n\nSomewhat like:\n")
    for item in somewhat_like:
        print("Movie ID: " + str(item[0]))
        print("Our rating prediction: " + str(item[1]))
        print("Title: " + str(item[2]))
        print("Average rating: " + str(item[3]))
    return definite_like, somewhat_like
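# Hypothetical sketch (get_best_recs_ is not defined in this snippet): one way
# it could work, scoring every movie the user has not rated and returning the
# top five (movie_id, estimated_rating) pairs that browse() expects.
def get_best_recs_(uID, algo, n=5):
    trainset = algo.trainset
    inner_uid = trainset.to_inner_uid(str(uID))
    seen = set(j for (j, _) in trainset.ur[inner_uid])  # items the user rated
    preds = [(trainset.to_raw_iid(i), algo.predict(str(uID), trainset.to_raw_iid(i)).est)
             for i in trainset.all_items() if i not in seen]
    preds.sort(key=lambda t: t[1], reverse=True)
    return preds[:n]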
class KNNBaselineRS(AbstractRS):

    def __init__(self, path, sim_options=None):
        if sim_options is None:
            sim_options = {'name': 'pearson_baseline', 'user_based': False}
        self.path = os.path.expanduser(path)
        self.algo = KNNBaseline(sim_options=sim_options)

    def train(self):
        # Build the full trainset and fit the algorithm configured in __init__
        trainset = self.data.build_full_trainset()
        self.algo.fit(trainset)
class Recmodel(object):

    def __init__(self, algo='knn_baseline', filepath=None):
        if not os.path.exists(filepath):
            raise FileNotFoundError("{} not exist".format(filepath))
        self.filepath = filepath
        if algo == 'nmf':
            self.algo = NMF()
            self.model_name = 'nmf'
        else:
            self.algo = KNNBaseline()
            self.model_name = 'knn_baseline'
        self.convertor = DataConvertHelper()

    def buildDataSet(self):
        reader = Reader(line_format='user item rating timestamp', sep=',')
        music_data = Dataset.load_from_file(file_path=self.filepath, reader=reader)
        self.trainset = music_data.build_full_trainset()

    def train(self):
        print("begin training...")
        self.algo.fit(self.trainset)

    def evaluate(self, index):
        current_playlist_name = self.convertor.get_name_by_index(index)
        print('Current playlist: {}'.format(current_playlist_name))
        current_playlist_rid = self.convertor.get_rid_by_name(current_playlist_name)
        print("Current playlist rid: {}".format(current_playlist_rid))
        playlist_inner_id = self.algo.trainset.to_inner_uid(current_playlist_rid)
        print('Playlist inner id:', playlist_inner_id)
        playlist_neighbors_inner_ids = self.algo.get_neighbors(playlist_inner_id, k=10)
        playlist_neighbors_rids = (self.algo.trainset.to_raw_uid(inner_id)
                                   for inner_id in playlist_neighbors_inner_ids)
        playlist_neighbors_names = (self.convertor.get_name_by_rid(rid)
                                    for rid in playlist_neighbors_rids)
        print('The 10 playlists closest to playlist "', current_playlist_name, '" are:')
        for playlist_name in playlist_neighbors_names:
            print(playlist_name,
                  self.algo.trainset.to_inner_uid(
                      self.convertor.get_rid_by_name(playlist_name)))
def knn_baseline_movie(train, test, ids, Xtest, Xids):
    """
    Nearest-neighbour approach using the movie (item) baseline.

    Arguments:
        train: the trainset
        test: the testset
        ids: unknown ratings
        Xtest: predicted ratings for the testset, to be used for final blending
        Xids: predicted ratings for unknown ratings, to be used for final blending
    """
    print('kNN Baseline Movie')
    bsl_option = {'method': 'als', 'n_epochs': 100, 'reg_u': 15, 'reg_i': 0.01}
    sim_option = {
        'name': 'pearson_baseline',
        'min_support': 1,
        'user_based': False
    }
    algo = KNNBaseline(k=100, bsl_options=bsl_option, sim_options=sim_option, verbose=False)

    # Train algorithm on training set
    algo.fit(train)

    # Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    # Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    # Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
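# Usage sketch (assumed setup, not from the original file): `data` is a
# Surprise Dataset and `ids` is a pair of aligned arrays of raw (user, item)
# ids for the unknown ratings, matching how knn_baseline_movie indexes ids[0]
# and ids[1]; the array names here are hypothetical.
from surprise.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.1)
ids = (unknown_user_ids, unknown_item_ids)  # hypothetical arrays of raw ids
Xtest, Xids = [], []
rmse, Xtest, Xids, preds_test, preds_ids = knn_baseline_movie(train, test, ids, Xtest, Xids)
print('blending features collected:', len(Xtest), len(Xids))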
def knn():
    df = pd.read_sql("select userId,movieId,rating from rating", db.engine)
    print('finished load data')
    # Only rating_scale is used by load_from_df; line_format applies to file loading
    reader = Reader(rating_scale=(1, 5))
    print(df.head())
    data = Dataset.load_from_df(df, reader)
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo = KNNBaseline(sim_options=sim_options)
    print('start fit')
    algo.fit(trainset)
    dump.dump(f'{RECOMMEND_MODEL_SAVED_PATH}/knn', algo=algo)
    print('saved to knn')
    cross_validate(algo, data, measures=['RMSE', 'MAE'], verbose=True)
def build_recommender(data, model_meta):
    """
    This function takes order, item, and rating data and returns a KNN
    recommendation engine.

    Args:
        data: dataframe with columns order_id, product_id, and rating
        model_meta: original model_meta dict
    """
    from surprise import Dataset, Reader  # import libraries for EB

    logging.info('Setting up random state')
    random.seed(model_meta['train_recommender']['random_state'])
    np.random.seed(model_meta['train_recommender']['random_state'])
    logging.info('Setting up Surprise data reader')
    # rating_scale expects (min, max)
    reader = Reader(rating_scale=(min(data.rating), max(data.rating)))
    logging.info('Calling load_from_df')
    data = Dataset.load_from_df(data, reader)  # reads and sets up data
    logging.info('Setting up recommender')
    k = model_meta['train_recommender']['neighbors']
    sim_options = model_meta['train_recommender']['sim_options']
    knn = KNNBaseline(k, sim_options=sim_options)  # collaborative filtering
    logging.info('Calling build_full_trainset')
    data = data.build_full_trainset()  # uses whole dataset to build model
    logging.info('Fit recommender')
    fit = knn.fit(data)  # fits model to data
    logging.info('Return recommender')
    return fit
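# Usage sketch (hypothetical values): the rating dataframe must have its
# columns in (user, item, rating) order, and model_meta must carry the
# train_recommender block that build_recommender reads.
import pandas as pd

model_meta = {
    'train_recommender': {
        'random_state': 42,
        'neighbors': 40,
        'sim_options': {'name': 'pearson_baseline', 'user_based': False},
    }
}
ratings = pd.DataFrame({
    'order_id': [1, 1, 2, 2, 3],
    'product_id': ['a', 'b', 'a', 'c', 'b'],
    'rating': [5, 3, 4, 2, 5],
})
recommender = build_recommender(ratings, model_meta)
print(recommender.predict(1, 'c').est)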
class EvaluationData:

    def __init__(self, data, withSim=False):
        self.trainSet, self.testSet = train_test_split(data, test_size=0.25, random_state=0)
        LOOX = LeaveOneOut(n_splits=1, random_state=1)
        for xtrain, xtest in LOOX.split(data):
            self.LOOX_trainSet = xtrain
            self.LOOX_testSet = xtest
        del xtrain, xtest
        self.LOOX_antitestSet = self.LOOX_trainSet.build_anti_testset()
        self.full_trainSet = data.build_full_trainset()
        self.full_antitestSet = self.full_trainSet.build_anti_testset()
        if withSim:
            sim_options = {'name': 'cosine', 'user_based': False}
            self.simAlgo = KNNBaseline(sim_options=sim_options)
            self.simAlgo.fit(self.full_trainSet)
def get_nearest_neighbors(user_id):
    # path to dataset file
    file_path = os.path.expanduser('outward.csv')
    # define a reader object for our dataset
    reader = Reader(sep=',')
    # load data from dataset
    data = Dataset.load_from_file(file_path, reader=reader)
    # Train algorithm on dataset
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)
    # Retrieve inner id of the user in question
    user_inner_id = algo.trainset.to_inner_uid(str(user_id))
    # Retrieve inner ids of the nearest neighbors of the user
    user_neighbors = algo.get_neighbors(user_inner_id, k=10)
    # Convert inner ids of the neighbors into raw user ids
    user_neighbors = (algo.trainset.to_raw_uid(inner_id)
                      for inner_id in user_neighbors)
    neighbors_lst = []
    print()
    print(f'The 10 nearest neighbors of {user_id} are:')
    for user in user_neighbors:
        print(user)
        neighbors_lst.append(user)
    return neighbors_lst
    # pred = algo.predict(uid, bid, verbose=True)
    # pred is <class 'surprise.prediction_algorithms.predictions.Prediction'>
def init_collaborative_filtering():
    # Step 1: Read data from database
    ratings = pps.get_all_ratings_as_df()
    ratings[RATING] = None
    ratings.loc[ratings[LIKED] == True, RATING] = 1
    ratings.loc[ratings[LIKED] == False, RATING] = 0

    # Step 2: Transform to training set
    reader = Reader(rating_scale=(0.0, 1.0))
    data = Dataset.load_from_df(ratings[[USER_ID, POI_ID, RATING]], reader)
    trainset = data.build_full_trainset()

    # Step 3: Apply training of collaborative filtering (CF) algorithm
    algo = KNNBaseline(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
    algo.fit(trainset)
    return algo, ratings
class EvaluationData:

    def __init__(self, data, popularityRankings):
        self.rankings = popularityRankings
        self.fullTrainSet = data.build_full_trainset()
        self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()
        self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)
        # Compute similarity matrix between items so we can measure diversity
        sim_options = {'name': 'cosine', 'user_based': False}
        self.simsAlgo = KNNBaseline(sim_options=sim_options)
        self.simsAlgo.fit(self.fullTrainSet)

    def GetFullTrainSet(self):
        return self.fullTrainSet

    def GetFullAntiTestSet(self):
        return self.fullAntiTestSet

    def GetAntiTestSetForUser(self, userId):
        trainset = self.fullTrainSet
        fill = trainset.global_mean
        anti_testset = []
        u = trainset.to_inner_uid(str(userId))
        user_items = set([j for (j, _) in trainset.ur[u]])
        anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill)
                         for i in trainset.all_items() if i not in user_items]
        return anti_testset

    def GetTrainSet(self):
        return self.trainSet

    def GetTestSet(self):
        return self.testSet

    def GetSimilarities(self):
        return self.simsAlgo

    def GetPopularityRankings(self):
        return self.rankings
def recommend_friends(request):
    queryset = Rate.objects.all()
    query, params = queryset.query.as_sql(
        compiler='django.db.backends.sqlite3.compiler.SQLCompiler',
        connection=connections['default'])
    df = pd.read_sql_query(query, con=connections['default'], params=params)
    print("load data")
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'item_id', 'rate']], reader)
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline'}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)
    for given_user_id in set(df['user_id']):
        print(given_user_id)
        given_user_id = int(given_user_id)
        _from = get_object_or_404(Profile, profile_id=given_user_id)
        inner_id = algo.trainset.to_inner_uid(given_user_id)
        # to_inner_uid(), to_inner_iid(), to_raw_uid(), and to_raw_iid()
        neighbors = algo.get_neighbors(inner_id, k=5)
        results = [
            algo.trainset.to_raw_uid(inner_user_id)
            for inner_user_id in neighbors
        ]
        print('The 5 nearest neighbors of the given user id:')
        for raw_user_id in results:
            _to = get_object_or_404(Profile, user_id=int(raw_user_id))
            # print(raw_user_id,Candidates2.objects.filter(user_from=user_from,user_to=user_to))
            if Candidates2.objects.filter(user_from=_from):
                if Candidates2.objects.filter(user_from=_from, user_to=_to):
                    print("user_from and user_to both match")
                    pass
                else:
                    cand = Candidates2.objects.get(user_from=_from)
                    cand.user_to.add(_to)
                    print("only user_from matches; adding user_to")
            else:
                cand = Candidates2.objects.create()
                cand.user_from.add(_from)
                cand.user_to.add(_to)
        print("Finished saving data for user %s" % given_user_id)
    return render(request, "recommend_completed.html")
def predict(rating_dic):
    df_clean = pd.read_csv("dataset_clean.csv")

    #######################
    # Fit surprise model
    #######################
    final_model = KNNBaseline(k=60, min_k=2,
                              sim_options={'name': 'pearson_baseline', 'user_based': True})
    new_user_id = max(df_clean["userID"]) + 1
    ratings = np.array(list(rating_dic.values()))
    rated_mask = ratings != None  # elementwise comparison; keeps rated items only
    ratings = ratings[rated_mask]
    items = np.array(list(rating_dic.keys()))[rated_mask]
    user = np.ones(len(items), dtype="int") * new_user_id
    new_user_df = pd.DataFrame({"userID": user, "itemID": items, "rating": ratings})
    # DataFrame.append is deprecated; concat does the same job
    total_df = pd.concat([df_clean, new_user_df], ignore_index=True)

    # A reader is still needed, but only the rating_scale param is required.
    reader = Reader(rating_scale=(0, 10))
    # The columns must correspond to user id, item id and ratings (in that order).
    new_trainset = Dataset.load_from_df(total_df, reader).build_full_trainset()

    # Fit the best model
    final_model.fit(new_trainset)

    predicted_ratings = []
    for nootropic in nootropics_list:
        predicted_ratings.append(final_model.predict(new_user_id, nootropic).est)
    # mean rating + item baseline; compute_baselines() returns (bu, bi),
    # indexed by inner item id
    item_baselines = final_model.default_prediction() + final_model.compute_baselines()[1]
    result_df = pd.DataFrame({"nootropic": nootropics_list,
                              "predicted_rating": predicted_ratings,
                              "baseline_rating": item_baselines})
    nootropics_without_ratings = [nootropic for nootropic in nootropics_list
                                  if nootropic not in rating_dic.keys()]
    new_result_df = result_df[result_df["nootropic"].isin(nootropics_without_ratings)]
    return new_result_df.sort_values("predicted_rating", ascending=False, ignore_index=True)
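# Usage sketch (hypothetical item names): unrated nootropics stay None and are
# dropped by rated_mask; "Caffeine" and "L-Theanine" are assumed to appear in
# nootropics_list.
sample_ratings = {name: None for name in nootropics_list}
sample_ratings["Caffeine"] = 8
sample_ratings["L-Theanine"] = 7
print(predict(sample_ratings).head())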
def eval(user_id):
    # Step 1: Define variables
    ratings = pps.get_all_ratings_as_df()  # read ratings from database
    ratings[RATING] = None
    ratings.loc[ratings[LIKED] == True, RATING] = 1
    ratings.loc[ratings[LIKED] == False, RATING] = 0
    reader = Reader(rating_scale=(0.0, 1.0))
    all_items = ratings.poi_id.unique()  # find all items
    user_rmse = pd.DataFrame(columns=['est', 'true'])  # dataframe for storing the estimates

    # Step 2: Iterate over all items, leaving out the current iteration's item (x) for training
    for x in np.nditer(all_items):
        # Step 2a: Define test dataset -> rating of the current user and the current (left-out) item
        testset = ratings[(ratings.user_id == user_id)]
        testset = testset[(testset.poi_id == x)]
        # Step 2b: If the user has not rated this item, the prediction cannot be
        # compared to a true value => skip
        if testset.rating.size == 0:
            continue
        # Step 2c: Define train dataset -> leave out the current item x
        trainset = ratings[~ratings.isin(testset).all(1)]
        trainset = Dataset.load_from_df(trainset[[USER_ID, POI_ID, RATING]], reader)
        trainset = trainset.build_full_trainset()
        # Step 2d: Apply the algorithm by training and predicting the left-out item x
        algo = KNNBaseline(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
        algo.fit(trainset)
        # .item() extracts the scalar (np.asscalar was removed from NumPy)
        pred = algo.predict(user_id, x.item(), r_ui=4, verbose=False)
        # Step 2e: Store estimate and true value in the output dataframe
        user_rmse.loc[len(user_rmse)] = [pred.est, testset.rating.item()]

    # Step 3: Calculate the mean squared error over all leave-one-out estimations
    confidence = np.mean((user_rmse.est - user_rmse.true) ** 2)
    return confidence
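# Related sketch (not from the original source): for a dataset-level error
# estimate, Surprise's own cross_validate is much cheaper than the manual
# per-user leave-one-out loop above; `ratings` and `reader` are assumed to be
# prepared as in eval().
from surprise.model_selection import cross_validate

data = Dataset.load_from_df(ratings[[USER_ID, POI_ID, RATING]], reader)
algo = KNNBaseline(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)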
def get(self, item_id):
    # SQL query
    conn = mysql.connect()
    cursor = conn.cursor()

    # STEP 1: KNN WITH PEARSON BASELINE
    df = pd.read_sql_query("SELECT * FROM story_reviews", conn)

    # Data and Model
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'story_id', 'star']], reader)
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    model = KNNBaseline(sim_options=sim_options)

    # Training
    training_set = data.build_full_trainset()
    model.fit(training_set)

    item_inner_id = model.trainset.to_inner_iid(item_id)
    item_neighbors_inner = model.get_neighbors(item_inner_id, k=10)
    item_neighbors = [model.trainset.to_raw_iid(inner_id)
                      for inner_id in item_neighbors_inner]

    # STEP 2: CASCADE IT WITH TF-IDF
    df_stories = pd.read_sql_query("SELECT * FROM stories", conn)

    # Model
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
    tf_idf_matrix = tf.fit_transform(df_stories['title'])
    cosine_similarities = cosine_similarity(tf_idf_matrix, tf_idf_matrix)

    # Retrieve similar items; assumes story ids are 1-based and aligned with df_stories rows
    cosine_similarities_row = cosine_similarities[item_id - 1]
    recommendations_list = []
    n = 10
    for i in range(n):
        recommendations_list.append(
            (item_neighbors[i], cosine_similarities_row[item_neighbors[i] - 1]))
    recommendations_list.sort(key=lambda x: x[1], reverse=True)
    formatted_recommendations_list = [item[0] for item in recommendations_list]

    # Return K Nearest Neighbors
    return jsonify(recommendations=formatted_recommendations_list)
def compute_user_neighbors(id_name_dic, name_id_dic, trainset):
    algo = KNNBaseline()
    algo.fit(trainset)
    # dict.keys() is not subscriptable in Python 3; wrap it in list()
    user_name = list(name_id_dic.keys())[1]
    print("user_name: ", user_name)
    user_id = name_id_dic[user_name]
    print("user_id: ", user_id)
    user_inner_id = algo.trainset.to_inner_uid(user_id)
    print("inner id: ", user_inner_id)
    user_neighbors = algo.get_neighbors(user_inner_id, k=10)
    user_neighbors = (algo.trainset.to_raw_uid(inner_id)
                      for inner_id in user_neighbors)
    user_neighbors = (id_name_dic[user_id] for user_id in user_neighbors)
    print()
    print('The 10 users closest to user "', user_name, '" are:')
    for user in user_neighbors:
        print(algo.trainset.to_inner_uid(name_id_dic[user]), user)
def KNN(data, kwargs):
    # Set algorithm (kwargs keys kept exactly as the caller defines them)
    k_neighbor = kwargs.get('n_neigbor')
    min_neighbor = kwargs.get('min_neigbor')
    similarity = kwargs.get('similarity')
    options = {'name': similarity}
    algo = KNNBaseline(k=k_neighbor, min_k=min_neighbor, sim_options=options)

    # Train the algorithm on the data, and predict ratings for the testset
    algo.fit(data)

    # Dense prediction matrix; dimensions assume 10000 users and 1000 items
    prediction = np.zeros([10000, 1000])
    for row in range(10000):
        for col in range(1000):
            prediction[row, col] = algo.predict(str(row + 1), str(col + 1)).est
    return prediction
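# Usage sketch (hypothetical values): the misspelled kwargs keys 'n_neigbor'
# and 'min_neigbor' are kept because KNN() reads exactly those names; `data`
# here is assumed to be a Surprise Dataset.
params = {'n_neigbor': 40, 'min_neigbor': 1, 'similarity': 'pearson_baseline'}
trainset = data.build_full_trainset()
prediction_matrix = KNN(trainset, params)
print(prediction_matrix.shape)  # (10000, 1000)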
def knn_centered(self):
    print("calculating knn centered... File Rating: " + self.file_path)
    print("calculating knn centered... Item to Evaluate: " + self.item_evaluated)
    print("calculating knn centered... Number of recommendations: " + str(self.number_recommendations))

    # Reader
    reader = Reader(line_format='item user rating', sep=self.delimiter, skip_lines=1)

    # Dataset
    data = Dataset.load_from_file(self.file_path, reader=reader)
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)

    item_inner_id = algo.trainset.to_inner_iid(self.item_evaluated)
    item_neighbors = algo.get_neighbors(item_inner_id, k=int(self.number_recommendations))
    item_neighbors = (algo.trainset.to_raw_iid(inner_id) for inner_id in item_neighbors)

    dictionary_neighbors = {}
    print("\nTransition Component Based Ratings >> Recommended items by KNN Centered:")
    i = 0
    for item in item_neighbors:
        i += 1
        dictionary_neighbors[i] = item
        print("- " + item)
    return dictionary_neighbors
def compute_movie_neighbors(id_name_dic, name_id_dic, trainset):
    sim_options = {'user_based': False}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)
    # movie_name = list(name_id_dic.keys())[1]
    movie_name = "古墓迷途2"
    print("movie_name: ", movie_name)
    movie_id = name_id_dic[movie_name]
    print("movie_id: ", movie_id)
    movie_inner_id = algo.trainset.to_inner_iid(movie_id)
    print("inner id: ", movie_inner_id)
    movie_neighbors = algo.get_neighbors(movie_inner_id, k=10)
    movie_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in movie_neighbors)
    movie_neighbors = (id_name_dic[movie_id] for movie_id in movie_neighbors)
    print()
    print('The 10 movies closest to "', movie_name, '" are:')
    for movie in movie_neighbors:
        print(algo.trainset.to_inner_iid(name_id_dic[movie]), movie)
def get_top_n_for_user(target_user_id, recom_alg, recom_size):
    file_path = os.path.expanduser('static/CRdata.csv')
    reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(0, 100))
    data = Dataset.load_from_file(file_path, reader=reader)
    trainset = data.build_full_trainset()
    testset = trainset.build_anti_testset()

    if recom_alg == 'KNNBaseline':
        similarity = {
            'name': 'cosine',
            'user_based': True  # compute similarities between users
        }
        algo = KNNBaseline(sim_options=similarity)
    elif recom_alg == 'CoClustering':
        algo = CoClustering()
    else:
        algo = SVD()

    algo.fit(trainset)
    predictions = algo.test(testset)

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:recom_size]

    return top_n[str(target_user_id)]
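# Usage sketch (assumes static/CRdata.csv exists in the documented
# user,item,rating,timestamp format):
top10 = get_top_n_for_user(42, 'KNNBaseline', 10)
for iid, est in top10:
    print(iid, round(est, 1))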
def main():
    # Load dataset
    rating_data_set = load_dataset(TRAINING_SET_PATH)

    # Clean data
    rating_data_set = remove_missing_values(rating_data_set)

    # Slice data
    drop_movie_list, rating_data_set = slice_data(rating_data_set)

    # Load movie file
    movies = load_movies_file(drop_movie_list, MOVIES_FILE_PATH)

    reader = Reader()
    # Note: 'shrinkage' only affects the pearson_baseline similarity and is
    # ignored by 'cosine'
    sim_options = {'name': 'cosine', 'min_support': 2, 'shrinkage': 100, 'user_based': True}
    bsl_options = {'method': 'sgd'}
    data = Dataset.load_from_df(rating_data_set[['CustomerID', 'MovieID', 'Rating']][:1000], reader)
    kf = KFold(n_splits=5)

    # algo = SVD()
    algo = KNNBaseline(k=N, sim_options=sim_options, bsl_options=bsl_options)

    i = 0
    for trainset, testset in kf.split(data):
        print("Running fold: ", i)
        algo.fit(trainset)
        predictions = algo.test(testset)
        precisions, recalls = precision_recall(predictions, 20)

        # Precision and recall can then be averaged over all users
        print(sum(prec for prec in precisions.values()) / len(precisions))
        print(sum(rec for rec in recalls.values()) / len(recalls))
        i += 1
def train(trainset: Trainset):
    """
    Train SVD and KNN models based on their options using the utility matrix,
    then dump the algorithms for future usage.
    """
    global loaded_svd_algo, loaded_knn_algo

    svd_options = {
        'n_factors': 82,
        'n_epochs': 33,
        'lr_all': 0.115,
        'reg_all': 0.02
    }
    knn_options = {
        'sim_options': {
            'name': 'pearson',
            'min_support': 4,
            'user_based': False
        },
        'k': 35,
        'min_k': 1
    }

    # set up the algorithms
    svd_algo = SVD(**svd_options)
    knn_algo = KNNBaseline(**knn_options)

    # train and dump
    svd_algo.fit(trainset)
    loaded_svd_algo = svd_algo
    dump.dump(base_dir.joinpath('svd.dump'), algo=svd_algo)

    knn_algo.fit(trainset)
    loaded_knn_algo = knn_algo
    dump.dump(base_dir.joinpath('knn.dump'), algo=knn_algo)

    print('Training and dumping completed')
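# Companion sketch (not from the original source): the dumped files can be
# loaded back with surprise.dump.load, which returns a (predictions, algo)
# tuple; `base_dir` is the same Path used above.
from surprise import dump

_, svd_loaded = dump.load(base_dir.joinpath('svd.dump'))
_, knn_loaded = dump.load(base_dir.joinpath('knn.dump'))
print(type(svd_loaded).__name__, type(knn_loaded).__name__)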
def trainModel():
    userID = []
    itemID = []
    rating = []

    # the DSN value should be the name of the entry in odbc.ini, not freetds.conf
    # change the UID and PWD to your own
    conn = pyodbc.connect('DSN=MYMSSQL;UID=SA;PWD=Easton888')
    crsr = conn.cursor()
    with crsr:
        crsr.execute("use DataMiningFull")
        rows = crsr.execute("select users_ind, movies_ind, rating from users_movies \
                             where users_ind < 5000").fetchall()
    crsr.close()
    conn.close()

    # build a pandas DataFrame
    for i in rows:
        userID.append(i[0])
        itemID.append(i[1])
        rating.append(i[2])
    rating_dict = {
        'userID': userID,
        'itemID': itemID,
        'rating': rating
    }
    df = pd.DataFrame(rating_dict)
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    algo = KNNBaseline(sim_options=sim_options)

    # train
    algo.fit(trainset)
    return algo
def get_item_baseline():
    df_clean = pd.read_csv("dataset_clean.csv")

    #######################
    # Fit surprise model
    #######################
    final_model = KNNBaseline(k=60, min_k=2,
                              sim_options={'name': 'pearson_baseline', 'user_based': True})
    total_df = df_clean

    # A reader is still needed, but only the rating_scale param is required.
    reader = Reader(rating_scale=(0, 10))
    # The columns must correspond to user id, item id and ratings (in that order).
    new_trainset = Dataset.load_from_df(total_df, reader).build_full_trainset()

    # Fit the best model
    final_model.fit(new_trainset)

    # mean rating + item baseline; compute_baselines() returns (bu, bi)
    item_baselines = final_model.default_prediction() + final_model.compute_baselines()[1]
    return pd.DataFrame({"nootropic": nootropics_list, "item_baselines": item_baselines})
def train_baseon_playlist():
    # Data preprocessing
    data_preprocess()
    path = "./data/"
    file_path = os.path.expanduser(path + "popular_music_suprise_format.txt")
    # Specify the file format
    reader = Reader(line_format='user item rating timestamp', sep=',')
    # Read data from the file
    music_data = Dataset.load_from_file(file_path, reader=reader)
    # Compute playlist-to-playlist similarities
    print("Building dataset...")
    trainset = music_data.build_full_trainset()  # train on all data, no cross-validation
    print("Start training the model...")
    sim_options = {'user_based': True}  # playlist-based collaborative filtering
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)
    surprise.dump.dump(path + 'KNNBaseline_Playlist_Recommand.model', algo=algo)

    # Rebuild the playlist-id to playlist-name mapping dict, to keep the data consistent
    f1 = open(path + "playlist_id_name_dic.pkl", "rb")
    playlist_id_name_dic = pickle.load(f1)
    f1.close()
    f2 = open(path + "popular_music_suprise_format1.txt")
    context = f2.readlines()
    new_playlist_id_name_dic = {}
    for line in context:
        playlist_id, song_id, rating, time = line.split(',')
        new_playlist_id_name_dic[playlist_id] = playlist_id_name_dic[playlist_id]
    pickle.dump(new_playlist_id_name_dic,
                open(path + "playlist_id_name_dic.pkl", "wb"))
    f2.close()
def get(self, algorithm, item_id):
    # SQL query
    conn = mysql.connect()
    cursor = conn.cursor()
    df = pd.read_sql_query("SELECT * FROM story_reviews", conn)

    # Data and Model
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'story_id', 'star']], reader)

    if algorithm == 'pearson':
        sim_options = {'name': 'pearson', 'user_based': False}
    elif algorithm == 'cosine':
        sim_options = {'name': 'cosine', 'user_based': False}
    elif algorithm == 'pearsonbaseline':
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
    elif algorithm == 'msd':
        sim_options = {'name': 'msd', 'user_based': False}
    else:
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
    model = KNNBaseline(sim_options=sim_options)

    # Training
    training_set = data.build_full_trainset()
    model.fit(training_set)

    # TESTING
    # cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    item_inner_id = model.trainset.to_inner_iid(item_id)
    item_neighbors_inner = model.get_neighbors(item_inner_id, k=10)
    item_neighbors = [model.trainset.to_raw_iid(inner_id)
                      for inner_id in item_neighbors_inner]

    # Return K Nearest Neighbors
    return jsonify(recommendations=item_neighbors)
def read_item_names():
    """Read the u.item file from the MovieLens 100-k dataset and return two
    mappings to convert raw ids into movie names and movie names into raw ids.
    """
    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]
    return rid_to_name, name_to_rid


# First, train the algorithm to compute the similarities between items
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)

print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)
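# Follow-on sketch (not part of the original example): the same fitted algo can
# also predict individual ratings; '196' and '302' are raw user/item ids from
# ml-100k, as used in the Surprise getting-started guide.
pred = algo.predict('196', '302', r_ui=4, verbose=True)
print(pred.est)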