def __build_model(self):
    """Return the recommender, reusing the persisted copy when it loads.

    The persisted model lives at ``self.file_prefix`` concatenated with
    ``self.model_path``.  When loading it fails for any reason, an item-based
    ``KNNWithMeans`` (k=50, ``pearson_baseline`` similarity) is fitted on
    ``self.trainset``, written to that same location, evaluated on
    ``self.testset`` with ``accuracy.rmse`` and returned.
    """
    cache_file = '{}{}'.format(self.file_prefix, self.model_path)

    try:
        cached = joblib.load(cache_file)
    except Exception:
        # Any load failure is treated as "no usable model on disk".
        print('recommender does not exist, build new recommender')
    else:
        print('recommender exists, load it')
        return cached

    recommender = KNNWithMeans(
        k=50,
        sim_options={
            'name': 'pearson_baseline',
            'user_based': False
        })
    recommender.fit(self.trainset)

    # Persist before validating so the next call can reuse the model.
    joblib.dump(recommender, cache_file)

    accuracy.rmse(recommender.test(self.testset))
    return recommender
def cal_KNNWithMeans(trainset, df):
    """Fit a user-based KNNWithMeans and export per-row prediction errors.

    A cosine-similarity ``KNNWithMeans`` (k=40, min_k=1) is fitted on
    ``trainset``.  For every row of ``df`` the model predicts the rating of
    ``df.user`` for ``df.store`` (the true rating is ``df.stars``); each
    prediction is printed because ``verbose=True``.  The results are written
    as CSV to the module-level ``save_file2`` with the columns
    ``user, item, r_ui, est, err`` where ``err = |est - r_ui|``.

    :param trainset: trainset the model is fitted on.
    :param df: DataFrame with ``user``, ``store`` and ``stars`` columns.
    :return: None
    """
    # The option key must be spelled 'user_based'; the previous 'user-based'
    # key did not match it.
    sim_options = {'name': 'cosine', 'user_based': True}
    algo_knnm = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)
    algo_knnm.fit(trainset)

    users = []
    items = []
    real = []
    estimate = []
    for uid, iid, r_ui in zip(df.user.values, df.store.values,
                              df.stars.values):
        users.append(uid)
        items.append(iid)
        real.append(r_ui)
        # Predict with the model fitted above (the old code referenced a
        # different name, ``algo``, that this function never defined).
        pred = algo_knnm.predict(uid, iid, r_ui, verbose=True)
        estimate.append(pred.est)
    print("end")

    df4 = pd.DataFrame({
        'user': users,
        'item': items,
        'r_ui': real,
        'est': estimate,
    })
    df4['err'] = abs(df4.est - df4.r_ui)
    df4.to_csv(save_file2)
def fit(self, trainset):
    """Fit the model and discount similarities by significance weight.

    Calls the parent ``KNNWithMeans.fit`` and then builds ``self.overlap``,
    a square matrix in which entry ``[xi, xj]`` counts how many rating lists
    contain both ``xi`` and ``xj``.  Every entry of ``self.sim`` is then
    multiplied by ``self.sig_weight(xi, xj)``.

    :param trainset: trainset passed through to ``KNNWithMeans.fit``.
    :return: self
    """
    KNNWithMeans.fit(self, trainset)

    # Same axis selection as the "freq" matrix in "similarities.pyx":
    # user-based similarities iterate item rating lists and vice versa.
    if self.sim_options['user_based']:
        n_x, yr = self.trainset.n_users, self.trainset.ir
    else:
        n_x, yr = self.trainset.n_items, self.trainset.ur

    # The builtin ``int`` replaces ``np.int``, an alias that was deprecated
    # in NumPy 1.20 and removed in NumPy 1.24.
    self.overlap = np.zeros((n_x, n_x), int)
    for y_ratings in yr.values():
        for xi, _ in y_ratings:
            for xj, _ in y_ratings:
                self.overlap[xi, xj] += 1

    # Scale each similarity by its significance weight.
    for xi in range(n_x):
        for xj in range(n_x):
            self.sim[xi, xj] = self.sim[xi, xj] * self.sig_weight(xi, xj)
    return self
def main():
    """Fit a default KNNWithMeans on MovieLens-100k and plot its errors.

    The dataset is split 85% / 15% into train and test parts, the RMSE on
    the test part is reported through ``accuracy.rmse`` and a 100-bin
    histogram of ``r_ui - est`` is displayed.
    """
    dataset = Dataset.load_builtin('ml-100k')
    train_part, test_part = train_test_split(dataset, test_size=.15)

    knn = KNNWithMeans()
    knn.fit(train_part)
    test_predictions = knn.test(test_part)
    accuracy.rmse(test_predictions)

    # Signed gap between the true rating and the estimate.
    errors = [p.r_ui - p.est for p in test_predictions]
    plt.hist(errors, 100)
    plt.show()
def recommender_knn_baseline(self, train_file, test_file, output):
    """Fit a default KNNWithMeans and print its RMSE and MAE.

    ``prepare_datasets(train_file, test_file)`` supplies the train and test
    data; only its first two return values are used.  ``output`` is not used
    by this method.

    :param train_file: passed to ``prepare_datasets``.
    :param test_file: passed to ``prepare_datasets``.
    :param output: unused.
    :return: the fitted ``KNNWithMeans`` instance.
    """
    train, test, _, _ = prepare_datasets(train_file, test_file)

    model = KNNWithMeans(verbose=False)
    model.fit(train)
    predictions = model.test(test, verbose=False)

    rmse_value = rmse(predictions, verbose=False)
    mae_value = mae(predictions, verbose=False)
    print('KNN_BASELINE: ' + ' RMSE ' + str(rmse_value) + ' MAE ' +
          str(mae_value))
    return model
def evaluate_on_test(self, train_set, test_set):
    """
    Fit an item-based KNNWithMeans (msd similarity) on the train set and
    evaluate it on the test set.

    :param train_set: trainset the algorithm is fitted on
    :param test_set: testset the predictions are computed for
    :return: RMSE value on the test set, or None when either argument is None
    """
    if train_set is None or test_set is None:
        return None

    message = "Evaluate RMSE on test data"
    print(message)
    self.LOG_HANDLE.info(message)

    knn = KNNWithMeans(sim_options={
        'name': 'msd',
        'user_based': False,
    })
    knn.fit(train_set)
    return accuracy.rmse(knn.test(test_set))
def knnBasico(df, testSize, vecinos, pr, bool):
    """Return mean precision and recall of a cosine KNNWithMeans model.

    :param df: DataFrame with ``user_id``, ``item_id`` and ``rating`` columns
        (ratings are read on a 1-5 scale).
    :param testSize: ``test_size`` given to ``train_test_split``; the split
        is not shuffled.
    :param vecinos: value passed as ``k`` to ``KNNWithMeans``.
    :param pr: second argument of ``precision_recall_at_k``.
    :param bool: value of the ``user_based`` similarity option.
    :return: ``(precision, recall)``, each averaged over all users and
        rounded to 3 decimals.
    """
    ratings = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']],
                                   Reader(rating_scale=(1, 5)))
    train_part, test_part = train_test_split(ratings,
                                             test_size=testSize,
                                             shuffle=False)

    knn = KNNWithMeans(k=vecinos,
                       sim_options={
                           'name': 'cosine',
                           'user_based': bool
                       })
    knn.fit(train_part)

    precisions, recalls = precision_recall_at_k(knn.test(test_part), pr, 4)
    mean_precision = sum(precisions.values()) / len(precisions)
    mean_recall = sum(recalls.values()) / len(recalls)
    return round(mean_precision, 3), round(mean_recall, 3)
def main():
    """Show the prediction-error histogram of KNNWithMeans on ml-100k."""
    # Load the MovieLens-100k dataset and hold out 15% of it for testing.
    trainset, testset = train_test_split(Dataset.load_builtin('ml-100k'),
                                         test_size=.15)

    # Fit the default algorithm on the training part.
    model = KNNWithMeans()
    model.fit(trainset)

    # Predict the held-out ratings and report the RMSE.
    predicted = model.test(testset)
    accuracy.rmse(predicted)

    # Histogram of the delta between the true rating and the prediction.
    deltas = [item.r_ui - item.est for item in predicted]
    plt.hist(deltas, 100)
    plt.show()
def KNNPred(data):
    """Fit two KNNWithMeans models on all of ``data`` and predict ``x_test``.

    One model uses ``model_params[0]`` as its similarity options, the other
    uses the defaults.  Reads the module-level names ``model_params``,
    ``x_test``, ``testlen``, ``cold_itm`` and ``avg_rat``.  For a test pair
    whose second element minus one is in ``cold_itm``, both predictions are
    ``avg_rat[first element - 1]``; otherwise each model's estimate is used.

    :param data: dataset exposing ``build_full_trainset()``.
    :return: ``[tuned_predictions, default_predictions, tuned_model,
        default_model]``; both prediction lists have length ``testlen``.
    """
    print("\nTraining KNN Means model..\n")
    tuned_model = KNNWithMeans(sim_options=model_params[0])
    default_model = KNNWithMeans()
    full_train = data.build_full_trainset()

    tuned_model.fit(full_train)
    print("\nTraining done..\nPrediction started..")
    default_model.fit(full_train)

    tuned_preds = [0] * testlen
    default_preds = [0] * testlen
    for pos, pair in enumerate(x_test):
        if pair[1] - 1 in cold_itm:
            fallback = avg_rat[pair[0] - 1]
            tuned_preds[pos] = fallback
            default_preds[pos] = fallback
        else:
            tuned_preds[pos] = tuned_model.predict(pair[0], pair[1]).est
            default_preds[pos] = default_model.predict(pair[0], pair[1]).est

    print("\nPrediction done..\n")
    return [tuned_preds, default_preds, tuned_model, default_model]
def KNN_train(self, k=20, options={
        'name': 'pearson',
        'user_based': False
}):
    '''
    Train one KNNWithMeans model for the overall rating and one per criterion.

    Models are fitted on the full ``self.trainDatas`` frame (columns ``uid``,
    ``iid``, ``total`` and ``c1`` .. ``c<self.no_of_criteria>``, read on a
    1-5 scale) and stored in ``self.algos``: index 0 is the ``total`` model,
    index ``i`` is the model for criterion ``c<i>``.

    k: int, default 20 - maximum number of neighbours.
    options: dict - similarity options; the default is Pearson similarity
        with the item-based method.  The dict is copied for every model, so
        neither the caller's dict nor the shared default is handed to the
        algorithms.
    '''
    self.algos = []
    df = self.trainDatas
    reader = Reader(rating_scale=(1, 5))

    rating_columns = ['total'] + [
        'c' + str(i) for i in range(1, self.no_of_criteria + 1)
    ]
    for column in rating_columns:
        dataset = Dataset.load_from_df(df[['uid', 'iid', column]],
                                       reader=reader)
        algo = KNNWithMeans(k, sim_options=dict(options))
        algo.fit(dataset.build_full_trainset())
        self.algos.append(algo)
class Rater:
    """Item-based cosine KNNWithMeans model trained on a set of ratings.

    ``ratings`` is an iterable of objects exposing ``user_id``, ``movie_id``
    and ``mark``.  The model is fitted on all of them at construction time.
    """

    def __init__(self, ratings):
        self.classifier = KNNWithMeans(sim_options={"name": "cosine", "user_based": False})
        self.training_set = None
        self.ratings_dict = None

        self._prepare_data_(ratings)
        self._train_()

    def _prepare_data_(self, ratings):
        """Build ``self.ratings_dict`` and ``self.training_set`` from ``ratings``."""
        # A single pass keeps the three columns aligned even when ``ratings``
        # is a one-shot iterable such as a generator.
        user_ids, movie_ids, marks = [], [], []
        for item in ratings:
            user_ids.append(item.user_id)
            movie_ids.append(item.movie_id)
            marks.append(item.mark)

        self.ratings_dict = {
            "user_id": user_ids,
            "movie_id": movie_ids,
            "mark": marks
        }
        df = pd.DataFrame(self.ratings_dict)
        data = Dataset.load_from_df(df[["user_id", "movie_id", "mark"]],
                                    Reader(rating_scale=Constants.RATING_SCALE))
        self.training_set = data.build_full_trainset()

    def _train_(self):
        """Fit the classifier on the prepared training set."""
        self.classifier.fit(self.training_set)

    def get_ratings(self, user_id):
        """Return ``{movie_id: estimated rating}`` for every known movie.

        Each distinct movie id is predicted once; keys keep the order in
        which the movies first appear in the ratings.
        """
        distinct_movie_ids = dict.fromkeys(self.ratings_dict["movie_id"])
        return {
            movie_id: self.classifier.predict(user_id, movie_id).est
            for movie_id in distinct_movie_ids
        }
def plot_ROC(qNum, k, thresh=[2.5,3,3.5,4]):
    """Plot one ROC curve per rating threshold and print each AUC.

    The module-level ``data`` is split 90% / 10%; the model is fitted on the
    training part and scored on the test part.  For each threshold a true
    rating ``>= threshold`` counts as the positive class and the estimate
    divided by 5.0 is the score.

    :param qNum: question number selecting the model; only 15
        (``KNNWithMeans`` with Pearson similarity) is supported.
    :param k: number of neighbours for the model.
    :param thresh: iterable of thresholds; it is only read, never modified.
    :raises ValueError: if ``qNum`` does not select a model.
    """
    rating_max = 5.0

    if qNum != 15:
        # Previously this path failed later with an UnboundLocalError.
        raise ValueError("plot_ROC has no model for qNum=%r" % (qNum,))
    model = KNNWithMeans(k=k, sim_options={'name': 'pearson'})

    trainset, testset = train_test_split(data, test_size=0.1)
    model.fit(trainset)
    predictions = model.test(testset)

    # Each prediction is (uid, iid, true rating, estimate, details); the
    # arrays do not depend on the threshold, so build them once.
    true_ratings = np.array([p[2] for p in predictions], dtype=float)
    scores = np.array([p[3] for p in predictions], dtype=float) / rating_max

    for thrs in thresh:
        y = (true_ratings >= thrs).astype(float)
        fpr, tpr, thresholds = metrics.roc_curve(y, scores)
        roc_auc = metrics.auc(fpr, tpr)

        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2)
        # Diagonal reference line.
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Threshold = '+str(thrs))
        plt.show()
        print("auc = "+str(roc_auc))
def test_knn_based(data):
    """
    Evaluate a user-based KNNWithMeans model on a held-out 10% split.

    Parameters
    ----------
    data : dataframe
        Dataframe with columns userId, movieId, and rating in that order.

    Returns
    -------
    test_mse : float
        The mean squared error for the knn based algorithm.
    """
    dataset = Dataset.load_from_df(data, Reader(rating_scale=(1, 5)))
    # Fixed random_state keeps the split reproducible.
    train_part, test_part = train_test_split(dataset,
                                             test_size=.10,
                                             random_state=24)

    knn = KNNWithMeans(k=5,
                       sim_options={
                           'name': 'pearson_baseline',
                           'user_based': True
                       })
    knn.fit(train_part)
    return accuracy.mse(knn.test(test_part), verbose=False)
def fit(self, trainset):
    """Fit the model and scale similarities by significance weight.

    Calls the parent ``KNNWithMeans.fit``, stores ``trainset.ur`` as
    ``self.ur_data`` and a zero-filled integer matrix as ``self.overlap``,
    then multiplies ``self.sim`` element-wise by a matrix holding
    ``self.sig_weight(u, v)`` for every ``u != v``.  The diagonal of that
    matrix stays 0, so the diagonal of ``self.sim`` becomes 0 as well.

    NOTE(review): the matrix size comes from ``len(trainset.ur)``, which
    presumably only matches ``self.sim`` for user-based similarities -
    confirm before using this with ``user_based=False``.

    :param trainset: trainset passed through to ``KNNWithMeans.fit``.
    :return: self
    """
    KNNWithMeans.fit(self, trainset)

    ur_data = trainset.ur
    n_d = len(ur_data)

    # Both attributes are set before ``sig_weight`` is first called below.
    self.ur_data = ur_data
    # The builtin ``int`` replaces ``np.int``, an alias that was deprecated
    # in NumPy 1.20 and removed in NumPy 1.24.
    self.overlap = np.zeros([n_d, n_d], int)

    weights = np.zeros([n_d, n_d], np.double)
    for u in range(n_d):
        for v in range(n_d):
            if u != v:
                weights[u, v] = self.sig_weight(u, v)

    self.sim = weights * self.sim
    return self
def DisplayGraphDelta(data):
    """Display the delta between prediction and reality.

    ``data`` is split 75% / 25%; a default ``KNNWithMeans`` is fitted on the
    training part.  The RMSE is reported, every test prediction is printed,
    followed by the number of predictions, and a 100-bin histogram of
    ``r_ui - est`` is shown.
    """
    train_part, test_part = train_test_split(data, test_size=.25)

    knn = KNNWithMeans()
    knn.fit(train_part)
    test_predictions = knn.test(test_part)
    accuracy.rmse(test_predictions)

    for item in test_predictions:
        print(item)

    # Delta between the true rating and the prediction.
    deltas = [item.r_ui - item.est for item in test_predictions]
    print(len(deltas))
    plt.hist(deltas, 100)
    plt.show()
def train():
    """Fit an item-based cosine KNNWithMeans on a small placeholder dataset.

    :return: the fitted algorithm.
    """
    # TODO put in real data here when we have collected enough
    placeholder = pd.DataFrame({
        "item": [1, 2, 1, 2, 1, 2, 1, 2, 1],
        "user": ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E'],
        "rating": [1, 0, 0, 0, 1, 0, 1, 1, 1],
    })

    # Ratings are binary, hence the (0, 1) scale.
    dataset = Dataset.load_from_df(placeholder[["user", "item", "rating"]],
                                   Reader(rating_scale=(0, 1)))
    full_trainset = dataset.build_full_trainset()

    # user_based=False: similarities are computed between items.
    model = KNNWithMeans(sim_options={
        "name": "cosine",
        "user_based": False,
    })
    model.fit(full_trainset)
    return model
def CFM(self):
    """Predict ratings for the users in ``self.list`` and compute NDCG.

    A user-based cosine ``KNNWithMeans`` (k=40, min_k=1) is fitted on
    ``self.trainset``.  For each user id, the matching rows of ``self.data``
    (columns ``uid``, ``lid``, ``rate``) are predicted, with each prediction
    printed because ``verbose=True``.  The results are stored in
    ``self.df_est`` (columns ``uid``, ``Iid``, ``r_ui``, ``est``), the
    distinct user ids in ``self.arr`` and the result of
    ``self.Calculate_NDCG()`` in ``self.CFWM_ndcg_``.
    """
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)
    algo.fit(self.trainset)

    u_id = []
    I_id = []
    true_ratings = []
    estimates = []
    for uid in self.list:
        # Filter once per user instead of building the same frame twice.
        user_rows = self.data[self.data.uid == uid]
        # NOTE(review): the last row of every user is left out, exactly as
        # the previous ``range(1, len(...))`` / ``[i - 1:i]`` indexing did -
        # confirm whether that is intended or an off-by-one.
        scored_rows = user_rows.iloc[:-1]
        for lid, r_ui in zip(scored_rows.lid.values,
                             scored_rows.rate.values):
            pred = algo.predict(uid, lid, r_ui, verbose=True)
            u_id.append(int(pred.uid))
            I_id.append(int(pred.iid))
            true_ratings.append(pred.r_ui)
            estimates.append(pred.est)

    # Lists are converted once; appending to arrays inside the loop copied
    # the whole array on every prediction.
    self.df_est = pd.DataFrame({
        'uid': u_id,
        'Iid': I_id,
        'r_ui': np.array(true_ratings, dtype=float),
        'est': np.array(estimates, dtype=float)
    })
    self.arr = self.df_est['uid'].unique()
    self.CFWM_ndcg_ = self.Calculate_NDCG()
def load_data():
    """Fit an item-based KNNWithMeans and grid-search SVD on ml-100k.

    The KNN model (msd similarity) is fitted on the full trainset; its
    result is not used afterwards.  A 3-fold ``GridSearchCV`` over ``SVD``
    then explores ``n_epochs``, ``lr_all`` and ``reg_all`` and the best RMSE
    is printed.
    """
    dataset = Dataset.load_builtin('ml-100k')

    # Neighbourhood model on the complete dataset.
    knn = KNNWithMeans(sim_options={"name": "msd", "user_based": False})
    knn.fit(dataset.build_full_trainset())

    # Grid search for the matrix-factorization model.
    svd_grid = {
        "n_epochs": [5, 10],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.4, 0.6]
    }
    print("Divide matrix in grids")
    search = GridSearchCV(SVD, param_grid=svd_grid, measures=["rmse"], cv=3)
    search.fit(dataset)
    print(search.best_score['rmse'])
def rank_predictions(model_name):
    """Plot average precision and recall for list sizes 1 to 25.

    For every size ``t`` the selected model is evaluated with a fresh
    10-fold split of the module-level ``data``; per-fold precision and
    recall come from ``precision_recall(predictions, t)`` and are averaged
    over users, then over folds.  Three plots are shown: precision vs size,
    recall vs size and precision vs recall.

    :param model_name: ``'KNN'`` selects ``KNNWithMeans`` (k=22,
        pearson_baseline), ``'NNMF'`` selects ``NMF`` (20 factors); any
        other value selects ``SVD`` (26 factors).
    :return: ``(precision_arr, recall_arr)``, one entry per size.
    """
    k_KNN = 22
    k_NNMF = 20
    k_MF = 26
    if model_name == 'KNN':
        sim_options = {
            'name': 'pearson_baseline',
            'shrinkage': 0
        }
        model = KNNWithMeans(k_KNN, sim_options=sim_options)
    elif model_name == 'NNMF':
        model = NMF(n_factors=k_NNMF)
    else:
        model = SVD(n_factors=k_MF)

    sizes = range(1, 26)
    precision_arr = []
    recall_arr = []
    for t in sizes:
        kf = KFold(n_splits=10)
        print(t)
        p = []
        r = []
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            predictions = model.test(testSet)
            precisions, recalls = precision_recall(predictions, t)
            p.append(sum(precisions.values()) / len(precisions))
            r.append(sum(recalls.values()) / len(recalls))
        precision_arr.append(np.mean(p))
        recall_arr.append(np.mean(r))

    # precision vs t
    plt.plot(list(sizes), precision_arr)
    plt.xlabel("Size")
    plt.ylabel("Precision")
    plt.title("The average precision plot using " + model_name)
    plt.show()

    # recall vs t
    plt.plot(list(sizes), recall_arr)
    plt.xlabel("Size")
    plt.ylabel("Recall")
    # The title used to read "using MF <model>" for every model.
    plt.title("The average recall plot using " + model_name)
    plt.show()

    # precision vs recall
    plt.plot(recall_arr, precision_arr)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("The average precision and recall plot using " + model_name)
    plt.show()

    return precision_arr, recall_arr
def get_rec_sys_resources(df_reviews):
    """Fit an item-based Pearson KNNWithMeans on an 80/20 split of reviews.

    The split is produced by ``train_test_from_df`` from the columns
    ``reviewerID``, ``asin`` and ``overall``.

    :param df_reviews: reviews frame handed to ``train_test_from_df``.
    :return: ``(algo, similarities, trainset, testset)`` where
        ``similarities`` is the result of ``algo.compute_similarities()``
        called after fitting.
    """
    rating_columns = ['reviewerID', 'asin', 'overall']
    trainset, testset = train_test_from_df(df_reviews,
                                           rating_columns,
                                           test_size=0.2)

    algo = KNNWithMeans(sim_options={'name': 'pearson', 'user_based': False})
    algo.fit(trainset)

    similarities = algo.compute_similarities()
    return algo, similarities, trainset, testset
def train():
    """Fit an item-based cosine KNNWithMeans and export it with joblib.

    Ratings come from the module-level ``book_rating_ds`` (columns ``user``,
    ``item``, ``rating``, read on a 1-10 scale).  The fitted model is written
    compressed to ``rec.pkl`` inside ``PICKLES_PATH``.
    """
    dataset = Dataset.load_from_df(book_rating_ds[['user', 'item', 'rating']],
                                   Reader(rating_scale=(1, 10)))

    recommender = KNNWithMeans(sim_options={
        "name": "cosine",
        "user_based": False
    })
    recommender.fit(dataset.build_full_trainset())

    # Export the model.
    joblib.dump(recommender,
                os.path.join(PICKLES_PATH, "rec.pkl"),
                compress=True)
def run(self):
    """Recommend the five best-scoring places for ``self.user``.

    Ratings are read from ``rating_final.csv`` (0-2 scale).  For every
    distinct place a user-based cosine ``KNNWithMeans`` (``min_support`` 9)
    is refitted on each of 5 folds and the estimates for ``self.user`` are
    averaged.  The five entries at the end of the sorted ``ArrayList`` are
    mapped to names from ``geoplaces2.csv`` and ranked starting at 1, taking
    the last entry first.  The mean cross-validated RMSE is printed, formatted
    to 3 decimals.

    :return: ``inArray`` of the ``ArrayList`` holding ``(rank, name)`` entries.
    """
    ratings = pd.read_csv('rating_final.csv')
    df = pd.DataFrame({
        "userID": list(ratings.userID),
        "placeID": list(ratings.placeID),
        "rating": list(ratings.rating)
    })
    data = Dataset.load_from_df(df[["userID", "placeID", "rating"]],
                                Reader(rating_scale=(0, 2)))

    sim_options = {
        "name": "cosine",
        "user_based": True,
        "min_support": 9
    }
    # Cross-validation iterator shared by all places.
    kf = KFold(n_splits=5)
    algo = KNNWithMeans(sim_options=sim_options)

    ordered = ArrayList()
    for place_id in list(df['placeID'].unique()):
        total = 0
        for trainset, testset in kf.split(data):
            algo.fit(trainset)
            # Predicted rating of this place on the current fold.
            total += algo.predict(self.user, place_id, verbose=False).est
        # Average of the estimates over the 5 folds.
        ordered.append(place_id, total / 5)
    ordered.sort()
    highest = ordered.inArray[ordered.count - 5:ordered.count]

    place = pd.read_csv('geoplaces2.csv')
    finalRec = ArrayList()
    rank = 0
    for idx in reversed(range(len(highest))):
        rank += 1
        matches = place[place["placeID"].unique() == highest[idx].id]
        finalRec.append(rank, list(matches['name'])[0])

    # Print the accuracy score.
    out = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)
    print('{:.3f}'.format(np.mean(out['test_rmse'])))
    return finalRec.inArray
def train_surprise_model():
    """Fit an item-based KNNWithMeans on the reduced reviews and persist it.

    Steps:
    1. Load the reviews with ``import_reduced_reviews``, keep ``user_key``,
       ``game_key`` and ``rating`` and drop duplicate (game, user) pairs.
    2. Fit ``KNNWithMeans`` (cosine, item-based, k=10, min_k=5) on the full
       trainset and print the fitting time.
    3. Replace ``algo.sim`` with the similarity matrix read from CSV,
       predicting the same pair before and after the swap as a smoke test.
    4. Write the similarity matrix now held by the model to CSV and dump the
       model with Surprise's ``dump.dump``.
    """
    # Import the reduced dataset.
    df = import_reduced_reviews(
        'C:/Users/lukas/OneDrive/Desktop/Reviews_Reduced.csv')
    df = df[['user_key', 'game_key', 'rating']]
    df = df.drop_duplicates(subset=['game_key', 'user_key'])

    # Get the data into a format Surprise can work with.
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Build the trainset from the whole dataset.
    trainsetfull = data.build_full_trainset()
    print('Number of users: ', trainsetfull.n_users, '\n')
    print('Number of items: ', trainsetfull.n_items, '\n')

    sim_option = {'name': 'cosine', 'user_based': False}
    k = 10
    min_k = 5
    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)

    start_time = time.time()
    algo.fit(trainsetfull)
    print("--- %s seconds ---" % (time.time() - start_time))

    # Test: is it possible to exchange the sim matrix?
    sim_matrix_imported = pd.read_csv(
        '../Data/Recommender/selfmade_item-item-similarity-matrix.csv',
        index_col=0)
    sim_matrix_imported.columns = sim_matrix_imported.columns.astype(int)
    sim_matrix_imported = sim_matrix_imported.to_numpy()

    # The prediction results are not used; the calls only check that
    # predicting works with both matrices.
    algo.predict(93681, 100007)
    algo.sim = sim_matrix_imported
    algo.predict(93681, 100007)

    # Save the similarity matrix the model now uses.
    pd.DataFrame(algo.sim).to_csv(
        '../Data/Recommender/sim_matrix-myKNNWithMeans_item_based_model')

    # Save the precomputed model.  ``algo`` must be passed by keyword: the
    # second positional parameter of ``dump.dump`` is ``predictions``, so the
    # positional call stored the model in the predictions slot.
    dump.dump('../Data/Recommender/myKNNWithMeans_item_based_model',
              algo=algo)
def binary_value(data, threshold):
    """Binarise true and predicted ratings of a KNNWithMeans (k=30) model.

    ``data`` is split 90% / 10%; the model is fitted on the training part
    and used to predict the test part.  A rating becomes 1 when it is
    strictly greater than ``threshold`` and 0 otherwise.

    :param data: dataset handed to ``train_test_split``.
    :param threshold: cut-off applied to both true and predicted ratings.
    :return: ``(actual, predicted)`` lists of 0/1 in test order.
    """
    train_part, test_part = train_test_split(data, test_size=.1)

    knn = KNNWithMeans(k=30)
    knn.fit(train_part)
    results = knn.test(test_part)

    # Index 2 of a prediction is the real value, index 3 the predicted one.
    actual = [1 if entry[2] > threshold else 0 for entry in results]
    predicted = [1 if entry[3] > threshold else 0 for entry in results]
    return actual, predicted
def CFM(self):
    """Predict ratings for the users in ``self.list``; return the last one.

    A user-based cosine ``KNNWithMeans`` (k=40, min_k=1) is fitted on
    ``self.trainset``.  For each user id, the matching rows of ``self.data``
    (columns ``uid``, ``lid``, ``rate``) are predicted, with each prediction
    printed because ``verbose=True``.

    :return: the last prediction made, or ``None`` when no row was
        predicted (previously that case raised ``UnboundLocalError``).
    """
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)
    algo.fit(self.trainset)

    pred = None
    for uid in self.list:
        # Filter once per user instead of building the same frame twice.
        user_rows = self.data[self.data.uid == uid]
        # NOTE(review): the last row of every user is left out, exactly as
        # the previous ``range(1, len(...))`` / ``[i - 1:i]`` indexing did -
        # confirm whether that is intended or an off-by-one.
        scored_rows = user_rows.iloc[:-1]
        for lid, r_ui in zip(scored_rows.lid.values,
                             scored_rows.rate.values):
            pred = algo.predict(uid, lid, r_ui, verbose=True)
    return pred
def trim_performance(qNum,maxk=0):
    """Plot the average RMSE on a trimmed test set for a range of ``k``.

    ``qNum`` selects both the trimmed movie set returned by ``trimMovies()``
    and the model: 12/13/14 use ``KNNWithMeans`` (Pearson) on the popular /
    unpopular / high-variance set, 19/20/21 use ``NMF`` on the same sets.
    For every even ``k`` from 2 to ``maxk`` the model is evaluated with a
    10-fold split of the module-level ``data``; each test fold is reduced to
    the ratings whose item id is in the trimmed set.

    :param qNum: question number; must be a key of the table below
        (otherwise ``KeyError``).
    :param maxk: largest ``k`` to try; 0 means 100 for KNN questions and 50
        for NMF questions.
    :return: the smallest average RMSE over all ``k``.
    """
    pop, unpop, highVar = trimMovies()
    if maxk == 0:
        if 12 <= qNum <= 14:
            maxk = 100
        elif 19 <= qNum <= 21:
            maxk = 50

    trim_Model = {
        12: (pop, 'KNNWithMeans'),
        13: (unpop, 'KNNWithMeans'),
        14: (highVar, 'KNNWithMeans'),
        19: (pop, 'NMF'),
        20: (unpop, 'NMF'),
        21: (highVar, 'NMF'),
    }
    trimSet, modelName = trim_Model[qNum]

    kf = KFold(n_splits=10)
    k_values = range(2, maxk + 1, 2)
    RMSE = []
    for k in k_values:
        print('-' * 20 + 'k = ' + str(k) + ' ' + '-' * 20)
        if modelName == 'KNNWithMeans':
            model = KNNWithMeans(k=k, sim_options={'name': 'pearson'})
        elif modelName == 'NMF':
            model = NMF(n_factors=k)

        subRMSE = []
        for fold, (trainSet, testSet) in enumerate(kf.split(data), start=1):
            model.fit(trainSet)
            testSet = list(filter(lambda x: int(x[1]) in trimSet, testSet))
            # The size is interpolated into the message; the old call passed
            # it as a second print argument and left a literal "%d" behind.
            print("Split %d: test set size after trimming: %d"
                  % (fold, len(testSet)))
            predictions = model.test(testSet)
            subRMSE.append(accuracy.rmse(predictions, verbose=True))
        RMSE.append(np.mean(subRMSE))

    plt.figure()
    plt.plot(list(k_values), RMSE)
    plt.xlabel("k")
    plt.ylabel("Average RMSE")
    plt.title("Q"+str(qNum)+": Average RMSE Along k")
    plt.show()

    best_rmse = min(RMSE)
    print(best_rmse)
    return best_rmse
def solve_item_item(pathw, target_uid='615'):
    """Return the top-10 recommended item ids for one user.

    An item-based ``KNNWithMeans`` (k=50, pearson_baseline) is fitted on all
    ratings in ``pathw``; ratings are then predicted for every (user, item)
    pair that is not in the training set and ``get_top_n`` keeps the 10 best
    per user.

    :param pathw: path of a comma-separated ``user item rating timestamp``
        file.
    :param target_uid: raw user id to return recommendations for; defaults
        to the previously hard-coded ``'615'``.
    :return: list of item ids for ``target_uid``, or ``None`` when that user
        has no entry in the top-N result.
    """
    reader = Reader(line_format='user item rating timestamp', sep=',')
    data = Dataset.load_from_file(pathw, reader=reader)
    # The former ``data.split(n_folds=5)`` call is gone: the folds were never
    # used because the model is fitted on the full trainset, and the method
    # no longer exists in current Surprise releases.

    algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
    trainset = data.build_full_trainset()
    algo.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    top_n = get_top_n(predictions, n=10)
    user_ratings = top_n.get(target_uid)
    if user_ratings is None:
        return None
    return [iid for (iid, _) in user_ratings]
def CFM(self):
    """Print precision, recall and F1 of KNNWithMeans for each of 5 folds.

    A user-based cosine ``KNNWithMeans`` (k=40, min_k=1) is refitted on
    every fold of ``self.data``; per-user precision and recall come from
    ``self.precision_recall_at_k`` and are averaged over users.
    """
    kf = KFold(n_splits=5)
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)

    for trainset, testset in kf.split(self.data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        precisions, recalls = self.precision_recall_at_k(predictions)

        P = sum(precisions.values()) / len(precisions)
        R = sum(recalls.values()) / len(recalls)
        # F1 is reported as 0 when both precision and recall are 0, instead
        # of failing with a ZeroDivisionError.
        F1 = 2 * P * R / (P + R) if (P + R) else 0.0

        print("Precision : ", P)
        print("Recall    : ", R)
        print("F1        : ", F1)
def ComputeCollaborativeFiltering_User_User(recipe_df,
                                            train_rating_df,
                                            pd,
                                            benchmark,
                                            knnmeans=False):
    """Evaluate user-based cosine collaborative filtering on recipe ratings.

    The two frames are inner-joined on ``recipe_id``; the columns
    ``user_id``, ``recipe_id`` and ``rating`` (1-5 scale) are split 80% / 20%
    with ``random_state=0``.  The test predictions are handed to
    ``Evaluators.RunAllEvals`` together with ``benchmark``.

    :param recipe_df: recipes frame containing ``recipe_id``.
    :param train_rating_df: ratings frame containing ``recipe_id``.
    :param pd: object providing ``merge`` (the pandas module is expected).
    :param benchmark: passed through to ``Evaluators.RunAllEvals``.
    :param knnmeans: use ``KNNWithMeans`` when true, ``KNNBasic`` otherwise.
    """
    print("\n###### Compute CollaborativeFiltering_User_User ######")
    merged = pd.merge(recipe_df, train_rating_df, on='recipe_id', how='inner')

    dataset = Dataset.load_from_df(merged[['user_id', 'recipe_id', 'rating']],
                                   Reader(rating_scale=(1, 5)))
    train_part, test_part = train_test_split(dataset,
                                             test_size=.2,
                                             random_state=0)

    # user_based=True: similarities are computed between users.
    similarity = {'name': 'cosine', 'user_based': True}
    algo_class = KNNWithMeans if knnmeans else KNNBasic
    model = algo_class(sim_options=similarity, verbose=False)
    model.fit(train_part)

    Evaluators.RunAllEvals(model.test(test_part), benchmark)
def algoProdToProd():
    """Fit an item-based cosine KNNWithMeans on product co-occurrences.

    The module-level ``productTable`` supplies ``Product_ID1``,
    ``Product_ID2`` and ``Frequency``; the rating scale runs from 0 to the
    largest ``Frequency`` value.

    :return: the fitted algorithm.
    """
    frequency_scale = (0, productTable['Frequency'].max())
    dataset = Dataset.load_from_df(
        productTable[["Product_ID1", "Product_ID2", "Frequency"]],
        Reader(rating_scale=frequency_scale))

    # user_based=False: similarities are computed between items.
    model = KNNWithMeans(sim_options={
        "name": "cosine",
        "user_based": False,
    })
    model.fit(dataset.build_full_trainset())
    return model
from surprise import KNNWithMeans
from surprise import Dataset, print_perf, Reader
from surprise.model_selection import cross_validate
import os

# Path of the ratings file.
file_path = os.path.expanduser('mydata.csv')

# Describe the layout of each line to the reader.
reader = Reader(line_format='user item rating', sep=',')

# Load the data and train on all of it.
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()

# Use user_based true/false to switch between user-based or item-based
# collaborative filtering; only the k most similar neighbours are used.
algo = KNNWithMeans(k=50, sim_options={'user_based': False})
algo.fit(trainset)

# Query predictions for raw user id 5 on raw item ids 1 and 5.
uid = str(5)
for iid in (str(1), str(5)):
    pred = algo.predict(uid, iid)
    print('rating of user-{0} to item-{1} is '.format(uid, iid), pred.est)