def KNNPred(data):
    """Train two KNN-with-means models and predict ratings for the test pairs.

    One model uses the tuned similarity options from ``model_params[0]``, the
    other uses surprise defaults. For cold-start items the user's average
    rating is substituted for the model estimate in both prediction lists.

    Returns:
        [predictions_with_model_params, predictions_with_defaults,
         tuned_model, default_model]
    """
    print("\nTraining KNN Means model..\n")
    global x_test, y_test, testlen, trainlen, y_train, model_params, X, Y, avg_rat, cold_itm
    options = model_params[0]
    tuned_model = KNNWithMeans(sim_options=options)
    default_model = KNNWithMeans()
    trainset = data.build_full_trainset()
    tuned_model.fit(trainset)
    print("\nTraining done..\nPrediction started..")
    default_model.fit(trainset)
    y_pred_w_m = [0] * testlen
    y_pred_wo_m = [0] * testlen
    for idx, pair in enumerate(x_test):
        user, item = pair[0], pair[1]
        if item - 1 in cold_itm:
            # Cold-start item: fall back to the user's average rating.
            y_pred_w_m[idx] = avg_rat[user - 1]
            y_pred_wo_m[idx] = avg_rat[user - 1]
        else:
            y_pred_w_m[idx] = tuned_model.predict(user, item).est
            y_pred_wo_m[idx] = default_model.predict(user, item).est
    print("\nPrediction done..\n")
    return [y_pred_w_m, y_pred_wo_m, tuned_model, default_model]
def KNN_train(self, k=20, options=None):
    """Train one KNNWithMeans model on the overall rating and one per criterion.

    Args:
        k: maximum number of neighbours for KNNWithMeans (default 20).
        options: surprise ``sim_options`` dict; defaults to Pearson
            similarity with the item-based method.

    Side effects:
        Populates ``self.algos`` as [total_algo, algo_c1, ..., algo_cN].

    Fixes vs. original: intermediates were stored by mutating ``locals()``,
    whose effect is undefined per the Python docs — replaced with a plain
    dict; the mutable default argument for ``options`` was replaced with
    ``None`` plus an in-body default.
    """
    if options is None:
        options = {'name': 'pearson', 'user_based': False}
    self.algos = []
    df = self.trainDatas
    names = {}
    r = Reader(rating_scale=(1, 5))
    # Load, then train on the overall ("total") rating column.
    total = Dataset.load_from_df(df[['uid', 'iid', 'total']], reader=r)
    total_train = total.build_full_trainset()
    total_algo = KNNWithMeans(k, sim_options=options)
    total_algo.fit(total_train)
    self.algos.append(total_algo)
    # One model per individual criterion column c1..cN.
    for i in range(1, self.no_of_criteria + 1):
        crit = 'c' + str(i)
        names[crit] = Dataset.load_from_df(df[['uid', 'iid', crit]], reader=r)
        names[crit + '_train'] = names[crit].build_full_trainset()
        names['algo_' + crit] = KNNWithMeans(k, sim_options=options)
        names['algo_' + crit].fit(names[crit + '_train'])
        self.algos.append(names['algo_' + crit])
def select_model(loaded_data, model_selection='user_user'):
    """Return a recommender for the requested model type.

    'user_user' (default) and 'item_item' build KNNWithMeans collaborative
    filters; any other value delegates to matrix factorization.
    """
    if model_selection in ('user_user', 'item_item'):
        sim_options = {'name': 'pearson_baseline',
                       'user_based': model_selection == 'user_user'}
        algo = KNNWithMeans(k=50, sim_options=sim_options)
    else:
        algo = mf.matrix_factorization_param(loaded_data)
    print(algo)
    return algo
def randomize():
    """Return one random (name, algorithm) pair from the candidate pool.

    Bug fix: the original used ``random.randint(0, len(algorithms))``, whose
    upper bound is *inclusive*, so it could produce ``len(algorithms)`` and
    raise IndexError. ``random.choice`` samples uniformly over valid entries.
    """
    sim_options_cosine = {'name': 'cosine', 'user_based': False}
    sim_options_msd = {'name': 'msd', 'user_based': False}
    sim_options_pearson = {'name': 'pearson', 'user_based': False}
    sim_options_baseline = {
        'name': 'pearson_baseline',
        'user_based': False,
        'shrinkage': 0
    }
    algorithms = [
        ('kNN Basic - Cosine', KNNBasic(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Basic - MSD', KNNBasic(sim_options=sim_options_msd, verbose=False)),
        ('kNN Basic - Pearson', KNNBasic(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Basic - Pearson B', KNNBasic(sim_options=sim_options_baseline, verbose=False)),
        ('kNN Means - Cosine', KNNWithMeans(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Means - MSD', KNNWithMeans(sim_options=sim_options_msd, verbose=False)),
        ('kNN Means - Pearson', KNNWithMeans(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Means - Pearson B', KNNWithMeans(sim_options=sim_options_baseline, verbose=False)),
        ('kNN Z - Cosine', KNNWithZScore(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Z - MSD', KNNWithZScore(sim_options=sim_options_msd, verbose=False)),
        ('kNN Z - Pearson', KNNWithZScore(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Z - Pearson B', KNNWithZScore(sim_options=sim_options_baseline, verbose=False)),
        ('kNN Baseline - Cosine', KNNBaseline(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Baseline - MSD', KNNBaseline(sim_options=sim_options_msd, verbose=False)),
        ('kNN Baseline - Pearson', KNNBaseline(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Baseline - Pearson B', KNNBaseline(sim_options=sim_options_baseline, verbose=False)),
        ('SVD', SVD(verbose=False)),
        ('SVDpp', SVDpp(verbose=False)),
        ('Baseline Only', BaselineOnly(verbose=False)),
        ('CoClustering', CoClustering(verbose=False)),
        ('SlopeOne', SlopeOne()),
        ('NMF', NMF(verbose=False))
    ]
    return random.choice(algorithms)
def generate_knn(self, rating_data):
    """Build untuned user-based cosine KNN variants plus a tuned KNNBaseline.

    Tuning may take a long time; it is easier to comment out the tuning part
    if needed.

    Args:
        rating_data: the main data set.

    Returns:
        dict mapping algorithm name -> algorithm object.
    """
    algo = {
        'bcKNN': KNNBasic(sim_options={'name': 'cosine', 'user_based': True}),
        'wmKNN': KNNWithMeans(sim_options={'name': 'cosine', 'user_based': True}),
        'wzKNN': KNNWithZScore(sim_options={'name': 'cosine', 'user_based': True}),
        'blKNN': KNNBaseline(sim_options={'name': 'cosine', 'user_based': True}),
    }
    # KNNBaseline had the best accuracy, so tune its neighbourhood size.
    param_grid_bl = {'k': [10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 100]}
    best_params_bl = self.tune_and_find_parameter('blKNN', KNNBaseline,
                                                  rating_data, param_grid_bl)
    algo['blKNN_tuned'] = KNNBaseline(k=best_params_bl['k'])
    return algo
def check_for_args():
    """Append one algorithm instance to ``alg_list`` per recognised CLI argument.

    Scans ``sys.argv`` for known surprise algorithm names; unknown arguments
    (including the script name itself) are ignored. Returns ``alg_list``.

    Improvement: the long if/elif chain was replaced by a name -> class
    dispatch table with identical behavior.
    """
    factories = {
        'SVD': SVD,
        'SVDpp': SVDpp,
        'SlopeOne': SlopeOne,
        'NMF': NMF,
        'NormalPredictor': NormalPredictor,
        'KNNBaseline': KNNBaseline,
        'KNNBasic': KNNBasic,
        'KNNWithMeans': KNNWithMeans,
        'KNNWithZScore': KNNWithZScore,
        'BaselineOnly': BaselineOnly,
        'CoClustering': CoClustering,
    }
    for arg in sys.argv:
        if arg in factories:
            alg_list.append(factories[arg]())
    return alg_list
def EvaluateDifferentAlgorithms():
    """Cross-validate a panel of surprise algorithms on ``data_6months`` and
    print the mean RMSE per algorithm, sorted ascending.

    Bug fix: ``pd.Series.append`` was removed in pandas 2.0; the algorithm
    name is now attached with ``pd.concat``.
    """
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(),
            KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(),
            BaselineOnly(), CoClustering()
    ]:
        # Perform cross validation
        results = cross_validate(algorithm, data_6months, measures=['RMSE'],
                                 cv=3, verbose=False)
        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        name = str(algorithm).split(' ')[0].split('.')[-1]
        tmp = pd.concat([tmp, pd.Series([name], index=['Algorithm'])])
        benchmark.append(tmp)
    print(pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse'))
def plot_ROC(qNum, k, thresh=(2.5, 3, 3.5, 4)):
    """Plot one ROC curve per rating threshold for a KNNWithMeans model.

    Args:
        qNum: question number; the KNN model is only built for qNum == 15.
        k: neighbourhood size for KNNWithMeans.
        thresh: iterable of rating cut-offs used to binarise true ratings.

    Fixes vs. original: the rating scale was bound to a local named
    ``range``, shadowing the builtin — renamed; the mutable default list for
    ``thresh`` is now a tuple.

    NOTE(review): ``predictions`` is only assigned when qNum == 15; any other
    qNum raises NameError below. Presumably other questions are handled
    elsewhere — confirm against callers.
    """
    rating_scale = 5.0
    trainset, testset = train_test_split(data, test_size=0.1)
    if qNum == 15:
        model = KNNWithMeans(k=k, sim_options={'name': 'pearson'})
        model.fit(trainset)
        predictions = model.test(testset)
    for thrs in thresh:
        y = np.array([])
        scores = np.array([])
        for u, i, t, est, d in predictions:
            # Binarise the true rating at the current threshold.
            y = np.append(y, 1 if t >= thrs else 0)
            scores = np.append(scores, est / rating_scale)
        fpr, tpr, thresholds = metrics.roc_curve(y, scores)
        roc_auc = metrics.auc(fpr, tpr)
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Threshold = ' + str(thrs))
        plt.show()
        print("auc = " + str(roc_auc))
def generate_knn(self, rating_data):
    """Return a dict of user-based cosine KNN variants plus a tuned KNNBaseline."""
    algo = {}
    for key, knn_cls in (('bcKNN', KNNBasic), ('wmKNN', KNNWithMeans),
                         ('wzKNN', KNNWithZScore), ('blKNN', KNNBaseline)):
        algo[key] = knn_cls(sim_options={'name': 'cosine', 'user_based': True})
    # Tune the neighbourhood size for KNNBaseline, the most accurate variant.
    param_grid_bl = {'k': [10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 100]}
    best_params_bl = self.tune_and_find_parameter('blKNN', KNNBaseline,
                                                  rating_data, param_grid_bl)
    algo.update({'blKNN_tuned': KNNBaseline(k=best_params_bl['k'])})
    return algo
def knnBasico(df, testSize, vecinos, pr, bool):
    """Evaluate KNNWithMeans precision/recall on a ratings dataframe.

    Args:
        df: dataframe with user_id, item_id, rating columns.
        testSize: fraction of data held out for testing.
        vecinos: neighbourhood size k.
        pr: the cut-off passed to precision_recall_at_k.
        bool: value of the sim_options 'user_based' flag.
            (NOTE(review): this parameter shadows the builtin ``bool``;
            kept for interface compatibility with existing callers.)

    Returns:
        (precision, recall) averaged over users, rounded to 3 decimals.
    """
    reader = Reader(rating_scale=(1, 5))
    dataset = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)
    trainset, testset = train_test_split(dataset, test_size=testSize, shuffle=False)
    algo = KNNWithMeans(k=vecinos,
                        sim_options={'name': 'cosine', 'user_based': bool})
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, pr, 4)
    # Average precision and recall over all users.
    precision = round(sum(precisions.values()) / len(precisions), 3)
    recall = round(sum(recalls.values()) / len(recalls), 3)
    return precision, recall
def select_model(user_review):
    """Cross-validate several surprise algorithms on Yelp ratings and print results.

    The ``user_review`` argument is immediately replaced by ``data_prep()``
    (kept as-is for interface compatibility — NOTE(review): confirm whether
    the parameter should actually be used).

    Bug fix: ``pd.Series.append`` was removed in pandas 2.0; the algorithm
    name is now attached with ``pd.concat``.
    """
    user_review = data_prep()
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        user_review[['user_id', 'business_id', 'stars']], reader)
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            KNNBasic(), KNNBaseline(), KNNWithMeans(),
            SVD(), SVDpp(), SlopeOne(), NMF()
    ]:
        print(algorithm)
        print('start ......')
        results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'],
                                 cv=3, verbose=False)
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        name = str(algorithm).split(' ')[0].split('.')[-1]
        tmp = pd.concat([tmp, pd.Series([name], index=['Algorithm'])])
        benchmark.append(tmp)
    print(benchmark)
def test_knn_based(data):
    """Return the test-set MSE of a user-based KNNWithMeans model.

    Args:
        data: dataframe with columns userId, movieId, rating (in that order).

    Returns:
        float: mean squared error on a 10% held-out split (random_state=24).
    """
    reader = Reader(rating_scale=(1, 5))
    surprise_data = Dataset.load_from_df(data, reader)
    trainset, testset = train_test_split(surprise_data, test_size=.10,
                                         random_state=24)
    model = KNNWithMeans(k=5, sim_options={'name': 'pearson_baseline',
                                           'user_based': True})
    model.fit(trainset)
    predictions = model.test(testset)
    return accuracy.mse(predictions, verbose=False)
def work(data, k):
    """Run 5-fold evaluation of KNNWithMeans and return its metric history.

    Returns:
        dict keyed by str(k), each value holding single-element lists for
        precision, recall, f1 and ndcg (column order assumed from
        train_with_Kfold's output — TODO confirm).
    """
    algo = KNNWithMeans(k=k, min_k=1,
                        sim_options={'name': 'cosine', 'user_based': True},
                        verbose=False)
    fold_results = train_with_Kfold(algo, data, 5, False)
    means = fold_results.mean()
    return {
        str(k): {
            "precision": [means[0]],
            "recall": [means[1]],
            "f1": [means[2]],
            "ndcg": [means[3]],
        }
    }
def knn_m(data, training, testing):
    """Tune KNNWithMeans, then compute RMSE and top-n items on the test set.

    Args:
        data (Dataset): the whole dataset divided into 5 folds.
        training (Dataset): training dataset.
        testing (Dataset): test dataset.

    Returns:
        rmse: RMSE of KNNWithMeans with optimized parameters.
        top_n: top-5 predictions per user from get_top_n.

    Bug fix: the tuned similarity settings were passed to KNNWithMeans as
    top-level keyword arguments (name=/min_support=/user_based=), which
    surprise silently ignores — they must be wrapped in the ``sim_options``
    dict, as done when grid-searching above.
    """
    # candidate parameters
    knn_param_grid = {'k': [5, 10, 20],
                      'sim_options': {'name': ['msd', 'cosine', 'pearson'],
                                      'min_support': [1, 5],
                                      'user_based': [False]}}
    # optimize parameters
    knnm_grid_search = GridSearch(KNNWithMeans, knn_param_grid,
                                  measures=['RMSE'], verbose=False)
    knnm_grid_search.evaluate(data)
    param = knnm_grid_search.best_params['RMSE']
    print('KNNWithMeans:', param)
    # fit model using the optimized parameters (passed via sim_options)
    knnm = KNNWithMeans(k=param['k'], sim_options=param['sim_options'])
    knnm.train(training)  # old surprise API, matching GridSearch usage above
    # evaluate the model using test data
    predictions = knnm.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)
    return rmse, top_n
def __build_model(self):
    """Load a cached KNN recommender if one exists; otherwise train, persist,
    and report its RMSE on the held-out test set."""
    model_path = '{}{}'.format(self.file_prefix, self.model_path)
    try:
        cached = joblib.load(model_path)
    except Exception:
        # No usable cache — fall through and train from scratch.
        print('recommender does not exist, build new recommender')
    else:
        print('recommender exists, load it')
        return cached
    # initialize an item-based KNN recommender
    algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline',
                                           'user_based': False})
    algo.fit(self.trainset)
    # persist for the next run
    joblib.dump(algo, model_path)
    # validation on the test split
    test_pred = algo.test(self.testset)
    accuracy.rmse(test_pred)
    return algo
def recommender_knn_baseline(self, train_file, test_file, output):
    """Train a default KNNWithMeans model and print its RMSE/MAE on the test set.

    Returns the fitted model. (The sim_options 'user_based' flag would switch
    between user-based and item-based CF; defaults are used here.)
    """
    train, test, train_dataset, test_dataset = prepare_datasets(
        train_file, test_file)
    model = KNNWithMeans(verbose=False)
    model.fit(train)
    preds = model.test(test, verbose=False)
    print('KNN_BASELINE: ' + ' RMSE ' + str(rmse(preds, verbose=False)) +
          ' MAE ' + str(mae(preds, verbose=False)))
    return model
def main():
    """Train KNNWithMeans on ml-100k, print RMSE, and plot the error histogram."""
    # Load the movielens-100k dataset and split 85% train / 15% test.
    data = Dataset.load_builtin('ml-100k')
    trainset, testset = train_test_split(data, test_size=.15)
    algo = KNNWithMeans()
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    # Delta between each true rating and its estimate.
    deltas = [prediction.r_ui - prediction.est for prediction in predictions]
    plt.hist(deltas, 100)
    plt.show()
def get_model(model_name):
    """Build a surprise algorithm from an underscore-encoded model name.

    KNN form: '<KNNClass>[_<U|I>[_<sim>[_<k>]]]' (user-based unless 'I');
    factor form: '<SVD|SVDpp|NMF>[_<n_factors>]'. Returns None when the
    name matches no known class.
    """
    algo = None
    if 'KNN' in model_name:
        parts = model_name.split('_')
        knn_name = parts[0]
        user_based = not (len(parts) > 1 and parts[1] == 'I')
        dis_method = parts[2] if len(parts) >= 3 else 'msd'
        k = int(parts[3]) if len(parts) >= 4 else 20
        sim_options = {'user_based': user_based, 'name': dis_method}
        knn_classes = {'KNNBasic': KNNBasic,
                       'KNNWithMeans': KNNWithMeans,
                       'KNNWithZScore': KNNWithZScore}
        if knn_name in knn_classes:
            algo = knn_classes[knn_name](sim_options=sim_options, k=k)
    elif 'SVDpp' in model_name or 'SVD' in model_name or 'NMF' in model_name:
        parts = model_name.split('_')
        n_factors = int(parts[1]) if len(parts) > 1 else 25
        factor_classes = {'SVDpp': SVDpp, 'SVD': SVD, 'NMF': NMF}
        if parts[0] in factor_classes:
            algo = factor_classes[parts[0]](n_factors=n_factors)
    return algo
def cal_KNNWithMeans(trainset, df):
    """Fit a cosine KNNWithMeans model and dump per-row predictions to CSV.

    Args:
        trainset: surprise trainset to fit on.
        df: dataframe with 'user', 'store' and 'stars' columns to predict for.

    Bug fixes:
      * The sim_options key was spelled 'user-based' (hyphen); surprise
        silently ignores unknown keys and falls back to the default, so the
        intended flag never applied. Corrected to 'user_based'.
      * Predictions were requested from an undefined name ``algo`` (NameError
        at runtime); corrected to the fitted model ``algo_knnm``.
    """
    sim_options = {'name': 'cosine', 'user_based': True}
    algo_knnm = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)
    algo_knnm.fit(trainset)
    users = []
    items = []
    real = []
    estimate = []
    for i in range(len(df)):
        row = df[i:i + 1]
        uid = row.user.values[0]
        users.append(uid)
        iid = row.store.values[0]
        items.append(iid)
        r_ui = row.stars.values[0]
        real.append(r_ui)
        pred = algo_knnm.predict(uid, iid, r_ui, verbose=True)
        estimate.append(pred)
    print("end")
    # Assemble the KNN-with-means prediction table.
    df4 = pd.DataFrame(columns=['user', 'item', 'r_ui', 'est'])
    df4['user'] = users
    df4['item'] = items
    df4['r_ui'] = real
    df4['est'] = estimate
    # The Prediction tuple carries the estimate in the second-to-last slot.
    df4['est'] = df4['est'].apply(lambda x: x[-2])
    df4['err'] = abs(df4.est - df4.r_ui)
    df4.to_csv(save_file2)
def main():
    """Fit KNNWithMeans on ml-100k, report RMSE, and show the residual histogram."""
    # movielens-100k, split into 85% train / 15% test.
    movielens_ds = Dataset.load_builtin('ml-100k')
    trainset, testset = train_test_split(movielens_ds, test_size=.15)
    algo = KNNWithMeans()
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    # Residual = true rating minus estimate, one per test prediction.
    residuals = [p.r_ui - p.est for p in predictions]
    plt.hist(residuals, 100)
    plt.show()
def __init__(self, train_data, model_to_use=("baselineonly", "svd", "coClustering", "knn")):
    """Initialise the ensemble with a full trainset and a set of base models.

    Args:
        train_data: surprise Dataset; converted to a full trainset.
        model_to_use: iterable of keys into the available-model catalogue.
            (Fix: the default was a mutable list shared across calls; it is
            now an equivalent tuple.)
    """
    AlgoBase.__init__(self)
    self.available_models = {
        "baselineonly": BaselineOnly(
            bsl_options={
                "method": "sgd",
                "n_epochs": 30,
                "reg": 0.1,
                "learning_rate": 0.005
            }),
        "svd": SVD(lr_all=0.005, n_factors=50, reg_all=0.1),
        "coClustering": CoClustering(n_epochs=3, n_cltr_u=3, n_cltr_i=3),
        "knn": KNNWithMeans(k=40, sim_options={
            "name": "cosine",
            "user_based": False
        }),
    }
    # [name, algorithm] pairs for each requested base model.
    self.model_selection = [[model, self.available_models[model]]
                            for model in model_to_use]
    self.model_rmse = {}
    self.model_mae = {}
    self.model_list = {}
    self.trainset = train_data.build_full_trainset()
def benchmark(data):
    """Cross-validate a panel of algorithms and persist the RMSE/MAE/FCP table.

    Bug fix: ``pd.Series.append`` was removed in pandas 2.0; the algorithm
    name is now attached with ``pd.concat``.
    """
    performance = []
    algorithms = [
        SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(),
        KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(),
        BaselineOnly(), CoClustering(),
        SVD_SGD_momentum(), SVDpp_SGD_momentum()
    ]
    for algorithm in algorithms:
        results = cross_validate(algorithm, data,
                                 measures=['RMSE', 'MAE', 'FCP'],
                                 cv=3, verbose=False)
        output = pd.DataFrame.from_dict(results).mean(axis=0)
        name = str(algorithm).split(' ')[0].split('.')[-1]
        output = pd.concat([output, pd.Series([name], index=['Algorithm'])])
        performance.append(output)
    output_df = pd.DataFrame(performance).set_index('Algorithm').sort_values(
        'test_rmse')
    store_dataframe(output_df, 'Algorithm_Benchmark.csv')
def compAlgos(data):
    """Cross-validate KNN- and SVD-family algorithms and print mean RMSE/MAE."""
    print("\nLet us compare performance of KNN and SVD algorithms\n")
    # Run each algorithm through 5-fold CV (KNN family first, then SVDs).
    labelled = [('KNN Basic', KNNBasic()),
                ('KNN Means', KNNWithMeans()),
                ('KNN Z Score', KNNWithZScore()),
                ('SVD', SVD()),
                ('SVD ++', SVDpp())]
    scores = {}
    for label, algorithm in labelled:
        scores[label] = cross_validate(algorithm, data, cv=5, n_jobs=5,
                                       verbose=False)
    for label, _ in labelled:
        print('\n{}: RMSE: {}, MAE: {}'.format(
            label, scores[label]['test_rmse'].mean(),
            scores[label]['test_mae'].mean()))
    print('\nBoth SVDs perform better on the dataset\n')
    print(
        '\nWe will test with KNN means from KNN family and SVDPP from svd family\n'
    )
def evaluate_on_test(self, train_set, test_set):
    """Fit an item-based MSD KNNWithMeans on train_set and return the test RMSE.

    :param train_set: training data; no evaluation happens if None
    :param test_set: test data; no evaluation happens if None
    :return: RMSE value on the test set, or None when either set is missing
    """
    # Guard clause: nothing to do without both splits.
    if train_set is None or test_set is None:
        return None
    print("Evaluate RMSE on test data")
    self.LOG_HANDLE.info("Evaluate RMSE on test data")
    algo = KNNWithMeans(sim_options={'name': 'msd', 'user_based': False})
    algo.fit(train_set)
    predictions = algo.test(test_set)
    return accuracy.rmse(predictions)
def DisplayGraphDelta(data):
    """Display the histogram of deltas between predictions and true ratings."""
    # 75% train / 25% test split.
    trainset, testset = train_test_split(data, test_size=.25)
    algo = KNNWithMeans()
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    deltas = []
    for prediction in predictions:
        print(prediction)
        # Delta between the true rating and the estimate.
        deltas.append(prediction.r_ui - prediction.est)
    print(len(deltas))
    plt.hist(deltas, 100)
    plt.show()
def CFM(self):
    """Predict every stored (user, location) rating with user-based cosine
    KNNWithMeans, collect them in ``self.df_est``, and compute NDCG.

    Bug fix: the original loop ran ``for i in range(1, len(a))`` while
    slicing ``lids[i-1:i]``, which silently skipped each user's last rating
    row; the loop now covers every row.
    """
    u_id = []
    I_id = []
    r_ui_ = np.array([])
    _est = np.array([])
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options)
    algo.fit(self.trainset)
    for uid in self.list:
        lids = self.data[self.data.uid == uid]
        for i in range(len(lids)):
            row = lids[i:i + 1]
            lid = row.lid.values[0]
            r_ui = row.rate.values[0]
            pred = algo.predict(uid, lid, r_ui, verbose=True)
            u_id.append(int(pred.uid))
            I_id.append(int(pred.iid))
            r_ui_ = np.append(r_ui_, pred.r_ui)
            _est = np.append(_est, pred.est)
    self.df_est = pd.DataFrame({
        'uid': u_id,
        'Iid': I_id,
        'r_ui': r_ui_,
        'est': _est
    })
    self.arr = self.df_est['uid'].unique()
    self.CFWM_ndcg_ = self.Calculate_NDCG()
def train():
    """Fit an item-based cosine KNNWithMeans on a small placeholder rating set
    and return the trained model."""
    # TODO put in real data here when we have collected enough
    ratings_dict = {
        "item": [1, 2, 1, 2, 1, 2, 1, 2, 1],
        "user": ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E'],
        "rating": [1, 0, 0, 0, 1, 0, 1, 1, 1],
    }
    frame = pd.DataFrame(ratings_dict)
    reader = Reader(rating_scale=(0, 1))
    # Build a surprise dataset straight from the dataframe.
    data = Dataset.load_from_df(frame[["user", "item", "rating"]], reader)
    full_trainset = data.build_full_trainset()
    # user_based=False -> compute similarities between items.
    model = KNNWithMeans(sim_options={"name": "cosine", "user_based": False})
    model.fit(full_trainset)
    return model
def to_test(k, option, model):
    """Fit the requested KNN variant on the training file and return it.

    Args:
        k: neighbourhood size.
        option: similarity name for sim_options (e.g. 'cosine', 'pearson').
        model: one of 'Basic', 'WithMeans', 'WithZScore', 'Baseline'.

    Returns:
        The fitted algorithm, or None for an unrecognised model name.
        (Improvement: the original built the model but never returned it;
        returning it is backward-compatible since callers ignored the
        previous implicit None.)
    """
    df = pd.read_csv('training_set.dat')
    test_df = pd.read_csv('test_set.dat')
    reader = Reader(rating_scale=(1, 5))
    trainingSet = Dataset.load_from_df(df, reader).build_full_trainset()
    # NOTE(review): the test set is loaded but unused here, as in the
    # original — presumably evaluated elsewhere; confirm before removing.
    testSet = Dataset.load_from_df(test_df,
                                   reader).build_full_trainset().build_testset()
    opt = {'name': option, 'user_based': False}
    knn_classes = {
        'Basic': KNNBasic,
        'WithMeans': KNNWithMeans,
        'WithZScore': KNNWithZScore,
        'Baseline': KNNBaseline,
    }
    knn_cls = knn_classes.get(model)
    if knn_cls is None:
        return None
    algo = knn_cls(k=k, sim_options=opt)
    algo.fit(trainingSet)
    return algo
def load_data():
    """Fit an item-based MSD KNNWithMeans on ml-100k, then grid-search SVD
    hyper-parameters and print the best RMSE."""
    data = Dataset.load_builtin('ml-100k')
    # Item-based KNN with mean-squared-difference similarity.
    algo = KNNWithMeans(sim_options={"name": "msd", "user_based": False})
    algo.fit(data.build_full_trainset())
    # Grid search over SVD (matrix factorization) hyper-parameters.
    print("Divide matrix in grids")
    param_grid = {
        "n_epochs": [5, 10],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.4, 0.6]
    }
    gs = GridSearchCV(SVD, param_grid=param_grid, measures=["rmse"], cv=3)
    gs.fit(data)
    print(gs.best_score['rmse'])
def crossvalidate(data):
    """5-fold cross-validate a panel of algorithms and return an RMSE table
    sorted ascending by test RMSE.

    Bug fix: ``pd.Series.append`` was removed in pandas 2.0; the algorithm
    name is now attached with ``pd.concat``.
    """
    results = []
    for algorithm in [
            NormalPredictor(),
            KNNBaseline(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNBasic(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithMeans(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithZScore(k=15, sim_options=similarity_measure('pearson', 1)),
            BaselineOnly(), SVD(), SVDpp(), NMF(), SlopeOne(), CoClustering()
    ]:
        result = cross_validate(algorithm, data, measures=['RMSE'], cv=5,
                                verbose=False)
        temp = pd.DataFrame.from_dict(result).mean(axis=0)
        name = str(algorithm).split(' ')[0].split(".")[-1]
        temp = pd.concat([temp, pd.Series([name], index=['Algorithm'])])
        results.append(temp)
    rmse_values = pd.DataFrame(results).set_index('Algorithm').sort_values(
        'test_rmse')
    return rmse_values