def slope_one(test, train, all):
    start = time.time()
    reader = Reader(rating_scale=(0.5, 5))
    data = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)
    test_data = Dataset.load_from_df(test[['userId', 'movieId', 'rating']], reader)
    trainset = data.build_full_trainset()
    testset = test_data.build_full_trainset().build_testset()
    algo = SlopeOne()
    algo.fit(trainset)
    fit = time.time()
    fit_time = fit - start
    predictions = algo.test(testset)
    uid = []
    mid = []
    rate = []
    for i in range(len(predictions)):
        uid.append(predictions[i].uid)
        mid.append(predictions[i].iid)
        rate.append(predictions[i].est)
    out = {'userId': uid, 'movieId': mid, 'rating': rate}
    out = pd.DataFrame.from_dict(out)
    predict_time = time.time() - fit
    overall = predict_time + fit - start
    return out, [fit_time, predict_time, overall]

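# Usage sketch (an assumption, not part of the original source): slope_one
# expects pandas frames with userId/movieId/rating columns; the csv path and
# the 80/20 split below are illustrative.
import pandas as pd

ratings = pd.read_csv('ratings.csv')  # hypothetical ratings file
train_df = ratings.sample(frac=0.8, random_state=42)
test_df = ratings.drop(train_df.index)
out, (fit_time, predict_time, overall) = slope_one(test_df, train_df, None)
print(out.head(), fit_time, predict_time, overall)
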
def surprise_slopeOne(train_file, test_file):
    """
    SlopeOne with the Surprise library.
    Compute the predictions on a test set after training on a train set,
    using the SlopeOne method from Surprise.

    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        -
    Returns:
        numpy array: predictions
    """
    print("slopeone")
    algo = SlopeOne()
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)
        # Predict
        predictions = algo.test(testset)
    pred = np.zeros(len(predictions))
    for i in range(len(predictions)):
        val = predictions[i].est
        pred[i] = val
    return pred

def SlopeOne_alg():
    # Relies on module-level trainset and testset being defined elsewhere.
    print('Using SlopeOne')
    alg = SlopeOne()
    print(alg)
    alg.fit(trainset)
    predictions = alg.test(testset)
    print(accuracy.rmse(predictions))

def get(self, user_id):
    # SQL query
    conn = mysql.connect()
    cursor = conn.cursor()
    df = pd.read_sql_query("SELECT * FROM story_reviews", conn)
    # Data and Model
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'story_id', 'star']], reader)
    model = SlopeOne()
    # Training
    training_set = data.build_full_trainset()
    model.fit(training_set)
    # Prediction
    anti_training_set = training_set.build_anti_testset()
    prediction_set = [x for x in anti_training_set if x[0] == user_id]
    predictions = model.test(prediction_set)
    # Return Top N Recommendations
    n = 10
    predictions.sort(key=lambda x: x.est, reverse=True)
    top_n_predictions = predictions[:n]
    story_recommendations = []
    for predictionItem in top_n_predictions:
        story_recommendations.append(predictionItem.iid)
    return jsonify(recommendations=story_recommendations)

def SlopeOne_train(self):
    '''
    seed: int, default 3 - random seed for splitting the train/test sets
    k: int, default 40 - maximum number of neighbours
    options: dict, default {'name': 'pearson', 'user_based': False} -
        algorithm options; defaults to Pearson similarity with an
        item-based approach
    '''
    self.algos = []
    df = self.trainDatas
    names = locals()
    r = Reader(rating_scale=(1, 5))
    # Load and split the data; train one model on the overall rating
    # and one model per criterion.
    total = Dataset.load_from_df(df[['uid', 'iid', 'total']], reader=r)
    total_train = total.build_full_trainset()
    total_algo = SlopeOne()
    total_algo.fit(total_train)
    self.algos.append(total_algo)
    for i in range(1, self.no_of_criteria + 1):
        names['c' + str(i)] = Dataset.load_from_df(
            df[['uid', 'iid', 'c' + str(i)]], reader=r)
        names['c' + str(i) + '_train'] = names.get('c' + str(i)).build_full_trainset()
        names['algo_c' + str(i)] = SlopeOne()
        names.get('algo_c' + str(i)).fit(names.get('c' + str(i) + '_train'))
        self.algos.append(names.get('algo_c' + str(i)))

def slopeOne(trainset, testset):
    # Slope One
    print("\n" + "-" * 5 + " SlopeOne algorithm using surprise package " + "-" * 5)
    algo = SlopeOne()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions

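# Usage sketch (assumption): building the (trainset, testset) pair that
# slopeOne() expects via surprise's train_test_split; the bundled ml-100k
# dataset stands in for the project's own data.
from surprise import Dataset
from surprise.model_selection import train_test_split

demo_data = Dataset.load_builtin('ml-100k')
demo_trainset, demo_testset = train_test_split(demo_data, test_size=0.2, random_state=7)
rmse, mae, predictions = slopeOne(demo_trainset, demo_testset)
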
def slope_one(self):
    """
    SlopeOne, reflecting how much one item is liked over another.

    Returns:
        predictions_df: The predictions of the model on the test data
        as a Pandas DataFrame
    """
    algorithm = SlopeOne()
    predictions = algorithm.fit(self.train_data).test(self.test_data)
    predictions_df = self.data.test_df.copy()
    predictions_df['Rating'] = [x.est for x in predictions]
    if self.test_purpose:
        self.evalueate_model(predictions_df['Rating'], 'Surprise slope_one')
    return predictions_df

def slope_one():
    print('SlopeOne algorithm...')
    print('Which dataset do you want to use?')
    print('(1) Android')
    print('(2) WordPress')
    data_utilizar = input()  # input() returns a string in Python 3
    if data_utilizar == '1':
        file_path = configuration.FILE_PATH_ANDROID
        reader = Reader(line_format='user item rating', sep='\t')
    else:
        file_path = configuration.FILE_PATH_WORDPRESS
        reader = Reader(line_format='user item rating', sep=',')
    # Dataset
    data = Dataset.load_from_file(file_path, reader=reader)
    algo = SlopeOne()
    # 10-fold cross validation (cross_validate replaces the old
    # evaluate/print_perf API).
    perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)
    print(perf)

def EvaluateDifferentAlgorithms():
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(),
            KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(),
            BaselineOnly(), CoClustering()
    ]:
        # Perform cross validation
        results = cross_validate(algorithm, data_6months, measures=['RMSE'], cv=3, verbose=False)
        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
        benchmark.append(tmp)
    print(pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse'))

def __init__(self):
    super().__init__("slope", SlopeOne, param_grid={})
    best_params = super().tune()
    print(best_params)
    if not best_params:
        self.algo = SlopeOne()

def select_model(user_review):
    user_review = data_prep()
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        user_review[['user_id', 'business_id', 'stars']], reader)
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            KNNBasic(), KNNBaseline(), KNNWithMeans(),
            SVD(), SVDpp(), SlopeOne(), NMF()
    ]:
        # Perform cross validation
        print(algorithm)
        print('start ......')
        results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=False)
        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
        benchmark.append(tmp)
    print(benchmark)

def check_for_args():
    args = sys.argv
    for arg in args:
        if arg == 'SVD':
            alg_list.append(SVD())
        elif arg == 'SVDpp':
            alg_list.append(SVDpp())
        elif arg == 'SlopeOne':
            alg_list.append(SlopeOne())
        elif arg == 'NMF':
            alg_list.append(NMF())
        elif arg == 'NormalPredictor':
            alg_list.append(NormalPredictor())
        elif arg == 'KNNBaseline':
            alg_list.append(KNNBaseline())
        elif arg == 'KNNBasic':
            alg_list.append(KNNBasic())
        elif arg == 'KNNWithMeans':
            alg_list.append(KNNWithMeans())
        elif arg == 'KNNWithZScore':
            alg_list.append(KNNWithZScore())
        elif arg == 'BaselineOnly':
            alg_list.append(BaselineOnly())
        elif arg == 'CoClustering':
            alg_list.append(CoClustering())
    return alg_list

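# Usage sketch (assumption): check_for_args scans sys.argv for known algorithm
# names and appends matching instances to a module-level alg_list, which is
# assumed to be defined elsewhere. An invocation such as
# `python benchmark.py SVD SlopeOne` (hypothetical script name) would make
# check_for_args() return [SVD(), SlopeOne()].
alg_list = []  # module-level accumulator assumed by check_for_args

if __name__ == '__main__':
    for chosen in check_for_args():
        print(type(chosen).__name__)
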
def run_baselines(ratings_dict, compressed_test_ratings_dict, data_origin):
    for alg in algos:
        if alg == "KNNBasic":
            algo = KNNBasic()
        elif alg == "KNNWithZScore":
            algo = KNNWithZScore()
        elif alg == "SVD":
            algo = SVD()
        elif alg == "NMF":
            algo = NMF()
        elif alg == "SlopeOne":
            algo = SlopeOne()
        elif alg == "CoClustering":
            algo = CoClustering()
        if data_origin == 'netflix':
            nr_predictions, accuracy, rmse, mae, precision, recall, f1 = testing(
                algo, ratings_dict, compressed_test_ratings_dict, 'netflix')
        elif data_origin == 'small':
            nr_predictions, accuracy, rmse, mae, precision, recall, f1 = testing(
                algo, ratings_dict, compressed_test_ratings_dict, 'small')
        elif data_origin == '100k':
            nr_predictions, accuracy, rmse, mae, precision, recall, f1 = testing(
                algo, ratings_dict, compressed_test_ratings_dict, '100k')
        # print results
        print("\n\nAlg %s" % alg)
        print("Number of user-items pairs: %d" % nr_predictions)
        print("Accuracy: %.2f " % accuracy)
        print("RMSE: %.2f" % rmse)
        print("MAE: %.2f" % mae)
        print("Precision: %.2f" % precision)
        print("Recall: %.2f" % recall)
        print("F1: %.2f" % f1)

def benchmark(data):
    performance = []
    algorithms = [
        SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(),
        KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(),
        BaselineOnly(), CoClustering(),
        SVD_SGD_momentum(), SVDpp_SGD_momentum()
    ]
    for algorithm in algorithms:
        results = cross_validate(algorithm, data, measures=['RMSE', 'MAE', 'FCP'], cv=3, verbose=False)
        output = pd.DataFrame.from_dict(results).mean(axis=0)
        output = output.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
        performance.append(output)
    output_df = pd.DataFrame(performance).set_index('Algorithm').sort_values('test_rmse')
    store_dataframe(output_df, 'Algorithm_Benchmark.csv')

def crossvalidate(data):
    results = []
    for algorithm in [
            NormalPredictor(),
            KNNBaseline(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNBasic(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithMeans(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithZScore(k=15, sim_options=similarity_measure('pearson', 1)),
            BaselineOnly(), SVD(), SVDpp(), NMF(), SlopeOne(), CoClustering()
    ]:
        result = cross_validate(algorithm, data, measures=['RMSE'], cv=5, verbose=False)
        temp = pd.DataFrame.from_dict(result).mean(axis=0)
        temp = temp.append(
            pd.Series([str(algorithm).split(' ')[0].split(".")[-1]], index=['Algorithm']))
        results.append(temp)
    rmse_values = pd.DataFrame(results).set_index('Algorithm').sort_values('test_rmse')
    return rmse_values

def computeSlopeOne(data, test_np):
    """Compute the slope one method and return the predictions on the test set.

    The method has no hyperparameter.
    data : data frame which represents the train set
    test_np : data frame on which the predictions will be returned

    return : test_np with a column of predictions named 'slopeone_rating'"""
    trainset, test = dataTrainSurprise(data, test_np)
    slopeone = SlopeOne().fit(trainset)
    # .est is the estimated rating field of the Prediction namedtuple.
    test['slopeone_rating'] = test[['user_id', 'movie_id']] \
        .apply(lambda row: slopeone.predict(row['user_id'], row['movie_id']).est, axis=1)
    return test

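# Usage sketch (assumption): dataTrainSurprise is the project's own helper and
# is assumed to return a surprise trainset plus a copy of the test frame; the
# csv paths below are illustrative, the user_id/movie_id column names follow
# the function body.
import pandas as pd

slope_train_df = pd.read_csv('train.csv')  # hypothetical paths
slope_test_df = pd.read_csv('test.csv')
scored = computeSlopeOne(slope_train_df, slope_test_df)
print(scored['slopeone_rating'].head())
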
def _hyperopt(self):
    algo = SlopeOne()
    return cross_validate(algo, self._data, measures=ACCURACY_METRICS,
                          cv=self._cv, n_jobs=self._cv_n_jobs,
                          verbose=self._debug)[self._metric].mean()

def slope_one(trainset, testset, predset):
    modelname = 'slopeone'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return
    algo = SlopeOne()
    print('SlopeOne Model')
    algo.fit(trainset)  # fit() replaces the deprecated train() method
    predictions = algo.test(trainset.build_testset())
    print('   RMSE on Train: ', accuracy.rmse(predictions, verbose=False))
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   RMSE on Test: ', rmse)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds, 'test')
    print('   Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds)

def SlopeOne(self, namefile, uid, iid, rati, value_uid, value_iid):
    test_data = pd.read_csv('./container/' + namefile)
    dt = pd.DataFrame(test_data)
    # Retrieve the trainset.
    reader = Reader(rating_scale=(0, 100))
    data = Dataset.load_from_df(dt[[uid, iid, rati]], reader)
    trainset = data.build_full_trainset()
    algo = SlopeOne()
    algo.fit(trainset)
    pred = algo.predict(float(value_uid), float(value_iid), r_ui=1, verbose=True)
    # var_rmse = accuracy.rmse(pred)
    # Return result as JSON
    jsondata = {}
    jsondata["uid"] = pred.uid
    jsondata["iid"] = pred.iid
    jsondata["rati"] = round(pred.est, 2)
    return jsondata

def generate_svd_recommendation_df() -> pd.DataFrame:
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)
    # Try SVD
    algo = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5-fold cross validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fit the SVD
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    # print(recommendation_df_svd)

    # Try the NMF
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)
    algo = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5-fold cross validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fit the NMF
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE and MAE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_nmf = get_top_n(predictions, n=5)
    # print(recommendation_df_nmf)

    # ---------------------------------------------------
    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    knnbasic_cv = cross_validate(KNNBasic(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnmeans_cv = cross_validate(KNNWithMeans(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnz_cv = cross_validate(KNNWithZScore(), svd_data, cv=5, n_jobs=5, verbose=False)
    # Matrix Factorization Based Algorithms
    svd_cv = cross_validate(SVD(), svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(SVDpp(), svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)
    # Other Collaborative Filtering Algorithms
    slope_cv = cross_validate(SlopeOne(), svd_data, cv=5, n_jobs=5, verbose=False)
    coclus_cv = cross_validate(CoClustering(), svd_data, cv=5, n_jobs=5, verbose=False)
    return recommendation_df_svd

def randomize():
    sim_options_cosine = {'name': 'cosine', 'user_based': False}
    sim_options_msd = {'name': 'msd', 'user_based': False}
    sim_options_pearson = {'name': 'pearson', 'user_based': False}
    sim_options_baseline = {
        'name': 'pearson_baseline',
        'user_based': False,
        'shrinkage': 0
    }
    algorithms = [
        ('kNN Basic - Cosine', KNNBasic(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Basic - MSD', KNNBasic(sim_options=sim_options_msd, verbose=False)),
        ('kNN Basic - Pearson', KNNBasic(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Basic - Pearson B', KNNBasic(sim_options=sim_options_baseline, verbose=False)),
        ('kNN Means - Cosine', KNNWithMeans(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Means - MSD', KNNWithMeans(sim_options=sim_options_msd, verbose=False)),
        ('kNN Means - Pearson', KNNWithMeans(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Means - Pearson B', KNNWithMeans(sim_options=sim_options_baseline, verbose=False)),
        ('kNN Z - Cosine', KNNWithZScore(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Z - MSD', KNNWithZScore(sim_options=sim_options_msd, verbose=False)),
        ('kNN Z - Pearson', KNNWithZScore(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Z - Pearson B', KNNWithZScore(sim_options=sim_options_baseline, verbose=False)),
        ('kNN Baseline - Cosine', KNNBaseline(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Baseline - MSD', KNNBaseline(sim_options=sim_options_msd, verbose=False)),
        ('kNN Baseline - Pearson', KNNBaseline(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Baseline - Pearson B', KNNBaseline(sim_options=sim_options_baseline, verbose=False)),
        ('SVD', SVD(verbose=False)),
        ('SVDpp', SVDpp(verbose=False)),
        ('Baseline Only', BaselineOnly(verbose=False)),
        ('CoClustering', CoClustering(verbose=False)),
        ('SlopeOne', SlopeOne()),
        ('NMF', NMF(verbose=False))
    ]
    # random.choice avoids the off-by-one of randint(0, len(algorithms)),
    # which could index one past the end of the list.
    return random.choice(algorithms)

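# Usage sketch (assumption): the (name, algorithm) pair returned by randomize()
# can be passed straight to surprise's cross_validate; the bundled ml-100k
# dataset stands in for real data.
from surprise import Dataset
from surprise.model_selection import cross_validate

picked_name, picked_algo = randomize()
print('Evaluating', picked_name)
cross_validate(picked_algo, Dataset.load_builtin('ml-100k'), measures=['RMSE'], cv=3, verbose=True)
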
def get(self, algorithm, user_id):
    # SQL query
    conn = mysql.connect()
    cursor = conn.cursor()
    df = pd.read_sql_query("SELECT * FROM story_reviews", conn)
    # Data and Model
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'story_id', 'star']], reader)
    if algorithm == 'svd':
        print('Using SVD')
        model = SVD()
    elif algorithm == 'svdpp':
        print('Using SVD++')
        model = SVDpp()
    elif algorithm == 'nmf':
        print('Using NMF')
        model = NMF()
    elif algorithm == 'slopeone':
        print('Using Slope One')
        model = SlopeOne()
    elif algorithm == 'coclustering':
        print('Using Co-Clustering')
        model = CoClustering()
    else:
        print('Using SVD')
        model = SVD()
    # Training
    training_set = data.build_full_trainset()
    model.fit(training_set)
    # Prediction
    anti_training_set = training_set.build_anti_testset()
    prediction_set = [x for x in anti_training_set if x[0] == user_id]
    predictions = model.test(prediction_set)
    # TESTING: run 5-fold cross validation using root mean square error and mean absolute error
    # cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Return Top N Recommendations
    n = 10
    predictions.sort(key=lambda x: x.est, reverse=True)
    top_n_predictions = predictions[:n]
    story_recommendations = []
    for predictionItem in top_n_predictions:
        story_recommendations.append(predictionItem.iid)
    return jsonify(recommendations=story_recommendations)

def SlopeOne_from_to(self, namefile, uid, iid, rati, from_uid, to_uid, from_iid, to_iid):
    test_data = pd.read_csv('./container/' + namefile)
    dt = pd.DataFrame(test_data)
    # Retrieve the trainset.
    reader = Reader(rating_scale=(0, 100))
    data = Dataset.load_from_df(dt[[uid, iid, rati]], reader)
    trainset = data.build_full_trainset()
    algo = SlopeOne()
    algo.fit(trainset)
    arr = []
    for value_uid in range(from_uid, to_uid):
        for value_iid in range(from_iid, to_iid):
            pred = algo.predict(value_uid, value_iid, r_ui=1, verbose=True)
            tempdata = []
            tempdata.append(pred.uid)
            tempdata.append(pred.iid)
            tempdata.append(round(pred.est, 2))
            arr.append(tempdata)
    # Return result as JSON
    return arr

def EvaluateAllModels(self):
    """
                     test_rmse   fit_time  test_time
    Algorithm
    SVDpp             0.965824   9.401286   0.151476
    SVD               0.967286   1.474139   0.062471
    BaselineOnly      0.972408   0.108964   0.057277
    NMF               0.992677   4.073005   0.171846
    KNNWithZScore     1.001898   0.620192   0.083341
    KNNWithMeans      1.002924   0.489803   0.078121
    SlopeOne          1.006664  19.091191   1.275676
    KNNBaseline       1.007437   0.890452   0.088495
    KNNBasic          1.016717   0.432159   0.072929
    NormalPredictor   1.253265   0.041646   0.078105
    CoClustering      1.828291   3.020921   0.052071

    :return: the algorithm with the lowest test_rmse is selected.
    """
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(),
            KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(),
            BaselineOnly(), CoClustering()
    ]:
        # Perform cross validation
        results = cross_validate(algorithm, self.data, measures=['RMSE'], cv=3, verbose=False)
        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
        benchmark.append(tmp)
    result = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
    print(result)
    return result

def calculateRMSE(self, method=9, similarityMeasure=1, isUserBased="Yes"):
    conn = sqlite3.connect(DATABASE_NAME)
    df = pd.read_sql_query(
        "SELECT userID, glassID, relativeRating FROM ratings", conn)
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['userID', 'glassID', 'relativeRating']], reader)
    trainset, testset = train_test_split(data, test_size=.20)
    isUserBased = (isUserBased == "Yes")
    if similarityMeasure == 1:
        similarityMeasure = "cosine"
    elif similarityMeasure == 2:
        similarityMeasure = "pearson"
    else:
        similarityMeasure = "pearson_baseline"
    sim_options = {'name': similarityMeasure, 'user_based': isUserBased}
    if method == 1:
        algo = SVD()
    elif method == 2:
        algo = SlopeOne()
    elif method == 3:
        algo = NMF()
    elif method == 4:
        algo = NormalPredictor()
    elif method == 5:
        algo = KNNBaseline(sim_options=sim_options)
    elif method == 6:
        algo = KNNBasic(sim_options=sim_options)
    elif method == 7:
        algo = KNNWithMeans(sim_options=sim_options)
    elif method == 8:
        algo = KNNWithZScore(sim_options=sim_options)
    elif method == 9:
        algo = BaselineOnly()
    else:
        algo = CoClustering()
    algo.fit(trainset)
    predictions = algo.test(testset)
    conn.close()
    # cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)
    return round(accuracy.rmse(predictions, verbose=False), 4)

def __init__(self, modelName, dataPath):
    self.modelDict = {
        "KNNBasic": KNNBasic(),
        "KNNWithMeans": KNNWithMeans(),
        "KNNWithZScore": KNNWithZScore(),
        "SVD": SVD(),
        "SVDpp": SVDpp(),
        "NMF": NMF(),
        "SlopeOne": SlopeOne(),
        "CoClustering": CoClustering()
    }
    self.trainset = None
    self.testset = None
    self.data = None
    self.model = self.modelDict[modelName]
    self.loadData(os.path.expanduser(dataPath))

def select_cf_model(algorithms=[SVD(), SVDpp(), SlopeOne(), NMF(),
                                NormalPredictor(), KNNBaseline(), KNNBasic(),
                                KNNWithMeans(), KNNWithZScore(), BaselineOnly(),
                                CoClustering()]):
    # ========================= Create automated context to pick best CF model =========================
    benchmark = []
    algos = []
    # Iterate over all algorithms
    for algorithm in algorithms:
        # Perform cross validation
        results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
        algos = algos + [algorithm]
        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
        benchmark.append(tmp)
    out = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
    return out, algos

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file_path", default="data/train.csv", help="training file path")
    parser.add_argument("--test_file_path", default="data/test.csv", help="testing file path")
    parser.add_argument("--approach", default="SVD", help="Baseline | SVD | SlopeOne | NMF | CoClustering")
    parser.add_argument("--output_ranking_file", default="ranking", help="output ranking for test")
    bsl_options = {'method': 'sgd', 'n_epochs': 20, 'reg_u': 100, 'reg_i': 50}
    options = {
        "Baseline": BaselineOnly(bsl_options, verbose=True),
        "SVD": SVD(verbose=True, n_factors=20, n_epochs=3),
        "SlopeOne": SlopeOne(),
        "NMF": NMF(),
        "CoClustering": CoClustering()
    }
    args = parser.parse_args()
    reader = Reader(line_format='user item rating timestamp', sep='\t')
    algo = options[args.approach]
    train_data = Dataset.load_from_file(args.train_file_path, reader=reader)
    test_data = Dataset.load_from_file(args.test_file_path, reader=reader)
    train_set = train_data.build_full_trainset()
    test_set = test_data.build_full_trainset().build_testset()
    print("training....")
    algo.fit(train_set)
    print("testing...")
    predictions = algo.test(test_set)
    accuracy.mae(predictions, verbose=True)
    accuracy.rmse(predictions, verbose=True)

    ### Extra Credit
    output_ranking(predictions, args.output_ranking_file + "_" + args.approach + ".out")
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=2.5)
    print("Precision:", sum(prec for prec in precisions.values()) / len(precisions))
    print("Recall:", sum(rec for rec in recalls.values()) / len(recalls))
    print("F-measure:", f_measure(precisions, recalls))
    print("conversion_rate:", get_conversion_rate(predictions, k=10))
    print("ndcg:", get_ndcg(predictions, k_highest_scores=10))

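# Usage sketch (assumption): a typical shell invocation of main(), with a
# hypothetical script name; the tab-separated train/test files must match the
# 'user item rating timestamp' line format above.
#
#   python run_recsys.py --train_file_path data/train.csv \
#       --test_file_path data/test.csv --approach SlopeOne \
#       --output_ranking_file ranking

if __name__ == '__main__':
    main()
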
def matrix_factorization_param(data_cv):
    # Iterate over all algorithms
    benchmark = []
    for algorithm in [
            SVD(), SVDpp(), NMF(), SlopeOne(), NormalPredictor(), CoClustering()
    ]:
        # Perform cross validation
        results = model_selection.cross_validate(algorithm, data_cv,
                                                 measures=['RMSE', 'MAE'],
                                                 cv=5, verbose=False)
        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
        benchmark.append(tmp)
    rmse = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_mae')
    # print(rmse)

    # Parameter grid
    param_grid = {
        'n_factors': [100, 150, 200],
        'n_epochs': [20, 40],
        'lr_all': [0.001, 0.005, 0.008],
        'reg_all': [0.075, 0.1, 0.15]
    }
    algorithm_gs = model_selection.GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, n_jobs=-1)
    algorithm_gs.fit(data_cv)
    # Best parameters for the model with the lowest RMSE
    best_algo = algorithm_gs.best_estimator['rmse']
    return best_algo

def checkBestAlgorithm(self):
    self.df = pd.read_csv(csv_name)
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(self.df[['user_id', 'item_id', 'rating']], reader)
    benchmark = []
    rmseTuple = []
    # Iterate over all algorithms.
    for algorithm in [
            SVD(), SVDpp(), SlopeOne(), NormalPredictor(), KNNBaseline(),
            KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(),
            CoClustering()
    ]:
        # Perform cross validation.
        results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
        # Store the result and append the algorithm name.
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        rmseTuple.append((algorithm, tmp['test_rmse']))
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
        benchmark.append(tmp)
    print(pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse'))
    print("\n")
    rmseTuple.sort(key=lambda x: x[1])
    print("Best algorithm : ")
    print(str(rmseTuple[0]).split(' ')[0].split('.')[-1])
    return rmseTuple[0]