def test_old_style_algo(small_ml):
    '''Test that old algorithms (i.e. algorithms that only define train()) can
    support both calls to fit() and to train().

    - Supporting algo.fit() is needed so that custom algorithms that only
      define train() can still use up-to-date tools (such as evaluate, which
      has been updated to use fit()).
    - algo.train() is the old way, and must still be supported for custom
      algorithms and tools.
    '''

    class CustomAlgoTrain(AlgoBase):

        def __init__(self):
            AlgoBase.__init__(self)
            self.cnt = -1

        def train(self, trainset):
            AlgoBase.train(self, trainset)
            self.est = 3
            self.bu, self.bi = 1, 1
            self.cnt += 1

        def estimate(self, u, i):
            return self.est

    with pytest.warns(UserWarning):
        algo = CustomAlgoTrain()

    kf = KFold(n_splits=2)
    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        with pytest.warns(UserWarning):
            algo.fit(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoTrain.train has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoTrain.train
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of train() is only called once
        assert algo.cnt == i

    with pytest.warns(UserWarning):
        algo = CustomAlgoTrain()

    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        with pytest.warns(UserWarning):
            algo.train(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoTrain.train has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoTrain.train
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of train() is only called once
        assert algo.cnt == i
def test_new_style_algo(small_ml):
    '''Test that new algorithms (i.e. algorithms that only define fit()) can
    support both calls to fit() and to train().

    - algo.fit() is the new way of doing things.
    - Supporting algo.train() is needed for the (unlikely?) case where a user
      has defined custom tools that use algo.train().
    '''

    class CustomAlgoFit(AlgoBase):

        def __init__(self):
            AlgoBase.__init__(self)
            self.cnt = -1

        def fit(self, trainset):
            AlgoBase.fit(self, trainset)
            self.est = 3
            self.bu, self.bi = 1, 1
            self.cnt += 1

        def estimate(self, u, i):
            return self.est

    algo = CustomAlgoFit()
    kf = KFold(n_splits=2)
    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        algo.fit(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoFit.fit has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoFit.fit
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of fit() is only called once
        assert algo.cnt == i

    algo = CustomAlgoFit()
    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        with pytest.warns(UserWarning):
            algo.train(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoFit.fit has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoFit.fit
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of fit() is only called once
        assert algo.cnt == i
def train_with_Kfold(algo, data, k=5, verbose=True):
    kf = KFold(n_splits=k)
    history = pd.DataFrame(columns=['precision', 'recall', 'f1', 'NDCG'])
    i = 0
    for trainset, testset in kf.split(data):
        # algo takes a trainset object as the argument to fit()
        algo.fit(trainset)
        # test() takes a list of (user, item, rating) tuples
        predictions = algo.test(testset)
        precisions, recalls = precision_recall_at_k(predictions, k=15,
                                                    threshold=4)
        P = sum(prec for prec in precisions.values()) / len(precisions)
        R = sum(rec for rec in recalls.values()) / len(recalls)
        F1 = (2 * P * R) / (P + R)
        # NDCG uses k=5 for the top-k ranking
        NDCG = ndcg_at_k_all(predictions, k=5)
        history.loc[i] = [P, R, F1, NDCG]
        if verbose:
            print(f"FOLD: {i}")
            print("precision: ", P)
            print("recall: ", R)
            print("f1: ", F1)
            print("NDCG: ", NDCG)
            print("------")
        i += 1
    return history
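# Neither precision_recall_at_k() nor ndcg_at_k_all() is defined in these
# snippets. The first is commonly taken from the surprise FAQ; the sketch
# below follows that recipe. The NDCG helper is only a plausible
# reconstruction: the name and k-truncation match the call site above, but
# the gain/discount choices are assumptions, not part of the original code.
from collections import defaultdict

import numpy as np


def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return two dicts mapping uid -> precision@k and uid -> recall@k."""
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = dict(), dict()
    for uid, user_ratings in user_est_true.items():
        # Rank this user's items by estimated rating.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum((true_r >= threshold) and (est >= threshold)
                              for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 0
    return precisions, recalls


def ndcg_at_k_all(predictions, k=5):
    """Average NDCG@k over users, using true ratings as relevance gains."""
    user_ratings = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_ratings[uid].append((est, true_r))

    ndcgs = []
    for ratings in user_ratings.values():
        # DCG: rank by estimated rating, gain = true rating.
        by_est = sorted(ratings, key=lambda x: x[0], reverse=True)[:k]
        dcg = sum(r / np.log2(pos + 2) for pos, (_, r) in enumerate(by_est))
        # IDCG: the ideal ranking, by true rating.
        by_true = sorted(ratings, key=lambda x: x[1], reverse=True)[:k]
        idcg = sum(r / np.log2(pos + 2) for pos, (_, r) in enumerate(by_true))
        if idcg > 0:
            ndcgs.append(dcg / idcg)
    return float(np.mean(ndcgs))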
def test() -> List[Dict[str, object]]:
    alg_list = [SVD, KNNBaseline, BaselineOnly, CoClustering, NMF, KNNBasic,
                KNNWithMeans]
    seed = 0
    random.seed(seed)
    np.random.seed(seed)
    interactions: List[Interaction] = load_sorted_test_interactions()
    parsed_data: ParsedData = Parser.parse(interactions)
    kf = KFold(n_splits=5)
    entries = []
    for trainset, testset in kf.split(parsed_data.whole_data_set):
        for alg_to_test in alg_list:
            print("TESTING ALGORITHM: " + alg_to_test.__name__ + ", TIME: ")
            try:
                before = datetime.now()
                predictions: List[Prediction] = AlgoTrainer.calc_predictions(
                    trainset, testset, alg_to_test())
                time_elapsed = (datetime.now() - before).total_seconds()
                recommender = Recommender(parsed_data.ids_offers_map,
                                          predictions)
                entry: Dict[str, object] = {'rmse': recommender.calc_rmse()}
                entry['algorithm'] = alg_to_test.__name__
                entry['time_elapsed'] = time_elapsed
                entries.append(entry)
            except Exception as e:
                print(e)
            print("")
    return entries
def collaborative_filter(id, new_words):
    ratings_dict = calc_collaborative_param(new_words, id)
    df = pd.DataFrame(ratings_dict)

    # A reader is still needed, but only the rating_scale param is required.
    reader = Reader(rating_scale=(0.0, 5.0))

    # The columns must correspond to user id, item id and ratings (in that
    # order).
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

    # define a cross-validation iterator
    kf = KFold(n_splits=3)
    algo = KNNBasic()
    for trainset, testset in kf.split(data):
        # train and test algorithm.
        algo.fit(trainset)
        kf_predictions = algo.test(testset)
        # Compute and print Root Mean Squared Error
        accuracy.rmse(kf_predictions, verbose=True)

    trainset = data.build_full_trainset()
    new_data = trainset.build_anti_testset()
    predictions = algo.test(new_data)
    top_n = get_top_n(predictions, n=3)
    with open('top_n.json', 'w') as fp:
        dump(top_n, fp, indent=4)  # json.dump, imported at module level
    return top_n
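# get_top_n() is not defined here either. The version in the surprise FAQ
# (reproduced below as a sketch) returns, for each user, the n items with the
# highest estimated ratings.
from collections import defaultdict


def get_top_n(predictions, n=10):
    """Map each uid to its n highest-estimated (iid, est) pairs."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # Sort each user's predictions and keep the n best.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n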
def check_k_and_thresh(algo):
    global predictions
    prec_to_ave = []
    rec_to_ave = []
    kf = KFold(n_splits=30)
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        precisions, recalls = precision_recall_at_k(predictions, k=30,
                                                    threshold=2.5)
        # Precision and recall can then be averaged over all users
        prec_to_ave.append(sum(prec for prec in precisions.values()) /
                           len(precisions))
        rec_to_ave.append(sum(rec for rec in recalls.values()) / len(recalls))

    results = []
    for i in range(2, 30):
        precisions, recalls = precision_recall_at_k(predictions, k=i,
                                                    threshold=2.5)
        # Precision and recall can then be averaged over all users
        prec = sum(prec for prec in precisions.values()) / len(precisions)
        rec = sum(rec for rec in recalls.values()) / len(recalls)
        results.append({'K': i, 'Precision': prec, 'Recall': rec})

    K = np.arange(2, 30)
    precs = []
    recs = []
    for i in range(len(K)):
        precs.append(results[i].get("Precision"))
        recs.append(results[i].get("Recall"))
    plt.plot(K, precs)
    plt.plot(K, recs)
def context_RMSE(file_path, context, algo_id, k=10):
    # Choose the algorithm
    algo = get_algo(algo_id)
    # Define how the files should be read
    reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
    # Create the dataset from the loaded data
    data = Dataset.load_from_file(file_path, reader)
    # define a cross-validation iterator
    kf = KFold(k)

    if not os.path.exists('resultados'):
        os.makedirs('resultados')
    with open("resultados/RMSE_" + context + '_' + str(algo_id) + ".csv",
              "w") as result_file:
        result_file.write('RMSEs:\n')
        # Write the RMSE of each fold
        for trainset, testset in kf.split(data):
            # train and test the algorithm
            algo.fit(trainset)
            predictions = algo.test(testset)
            result_file.write(str(accuracy.rmse(predictions)) + '\n')
def Kfold_validation(k, algo, data):
    # determine the number of folds for splitting
    kf = KFold(n_splits=k)
    # dictionary mapping fold labels to their MAE values
    fold_dict = {}
    # list of fold labels
    folds = []
    # list of errors
    error = []
    for j, (trainset, testset) in enumerate(kf.split(data)):
        start_time = time.time()
        # append the fold label to the folds list
        folds.append('FOLD ' + str(j))
        # fit the algorithm on the training set
        algo.fit(trainset)
        # predict on the test set
        predictions = algo.test(testset)
        # append the error to the errors list
        error.append(surprise.accuracy.mae(predictions, verbose=False))
        end_time = time.time()
        print('Fold {}, MAE: {:.3f}, Time Elapsed: {:.3f} seconds'.format(
            j, error[j], end_time - start_time))
    # build key/value pairs in the dictionary:
    # 'FOLD' maps to the folds list, 'MAE' maps to the error list
    fold_dict['FOLD'] = folds
    fold_dict['MAE'] = error
    return fold_dict
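# Example use of Kfold_validation (assuming a loaded surprise Dataset named
# `data`): the returned dict converts directly into a DataFrame.
#
#     algo = SVD()
#     fold_dict = Kfold_validation(5, algo, data)
#     print(pd.DataFrame(fold_dict))  # columns: FOLD, MAE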
def train_trim_knn(data, R):
    kfold = KFold(n_splits=10)
    sim_options = {'name': 'pearson'}
    rmse_list = [[], [], []]
    for k in range(2, 102, 2):
        print("using k = %d" % k)
        p_rmse = []
        u_rmse = []
        hv_rmse = []
        knn = KNNWithMeans(k=k, sim_options=sim_options)
        for trainset, testset in kfold.split(data):
            knn.fit(trainset)
            (p_testset, u_testset, hv_testset) = trim(testset, R)
            p_pred = knn.test(p_testset)
            u_pred = knn.test(u_testset)
            hv_pred = knn.test(hv_testset)
            p_rmse.append(accuracy.rmse(p_pred))
            u_rmse.append(accuracy.rmse(u_pred))
            hv_rmse.append(accuracy.rmse(hv_pred))
        rmse_list[0].append(np.mean(p_rmse))
        rmse_list[1].append(np.mean(u_rmse))
        rmse_list[2].append(np.mean(hv_rmse))
    print("KNN with trim is finished!!")
    return rmse_list
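# trim() is project-specific and not shown. A minimal sketch follows, assuming
# the common "popular / unpopular / high-variance" convention (popular: items
# with more than 2 ratings; unpopular: at most 2; high variance: rating
# variance of at least 2 with at least 5 ratings, computed from the ratings
# matrix R, users in rows). The thresholds and the 1-indexed item ids are
# assumptions, not taken from the original code.
import numpy as np


def trim(testset, R):
    p_testset, u_testset, hv_testset = [], [], []
    for (uid, iid, rating) in testset:
        col = R[:, int(iid) - 1]   # assumed: raw item ids are 1-indexed
        observed = col[col > 0]    # assumed: 0 marks a missing rating
        if len(observed) > 2:
            p_testset.append((uid, iid, rating))
        else:
            u_testset.append((uid, iid, rating))
        if len(observed) >= 5 and np.var(observed) >= 2:
            hv_testset.append((uid, iid, rating))
    return p_testset, u_testset, hv_testset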
def run_with_diff_k(self, algo, args, range_, folds=2, test_filter=None,
                    threshold=2, msg=None, modal_name=None):
    arg_name = {
        'KNN': 'k',
        'NMF': 'n_factors',
        'SVD': 'n_factors'
    }[modal_name]
    rmse_by_k = []
    mae_by_k = []
    k_values = []
    for k in range(*range_):
        k_values.append(k)
        args.update({arg_name: k})
        modal = algo(**args)
        kf = KFold(n_splits=folds)
        rmse_by_fold = []
        mae_by_fold = []
        for trainset, testset in kf.split(self.data):
            modal.fit(trainset)
            if test_filter:
                testset = test_filter(testset, threshold)
            predictions = modal.test(testset)
            rmse_by_fold.append(accuracy.rmse(predictions, verbose=True))
            mae_by_fold.append(accuracy.mae(predictions, verbose=True))
        rmse_by_k.append(np.mean(rmse_by_fold))
        mae_by_k.append(np.mean(mae_by_fold))
    plt.plot(k_values, rmse_by_k)
    plt.plot(k_values, mae_by_k)
    plt.legend(['RMSE', 'MAE'])
    plt.title(msg)
    plt.show()
def train_trim_nmf(data, R):
    kfold = KFold(n_splits=10)
    rmse_list = [[], [], []]
    for k in range(2, 52, 2):
        print("using k = %d" % k)
        p_rmse = []
        u_rmse = []
        hv_rmse = []
        nmf = NMF(n_factors=k)
        for trainset, testset in kfold.split(data):
            nmf.fit(trainset)
            (p_testset, u_testset, hv_testset) = trim(testset, R)
            p_pred = nmf.test(p_testset)
            u_pred = nmf.test(u_testset)
            hv_pred = nmf.test(hv_testset)
            p_rmse.append(accuracy.rmse(p_pred))
            u_rmse.append(accuracy.rmse(u_pred))
            hv_rmse.append(accuracy.rmse(hv_pred))
        rmse_list[0].append(np.mean(p_rmse))
        rmse_list[1].append(np.mean(u_rmse))
        rmse_list[2].append(np.mean(hv_rmse))
    print("NMF with trim is finished!!")
    return rmse_list
def Q30to33(qNum):
    data = load_data()
    data_full = data.build_full_trainset()
    pop, unpop, highVar = classifyMovies()
    kf = KFold(n_splits=10)
    ncf = NaiveCF()
    ncf.fit(data_full)
    subRMSE = np.array([])
    split_idx = 1
    for trainSet, testSet in kf.split(data):
        if qNum == 31:
            testSet = list(filter(lambda x: x[1] in pop, testSet))
        if qNum == 32:
            testSet = list(filter(lambda x: x[1] in unpop, testSet))
        if qNum == 33:
            testSet = list(filter(lambda x: x[1] in highVar, testSet))
        nTest = len(testSet)
        print("Split %d: test set size after trimming: %d"
              % (split_idx, nTest))
        split_idx += 1
        uid, iid, tr, est = ncf.test(testSet)
        subsubRMSE = np.sum(pow(est - tr, 2))
        subRMSE = np.append(subRMSE, np.sqrt(subsubRMSE / nTest))
    RMSE = np.mean(subRMSE)
    print("Q" + str(qNum) + " has RMSE " + str(RMSE))
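# NaiveCF is a custom baseline whose definition is not included. Judging from
# the call sites (fit() on a full trainset, test() returning parallel arrays
# of uid, iid, true rating, and estimate), one plausible sketch is a predictor
# that returns each user's mean rating; the class body below is entirely an
# assumption.
import numpy as np


class NaiveCF:
    def fit(self, trainset):
        # trainset.ur maps inner uid -> list of (inner iid, rating)
        self.means = {u: np.mean([r for (_, r) in ratings])
                      for u, ratings in trainset.ur.items()}
        self.global_mean = trainset.global_mean
        self.trainset = trainset
        return self

    def test(self, testset):
        uid, iid, tr, est = [], [], [], []
        for (u, i, r) in testset:
            uid.append(u)
            iid.append(i)
            tr.append(r)
            try:
                inner_u = self.trainset.to_inner_uid(u)
                est.append(self.means[inner_u])
            except ValueError:  # user unseen at training time
                est.append(self.global_mean)
        return np.array(uid), np.array(iid), np.array(tr), np.array(est)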
def surpriseSVD(movieLensDataPath='data_clean.txt'):
    '''Basic use of the surprise SVD algorithm.

    Params: movieLensDataPath is the path to the MovieLens data we're
    looking at. Note: replace with cleaned data.
    We want to return U and V where, for a matrix Y of movie ratings,
    Y ~= U^T V.
    '''
    # Load the data as a pandas data frame, as reading from text didn't quite
    # work at first.
    df = pd.read_csv(movieLensDataPath, sep="\t", header=None)
    df.columns = ["User Id", "Movie Id", "Rating"]

    # We need the rating scale.
    reader = Reader(rating_scale=(1, 5))

    # The columns are User Id, Movie Id, and Rating.
    data = Dataset.load_from_df(df[["User Id", "Movie Id", "Rating"]], reader)

    # To fit to the SVD algorithm, we have to convert it to a trainset.
    algo = SVD()
    trainset = data.build_full_trainset()
    algo.fit(trainset)

    # U and V!
    algop = algo.pu
    algoq = algo.qi

    # Simple cross-validation
    kf = KFold(n_splits=3)
    algo = SVD()
    for trainset, testset in kf.split(data):
        # train and test algorithm.
        algo.fit(trainset)
        predictions = algo.test(testset)
        # Compute and print Root Mean Squared Error
        accuracy.rmse(predictions, verbose=True)

    # Return U (pu) and V (qi)
    return algop, algoq
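# Usage note: in surprise's SVD, pu has shape (n_users, n_factors) and qi has
# shape (n_items, n_factors), so the rating matrix is approximated (ignoring
# the bias terms that SVD also learns by default) by pu.dot(qi.T):
#
#     U, V = surpriseSVD('data_clean.txt')
#     print(U.shape, V.shape)  # (n_users, n_factors), (n_items, n_factors)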
def my_cross_validation(algo, data, k=5, threshold=7, n_splits=5,
                        verbose=False):
    kf = KFold(n_splits=n_splits)
    cv_map = {'map@{}'.format(k): [], 'mar@{}'.format(k): []}
    time_map = {'Fit time': [], 'Test time': []}
    for trainset, testset in kf.split(data):
        step_one = datetime.now()
        algo.fit(trainset)
        step_two = datetime.now()
        predictions = algo.test(testset)
        step_three = datetime.now()
        precisions, recalls = precision_recall_at_k(predictions, k=k,
                                                    threshold=threshold)
        cv_map['map@{}'.format(k)].append(
            sum(precisions.values()) / len(precisions))
        cv_map['mar@{}'.format(k)].append(
            sum(recalls.values()) / len(recalls))
        time_map['Fit time'].append((step_two - step_one).total_seconds())
        time_map['Test time'].append((step_three - step_two).total_seconds())
    if verbose:
        print_summary(
            algo,
            ['map@{}'.format(k), 'mar@{}'.format(k)],
            cv_map,
            time_map['Fit time'],
            time_map['Test time'],
            n_splits
        )
    cv_map.update(time_map)
    return cv_map
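# Example use (assuming a loaded surprise Dataset named `data`; the default
# threshold of 7 suggests a 1-10 rating scale):
#
#     from surprise import SVD
#     cv_map = my_cross_validation(SVD(), data, k=5, threshold=7, n_splits=5)
#     print(np.mean(cv_map['map@5']), np.mean(cv_map['mar@5']))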
def kfold_crossvalidation(data, model, folds=5, k=5, threshold=4):
    """Performs k-fold cross-validation on a KNN surprise model and returns
    average precision and recall.

    Arguments:
        data {surprise.dataset.DatasetAutoFolds} -- Surprise Dataset
        model {surprise.prediction_algorithms.knns.KNNBasic} -- Surprise
            KNNBasic model
        folds {int} -- number of folds in cross-validation (default: {5})
        k {int} -- number of recommendations per user (default: {5})
        threshold {int} -- ratings threshold (default: {4})

    Returns:
        Tuple consisting of:
        average_precision {float} -- Average precision of the model
        average_recall {float} -- Average recall of the model
    """
    kf = KFold(n_splits=folds)
    preclist = []
    reclist = []
    for trainset, testset in kf.split(data):  # cross-validation splits
        model.fit(trainset)  # fit model on trainset
        predictions = model.test(testset)
        precisions, recalls = precision_recall_at_k(predictions, k=k,
                                                    threshold=threshold)
        total_precision = (sum(prec for prec in precisions.values()) /
                           len(precisions))
        total_recall = sum(rec for rec in recalls.values()) / len(recalls)
        preclist.append(total_precision)
        reclist.append(total_recall)
    average_precision = np.mean(preclist)
    average_recall = np.mean(reclist)
    return average_precision, average_recall
def draw_t_prec_recall(algo, n_folds, t_low, t_high, thre):
    kf = KFold(n_splits=n_folds)
    ts = [i for i in range(t_low, t_high + 1)]
    precision = []
    recall = []
    for t in ts:
        temp_prec = []
        temp_recall = []
        for trainset, testset in kf.split(data):
            # train and test algorithm.
            algo.fit(trainset)
            trimmed_testset = testset_trim(testset, t, threshold=thre)
            predictions = algo.test(trimmed_testset)
            precisions, recalls = precision_recall_at_t(predictions, t,
                                                        threshold=thre)
            fold_mean_prec = (sum(prec for prec in precisions.values()) /
                              len(precisions))
            fold_mean_recall = (sum(rec for rec in recalls.values()) /
                                len(recalls))
            temp_prec.append(fold_mean_prec)
            temp_recall.append(fold_mean_recall)
        t_mean_prec = sum(prec for prec in temp_prec) / len(temp_prec)
        t_mean_recall = sum(rec for rec in temp_recall) / len(temp_recall)
        precision.append(t_mean_prec)
        recall.append(t_mean_recall)
    return ts, precision, recall
def get_accuracy(df, genre, neighbors=30, min_neighbors=5, seed=12345,
                 kfolds=5, k=5, threshold=4):
    """
    Gets the precision and recall of the model for each genre using
    cross-validation.

    Args:
        df (pandas.DataFrame): the dataset of actual ratings
        genre (str): the genre for the model
        neighbors (int): the number of neighbors to take into account when
            training the model. Default is 30.
        min_neighbors (int): the number of neighbors a user must have in
            order to get a prediction. Default is 5.
        seed (int): setting the random state. Default is 12345.
        kfolds (int): the number of folds for cross-validation. Default is 5.
        k (int): number of recommendations for each user. Default is 5.
        threshold (int): the cutoff rating at which an item will be
            considered 'enjoyed.'

    Returns:
        prec (float): the average precision across the k-fold cross-validation
        rec (float): the average recall across the k-fold cross-validation
    """
    data = df[df['genre'] == genre]
    data = data[['user_id', 'book_id', 'rating']]
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(data[['user_id', 'book_id', 'rating']],
                                reader)
    algo_KNNbasic = KNNBasic(k=neighbors, min_k=min_neighbors,
                             random_state=seed)
    kf = KFold(n_splits=kfolds, random_state=seed)
    prec_list = []
    recalls_list = []
    for trainset, testset in kf.split(data):
        algo_KNNbasic.fit(trainset)
        predictions = algo_KNNbasic.test(testset)
        precisions, recalls = precision_recall_at_k(predictions, k=k,
                                                    threshold=threshold)
        # Precision and recall can then be averaged over all users
        precision = (sum(prec for prec in precisions.values()) /
                     len(precisions))
        logger.info("Precision:")
        logger.info(precision)
        recall = sum(rec for rec in recalls.values()) / len(recalls)
        logger.info("Recall:")
        logger.info(recall)
        prec_list.append(precision)
        recalls_list.append(recall)
    prec = sum(prec_list) / len(prec_list)
    rec = sum(recalls_list) / len(recalls_list)
    return prec, rec
def eval_model(model):
    kf = KFold(n_splits=3)
    for trainset, testset in kf.split(data):
        # train and predict
        model.fit(trainset)
        predictions = model.test(testset)
        # compute RMSE
        accuracy.rmse(predictions, verbose=True)
def rank_predictions(model_name):
    k_KNN = 22
    k_NNMF = 20
    k_MF = 26
    if model_name == 'KNN':
        sim_options = {'name': 'pearson_baseline', 'shrinkage': 0}
        model = KNNWithMeans(k_KNN, sim_options=sim_options)
    elif model_name == 'NNMF':
        model = NMF(n_factors=k_NNMF)
    else:
        model = SVD(n_factors=k_MF)

    precision_arr = []
    recall_arr = []
    for t in range(1, 26):
        kf = KFold(n_splits=10)
        print(t)
        p = []
        r = []
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            predictions = model.test(testSet)
            precisions, recalls = precision_recall(predictions, t)
            p.append(sum(prec for prec in precisions.values()) /
                     len(precisions))
            r.append(sum(rec for rec in recalls.values()) / len(recalls))
        precision_arr.append(np.mean(np.array(p)))
        recall_arr.append(np.mean(np.array(r)))

    # precision vs t
    plt.plot(list(range(1, 26)), precision_arr)
    plt.xlabel("Size")
    plt.ylabel("Precision")
    plt.title("The average precision plot using " + model_name)
    plt.show()

    # recall vs t
    plt.plot(list(range(1, 26)), recall_arr)
    plt.xlabel("Size")
    plt.ylabel("Recall")
    plt.title("The average recall plot using " + model_name)
    plt.show()

    # precision vs recall
    plt.plot(recall_arr, precision_arr)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("The average precision and recall plot using " + model_name)
    plt.show()

    return precision_arr, recall_arr
def svdpp(dataset):
    start = time.time()
    algo = SVDpp()
    kf = KFold(n_splits=5)
    for trainset, testset in kf.split(dataset):
        algo.fit(trainset)
        predictions = algo.test(testset)
        acc = accuracy.rmse(predictions, verbose=True)
    end = time.time()
    print('SVD++ took (minutes):', (end - start) / 60)
    return acc  # RMSE of the last fold
def cross_validation(self, data, algo):
    # define a cross-validation iterator
    kf = KFold(n_splits=7, random_state=2)
    for trainset, testset in kf.split(data):
        # train and test algorithm.
        algo.fit(trainset)
        predictions = algo.test(testset)
        # Compute and print Root Mean Squared Error
        accuracy.rmse(predictions, verbose=True)
def train_naive(data, R):
    kfold = KFold(n_splits=10)
    # per-user mean rating, computed across the rows of the ratings matrix R
    ur_mean = np.mean(R, axis=1)
    rmse = []
    for _, testset in kfold.split(data):
        r_pred = []
        r = []
        for item in testset:
            # predict each user's mean rating (raw user ids are 1-indexed)
            r_pred.append(ur_mean[int(item[0]) - 1])
            r.append(item[2])
        rmse.append((np.mean((np.array(r_pred) - np.array(r)) ** 2)) ** 0.5)
    return np.mean(rmse)
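# train_naive() scores a per-user mean predictor: only the test folds are
# consumed, and every rating is predicted as the user's row mean of R.
# Example use, assuming R is a NumPy users-by-items array aligned with the
# raw user ids in `data`:
#
#     naive_rmse = train_naive(data, R)
#     print('naive RMSE:', naive_rmse)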
def train():
    data = load_dataset()
    algo_svd = SVD()
    algo_nmf = NMF()
    print("Cross Validation procedure")
    kf = KFold(n_splits=KFOLD_NUM)
    for i, (trainset_cv, testset_cv) in enumerate(kf.split(data), start=1):
        print(f"===> Fold number {i}")
        # Save the first fold
        train_helper(algo_svd, "SVD", trainset_cv, testset_cv, i == 1)
        train_helper(algo_nmf, "NMF", trainset_cv, testset_cv, i == 1)
def run(self):
    # will run model
    ratings = pd.read_csv('rating_final.csv')
    ratings_dict = {"userID": list(ratings.userID),
                    "placeID": list(ratings.placeID),
                    "rating": list(ratings.rating)}
    df = pd.DataFrame(ratings_dict)
    reader = Reader(rating_scale=(0, 2))
    data = Dataset.load_from_df(df[["userID", "placeID", "rating"]], reader)

    # Use user-based cosine similarity
    sim_options = {
        "name": "cosine",
        "user_based": True,  # Compute similarities between users
        "min_support": 9
    }

    # define a cross-validation iterator
    kf = KFold(n_splits=5)
    algo = KNNWithMeans(sim_options=sim_options)
    places = list(df['placeID'].unique())
    ordered = ArrayList()
    for i in places:
        total = 0
        for trainset, testset in kf.split(data):  # result for each fold
            # train algorithm.
            algo.fit(trainset)
            # test algorithm
            # predictions = algo.test(testset)
            # Compute and print Root Mean Squared Error
            # accuracy.rmse(predictions, verbose=True)
            # gets predicted rating for each place
            prediction = algo.predict(self.user, i, verbose=False)
            total += prediction.est
        # average the estimate over the folds
        ordered.append(i, total / 5)
    ordered.sort()
    highest = ordered.inArray[ordered.count - 5:ordered.count]
    place = pd.read_csv('geoplaces2.csv')
    # placedf = pd.DataFrame({"placeID": list(place.placeID),
    #                         "name": list(place.name)})
    count = 0
    finalRec = ArrayList()
    for i in range(len(highest) - 1, -1, -1):
        count += 1
        name = list(place[place["placeID"] == highest[i].id]['name'])
        finalRec.append(count, name[0])
    # printing accuracy score
    out = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    print(mean_rmse)
    return finalRec.inArray
def func6():
    from surprise import SVD
    from surprise import Dataset
    from surprise import accuracy
    from surprise.model_selection import KFold

    data = Dataset.load_builtin('ml-100k')
    kf = KFold(n_splits=3)
    algo = SVD()
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        accuracy.rmse(predictions, verbose=True)
def printingModelPrecisionAndRecall(algo, dataSet):
    kf = KFold(n_splits=5)
    for trainset, testset in kf.split(dataSet):
        algo.fit(trainset)
        predictions = algo.test(testset)
        precisions, recalls = precision_recall_at_k(predictions, k=5)
        # Precision and recall can then be averaged over all users
        print("Precision value: " +
              str(sum(prec for prec in precisions.values()) /
                  len(precisions)))
        print("Recall value: " +
              str(sum(rec for rec in recalls.values()) / len(recalls)))
def surprise_cv_algo(data, algo, k_fold=5, verbose=True):
    # Split into folds
    kf = KFold(n_splits=k_fold)
    rmse_ = 0
    for trainset, testset in kf.split(data):
        # train and test algorithm.
        algo.fit(trainset)
        predictions = algo.test(testset)
        # Compute and accumulate RMSE
        rmse_ += accuracy.rmse(predictions, verbose=verbose)
    rmse_mean = rmse_ / k_fold
    return rmse_mean
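# Example use (assuming a loaded surprise Dataset named `data`):
#
#     from surprise import SVD
#     mean_rmse = surprise_cv_algo(data, SVD(), k_fold=5, verbose=False)
#     print('mean RMSE over 5 folds:', mean_rmse)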
def calculate_precision_recall(classifiers, threshold, data):
    kf = KFold(n_splits=10)
    precisions = [[], [], []]
    recalls = [[], [], []]
    for t in range(1, 26):
        for i in range(3):
            classifier = classifiers[i]
            if i == 1:
                print("doing nmf")
            elif i == 2:
                print("doing svd")
            # Exhaust the iterator so that only the last fold is used below.
            for trainset, testset in kf.split(data):
                pass
            classifier.fit(trainset)
            prediction = classifier.test(testset)
            S = dict()
            # e.g. user: 88 item: 337 r_ui = 3.50 est = 3.74
            # {'actual_k': 24, 'was_impossible': False}
            for (uid, mid, r, r_pred, _) in prediction:
                if uid in S:
                    S[uid].append((mid, r, r_pred))
                else:
                    S[uid] = [(mid, r, r_pred)]
            count, p_sum, r_sum = (0, 0, 0)
            for uid in S:
                if len(S[uid]) >= t:
                    pred_r = S[uid]
                    G = set([x[0] for x in
                             filter(lambda x: x[1] >= threshold, pred_r)])
                    if len(G) != 0:
                        pred_r = sorted(pred_r, key=lambda x: -int(x[2]))
                        S2 = set([x[0] for x in pred_r[:t]])
                        inter = G & S2
                        precision = float(len(inter)) / len(S2)
                        recall = float(len(inter)) / len(G)
                        count += 1
                        p_sum += precision
                        r_sum += recall
            precisions[i].append(p_sum / count)
            recalls[i].append(r_sum / count)
    return precisions, recalls
def NMF_trim_filter(ratings, dims, func, mv_dict):
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    min_RMSE = False
    min_MAE = False
    fac_num_RMSE = 0
    fac_num_MAE = 0
    kf = KFold(n_splits=10, random_state=42)
    for k in range(len(dims)):
        nmf = NMF(n_factors=dims[k], random_state=42)
        test_rmse = np.array([])
        test_mae = np.array([])
        for trainset, testset in kf.split(data):
            nmf.fit(trainset)
            full_data = trainset.build_testset() + testset
            func(mv_dict, testset)  # trims the testset in place
            pred = nmf.test(testset)
            test_rmse = np.append(test_rmse,
                                  accuracy.rmse(pred, verbose=False))
            test_mae = np.append(test_mae,
                                 accuracy.mae(pred, verbose=False))
        RMSE[k] = np.mean(test_rmse)
        if (not min_RMSE) or RMSE[k] < min_RMSE:
            min_RMSE = RMSE[k]
            fac_num_RMSE = dims[k]
        MAE[k] = np.mean(test_mae)
        if (not min_MAE) or MAE[k] < min_MAE:
            min_MAE = MAE[k]
            fac_num_MAE = dims[k]
        print('For k = %i :' % dims[k])
        print('RMSE: ', RMSE[k])
        print('MAE: ', MAE[k])
    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finished plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    print('\t---Minimum Average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minimum Average MAE is ', min_MAE)
""" This module descibes how to use cross-validation iterators. """ from __future__ import (absolute_import, division, print_function, unicode_literals) from surprise import SVD from surprise import Dataset from surprise import accuracy from surprise.model_selection import KFold # Load the movielens-100k dataset data = Dataset.load_builtin('ml-100k') # define a cross-validation iterator kf = KFold(n_splits=3) algo = SVD() for trainset, testset in kf.split(data): # train and test algorithm. algo.fit(trainset) predictions = algo.test(testset) # Compute and print Root Mean Squared Error accuracy.rmse(predictions, verbose=True)
def test_KFold(toy_data):
    # Test n_folds parameter
    kf = KFold(n_splits=5)
    assert len(list(kf.split(toy_data))) == 5

    with pytest.raises(ValueError):
        kf = KFold(n_splits=10)
        next(kf.split(toy_data))  # Too big (greater than number of ratings)

    with pytest.raises(ValueError):
        kf = KFold(n_splits=1)
        next(kf.split(toy_data))  # Too low (must be >= 2)

    # Make sure data has not been shuffled. If not shuffled, the users in the
    # testsets are 0, 1, 2... 4 (in that order).
    kf = KFold(n_splits=5, shuffle=False)
    users = [int(testset[0][0][-1]) for (_, testset) in kf.split(toy_data)]
    assert users == list(range(5))

    # Make sure that when called two times without shuffling, folds are the
    # same.
    kf = KFold(n_splits=5, shuffle=False)
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a == testsets_b
    # test once again with another KFold instance
    kf = KFold(n_splits=5, shuffle=False)
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a == testsets_b

    # We'll now shuffle b and check that folds are different.
    # (this is conditioned by seed setting at the beginning of file)
    kf = KFold(n_splits=5, random_state=None, shuffle=True)
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a != testsets_b
    # test once again: two calls to kf.split make different splits when
    # random_state=None
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a != testsets_b

    # Make sure that folds are the same when the same KFold instance is used
    # with shuffle=True but random_state is set to some value
    kf = KFold(n_splits=5, random_state=1, shuffle=True)
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a == testsets_b

    # Make sure raw ratings are not shuffled by KFold
    old_raw_ratings = copy(toy_data.raw_ratings)
    kf = KFold(n_splits=5, shuffle=True)
    next(kf.split(toy_data))
    assert old_raw_ratings == toy_data.raw_ratings

    # Make sure kf.split() and the old toy_data.split() have the same folds.
    np.random.seed(3)
    with pytest.warns(UserWarning):
        toy_data.split(2, shuffle=True)
    testsets_a = [testset for (_, testset) in toy_data.folds()]
    kf = KFold(n_splits=2, random_state=3, shuffle=True)
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a == testsets_b
from surprise.model_selection import KFold

data = Dataset.load_builtin('ml-100k')

algo = SVD()

trainset = data.build_full_trainset()
algo.fit(trainset)

testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

# We can also do this during a cross-validation procedure!
print('CV procedure:')

kf = KFold(n_splits=3)
for i, (trainset_cv, testset_cv) in enumerate(kf.split(data)):
    print('fold number', i + 1)
    algo.fit(trainset_cv)

    print('On testset,', end=' ')
    predictions = algo.test(testset_cv)
    accuracy.rmse(predictions, verbose=True)

    print('On trainset,', end=' ')
    predictions = algo.test(trainset_cv.build_testset())
    accuracy.rmse(predictions, verbose=True)