def surprise_algo(algo, train_path="datas/train.csv", test_path="datas/test.csv", verbose=True):
    # Reader with rating scale
    reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 5))

    # Specify the training and test datasets
    folds_files = [(train_path, test_path)]
    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()

    print("Start prediction...")
    for trainset, testset in pkf.split(data):
        # Train and predict with the algorithm.
        algo.fit(trainset)
        predictions = algo.test(testset)

    pred = pd.read_csv(test_path, names=["User", "Movie", "Rating"])

    print("Postprocessing predictions...")
    for index, row in pred.iterrows():
        # Round the estimate and clip it to the valid rating scale.
        rating = round(predictions[index].est)
        if rating > 5:
            rating = 5
        elif rating < 1:
            rating = 1
        # Assign through the DataFrame: mutating `row` would only change a copy.
        pred.at[index, "Rating"] = rating

    return pred
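# A minimal usage sketch for surprise_algo above. The SVD settings and the
# submission path are illustrative assumptions, not from the original snippet.
from surprise import SVD

if __name__ == '__main__':
    pred = surprise_algo(SVD(n_epochs=30, lr_all=0.01, reg_all=0.1))
    pred.to_csv("datas/submission.csv", index=False)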
def test_randomizedsearchcv_best_estimator():
    """Ensure that the best estimator is the one that gives the best score (by
    re-running it)."""

    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))

    param_distributions = {'n_epochs': [5], 'lr_all': uniform(0.002, 0.003),
                           'reg_all': uniform(0.04, 0.02), 'n_factors': [1],
                           'init_std_dev': [0]}
    rs = RandomizedSearchCV(SVD, param_distributions, measures=['mae'],
                            cv=PredefinedKFold(), joblib_verbose=100)
    rs.fit(data)
    best_estimator = rs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator, data, measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == rs.best_score['mae']
def test_randomizedsearchcv_best_estimator(u1_ml100k):
    """Ensure that the best estimator is the one that gives the best score (by
    re-running it)."""

    param_distributions = {'n_epochs': [5], 'lr_all': uniform(0.002, 0.003),
                           'reg_all': uniform(0.04, 0.02), 'n_factors': [1],
                           'init_std_dev': [0]}
    rs = RandomizedSearchCV(SVD, param_distributions, measures=['mae'],
                            cv=PredefinedKFold(), joblib_verbose=100)
    rs.fit(u1_ml100k)
    best_estimator = rs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator, u1_ml100k, measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == rs.best_score['mae']
def test_gridsearchcv_best_estimator(u1_ml100k):
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)."""

    param_grid = {'n_epochs': [5], 'lr_all': [0.002, 0.005],
                  'reg_all': [0.4, 0.6], 'n_factors': [1],
                  'init_std_dev': [0]}
    gs = GridSearchCV(SVD, param_grid, measures=['mae'],
                      cv=PredefinedKFold(), joblib_verbose=100)
    gs.fit(u1_ml100k)
    best_estimator = gs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator, u1_ml100k, measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == gs.best_score['mae']
def surprise_slopeOne(train_file, test_file):
    """SlopeOne with the Surprise library.

    Compute the predictions on a test set after training on a train set,
    using the SlopeOne method from Surprise.

    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        -
    Returns:
        numpy array: predictions
    """
    print("slopeone")
    algo = SlopeOne()
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)
        # Predict
        predictions = algo.test(testset)
    pred = np.zeros(len(predictions))
    for i in range(len(predictions)):
        pred[i] = predictions[i].est
    return pred
def surprise_SVD(train_file, test_file):
    """SVD with the Surprise library.

    Compute the predictions on a test set after training on a train set,
    using the SVD method from Surprise.

    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        n_factors: the number of factors.
        n_epochs: the number of iterations of the SGD procedure.
        lr_all: the learning rate for all parameters.
        reg_all: the regularization term for all parameters.
    Returns:
        numpy array: predictions
    """
    print("SVD")
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    # Algorithm
    algo = SVD(n_epochs=30, lr_all=0.01, reg_all=0.1)
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)
        # Predict
        predictions = algo.test(testset)
    pred = np.zeros(len(predictions))
    for i in range(len(predictions)):
        pred[i] = predictions[i].est
    return pred
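# The two helpers above read the same fold layout, so their outputs align
# index-for-index. A small blending sketch (the weight and the idea of
# averaging the two models are illustrative assumptions, not original code):
def blend_slopeone_svd(train_file, test_file, w=0.5):
    # Weighted average of the SlopeOne and SVD estimates for the same test file.
    pred_slopeone = surprise_slopeOne(train_file, test_file)
    pred_svd = surprise_SVD(train_file, test_file)
    return w * pred_slopeone + (1 - w) * pred_svd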
def test_dump():
    """Train an algorithm, compute its predictions then dump them.
    Ensure that the predictions that are loaded back are the correct ones, and
    that the predictions of the dumped algorithm are also equal to the other
    ones."""

    random.seed(0)

    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))
    pkf = PredefinedKFold()

    trainset, testset = next(pkf.split(data))

    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, predictions, algo)
        predictions_dumped, algo_dumped = dump.load(tmp_file.name)

        predictions_algo_dumped = algo_dumped.test(testset)
        assert predictions == predictions_dumped
        assert predictions == predictions_algo_dumped
def surprise_knn_ub(train_file, test_file):
    """User-based KNN with the Surprise library.

    Compute the predictions on a test set after training on a train set,
    using the KNNBaseline method from Surprise.

    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        k: the (max) number of neighbors to take into account for aggregation.
        sim_options (dict): a dictionary of options for the similarity
            measure.
    Returns:
        numpy array: predictions
    """
    print("knnUB")
    algo = KNNBaseline(k=300, sim_options={'name': 'pearson_baseline',
                                           'user_based': True})
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)
        # Predict
        predictions = algo.test(testset)
    pred = np.zeros(len(predictions))
    for i in range(len(predictions)):
        pred[i] = predictions[i].est
    return pred
def test_best_estimator():
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)."""

    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))

    param_grid = {'n_epochs': [5], 'lr_all': [0.002, 0.005],
                  'reg_all': [0.4, 0.6], 'n_factors': [1],
                  'init_std_dev': [0]}
    gs = GridSearchCV(SVD, param_grid, measures=['mae'],
                      cv=PredefinedKFold(), joblib_verbose=100)
    gs.fit(data)
    best_estimator = gs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator, data, measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == gs.best_score['mae']
def prepare_data(self):
    super(RecommenderOnSurprice, self).prepare_data()
    reader = Reader(line_format='user item rating', sep='\t', rating_scale=(1, 5))
    data = Dataset.load_from_folds([(self.train_path, self.test_path)], reader=reader)
    trainset, testset = None, None
    pkf = PredefinedKFold()
    for trainset_, testset_ in pkf.split(data):
        trainset, testset = trainset_, testset_
    self.trainset, self.testset = trainset, testset
def load_test_files(self):
    reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
    train_file = 'test-data-train.csv'
    test_file = 'test-data-test.csv'
    folds_files = [(train_file, test_file)]
    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))
    return trainset, testset
def precompute_data(experiment_dir):
    rating_train_path = os.path.join(experiment_dir, 'ratings_train.txt')
    rating_test_path = os.path.join(experiment_dir, 'ratings_test.txt')
    ratings_reader = Reader(line_format="user item rating", sep=' ')
    dataset = Dataset.load_from_folds([(rating_train_path, rating_test_path)],
                                      ratings_reader)
    pkf = PredefinedKFold()
    trainset, testset = list(pkf.split(dataset))[0]

    n_x, yr = trainset.n_users, trainset.ir
    min_support = 1
    args = [n_x, yr, min_support]
    sims = pearson(*args).astype(np.float32)
    return trainset, testset, sims
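# Sketch of how precompute_data's output might be consumed (the experiment
# directory below is a placeholder assumption). Since n_x is trainset.n_users
# and yr is trainset.ir, `sims` is a user-user Pearson matrix indexed by
# inner user ids:
trainset, testset, sims = precompute_data('experiments/run1')  # path is an assumption
print('user-user Pearson similarity between inner ids 0 and 1:', sims[0, 1])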
def load_data_hyper(train_file, submission_file):
    train_file, submission_file, df, df_toBeSubmitted = modify_data(
        train_file, submission_file)
    reader = Reader(line_format='user item rating', sep=',')
    fold = [(train_file, submission_file)]
    trainAndTest = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    # Go through the single predefined fold
    for trainset, testset in pkf.split(trainAndTest):
        data = trainset
        test = testset
    return data, test, df, df_toBeSubmitted
def test_PredifinedKFold():
    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                    rating_scale=(1, 5))

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    # Make sure rating files are read correctly
    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))
    assert trainset.n_ratings == 6
    assert len(testset) == 3
def test_PredifinedKFold(toy_data_reader):
    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader,
                                   rating_scale=(1, 5))

    # Make sure rating files are read correctly
    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))
    assert trainset.n_ratings == 6
    assert len(testset) == 3

    # Make sure pkf returns the same folds as the deprecated data.folds()
    with pytest.warns(UserWarning):
        trainset_, testset_ = next(data.folds())
    assert testset_ == testset
def get_top_n_recommendations(self, test_set, top_n):
    self.test_set = test_set
    # Surprise requires a slightly different input data format, so we use two
    # different CSVs
    test_path_tmp = "resources//test_file.csv"
    train_path_tmp = "resources//train_file.csv"
    self.train_set.to_csv(train_path_tmp, index=False, header=False)
    self.test_set.to_csv(test_path_tmp, index=False, header=False)
    fold_files = [(train_path_tmp, test_path_tmp)]
    reader = Reader(rating_scale=(1, 10), line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold_files, reader=reader)
    for trainset, testset in PredefinedKFold().split(data):
        self.method.fit(trainset)
    already_ranked_items_by_users = self.train_set.groupby(
        'userID')['itemID'].apply(list)
    recommendations = {}
    pbar = tqdm(total=len(self.test_set.userID.unique()))
    for userID in self.test_set.userID.unique():
        pbar.update(1)
        if userID not in self.train_set.userID.unique():
            recommendations[str(userID)] = []
            continue
        items_expected_ranking = {}
        for itemID in self.train_set.itemID.unique():
            if itemID in already_ranked_items_by_users[userID]:
                continue
            # We call here the specific Surprise method that we use for this
            # model. The method predicts a score for a given item.
            predicted = self.method.predict(str(userID), str(itemID), clip=False)
            items_expected_ranking[itemID] = predicted[3]
        # Now we just sort by decreasing scores and take the top N
        sorted_predictions = sorted(items_expected_ranking.items(),
                                    key=operator.itemgetter(1))
        sorted_predictions.reverse()
        sorted_predictions = [x[0] for x in sorted_predictions]
        user_recommendations = sorted_predictions[:top_n]
        recommendations[str(userID)] = user_recommendations
    pbar.close()
    return recommendations
def func7():
    import os

    from surprise import SVD
    from surprise import Dataset
    from surprise import Reader
    from surprise import accuracy
    from surprise.model_selection import PredefinedKFold

    files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')
    reader = Reader('ml-100k')

    train_file = files_dir + 'u%d.base'
    test_file = files_dir + 'u%d.test'
    folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()

    algo = SVD()
    for trainset, testset in pkf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        accuracy.rmse(predictions, verbose=True)
def basic_rec(model_name, train_path, test_path, target_id):
    # build data
    # TODO check float and min_r
    reader = Reader(line_format='user item rating', sep='\t', rating_scale=(1, 5))
    data = Dataset.load_from_folds([(train_path, test_path)], reader=reader)
    trainset, testset = None, None
    pkf = PredefinedKFold()
    for trainset_, testset_ in pkf.split(data):
        trainset, testset = trainset_, testset_

    # train model
    rec_algo = get_model(model_name)
    rec_algo.fit(trainset)

    # eval
    preds = rec_algo.test(testset)
    rmse = accuracy.rmse(preds, verbose=True)

    # predict for the target item
    fn_pred = lambda uid: rec_algo.predict(str(uid), str(target_id), r_ui=0).est
    target_predictions = list(map(fn_pred, range(trainset.n_users)))

    # top-N
    testset = trainset.build_anti_testset()
    predictions = rec_algo.test(testset)
    top_n = get_top_n(predictions, n=50)
    hit_ratios = {}
    for uid, user_ratings in top_n.items():
        topN = [int(iid) for (iid, _) in user_ratings]
        hits = [1 if target_id in topN[:i] else 0
                for i in [1, 3, 5, 10, 20, 50]]
        hit_ratios[int(uid)] = hits
    return target_predictions, hit_ratios
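# basic_rec relies on a get_top_n helper that is not defined in this snippet.
# Below is a plausible definition in the style of the well-known Surprise FAQ
# recipe (an assumption about the original helper, not its verbatim source):
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Map each user id to its n highest-estimated (item id, rating) pairs."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # Sort each user's candidates by estimated rating and keep the n best.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n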
def test_knns():
    """Ensure the k and min_k parameters are effective for knn algorithms."""

    # the test and train files are from the ml-100k dataset (10% of u1.base
    # and 10% of u1.test)
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))
    pkf = PredefinedKFold()

    # Actually, as KNNWithMeans and KNNBaseline have back up solutions for
    # when there are not enough neighbors, we can't really test them...
    klasses = (KNNBasic, )  # KNNWithMeans, KNNBaseline)

    k, min_k = 20, 5
    for klass in klasses:
        algo = klass(k=k, min_k=min_k)
        for trainset, testset in pkf.split(data):
            algo.fit(trainset)
            predictions = algo.test(testset)
            for pred in predictions:
                if not pred.details['was_impossible']:
                    assert min_k <= pred.details['actual_k'] <= k
def get_rating_predictions(self, test_set, cluster_user_mapping=None):
    self.test_set = test_set
    test_path_tmp = "..\\resources\\tmp\\test_file.csv"
    train_path_tmp = "..\\resources\\tmp\\train_file.csv"
    self.train_set.to_csv(train_path_tmp, index=False, header=False)
    self.test_set.to_csv(test_path_tmp, index=False, header=False)
    fold_files = [(train_path_tmp, test_path_tmp)]
    reader = Reader(rating_scale=(1, 10), line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold_files, reader=reader)
    for trainset, testset in PredefinedKFold().split(data):
        if cluster_user_mapping is None:
            self.method.fit(trainset)
        else:
            # NOTE: the per-cluster branch is unfinished. The original
            # groupby('') call would raise a KeyError, so it is left disabled.
            df_users_in_clusters = pd.DataFrame.from_dict(cluster_user_mapping)
            # df_cluser_users = df_users_in_clusters.groupby('')
            # Distinct clusters:
            clusters = list(set(cluster_user_mapping.values()))
            # for cluster in clusters:
            #     cluster_train_data = trainset[trainset.userID.isin() userID]
            pass
    results = pd.DataFrame(columns=['userID', 'itemID', 'real', 'est'])
    pbar = tqdm(total=len(self.test_set.index))
    for key, val in self.test_set.iterrows():
        prediction = self.method.predict(str(val.userID), str(val.itemID),
                                         clip=False)
        results = results.append({"userID": int(val.userID),
                                  "itemID": int(val.itemID),
                                  "real": int(val.rating),
                                  "est": int(prediction.est)},
                                 ignore_index=True)
        pbar.update(1)
    pbar.close()
    return results
def get_top_n_recommendations(self, test_set, top_n):
    self.test_set = test_set
    test_path_tmp = "..\\resources\\tmp\\test_file.csv"
    train_path_tmp = "..\\resources\\tmp\\train_file.csv"
    self.train_set.to_csv(train_path_tmp, index=False, header=False)
    self.test_set.to_csv(test_path_tmp, index=False, header=False)
    fold_files = [(train_path_tmp, test_path_tmp)]
    reader = Reader(rating_scale=(1, 10), line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold_files, reader=reader)
    for trainset, testset in PredefinedKFold().split(data):
        self.method.fit(trainset)
    already_ranked_items_by_users = self.train_set.groupby(
        'userID')['itemID'].apply(list)
    recommendations = {}
    pbar = tqdm(total=len(self.test_set.userID.unique()))
    for userID in self.test_set.userID.unique():
        pbar.update(1)
        if userID not in self.train_set.userID.unique():
            recommendations[str(userID)] = []
            continue
        items_expected_ranking = {}
        for itemID in self.train_set.itemID.unique():
            if itemID in already_ranked_items_by_users[userID]:
                continue
            # Calc prediction for item for user
            predicted = self.method.predict(str(userID), str(itemID), clip=False)
            items_expected_ranking[itemID] = predicted[3]
        sorted_predictions = sorted(items_expected_ranking.items(),
                                    key=operator.itemgetter(1))
        sorted_predictions.reverse()
        sorted_predictions = [str(x[0]) for x in sorted_predictions]
        user_recommendations = sorted_predictions[:top_n]
        recommendations[str(userID)] = user_recommendations
    pbar.close()
    return recommendations
def predict_rating_split_by_time(self, files_pair, algo_test):
    algo = algo_test[0]
    use_auto_parse = algo_test[1]
    if use_auto_parse:
        fold_files = [files_pair]
        reader = Reader(rating_scale=(1, 10), line_format='user item rating',
                        sep=',')
        data = Dataset.load_from_folds(fold_files, reader=reader)
        for trainset, testset in PredefinedKFold().split(data):
            algo.fit(trainset)
            predictions = algo.test(testset)
            rmse = accuracy.rmse(predictions, verbose=False)
            return rmse
    else:
        # Prepare dataset
        train_set = pd.read_csv(files_pair[0], parse_dates=[3])
        test_set = pd.read_csv(files_pair[1], parse_dates=[3])
        item_to_id_mapping = {}
        user_to_id_mapping = {}
        item_index = 0
        user_index = 0
        all_sets = pd.concat([train_set, test_set])
        for item in all_sets['itemID']:
            if item not in item_to_id_mapping.keys():
                item_to_id_mapping[item] = item_index
                item_index += 1
        for user in all_sets['userID']:
            if user not in user_to_id_mapping.keys():
                user_to_id_mapping[user] = user_index
                user_index += 1
        train_set['itemID'] = train_set['itemID'].map(item_to_id_mapping)
        test_set['itemID'] = test_set['itemID'].map(item_to_id_mapping)
        train_set['userID'] = train_set['userID'].map(user_to_id_mapping)
        test_set['userID'] = test_set['userID'].map(user_to_id_mapping)
        algo.fit(train_set)
        rec_list = algo.get_top_n_recommendations(test_set)
        # NOTE: this branch does not return anything yet.
        pass
def test_dump(u1_ml100k):
    """Train an algorithm, compute its predictions then dump them.
    Ensure that the predictions that are loaded back are the correct ones, and
    that the predictions of the dumped algorithm are also equal to the other
    ones."""

    random.seed(0)

    trainset, testset = next(PredefinedKFold().split(u1_ml100k))

    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, predictions, algo)
        predictions_dumped, algo_dumped = dump.load(tmp_file.name)

        predictions_algo_dumped = algo_dumped.test(testset)
        assert predictions == predictions_dumped
        assert predictions == predictions_algo_dumped
@pytest.fixture
def pkf():
    return PredefinedKFold()
def main():

    class MyParser(argparse.ArgumentParser):
        '''A parser which prints the help message when an error occurs. Taken
        from
        http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu.'''  # noqa

        def error(self, message):
            sys.stderr.write('error: %s\n' % message)
            self.print_help()
            sys.exit(2)

    parser = MyParser(
        description='Evaluate the performance of a rating prediction ' +
        'algorithm ' +
        'on a given dataset using cross validation. You can use a built-in ' +
        'or a custom dataset, and you can choose to automatically split the ' +
        'dataset into folds, or manually specify train and test files. ' +
        'Please refer to the documentation page ' +
        '(http://surprise.readthedocs.io/) for more details.',
        epilog="""Example:\n
        surprise -algo SVD -params "{'n_epochs': 5, 'verbose': True}"
        -load-builtin ml-100k -n-folds 3""")

    algo_choices = {
        'NormalPredictor': NormalPredictor,
        'BaselineOnly': BaselineOnly,
        'KNNBasic': KNNBasic,
        'KNNBaseline': KNNBaseline,
        'KNNWithMeans': KNNWithMeans,
        'SVD': SVD,
        'SVDpp': SVDpp,
        'NMF': NMF,
        'SlopeOne': SlopeOne,
        'CoClustering': CoClustering,
    }

    parser.add_argument('-algo', type=str,
                        choices=algo_choices,
                        help='The prediction algorithm to use. ' +
                        'Allowed values are ' +
                        ', '.join(algo_choices.keys()) + '.',
                        metavar='<prediction algorithm>')

    parser.add_argument('-params', type=str, metavar='<algorithm parameters>',
                        default='{}',
                        help='A kwargs dictionary that contains all the ' +
                        'algorithm parameters. ' +
                        'Example: "{\'n_epochs\': 10}".')

    parser.add_argument('-load-builtin', type=str, dest='load_builtin',
                        metavar='<dataset name>', default='ml-100k',
                        help='The name of the built-in dataset to use. ' +
                        'Allowed values are ' +
                        ', '.join(dataset.BUILTIN_DATASETS.keys()) +
                        '. Default is ml-100k.')

    parser.add_argument('-load-custom', type=str, dest='load_custom',
                        metavar='<file path>', default=None,
                        help='A file path to custom dataset to use. ' +
                        'Ignored if -load-builtin is set. The -reader ' +
                        'parameter needs to be set.')

    parser.add_argument('-folds-files', type=str, dest='folds_files',
                        metavar='<train1 test1 train2 test2... >',
                        default=None,
                        help='A list of custom train and test files. ' +
                        'Ignored if -load-builtin or -load-custom is set. '
                        'The -reader parameter needs to be set.')

    parser.add_argument('-reader', type=str, metavar='<reader>',
                        default=None,
                        help='A Reader to read the custom dataset. Example: ' +
                        '"Reader(line_format=\'user item rating timestamp\',' +
                        ' sep=\'\\t\')"')

    parser.add_argument('-n-folds', type=int, dest='n_folds',
                        metavar="<number of folds>",
                        default=5,
                        help='The number of folds for cross-validation. ' +
                        'Default is 5.')

    parser.add_argument('-seed', type=int, metavar='<random seed>',
                        default=None,
                        help='The seed to use for RNG. ' +
                        'Default is the current system time.')

    parser.add_argument('--with-dump', dest='with_dump', action='store_true',
                        help='Dump the algorithm ' +
                        'results in a file (one file per fold). ' +
                        'Default is False.')

    parser.add_argument('-dump-dir', dest='dump_dir', type=str,
                        metavar='<dir>',
                        default=None,
                        help='Where to dump the files. Ignored if ' +
                        '--with-dump is not set. Default is ' +
                        os.path.join(get_dataset_dir(), 'dumps/'))

    parser.add_argument('--clean', dest='clean', action='store_true',
                        help='Remove the ' + get_dataset_dir() +
                        ' directory and exit.')

    parser.add_argument('-v', '--version', action='version',
                        version=__version__)

    args = parser.parse_args()

    if args.clean:
        folder = get_dataset_dir()
        shutil.rmtree(folder)
        print('Removed', folder)
        exit()

    # setup RNG
    rd.seed(args.seed)
    np.random.seed(args.seed)

    # setup algorithm
    params = eval(args.params)
    if args.algo is None:
        parser.error('No algorithm was specified.')
    algo = algo_choices[args.algo](**params)

    # setup dataset
    if args.load_custom is not None:  # load custom and split
        if args.reader is None:
            parser.error('-reader parameter is needed.')
        reader = eval(args.reader)
        data = Dataset.load_from_file(args.load_custom, reader=reader)
        cv = KFold(n_splits=args.n_folds, random_state=args.seed)
    elif args.folds_files is not None:  # load from files
        if args.reader is None:
            parser.error('-reader parameter is needed.')
        reader = eval(args.reader)
        folds_files = args.folds_files.split()
        folds_files = [(folds_files[i], folds_files[i + 1])
                       for i in range(0, len(folds_files) - 1, 2)]
        data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)
        cv = PredefinedKFold()
    else:  # load builtin dataset and split
        data = Dataset.load_builtin(args.load_builtin)
        cv = KFold(n_splits=args.n_folds, random_state=args.seed)

    cross_validate(algo, data, cv=cv, verbose=True)
import os

from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# path to dataset folder
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader,
                               rating_scale=(1, 5))
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
def run_knn_baseline(sparse_data):
    # filename = "test.json"
    prefix = "knn_baseline_"
    trainFile = prefix + "train.txt"
    testFile = prefix + "test.txt"
    raw_data, userPurchasedSet, userTrueTestSet = preprocess(
        sparse_data, trainFile, testFile)
    folds_files = [(trainFile, testFile)]
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()
    bsl_options = {
        'method': 'sgd',
        'n_epochs': 20,
        'learning_rate': 0.005,
    }
    # sim name: cosine, msd, pearson, pearson_baseline
    # user_based: True  -- similarity will be computed based on users
    #             False -- similarity will be computed based on items.
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    predictions = {}
    top_n = {}
    testsSet = None
    total_precisions = 0.0
    total_recalls = 0.0
    total_hit = 0.0
    total_nDCG = 0.0
    total_ffeature = 0.0
    result_file = prefix + "result.txt"
    result_f = open(result_file, "w")
    for trainset, testset in pkf.split(data):
        testsSet = testset
        # algo = SVD(n_factors=5)
        algo = KNNBaseline(bsl_options=bsl_options, sim_options=sim_options)
        algo.fit(trainset)
        pre = algo.test(testset)
        accuracy.rmse(pre)
        accuracy.mae(pre)
        # calculate_rmse(predictions)

    # test
    rowNum = raw_data.get_row_size()
    colNum = raw_data.get_col_size()
    cur_time = time.time()
    time_cost = 0
    for i in range(rowNum):
        user = raw_data.get_userID(i)
        predictions[user] = set()
        pq = []
        heapq.heapify(pq)
        for j in range(colNum):
            item = raw_data.get_itemID(j)
            if user not in userPurchasedSet or item in userPurchasedSet[user]:
                continue
            value = raw_data.get_val(user, item, 'rating')
            predict = algo.predict(user, item, r_ui=0, verbose=False)[3]
            # Keep only the 10 highest-scored items in the heap.
            if len(pq) >= 10:
                heapq.heappop(pq)
            heapq.heappush(pq, (predict, item))
        top_n[user] = set()
        for items in pq:
            top_n[user].add(items[1])
        if user in userTrueTestSet:
            curPrecisions = calculate_precision(top_n[user],
                                                userTrueTestSet[user])
            curRecalls = calculate_recall(top_n[user], userTrueTestSet[user])
            ffeature = calculate_f_feature(curPrecisions, curRecalls)
            curHit = isHit(top_n[user], userTrueTestSet[user])
            cur_nDCG = calculate_NDCG(top_n[user], userTrueTestSet[user])
            total_precisions += curPrecisions
            total_recalls += curRecalls
            total_hit += curHit
            total_nDCG += cur_nDCG
            total_ffeature += ffeature
            result_f.write(user + "\t" + str(curPrecisions) + "\t" +
                           str(curRecalls) + "\t" + str(ffeature) + "\t" +
                           str(curHit) + '\t' + str(cur_nDCG) + "\n")
        if i != 0 and i % 1000 == 0:
            duration = (time.time() - cur_time) / 60
            time_cost += duration
            remaining_time = ((rowNum - i) / 1000) * duration
            cur_time = time.time()
            # print('precisions', total_precisions, ' recalls', total_recalls,
            #       ' nDCG', total_nDCG)
            print('i:', i, "/", rowNum, 'remaining time:', remaining_time, 'min')
            print('precisions', total_precisions, ' recalls', total_recalls,
                  ' hit', total_hit, 'nDCG:', total_nDCG)
    rowNum = raw_data.get_row_size()
    print('avg_precisions:', total_precisions / rowNum,
          'avg_recalls:', total_recalls / rowNum,
          'avg_ffeature', str(total_ffeature / rowNum),
          'avg_hit:', total_hit / rowNum,
          'avg_nDCG:', total_nDCG / rowNum)
    result_f.write("avg:\t" + str(total_precisions / rowNum) + "\t" +
                   str(total_recalls / rowNum) + "\t" +
                   str(total_ffeature / rowNum) + "\t" +
                   str(total_hit / rowNum) + '\t' +
                   str(total_nDCG / rowNum) + "\n")
    result_f.close()
def test_trainset_testset(toy_data_reader):
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader)

    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unkown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unkown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset
    ratingMap[(x[0], x[1])] = x[2]
# print("total data: " + str(len(ratingMap)))

# add avg business stars
# avgBusList = list()
# userRatingRDD = userRatingRDD.filter(lambda t: int(preUserMap[t[0]]) % 30 == 0)
# busRatingRDD = busRatingRDD.filter(lambda t: int(preBusinessMap[t[0]]) % 40 == 0)
# print("add user avg count: " + str(userRatingRDD.count()))
# print("add bus avg count: " + str(busRatingRDD.count()))

reader = Reader(line_format='user item rating', sep=",", skip_lines=1)
folds_files = [(trainingFilePath, validationFilePath)]
data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()
algo = SVD()
predictionList = list()
for trainset, testset in pkf.split(data):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    for uid, iid, true_r, est, _ in predictions:
        predictionList.append((uid, iid, est))
    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
import os

from surprise import SVD
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from surprise.accuracy import neg_rmse
from surprise.model_selection import cross_validate
from surprise.model_selection import PredefinedKFold

# the test and train files are from the ml-100k dataset (10% of u1.base and
# 10% of u1.test)
train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))
pkf = PredefinedKFold()


def test_SVD_parameters():
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVD(n_factors=1, n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, data, [['neg_rmse', neg_rmse]],
                                  pkf)['test_neg_rmse']

    # n_factors
    algo = SVD(n_factors=2, n_epochs=1, random_state=1)
    rmse_factors = cross_validate(algo, data, [['neg_rmse', neg_rmse]],
                                  pkf)['test_neg_rmse']
    assert rmse_default != rmse_factors
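# A companion check in the same pattern as test_SVD_parameters above (a
# sketch that follows that test's structure, not from the original file):
# varying n_epochs with everything else fixed should also change the score.
def test_SVD_n_epochs():
    algo = SVD(n_factors=1, n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, data, [['neg_rmse', neg_rmse]],
                                  pkf)['test_neg_rmse']

    # n_epochs
    algo = SVD(n_factors=1, n_epochs=2, random_state=1)
    rmse_n_epochs = cross_validate(algo, data, [['neg_rmse', neg_rmse]],
                                   pkf)['test_neg_rmse']
    assert rmse_default != rmse_n_epochs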