import pickle

import pandas as pd
import surprise
from sklearn.model_selection import train_test_split


def get_sim_matrix(load_sim_matrix):
    if load_sim_matrix is False:
        raw = pd.read_csv('books_metadata/ratings.csv')
        raw = raw[raw['book_id'] <= 1900]
        raw.drop_duplicates(inplace=True)
        print('we have', str(raw.shape[0]), 'ratings')
        print('the number of unique users we have is:', len(raw.user_id.unique()))
        print('the number of unique books we have is:', len(raw.book_id.unique()))
        # Surprise expects exactly three columns: user, item, rating.
        rawTrain = raw[['user_id', 'book_id', 'rating']]
        rawTrain, rawHoldout = train_test_split(rawTrain, test_size=0.25)
        reader = surprise.Reader(rating_scale=(1, 5))
        data = surprise.Dataset.load_from_df(rawTrain, reader)
        sim_options = {'name': 'cosine', 'user_based': False}
        collabKNN = surprise.KNNWithMeans(k=100, sim_options=sim_options)
        kSplit = surprise.model_selection.split.KFold(n_splits=2, shuffle=False)
        for trainset, testset in kSplit.split(data):
            collabKNN.fit(trainset)
            predictionsKNN = collabKNN.test(testset)
            surprise.accuracy.rmse(predictionsKNN, verbose=True)
        sim_matrix = collabKNN.compute_similarities()
        with open('books_sim_matrix', 'wb') as output:
            pickle.dump(sim_matrix, output, protocol=pickle.HIGHEST_PROTOCOL)
        return sim_matrix
    with open('books_sim_matrix', 'rb') as infile:
        sim_matrix = pickle.load(infile)
    return sim_matrix
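# Usage sketch (an assumption, not part of the original): the matrix returned
# by compute_similarities() is indexed by Surprise *inner* item ids, so a raw
# book_id would need to be mapped through the trainset before any lookup.
import numpy as np

sim_matrix = get_sim_matrix(load_sim_matrix=False)
print('similarity matrix shape:', np.asarray(sim_matrix).shape)
# e.g. the ten most similar items to inner id 0, by descending cosine similarity
print(np.argsort(sim_matrix[0])[::-1][:10])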
import surprise as sp
from surprise.model_selection import KFold
from sklearn import model_selection as cv


def train(ratings, k_neighbors, k_folds):
    """Train a model and return it, so it can be evaluated elsewhere.

    @param ratings: pandas dataframe to train on, with columns UserId, MovieId, Rating
    @param k_neighbors: number of neighbors to examine
    @param k_folds: number of folds for cross-validation
    @returns: tuple of (algo, test data); we can call methods such as `test` on the algo
    """
    train_data, test_data = cv.train_test_split(ratings, test_size=0.20)
    reader = sp.Reader(rating_scale=(1, 5))
    trainset = sp.Dataset.load_from_df(train_data, reader)
    testset = sp.Dataset.load_from_df(test_data, reader)
    similarity_options = {'name': 'pearson', 'user_based': False}
    algo = sp.KNNWithMeans(sim_options=similarity_options, k=k_neighbors, min_k=5)
    # KFold.split() and fit() replace the trainset.split()/folds()/train()
    # API that was removed from Surprise.
    for _trainset, _ in KFold(n_splits=k_folds).split(trainset):
        algo.fit(_trainset)
    testset = testset.build_full_trainset().build_testset()
    return (algo, testset)
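# A minimal usage sketch (assuming a `ratings` dataframe with columns
# UserId, MovieId, Rating is already loaded):
algo, testset = train(ratings, k_neighbors=40, k_folds=3)
predictions = algo.test(testset)
sp.accuracy.rmse(predictions, verbose=True)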
def meansKNN(train, test):
    """ Run the KNNWithMeans model from the Surprise library.

    @param train: the training set in the Surprise format.
    @param test: the test set in the Surprise format.
    @return: the predictions in a numpy array.
    """
    algo = spr.KNNWithMeans()
    algo.fit(train)
    predictions = algo.test(test)
    return get_predictions(predictions)
def algo_tester(data_object):
    '''
    Produces a dataframe displaying the RMSEs and the test & train times of the
    different Surprise algorithms.

    ---Parameters---
    data_object (variable) created from the read_data_surprise function

    ---Returns---
    a dataframe where you can compare the performance of different algorithms
    '''
    benchmark = []
    algos = [
        sp.SVDpp(), sp.SVD(), sp.SlopeOne(), sp.NMF(), sp.NormalPredictor(),
        sp.KNNBaseline(), sp.KNNBasic(), sp.KNNWithMeans(), sp.KNNWithZScore(),
        sp.BaselineOnly(), sp.CoClustering()
    ]

    # Iterate over all algorithms
    for algorithm in algos:
        # Perform cross validation
        results = cross_validate(algorithm, data_object, measures=['RMSE'],
                                 cv=3, verbose=False)
        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        # Series.append was removed in pandas 2.0; pd.concat is the portable spelling.
        tmp = pd.concat([tmp, pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                                        index=['Algorithm'])])
        benchmark.append(tmp)

    benchmark = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
    return benchmark
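# Example call (assuming `data_object` was built with read_data_surprise):
# the returned frame is indexed by algorithm name and sorted by test RMSE,
# so the best-performing algorithm appears first.
benchmark_df = algo_tester(data_object)
print(benchmark_df[['test_rmse', 'fit_time', 'test_time']])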
uid_list = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

# TODO - 2-1-1. KNNBasic, cosine
sim_options = {'name': 'cosine'}
algo = surprise.KNNBasic(sim_options=sim_options)
algo.fit(trainset)
results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
with open('2-1-1_results.txt', 'w') as f:
    for uid, ratings in sorted(results.items(), key=lambda x: int(x[0])):
        f.write('User ID %s top-10 results\n' % uid)
        for iid, score in ratings:
            f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
        f.write('\n')

# TODO - 2-1-2. KNNWithMeans, pearson
sim_options2 = {'name': 'pearson'}
algo = surprise.KNNWithMeans(sim_options=sim_options2)
algo.fit(trainset)
results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
with open('2-1-2_results.txt', 'w') as f:
    for uid, ratings in sorted(results.items(), key=lambda x: int(x[0])):
        f.write('User ID %s top-10 results\n' % uid)
        for iid, score in ratings:
            f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
        f.write('\n')

# TODO - 2-2. Best Model
bsl_options_ub = {'n_epochs': 30, 'method': 'als', 'reg_i': 10, 'reg_u': 0}
sim_options_ub = {'name': 'msd', 'min_support': 1, 'user_based': True}
best_algo_ub = surprise.KNNBaseline(sim_options=sim_options_ub,
                                    bsl_options=bsl_options_ub,
def main(args):
    parser = argparse.ArgumentParser(
        description='Deploys recommendation algorithms and outputs the recommendations list',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--pickleLoadPath", type=str, action='store',
                        help='If set => load topN recoms from pickle file')
    parser.add_argument("--pickleSavePath", type=str, action='store',
                        help='If set => output .pickle file.')
    parser.add_argument("--proc", type=int, default=multiprocessing.cpu_count(),
                        action='store',
                        help='Number of processes to spawn for topN computation;\n'
                             'default is the number of processors.')
    parser.add_argument("--update_freq", type=int, default=1, action='store',
                        help='Number of clicks after which the model is updated')
    parser.add_argument("--topN_list", type=int, nargs="+", required=True,
                        help='e.g., --topN_list 5 10 50\n'
                             'topN=max(topN_list); the rest of the values are used for evaluation.')
    parser.add_argument("--drop_ratio", type=int, default=0, action='store',
                        help='Number of random events to remove from the training set;\n'
                             'default is 0; currently not implemented for librec.')
    parser.add_argument("--evalTrain", dest='evalTrain', action='store_true',
                        help='If set => evaluate on the training set using k-fold validation.\n'
                             'Else => evaluate only on the test set')
    parser.add_argument("--dataset", type=str, action='store',
                        help='Full path to the dataset.\n'
                             'Must give --testSize and --validSize for the split')
    parser.add_argument("--testSize", type=int, default=0, action='store',
                        help='Test set size; default is 0 => no test set')
    parser.add_argument("--validSize", type=int, default=2000, action='store',
                        help='Validation set size; default is 2000.')
    parser.add_argument("--trainSet", type=str, action='store',
                        help='Full path to the trainingSet.csv\n'
                             'If given, the (potential) training set split from --dataset will be overwritten')
    parser.add_argument("--validSet", type=str, action='store',
                        help='Full path to the validationSet.csv\n'
                             'If given, the (potential) validation set split from --dataset will be overwritten')
    parser.add_argument("--testSet", type=str, action='store',
                        help='Full path to the testSet.csv\n'
                             'If given, the (potential) test set split from --dataset will be overwritten')
    parser.add_argument("--librec_home", type=str, action='store',
                        help='Full path to the librec folder cloned from git.')
    parser.add_argument("--config", type=str, action='store',
                        help='Full path to the librec .properties file.\n'
                             'Copy from: https://www.librec.net/dokuwiki/doku.php?id=AlgorithmList')
    parser.add_argument("--surprise_algo", type=str, action='store',
                        help='Choose an algorithm from the surprise lib. Available options:\n'
                             '--surprise_algo SVD\n'
                             '--surprise_algo SVDpp\n'
                             '--surprise_algo PMF\n'
                             '--surprise_algo NMF\n'
                             '--surprise_algo KNNWithMeans\n')
    args = parser.parse_args(args)

    random.seed(42)  # reproducibility
    np.random.seed(42)

    if args.pickleLoadPath is None:
        """DATA"""
        train, valid, test = splitter.splitData(
            fullDataPath=args.dataset, validSize=args.validSize, testSize=args.testSize,
            trainSetPath=args.trainSet, validSetPath=args.validSet, testSetPath=args.testSet)

        """RECOMMENDATIONS"""
        if args.surprise_algo == 'SVD':
            algo = surprise.SVD()
        elif args.surprise_algo == 'KNNWithMeans':
            # sim_options = {'name': 'pearson_baseline', 'shrinkage': 2500,
            #                'user_based': False}
            sim_options = {'name': 'cosine', 'user_based': False}
            algo = surprise.KNNWithMeans(k=40, sim_options=sim_options)
        elif args.surprise_algo == 'PMF':
            algo = surprise.SVD(n_factors=5, reg_all=0.12, lr_all=0.005, n_epochs=400)
        elif args.surprise_algo == 'NMF':
            algo = surprise.NMF(n_factors=5, n_epochs=400)
        elif args.surprise_algo == 'SVDpp':
            algo = surprise.SVDpp()

        testList = []  # output recommendations for the last element
        if len(test) > 0:
            testList.append(test)
        if len(valid) > 0:
            testList.append(valid)

        for test in testList:
            if args.librec_home is None:
                recs = surprise_recom(train, test, algo, drop_ratio=args.drop_ratio,
                                      update_freq=args.update_freq, N_list=args.topN_list,
                                      num=args.proc, evalTrain=args.evalTrain)
            else:
                recs = librec_recom(train, test, args.librec_home, args.config,
                                    update_freq=args.update_freq, N_list=args.topN_list,
                                    num=args.proc, evalTrain=args.evalTrain)

        if args.pickleSavePath is not None:
            with open(args.pickleSavePath, 'wb') as handle:
                pickle.dump(recs, handle)
    else:
        with open(args.pickleLoadPath, 'rb') as handle:
            recs = pickle.load(handle)
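# Hypothetical invocation of the script above (the script name and all paths
# are placeholders, not from the original):
#
#   python deploy_recs.py --dataset ./data/ratings.csv --testSize 2000 \
#       --validSize 2000 --topN_list 5 10 50 --surprise_algo KNNWithMeans \
#       --pickleSavePath ./recs.pickle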
import csv
import os

import pandas as pd
import surprise
from surprise import model_selection


def read_train_data(path):
    file_path = os.path.normpath(path)
    reader = surprise.Reader(line_format='timestamp user item rating', sep=',')
    data = surprise.Dataset.load_from_file(file_path, reader=reader)
    return data


movie_train = read_train_data('E:/train_v2.csv')
print(movie_train.raw_ratings)

knn_grid = {
    'k': [10, 20],
    'sim_options': {
        'name': ['cosine'],
        'min_support': [1, 5],
        'user_based': [False]
    }
}
# GridSearchCV expects the algorithm *class*, not an instance.
knn_grid_estimator = model_selection.GridSearchCV(surprise.KNNWithMeans,
                                                  knn_grid,
                                                  measures=['rmse'],
                                                  cv=3)
# do grid search using the cv strategy
knn_grid_estimator.fit(movie_train)
print(knn_grid_estimator.best_score['rmse'])
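# After the search, the winning configuration can be refit on the full data
# (a sketch; best_estimator and best_params are keyed by measure name):
print(knn_grid_estimator.best_params['rmse'])
best_knn = knn_grid_estimator.best_estimator['rmse']
best_knn.fit(movie_train.build_full_trainset())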
def main(train_df, target_df, cache_name="test", force_recompute=[]):
    """Train multiple models on train_df and predict target_df.

    Predictions are cached. If the indices don't match the indices of
    target_df, the cache is discarded.

    By default, if a method was already computed it is not recomputed again
    (except if the method name is listed in force_recompute). cache_name is
    the name to use to read and write the cache.

    Arguments:
        train_df {dataframe} -- Training dataframe
        target_df {dataframe} -- Testing dataframe

    Keyword Arguments:
        cache_name {str} -- Name to use for caching (default: {"test"})
        force_recompute {list} -- Name(s) of methods to recompute, whether or
            not they were already computed. Useful to recompute single methods
            without discarding the rest. (default: {[]})

    Returns:
        Dataframe -- Dataframe with predictions for each method as columns,
            IDs as indices
    """
    global algo_in_use
    CACHED_DF_FILENAME = os.path.dirname(os.path.abspath(__file__)) + \
        "/cache/cached_predictions_{}.pkl".format(cache_name)
    train_df = preprocess_df(train_df)
    trainset = pandas_to_data(train_df)
    ids_to_predict = target_df["Id"].to_list()

    # try to retrieve backup dataframe
    try:
        print("Retrieving cached predictions")
        all_algos_preds_df = pd.read_pickle(CACHED_DF_FILENAME)
        print("Ensuring cached IDs match given IDs")
        assert sorted(ids_to_predict) == sorted(all_algos_preds_df.index.values)
        print("Indices match, continuing")
    except (FileNotFoundError, AssertionError):
        print("No valid cached predictions found")
        all_algos_preds_df = pd.DataFrame(ids_to_predict, columns=["Id"])
        all_algos_preds_df.set_index("Id", inplace=True)

    all_algos = {
        "SVD": spr.SVD(n_factors=200, n_epochs=100),
        "Baseline": spr.BaselineOnly(),
        "NMF": spr.NMF(n_factors=30, n_epochs=100),
        "Slope One": spr.SlopeOne(),
        "KNN Basic": spr.KNNBasic(k=60),
        "KNN Means": spr.KNNWithMeans(k=60),
        "KNN Baseline": spr.KNNBaseline(),
        "KNN Zscore": spr.KNNWithZScore(k=60),
        "SVD ++": spr.SVDpp(n_factors=40, n_epochs=100),
        "Co Clustering": spr.CoClustering()
    }

    for name in all_algos:
        print("##### {} ####".format(name))
        if name in force_recompute and name in all_algos_preds_df.columns:
            all_algos_preds_df.drop(name, axis=1, inplace=True)
        if name in all_algos_preds_df.columns:
            print("Already computed {}, skipping".format(name))
            continue
        algo = all_algos[name]
        time.sleep(1)
        algo.fit(trainset)
        time.sleep(1)
        algo_in_use = algo
        print("Generating predictions...")
        predictions = parallelize_predictions(ids_to_predict, 80)
        print("Done. Merging with previous results")
        this_algo_preds_df = pd.DataFrame(predictions, columns=["Id", name])
        this_algo_preds_df.set_index("Id", inplace=True)
        all_algos_preds_df = pd.merge(all_algos_preds_df, this_algo_preds_df,
                                      left_index=True, right_index=True)
        all_algos_preds_df.to_pickle(CACHED_DF_FILENAME)
    print("DONE computing surprise")
    return all_algos_preds_df
train_path = path + '/Data/train_format.txt'
train_reader = Reader(line_format='user item rating timestamp', sep=',',
                      rating_scale=(0, 5))
trainset = Dataset.load_from_file(train_path, reader=train_reader)
trainset = trainset.build_full_trainset()

if args.model == 'NormalPredictor':
    model = surprise.NormalPredictor()
elif args.model == 'BaselineOnly':
    model = surprise.BaselineOnly()
elif args.model == 'KNNBasic':
    model = surprise.KNNBasic()
elif args.model == 'KNNWithMeans':
    model = surprise.KNNWithMeans()
elif args.model == 'KNNWithZScore':
    model = surprise.KNNWithZScore()
elif args.model == 'KNNBaseline':
    model = surprise.KNNBaseline()
elif args.model == 'SVD':
    model = surprise.SVD()
elif args.model == 'SVDpp':
    model = surprise.SVDpp(verbose=True)
elif args.model == 'NMF':
    model = surprise.NMF()
elif args.model == 'SlopeOne':
    model = surprise.SlopeOne()
elif args.model == 'CoClustering':
    model = surprise.CoClustering()
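# Whichever branch was taken, the selected model is presumably fit on the
# full trainset built above (a sketch of the next step, not in the original):
model.fit(trainset)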
normalPredictor = surprise.NormalPredictor()
normalPredictor_temp = surprise.model_selection.cross_validate(
    normalPredictor, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('normalPredictor--------------')
print(normalPredictor_temp)

baselineOnly = surprise.BaselineOnly()
baselineOnly_temp = surprise.model_selection.cross_validate(
    baselineOnly, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('baselineOnly-----------------')
print(baselineOnly_temp)

knnBasic = surprise.KNNBasic()
knnBasic_temp = surprise.model_selection.cross_validate(
    knnBasic, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBasic-----------------')
print(knnBasic_temp)

knnWithMeans = surprise.KNNWithMeans()
knnWithMeans_temp = surprise.model_selection.cross_validate(
    knnWithMeans, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnWithMeans-----------------')
print(knnWithMeans_temp)

knnBaseline = surprise.KNNBaseline()
knnBaseline_temp = surprise.model_selection.cross_validate(
    knnBaseline, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBaseline-----------------')
print(knnBaseline_temp)

svdpp = surprise.SVDpp()
svdpp_temp = surprise.model_selection.cross_validate(
    svdpp, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
# with open('result.txt', 'w') as file:
#     for est_n, true_n in GridList:
#         precisions = []
#         recalls = []
#         for trainset, testset in kf.split(data):
#             algo.fit(trainset)
#             predictions = algo.test(testset, verbose=False)
#             prec, rec = precision_recall(predictions, est_n, true_n)
#             precisions.append(prec)
#             recalls.append(rec)
#             # precisions, recalls = precision_recall_at_k(predictions)
#             # # Precision and recall can then be averaged over all users
#             # precision.append(sum(prec for prec in precisions.values()) / len(precisions))
#             # recall.append(sum(rec for rec in recalls.values()) / len(recalls))
#             # mae.append(accuracy.mae(predictions))
#             # rmse.append(accuracy.rmse(predictions))
#         precision = sum(precisions) / 5
#         recall = sum(recalls) / 5
#         print(precision, recall)
#         file.write(str(precision) + '\n')
#         file.write(str(recall) + '\n')
# print(sum(mae) / 5)
# print(sum(rmse) / 5)

cross_validate(surprise.BaselineOnly(), data, cv=5, verbose=True)
cross_validate(surprise.KNNBasic(), data, cv=5, verbose=True)
cross_validate(surprise.KNNWithMeans(), data, cv=5, verbose=True)
cross_validate(surprise.KNNBaseline(), data, cv=5, verbose=True)
cross_validate(surprise.SVD(), data, cv=5, verbose=True)
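# A sketch of the precision_recall_at_k helper referenced in the commented
# block above, following the pattern from the Surprise FAQ; the default
# k and rating threshold are assumptions.
from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return per-user precision and recall at k for a list of predictions."""
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    precisions, recalls = {}, {}
    for uid, user_ratings in user_est_true.items():
        # rank this user's items by estimated rating
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    return precisions, recalls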
bsl_options = {'method': 'sgd',
               'learning_rate': .0005}

sim_options = {'name': 'cosine',
               'min_support': 5,
               'user_based': True}

algo1 = surprise.KNNBasic(sim_options=sim_options)
algo2 = surprise.KNNWithMeans(k=15, min_k=5, sim_options=sim_options)
algo3 = surprise.SVD(n_factors=20000, n_epochs=20, biased=False, init_mean=0,
                     init_std_dev=0.1, lr_all=0.005, lr_bu=None, lr_bi=None,
                     lr_qi=None, reg_all=None, reg_bu=0, reg_bi=0, reg_pu=0,
                     reg_qi=0, verbose=True)
#algo4 = surprise.SVDpp(n_factors=20, n_epochs=20, init_mean=0, init_std_dev=0.1,
#                       lr_all=0.005, reg_all=None, verbose=True)

print('Training model...')
algo1.fit(trainset)  # fit() replaces the train() method removed from Surprise
print('Making predictions...')
predictions = algo1.test(test_set)
print('Evaluating results...')
surprise.accuracy.rmse(predictions, verbose=True)
def part3():
    file_path = 'DMA_project2_team%02d_part2_UIR.csv' % team
    reader = Reader(line_format='user item rating', sep=',',
                    rating_scale=(1, 10), skip_lines=1)
    data = Dataset.load_from_file(file_path, reader=reader)
    trainset = data.build_full_trainset()
    testset = trainset.build_anti_testset()

    # TODO: Requirement 3-2. User-based Recommendation
    uid_list = [
        'ffffbe8d854a4a5a8ab1a381224f5b80',
        'ffe2f26d5c174e13b565d026e1d8c503',
        'ffdccaff893246519b64d76c3561d8c7',
        'ffdb001850984ce69c5f91360ac16e9c',
        'ffca7b070c9d41e98eba01d23a920d52'
    ]

    # TODO - set algorithm for 3-2-1
    algo = surprise.KNNBasic(k=40, min_k=1,
                             sim_options={'name': 'cosine', 'user_based': True},
                             verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-2-1.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - set algorithm for 3-2-2
    algo = surprise.KNNWithMeans(k=40, min_k=1,
                                 sim_options={'name': 'pearson', 'user_based': True},
                                 verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-2-2.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - 3-2-3. Best Model
    kfold = KFold(n_splits=5, random_state=0)
    parameters = {
        'k': [30, 40, 50],
        'min_k': [1],
        'sim_options': {
            'name': ['pearson', 'cosine'],
            'user_based': [True]
        }
    }
    # Select the best algo with grid search.
    print('Grid Search for user based model...')
    grid_KNNBasic = GridSearchCV(surprise.KNNBasic, measures=['rmse'],
                                 param_grid=parameters, cv=kfold)
    grid_KNNWithMeans = GridSearchCV(surprise.KNNWithMeans, measures=['rmse'],
                                     param_grid=parameters, cv=kfold)
    grid_KNNBasic.fit(data)
    grid_KNNWithMeans.fit(data)

    best_KNNBasic_score = grid_KNNBasic.best_score['rmse']
    best_KNNWithMeans_score = grid_KNNWithMeans.best_score['rmse']
    if best_KNNBasic_score < best_KNNWithMeans_score:
        algo_name = 'KNNBasic'
        best_algo_ub = grid_KNNBasic.best_estimator['rmse']
        with_parameters = grid_KNNBasic.best_params['rmse']
        score = best_KNNBasic_score
    else:
        algo_name = 'KNNWithMeans'
        best_algo_ub = grid_KNNWithMeans.best_estimator['rmse']
        with_parameters = grid_KNNWithMeans.best_params['rmse']
        score = best_KNNWithMeans_score
    print('The best UB algorithm is', algo_name, 'with', with_parameters,
          '\nscore:', score)

    # TODO: Requirement 3-3. Item-based Recommendation
    iid_list = ['art', 'teaching', 'career', 'college', 'medicine']

    # TODO - set algorithm for 3-3-1
    algo = surprise.KNNBasic(k=40, min_k=1,
                             sim_options={'name': 'cosine', 'user_based': False},
                             verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, iid_list, n=10, user_based=False)
    with open('3-3-1.txt', 'w') as f:
        for iid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('Item ID %s top-10 results\n' % iid)
            for uid, score in ratings:
                f.write('User ID %s\tscore %s\n' % (uid, str(score)))
            f.write('\n')

    # TODO - set algorithm for 3-3-2
    algo = surprise.KNNWithMeans(k=40, min_k=1,
                                 sim_options={'name': 'pearson', 'user_based': False},
                                 verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, iid_list, n=10, user_based=False)
    with open('3-3-2.txt', 'w') as f:
        for iid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('Item ID %s top-10 results\n' % iid)
            for uid, score in ratings:
                f.write('User ID %s\tscore %s\n' % (uid, str(score)))
            f.write('\n')

    # TODO - 3-3-3. Best Model
    kfold = KFold(n_splits=5, random_state=0)
    parameters = {
        'k': [30, 40, 50],
        'min_k': [1],
        'sim_options': {
            'name': ['pearson', 'cosine'],
            'user_based': [False]
        }
    }
    # Select the best algo with grid search.
    print('Grid Search for item based model...')
    grid_KNNBasic = GridSearchCV(surprise.KNNBasic, measures=['rmse'],
                                 param_grid=parameters, cv=kfold)
    grid_KNNWithMeans = GridSearchCV(surprise.KNNWithMeans, measures=['rmse'],
                                     param_grid=parameters, cv=kfold)
    grid_KNNBasic.fit(data)
    grid_KNNWithMeans.fit(data)

    best_KNNBasic_score = grid_KNNBasic.best_score['rmse']
    best_KNNWithMeans_score = grid_KNNWithMeans.best_score['rmse']
    if best_KNNBasic_score < best_KNNWithMeans_score:
        algo_name = 'KNNBasic'
        # ib, not ub, so the user-based winner above is not overwritten
        best_algo_ib = grid_KNNBasic.best_estimator['rmse']
        with_parameters = grid_KNNBasic.best_params['rmse']
        score = best_KNNBasic_score
    else:
        algo_name = 'KNNWithMeans'
        best_algo_ib = grid_KNNWithMeans.best_estimator['rmse']
        with_parameters = grid_KNNWithMeans.best_params['rmse']
        score = best_KNNWithMeans_score
    print('The best IB algorithm is', algo_name, 'with', with_parameters,
          '\nscore:', score)

    # TODO: Requirement 3-4. Matrix-factorization Recommendation
    # TODO - set algorithm for 3-4-1
    algo = surprise.SVD(n_factors=100, n_epochs=50, biased=False)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-4-1.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - set algorithm for 3-4-2
    algo = surprise.SVD(n_factors=200, n_epochs=100, biased=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-4-2.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - set algorithm for 3-4-3
    algo = surprise.SVDpp(n_factors=100, n_epochs=50)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-4-3.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - set algorithm for 3-4-4
    algo = surprise.SVDpp(n_factors=100, n_epochs=100)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-4-4.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - 3-4-5. Best Model
    kfold = KFold(n_splits=5, random_state=0)
    parameters_SVD = {
        'n_factors': [50, 100, 200],
        'n_epochs': [10, 50, 100, 200],
        'biased': [True, False]
    }
    grid_SVD = GridSearchCV(surprise.SVD, measures=['rmse'],
                            param_grid=parameters_SVD, cv=kfold)
    parameters_SVDpp = {
        'n_factors': [50, 100, 200],
        'n_epochs': [10, 50, 100, 200]
    }
    grid_SVDpp = GridSearchCV(surprise.SVDpp, measures=['rmse'],
                              param_grid=parameters_SVDpp, cv=kfold)
    grid_SVD.fit(data)
    grid_SVDpp.fit(data)

    best_SVD_score = grid_SVD.best_score['rmse']
    best_SVDpp_score = grid_SVDpp.best_score['rmse']
    if best_SVD_score < best_SVDpp_score:
        algo_name = 'SVD'
        best_algo_mf = grid_SVD.best_estimator['rmse']
        with_parameters = grid_SVD.best_params['rmse']
        score = best_SVD_score
    else:
        algo_name = 'SVDpp'
        best_algo_mf = grid_SVDpp.best_estimator['rmse']
        with_parameters = grid_SVDpp.best_params['rmse']
        score = best_SVDpp_score
    print('The best MF algorithm is', algo_name, 'with', with_parameters,
          '\nscore:', score)
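# A sketch of the get_top_n helper used throughout this part; the signature
# matches the calls above, but this implementation is an assumption.
from collections import defaultdict

def get_top_n(algo, testset, id_list, n=10, user_based=True):
    """Return {id: [(other_id, est), ...]} with the n highest estimates."""
    results = defaultdict(list)
    if user_based:
        # keep only (user, item) pairs whose user id is in id_list
        for uid, iid, _ in (x for x in testset if x[0] in id_list):
            results[uid].append((iid, algo.predict(uid, iid).est))
    else:
        # item-based: group candidate users under each item id
        for uid, iid, _ in (x for x in testset if x[1] in id_list):
            results[iid].append((uid, algo.predict(uid, iid).est))
    for key in results:
        results[key].sort(key=lambda x: x[1], reverse=True)
        results[key] = results[key][:n]
    return results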
print("Done.") # defining the number of folds = 5 print("Performing splits...") kf = sp.model_selection.KFold(n_splits=5, random_state=0) print("Done.") ### ### PART 1.1 ### ''' application of all algorithms for recommendation made available by “Surprise” libraries, according to their default configuration. ''' algorithms = [sp.NormalPredictor(), sp.BaselineOnly(), sp.KNNBasic(),\ sp.KNNWithMeans(), sp.KNNWithZScore(), sp.KNNBaseline(),\ sp.SVD(), sp.SVDpp(), sp.NMF(), sp.SlopeOne(), sp.CoClustering()] for elem in algorithms: start_time = time.time() algo = elem sp.model_selection.cross_validate(algo, data, measures=['RMSE'], \ cv=kf, n_jobs = 2, verbose=True) print("--- %s seconds ---" % (time.time() - start_time)) print() ### ### PART 1.2 ### ''' Improvement of the quality of both KNNBaseline and SVD methods, by performing hyper-parameters tuning over 5-folds
import surprise as sp
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

import NetflixDataLoad

# use only 100000 rows for fast processing;
# load_from_df requires a Reader, and Netflix ratings run from 1 to 5
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    NetflixDataLoad.df_filterd[['Cust_Id', 'Movie_Id', 'Rating']][:100000], reader)

n_folds = 5
for algo in [sp.SVD(), sp.SVDpp(), sp.KNNBasic(), sp.KNNWithMeans()]:
    print(cross_validate(algo, data, measures=['RMSE', 'MAE'],
                         cv=n_folds, verbose=True))

# Output Example
# Evaluating RMSE, MAE of algorithm SVD on 5 split(s).
#
#            Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std
# RMSE       0.9311  0.9370  0.9320  0.9317  0.9391  0.9342  0.0032
# MAE        0.7350  0.7375  0.7341  0.7342  0.7375  0.7357  0.0015
# Fit time   6.53    7.11    7.23    7.15    3.99    6.40    1.23
# Test time  0.26    0.26    0.25    0.15    0.13    0.21    0.06
}

mean_ap = []
precision = []
recall = []
fscore = []
normalized_DCG = []
mean_ap_train = []
precision_train = []
recall_train = []
fscore_train = []
normalized_DCG_train = []

for k_val in ks:
    print(k_val)
    algo = surprise.KNNWithMeans(k=k_val, sim_options=sim_options)
    pr = 0
    re = 0
    fs = 0
    ap = 0
    nd = 0
    pr_train = 0
    re_train = 0
    fs_train = 0
    ap_train = 0
    nd_train = 0
    # KFold.split() and fit() replace the deprecated data.folds()/train() API
    for trainset, testset in surprise.model_selection.KFold().split(data):
        algo.fit(trainset)
        predictions_on_test = algo.test(testset)
        precisions_test, recalls_test = precision_recall_at_k(
# A reader is needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_df, reader)

# Outside of a grid search, sim_options values are scalars, not lists.
sim_options = {'user_based': False}

results = []
# Iterate over all algorithms
for algorithm in [
        SVD(),
        surprise.NMF(),
        surprise.SlopeOne(),
        surprise.CoClustering(),
        surprise.KNNBasic(sim_options=sim_options),
        surprise.KNNWithMeans(sim_options=sim_options),
        surprise.KNNWithZScore(sim_options=sim_options),
        surprise.KNNBaseline(sim_options=sim_options),
        surprise.NormalPredictor(),
        surprise.BaselineOnly()
]:
    # Get a useful string of the algorithm name for naming a pickle file
    alg_name = str(algorithm)
    alg_name = alg_name[alg_name.find('.') + 1:]
    alg_name = alg_name[alg_name.find('.') + 1:]
    alg_name = alg_name[alg_name.find('.') + 1:]
    alg_name = alg_name[:alg_name.find('object') - 1]

    # Take a look at cross validation results to compare model types
    print('\n\nModeling: {}\n'.format(str(alg_name)))
          epochs=2,
          validation_split=0.1,
          shuffle=True)

y_pred = model.predict([df_hybrid_test['User'], df_hybrid_test['Movie'], test_tfidf])
y_true = df_hybrid_test['Rating'].values

rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse))

# Load dataset into Surprise-specific data structure
data = sp.Dataset.load_from_df(
    df_filterd[['User', 'Movie', 'Rating']].sample(20000), sp.Reader())

benchmark = []
# Iterate over all algorithms
for algorithm in [sp.SVD(), sp.SVDpp(), sp.SlopeOne(), sp.NMF(),
                  sp.NormalPredictor(), sp.KNNBaseline(), sp.KNNBasic(),
                  sp.KNNWithMeans(), sp.KNNWithZScore(), sp.BaselineOnly(),
                  sp.CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'],
                             cv=3, verbose=False)

    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    # Series.append was removed in pandas 2.0; pd.concat is the portable spelling.
    tmp = pd.concat([tmp, pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                                    index=['Algorithm'])])

    # Store data
    benchmark.append(tmp)

# Store results
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
    'test_rmse', ascending=False)

# Get data
data = surprise_results[['test_rmse', 'test_mae']]
for j in tp.columns[1:-1]:
    tp[j] = tp[j].replace(np.nan, tpmean[j])
mb += [tp]
mb = pd.concat(mb, axis=0, ignore_index=True)

# Model Based
df = pd.read_csv(
    'C:/Users/mayij/Desktop/DOC/GITHUB/MLGH/collaborative filtering/movielens.csv')
reader = surprise.Reader(rating_scale=(1, 5))
data = surprise.Dataset.load_from_df(df[['user', 'item', 'rating']], reader)
trainset = data.build_full_trainset()

# KNN
algo = surprise.KNNWithMeans(sim_options={'name': 'cosine',
                                          'user_based': False})
algo.fit(trainset)

gs = surprise.model_selection.GridSearchCV(
    surprise.KNNWithMeans,
    {'sim_options': {'name': ['msd', 'cosine'],
                     'min_support': [3, 4, 5],
                     'user_based': [False, True]}},
    measures=['rmse', 'mae'],
    cv=3)
gs.fit(data)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])
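# The tuned configuration can then back a refit model for point predictions
# (a sketch; the raw user and item ids below are placeholders):
best_knn = gs.best_estimator['rmse']
best_knn.fit(data.build_full_trainset())
print(best_knn.predict(1, 31).est)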