def train(self):
    """Cross-validate an SVD++ model, predict the test set and save top-N lists.

    Steps, in order:

    1. ``self.dataSetConstruct()`` is called (defined elsewhere; the code
       below reads ``self.data`` and ``self.testset`` afterwards, so it
       presumably prepares them -- confirm).
    2. An ``SVDpp`` estimator is stored in ``self.algo`` and run through
       3-fold cross-validation reporting RMSE and MAE.
    3. ``self.algo.test(self.testset)`` produces ``self.predictions``.
    4. Predictions are grouped per user id in ``self.predictionDict`` and each
       list is cut to the ``self.n`` highest estimates.
    5. The dictionary is pickled to ``self.dictPath``.

    NOTE(review): there is no explicit ``self.algo.fit(...)`` on a full
    training set; step 3 relies on the state ``cross_validate`` leaves in
    ``self.algo``. Confirm this is intended.
    """
    self.dataSetConstruct()

    print("model training...")
    self.algo = surprise.SVDpp(n_factors=1, n_epochs=300, lr_all=0.001,
                               reg_all=0.01)
    surprise.model_selection.cross_validate(self.algo, self.data,
                                            measures=['RMSE', 'MAE'],
                                            cv=3, verbose=True)
    print("model training complete")

    print("Making predictions...")
    self.predictions = self.algo.test(self.testset)
    print("Predictions made")

    # Group (item id, estimated rating) pairs by user id.
    self.predictionDict = defaultdict(list)
    for uid, iid, true_r, est, _ in self.predictions:
        self.predictionDict[uid].append((iid, est))

    print("Sorting results...")
    # Only existing keys are reassigned, so iterating items() here is safe.
    for uid, ratings in self.predictionDict.items():
        best_first = sorted(ratings, key=lambda pair: pair[1], reverse=True)
        self.predictionDict[uid] = best_first[0:self.n]
    print("Sorting complete")

    # The context manager closes the file even if pickling fails.
    with open(self.dictPath, 'wb') as dict_file:
        pickle.dump(self.predictionDict, dict_file)
    print('Dict saved')
def __init__(self, hyper_params, user_count, item_count):
    """Create the Surprise estimator selected by ``hyper_params``.

    Args:
        hyper_params: Mapping that must contain ``'latent_size'`` (used as
            ``n_factors`` by the factorization models) and ``'model_type'``,
            one of ``'kNN'``, ``'NMF'``, ``'SVD'``, ``'SVD++'`` or
            ``'baseline'``.
        user_count: Stored unchanged on ``self.user_count``.
        item_count: Stored unchanged on ``self.item_count``.

    Raises:
        KeyError: If ``hyper_params`` lacks ``'latent_size'`` or
            ``'model_type'``.
        ValueError: If ``model_type`` is not one of the supported names.
            Previously this case left ``self.model`` unset without any error.
    """
    latent_size = hyper_params['latent_size']
    model_type = hyper_params['model_type']

    if model_type == 'kNN':
        self.model = surprise.prediction_algorithms.knns.KNNBasic(
            k=10, verbose=True)
    elif model_type == 'NMF':
        self.model = surprise.NMF(n_factors=latent_size, biased=False,
                                  n_epochs=50, verbose=True)
    elif model_type == 'SVD':
        self.model = surprise.SVD(n_factors=latent_size, verbose=True)
    elif model_type == 'SVD++':
        self.model = surprise.SVDpp(n_factors=latent_size, verbose=True)
    elif model_type == 'baseline':
        bsl_options = {
            'method': 'sgd',
            'n_epochs': 20,
        }
        self.model = surprise.prediction_algorithms.baseline_only.BaselineOnly(
            bsl_options=bsl_options, verbose=True)
    else:
        raise ValueError(
            "Unsupported model_type {!r}; expected one of 'kNN', 'NMF', "
            "'SVD', 'SVD++', 'baseline'".format(model_type))

    self.hyper_params = hyper_params
    self.user_count = user_count
    self.item_count = item_count
def surprise_SVDpp(trainset, finalset):
    """Fit an SVD++ model on ``trainset`` and return estimates for ``finalset``.

    The model uses 40 factors, 20 epochs and a learning rate of 0.001 for all
    parameters. The Surprise predictions for ``finalset`` are passed through
    ``spr_estimate_to_vect`` (defined elsewhere in this file) and its result
    is returned unchanged.
    """
    model = spr.SVDpp(n_factors=40, n_epochs=20, lr_all=0.001)
    model.fit(trainset)
    return spr_estimate_to_vect(model.test(finalset))
def SVDpp(train, test, rate):
    """
    Train Surprise's SVD++ on ``train`` and predict ``test``.

    The model uses 40 factors; the number of epochs is not passed, so the
    library default applies (documented as 20 for SVD++).

    @param train: the training set in the Surprise format.
    @param test: the test set in the Surprise format.
    @param rate: the learning rate applied to all parameters.
    @return: whatever ``get_predictions`` (defined elsewhere in this file)
             builds from the Surprise prediction list.
    """
    model = spr.SVDpp(n_factors=40, lr_all=rate, verbose=True)
    model.fit(train)
    return get_predictions(model.test(test))
def algo_tester(data_object):
    '''
    Produces a dataframe comparing the mean RMSE, fit time and test time of
    the different surprise algorithms, each run with its default settings
    through 3-fold cross-validation.

    ---Parameters---
    data_object(variable) created from the read_data_surprise function

    ---Returns---
    returns a dataframe indexed by algorithm name and sorted by ascending
    'test_rmse', where you can compare the performance of different
    algorithms
    '''
    benchmark = []
    algos = [
        sp.SVDpp(),
        sp.SVD(),
        sp.SlopeOne(),
        sp.NMF(),
        sp.NormalPredictor(),
        sp.KNNBaseline(),
        sp.KNNBasic(),
        sp.KNNWithMeans(),
        sp.KNNWithZScore(),
        sp.BaselineOnly(),
        sp.CoClustering()
    ]

    # Iterate over all algorithms
    for algorithm in algos:
        # Perform cross validation
        results = cross_validate(algorithm, data_object, measures=['RMSE'],
                                 cv=3, verbose=False)

        # Average every reported metric over the folds.
        fold_means = pd.DataFrame.from_dict(results).mean(axis=0)

        # Series.append was removed in pandas 2.0; pd.concat builds the same
        # row on old and new pandas versions.
        label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
        benchmark.append(pd.concat([fold_means, label]))

    benchmark = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
        'test_rmse')
    return benchmark
def train_model(self):
    """Train SVD++ on the Spark training CSV and pickle top-30 predictions.

    Reads ``./Data/Spark_Training.csv``, cross-validates an SVD++ model
    (3 folds; RMSE, MAE and FCP are printed), fits it on the full training
    set, predicts every user/item pair absent from the training data and
    writes the 30 best estimates per user to
    ``./Saved Models/test_pred.pkl``.
    """

    def get_top_n(predictions, n=10):
        """Return {uid: [(iid, est), ...]} with the ``n`` best items per user."""
        top_n = collections.defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda pair: pair[1], reverse=True)
            # Honour the requested size; this used to be hard-coded to 30.
            top_n[uid] = user_ratings[:n]
        return top_n

    ratings = pd.read_csv('./Data/Spark_Training.csv')
    reader = surprise.Reader(rating_scale=(0.0, 4.0))
    data = surprise.Dataset.load_from_df(ratings, reader)
    algo = surprise.SVDpp(lr_all=0.001, n_factors=100, n_epochs=20,
                          reg_all=0.1)
    trainset = data.build_full_trainset()
    testset = trainset.build_anti_testset()

    print(
        'Training started. Depends on your machine, this process may take more than an hour'
    )

    # Cross-validate BEFORE the final fit: cross_validate fits the estimator
    # it is given on each fold, and when it runs in-process that would
    # overwrite a model previously trained on the full data.
    surprise.model_selection.cross_validate(
        algo, data, verbose=True, n_jobs=-2, cv=3,
        measures=['rmse', 'mae', 'fcp'])

    # The model used for the saved predictions is trained on all ratings.
    algo.fit(trainset)
    predictions = algo.test(testset)

    dump_pred = get_top_n(predictions, n=30)
    with open('./Saved Models/test_pred.pkl', 'wb') as f:
        pickle.dump(dump_pred, f, protocol=pickle.HIGHEST_PROTOCOL)
def nb_collaborative_filtering(self, critic, top_n=5):
    """Print the ``top_n`` unseen movies with the highest SVD++ estimate.

    An SVD++ model is trained on all of ``self.reviews_rs`` with a 0-10
    rating scale, then every movie id the critic has not reviewed is scored
    and the best ones are printed, highest estimate first.

    Args:
        critic: Key into ``self.critic_uid`` identifying the reviewer.
        top_n: Maximum number of recommendations to print. Values larger
            than the number of unseen movies print all of them; values
            <= 0 print none.

    NOTE(review): ``self.reviews_rs`` is passed whole to ``load_from_df``,
    which expects the user, item and rating columns in that order -- confirm
    the frame has exactly that layout.
    """
    reader = surprise.Reader(rating_scale=(0.0, 10.0))
    data = surprise.Dataset.load_from_df(self.reviews_rs, reader)

    alg = surprise.SVDpp()
    alg.fit(data.build_full_trainset())

    # Get a list of all unique movies
    movies_id = self.reviews_rs['id'].unique()

    # Get the movie ids this critic has already rated
    critic_id = self.critic_uid[critic]
    movies_id_critic = self.reviews_rs.loc[
        self.reviews_rs['critic_uid'] == critic_id, 'id']

    # Keep only the movies the critic has not rated
    movies_ids_to_pred = np.setdiff1d(movies_id, movies_id_critic)

    # The third field is a placeholder "true" rating; only .est is read below.
    testset = [[critic_id, movie_id, 10.0] for movie_id in movies_ids_to_pred]
    predictions = alg.test(testset)
    pred_ratings = np.array([pred.est for pred in predictions])

    # argsort copes with top_n larger than the candidate list (argpartition
    # raised) and yields a proper descending order.
    n_shown = max(0, min(top_n, len(pred_ratings)))
    best_first = np.argsort(pred_ratings)[::-1][:n_shown]

    print('Top movies for reviewer {0}: {1}'.format(
        critic_id, self.critics[critic_id]))
    for i in best_first:
        movie_id = movies_ids_to_pred[i]
        print('movie_id: {0} with predicted rating: {1}'.format(
            movie_id, pred_ratings[i]))
preprocessed_dataset = dblp.load_preprocessed_dataset() x_train, y_train, x_test, y_test = dblp.get_fold_data(fold_counter, preprocessed_dataset, train_test_indices) df_train = dblp.create_user_item(x_train, y_train) reader = sr.Reader(rating_scale=(1, 1)) data_train = sr.Dataset.load_from_df(df_train[['userID', 'itemID', 'rating']], reader) df_test = dblp.create_user_item(x_test, y_test) data_test_temp = sr.Dataset.load_from_df( df_test[['userID', 'itemID', 'rating']], reader) temp = data_test_temp.build_full_trainset() data_test = temp.build_anti_testset() algo = sr.SVDpp() algo.fit(data_train.build_full_trainset()) def precision_recall_at_k(predictions, k=10, threshold=3.5): '''Return precision and recall at k metrics for each user.''' # First map the predictions to each user. user_est_true = defaultdict(list) for uid, _, true_r, est, _ in predictions: user_est_true[uid].append((est, true_r)) precisions = dict() recalls = dict() for uid, user_ratings in user_est_true.items():
# Every supported --model value is the name of the Surprise class to build.
known_models = (
    'NormalPredictor', 'BaselineOnly', 'KNNBasic', 'KNNWithMeans',
    'KNNWithZScore', 'KNNBaseline', 'SVD', 'SVDpp', 'NMF', 'SlopeOne',
    'CoClustering',
)
# Only SVDpp is built with non-default arguments.
model_kwargs = {'SVDpp': {'verbose': True}}

# Unknown names fall through without assigning `model`, as before.
if args.model in known_models:
    model = getattr(surprise, args.model)(**model_kwargs.get(args.model, {}))

# cross_validate(model, trainset, cv=5, verbose=True)
model.fit(trainset)

lines = []
test_path = path + '/Data/test_format.txt'
# Read the file inside a context manager so the handle is closed; it used to
# be left open by open(...).readlines().
with open(test_path, 'r') as test_file:
    test_lines = test_file.readlines()
for line in tqdm(test_lines):
    # Each row is "user,item,timestamp[,tag...]".
    user_id, item_id, timestamp, *tags = line.strip().split(',')
    rating = model.predict(user_id, item_id).est
# Factories for the estimators selectable through --surprise_algo. They are
# callables so that an estimator is only built when it is actually needed.
_SURPRISE_ALGO_FACTORIES = {
    'SVD': lambda: surprise.SVD(),
    # Alternative similarity that was tried:
    # {'name': 'pearson_baseline', 'shrinkage': 2500, 'user_based': False}
    'KNNWithMeans': lambda: surprise.KNNWithMeans(
        k=40, sim_options={'name': 'cosine', 'user_based': False}),
    'PMF': lambda: surprise.SVD(n_factors=5, reg_all=0.12, lr_all=0.005,
                                n_epochs=400),
    'NMF': lambda: surprise.NMF(n_factors=5, n_epochs=400),
    'SVDpp': lambda: surprise.SVDpp(),
}


def main(args):
    """Compute (or load) top-N recommendation lists from the command line.

    Args:
        args: List of command-line tokens (without the program name).

    Behaviour:
        * With ``--pickleLoadPath`` the recommendations are read from that
          pickle file and nothing is computed.
        * Otherwise the data is split via ``splitter.splitData`` and
          recommendations are produced for the test set and then the
          validation set (each only if non-empty), using librec when
          ``--librec_home`` is given and the Surprise estimator named by
          ``--surprise_algo`` otherwise. After each run the result is
          pickled to ``--pickleSavePath`` when that option is set, so the
          file finally holds the last computed list.

    An unknown or missing ``--surprise_algo`` on the Surprise path now ends
    with the usual argparse usage error (exit status 2) before any data is
    loaded; it used to fail later with a NameError.
    """
    parser = argparse.ArgumentParser(
        description='Deploys recommendation algorithms and outputs the recommendations list',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--pickleLoadPath", type=str, action='store',
                        help='If set=> load topN recoms from pickle file')
    parser.add_argument("--pickleSavePath", type=str, action='store',
                        help='If set => Output .pickle file.')
    parser.add_argument("--proc", type=int,
                        default=multiprocessing.cpu_count(), action='store',
                        help='Number of processes to spawn for topN computation\n'
                        + 'default is number of processors.')
    parser.add_argument("--update_freq", type=int, default=1, action='store',
                        help='Number of clicks after which the model is updated')
    parser.add_argument("--topN_list", type=int, nargs="+", required=True,
                        help='e.g., --topN_list 5 10 50\n'
                        + 'topN=max(topN_list); the rest of the values are used for evaluation.')
    parser.add_argument("--drop_ratio", type=int, default=0, action='store',
                        help='Number of random events to remove from the training set;\n'
                        + 'default is 0; Currently not implemented for librec.')
    parser.add_argument("--evalTrain", dest='evalTrain', action='store_true',
                        help='If set => evaluate on training set using k-fold validation.\n'
                        + 'Else => evaluate only on test set')
    parser.add_argument("--dataset", type=str, action='store',
                        help='Full path to the dataset.\n'
                        + 'Must give --testSize and --validSize for the split')
    parser.add_argument("--testSize", type=int, default=0, action='store',
                        help='TestSet size; default is 0 => no test set')
    parser.add_argument("--validSize", type=int, default=2000, action='store',
                        help='Validation Set size; default is 2000.')
    parser.add_argument("--trainSet", type=str, action='store',
                        help='Full path to the trainingSet.csv\n'
                        + 'If given the (potential) training set split from --dataset will be overwritten')
    parser.add_argument("--validSet", type=str, action='store',
                        help='Full path to the validationSet.csv\n'
                        + 'If given the (potential) validation set split from --dataset will be overwritten')
    parser.add_argument("--testSet", type=str, action='store',
                        help='Full path to the testSet.csv\n'
                        + 'If given the (potential) test set split from --dataset will be overwritten')
    parser.add_argument("--librec_home", type=str, action='store',
                        help='Full path to the librec folder cloned from git.')
    parser.add_argument("--config", type=str, action='store',
                        help='Full path to the librec .properties file.\n'
                        + 'Copy from: https://www.librec.net/dokuwiki/doku.php?id=AlgorithmList')
    parser.add_argument("--surprise_algo", type=str, action='store',
                        help='Choose algorithm from surprise lib. Available options:\n'
                        + '--surprise_algo SVD\n'
                        + '--surprise_algo SVDpp\n'
                        + '--surprise_algo PMF\n'
                        + '--surprise_algo NMF\n'
                        + '--surprise_algo KNNWithMeans\n')
    args = parser.parse_args(args)

    # Fail fast: a Surprise run needs a known algorithm name.
    uses_surprise = args.pickleLoadPath is None and args.librec_home is None
    if uses_surprise and args.surprise_algo not in _SURPRISE_ALGO_FACTORIES:
        parser.error('--surprise_algo must be one of: '
                     + ', '.join(sorted(_SURPRISE_ALGO_FACTORIES)))

    random.seed(42)  # reproducibility
    np.random.seed(42)

    if args.pickleLoadPath is None:
        # DATA
        train, valid, test = splitter.splitData(
            fullDataPath=args.dataset, validSize=args.validSize,
            testSize=args.testSize, trainSetPath=args.trainSet,
            validSetPath=args.validSet, testSetPath=args.testSet)

        # RECOMMENDATIONS
        # One estimator instance is shared by all evaluation runs.
        algo = None
        if args.librec_home is None:
            algo = _SURPRISE_ALGO_FACTORIES[args.surprise_algo]()

        # Test set first, validation set last: the validation run is the one
        # whose recommendations remain in `recs` and in the pickle file.
        eval_sets = [split for split in (test, valid) if len(split) > 0]

        for eval_set in eval_sets:
            if args.librec_home is None:
                recs = surprise_recom(
                    train, eval_set, algo, drop_ratio=args.drop_ratio,
                    update_freq=args.update_freq, N_list=args.topN_list,
                    num=args.proc, evalTrain=args.evalTrain)
            else:
                recs = librec_recom(
                    train, eval_set, args.librec_home, args.config,
                    update_freq=args.update_freq, N_list=args.topN_list,
                    num=args.proc, evalTrain=args.evalTrain)

            if args.pickleSavePath is not None:
                with open(args.pickleSavePath, 'wb') as handle:
                    pickle.dump(recs, handle)
    else:
        # SECURITY NOTE: unpickling executes arbitrary code from the file;
        # only point --pickleLoadPath at files produced by this tool.
        with open(args.pickleLoadPath, 'rb') as handle:
            recs = pickle.load(handle)
def main(train_df, target_df, cache_name="test", force_recompute=None):
    """Train multiple models on train_df and predict target_df.

    Predictions are cached on disk. If the cached indices do not match the
    ids of target_df, the cache is discarded. A method that is already in the
    cache is not recomputed unless its name is listed in force_recompute.
    The cache file is rewritten after every newly computed method.

    Arguments:
        train_df {dataframe} -- Training dataframe
        target_df {dataframe} -- Testing dataframe; its "Id" column lists
            the ids to predict

    Keyword Arguments:
        cache_name {str} -- Name used to read and write the cache
            (default: {"test"})
        force_recompute {list or str} -- Name(s) of methods to recompute
            even if cached. A single name may be given as a plain string.
            (default: {None}, meaning no forced recomputation)

    Returns:
        Dataframe -- Predictions of each method as columns, ids as index
    """
    global algo_in_use

    # Normalise the argument. A bare string used to be matched by substring
    # ("SVD" in "SVD ++" is True), which recomputed unintended methods.
    if force_recompute is None:
        force_recompute = []
    elif isinstance(force_recompute, str):
        force_recompute = [force_recompute]

    CACHED_DF_FILENAME = os.path.dirname(
        os.path.abspath(__file__)) +\
        "/cache/cached_predictions_{}.pkl".format(cache_name)

    train_df = preprocess_df(train_df)
    trainset = pandas_to_data(train_df)
    ids_to_predict = target_df["Id"].to_list()

    # Try to retrieve the cached dataframe. The id comparison is an explicit
    # check rather than an assert, which would vanish under `python -O` and
    # let a stale cache through.
    all_algos_preds_df = None
    print("Retrieving cached predictions")
    try:
        cached_df = pd.read_pickle(CACHED_DF_FILENAME)
    except FileNotFoundError:
        cached_df = None
    if cached_df is not None:
        print("Ensuring cached IDs match given IDs")
        if sorted(ids_to_predict) == sorted(cached_df.index.values):
            print("Indices match, continuing")
            all_algos_preds_df = cached_df

    if all_algos_preds_df is None:
        print("No valid cached predictions found")
        all_algos_preds_df = pd.DataFrame(ids_to_predict, columns=["Id"])
        all_algos_preds_df.set_index("Id", inplace=True)

    all_algos = {
        "SVD": spr.SVD(n_factors=200, n_epochs=100),
        "Baseline": spr.BaselineOnly(),
        "NMF": spr.NMF(n_factors=30, n_epochs=100),
        "Slope One": spr.SlopeOne(),
        "KNN Basic": spr.KNNBasic(k=60),
        "KNN Means": spr.KNNWithMeans(k=60),
        "KNN Baseline": spr.KNNBaseline(),
        "KNN Zscore": spr.KNNWithZScore(k=60),
        "SVD ++": spr.SVDpp(n_factors=40, n_epochs=100),
        "Co Clustering": spr.CoClustering()
    }

    # Make sure the cache directory exists before the first write.
    os.makedirs(os.path.dirname(CACHED_DF_FILENAME), exist_ok=True)

    for name, algo in all_algos.items():
        print("##### {} ####".format(name))
        if name in force_recompute and name in all_algos_preds_df.columns:
            all_algos_preds_df.drop(name, axis=1, inplace=True)
        if name in all_algos_preds_df.columns:
            print("Already computed {}, skipping".format(name))
            continue

        time.sleep(1)
        algo.fit(trainset)
        time.sleep(1)

        # parallelize_predictions takes no model argument; it presumably
        # reads the fitted model from this module-level global -- confirm.
        algo_in_use = algo
        print("Generating predictions...")
        predictions = parallelize_predictions(ids_to_predict, 80)
        print("Done. Merging with previous results")

        this_algo_preds_df = pd.DataFrame(predictions, columns=["Id", name])
        this_algo_preds_df.set_index("Id", inplace=True)
        all_algos_preds_df = pd.merge(all_algos_preds_df, this_algo_preds_df,
                                      left_index=True, right_index=True)
        # Persist after every method so an interrupted run keeps its results.
        all_algos_preds_df.to_pickle(CACHED_DF_FILENAME)

    print("DONE computing surprize")
    return all_algos_preds_df
def fit(self):
    """Train a default-configured SVD++ estimator on ``self.rating_data``.

    ``self.model`` receives the value returned by the estimator's ``fit``
    call.
    """
    estimator = surprise.SVDpp()
    self.model = estimator.fit(self.rating_data)
# defining the number of folds = 5 print("Performing splits...") kf = sp.model_selection.KFold(n_splits=5, random_state=0) print("Done.") ### ### PART 1.1 ### ''' application of all algorithms for recommendation made available by “Surprise” libraries, according to their default configuration. ''' algorithms = [sp.NormalPredictor(), sp.BaselineOnly(), sp.KNNBasic(),\ sp.KNNWithMeans(), sp.KNNWithZScore(), sp.KNNBaseline(),\ sp.SVD(), sp.SVDpp(), sp.NMF(), sp.SlopeOne(), sp.CoClustering()] for elem in algorithms: start_time = time.time() algo = elem sp.model_selection.cross_validate(algo, data, measures=['RMSE'], \ cv=kf, n_jobs = 2, verbose=True) print("--- %s seconds ---" % (time.time() - start_time)) print() ### ### PART 1.2 ### ''' Improvement of the quality of both KNNBaseline and SVD methods, by performing hyper-parameters tuning over 5-folds Random-Search-Cross-Validation - KNN
delimiter=',', names=['uid', 'iid', 'rating']) print(dataset.head()) # lower_rating = dataset['rating'].min() upper_rating = dataset['rating'].max() print('Review range: {0} to {1}'.format(lower_rating, upper_rating)) # import sklearn import surprise # reader = surprise.Reader(rating_scale=(0.5, 4)) data = surprise.Dataset.load_from_df(dataset, reader) print("Now starting SVD calculation") # alg = surprise.SVDpp() # train = data.build_full_trainset() output = alg.fit(train) print("Displaying training data") print(train) print(output) # Extra line added # pred = alg.predict(uid='50', iid='52') score = pred.est print(score) ## # Get a list of all movie ids
datafile[['user_id', 'business_id', 'stars']], reader) A_train_dense = list([list(row) for row in A_train_dense]) for i in range(len(A_train_dense)): A_train_dense[i].append(None) A_train_dense = list([tuple(row) for row in A_train_dense]) A_test_dense = list([list(row) for row in A_test_dense]) for i in range(len(A_test_dense)): A_test_dense[i].append(None) A_test_dense = list([tuple(row) for row in A_test_dense]) trainset = data.construct_trainset(A_train_dense) testset = data.construct_testset(A_test_dense) # SVDpp: algo = surprise.SVDpp() algo.fit(trainset) predictions = algo.test(testset) print("model SVDpp: ") # Then compute RMSE accuracy.rmse(predictions) print("NDCG: " + str(sur_ndcg(atstd, predictions, product_index))) print("Precision: " + str(sur_precision(atstd, predictions, product_index))) # NMF: algo = surprise.NMF() algo.fit(trainset) predictions = algo.test(testset) print("model NMF: ")
import surprise as sp
from surprise import Dataset
from surprise.model_selection import cross_validate

import NetflixDataLoad

# Only the first 100000 rows are used, for fast processing.
# load_from_df requires a Reader describing the rating scale; it was missing.
# Assumes the Netflix ratings are 1-5 stars -- adjust rating_scale if
# NetflixDataLoad rescales them.
reader = sp.Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    NetflixDataLoad.df_filterd[['Cust_Id', 'Movie_Id', 'Rating']][:100000],
    reader)

n_folds = 5

# Cross-validate each algorithm with its default settings and print the
# returned result dictionary.
for algo in [sp.SVD(), sp.SVDpp(), sp.KNNBasic(), sp.KNNWithMeans()]:
    print(
        cross_validate(algo,
                       data,
                       measures=['RMSE', 'MAE'],
                       cv=n_folds,
                       verbose=True))

# Output Example
# Evaluating RMSE, MAE of algorithm SVD on 5 split(s).
#
#            Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std
# RMSE       0.9311  0.9370  0.9320  0.9317  0.9391  0.9342  0.0032
# MAE        0.7350  0.7375  0.7341  0.7342  0.7375  0.7357  0.0015
# Fit time   6.53    7.11    7.23    7.15    3.99    6.40    1.23
# Test time  0.26    0.26    0.25    0.15    0.13    0.21    0.06
# Wrap the training dataframe in a Surprise dataset (dftrain and reader are
# defined earlier in the script).
data = surprise.Dataset.load_from_df(dftrain, reader)
print("finished combining the data with the reader =", datetime.now().time())

# --- Disabled hyper-parameter search, kept for reference ---
# param_grid = {'lr_all': np.arange(0.008,0.011,0.001), 'reg_all' : [0.1,0.3, 0.5]}
# grid_s = surprise.model_selection.GridSearchCV(surprise.SVDpp,param_grid,measures = ['rmse','mae'],cv = cv)
# grid_s.fit(data)
# dict = grid_s.best_params['rmse']
# dict
# dftrain = train.drop('timestamp', axis = 'columns')
# dftrain = dftrain.reset_index(drop = True)
# data = surprise.Dataset.load_from_df(dftrain, reader)
# dict = grid_s.best_params['rmse']

# SVD++ with library defaults; the tuned arguments are commented out because
# the grid search above is disabled.
alg = surprise.SVDpp()#lr_all = dict['lr_all'], reg_all = dict['reg_all'])
print("finished creating the svdpp object =", datetime.now().time())

# Train on every available rating (no hold-out split).
print("started model training =", datetime.now().time())
output = alg.fit(data.build_full_trainset())
print(output)
print("finished training the model =", datetime.now().time())

# alg.test is fed rows taken from test.values; a dummy rating of 1 is added
# as a temporary 'rating' column because only the estimates are read back.
dummies = [1]*len(test)
test['rating'] = dummies
predictions = alg.test(test.values)
# Remove the temporary column again so `test` is left as it was.
del test['rating']
print("finished predictions on testset =", datetime.now().time())

# Keep only the estimated rating of each prediction.
finpred = [ m.est for m in predictions]
# prepare data for normalization scaler = MinMaxScaler(feature_range=(0, 1)) # train the normalization # normalize the dataset df[['rating']] = scaler.fit_transform(df[['rating']]) print df.head(100) # A reader is still needed but only the rating_scale param is requiered. reader = surprise.Reader(rating_scale=(0, 1)) # The columns must correspond to user id, item id and ratings (in that order). dataset = surprise.Dataset.load_from_df(df[['uid', 'iid', 'rating']], reader) alg = surprise.SVDpp(lr_all=.001) output = alg.fit(dataset.build_full_trainset()) print output ''' pred = alg.predict(uid='3562446', iid='2982938') score = pred.est print score ''' while True: print 'input uid =>' puid=str(input()) if puid == 'exit' and not puid: break if puid == '\r': continue
def _write_top_n(path, results, outer_label, inner_label):
    """Write top-10 ``results`` to ``path``, one section per outer id, sorted by id."""
    with open(path, 'w') as f:
        for outer_id, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('%s ID %s top-10 results\n' % (outer_label, outer_id))
            for inner_id, score in ratings:
                f.write('%s ID %s\tscore %s\n'
                        % (inner_label, inner_id, str(score)))
            f.write('\n')


def _fit_and_write(algo, trainset, testset, id_list, path, user_based):
    """Fit ``algo``, collect its top-10 for ``id_list`` and write them to ``path``."""
    algo.fit(trainset)
    results = get_top_n(algo, testset, id_list, n=10, user_based=user_based)
    if user_based:
        _write_top_n(path, results, 'User', 'Item')
    else:
        _write_top_n(path, results, 'Item', 'User')


def _report_best(label, data, candidates, banner=None):
    """Grid-search every candidate with 5-fold CV and print the lowest-RMSE one.

    ``candidates`` is a list of ``(name, algo_class, param_grid)``. On equal
    scores the later candidate wins, as in the original comparison. Returns
    the best estimator.
    """
    kfold = KFold(n_splits=5, random_state=0)
    if banner is not None:
        print(banner)
    searches = [(name,
                 GridSearchCV(algo_class, measures=['rmse'],
                              param_grid=param_grid, cv=kfold))
                for name, algo_class, param_grid in candidates]
    for _, search in searches:
        search.fit(data)

    best_name, best_search = searches[0]
    for name, search in searches[1:]:
        if search.best_score['rmse'] <= best_search.best_score['rmse']:
            best_name, best_search = name, search

    print('The best %s algorithm is' % label, best_name, 'with',
          best_search.best_params['rmse'], '\nscore:',
          best_search.best_score['rmse'])
    return best_search.best_estimator['rmse']


def part3():
    """Run the part-3 recommendation experiments and write the result files.

    Loads the team's user-item-rating CSV, then for each requirement fits the
    specified algorithm on the full training set and writes the top-10
    results to ``3-x-y.txt``:

    * 3-2: user-based KNN (files 3-2-1, 3-2-2) and a grid search printing the
      best user-based model.
    * 3-3: item-based KNN (files 3-3-1, 3-3-2) and a grid search printing the
      best item-based model.
    * 3-4: matrix factorization with SVD / SVD++ (files 3-4-1 .. 3-4-4) and a
      grid search printing the best of the two.
    """
    file_path = 'DMA_project2_team%02d_part2_UIR.csv' % team
    reader = Reader(line_format='user item rating', sep=',',
                    rating_scale=(1, 10), skip_lines=1)
    data = Dataset.load_from_file(file_path, reader=reader)

    trainset = data.build_full_trainset()
    testset = trainset.build_anti_testset()

    uid_list = [
        'ffffbe8d854a4a5a8ab1a381224f5b80',
        'ffe2f26d5c174e13b565d026e1d8c503',
        'ffdccaff893246519b64d76c3561d8c7',
        'ffdb001850984ce69c5f91360ac16e9c',
        'ffca7b070c9d41e98eba01d23a920d52'
    ]
    iid_list = ['art', 'teaching', 'career', 'college', 'medicine']

    def knn_grid(user_based):
        """Parameter grid shared by the user-based and item-based searches."""
        return {
            'k': [30, 40, 50],
            'min_k': [1],
            'sim_options': {
                'name': ['pearson', 'cosine'],
                'user_based': [user_based]
            }
        }

    # Requirement 3-2. User-based Recommendation
    # 3-2-1: KNNBasic, cosine similarity
    _fit_and_write(
        surprise.KNNBasic(k=40, min_k=1,
                          sim_options={'name': 'cosine', 'user_based': True},
                          verbose=True),
        trainset, testset, uid_list, '3-2-1.txt', user_based=True)
    # 3-2-2: KNNWithMeans, pearson similarity
    _fit_and_write(
        surprise.KNNWithMeans(k=40, min_k=1,
                              sim_options={'name': 'pearson',
                                           'user_based': True},
                              verbose=True),
        trainset, testset, uid_list, '3-2-2.txt', user_based=True)
    # 3-2-3: best user-based model via grid search
    best_algo_ub = _report_best(
        'UB', data,
        [('KNNBasic', surprise.KNNBasic, knn_grid(True)),
         ('KNNWithMeans', surprise.KNNWithMeans, knn_grid(True))],
        banner='Grid Search for user based model...')

    # Requirement 3-3. Item-based Recommendation
    # 3-3-1: KNNBasic, cosine similarity
    _fit_and_write(
        surprise.KNNBasic(k=40, min_k=1,
                          sim_options={'name': 'cosine', 'user_based': False},
                          verbose=True),
        trainset, testset, iid_list, '3-3-1.txt', user_based=False)
    # 3-3-2: KNNWithMeans, pearson similarity
    _fit_and_write(
        surprise.KNNWithMeans(k=40, min_k=1,
                              sim_options={'name': 'pearson',
                                           'user_based': False},
                              verbose=True),
        trainset, testset, iid_list, '3-3-2.txt', user_based=False)
    # 3-3-3: best item-based model via grid search
    best_algo_ib = _report_best(
        'IB', data,
        [('KNNBasic', surprise.KNNBasic, knn_grid(False)),
         ('KNNWithMeans', surprise.KNNWithMeans, knn_grid(False))],
        banner='Grid Search for item based model...')

    # Requirement 3-4. Matrix-factorization Recommendation
    # 3-4-1: unbiased SVD
    _fit_and_write(surprise.SVD(n_factors=100, n_epochs=50, biased=False),
                   trainset, testset, uid_list, '3-4-1.txt', user_based=True)
    # 3-4-2: biased SVD
    _fit_and_write(surprise.SVD(n_factors=200, n_epochs=100, biased=True),
                   trainset, testset, uid_list, '3-4-2.txt', user_based=True)
    # 3-4-3: SVD++, 50 epochs
    _fit_and_write(surprise.SVDpp(n_factors=100, n_epochs=50),
                   trainset, testset, uid_list, '3-4-3.txt', user_based=True)
    # 3-4-4: SVD++, 100 epochs
    _fit_and_write(surprise.SVDpp(n_factors=100, n_epochs=100),
                   trainset, testset, uid_list, '3-4-4.txt', user_based=True)
    # 3-4-5: best matrix-factorization model via grid search
    parameters_SVD = {
        'n_factors': [50, 100, 200],
        'n_epochs': [10, 50, 100, 200],
        'biased': [True, False]
    }
    parameters_SVDpp = {
        'n_factors': [50, 100, 200],
        'n_epochs': [10, 50, 100, 200]
    }
    best_algo_mf = _report_best(
        'MF', data,
        [('SVD', surprise.SVD, parameters_SVD),
         ('SVDpp', surprise.SVDpp, parameters_SVDpp)])
f.write('Item ID %s\tscore %s\n' % (iid, str(score))) f.write('\n') # TODO - 4-1-2. SVD, n_factors=200, n_epochs=100, biased=True algo = surprise.SVD(n_factors=200, n_epochs=100, biased=True) algo.fit(trainset) results = get_top_n(algo, testset, uid_list, n=10, user_based=True) with open('4-1-2_results.txt', 'w') as f: for uid, ratings in sorted(results.items(), key=lambda x: int(x[0])): f.write('User ID %s top-10 results\n' % uid) for iid, score in ratings: f.write('Item ID %s\tscore %s\n' % (iid, str(score))) f.write('\n') # TODO - 4-1-3. SVD++, n_factors=100, n_epochs=50 algo = surprise.SVDpp(n_factors=100, n_epochs=50) algo.fit(trainset) results = get_top_n(algo, testset, uid_list, n=10, user_based=True) with open('4-1-3_results.txt', 'w') as f: for uid, ratings in sorted(results.items(), key=lambda x: int(x[0])): f.write('User ID %s top-10 results\n' % uid) for iid, score in ratings: f.write('Item ID %s\tscore %s\n' % (iid, str(score))) f.write('\n') # TODO - 4-1-4. SVD++, n_factors=50, n_epochs=100 algo = surprise.SVDpp(n_factors=50, n_epochs=100) algo.fit(trainset) results = get_top_n(algo, testset, uid_list, n=10, user_based=True) with open('4-1-4_results.txt', 'w') as f: for uid, ratings in sorted(results.items(), key=lambda x: int(x[0])):
df = pd.DataFrame(ratings_dict); return sp.Dataset.load_from_df(df[['uid','vid', 'r']], reader); def load_data(path, r_range): train_set = convert_to_df(np.load(path + ".train"), r_range); test_set = convert_to_df(np.load(path + ".test"), r_range); return train_set.build_full_trainset(), test_set.build_full_trainset().build_testset(); if __name__ == '__main__': PREFIX = "/Users/morino/Downloads/dataset/"; names = [ 'ml-latest-small/ml', 'BX-CSV-Dump/bx', 'jester/jester']; teller = ["MovieLens", "BookCrossing", "Jester"]; r_ranges = [(1, 5), (1, 10), (0, 20)]; algos = [sp.SVD(biased = False), sp.SVDpp(), sp.NMF()]; algos_names = ['SVD', 'SVD++', 'NMF'] for i, name in enumerate(names): print("BEGIN {}".format(teller[i])); train_set, test_set = load_data(PREFIX + name, r_ranges[i]); for j, algo in enumerate(algos): algo.fit(train_set); preds = algo.test(test_set); print("{} RMSE {}".format(algos_names[j], sp.accuracy.rmse(preds))); print("END {}".format(teller[i]));
# Cross-validate several Surprise algorithms with default settings on
# rating_data (5 folds, RMSE and MAE) and print each result dictionary.
knnBasic = surprise.KNNBasic()
knnBasic_temp = surprise.model_selection.cross_validate(
    knnBasic, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBasic-----------------')
print(knnBasic_temp)

knnWithMeans = surprise.KNNWithMeans()
knnWithMeans_temp = surprise.model_selection.cross_validate(
    knnWithMeans, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnWithMeans-----------------')
print(knnWithMeans_temp)

knnBaseline = surprise.KNNBaseline()
knnBaseline_temp = surprise.model_selection.cross_validate(
    knnBaseline, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBaseline-----------------')
print(knnBaseline_temp)

svdpp = surprise.SVDpp()
svdpp_temp = surprise.model_selection.cross_validate(
    svdpp, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('svdpp-----------------')
# Print the cross-validation results like the other sections; this used to
# print the estimator object itself.
print(svdpp_temp)

nmf = surprise.NMF()
nmf_temp = surprise.model_selection.cross_validate(
    nmf, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('nmf-----------------')
print(nmf_temp)
print('-' * 12) print('-' * 12) return hr, arhr if __name__ == '__main__': # builtin dataset # data = env.Dataset.load_builtin('ml-100k') # =============================== load data ============================ # ml-latest-small # file_path = 'input/ml-latest-small/ratings.csv' # reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1) # ------------------------------------------------------------------------------ # ml-100k file_path = 'input/ml-100k/u.data' reader = env.Reader(line_format='user item rating timestamp', sep='\t', skip_lines=1) # ------------------------------------------------------------------------------ # ml-20m # file_path = 'input/ml-20m/ratings.csv' # reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1) # ============================================================================== data = env.Dataset.load_from_file(file_path, reader=reader) data.split(n_folds=5) algo = env.SVDpp() # evaluate_topn(algo, data, top_n=100, threshold=3, verbose=1) env.evaluate(algo, data, measures=['rmse', 'mae', 'fcp'], verbose=1)
epochs=2, validation_split=0.1, shuffle=True) y_pred = model.predict([df_hybrid_test['User'], df_hybrid_test['Movie'], test_tfidf]) y_true = df_hybrid_test['Rating'].values rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true)) print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse)) # Load dataset into surprise specific data-structure data = sp.Dataset.load_from_df(df_filterd[['User', 'Movie', 'Rating']].sample(20000), sp.Reader()) benchmark = [] # Iterate over all algorithms for algorithm in [sp.SVD(), sp.SVDpp(), sp.SlopeOne(), sp.NMF(), sp.NormalPredictor(), sp.KNNBaseline(), sp.KNNBasic(), sp.KNNWithMeans(), sp.KNNWithZScore(), sp.BaselineOnly(), sp.CoClustering()]: # Perform cross validation results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=False) # Get results & append algorithm name tmp = pd.DataFrame.from_dict(results).mean(axis=0) tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm'])) # Store data benchmark.append(tmp) # Store results surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse', ascending=False) # Get data data = surprise_results[['test_rmse', 'test_mae']]
# Load the ratings file; it has no header row, so column names are supplied.
dataset = pd.read_csv('C:/Users/Sridhar Sanobat/Documents/Data Science Examples/filmtrust/ratings.csv', delimiter = ',', names = ['uid', 'iid', 'rating'])
print(dataset.head())

# Report the observed rating range.
lower_rating = dataset['rating'].min()
upper_rating = dataset['rating'].max()
print('Review range: {0} to {1}'.format(lower_rating, upper_rating))

import sklearn
import surprise

# NOTE(review): the scale is hard-coded rather than taken from
# lower_rating / upper_rating computed above -- confirm they agree.
reader = surprise.Reader(rating_scale = (0.5, 4))
data = surprise.Dataset.load_from_df(dataset, reader)
print("Now starting SVD calculation")

# SVD++ with library defaults.
alg = surprise.SVDpp()

# Train on every available rating (no hold-out split).
train = data.build_full_trainset()
output = alg.fit(train)
print("Displaying training data")
print(train)
print(output)
# Extra line added

# Predict a single user/item pair and print the estimate.
# NOTE(review): the ids are passed as strings; if read_csv parsed the uid/iid
# columns as integers these would not match any known id -- confirm.
pred = alg.predict(uid = '50', iid = '52')
score = pred.est
print(score)
##
# Get a list of all movie ids