from collections import defaultdict

from surprise import Dataset, SVD
from surprise.model_selection import KFold


def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user."""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant.
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended.
        # When n_rel is 0, Recall is undefined. We here set it to 0.
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls


data = Dataset.load_builtin('ml-100k')
kf = KFold(n_splits=5)
algo = SVD()

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))
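# A minimal follow-up sketch (an assumption, not part of the original
# script): fold the per-user averages of the last split into a macro F1@K.
avg_prec = sum(precisions.values()) / len(precisions)
avg_rec = sum(recalls.values()) / len(recalls)
f1_at_k = (2 * avg_prec * avg_rec / (avg_prec + avg_rec)
           if (avg_prec + avg_rec) else 0)
print('F1@5:', f1_at_k)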
def _Call_Movielens_for_Evaluation(self):
    data = Dataset.load_builtin('ml-100k')
    return data
import pandas as pd

from surprise import Dataset, SVD, KNNBasic, BaselineOnly, accuracy


def make_predictions(user_id):
    performance = []
    algorithms = ['SVD', 'KNN', 'ALS']

    # First train an SVD algorithm on the movielens dataset.
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    algo_SVD = SVD()
    algo_SVD.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()

    # SVD algorithm
    predictions_SVD = algo_SVD.test(testset)
    accuracy_SVD = accuracy.rmse(predictions_SVD)
    performance.append(accuracy_SVD)

    # KNN algorithm
    algo_KNN = KNNBasic()
    algo_KNN.fit(trainset)
    predictions_KNN = algo_KNN.test(testset)
    accuracy_KNN = accuracy.rmse(predictions_KNN)
    performance.append(accuracy_KNN)

    # ALS baseline
    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    algo_ALS = BaselineOnly(bsl_options=bsl_options)
    algo_ALS.fit(trainset)
    predictions_ALS = algo_ALS.test(testset)
    accuracy_ALS = accuracy.rmse(predictions_ALS)
    performance.append(accuracy_ALS)

    # Compare algorithms by RMSE (lower is better).
    # get_top_n is defined in a companion snippet in this collection.
    best_performance_index = performance.index(min(performance))
    best_algorithm = algorithms[best_performance_index]
    if best_algorithm == 'SVD':
        top_n = get_top_n(predictions_SVD, n=10)
    elif best_algorithm == 'KNN':
        top_n = get_top_n(predictions_KNN, n=10)
    elif best_algorithm == 'ALS':
        top_n = get_top_n(predictions_ALS, n=10)

    i_cols = [
        'movie_id', 'movie_title', 'release_date', 'video_release_date',
        'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
        'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
        'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
        'Thriller', 'War', 'Western'
    ]
    items = pd.read_csv('../../ml-100k/u.item', sep='|', names=i_cols,
                        encoding='latin-1')

    predictions = []
    # Print the recommended items for the user
    for uid, user_ratings in top_n.items():
        if int(uid) == int(user_id):
            for (iid, _) in user_ratings:
                title = items[items['movie_id'] == int(iid) + 1]['movie_title']
                title_t = str(title)
                title_split = title_t.split()
                print(title_split)
                predictions.append(title_t)
    return predictions
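# A hedged aside, not part of the original snippet: the RMSE values above are
# computed on build_anti_testset(), whose "true" ratings are filled with the
# global mean, so they do not measure accuracy on held-out data. A sounder way
# to compare the algorithms would be cross-validation on known ratings, e.g.:
#
#     from surprise.model_selection import cross_validate
#     out = cross_validate(SVD(), data, measures=['rmse'], cv=5)
#     print(out['test_rmse'].mean())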
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import Dataset
from surprise import accuracy
from surprise import SlopeOne
from surprise.model_selection import train_test_split

# Load the movielens-1m dataset: UserID::MovieID::Rating::Timestamp
data = Dataset.load_builtin('ml-1m')
trainset, testset = train_test_split(data, test_size=.15)

# Configure the algorithm. K = number of neighbours. Name = type of
# similarity measure. User based = user-based or item-based filtering.
print("Using the SlopeOne algorithm")
algoritmo = SlopeOne()
algoritmo.fit(trainset)

# Select the user and the movie to analyse.
# User 49: between 18 and 24 years old, a programmer living in Houston, Texas
uid = str(49)
# Movie seen and rated: Negotiator, The (1998)::Action|Thriller. Rating: 4
iid = str(2058)  # raw item id

# Get a prediction for specific users and items.
print("Rating prediction: ")
pred = algoritmo.predict(uid, iid, r_ui=4, verbose=True)

# run the trained model against the testset
import copy
import os
from collections import defaultdict

import matplotlib
import pandas as pd
import plotly
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms.predictions import PredictionImpossible
from surprise.prediction_algorithms.algo_base import AlgoBase
from surprise.prediction_algorithms.knns import SymmetricAlgo

file_path_save_data = 'data/processed/'  # don't forget to create this folder before running the script
datasetname = 'ml-100k'  # valid built-in dataset names are 'ml-100k', 'ml-1m', and 'jester'
data1 = Dataset.load_builtin(datasetname)

path = '../ml-100k/u.item'
cat_cols = ['cat%d' % i for i in range(1, 20)]
df = pd.read_csv(path, sep="|", encoding="iso-8859-1",
                 names=['id', 'name', 'date', 'space', 'url'] + cat_cols)

list_of_cats = {}
df1 = df[['id'] + cat_cols]
for row in df.itertuples(index=True, name='Pandas'):
    id = str(getattr(row, "id"))
    cate_x = [getattr(row, col) for col in cat_cols]
    list_of_cats[id] = cate_x


def into_rate(cate, rate):
    # Replace each genre flag (1) with the given rating, in place.
    for index in range(len(cate)):
        if cate[index] == 1:
            cate[index] = rate
    else:
        datasets_list = list(starmap(subset_dataset, data_subsetting_params))

    end_t = time.time()
    print("\n%.2f seconds elapsed \n" % (end_t - start_t))

    subsetted_datasets = {}
    for dataset in datasets_list:
        subsetted_datasets[name_prefix + dataset_desc_str(dataset)] = dataset
    return subsetted_datasets


# %%
jester = Dataset.load_builtin('jester')
print("loaded jester")
jester = rescale_dataset(jester)

# parameters are: (dataset, min_ratings_per_user, max_ratings_per_user, total_ratings)
jester_data_subsetting_params = [(jester, 127, 150, 50000),
                                 (jester, 116, 126, 50000),
                                 (jester, 35, 40, 50000),
                                 (jester, 26, 34, 20000),
                                 (jester, 16, 25, 20000),
                                 (jester, 1, 15, 20000)]
jester_datasets = create_data_subsets(jester, "jester",
                                      jester_data_subsetting_params)
del jester
def __init__(self):
    self.data = Dataset.load_builtin('ml-1m')
import csv

import numpy as np
from surprise import Dataset, KNNBasic, SVD, SVDpp, BaselineOnly
from surprise.model_selection import KFold, cross_validate

from cf_models import (EbcrMsdKNN, EbcrCosKNN, EbcrNormPccKNN, NormPcc,
                       SW_Norm_PccKNN, SW_MSD_KNN, SW_COS_KNN, LS_MSD_KNN,
                       LS_COS_KNN, LS_Norm_PccKNN)

__author__ = "Yu DU"

# Datasets initialisation
ml_100k = Dataset.load_builtin('ml-100k')
ml_1m = Dataset.load_builtin('ml-1m')
jester = Dataset.load_builtin('jester')

# Split train and test set
kf = KFold(random_state=0, n_splits=5)

list_k = [5, 10, 20, 40, 60, 80, 100, 200]
list_k2 = [5, 10, 15, 20, 25, 30, 35, 40]

# The Ml-100k Dataset
with open('results_ml100k_all.csv', mode='w') as result_file:
    fieldnames = ['k', 'algo', 'MAE', 'RMSE']
    writer = csv.DictWriter(result_file, fieldnames=fieldnames)
    writer.writeheader()

    # SVD algo
    svd = SVD()
    out_svd = cross_validate(svd, ml_100k, ['rmse', 'mae'], kf,
def execute(self, params, **kwargs):
    # Load the movielens-100k dataset (download it if needed)
    data = Dataset.load_builtin('ml-100k')

    self.marvin_initial_dataset = {"data": data}
run = Run.get_submitted_run()

# manually downloading the file, as it requires a prompt otherwise
url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
DATASETS_DIR = os.path.expanduser('~') + '/.surprise_data/'

print("Starting")
name = 'ml-100k'
os.makedirs(DATASETS_DIR, exist_ok=True)
urllib.request.urlretrieve(url, DATASETS_DIR + 'tmp.zip')
with zipfile.ZipFile(DATASETS_DIR + 'tmp.zip', 'r') as tmp_zip:
    tmp_zip.extractall(DATASETS_DIR + name)

data = Dataset.load_builtin(name)
trainingSet = data.build_full_trainset()

#############################################################################
modelVariations = {
    "model1.pkl": {
        'name': 'cosine',
        'user_based': False
    },
    "model2.pkl": {
        'name': 'cosine',
        'user_based': True
    },
    "model3.pkl": {
        'name': 'msd',
        'user_based': True
# -- encoding:utf-8 --

import warnings

from surprise import Dataset
from surprise import evaluate
from surprise import KNNBaseline, KNNBasic

warnings.filterwarnings('ignore')

# 1. Read the data
# Option 1: use the built-in API provided by the surprise framework to read the movielens data
# name: which dataset to load; valid values: 'ml-100k', 'ml-1m', and 'jester'
# By default the API downloads the data to "~/.surprise_data"
data = Dataset.load_builtin(name='ml-100k')

# 2. Split the data for cross validation
data.split(5)

# 3. Build the model object
bsl_options = {
    'method': 'als',  # solver to use; valid values: als and sgd
    'n_epochs': 10,   # number of iterations
    'reg_i': 20,      # regularisation coefficient
    'reg_u': 10       # regularisation coefficient
}
"""
k=40: number of neighbour samples used when predicting
min_k=1: minimum number of neighbouring users/items required to produce a prediction
sim_options={}: how the similarity matrix is computed
"""
sim_options = {
    'name':
Y_train = np.loadtxt('data/train.txt', delimiter='\t') - np.array([1, 1, 0])
Y_test = np.loadtxt('data/test.txt', delimiter='\t') - np.array([1, 1, 0])
mu = np.mean(Y_train[:, 2])

epochs = 100
lamb = 1
U_ub, V_ub, _, _ = matrix_factorization(Y_train, 943, 1682, k, lamb, 0.03,
                                        epochs)
U_b, V_b, A_b, B_b = matrix_factorization(Y_train, 943, 1682, k, lamb, 0.03,
                                          epochs, True)
err_unbiased = score(Y_test, U_ub, V_ub)
err_biased = score(Y_test, U_b, V_b, True, A_b, B_b, mu)
print('Test error (biased):', err_biased)
print('Test error (unbiased):', err_unbiased)

data_surprise = Dataset.load_builtin('ml-100k')
data_train, data_test = train_test_split(data_surprise, test_size=0.1)
model = SVD(n_factors=k)
model.fit(data_train)
rmse = accuracy.rmse(model.test(data_test))
print('Test error (SVD):', rmse**2 / 2)

model = SVD(n_factors=k)
data_full = data_surprise.build_full_trainset()
model.fit(data_full)
V = model.qi.T

best, most_popular = get_best_and_popular()
movie_selection = {'Best Movies': best, 'Most Popular Movies': most_popular}


def scatterplot(x, y, color, selection, indices, title):
    'NormalPredictor': '[{}]({})'.format(
        'Random',
        stable + 'basic_algorithms.html#surprise.prediction_algorithms.random_pred.NormalPredictor'),
    'ml-100k': '[{}]({})'.format(
        'Movielens 100k', 'http://grouplens.org/datasets/movielens/100k'),
    'ml-1m': '[{}]({})'.format(
        'Movielens 1M', 'http://grouplens.org/datasets/movielens/1m'),
}

# set RNG
np.random.seed(0)
random.seed(0)

dataset = 'ml-1m'
data = Dataset.load_builtin(dataset)
kf = KFold(random_state=0)  # folds will be the same for all algorithms.

table = []
for klass in classes:
    start = time.time()
    out = cross_validate(klass(), data, ['rmse', 'mae'], kf)
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    link = LINK[klass.__name__]
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}'.format(np.mean(out['test_mae']))

    new_line = [link, mean_rmse, mean_mae, cv_time]
    print(tabulate([new_line], tablefmt="pipe"))  # print current algo perf
    table.append(new_line)
def built_in() -> surprise.dataset.DatasetAutoFolds:
    data = Dataset.load_builtin(name='ml-100k', prompt=True)
    return data
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplot
from surprise import SVD, Dataset, NMF
from surprise.model_selection import train_test_split
from surprise.model_selection.search import GridSearchCV
from surprise.model_selection.validation import cross_validate


def custom_rmse_cv():
    pass


if __name__ == '__main__':
    df = pd.read_csv('data/ml-latest-small/ratings.csv')
    # ratings = pd.pivot_table(data=df, values='rating', index='userId', columns='movieId')
    ratings = Dataset.load_builtin('ml-100k')
    # train, test = train_test_split(ratings)
    # algo = SVD(n_factors=50)
    # algo.fit(train)
    # param_grid = {'n_epochs': [20], 'lr_all': [.01], 'n_factors': [183],
    #               'reg_all': [.1], 'verbose': [True]}
    # gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
    # gs.fit(ratings)
    # print(gs.best_params)
    # for e in range(20, 31):
    #     print(e)
    #     algo = SVD(n_factors=183, reg_all=.1, lr_all=.01, n_epochs=e)
    #     results = cross_validate(algo, ratings, measures=['RMSE'])
import os

from surprise import SVD
from surprise import Dataset, Reader
from surprise.dump import dump, load

if os.path.exists("/pfs/out/model"):
    # If we have a model saved by a previous training run, load it
    _, algo = load("/pfs/out/model")
    reader = Reader(line_format='user item rating timestamp', sep=' ')
    # Train the model with each newly committed batch of train data
    for dirpath, dirs, files in os.walk("/pfs/training"):
        for filename in files:
            filepath = os.path.join(dirpath, filename)
            data = Dataset.load_from_file(
                filepath, reader=reader).build_full_trainset()
            algo.fit(data)
else:
    # If it's the initial run, train with the existing dataset.
    # Load the movielens-100k dataset (download it if needed)
    data = Dataset.load_builtin('ml-100k').build_full_trainset()
    # We'll use the famous SVD algorithm.
    algo = SVD()
    algo.fit(data)

# In both cases, save the trained model
dump("/pfs/out/model", algo=algo)
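# A minimal follow-up sketch (an assumption, not part of the original
# pipeline step): reload the dumped model and make one prediction, to check
# the artifact round-trips. surprise.dump.load returns (predictions, algo).
_, reloaded_algo = load("/pfs/out/model")
print(reloaded_algo.predict('196', '302').est)  # raw ml-100k user/item ids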
import numpy as np
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise import KNNBasic

# Load dataset
dataset = 'ml-100k'
data = Dataset.load_builtin(dataset)

# Set KFold validation and ensure random state remains consistent between all models
kf = KFold(n_splits=10, random_state=0)

# Use RMS error and mean abs error as metrics
out = cross_validate(KNNBasic(), data, ['rmse', 'mae'], kf)

# Format outputs from surprise CV method out
meanTestRMSE = '{:.3f}'.format(np.mean(out['test_rmse']))
meanTestMAE = '{:.3f}'.format(np.mean(out['test_mae']))

# Print results
print("\nKNN Basic Recommender Model has achieved:")
print("    RMSE: " + meanTestRMSE)
print("    MAE : " + meanTestMAE)
def load_movies_data():
    # Raw ratings are (user, item, rating, timestamp) tuples with string ids;
    # shift the 1-based user/item ids to 0-based and drop the timestamp.
    data = pd.DataFrame(Dataset.load_builtin("ml-100k").raw_ratings)
    data[0] = pd.to_numeric(data[0]) - 1
    data[1] = pd.to_numeric(data[1]) - 1
    del data[3]
    return data.values
# importing surprise package and builtin data
from collections import defaultdict

from surprise import Dataset
from surprise import KNNBasic

# loading data
dataset = Dataset.load_builtin("ml-100k")
trainingSet = dataset.build_full_trainset()

# cosine similarity between 2 vectors
sim_options = {'name': 'cosine', 'user_based': False}
knn = KNNBasic(sim_options=sim_options)

# training the model (fit() replaces the deprecated train() method)
knn.fit(trainingSet)

# movie recommendations for users
testSet = trainingSet.build_anti_testset()
predictions = knn.test(testSet)


# top five movie recommendations for each user
def get_top5_recommendations(predictions, topN=5):
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_recs[uid] = user_ratings[:topN]
    return top_recs
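# A minimal usage sketch (assumed, not in the original snippet): print each
# user's top-5 recommended raw item ids.
top5 = get_top5_recommendations(predictions)
for uid, user_ratings in top5.items():
    print(uid, [iid for (iid, _) in user_ratings])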
def __init__(self, embedding_dimension=20, n_items_to_recommend=4, seed=0,
             n_users=40, n_items=500, normalize_reward=False):
    """
    Environment that models a sequential recommendation process using the
    MovieLens dataset. PMF (Probabilistic Matrix Factorization) is performed
    to obtain user/item embeddings.

    :param embedding_dimension: size of the user/item embeddings
    :param n_items_to_recommend: number of items to recommend; an action is
        a list of that size
    :param seed: random seed
    :param n_users: number of users
    :param n_items: number of items
    :param normalize_reward: normalize [1, 5] ratings to [-1, 1] rewards
    """
    self.normalize_reward = normalize_reward
    self.embedding_dimension = embedding_dimension
    self.n_rec = n_items_to_recommend
    self.seed(seed)

    # Load the movielens-100k dataset (download it if needed),
    data = Dataset.load_builtin('ml-100k')
    # sample random trainset and testset;
    # test set is made of 25% of the ratings.
    self.trainset, self.testset = train_test_split(data, test_size=.25)
    self.algo = SVD(n_factors=self.embedding_dimension, biased=False)
    self.algo.fit(self.trainset)

    self.users = self.algo.pu[:n_users]
    self.items = self.algo.qi[:n_items]
    self.n_users = len(self.users)
    self.n_items = len(self.items)
    if self.n_users < n_users:
        warnings.warn("Only %d users are available in dataset" % self.n_users)
    if self.n_items < n_items:
        warnings.warn("Only %d items are available in dataset" % self.n_items)

    self.Users = {}
    for i in range(self.n_users):
        user = User(id=i, embedding=self.users[i])
        self.Users[user.id] = user
    self.Items = {}
    for j in range(self.n_items):
        item = Item(id=j, embedding=self.items[j], use_until=np.inf)
        self.Items[item.id] = item

    self.active_uid = self.np_random.choice(range(self.n_users))
    self.bought_items = defaultdict(set)

    # logs
    self.steps_count = 0
    self.info = {}

    # TODO: make action and observation space. checkout robotics envs + FlattenDictWrapper
    # https://github.com/openai/gym/tree/5404b39d06f72012f562ec41f60734bd4b5ceb4b/gym/envs/robotics
    self.action_space = None
    self.observation_space = None
def load_ratings_from_surprise(self) -> DatasetAutoFolds:
    ratings = Dataset.load_builtin('ml-100k')
    return ratings
def main():
    # Load the movielens-100k dataset (user, item, rating, timestamp)
    data = Dataset.load_builtin('ml-100k')
    trainset, testset = train_test_split(data, test_size=.15)
    user_collaborative_filtering(trainset, testset)
import pandas as pd

from surprise import Dataset
from surprise import Reader

# This is the same data that was plotted for similarity earlier,
# with one new user "E" who has rated only movie 1

# Load data
ratings_dict = {
    "item": [1, 2, 1, 2, 1, 2, 1, 2, 1],
    "user": ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E'],
    "rating": [1, 2, 2, 4, 2.5, 4, 4.5, 5, 3],
}

df = pd.DataFrame(ratings_dict)
reader = Reader(rating_scale=(1, 5))

# Load a Pandas DataFrame; load_from_df expects the columns in
# (user, item, rating) order
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)

# Load the built-in MovieLens dataset
movielens = Dataset.load_builtin('ml-100k')

# Recommender.py
from surprise import KNNWithMeans

# To use item-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": False,  # compute similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)
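# A minimal continuation sketch (assumed; it mirrors the companion snippet
# below): fit on the small dataframe and predict user E's rating for movie 2.
trainingSet = data.build_full_trainset()
algo.fit(trainingSet)
prediction = algo.predict('E', 2)
print(prediction.est)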
import time

from surprise import KNNWithMeans, KNNWithZScore
from surprise import Dataset
from surprise.model_selection import GridSearchCV, RandomizedSearchCV, KFold

training_data = Dataset.load_builtin("ml-100k")

sim_options = {
    "name": ["pearson", "msd", "cosine"],
    "min_support": [2, 4, 5],
    "user_based": [False, True],
}

print('\nRUNNING RANDOMIZED SEARCH')
print('    "name": ["pearson", "msd", "cosine"]')
print('    "min_support": [2, 4, 5]')
print('    "user_based": [False, True]\n')

param_grid = {"sim_options": sim_options}
start_time = time.time()

# RandomizedSearchCV samples parameter combinations rather than trying all
gs = RandomizedSearchCV(KNNWithZScore, param_grid, measures=["rmse", "mae"],
                        cv=3)
gs.fit(training_data)

print()
print("RMSE:", gs.best_score["rmse"])
print(gs.best_params["rmse"])
print()
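# A minimal follow-up sketch (assumed, mirroring the GridSearchCV example
# elsewhere in this collection): refit the best-scoring estimator on the
# full trainset for later use.
best_algo = gs.best_estimator["rmse"]
best_algo.fit(training_data.build_full_trainset())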
def experiments(config_file):
    args = get_args_parser().parse_args(['@' + config_file])

    # Set seed
    np.random.seed(int(args.seed))

    # Construct output directory
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    outdir = args.outdir + str(args.dataset) + "/" + timestamp + '/'

    # Create results directory
    outdir_path = Path(outdir)
    if not outdir_path.is_dir():
        os.makedirs(outdir)

    # Logging
    logfile = outdir + 'log.txt'
    log(logfile, "Directory " + outdir + " created.")

    # Set dataset
    if str(args.dataset) == 'ml-100k':
        dataset_name = 'MovieLens 100K'
    else:
        dataset_name = 'MovieLens 1M'

    # Load the MovieLens dataset (download it if needed)
    data = Dataset.load_builtin(str(args.dataset))

    # 80-20 split
    train_dataset, test_dataset = train_test_split(data, test_size=.20,
                                                   random_state=int(args.seed))

    # Run Autoencoder
    [a_mse, a_runtime] = autoencoder(str(args.dataset), logfile, int(args.seed))

    # Set algorithms
    user_based_msd_sim_options = {'name': 'msd', 'user_based': True}
    user_based_pearson_baseline_sim_options = {
        'name': 'pearson_baseline',
        'user_based': True
    }
    user_based_msd_algo = KNNBasic(sim_options=user_based_msd_sim_options)
    user_based_pearson_baseline_algo = KNNBasic(
        sim_options=user_based_pearson_baseline_sim_options)

    item_based_sim_options = {'name': 'msd', 'user_based': False}
    item_based_pearson_baseline_sim_options = {
        'name': 'pearson_baseline',
        'user_based': False
    }
    item_based_msd_algo = KNNBasic(sim_options=item_based_sim_options)
    item_based_pearson_baseline_algo = KNNBasic(
        sim_options=item_based_pearson_baseline_sim_options)

    algorithms = (
        ("User MSD", user_based_msd_algo),
        ("User Pearson Baseline", user_based_pearson_baseline_algo),
        ("Item MSD", item_based_msd_algo),
        ("Item Pearson Baseline", item_based_pearson_baseline_algo),
    )

    # Plotting
    plt.style.use('dark_background')
    fig, ax = plt.subplots()

    # Autoencoder results
    runtimes = [a_runtime]
    mses = [a_mse]
    # ax.annotate("Autoencoder", (runtimes[0] + .001, mses[0] + .001))

    # Running
    for name, algorithm in algorithms:
        log(logfile, dataset_name + ", " + name)

        # Train
        time_start = time.time()
        algorithm.fit(train_dataset)
        time_stop = time.time()
        log(logfile,
            'Train time: {0:f}'.format(round(time_stop - time_start, 2)).strip('0'))

        # Test
        time_start = time.time()
        predictions = algorithm.test(test_dataset)
        time_stop = time.time()
        runtime = round(time_stop - time_start, 2)
        runtimes += [runtime]
        log(logfile, 'Test time: {0:f}'.format(runtime).strip('0'))

        # MSE metric
        mse = accuracy.mse(predictions, verbose=False)
        mses += [mse]
        log(logfile, 'MSE: {0:1.4f}\n'.format(mse))

    # Draw scatter plot
    ax.scatter(runtimes[1:], mses[1:], marker='x', color='red')
    # ax.scatter(runtimes, mses, marker='x', color='red')

    # Annotate scatter plot; i=0 is for the Autoencoder
    for i, (name, _) in enumerate(algorithms):
        ax.annotate(name, (runtimes[i + 1] + .001, mses[i + 1] + .001))

    # Set plot settings
    plt.title("{}".format(dataset_name), size=15)
    plt.xlabel('Runtime (s)')
    plt.ylabel('MSE')

    # Save plot
    plt.savefig(outdir + 'plot.png', bbox_inches='tight')
import pandas as pd

from surprise import Dataset
from surprise import Reader

rating_dict = {
    "item": [1, 2, 1, 2, 1, 2, 1, 2, 1],
    "user": ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E'],
    "rating": [1, 2, 2, 4, 2.5, 4, 4.5, 5, 3]
}
df = pd.DataFrame(rating_dict)
reader = Reader(rating_scale=(1, 5))

# Load the dataframe
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)

# builtin Movielens-100k data
movielens = Dataset.load_builtin('ml-100k')

# Configure KNNWithMeans
from surprise import KNNWithMeans

sim_options = {"name": "cosine", "user_based": False}
algo = KNNWithMeans(sim_options=sim_options)

# Predict user E's rating of movie 2
trainSet = data.build_full_trainset()
algo.fit(trainSet)
prediction = algo.predict('E', 2)
print(prediction.est)
def main(args=None):
    _file, noise, epochs, pca_com = process_args(args)
    data = Dataset.load_builtin('ml-1m')
    rc = RatingCollection(data.raw_ratings)

    print("Constructing network...")
    d_losses = []
    g_losses = []

    print("Starting run...")
    distances = []
    for i in range(len(rc.folds)):
        print("Fold {}...".format(i + 1))
        training_data = {}
        for idx, value in enumerate(rc.folds):
            # if idx != i:
            training_data = {**training_data, **rc._get_matrix(value)}

        print('Calculating principal components...')
        pca = PCA(pca_com)
        pca.fit(get_sample(training_data, len(training_data.keys())))

        dis_arch = [MOVIES_COUNT, 300, 1]
        gen_arch = [noise, 300, MOVIES_COUNT]
        tf.reset_default_graph()
        network = gan(dis_arch, gen_arch, pca_com, 50)
        session = tf.Session()
        session.run(tf.global_variables_initializer())

        for it in range(epochs):
            users = get_sample(training_data, 50)
            _sample = sample_Z(50, noise)
            users_p = get_perturbed_batch(users)
            users_pca = pca.transform(users)

            # _, D_loss_curr = session.run(
            #     [network.discriminator_optimizer, network.discriminator_loss],
            #     feed_dict={network.discriminator_input: users,
            #                network.generator_input: _sample,
            #                network.generator_condition: users_pca,
            #                network.pert: users_p, network.keep_prob: 0.5})
            _, D_loss_curr = session.run(
                [network.discriminator_optimizer, network.discriminator_loss],
                feed_dict={
                    network.discriminator_input: users,
                    network.generator_input: _sample,
                    network.generator_condition: users_pca,
                    network.keep_prob: 0.5
                })
            _, G_loss_curr = session.run(
                [network.generator_optimizer, network.generator_loss],
                feed_dict={
                    network.generator_input: _sample,
                    network.generator_condition: users_pca,
                    network.keep_prob: 0.5
                })

            if it % 100 == 0:
                d_losses.append(D_loss_curr)
                g_losses.append(G_loss_curr)
                print('Iteration {} of {} ---------------'.format(it, epochs))
                print('D loss: {:.4}'.format(D_loss_curr))
                print('G_loss: {:.4}'.format(G_loss_curr))

        # Get the classification distances
        test_fold = rc._get_matrix(rc.folds[i])
        sample_size = len(test_fold)
        users = get_sample(test_fold, sample_size).astype(np.float32)
        _sample = sample_Z(sample_size, noise)
        users_pca = pca.transform(users)
        generated_images = session.run(
            network.generator.prob,
            feed_dict={
                network.generator_input: _sample,
                network.generator_condition: users_pca
            })

        feed_users = get_sample(test_fold, sample_size).astype(np.float32)
        feed_users = tf.convert_to_tensor(feed_users, dtype=tf.float32)
        generated_images = tf.convert_to_tensor(generated_images,
                                                dtype=tf.float32)
        result = tf.contrib.gan.eval.frechet_classifier_distance_from_activations(
            feed_users, generated_images)
        result = session.run(result)
        distances.append(result)

        write_output(d_losses, g_losses, distances, _file)
        break
from collections import defaultdict

from surprise import Dataset, KNNBasic
from surprise import model_selection


def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


movie_train = Dataset.load_builtin('ml-100k')
print(movie_train.raw_ratings)

knn_estimator = KNNBasic
knn_grid = {
    'k': [10, 20],
    'sim_options': {
        'name': ['cosine', 'msd'],
        'min_support': [1, 5],
        'user_based': [True, False]
    }
}
gs = model_selection.GridSearchCV(knn_estimator, knn_grid,
                                  measures=['rmse'], cv=3)
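# A minimal follow-up sketch (assumed, not in the original snippet): run the
# search and inspect the best configuration, as the other grid-search
# examples in this collection do.
gs.fit(movie_train)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])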
import random

import pandas as pd
from surprise.prediction_algorithms.knns import KNNBasic as KNN
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import GridSearchCV

data = Dataset.load_builtin('jester')
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)

threshold = int(.9 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

data.raw_ratings = train_raw_ratings

print('Grid Search...')
# param_grid = {'k': [1, 10, 100, 1000, 10000, 100000, 1000000],
#               'sim_options': {'name': ['MSD', 'cosine', 'pearson'],
#                               'user_based': [True, False]}}
param_grid = {
    'k': [100],
    'sim_options': {
        'name': ['MSD'],
        'user_based': [True]
    }
}
grid_search = GridSearchCV(KNN, param_grid, measures=['rmse'], cv=3)
grid_search.fit(data)
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 28 10:07:52 2020

@author: lucas
"""

import pandas as pd
from surprise import Dataset
from surprise import KNNWithMeans
from surprise.model_selection import GridSearchCV
from surprise import SVD

data = Dataset.load_builtin("ml-100k")
# Dataset.load_from_df()

# %% memory-based approach
sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}
param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

# print(gs.best_score["rmse"])
print(gs.best_params["rmse"])
""" This module describes how to use the GridSearchCV() class for finding the best parameter combination of a given algorithm. """ from __future__ import (absolute_import, division, print_function, unicode_literals) from surprise import SVD from surprise import Dataset from surprise.model_selection import GridSearchCV # Use movielens-100K data = Dataset.load_builtin('ml-100k') param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]} gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3) gs.fit(data) # best RMSE score print(gs.best_score['rmse']) # combination of parameters that gave the best RMSE score print(gs.best_params['rmse']) # We can now use the algorithm that yields the best rmse: algo = gs.best_estimator['rmse'] algo.fit(data.build_full_trainset())
def load_ratings_from_surprise(name: str) -> DatasetAutoFolds:
    ratings = Dataset.load_builtin(name)
    return ratings