def test_wrong_file_name():
    """Check that creating a (custom) Dataset validates the fold file names."""
    # Placeholder paths; the test expects loading them to be rejected.
    bogus_folds = [('does_not_exist', 'does_not_either')]

    with pytest.raises(ValueError):
        Dataset.load_from_folds(
            folds_files=bogus_folds,
            reader=Reader(),
            rating_scale=(1, 5),
        )
def test_gridsearchcv_same_splits():
    """Ensure all parameter combinations are evaluated on the same splits.

    The check is indirect: the grid only contains duplicated values, so every
    combination is the same and the RMSE averaged over the splits must be the
    same as well. The search is run with ``n_jobs=1``.
    """
    ratings_path = os.path.join(os.path.dirname(__file__),
                                './u1_ml100k_test')
    data = Dataset.load_from_file(ratings_path, reader=Reader('ml-100k'),
                                  rating_scale=(1, 5))
    splitter = KFold(3, shuffle=True, random_state=4)

    # All RMSE should be the same (as param combinations are the same).
    grid = {
        'n_epochs': [5],
        'lr_all': [.2, .2],
        'reg_all': [.4, .4],
        'n_factors': [5],
        'random_state': [0],
    }
    search = GridSearchCV(SVD, grid, measures=['RMSE'], cv=splitter, n_jobs=1)

    search.fit(data)
    seen_scores = list(search.cv_results['mean_test_rmse'])
    assert len(set(seen_scores)) == 1  # every mean RMSE is identical

    # Note: actually, even when setting random_state=None in the splitter,
    # the same folds are used because we use
    # product(param_comb, kf.split(...)). However, it's needed to have the
    # same folds when calling fit again:
    search.fit(data)
    seen_scores.extend(search.cv_results['mean_test_rmse'])
    assert len(set(seen_scores)) == 1  # still identical across both fits
def test_cross_validate(toy_data):
    """Check the keys and per-fold lengths of the cross_validate() result."""
    # First test with a specified CV iterator.
    here = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(here + '/custom_train', here + '/custom_test')]
    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3)
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader,
                                   rating_scale=(1, 5))
    algo = NormalPredictor()
    predefined = ms.PredefinedKFold()

    ret = ms.cross_validate(algo, data, measures=['rmse', 'mae'],
                            cv=predefined, verbose=1)

    # Basically just test that keys (don't) exist as they should: a single
    # predefined fold gives one entry per reported key.
    for present in ('test_rmse', 'test_mae', 'fit_time', 'test_time'):
        assert len(ret[present]) == 1
    for absent in ('test_fcp', 'train_rmse', 'train_mae'):
        assert absent not in ret

    # Test that 5 fold CV is used when cv=None.
    # Also check that train_* keys exist when return_train_measures is True.
    ret = ms.cross_validate(algo, toy_data, measures=['rmse', 'mae'],
                            cv=None, return_train_measures=True,
                            verbose=True)
    for present in ('test_rmse', 'test_mae', 'fit_time', 'test_time',
                    'train_rmse', 'train_mae'):
        assert len(ret[present]) == 5
def test_zero_rating_canary():
    """Check that ratings equal to zero survive in a full trainset."""
    frame = pd.DataFrame({
        'itemID': [0, 0, 0, 0, 1, 1],
        'userID': [0, 1, 2, 3, 3, 4],
        'rating': [-10, 10, 0, -5, 0, 5],
    })
    data = Dataset.load_from_df(frame[['userID', 'itemID', 'rating']],
                                rating_scale=(-10, 10))
    trainset = data.build_full_trainset()

    # Test ur and ir fields. Kind of OK, but the purpose of the test is
    # precisely to test what would happen if we removed them...
    expected_by_item = [
        (0, [(0, -10), (1, 10), (2, 0), (3, -5)]),
        (1, [(3, 0), (4, 5)]),
    ]
    for inner_iid, ratings in expected_by_item:
        assert trainset.ir[inner_iid] == ratings

    expected_by_user = [
        (0, [(0, -10)]),
        (1, [(0, 10)]),
        (2, [(0, 0)]),
        (3, [(0, -5), (1, 0)]),
        (4, [(1, 5)]),
    ]
    for inner_uid, ratings in expected_by_user:
        assert trainset.ur[inner_uid] == ratings
    print(trainset.ur)

    # ... so also test all_ratings which should be more reliable.
    all_ratings = list(trainset.all_ratings())
    expected_triples = [(0, 0, -10), (1, 0, 10), (2, 0, 0),
                        (3, 0, -5), (3, 1, 0), (4, 1, 5)]
    for triple in expected_triples:
        assert triple in all_ratings
def small_ml():
    """Return a Dataset object with 2000 movielens-100k ratings.

    The ratings are read from the ``u1_ml100k_test`` file located next to
    this module, using the built-in ``ml-100k`` reader.
    """
    module_dir = os.path.dirname(__file__)
    return Dataset.load_from_file(
        os.path.join(module_dir, './u1_ml100k_test'),
        Reader('ml-100k'),
        rating_scale=(1, 5),
    )
def toy_data(toy_data_reader):
    """Return the Dataset read from the ``custom_dataset`` file.

    The file sits next to this module and is parsed with
    ``toy_data_reader``, using a (1, 5) rating scale.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    return Dataset.load_from_file(
        file_path=module_dir + '/custom_dataset',
        reader=toy_data_reader,
        rating_scale=(1, 5),
    )
def test_randomizedsearchcv_cv_results():
    """Test the cv_results attribute"""
    ratings_path = os.path.join(os.path.dirname(__file__),
                                './u1_ml100k_test')
    data = Dataset.load_from_file(ratings_path, Reader('ml-100k'),
                                  rating_scale=(1, 5))
    splitter = KFold(3, shuffle=True, random_state=4)
    distributions = {
        'n_epochs': [5],
        'lr_all': uniform(.2, .3),
        'reg_all': uniform(.4, .3),
        'n_factors': [5],
        'random_state': [0],
    }
    n_iter = 5
    rs = RandomizedSearchCV(SVD, distributions, n_iter=n_iter,
                            measures=['RMSE', 'mae'], cv=splitter,
                            return_train_measures=True)
    rs.fit(data)

    results = rs.cv_results
    one_per_iteration = (n_iter,)

    # Test keys split*_{test,train}_rmse, and their mean and std dev.
    for phase in ('test', 'train'):
        per_split = [results['split{}_{}_rmse'.format(fold, phase)]
                     for fold in range(3)]
        for scores in per_split:
            assert scores.shape == one_per_iteration

        mean_scores = results['mean_{}_rmse'.format(phase)]
        assert mean_scores.shape == one_per_iteration
        assert np.allclose(mean_scores, np.mean(per_split, axis=0))
        assert np.allclose(results['std_{}_rmse'.format(phase)],
                           np.std(per_split, axis=0))

    # Test fit and test times dimensions.
    for timing_key in ('mean_fit_time', 'std_fit_time',
                       'mean_test_time', 'std_test_time'):
        assert results[timing_key].shape == one_per_iteration

    assert results['params'] is rs.param_combinations

    # Assert that the best parameter in rs.cv_results['rank_test_measure']
    # is indeed the best_params attribute.
    for measure in ('rmse', 'mae'):
        best_index = np.argmin(results['rank_test_' + measure])
        assert results['params'][best_index] == rs.best_params[measure]
def __init__(self, algo: AlgoBase, path: str=None, fmt='user item rating', sep=','):
    """Store the algorithm and optionally load a ratings file.

    Args:
        algo: Algorithm object kept on ``self.algo``.
        path: Ratings file to load. When falsy, ``self.data`` is ``None``.
        fmt: Line format handed to the ``Reader``.
        sep: Field separator handed to the ``Reader``.
    """
    self.algo = algo
    # The reader skips the first line of the file (skip_lines=1).
    self.data = (
        Dataset.load_from_file(
            path, reader=Reader(line_format=fmt, sep=sep, skip_lines=1))
        if path else None
    )
    self.trainset = None
    self.init()
def u1_ml100k():
    """Return a Dataset object that contains 10% of the u1 fold from
    movielens 100k. Trainset has 8000 ratings and testset has 2000.

    The dataset is built from a single predefined (train, test) pair of
    files located next to this module.
    """
    module_dir = os.path.dirname(__file__)
    fold = (os.path.join(module_dir, './u1_ml100k_train'),
            os.path.join(module_dir, './u1_ml100k_test'))
    return Dataset.load_from_folds([fold], Reader('ml-100k'),
                                   rating_scale=(1, 5))
def test_load_form_df():
    """Ensure reading dataset from pandas dataframe is OK."""
    # DF creation.
    frame = pd.DataFrame({
        'itemID': [1, 1, 1, 2, 2],
        'userID': [9, 32, 2, 45, '10000'],
        'rating': [3, 2, 4, 3, 1],
    })
    data = Dataset.load_from_df(frame[['userID', 'itemID', 'rating']],
                                rating_scale=(1, 5))

    # Assert split and folds can be used without problems.
    with pytest.warns(UserWarning):
        data.split(2)
        assert len(list(data.folds())) == 2

    # Assert users and items are correctly mapped.
    trainset = data.build_full_trainset()
    for raw_uid in (9, '10000'):
        assert trainset.knows_user(trainset.to_inner_uid(raw_uid))
    assert trainset.knows_item(trainset.to_inner_iid(2))

    # Assert r(9, 1) = 3 and r(2, 1) = 4.
    inner_item = trainset.to_inner_iid(1)
    for raw_uid, rating in ((9, 3), (2, 4)):
        assert trainset.ur[trainset.to_inner_uid(raw_uid)] == [
            (inner_item, rating)]

    # Mess up the column ordering and assert that users are not correctly
    # mapped.
    shuffled = Dataset.load_from_df(frame[['rating', 'itemID', 'userID']],
                                    rating_scale=(1, 5))
    trainset = shuffled.build_full_trainset()
    with pytest.raises(ValueError):
        trainset.to_inner_uid('10000')
def test_nearest_neighbors():
    """Ensure the nearest neighbors are different when using user-user
    similarity vs item-item."""
    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3)
    train_path = os.path.dirname(os.path.realpath(__file__)) + '/custom_train'
    data = Dataset.load_from_file(train_path, reader, rating_scale=(1, 5))
    trainset = data.build_full_trainset()

    # Fit one model per similarity flavour: user-based first, then
    # item-based.
    fitted = {}
    for user_based in (True, False):
        model = KNNBasic(sim_options={'user_based': user_based})
        model.fit(trainset)
        fitted[user_based] = model

    user_neighbors = fitted[True].get_neighbors(0, k=10)
    item_neighbors = fitted[False].get_neighbors(0, k=10)
    assert user_neighbors != item_neighbors
def collaborative(self, ratings, user_id):
    """Return the known and SVD-estimated book ratings of one user.

    An SVD model is fitted on every rating in ``ratings`` and then used to
    estimate a rating for each (user, book) pair of ``user_id`` that is
    absent from the data.

    Args:
        ratings: DataFrame holding at least the ``user_id``, ``book_id`` and
            ``rating`` columns. It is not modified.
        user_id: Id of the target user, as stored in ``ratings['user_id']``.

    Returns:
        DataFrame with the ``book_id`` and ``rating`` columns: the user's
        existing rows followed by one estimated row per unrated book.
    """
    import pandas as pd  # local import keeps this block self-contained

    reader = Reader()
    data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']],
                                reader)

    ## Training the data ##
    # Reports RMSE / MAE on two folds before the final fit.
    # NOTE(review): Dataset.split() and evaluate() look like the legacy
    # Surprise API (later replaced by model_selection.cross_validate) --
    # confirm the installed Surprise version still provides them.
    data.split(n_folds=2)
    evaluate(SVD(), data, measures=['RMSE', 'MAE'])

    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)

    ## Testing the data ##
    # Only the target user's missing pairs are needed, so filter the
    # anti-testset before predicting instead of predicting for every user
    # and discarding almost all of the results afterwards.
    testset = [entry for entry in trainset.build_anti_testset()
               if entry[0] == user_id]
    predictions = algo.test(testset)

    known = ratings.loc[ratings['user_id'] == user_id, ['book_id', 'rating']]
    if not predictions:
        return known

    # Collect the estimates in one frame rather than growing ``ratings`` one
    # row at a time: row-wise ``.loc`` appends are quadratic, can overwrite
    # rows whose label already exists, and mutated the caller's DataFrame.
    # The new rows are labelled from len(ratings) + 1 onwards, as before.
    first_label = len(ratings) + 1
    estimated = pd.DataFrame(
        [(iid, est) for _, iid, _, est, _ in predictions],
        columns=['book_id', 'rating'],
        index=range(first_label, first_label + len(predictions)),
    )
    return pd.concat([known, estimated])
def test_gridsearchcv_refit(u1_ml100k):
    """Test refit function of GridSearchCV."""
    ratings_path = os.path.join(os.path.dirname(__file__),
                                './u1_ml100k_test')
    data = Dataset.load_from_file(ratings_path, Reader('ml-100k'),
                                  rating_scale=(1, 5))
    grid = {
        'n_epochs': [5],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6],
        'n_factors': [2],
    }

    def all_ratings_testset():
        # Testset made of every raw rating of ``data``.
        return data.construct_testset(data.raw_ratings)

    # refit=True: gs.fit() and gs.test() use the best estimator for mae
    # (first appearing in measures); refit='rmse': the one for rmse.
    for refit, measure in ((True, 'mae'), ('rmse', 'rmse')):
        gs = GridSearchCV(SVD, grid, measures=['mae', 'rmse'], cv=2,
                          refit=refit)
        gs.fit(data)
        search_preds = gs.test(all_ratings_testset())
        best_preds = gs.best_estimator[measure].test(all_ratings_testset())
        assert search_preds == best_preds

    # Test that predict() can be called (on the refit='rmse' search).
    gs.predict(2, 4)

    # Assert test() and predict() cannot be used when refit is false.
    gs = GridSearchCV(SVD, grid, measures=['mae', 'rmse'], cv=2,
                      refit=False)
    gs.fit(data)
    with pytest.raises(ValueError):
        gs.test(all_ratings_testset())
    with pytest.raises(ValueError):
        gs.predict('1', '2')

    # Test that error is raised if used with load_from_folds.
    gs = GridSearchCV(SVD, grid, measures=['mae', 'rmse'], cv=2,
                      refit=True)
    with pytest.raises(ValueError):
        gs.fit(u1_ml100k)
def test_randomizedsearchcv_refit(u1_ml100k):
    """Test refit method of RandomizedSearchCV class."""
    ratings_path = os.path.join(os.path.dirname(__file__),
                                './u1_ml100k_test')
    data = Dataset.load_from_file(ratings_path, Reader('ml-100k'),
                                  rating_scale=(1, 5))
    distributions = {
        'n_epochs': [5],
        'lr_all': uniform(0.002, 0.003),
        'reg_all': uniform(0.4, 0.2),
        'n_factors': [2],
    }

    def all_ratings_testset():
        # Testset made of every raw rating of ``data``.
        return data.construct_testset(data.raw_ratings)

    # refit=True: rs.fit() and rs.test() use the best estimator for mae
    # (first appearing in measures); refit='rmse': the one for rmse.
    for refit, measure in ((True, 'mae'), ('rmse', 'rmse')):
        rs = RandomizedSearchCV(SVD, distributions,
                                measures=['mae', 'rmse'], cv=2, refit=refit)
        rs.fit(data)
        search_preds = rs.test(all_ratings_testset())
        best_preds = rs.best_estimator[measure].test(all_ratings_testset())
        assert search_preds == best_preds

    # Test that predict() can be called (on the refit='rmse' search).
    rs.predict(2, 4)

    # Assert test() and predict() cannot be used when refit is false.
    rs = RandomizedSearchCV(SVD, distributions, measures=['mae', 'rmse'],
                            cv=2, refit=False)
    rs.fit(data)
    with pytest.raises(ValueError):
        rs.test(all_ratings_testset())
    with pytest.raises(ValueError):
        rs.predict('1', '2')

    # Test that error is raised if used with load_from_folds.
    rs = RandomizedSearchCV(SVD, distributions, measures=['mae', 'rmse'],
                            cv=2, refit=True)
    with pytest.raises(ValueError):
        rs.fit(u1_ml100k)
def test_deprecated_way():
    """Test all Dataset constructors without passing rating_scale as a
    parameter. Make sure we revert back to the Reader object, with a warning
    message.

    Also, make sure ValueError is raised if reader has no rating_scale in
    this context.

    Not using dataset fixtures here for more control.
    """
    module_dir = os.path.dirname(__file__)

    # test load_from_file
    toy_data_path = (os.path.dirname(os.path.realpath(__file__)) +
                     '/custom_dataset')
    toy_format = dict(line_format='user item rating', sep=' ', skip_lines=3)
    with pytest.warns(UserWarning):
        reader = Reader(rating_scale=(1, 5), **toy_format)
        Dataset.load_from_file(file_path=toy_data_path, reader=reader)
    with pytest.raises(ValueError):
        reader = Reader(rating_scale=None, **toy_format)
        Dataset.load_from_file(file_path=toy_data_path, reader=reader)

    # test load_from_folds
    folds = [(os.path.join(module_dir, './u1_ml100k_train'),
              os.path.join(module_dir, './u1_ml100k_test'))]
    ml_format = dict(line_format='user item rating timestamp', sep='\t')
    with pytest.warns(UserWarning):
        reader = Reader(rating_scale=(1, 5), **ml_format)
        Dataset.load_from_folds(folds, reader=reader)
    with pytest.raises(ValueError):
        reader = Reader(rating_scale=None, **ml_format)
        Dataset.load_from_folds(folds, reader=reader)

    # test load_from_df
    frame = pd.DataFrame({
        'itemID': [1, 1, 1, 2, 2],
        'userID': [9, 32, 2, 45, '10000'],
        'rating': [3, 2, 4, 3, 1],
    })
    with pytest.warns(UserWarning):
        reader = Reader(rating_scale=(1, 5))
        Dataset.load_from_df(frame[['userID', 'itemID', 'rating']],
                             reader=reader)
    with pytest.raises(ValueError):
        reader = Reader(rating_scale=None)
        Dataset.load_from_df(frame[['userID', 'itemID', 'rating']],
                             reader=reader)
def test_PredifinedKFold(toy_data_reader):
    """Check PredefinedKFold against the custom train/test files."""
    here = os.path.dirname(os.path.realpath(__file__))
    data = Dataset.load_from_folds(
        folds_files=[(here + '/custom_train', here + '/custom_test')],
        reader=toy_data_reader,
        rating_scale=(1, 5),
    )

    # Make sure rating files are read correctly.
    splitter = PredefinedKFold()
    trainset, testset = next(splitter.split(data))
    assert trainset.n_ratings == 6
    assert len(testset) == 3

    # Make sure pkf returns the same folds as the deprecated data.folds().
    with pytest.warns(UserWarning):
        _, legacy_testset = next(data.folds())
    assert legacy_testset == testset
def test_LeaveOneOut(toy_data):
    """Exercise LeaveOneOut: random_state, one rating per user in the
    testsets, and the min_n_ratings parameter."""

    def testsets_of(splitter):
        # Collect the testset of every fold produced on ``data``.
        return [fold_testset for _, fold_testset in splitter.split(data)]

    with pytest.raises(ValueError):
        # Each user only has 1 item so trainsets fail.
        next(LeaveOneOut().split(toy_data))

    data_path = (os.path.dirname(os.path.realpath(__file__)) +
                 '/u1_ml100k_test')
    data = Dataset.load_from_file(file_path=data_path,
                                  reader=Reader('ml-100k'),
                                  rating_scale=(1, 5))

    # Test random_state parameter.
    # If random_state is None, you get a different split each time
    # (conditioned by rng of course).
    loo = LeaveOneOut(random_state=None)
    first_run = testsets_of(loo)
    second_run = testsets_of(loo)
    assert first_run != second_run

    # Repeated calls to split when random_state is set lead to the same
    # folds.
    loo = LeaveOneOut(random_state=1)
    first_run = testsets_of(loo)
    second_run = testsets_of(loo)
    assert first_run == second_run

    # Make sure only one rating per user is present in the testset.
    for _, testset in LeaveOneOut().split(data):
        per_user = Counter([uid for (uid, _, _) in testset])
        assert all(count == 1 for count in itervalues(per_user))

    # Test the min_n_ratings parameter.
    for minimum in (5, 10):
        for trainset, _ in LeaveOneOut(min_n_ratings=minimum).split(data):
            assert all(len(ratings) >= minimum
                       for ratings in itervalues(trainset.ur))

    too_demanding = LeaveOneOut(min_n_ratings=10000)  # too high
    with pytest.raises(ValueError):
        next(too_demanding.split(data))
def test_performances():
    """Test the returned dict. Also do dumping."""
    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]
    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3)
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader,
                                   rating_scale=(1, 5))
    algo = NormalPredictor()

    tmp_dir = tempfile.mkdtemp()  # create tmp dir
    try:
        with pytest.warns(UserWarning):
            performances = evaluate(algo, data, measures=['RmSe', 'Mae'],
                                    with_dump=True, dump_dir=tmp_dir,
                                    verbose=2)
    finally:
        # Always remove the tmp dir, even when evaluate() raises or the
        # expected warning is missing; otherwise the directory leaks.
        shutil.rmtree(tmp_dir)

    # Measure names are looked up case-insensitively: differently-cased keys
    # must give back the very same object.
    assert performances['RMSE'] is performances['rmse']
    assert performances['MaE'] is performances['mae']
def test_build_anti_testset():
    """Check the fill value and the size of the anti-testset."""
    ids = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    frame = pd.DataFrame({
        'itemID': list(ids),
        'userID': list(ids),
        'rating': list(ids),
    })
    data = Dataset.load_from_df(frame[['userID', 'itemID', 'rating']],
                                rating_scale=(1, 5))
    with pytest.warns(UserWarning):
        data.split(2)
        trainset, _unused_testset = next(data.folds())

    # Fill with some specific value.
    for fill_value in (0, 42., -1):
        filled = trainset.build_anti_testset(fill=fill_value)
        assert all(rating == fill_value for (_, _, rating) in filled)

    # Fill with global_mean (fill=None).
    anti = trainset.build_anti_testset(fill=None)
    for (_, _, rating) in anti:
        assert rating == trainset.global_mean

    # Known ratings plus anti-testset entries must cover every
    # (user, item) pair of the trainset.
    all_pairs = trainset.n_users * trainset.n_items
    assert trainset.n_ratings + len(anti) == all_pairs
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# Example script: fit SVD on each predefined train/test fold and print the
# RMSE obtained on each of them.
# NOTE(review): `os` and `SVD` are used below but their imports are not
# visible in this part of the file -- confirm they are imported above.

# path to dataset folder
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader, rating_scale=(1, 5))
# PredefinedKFold yields one (trainset, testset) pair per entry of
# folds_files.
pkf = PredefinedKFold()

# The same algorithm object is re-fitted on every fold.
algo = SVD()

for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
'NormalPredictor': '[{}]({})'.format('Random', stable + 'basic_algorithms.html#surprise.prediction_algorithms.random_pred.NormalPredictor'), 'ml-100k': '[{}]({})'.format('Movielens 100k', 'http://grouplens.org/datasets/movielens/100k'), 'ml-1m': '[{}]({})'.format('Movielens 1M', 'http://grouplens.org/datasets/movielens/1m'), } # set RNG np.random.seed(0) random.seed(0) dataset = 'ml-1m' data = Dataset.load_builtin(dataset) kf = KFold(random_state=0) # folds will be the same for all algorithms. table = [] for klass in classes: start = time.time() out = cross_validate(klass(), data, ['rmse', 'mae'], kf) cv_time = str(datetime.timedelta(seconds=int(time.time() - start))) link = LINK[klass.__name__] mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse'])) mean_mae = '{:.3f}'.format(np.mean(out['test_mae'])) new_line = [link, mean_rmse, mean_mae, cv_time] print(tabulate([new_line], tablefmt="pipe")) # print current algo perf table.append(new_line)
#reading files df_ratings = pd.read_csv('input/ratings.csv') df_movies = pd.read_csv('input/movies.csv') df_ratings = df_ratings.drop(columns='timestamp') print(df_movies.head(5)) print(df_ratings.head(5)) #splitting data into train and test sets train_split, test_split = train_test_split(df_ratings, test_size=0.25, random_state=20) print("Training data size:", train_split.shape) print("Test data size:", test_split.shape) #reader for parsing the ratings file reader = Reader(rating_scale=(1, 5)) #building the train and test set, loading the data from dataframe train_build = Dataset.load_from_df(train_split, reader) test_build = Dataset.load_from_df(test_split, reader) trainset = train_build.build_full_trainset() testset = test_build.build_full_trainset().build_testset() print("Test set size:", len(testset)) #model building #takes in factors, epochs, learning rate and regularization parameter model = SVDpp(n_factors=20, n_epochs=5, lr_all=0.09, reg_all=0.5) model.fit(trainset) #making predictions predictions = model.test(testset) #calculating rmse accuracy.rmse(predictions, verbose=True) #Save all the predicted ratings and convert it to a dataframe all_recommendations_list = defaultdict(list) all_recommendations_df = pd.DataFrame([])
def _convert_data(self, follow_resp):
    """Turn a follow response into a Surprise dataset with 0/1 ratings.

    Every follow in ``follow_resp.results`` becomes a
    (follower, followee, 1) row. Randomly drawn pairs with no follow between
    them are then added as (follower, followee, 0) rows, up to as many zeros
    as there are ones.

    Returns:
        The result of ``Dataset.load_from_df`` on the
        (follower, followee, rating) columns, with a (0, 1) rating scale.
    """
    # Column data; rows must be (user_id, item_id, rating).
    followers = []
    followees = []
    labels = []
    follows = defaultdict(set)
    inverse_follows = defaultdict(set)
    for follow in follow_resp.results:
        # Do not have to worry about follow state, because even a rejected
        # follow is still a strong signal of interest by the followee.
        followers.append(follow.follower)
        followees.append(follow.followed)
        labels.append(1)
        follows[follow.follower].add(follow.followed)
        inverse_follows[follow.followed].add(follow.follower)

    # Now randomly put zeros in non-existing links in the network.
    # This is necessary as the problem is an example of PU-learning, where
    # we have no negative samples to "drag down" the recommendation
    # confidence. That's to say, without zeros, the model will never have
    # incentive not to recommend everyone, as it is never told that an
    # unsuitable recommendation is bad.
    self._logger.debug('Assigning zeros randomly.')

    # Everyone appearing on either side of a follow connection.
    all_users = tuple(set(follows) | set(inverse_follows))

    # We only want to add as many zeros as there are ones.
    zeros_wanted = len(follow_resp.results)
    zeros_added = 0

    # Bound the number of random draws so that a densely connected graph
    # (eg. a small instance where everyone follows everyone else) cannot
    # loop forever. The bound is (rather arbitrarily) the square of the
    # number of different users, i.e. the number of ways of choosing two
    # random users from the set of all users.
    attempts_left = len(all_users)**2

    while zeros_added < zeros_wanted and attempts_left > 0:
        attempts_left -= 1
        follower = random.choice(all_users)
        followed = random.choice(all_users)
        # Skip self-follows and follows that already exist.
        if follower == followed or followed in follows[follower]:
            continue
        followers.append(follower)
        followees.append(followed)
        labels.append(0)
        zeros_added += 1
        # We don't want to accidentally re-add this as another zero later,
        # so reuse the follow set to ensure that if this (follower,
        # followed) pair comes up again randomly, it is skipped.
        follows[follower].add(followed)

    frame = pd.DataFrame(data={
        'follower': followers,
        'followee': followees,
        'rating': labels,
    })
    return Dataset.load_from_df(frame[['follower', 'followee', 'rating']],
                                Reader(rating_scale=(0, 1)))
def test_trainset_testset(toy_data_reader):
    """Test the construct_trainset and construct_testset methods."""
    here = os.path.dirname(os.path.realpath(__file__))
    data = Dataset.load_from_folds(
        folds_files=[(here + '/custom_train', here + '/custom_test')],
        reader=toy_data_reader)

    splitter = PredefinedKFold()
    trainset, testset = next(splitter.split(data))

    # test ur
    user_ratings = trainset.ur
    assert user_ratings[0] == [(0, 4)]
    assert user_ratings[1] == [(0, 4), (1, 2)]
    assert user_ratings[40] == []  # not in the trainset

    # test ir
    item_ratings = trainset.ir
    assert item_ratings[0] == [(0, 4), (1, 4), (2, 1)]
    assert item_ratings[1] == [(1, 2), (2, 1), (3, 5)]
    assert item_ratings[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    expected_users, expected_items = 4, 2
    assert trainset.n_users == expected_users
    assert trainset.n_items == expected_items
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for inner in range(expected_users):
        assert trainset.to_inner_uid('user{}'.format(inner)) == inner
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')
    for inner in range(expected_items):
        assert trainset.to_inner_iid('item{}'.format(inner)) == inner
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw: the reverse mappings are still None here and are set
    # once to_raw_uid() / to_raw_iid() have been called.
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for inner in range(expected_users):
        assert trainset.to_raw_uid(inner) == 'user{}'.format(inner)
    for inner in range(expected_items):
        assert trainset.to_raw_iid(inner) == 'item{}'.format(inner)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    for known_pair in (('user0', 'item0'), ('user3', 'item1')):
        assert known_pair + (trainset.global_mean,) not in testset
    for missing_pair in (('user0', 'item1'), ('user3', 'item0')):
        assert missing_pair + (trainset.global_mean,) in testset
import numpy as np import pandas as pd from surprise import Dataset from surprise import KNNWithMeans from surprise import SVD from surprise.model_selection import KFold import warnings warnings.filterwarnings('ignore') import PredictedRating # 0. Data Load - Movie lens 1M data data = Dataset.load_builtin('ml-1m') kf = KFold(n_splits=5) sim_options = {'name': 'cosine', 'user_based': True} # 1. Precision & Recall & F1-measure class Precision_Recall_F1: def __init__(self, data, algo): self.data = data self.algo = algo def precision_recall_at_k(self, predictions, k=10, threshold=3.5): '''Return precision and recall at k metrics for each user.''' # First map the predictions to each user. user_est_true = defaultdict(list)
import pytest from surprise import KNNBasic from surprise import KNNWithMeans from surprise import KNNBaseline from surprise import Dataset from surprise import Reader from surprise.accuracy import neg_rmse from surprise.model_selection import cross_validate from surprise.model_selection import PredefinedKFold # the test and train files are from the ml-100k dataset (10% of u1.base and # 10 % of u1.test) train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train') test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test') data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k')) pkf = PredefinedKFold() def test_name_field(): """Ensure the name field is taken into account.""" sim_options = {'name': 'cosine'} algo = KNNBasic(sim_options=sim_options) rmse_cosine = cross_validate(algo, data, [['neg_rmse', neg_rmse]], pkf)['test_neg_rmse'] sim_options = {'name': 'msd'} algo = KNNBasic(sim_options=sim_options) rmse_msd = cross_validate(algo, data, [['neg_rmse', neg_rmse]], pkf)['test_neg_rmse']
import numpy as np import csv from surprise import Dataset, KNNBasic, SVD, SVDpp, BaselineOnly from surprise.model_selection import KFold, cross_validate from cf_models import EbcrMsdKNN, EbcrCosKNN, EbcrNormPccKNN, NormPcc, SW_Norm_PccKNN, SW_MSD_KNN, SW_COS_KNN, LS_MSD_KNN, LS_COS_KNN, LS_Norm_PccKNN __author__ = "Yu DU" # Datasets initialisation ml_100k = Dataset.load_builtin('ml-100k') ml_1m = Dataset.load_builtin('ml-1m') jester = Dataset.load_builtin('jester') # Split train and test set kf = KFold(random_state=0, n_splits=5) list_k = [5, 10, 20, 40, 60, 80, 100, 200] list_k2 = [5, 10, 15, 20, 25, 30, 35, 40] # The Ml-100k Dataset with open('results_ml100k_all.csv', mode='w') as result_file: fieldnames = ['k', 'algo', 'MAE', 'RMSE'] writer = csv.DictWriter(result_file, fieldnames=fieldnames) writer.writeheader() # SVD algo svd = SVD() out_svd = cross_validate(svd, ml_100k, ['rmse', 'mae'], kf,
def ubcf_eval(self, co_pe, df_path): kfold = input("Enter number of folds required to Evaluate:") reader = Reader(line_format="user item rating", sep=',', rating_scale=(1, 5)) df = Dataset.load_from_file(df_path, reader=reader) self.splitter(kfold, df) # SIMILARITY & ALGORITHM DEFINING sim_op = {'name': co_pe, 'user_based': True} algo = KNNBasic(sim_options=sim_op) # RESPONSIBLE TO EXECUTE DATA SPLITS MENTIONED IN STEP 4 start = time.time() perf = evaluate(algo, df, measures=['RMSE', 'MAE']) end = time.time() print_perf(perf) print "\nTotal Time elapsed =", (end - start) print "Average time per fold =", (end - start) / kfold, "\n" print perf ds = pd.read_csv("pred_matrix-full_ubcf.csv") confusion_matrix = np.matrix(ds) FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix) FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix) TP = np.diag(confusion_matrix) TN = confusion_matrix.sum() - (FP + FN + TP) # Sensitivity, hit rate, recall, or true positive rate TPR = TP / (TP + FN) # Specificity or true negative rate TNR = TN / (TN + FP) # Precision or positive predictive value PPV = TP / (TP + FP) # Negative predictive value NPV = TN / (TN + FN) # Fall out or false positive rate FPR = FP / (FP + TN) # False negative rate FNR = FN / (TP + FN) # False discovery rate FDR = FP / (TP + FP) # Overall accuracy ACC = (TP + TN) / (TP + FP + FN + TN) print "\nTrue Positive:\n", TP, "\n\nTrue Negative\n", TN, "\n\nFalse Positive\n", FP, "\n\nFalse Negative\n", FN print "-" * 30 print "\nTrue Postive Ratio =", TPR, "\n\nFalse Positive Ratio =", FPR print "-" * 30 print "*" * 20 print confusion_matrix print "Accuracy with current Algorithm", algo, "is ", ACC.mean(axis=0)
'/Users/ronlitman/Ronlitman/University/Statistic/שנה א׳ - סמט׳ א׳/למידה סטטיסטית/Netflix/df_join.csv' ) def hide_y(df, size=0.2): pass # df = get_full_df() df = pd.read_csv( '/Users/ronlitman/Ronlitman/University/Statistic/שנה א׳ - סמט׳ א׳/למידה סטטיסטית/Netflix/df_join.csv' ) reader = Reader(rating_scale=(0.5, 5.0)) data = Dataset.load_from_df(df[['uid', 'iid', 'rating']], reader) # Split data into 5 folds print('Split data into 5 folds') data.split(n_folds=5) # svd print('SVD') algo = SVD() evaluate(algo, data, measures=['RMSE']) # Retrieve the trainset. trainset = data.build_full_trainset() algo.train(trainset)
def parameter_tuning():
    """Cross-validate KNNWithMeans over a grid of ``k`` and ``min_k`` values.

    After deciding to use the KNNWithMeans algorithm our next step is to tune
    its parameters to further increase its accuracy. There are three
    parameters we can tune:

    (1) The similarity options, in particular which option we use for
        computing the similarity matrix. [*]
    (2) The min_k parameter.
    (3) The k parameter.

    1. Sim options: We can decide between using the standard (naive) cosine
       similarity, pearson correlation (centred cosine similarity) or MSD
       (mean squared differences). Since the pearson similarity outperforms
       the others we stick to it.
    2. The min_k parameter: The minimum number of neighbors to take into
       account for computing the weighted adjusted ratings. If fewer than
       min_k neighbors are available, meaning that not enough games with a
       similarity >= 0 have been rated by the user, the prediction is equal
       to the average rating for the particular game.
    3. The k parameter: The maximum number of neighbors to take into account
       for computing the weighted adjusted ratings. In our case the k games
       rated by the target user that are most similar to the game we are
       trying to predict. We focus on this parameter below.

    More information can be found here:
    https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans

    As a result we chose the following parameters in our production
    environment:
    (1) Pearson correlation (centred cosine similarity)
    (2) k = 40
    (3) min_k = 5

    [*] Actually the similarity options include another parameter that
    determines whether we use item-item or user-user similarities. Since we
    already distinguished between the two in our benchmarking we focus purely
    on which approach to use for computing the similarity matrix here.

    Side effects:
        Reads the reduced review CSV from a fixed local path and prints one
        summary line per (k, min_k) combination.
    """
    # import reduced dataset:
    df = import_all_reviews('C:/Users/lukas/PycharmProjects/board-game-recommender/import/Data/Joined/Results/Reviews_Reduced_SMALL.csv')

    # drop duplicates (deduplicate once and derive the count from it):
    deduplicated = df.drop_duplicates(subset=['game_key', 'user_key'])
    duplicates = len(df) - len(deduplicated)
    df = deduplicated
    print('duplicates removed: ' + str(duplicates))

    ## Surprise:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    sim_option = {'name': 'pearson', 'user_based': False}

    # try out different parameters for k:
    k_parameter = list(range(10, 200, 10))
    min_k_parameter = [1, 5, 10]

    # Cross validate, keeping each result next to the parameters that
    # produced it so the report needs no index arithmetic:
    results = []
    for k in k_parameter:
        for min_k in min_k_parameter:
            algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)
            result = cross_validate(algo, data, measures=['RMSE'], cv=5,
                                    return_train_measures=True, n_jobs=-3,
                                    verbose=True)
            results.append((k, min_k, result))

    # Print results ('test_time' is the prediction time, so label it as such):
    for k, min_k, result in results:
        print('k = ' + str(k) + '\t \t' + 'min_k = ' + str(min_k)
              + '\t \t RMSE Score: \t' + str(result['test_rmse'].mean())
              + '\t\t Fit-Time: ' + str(result['fit_time'])
              + '\t\t Test-Time: ' + str(result['test_time']))
df = pd.DataFrame(r)
df.columns = ['mem_id', 'res_id', 'rating']

# Extract the (mem_id, res_id) pairs that have no rating.
from numpy import nan
R_ori = df.pivot_table('rating', index='mem_id', columns='res_id')
# NaN never compares equal to anything, so an `== nan` mask is all False and
# would keep every pair. Go to long format first and keep only the rows
# whose rating is actually missing.
zero_maxtrix = (
    R_ori.reset_index()
    .melt('mem_id', var_name='res_id', value_name='rating')
    .loc[lambda long_form: long_form['rating'].isna(), ['mem_id', 'res_id']]
    .reset_index(drop=True)
)

# Train on the loaded data.
from surprise import Reader
reader = Reader(rating_scale=(0.01, 5))
data = Dataset.load_from_df(df[['mem_id', 'res_id', 'rating']], reader=reader)
trainset = data.build_full_trainset()
algo = SVD(n_epochs=20, n_factors=50, random_state=0)
algo.fit(trainset)

# Store the current time.
now = datetime.datetime.now()
formattedDate = now.strftime("%Y%m%d_%H%M%S")
print(formattedDate)

# Create a temp table named after the current time. The suffix comes from
# strftime above (digits and an underscore), not from external input.
sql1 = """CREATE TABLE res_recommend_svd_{}( res_id INT, pred_rating Float, member_id INT)""".format(formattedDate)
def execute(self, params, **kwargs):
    """Load the built-in MovieLens-100k dataset and keep it on the instance.

    ``params`` and ``kwargs`` are accepted but not read by this method.
    The dataset is stored under the ``"data"`` key of
    ``self.marvin_initial_dataset``.
    """
    # load_builtin fetches the movielens-100k data (downloading it if needed)
    movielens = Dataset.load_builtin('ml-100k')
    initial_dataset = {"data": movielens}
    self.marvin_initial_dataset = initial_dataset
if self.trainset.knows_user(u) and self.trainset.knows_item(i): return np.dot(self.p[u], self.q[i]) else: return self.trainset.global_mean timex = [] mem = [] m1 = psutil.virtual_memory().percent #For 100 record dataset start = time.time() df1 = pd.read_csv('C:/Users/dell pc/Desktop/Project/ratings_1million1.csv', dtype={'rating': float}) reader = Reader(rating_scale=(1, 5)) data = Dataset.load_from_df(df1[['user_id', 'book_id', 'rating']], reader) data.split(2) algo = MatrixFacto(learning_rate=.01, n_epochs=10, n_factors=10) result1 = surprise.evaluate(algo, data, measures=['RMSE']) end = time.time() print("Time1", end - start) timex.append(end - start) m2 = psutil.virtual_memory().percent #print(m2) mem.append(m2) #For 1000 record dataset start = time.time() df2 = pd.read_csv('C:/Users/dell pc/Desktop/Project/ratings_1million2.csv', dtype={'rating': float}) reader = Reader(rating_scale=(1, 5))
def get_top_n(predictions, n=10):
    """Return, per user id, the ``n`` (item id, estimate) pairs with the highest estimate.

    Args:
        predictions: Iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        n: How many pairs to keep for each user.

    Returns:
        A ``defaultdict(list)`` mapping each uid to its pairs, best first.
    """
    # Group every (item, estimate) pair under its user.
    per_user = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, est))

    # Order each user's pairs by estimate, highest first, and truncate.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]
    return per_user


# First train an SVD algorithm on the dataset.
# NOTE(review): load_from_file is called without a reader argument here --
# confirm this matches the installed Surprise API.
data = Dataset.load_from_file('df')
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)
top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    recommended_ids = [iid for (iid, _) in user_ratings]
    print(uid, recommended_ids)

#------------------- To compute precision@k and recall@k using surprise-----------------------------------
import pickle

import pandas as pd
from surprise import SVD, Reader, Dataset

print('Now offline-train Amazon')
df = pd.read_csv('./exp_data/amazon_exp.csv')
reader = Reader()
data = Dataset.load_from_df(
    df=df[['user_id', 'item_id', 'rating']],
    reader=reader,
    rating_scale=(1, 5))
train_set = data.build_full_trainset()

# Each row is unpacked into four fields; only the first three are kept.
raw_ratings = [(uid, iid, float(r))
               for (uid, iid, r, time) in df.itertuples(index=False)]

# Assign consecutive integer ids to raw user / item ids in order of first
# appearance.
raw2inner_id_users = {}
raw2inner_id_items = {}
current_u_index = 0
current_i_index = 0
for urid, irid, r in raw_ratings:
    try:
        uid = raw2inner_id_users[urid]
    except KeyError:
        uid = current_u_index
        raw2inner_id_users[urid] = current_u_index
        current_u_index += 1
    try:
        iid = raw2inner_id_items[irid]
    except KeyError:
        # Only a missing key means "new item"; a bare except would also hide
        # unrelated failures such as KeyboardInterrupt.
        iid = current_i_index
        raw2inner_id_items[irid] = current_i_index
        current_i_index += 1

# Reverse lookup: integer id -> raw user id.
user_dict = {val: key for key, val in raw2inner_id_users.items()}
from util import *

(user, book, user_test, book_test, rate,
 user_all, book_all, user_dict, book_dict) = read_data()

# Build the dataframe. Column names are irrelevant.
ratings_dict = {'itemID': book, 'userID': user, 'rating': rate}
df = pd.DataFrame(ratings_dict)

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 10))

# The columns must correspond to user id, item id and ratings (in that order).
rating_columns = df[['userID', 'itemID', 'rating']]
data = Dataset.load_from_df(rating_columns, reader)
del rating_columns

# Models: algos_name[i] is the label of algos[i].
bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 1, 'reg_i': 5}
algos = [BaselineOnly(bsl_options=bsl_options)]
algos_name = ['BS_ALS']

algos_name.append('BS_SGD')
def test_wrong_file_name():
    """Check that loading folds from non-existent files raises ValueError."""
    missing_fold = ('does_not_exist', 'does_not_either')
    wrong_files = [missing_fold]
    with pytest.raises(ValueError):
        Dataset.load_from_folds(reader=Reader(), folds_files=wrong_files)
def test_trainset_testset(toy_data_reader):
    """Test the construct_trainset and construct_testset methods.

    Loads the ``custom_train`` / ``custom_test`` fold that sits next to this
    file, takes the first (trainset, testset) pair and checks the trainset's
    rating containers, counts, id conversions, and the output of
    ``build_testset()`` and ``build_anti_testset()``.

    Args:
        toy_data_reader: Fixture supplying the reader used to parse the
            fold files.
    """

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader,
                                   rating_scale=(1, 5))

    # data.folds() is expected to emit a UserWarning when called.
    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unkown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unkown_item')

    # test inner2raw
    # Order matters: the reverse maps must still be None here, and must be
    # populated once to_raw_uid / to_raw_iid have been called below.
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    # Pairs asserted present above must be absent from the anti-testset; the
    # entries checked for presence carry trainset.global_mean as their rating.
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset
# writing data to file
from surprise import dump
from collections import defaultdict

## Load user reviews and keep the column slice from 'user' to '******'
df_users = pd.read_csv(
    r'D:\data science\nutrition\epi_reviews_75plus_w_usernames.csv',
    index_col=0,
).loc[:, 'user':'******']

# formalize rating scale (for centered ratings it would be (-3, 3))
reader = Reader(rating_scale=(1, 4))

# put data into surprise format
data = Dataset.load_from_df(df_users, reader)
print(get_methods(data))

# Do a Grid Search for different hyperparameter values (earlier I tried this
# using only users with at least 8 ratings and the defaults were best for
# n_epochs, lr_all and reg_all, so I will fix them here and vary n_factors):
# # Note that the handbook suggests using different lrs for different params
# param_grid = {'n_factors': [10, 15, 20]}
# gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=5)
# gs.fit(data)
# # best RMSE score
# print(gs.best_score['rmse'])
one_data = one_data + user_info_dic[a_data[0]] + busi_info_dic[ a_data[1]] + [float(a_data[2])] new_test_data.append(one_data) new_test_data = pd.DataFrame(new_test_data) new_train_only_data = new_train_data.iloc[:, 0:4] new_train_label = new_train_data.iloc[:, 4] new_test_only_data = new_test_data.iloc[:, 0:4] clf = LinearRegression().fit(new_train_only_data, new_train_label) y_pre = clf.predict(new_test_only_data) linear_prediction = [] for i in range(len(y_pre)): all_info = [test_data_get[i][0]] + [test_data_get[i][1]] + [y_pre[i]] linear_prediction.append(all_info) ####################################surprise###################################### surprise_reader = Reader(line_format='user item rating', sep=',', skip_lines=1) surprise_train = Dataset.load_from_file(input_file, reader=surprise_reader) surprise_train = surprise_train.build_full_trainset() surprise_test_data = sc.parallelize(test_data_get).map( lambda s: (s[0], s[1], float(s[2]))).collect() params = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5} surprise_formula = BaselineOnly(bsl_options=params) surprise_formula.fit(surprise_train) surprise_predict = surprise_formula.test(surprise_test_data) surprise_prediction = [] for i in range(len(surprise_predict)): surprise_prediction.append([ surprise_predict[i][0], surprise_predict[i][1], surprise_predict[i][3] ]) ################################SVD######################################## from surprise import SVD svd_surprise = SVD(n_epochs=30, lr_all=0.008, reg_all=0.2)
def SVD_surprise_only(Trainset, N=30):
    """Recommend games per member from SVD-based game-to-game similarity.

    An SVD model is fitted on the (member, game, score) triples. The cosine
    similarity between the games' columns of ``pu . qi^T`` is computed in
    tiles of 200 games. For every member, each game is scored by its highest
    similarity to any game the member has a row for, and the ``N``
    best-scoring games are selected.

    Args:
        Trainset: DataFrame with ``Member_encoding``, ``Game_encoding`` and
            ``score`` columns.
        N: Number of neighbour games selected per member.

    Returns:
        A tuple ``(SVD_Neighbor, SVD_Neighbor_result)``. ``SVD_Neighbor``
        holds ``N`` rows per member. ``SVD_Neighbor_result`` is that frame
        left-joined with the known scores (missing scores set to 0), sorted
        by member and score descending, and cut to 12 rows per member.
    """
    reader = Reader()
    Trainset_changetype = Dataset.load_from_df(
        Trainset[['Member_encoding', 'Game_encoding', 'score']], reader)
    Trainset_changetype_result = Trainset_changetype.build_full_trainset()
    svd = SVD(n_factors=20, n_epochs=20, lr_all=0.01, random_state=1234)
    svd.fit(Trainset_changetype_result)

    # Unique games, in order of first appearance.
    # NOTE(review): assumes row i of svd.qi belongs to games[i] -- confirm.
    games = list(Trainset.Game_encoding.unique())
    n_games = len(games)
    x = np.zeros([n_games, n_games])

    # Fill the similarity matrix tile by tile. Stepping range() by the tile
    # size never yields an empty trailing tile (the former round(n / 200)
    # bounds did whenever n was a multiple of 200 or n % 200 exceeded 100,
    # which made cosine_similarity fail); slicing clips the last tile.
    tile = 200
    for row_start in range(0, n_games, tile):
        row_stop = row_start + tile
        # Depends on the row tile only, so compute it once per row tile.
        cut0 = np.dot(svd.pu, np.transpose(svd.qi[row_start:row_stop, :]))
        for col_start in range(0, n_games, tile):
            col_stop = col_start + tile
            cut1 = np.dot(svd.pu, np.transpose(svd.qi[col_start:col_stop, :]))
            x[row_start:row_stop, col_start:col_stop] = cosine_similarity(
                np.transpose(cut0), np.transpose(cut1))

    # model SVD_New
    cosine_sim_x = pd.DataFrame(data=x, index=games, columns=games)
    gamesplayed = Trainset.groupby(
        ['Member_encoding'])['Game_encoding'].apply(list).reset_index(name='games')
    # Per member: for every game, the best similarity to any listed game.
    gamesmax = np.array(
        gamesplayed.games.map(
            lambda played: cosine_sim_x.loc[played, :].values.max(axis=0)))
    gamelist = np.array(cosine_sim_x.columns)

    def Get_neighbor_30(x):
        # Indices sorted by descending similarity, truncated to N games.
        return gamelist[np.flip(np.argsort(x, axis=0))[0:N, ]]

    filtered = list(map(Get_neighbor_30, gamesmax))
    filtered_array = np.array(filtered).reshape(-1)

    # np.unique and groupby both order members ascending, so the repeated
    # member ids line up with the flattened neighbour lists.
    SVD_Neighbor = pd.DataFrame({
        'Member_encoding':
        np.repeat(np.array(np.unique(Trainset.Member_encoding)), N, axis=0),
        'Game_encoding':
        filtered_array
    })

    SVD_Neighbor_result = SVD_Neighbor.merge(
        Trainset[['Member_encoding', 'Game_encoding', 'score']],
        how='left',
        on=['Member_encoding', 'Game_encoding'])
    # Games without a known score get 0.
    SVD_Neighbor_result['score'] = SVD_Neighbor_result['score'].fillna(0)
    SVD_Neighbor_result = SVD_Neighbor_result.sort_values(
        by=['Member_encoding', 'score'], ascending=False)
    SVD_Neighbor_result = SVD_Neighbor_result.groupby('Member_encoding').head(12)
    return SVD_Neighbor, SVD_Neighbor_result
# Number of ratings per product, most-rated first; plot the top 30.
rating_counts = new_df.groupby('productId')['Rating'].count()
popular_products = rating_counts.to_frame()
del rating_counts
most_popular = popular_products.sort_values('Rating', ascending=False)
top_thirty = most_popular.head(30)
top_thirty.plot(kind="bar")
del top_thirty
plt.show()

from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split

# Reading the dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(new_df, reader)

# Splitting the dataset (30% of it goes to the test set, fixed seed)
trainset, testset = train_test_split(data, random_state=10, test_size=0.3)

# Use user_based true/false to switch between user-based or item-based
# collaborative filtering
algo = KNNWithMeans(
    k=5,
    sim_options={'name': 'pearson_baseline', 'user_based': False},
)
algo.fit(trainset)

# run the trained model against the testset
test_pred = algo.test(testset)
print(test_pred)
# Minimal example: 5-fold cross-validation of SVD on MovieLens-100k.
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results (verbose=True) for the
# RMSE and MAE measures.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
# Reference https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD
import pandas as pd
import numpy as np
from surprise import SVD
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import cross_validate

# Comma-separated "user item rating" lines, one header line, ratings 1..5.
reader = Reader(
    line_format='user item rating',
    sep=',',
    rating_scale=(1, 5),
    skip_lines=1,
)

## Load the training set into surprise's custom dataset object
train_set = Dataset.load_from_file(
    'data_movie_lens_100k/ratings_train.csv', reader=reader
).build_full_trainset()

## Load the test set into surprise's custom dataset object
## (Need to use intermediate pandas DataFrame because the true ratings are missing)
test_df = pd.read_csv('data_movie_lens_100k/ratings_test_masked.csv')
test_set = (
    Dataset.load_from_df(test_df, reader=reader)
    .build_full_trainset()
    .build_testset()
)

# Use the SVD algorithm
for n_factors in [1]:
    ## Fit model to training set
    model = SVD(n_factors=n_factors)
    model.fit(train_set)
    ## Measure predictions on train set
# get predictions based on training set testSet = trainingSet.build_anti_testset() testPredictions = knn.test(testSet) top3_recommendations = get_top_recommendations(testPredictions) print_recommendations(top3_recommendations) def parse_input(input): return input def recommend(input): return top3_recommendations[input['uid']] legion.model.export( recommend, { 'uid': legion.model.int32 }) legion.model.save() recommendation_example = recommend({'uid': 1}) print(repr(recommendation_example)) # Additional memory workload file_path = (os.path.expanduser('~') + '/.surprise_data/ml-100k/ml-100k/u.data') reader = Reader(line_format='user item rating timestamp', sep='\t', skip_lines=0) data = Dataset.load_from_file(file_path, reader=reader)
"""
This module describes how to load a dataset from a pandas dataframe.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise.model_selection import cross_validate


# Creation of the dataframe. Column names are irrelevant.
# Note that user ids mix integers with one string ('user_foo').
ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                'userID': [9, 32, 2, 45, 'user_foo'],
                'rating': [3, 2, 4, 3, 1]}
df = pd.DataFrame(ratings_dict)

# The columns must correspond to user id, item id and ratings (in that order).
# The rating scale is passed directly to load_from_df here.
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                            rating_scale=(1, 5))

# We can now use this dataset as we please, e.g. calling cross_validate
# (two folds, since there are only five ratings).
cross_validate(NormalPredictor(), data, cv=2)
# Example: hold-out evaluation of SVD on MovieLens-100k.
from surprise import SVD
from surprise import Dataset, accuracy
from surprise.model_selection import cross_validate,train_test_split

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)

# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

# Run 5-fold cross-validation and print results.
#cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Imports for the next part of the script (Dataset and cross_validate are
# imported a second time here).
import pandas as pd
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
def hybrid(userId,train_rd):
    """Build a hybrid (collaborative + popularity + content) ranking for one user.

    Three scores are computed per book and merged on ``book_id``:

    * ``rating``: for rows appended below this is the SVD estimate for
      ``userId`` on books absent from the training slice; rows already in
      that slice keep their original rating.
    * ``popularity_rating``: a weighted rating built from the rating counts
      and average ratings read from CSV files.
    * ``content_rating``: a score from the author/genre cosine-similarity
      matrix and the user's rating profile.

    The final score is ``0.4*rating + 0.2*popularity_rating + 0.4*content_rating``.

    Args:
        userId: Id of the user to rank books for.
        train_rd: Ratings DataFrame with ``user_id``, ``book_id`` and
            ``rating`` columns; only its first 1000 rows feed the
            collaborative part.

    Returns:
        DataFrame of at most 999 rows sorted by the ``final`` column,
        highest first.

    Side effects:
        Reads ``CustomData/FinalData.csv``, ``avg_ratings1.csv`` and
        ``ratings_count.csv``; prints the merged frame before and after
        scoring; silences warnings globally; casts columns of ``train_rd``
        in place.
    """
    #get_ipython().magic('matplotlib inline')
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD, evaluate

    import warnings; warnings.simplefilter('ignore')

    # In[2]:
    #Popularity#

    md = pd.read_csv('CustomData/FinalData.csv')

    fd = pd.read_csv('avg_ratings1.csv')

    # NOTE(review): this assigns into the result of a boolean-mask selection
    # (chained assignment); it presumably does not modify `fd` -- confirm.
    fd[fd['rating'].notnull()]['rating'] = fd[fd['rating'].notnull()]['rating'].astype('float')
    vote_averages= fd[fd['rating'].notnull()]['rating']
    # C: mean of the non-null average ratings.
    C = vote_averages.mean()

    fd1 = pd.read_csv('ratings_count.csv')

    # NOTE(review): same chained-assignment pattern as above -- confirm.
    fd1[fd1['rating'].notnull()]['rating'] = fd1[fd1['rating'].notnull()]['rating'].astype('float')
    vote_counts = fd1[fd1['rating'].notnull()]['rating']

    # In[3]:
    # m: 75th percentile of the rating counts.
    m = vote_counts.quantile(0.75)

    # In[4]:
    # Assigned by index alignment between md and the two CSV frames.
    md['ratings_count'] = fd1['rating']
    md['average_rating'] = fd['rating']

    # In[28]:
    #print(md.shape)
    qualified = md[(md['ratings_count'].notnull())][['book_id','title', 'authors', 'ratings_count', 'average_rating']]

    qualified['ratings_count'] = qualified['ratings_count'].astype('float')

    qualified['average_rating'] = qualified['average_rating'].astype('float')

    #qualified.shape

    # In[29]:
    def weighted_rating(x):
        # Blend the book's own average R with the overall mean C, weighted
        # by its rating count v relative to the threshold m.
        v = x['ratings_count']
        R = x['average_rating']
        return (v/(v+m) * R) + (m/(m+v) * C)

    # In[30]:
    qualified['popularity_rating'] = qualified.apply(weighted_rating, axis=1)
    #qualified['wr']
    #qualified = qualified.sort_values('popularity_rating', ascending=False).head(250)
    pop = qualified[['book_id','popularity_rating']]
    #print(qualified.shape)
    #print(pop.shape)

    # In[11]:
    ### Collaborative ##

    reader = Reader()
    ratings=train_rd
    #ratings = pd.read_csv('ratings.csv')
    #ratings.head()

    # Only the first 1000 ratings are used for the collaborative model.
    temp_ratings = ratings[0:1000]

    #print(temp_ratings)
    data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)
    data.split(n_folds=2)

    # In[12]:
    svd = SVD()
    evaluate(svd, data, measures=['RMSE', 'MAE'])

    # In[13]:
    # A second SVD instance is fitted on the full trainset and used below.
    trainset = data.build_full_trainset()
    #svd.train(trainset)
    algo = SVD()
    algo.fit(trainset)

    ## usefule = temp_rating[rating]

    # In[14]:
    #print(len(temp_ratings[temp_ratings['user_id']==userId]))

    # In[ ]:
    def get_top_n(predictions, n=10):
        '''Return the top-N recommendation for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendation to output for each user. Default
                is 10.

        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        '''

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        # NOTE(review): the sort is commented out, so this keeps the first n
        # entries in input order rather than the n highest -- confirm intent.
        for uid, user_ratings in top_n.items():
            #user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n

    # In[15]:
    from collections import defaultdict
    # Predict every (user, book) pair absent from the trainset.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    # The string literal below is disabled code kept as a bare expression.
    ''' top_n = get_top_n(predictions, n=10000) #print(top_n) #result = pd.DataFrame(top_n) #print(result) for uid, user_ratings in top_n.items(): #print(uid, [iid for (iid , _) in user_ratings]) for uid, iid, true_r, est, _ in predictions: temp_ratings.loc[uid]= [uid,iid,est] #temp_ratings[i]['cf'] = temp_ratings[(temp_ratings['user_id'] == uid)][['book_id']] '''
    # Append the estimates for the requested user as new rows of temp_ratings.
    # NOTE(review): the row label is len(temp_ratings)+1; whether it can
    # collide with an existing label depends on train_rd's index -- confirm.
    count = 0
    for uid, iid, true_r, est, _ in predictions:
        if uid == userId:
            count = count+1
            temp_ratings.loc[len(temp_ratings)+1]= [uid,iid,est]
            #print('here')
            #print(uid)
            #temp_ratings.append([uid,iid,est],ignore_index=True)

    #print(count)
    #print(temp_ratings)

    # In[16]:
    #print(len(temp_ratings[temp_ratings['user_id']==2]))

    # In[ ]:

    # In[46]:
    ##### CONTENT ######

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD, evaluate
    import csv
    import warnings; warnings.simplefilter('ignore')

    # In[48]:
    # md is re-read here, discarding the columns added in the popularity part.
    md=pd.read_csv('CustomData/FinalData.csv')

    # rd aliases train_rd, so the casts below modify the caller's frame.
    rd=train_rd
    #rd=pd.read_csv('ratings.csv')
    md['book_id'] = md['book_id'].astype('int')
    rd['book_id'] = rd['book_id'].astype('int')
    rd['user_id'] = rd['user_id'].astype('int')
    rd['rating'] = rd['rating'].astype('int')

    #print(md.head())

    # Normalise authors: drop spaces, lowercase, turn commas into spaces.
    md['authors'] = md['authors'].str.replace(' ','')
    md['authors'] = md['authors'].str.lower()
    md['authors'] = md['authors'].str.replace(',',' ')

    #print(md.head())

    # Each author string is listed twice in the "soup" built below.
    md['authors'] = md['authors'].apply(lambda x: [x,x])
    #print(md['authors'])

    md['Genres']=md['Genres'].str.split(';')
    #print(md['Genres'])

    md['soup'] = md['authors'] + md['Genres']
    #print(md['soup'])

    md['soup'] = md['soup'].str.join(' ')

    #md['soup'].fillna({})
    #print(md['soup'])

    # `count` is rebound here from the integer counter to the vectorizer.
    count = CountVectorizer(analyzer='word',ngram_range=(1,1),min_df=0, stop_words='english')
    count_matrix = count.fit_transform(md['soup'])
    #print (count_matrix.shape)
    #print np.array(count.get_feature_names())
    #print(count_matrix.shape)

    cosine_sim = cosine_similarity(count_matrix, count_matrix)

    # In[91]:
    def build_user_profiles():
        # Dense (user, book) rating table with hard-coded dimensions, filled
        # from the first 1000 rows of rd.
        # NOTE(review): presumably assumes user ids < 53421 and book ids in
        # 1..999 -- confirm against the data.
        user_profiles=np.zeros((53421,999))
        #print(rd.iloc[0]['user_id'])
        #len(rd['book_id'])
        for i in range(0,1000):
            u=rd.iloc[i]['user_id']
            b=rd.iloc[i]['book_id']
            #print(u,b)
            #print(i)
            #if b<999:
                #print("match at "+str(b))
            user_profiles[u][b-1]=rd.iloc[i]['rating']
        #print(user_profiles)
        return user_profiles

    user_profiles=build_user_profiles()
    def _get_similar_items_to_user_profile(person_id):
            #Computes the cosine similarity between the user profile and all item profiles
            #print(user_profiles[person_id])
            #print("\n---------\n")
            #print(cosine_sim[0])
            # NOTE(review): np.empty leaves row 998 uninitialised because both
            # loops stop at index 997, yet max() below covers all 999 rows --
            # confirm whether range(0,999) was intended.
            user_ratings = np.empty((999,1))
            cnt=0
            for i in range(0,998):
                book_sim=cosine_sim[i]
                user_sim=user_profiles[person_id]
                # Similarity-weighted average of the user's ratings.
                user_ratings[i]=(book_sim.dot(user_sim))/sum(cosine_sim[i])
            maxval = max(user_ratings)
            #print(maxval)

            # Rescale so that the largest score maps to 5.0.
            for i in range(0,998):
                user_ratings[i]=((user_ratings[i]*5.0)/(maxval))
                #print(user_ratings[i])
                if(user_ratings[i]>3):
                    #print("MILA KUCCHHH")
                    cnt+=1
            #print(max(user_ratings))
            #print (cnt)
           #print(cosine_similarities)

            #return similar_items
            return user_ratings
    content_ratings = _get_similar_items_to_user_profile(userId)

    # In[100]:
    num = md[['book_id']]
    #print(num)
    num1 = pd.DataFrame(data=content_ratings[0:,0:])
    frames = [num, num1]
    #result = pd.concat([df1, df4], axis=1, join_axes=[df1.index])

    # NOTE(review): `join_axes` was removed from pd.concat in pandas 1.0 --
    # confirm the pinned pandas version.
    mer = pd.concat(frames, axis =1,join_axes=[num.index])
    mer.columns=['book_id', 'content_rating']
    #print(mer.shape)
    #print('here')
    #print(mer)

    # In[102]:
    ## for user 2 #
    #print(temp_ratings.shape)
    # Rows of temp_ratings that belong to the requested user.
    cb = temp_ratings[(temp_ratings['user_id'] == userId)][['book_id', 'rating']]
    # print(cb.shape)
    # print(pop.shape)
    # Merge the three scores on book_id.
    hyb = md[['book_id']]
    hyb = hyb.merge(cb,on = 'book_id')
    hyb = hyb.merge(pop, on='book_id')
    hyb = hyb.merge(mer, on='book_id')
    #hyb.shape

    # In[106]:
    # Redefines weighted_rating with the hybrid blend (the popularity version
    # above has already been applied).
    def weighted_rating(x):
        v = x['rating']
        R = x['popularity_rating']
        c = x['content_rating']
        return 0.4*v + 0.2*R + 0.4 * c

    # In[107]:
    print(hyb)
    hyb['final'] = hyb.apply(weighted_rating, axis=1)
    hyb = hyb.sort_values('final', ascending=False).head(999)
    #print(hyb['final'])

    print(hyb)
    return hyb
"""# arrange dataset"""
# Mean rating per (user, author) pair, ordered by user then author.
data = rating_author.groupby(['User-ID', 'Author'])["Book-Rating"].agg(['mean']).reset_index()
data = data.sort_values(by=['User-ID', 'Author'])
data.columns = ["userID", "author", "raw_ratings"]

## binning raw_ratings into the classes 0, 1, 2 and 3
# NOTE(review): membership is tested against whole numbers only, so a
# fractional mean such as 4.5 lands in class 3 -- confirm this is intended.
data.raw_ratings = data.raw_ratings.apply(
    lambda x: 0 if x == 0
    else 1 if x in [1, 2, 3, 4]
    else 2 if x in [5, 6, 7]
    else 3
).astype("int")

"""# make dataset for surprise"""
# NOTE(review): the scale stays (0, 10) although the binned values are 0..3
# -- confirm.
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(data[["userID", "author", "raw_ratings"]], reader)

# Drop the large intermediates before training.
del user, item, rating, rating_author
gc.collect()

"""# train by surprise"""
kf = KFold(random_state=0, n_splits=3)
classes = (SVD, SVDpp, NMF, KNNBaseline, BaselineOnly, CoClustering)
for idx, klass in enumerate(classes):
    print(klass)
    for trainset, testset in kf.split(data):
        # train and test algorithm.
        algo = klass()
        algo.fit(trainset)
from surprise.model_selection import cross_validate

# Reader built with its default settings.
reader = Reader()
# Bare expressions left over from notebook cells; they display nothing when
# run as a script.
ratings.head()

# In[25]:
ratings.shape

# In[27]:
# Columns are passed in (user, item, rating) order.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
#data.split(n_folds=5)
svd = SVD()
# Cross-validate SVD on RMSE and MAE with the default cv setting.
cross_validate(svd, data, measures=['RMSE', 'MAE'])

# In[28]:
# Refit the same SVD instance on every available rating.
trainset = data.build_full_trainset()
svd.fit(trainset)

# In[29]:
def k_recommend(model, k, testset):
    """Print the mean known rating of each user's top-``k`` recommendations.

    The best MAE estimator of ``model`` is fitted on the predefined train
    fold. Every (user, item) pair absent from that fold is scored and the
    ``k`` highest estimates per user are kept. Each estimate is then replaced
    by the rating found for that item in ``testset[uid]``, or by 2 when the
    item is not listed there. Finally the per-user means are averaged.

    Args:
        model: Object exposing ``best_estimator['mae']``.
        k: Number of recommendations kept per user.
        testset: Mapping from user id to a sequence of entries whose element
            0 is an item id and element 1 its rating.

    Side effects:
        Reads the train/test CSV files from fixed paths and prints the
        average rating.
    """
    reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
    fold_files = [('~/Desktop/Tufts/Fall2018/COMP135/Project3/trainset.csv',
                   '~/Desktop/Tufts/Fall2018/COMP135/Project3/testset.csv')]
    pdkfold = sp.model_selection.split.PredefinedKFold()
    clf = model.best_estimator['mae']
    data = Dataset.load_from_folds(fold_files, reader=reader)

    # A single fold is listed, so this runs once; `preds` keeps the
    # predictions of the last fold.
    for train, _test in pdkfold.split(data):
        clf.fit(train)
        preds = clf.test(train.build_anti_testset())

    # Group the estimates per user and keep the k highest ones.
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _ in preds:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:k]

    # Swap each estimate for the known rating (first match wins), or for the
    # fallback 2. Assigning by position avoids removing from and inserting
    # into the list while it is being iterated.
    for uid, recommended in top_n.items():
        for pos, (iid, _est) in enumerate(recommended):
            rating = next(
                (entry[1] for entry in testset[uid] if iid == str(entry[0])),
                2)
            recommended[pos] = (iid, rating)

    # Mean rating per user, then the mean of those means.
    total_sum = 0.0
    for recommended in top_n.values():
        user_sum = 0.0
        for _iid, rating in recommended:
            user_sum += rating
        total_sum += float(user_sum / float(len(recommended)))

    # One pre-formatted string prints identically on Python 2 and Python 3.
    print("Average rating:  {}".format(total_sum / float(len(top_n))))
from collections import defaultdict

# Load the movie ratings
url = 'https://raw.githubusercontent.com/MutugiD/Data-Problems/master/Recommender/movie_ratings.csv'
data = pd.read_csv(url)
rating_dict = {
    'itemID': list(data.movieId),
    'userID': list(data.userId),
    'rating': list(data.rating)
}
df = pd.DataFrame(rating_dict)

# Only the rating scale matters when loading from a dataframe. The former
# line_format/sep reader was overwritten before use, and the dataset was
# loaded twice.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

benchmark = []
# Iterate over SVD, NMF, NormalPredictor, KNNBasic
for algo in [SVD(), NMF(), NormalPredictor(), KNNBasic()]:
    # Perform cross validation
    results = cross_validate(
        algo, data, measures=['RMSE'], cv=3, verbose=False)
    # Get results & append algorithm name. pd.concat is used because
    # Series.append was removed in pandas 2.0.
    temp = pd.DataFrame.from_dict(results).mean(axis=0)
    temp = pd.concat([
        temp,
        pd.Series([str(algo).split(' ')[0].split('.')[-1]],
                  index=['Algorithm']),
    ])
from surprise import KNNWithMeans
from surprise import Dataset, print_perf, Reader
from surprise.model_selection import cross_validate
import os

# Path of the ratings file.
file_path = os.path.expanduser('mydata.csv')
# Tell the reader how each line of the file is laid out.
reader = Reader(line_format='user item rating', sep=',')
# Load the data and train on all of it.
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()

# Use user_based true/false to switch between user-based or item-based
# collaborative filtering.  k: only the k most similar neighbours are taken
# into account when computing a prediction.
sim_options = {'user_based': False}
algo = KNNWithMeans(k=50, sim_options=sim_options)
algo.fit(trainset)

# Query predictions for specific (raw user id, raw item id) pairs.
for uid, iid in (('5', '1'), ('5', '5')):
    pred = algo.predict(uid, iid)
    print('rating of user-{0} to item-{1} is '.format(uid, iid), pred.est)
# * Precision at K: Proportion of recommended items that are relevant precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1 # * Recall at K: Proportion of relevant items that are recommended recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1 return precisions, recalls # * using reader to be able to deal with the imported CSV reader = Reader(line_format="user item rating timestamp", sep=",", rating_scale=(1, 5), skip_lines=1) # * loading the csv data = Dataset.load_from_file( file_path="../../ML_Dataset/ml-latest-small/ratings.csv", reader=reader) # * dividing in train and test sets trainset, testset = train_test_split(data, test_size=0.25) # * define a cross-validation iterator kf = KFold(n_splits=5) # * Choosing KNN With Means as algorithm algo = KNNWithMeans() # * Train the algorithm on the trainset, and predict ratings for the testset for trainset, testset in kf.split(data): predictions = algo.fit(trainset).test(testset) precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4) accuracy.rmse(predictions) accuracy.mae(predictions)
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

if __name__ == '__main__':
    # Columns handed to Surprise, in the order given to load_from_df.
    rating_columns = ['user_id', 'song_id', 'rating']

    # Read the ratings and wrap them in a Surprise dataset on a 1-5 scale.
    df = pd.read_csv("data_1m.csv")
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[rating_columns], reader)

    # 5-fold cross-validation of SVD, reporting RMSE verbosely.
    algo = SVD()
    cross_validate(
        algo,
        data,
        measures=['RMSE'],
        cv=5,
        verbose=True,
    )
# In[83]:

# Containers created here; nothing in this cell or the next fills them.
listOfRMSE = []
models = []


# In[84]:

from surprise import Reader, Dataset, SVD, SVDpp, evaluate, accuracy
from surprise.model_selection import train_test_split

reader = Reader(line_format='user item rating', sep=',', rating_scale=(0, 6))
# Work on a deep copy so the source frame is left untouched.
df_temp1 = df_final_user_repo_star_v3.copy(deep=True)
data = Dataset.load_from_df(df_temp1, reader)

# Test that surprise is working by running SVD on the dataset.
svd_params = {
    'n_factors': 100,
    'n_epochs': 20,
    'biased': True,
    'init_std_dev': 0.1,
    'lr_all': 0.005,
}
algo = SVD(**svd_params)

# Fit on every available rating, then predict for the anti-testset built
# from that same trainset.
trainset = data.build_full_trainset()
algo.fit(trainset)
testset = trainset.build_anti_testset()
svd_predictions = algo.test(testset)

# NOTE(review): the anti-testset presumably carries a fill value rather than
# observed ratings, so this RMSE may not reflect real accuracy -- confirm.
rmse_svd = accuracy.rmse(svd_predictions)
print(rmse_svd)
"""
Example: use the GridSearchCV() class to find the best parameter
combination of a given algorithm.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

# Use movielens-100K
data = Dataset.load_builtin('ml-100k')

# Candidate values for each SVD parameter; every combination is tried.
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Print the best RMSE score first, then the combination of parameters that
# produced it.
for report in (gs.best_score, gs.best_params):
    print(report['rmse'])

# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator['rmse']
full_trainset = data.build_full_trainset()
algo.fit(full_trainset)
import zipfile
from surprise import Reader, Dataset, SVD, evaluate

# Unzip ml-100k.zip.  The archive is bound to its own name so the ``zipfile``
# module is not shadowed, and the context manager closes it even if the
# extraction raises.
with zipfile.ZipFile('ml-100k.zip', 'r') as archive:
    archive.extractall()

# Read data into an array of strings
# (``all_lines`` is not used by the statements visible below).
with open('./ml-100k/u.data') as f:
    all_lines = f.readlines()

# Prepare the data to be used in Surprise
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file('./ml-100k/u.data', reader=reader)

# Split the dataset into 5 folds and choose the algorithm
# NOTE(review): ``data.split``, ``evaluate`` and ``algo.train`` belong to the
# older Surprise API; newer releases presumably expect ``cross_validate`` and
# ``fit`` instead -- confirm against the installed version.
data.split(n_folds=5)
algo = SVD()

# Train and test reporting the RMSE and MAE scores
evaluate(algo, data, measures=['RMSE', 'MAE'])

# Retrieve the trainset.
trainset = data.build_full_trainset()
algo.train(trainset)

# Predict a certain item
userid = str(196)
itemid = str(302)
actual_rating = 4