def test_rmse():
    """Tests for the RMSE function.

    Covers three hand-computed cases and checks that an empty list of
    predictions raises ``ValueError``.

    Floating-point results are compared with ``pytest.approx`` rather than
    exact ``==`` so the test does not depend on the precise order of
    floating-point operations inside ``rmse``.
    """
    # Every pair passed to pred() holds two equal values, so the expected
    # error is zero.
    # NOTE(review): pred's arguments are presumably (true rating, estimate)
    # -- confirm against the helper's definition.
    predictions = [pred(0, 0), pred(1, 1), pred(2, 2), pred(100, 100)]
    assert rmse(predictions) == pytest.approx(0)

    predictions = [pred(0, 0), pred(0, 2)]
    assert rmse(predictions) == pytest.approx(sqrt((0 - 2)**2 / 2))

    predictions = [pred(2, 0), pred(3, 4)]
    expected = sqrt(((2 - 0)**2 + (3 - 4)**2) / 2)
    assert rmse(predictions) == pytest.approx(expected)

    # An empty prediction list is rejected.
    with pytest.raises(ValueError):
        rmse([])
def test_sanity_checks(u1_ml100k, pkf):
    """
    Basic sanity checks for all algorithms: check that RMSE stays the same.

    Each algorithm is fitted on the first (trainset, testset) pair yielded by
    ``pkf.split(u1_ml100k)`` and its RMSE is compared, exactly, with a pinned
    reference value.
    """
    # Pinned reference RMSE per algorithm class.
    expected_rmse = {
        BaselineOnly: 1.0268524031297395,
        KNNBasic: 1.1337265249554591,
        KNNWithMeans: 1.1043129441881696,
        KNNBaseline: 1.0700718041752253,
        KNNWithZScore: 1.11179436167853,
        SVD: 1.0077323320656948,
        SVDpp: 1.00284553561452,
        NMF: 1.0865370266372372,
        SlopeOne: 1.1559939123891685,
        CoClustering: 1.0841941385276614,
    }
    # These classes are constructed with a fixed random_state; all others are
    # constructed with their default arguments.
    seeded_classes = (SVD, SVDpp, NMF, CoClustering)

    for algo_class, reference in iteritems(expected_rmse):
        init_kwargs = {'random_state': 0} if algo_class in seeded_classes else {}
        algo = algo_class(**init_kwargs)

        trainset, testset = next(pkf.split(u1_ml100k))
        algo.fit(trainset)
        predictions = algo.test(testset)

        assert accuracy.rmse(predictions, verbose=False) == reference
# Split the raw ratings: A = first 90% of the data, B = remaining 10%.
cutoff = int(.9 * len(raw_ratings))
A_raw_ratings, B_raw_ratings = raw_ratings[:cutoff], raw_ratings[cutoff:]

data.raw_ratings = A_raw_ratings  # data is now the set A

# Select the best algo with a grid search run on set A.
print('Grid Search...')
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
grid_search.fit(data)

# Take the best estimator for RMSE and retrain it on the whole set A.
algo = grid_search.best_estimator['rmse']
trainset = data.build_full_trainset()
algo.fit(trainset)

# Biased accuracy on A: the testset is built from the trainset itself.
biased_testset = trainset.build_testset()
predictions = algo.test(biased_testset)
print('Biased accuracy on A,', end=' ')
accuracy.rmse(predictions)

# Unbiased accuracy on B: the ratings that were removed from data above.
testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
predictions = algo.test(testset)
print('Unbiased accuracy on B,', end=' ')
accuracy.rmse(predictions)
# Example: evaluate SVD on the predefined ml-100k folds (u1..u5).
#
# `os` and `SVD` are used below, so they are imported here explicitly; the
# script raises NameError without them.
import os

from amaze import Dataset
from amaze import Reader
from amaze import SVD
from amaze import accuracy
from amaze.model_selection import PredefinedKFold

# path to dataset folder
files_dir = os.path.expanduser('~/.amaze_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in range(1, 6)]

data = Dataset.load_from_folds(folds_files, reader=reader, rating_scale=(1, 5))
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):
    # Train and test the algorithm on this fold.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
from amaze import Dataset, SVD, accuracy
from amaze.model_selection import KFold

# Fit SVD on the full built-in dataset, then score it on a testset built
# from that very same trainset.
data = Dataset.load_builtin('ml-100k')
algo = SVD()

trainset = data.build_full_trainset()
algo.fit(trainset)

testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

# We can also do this during a cross-validation procedure!
print('CV procedure:')

kf = KFold(n_splits=3)
for fold_no, (trainset_cv, testset_cv) in enumerate(kf.split(data), start=1):
    print('fold number', fold_no)
    algo.fit(trainset_cv)

    print('On testset,', end=' ')
    predictions = algo.test(testset_cv)
    accuracy.rmse(predictions, verbose=True)

    print('On trainset,', end=' ')
    predictions = algo.test(trainset_cv.build_testset())
    # NOTE(review): the visible excerpt ends here -- these trainset
    # predictions are not scored within the code shown.