示例#1
0
def test_rmse():
    """Exercise rmse() on exact, partly wrong and empty prediction lists."""

    # Both arguments of every pred() agree, so the error must be zero.
    exact = [pred(value, value) for value in (0, 1, 2, 100)]
    assert rmse(exact) == 0

    # One matching pair plus one pair that is off by 2.
    one_miss = [pred(0, 0), pred(0, 2)]
    assert rmse(one_miss) == sqrt((0 - 2)**2 / 2)

    # Both pairs are off; the squared differences are averaged over the
    # two items before the square root is taken.
    two_misses = [pred(2, 0), pred(3, 4)]
    squared_error_sum = (2 - 0)**2 + (3 - 4)**2
    assert rmse(two_misses) == sqrt(squared_error_sum / 2)

    # An empty list of predictions must be rejected with ValueError.
    with pytest.raises(ValueError):
        rmse([])
示例#2
0
def test_sanity_checks(u1_ml100k, pkf):
    """
    Regression guard for every algorithm: the RMSE measured on the first
    split of u1_ml100k must equal the value recorded below.
    """

    # These algorithms are constructed with random_state=0; all the others
    # are built with their default arguments.
    seeded = (SVD, SVDpp, NMF, CoClustering)

    reference_rmse = {
        BaselineOnly: 1.0268524031297395,
        KNNBasic: 1.1337265249554591,
        KNNWithMeans: 1.1043129441881696,
        KNNBaseline: 1.0700718041752253,
        KNNWithZScore: 1.11179436167853,
        SVD: 1.0077323320656948,
        SVDpp: 1.00284553561452,
        NMF: 1.0865370266372372,
        SlopeOne: 1.1559939123891685,
        CoClustering: 1.0841941385276614,
    }

    for algo_cls, reference in iteritems(reference_rmse):
        algo = algo_cls(random_state=0) if algo_cls in seeded else algo_cls()
        # Only the first (trainset, testset) pair of the split is used.
        trainset, testset = next(pkf.split(u1_ml100k))
        algo.fit(trainset)
        measured = accuracy.rmse(algo.test(testset), verbose=False)
        assert measured == reference
示例#3
0
# Hold-out split by position: A is the first 90% of raw_ratings and B is the
# remaining 10%.
# NOTE(review): the split is only random if raw_ratings was shuffled before
# this point -- confirm against the code above.
threshold = int(.9 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

data.raw_ratings = A_raw_ratings  # data is now the set A

# Select your best algo with grid search: SVD is searched over the n_epochs
# and lr_all values listed in param_grid, measured by RMSE with cv=3.
print('Grid Search...')
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
grid_search.fit(data)

# Keep the estimator that grid_search exposes as best for the 'rmse' measure.
algo = grid_search.best_estimator['rmse']

# Retrain on the whole set A (data only holds A at this point).
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on A: the testset is rebuilt from the very
# trainset the algorithm was just fitted on.
# NOTE(review): the return value of accuracy.rmse is discarded, so the call
# is presumably relied on to print the score -- confirm its default verbosity.
predictions = algo.test(trainset.build_testset())
print('Biased accuracy on A,', end='   ')
accuracy.rmse(predictions)

# Compute unbiased accuracy on B, whose ratings were removed from data above
# and therefore took no part in the grid search or the final fit.
testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
predictions = algo.test(testset)
print('Unbiased accuracy on B,', end=' ')
accuracy.rmse(predictions)
示例#4
0
# os and SVD are both used below; they are imported here so that this
# example runs on its own instead of failing with a NameError.
import os

from amaze import Dataset
from amaze import Reader
from amaze import SVD
from amaze import accuracy
from amaze.model_selection import PredefinedKFold

# Path to the dataset folder ('~' is expanded to the user's home directory).
files_dir = os.path.expanduser('~/.amaze_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
# The templates keep a %d placeholder that is filled with the fold number.
train_file = os.path.join(files_dir, 'u%d.base')
test_file = os.path.join(files_dir, 'u%d.test')
folds_files = [(train_file % i, test_file % i) for i in range(1, 6)]

data = Dataset.load_from_folds(folds_files, reader=reader, rating_scale=(1, 5))
pkf = PredefinedKFold()

algo = SVD()

# The same algo object is refitted on every predefined (train, test) pair.
for trainset, testset in pkf.split(data):

    # Train and test the algorithm on this fold.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
示例#5
0
from amaze import Dataset
from amaze import SVD
from amaze import accuracy
from amaze.model_selection import KFold

# Load the built-in 'ml-100k' dataset.
data = Dataset.load_builtin('ml-100k')

algo = SVD()

# Fit on a trainset built from the full dataset ...
trainset = data.build_full_trainset()
algo.fit(trainset)

# ... then evaluate on a testset derived from that same trainset, i.e. on
# ratings the algorithm has already been fitted on.
testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

# We can also do this during a cross-validation procedure!
print('CV procedure:')

# For each of the 3 splits the same algo object is refitted, then used to
# predict on the split's testset and on a testset rebuilt from the split's
# own trainset.
kf = KFold(n_splits=3)
for i, (trainset_cv, testset_cv) in enumerate(kf.split(data)):
    print('fold number', i + 1)  # i is 0-based; folds are displayed from 1
    algo.fit(trainset_cv)

    print('On testset,', end='  ')
    predictions = algo.test(testset_cv)
    accuracy.rmse(predictions, verbose=True)

    print('On trainset,', end=' ')
    # Predict on the ratings this fold's trainset was built from.
    predictions = algo.test(trainset_cv.build_testset())