Example #1
def test_fsvd_save_load(tmp_path):
    tmp_path = lktu.norm_path(tmp_path)
    mod_file = tmp_path / 'funksvd.npz'

    ratings = lktu.ml_pandas.renamed.ratings

    original = svd.FunkSVD(20, iterations=20)
    original.fit(ratings)

    assert original.global_bias_ == approx(ratings.rating.mean())
    assert original.item_features_.shape == (ratings.item.nunique(), 20)
    assert original.user_features_.shape == (ratings.user.nunique(), 20)

    original.save(mod_file)
    assert mod_file.exists()

    algo = svd.FunkSVD(20, iterations=20)
    algo.load(mod_file)
    assert algo.global_bias_ == original.global_bias_
    assert np.all(algo.user_bias_ == original.user_bias_)
    assert np.all(algo.item_bias_ == original.item_bias_)
    assert np.all(algo.user_features_ == original.user_features_)
    assert np.all(algo.item_features_ == original.item_features_)
    assert np.all(algo.item_index_ == original.item_index_)
    assert np.all(algo.user_index_ == original.user_index_)
Example #2
def test_sweep_filenames(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    folds = []
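    # write each partition to CSV so the sweep reads its folds from files on disk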
    for part, (train, test) in enumerate(xf.partition_users(ratings, 2, xf.SampleN(5))):
        trfn = work / 'p{}-train.csv'.format(part)
        tefn = work / 'p{}-test.csv'.format(part)
        train.to_csv(trfn)
        test.to_csv(tefn)
        folds.append((trfn, tefn))

    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 2 partitions
    assert len(runs) == 8
Example #3
def test_bias_save(tmp_path):
    tmp_path = lktu.norm_path(tmp_path)

    original = bl.Bias(damping=5)
    original.fit(simple_df)
    assert original.mean_ == approx(3.5)
    fn = tmp_path / 'bias.dat'

    _log.info('saving to %s', fn)
    original.save(fn)

    algo = bl.Bias()
    algo.load(fn)
    assert algo.mean_ == original.mean_

    assert algo.item_offsets_ is not None
    assert algo.item_offsets_.index.name == 'item'
    assert set(algo.item_offsets_.index) == set([1, 2, 3])
    assert algo.item_offsets_.loc[1:3].values == approx(
        np.array([0, 0.25, -0.25]))

    assert algo.user_offsets_ is not None
    assert algo.user_offsets_.index.name == 'user'
    assert set(algo.user_offsets_.index) == set([10, 12, 13])
    assert algo.user_offsets_.loc[[10, 12, 13]].values == \
        approx(np.array([0.25, -0.08333, -0.20833]), abs=1.0e-4)
Example #4
def test_sweep_oneshot(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)
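    # combine=False leaves each run's outputs separate instead of merging them
    # into aggregate runs/predictions/recommendations files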

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')
    sweep.add_algorithms(Bias(damping=5))

    try:
        sweep.run(3)
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert not (work / 'runs.csv').exists()
    assert not (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    assert (work / 'run-3.json').exists()
    assert (work / 'predictions-3.parquet').exists()
    assert (work / 'recommendations-3.parquet').exists()

    with (work / 'run-3.json').open() as f:
        run = json.load(f)
    assert run['RunId'] == 3
Example #5
def test_sweep_save(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')
    sweep.add_algorithms(Bias(damping=5))

    sweep.persist_data()
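    # with the data persisted, the sweep can be pickled, reloaded, and run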
    pf = work / 'sweep.dat'
    with pf.open('wb') as f:
        pickle.dump(sweep, f)

    with pf.open('rb') as f:
        sweep = pickle.load(f)

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 1 algorithm by 5 partitions
    assert len(runs) == 5
Example #6
def test_sweep_norecs(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, recommend=None)
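    # recommend=None turns off recommendation generation; only predictions are written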

    ratings = ml_pandas.renamed.ratings
    folds = xf.partition_users(ratings, 5, xf.SampleN(5))
    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 5 partitions
    assert len(runs) == 20
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])
    bias_runs = runs[runs.AlgoClass == 'Bias']
    assert all(bias_runs.damping.notna())
    pop_runs = runs[runs.AlgoClass == 'Popular']
    assert all(pop_runs.damping.isna())

    preds = pd.read_parquet(work / 'predictions.parquet')
    assert all(preds.RunId.isin(bias_runs.RunId))
Example #7
def test_csr_save_load(tmp_path, prefix, values):
    tmp_path = lktu.norm_path(tmp_path)
    coords = np.random.choice(np.arange(50 * 100, dtype=np.int32), 1000, False)
    rows = np.mod(coords, 100, dtype=np.int32)
    cols = np.floor_divide(coords, 100, dtype=np.int32)
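    # 1000 distinct linear indices decoded into (row, col) pairs of a 100x50 matrix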
    if values:
        vals = np.random.randn(1000)
    else:
        vals = None

    csr = lm.csr_from_coo(rows, cols, vals, (100, 50))
    assert csr.nrows == 100
    assert csr.ncols == 50
    assert csr.nnz == 1000

    data = lm.csr_save(csr, prefix=prefix)

    np.savez_compressed(tmp_path / 'matrix.npz', **data)

    with np.load(tmp_path / 'matrix.npz') as npz:
        csr2 = lm.csr_load(npz, prefix=prefix)

    assert csr2.nrows == csr.nrows
    assert csr2.ncols == csr.ncols
    assert csr2.nnz == csr.nnz
    assert all(csr2.rowptrs == csr.rowptrs)
    assert all(csr2.colinds == csr.colinds)
    if values:
        assert all(csr2.values == csr.values)
    else:
        assert csr2.values is None
Example #8
def test_ii_save_load(tmp_path):
    "Save and load a model"
    tmp_path = lktu.norm_path(tmp_path)
    original = knn.ItemItem(30, save_nbrs=500)
    _log.info('building model')
    original.fit(lktu.ml_sample())

    fn = tmp_path / 'ii.mod'
    _log.info('saving model to %s', fn)
    original.save(fn)
    _log.info('reloading model')

    algo = knn.ItemItem(30)
    algo.load(fn)
    _log.info('checking model')

    assert all(np.logical_not(np.isnan(algo.sim_matrix_.values)))
    assert all(algo.sim_matrix_.values > 0)
    # a little tolerance
    assert all(algo.sim_matrix_.values < 1 + 1.0e-6)

    assert all(algo.item_counts_ == original.item_counts_)
    assert algo.item_counts_.sum() == algo.sim_matrix_.nnz
    assert algo.sim_matrix_.nnz == original.sim_matrix_.nnz
    assert all(algo.sim_matrix_.rowptrs == original.sim_matrix_.rowptrs)
    assert algo.sim_matrix_.values == approx(original.sim_matrix_.values)

    r_mat = algo.sim_matrix_
    o_mat = original.sim_matrix_
    assert all(r_mat.rowptrs == o_mat.rowptrs)

    for i in range(len(algo.item_index_)):
        sp = r_mat.rowptrs[i]
        ep = r_mat.rowptrs[i + 1]

        # everything is in decreasing order
        assert all(np.diff(r_mat.values[sp:ep]) <= 0)
        assert all(r_mat.values[sp:ep] == o_mat.values[sp:ep])

    means = ml_ratings.groupby('item').rating.mean()
    assert means[algo.item_index_].values == approx(original.item_means_)

    matrix = lm.csr_to_scipy(algo.sim_matrix_)

    items = pd.Series(algo.item_index_)
    items = items[algo.item_counts_ > 0]
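    # spot-check 50 random items that have at least one stored neighbor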
    for i in items.sample(50):
        ipos = algo.item_index_.get_loc(i)
        _log.debug('checking item %d at position %d', i, ipos)

        row = matrix.getrow(ipos)

        # it should be sorted in decreasing order
        # check this by diffing the row values and making sure the diffs are
        # non-positive (within a little tolerance)
        assert all(np.diff(row.data) < 1.0e-6)
Example #9
def test_sweep_combine(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    sweep.persist_data()
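    # persisting materializes each partition as ds<N>-train/test.parquet files
    # and records the file paths in place of the in-memory frames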

    for i in range(1, 6):
        assert (work / 'ds{}-train.parquet'.format(i)).exists()
        assert (work / 'ds{}-test.parquet'.format(i)).exists()

    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    assert sweep.run_count() == 5 * 3

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert not (work / 'runs.csv').exists()
    assert not (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    for i, (ds, a) in enumerate(sweep._flat_runs()):
        run = i + 1
        assert (work / 'run-{}.json'.format(run)).exists()
        if isinstance(a.algorithm, Predictor):
            assert (work / 'predictions-{}.parquet'.format(run)).exists()
        assert (work / 'recommendations-{}.parquet'.format(run)).exists()

    sweep.collect_results()

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    assert len(runs) == 5 * 3
Example #10
def test_als_save_load(tmp_path):
    tmp_path = lktu.norm_path(tmp_path)
    mod_file = tmp_path / 'als.npz'
    algo = als.ImplicitMF(20, iterations=5)
    ratings = lktu.ml_pandas.renamed.ratings
    algo.fit(ratings)

    algo.save(mod_file)
    assert mod_file.exists()

    restored = als.ImplicitMF(20)
    restored.load(mod_file)
    assert np.all(restored.user_features_ == algo.user_features_)
    assert np.all(restored.item_features_ == algo.item_features_)
    assert np.all(restored.item_index_ == algo.item_index_)
    assert np.all(restored.user_index_ == algo.user_index_)
Example #11
def test_fallback_save_load(tmp_path):
    tmp_path = lktu.norm_path(tmp_path)

    original = basic.Fallback(basic.Memorized(simple_df), basic.Bias())
    original.fit(lktu.ml_pandas.renamed.ratings)

    fn = tmp_path / 'fallback'
    original.save(fn)

    algo = basic.Fallback(basic.Memorized(simple_df), basic.Bias())
    algo.load(fn)

    bias = algo.algorithms[1]
    assert bias.mean_ == approx(lktu.ml_pandas.ratings.rating.mean())

    def exp_val(user, item):
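        # expected prediction: global mean plus any known user and item offsets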
        v = bias.mean_
        if user is not None:
            v += bias.user_offsets_.loc[user]
        if item is not None:
            v += bias.item_offsets_.loc[item]
        return v

    # first user + item
    preds = algo.predict_for_user(10, [1])
    assert preds.loc[1] == 4.0
    # second user + first item
    preds = algo.predict_for_user(15, [1])
    assert preds.loc[1] == approx(exp_val(15, 1))

    # second user + second item
    preds = algo.predict_for_user(12, [2])
    assert preds.loc[2] == approx(exp_val(12, 2))

    # blended
    preds = algo.predict_for_user(10, [1, 5])
    assert preds.loc[1] == 4.0
    assert preds.loc[5] == approx(exp_val(10, 5))

    # blended unknown
    preds = algo.predict_for_user(10, [5, 1, -23081])
    assert len(preds) == 3
    assert preds.loc[1] == 4.0
    assert preds.loc[5] == approx(exp_val(10, 5))
    assert preds.loc[-23081] == approx(exp_val(10, None))
Example #12
def test_ii_train_ml100k(tmp_path):
    "Test an unbounded model on ML-100K"
    tmp_path = lktu.norm_path(tmp_path)

    ratings = lktu.ml100k.load_ratings()
    algo = knn.ItemItem(30)
    _log.info('training model')
    algo.fit(ratings)

    _log.info('testing model')

    assert all(np.logical_not(np.isnan(algo.sim_matrix_.values)))
    assert all(algo.sim_matrix_.values > 0)

    # a little tolerance
    assert all(algo.sim_matrix_.values < 1 + 1.0e-6)

    assert algo.item_counts_.sum() == algo.sim_matrix_.nnz

    means = ratings.groupby('item').rating.mean()
    assert means[algo.item_index_].values == approx(algo.item_means_)

    # save
    fn = tmp_path / 'ii.mod'
    _log.info('saving model to %s', fn)
    algo.save(fn)
    _log.info('reloading model')
    restored = knn.ItemItem(30)
    restored.load(fn)
    assert all(restored.sim_matrix_.values > 0)

    r_mat = restored.sim_matrix_
    o_mat = algo.sim_matrix_

    assert all(r_mat.rowptrs == o_mat.rowptrs)

    for i in range(len(restored.item_index_)):
        sp = r_mat.rowptrs[i]
        ep = r_mat.rowptrs[i + 1]

        # everything is in decreasing order
        assert all(np.diff(r_mat.values[sp:ep]) <= 0)
        assert all(r_mat.values[sp:ep] == o_mat.values[sp:ep])
Example #13
def test_fallback_save_load(tmp_path):
    tmp_path = lktu.norm_path(tmp_path)

    original = basic.Fallback(basic.Memorized(simple_df), basic.Bias())
    original.fit(lktu.ml_pandas.renamed.ratings)

    fn = tmp_path / 'fallback'
    original.save(fn)

    algo = basic.Fallback(basic.Memorized(simple_df), basic.Bias())
    algo.load(fn)

    bias = algo.algorithms[1]
    assert bias.mean_ == approx(lktu.ml_pandas.ratings.rating.mean())

    # first user + item
    preds = algo.predict_for_user(10, [1])
    assert preds.loc[1] == 4.0
    # second user + first item
    preds = algo.predict_for_user(15, [1])
    assert preds.loc[1] == approx(bias.mean_ + bias.user_offsets_.loc[15] +
                                  bias.item_offsets_.loc[1])

    # second user + second item
    preds = algo.predict_for_user(12, [2])
    assert preds.loc[2] == approx(bias.mean_ + bias.user_offsets_.loc[12] +
                                  bias.item_offsets_.loc[2])

    # blended
    preds = algo.predict_for_user(10, [1, 5])
    assert preds.loc[1] == 4.0
    assert preds.loc[5] == approx(bias.mean_ + bias.user_offsets_.loc[10] +
                                  bias.item_offsets_.loc[5])

    # blended unknown
    preds = algo.predict_for_user(10, [5, 1, -23081])
    assert len(preds) == 3
    assert preds.loc[1] == 4.0
    assert preds.loc[5] == approx(bias.mean_ + bias.user_offsets_.loc[10] +
                                  bias.item_offsets_.loc[5])
    assert preds.loc[-23081] == approx(bias.mean_ + bias.user_offsets_.loc[10])
Example #14
def test_sweep_persist(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')
    sweep.persist_data()

    for i in range(1, 6):
        assert (work / 'ds{}-train.parquet'.format(i)).exists()
        assert (work / 'ds{}-test.parquet'.format(i)).exists()

    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 5 partitions
    assert len(runs) == 20
Example #15
def test_als_save_load(tmp_path):
    tmp_path = lktu.norm_path(tmp_path)
    mod_file = tmp_path / 'als.npz'
    original = als.BiasedMF(20, iterations=5)
    ratings = lktu.ml_pandas.renamed.ratings
    original.fit(ratings)

    assert original.global_bias_ == approx(ratings.rating.mean())

    original.save(mod_file)
    assert mod_file.exists()

    algo = als.BiasedMF(20)
    algo.load(mod_file)
    assert algo.global_bias_ == original.global_bias_
    assert np.all(algo.user_bias_ == original.user_bias_)
    assert np.all(algo.item_bias_ == original.item_bias_)
    assert np.all(algo.user_features_ == original.user_features_)
    assert np.all(algo.item_features_ == original.item_features_)
    assert np.all(algo.item_index_ == original.item_index_)
    assert np.all(algo.user_index_ == original.user_index_)
Example #16
def test_pop_save_load(tmp_path):
    tmp_path = lktu.norm_path(tmp_path)
    original = basic.Popular()
    original.fit(lktu.ml_pandas.renamed.ratings)

    fn = tmp_path / 'pop.mod'
    original.save(fn)

    algo = basic.Popular()
    algo.load(fn)

    counts = lktu.ml_pandas.renamed.ratings.groupby('item').user.count()
    counts = counts.nlargest(100)
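    # the model's popularity scores should line up with raw per-item rating counts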

    assert algo.item_pop_.max() == counts.max()

    recs = algo.recommend(2038, 100)
    assert len(recs) == 100
    assert all(np.diff(recs.score) <= 0)

    assert recs.score.iloc[0] == counts.max()
    # the 10 most popular should be the same
    assert all(counts.index[:10] == recs.item[:10])
Example #17
def test_uu_save_load(tmp_path):
    tmp_path = lktu.norm_path(tmp_path)

    orig = knn.UserUser(30)
    _log.info('training model')
    orig.fit(ml_ratings)

    fn = tmp_path / 'uu.model'
    _log.info('saving to %s', fn)
    orig.save(fn)

    _log.info('reloading model')
    algo = knn.UserUser(30)
    algo.load(fn)
    _log.info('checking model')

    # it should have computed correct means
    umeans = ml_ratings.groupby('user').rating.mean()
    mlmeans = pd.Series(algo.user_means_, index=algo.user_index_, name='mean')
    umeans, mlmeans = umeans.align(mlmeans)
    assert mlmeans.values == approx(umeans.values)

    # we should be able to reconstruct rating values
    uir = ml_ratings.set_index(['user', 'item']).rating
    r_items = matrix.csr_rowinds(algo.transpose_matrix_)
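    # rebuild (user, item, centered rating) triples from the transposed rating matrix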
    ui_rbdf = pd.DataFrame({
        'user': algo.user_index_[algo.transpose_matrix_.colinds],
        'item': algo.item_index_[r_items],
        'nrating': algo.transpose_matrix_.values
    }).set_index(['user', 'item'])
    ui_rbdf = ui_rbdf.join(mlmeans)
    ui_rbdf['rating'] = ui_rbdf['nrating'] + ui_rbdf['mean']
    ui_rbdf['orig_rating'] = uir
    assert ui_rbdf.rating.values == approx(ui_rbdf.orig_rating.values)
Example #18
def test_uu_save_load_implicit(tmp_path):
    "Save and load user-user on an implicit data set."
    tmp_path = lktu.norm_path(tmp_path)
    orig = knn.UserUser(20, center=False, aggregate='sum')
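    # implicit-feedback setup: no mean-centering, similarities aggregated by summing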
    data = ml_ratings.loc[:, ['user', 'item']]

    orig.fit(data)
    orig.save(tmp_path / 'uu.mod')

    algo = knn.UserUser(20, center=False, aggregate='sum')
    algo.load(tmp_path / 'uu.mod')
    assert algo.user_means_ is None
    assert all(algo.user_index_ == orig.user_index_)
    assert all(algo.item_index_ == orig.item_index_)

    assert all(algo.rating_matrix_.rowptrs == orig.rating_matrix_.rowptrs)
    assert all(algo.rating_matrix_.colinds == orig.rating_matrix_.colinds)
    assert all(algo.rating_matrix_.values == orig.rating_matrix_.values)

    assert all(algo.transpose_matrix_.rowptrs == orig.transpose_matrix_.rowptrs)
    assert all(algo.transpose_matrix_.colinds == orig.transpose_matrix_.colinds)
    assert algo.transpose_matrix_.values is None