def test_fsvd_save_load(tmp_path):
    tmp_path = lktu.norm_path(tmp_path)
    mod_file = tmp_path / 'funksvd.npz'
    ratings = lktu.ml_pandas.renamed.ratings

    original = svd.FunkSVD(20, iterations=20)
    original.fit(ratings)

    assert original.global_bias_ == approx(ratings.rating.mean())
    assert original.item_features_.shape == (ratings.item.nunique(), 20)
    assert original.user_features_.shape == (ratings.user.nunique(), 20)

    original.save(mod_file)
    assert mod_file.exists()

    algo = svd.FunkSVD(20, iterations=20)
    algo.load(mod_file)

    assert algo.global_bias_ == original.global_bias_
    assert np.all(algo.user_bias_ == original.user_bias_)
    assert np.all(algo.item_bias_ == original.item_bias_)
    assert np.all(algo.user_features_ == original.user_features_)
    assert np.all(algo.item_features_ == original.item_features_)
    assert np.all(algo.item_index_ == original.item_index_)
    assert np.all(algo.user_index_ == original.user_index_)

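# The save/load tests in this file all follow the same round-trip pattern;
# a minimal sketch (illustrative only — `SomeAlgo` and `path` are
# placeholders, not LensKit names):
#
#     original = SomeAlgo(...)
#     original.fit(ratings)
#     original.save(path)
#     restored = SomeAlgo(...)   # same hyperparameters
#     restored.load(path)
#     # then compare the learned (trailing-underscore) attributes
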
def test_sweep_filenames(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    folds = []
    for part, (train, test) in enumerate(xf.partition_users(ratings, 2, xf.SampleN(5))):
        trfn = work / 'p{}-train.csv'.format(part)
        tefn = work / 'p{}-test.csv'.format(part)
        train.to_csv(trfn)
        test.to_csv(tefn)
        folds.append((trfn, tefn))
    sweep.add_datasets(folds, DataSet='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 2 partitions
    assert len(runs) == 8

def test_bias_save(tmp_path):
    tmp_path = lktu.norm_path(tmp_path)
    original = bl.Bias(damping=5)
    original.fit(simple_df)
    assert original.mean_ == approx(3.5)

    fn = tmp_path / 'bias.dat'
    _log.info('saving to %s', fn)
    original.save(fn)

    algo = bl.Bias()
    algo.load(fn)
    assert algo.mean_ == original.mean_

    assert algo.item_offsets_ is not None
    assert algo.item_offsets_.index.name == 'item'
    assert set(algo.item_offsets_.index) == set([1, 2, 3])
    assert algo.item_offsets_.loc[1:3].values == approx(np.array([0, 0.25, -0.25]))

    assert algo.user_offsets_ is not None
    assert algo.user_offsets_.index.name == 'user'
    assert set(algo.user_offsets_.index) == set([10, 12, 13])
    assert algo.user_offsets_.loc[[10, 12, 13]].values == \
        approx(np.array([0.25, -0.08333, -0.20833]), abs=1.0e-4)

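# The damped Bias model predicts mean_ + item_offsets_[i] + user_offsets_[u];
# a usage sketch (not a test; assumes the `simple_df` fixture used above):
#
#     b = bl.Bias(damping=5)
#     b.fit(simple_df)
#     preds = b.predict_for_user(10, [1, 2, 3])
#     # preds.loc[i] == b.mean_ + b.item_offsets_.loc[i] + b.user_offsets_.loc[10]
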
def test_sweep_oneshot(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')
    sweep.add_algorithms(Bias(damping=5))

    try:
        sweep.run(3)
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert not (work / 'runs.csv').exists()
    assert not (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    assert (work / 'run-3.json').exists()
    assert (work / 'predictions-3.parquet').exists()
    assert (work / 'recommendations-3.parquet').exists()

    with (work / 'run-3.json').open() as f:
        run = json.load(f)
    assert run['RunId'] == 3

def test_sweep_save(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')
    sweep.add_algorithms(Bias(damping=5))
    sweep.persist_data()

    pf = work / 'sweep.dat'
    with pf.open('wb') as f:
        pickle.dump(sweep, f)
    with pf.open('rb') as f:
        sweep = pickle.load(f)

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 1 algorithm by 5 partitions
    assert len(runs) == 5

def test_sweep_norecs(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, recommend=None)

    ratings = ml_pandas.renamed.ratings
    folds = xf.partition_users(ratings, 5, xf.SampleN(5))
    sweep.add_datasets(folds, DataSet='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 5 partitions
    assert len(runs) == 20
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])

    bias_runs = runs[runs.AlgoClass == 'Bias']
    assert all(bias_runs.damping.notna())
    pop_runs = runs[runs.AlgoClass == 'Popular']
    assert all(pop_runs.damping.isna())

    preds = pd.read_parquet(work / 'predictions.parquet')
    assert all(preds.RunId.isin(bias_runs.RunId))

# `prefix` and `values` presumably come from pytest parametrization; the
# decorators below are a reasonable reconstruction, not verbatim source.
@mark.parametrize('prefix', ['', 'p_'])
@mark.parametrize('values', [True, False])
def test_csr_save_load(tmp_path, prefix, values):
    tmp_path = lktu.norm_path(tmp_path)
    coords = np.random.choice(np.arange(50 * 100, dtype=np.int32), 1000, False)
    rows = np.mod(coords, 100, dtype=np.int32)
    cols = np.floor_divide(coords, 100, dtype=np.int32)
    if values:
        vals = np.random.randn(1000)
    else:
        vals = None

    csr = lm.csr_from_coo(rows, cols, vals, (100, 50))
    assert csr.nrows == 100
    assert csr.ncols == 50
    assert csr.nnz == 1000

    data = lm.csr_save(csr, prefix=prefix)
    np.savez_compressed(tmp_path / 'matrix.npz', **data)

    with np.load(tmp_path / 'matrix.npz') as npz:
        csr2 = lm.csr_load(npz, prefix=prefix)

    assert csr2.nrows == csr.nrows
    assert csr2.ncols == csr.ncols
    assert csr2.nnz == csr.nnz
    assert all(csr2.rowptrs == csr.rowptrs)
    assert all(csr2.colinds == csr.colinds)
    if values:
        assert all(csr2.values == csr.values)
    else:
        assert csr2.values is None

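# The same persistence pattern outside a test: `csr_save` returns a dict of
# arrays suitable for np.savez, and `csr_load` rebuilds the CSR from the
# loaded archive. A sketch (file name and prefix are placeholders):
#
#     data = lm.csr_save(csr, prefix='sim_')
#     np.savez_compressed('matrix.npz', **data)
#     with np.load('matrix.npz') as npz:
#         csr = lm.csr_load(npz, prefix='sim_')
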
def test_ii_save_load(tmp_path):
    "Save and load a model"
    tmp_path = lktu.norm_path(tmp_path)
    original = knn.ItemItem(30, save_nbrs=500)
    _log.info('building model')
    original.fit(lktu.ml_sample())

    fn = tmp_path / 'ii.mod'
    _log.info('saving model to %s', fn)
    original.save(fn)

    _log.info('reloading model')
    algo = knn.ItemItem(30)
    algo.load(fn)

    _log.info('checking model')
    assert all(np.logical_not(np.isnan(algo.sim_matrix_.values)))
    assert all(algo.sim_matrix_.values > 0)
    # a little tolerance
    assert all(algo.sim_matrix_.values < 1 + 1.0e-6)

    assert all(algo.item_counts_ == original.item_counts_)
    assert algo.item_counts_.sum() == algo.sim_matrix_.nnz
    assert algo.sim_matrix_.nnz == original.sim_matrix_.nnz
    assert all(algo.sim_matrix_.rowptrs == original.sim_matrix_.rowptrs)
    assert algo.sim_matrix_.values == approx(original.sim_matrix_.values)

    r_mat = algo.sim_matrix_
    o_mat = original.sim_matrix_
    assert all(r_mat.rowptrs == o_mat.rowptrs)

    for i in range(len(algo.item_index_)):
        sp = r_mat.rowptrs[i]
        ep = r_mat.rowptrs[i + 1]
        # everything is in decreasing order
        assert all(np.diff(r_mat.values[sp:ep]) <= 0)
        assert all(r_mat.values[sp:ep] == o_mat.values[sp:ep])

    means = ml_ratings.groupby('item').rating.mean()
    assert means[algo.item_index_].values == approx(original.item_means_)

    matrix = lm.csr_to_scipy(algo.sim_matrix_)

    items = pd.Series(algo.item_index_)
    items = items[algo.item_counts_ > 0]
    for i in items.sample(50):
        ipos = algo.item_index_.get_loc(i)
        _log.debug('checking item %d at position %d', i, ipos)
        row = matrix.getrow(ipos)

        # it should be sorted! check by diffing the row values, and make
        # sure they are non-increasing (within a small tolerance)
        assert all(np.diff(row.data) < 1.0e-6)

def test_sweep_combine(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5)], attrs=['damping'])
    sweep.add_algorithms(Popular())

    sweep.persist_data()
    for i in range(1, 6):
        assert (work / 'ds{}-train.parquet'.format(i)).exists()
        assert (work / 'ds{}-test.parquet'.format(i)).exists()
    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    assert sweep.run_count() == 5 * 3

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert not (work / 'runs.csv').exists()
    assert not (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    for i, (ds, a) in enumerate(sweep._flat_runs()):
        run = i + 1
        assert (work / 'run-{}.json'.format(run)).exists()
        if isinstance(a.algorithm, Predictor):
            assert (work / 'predictions-{}.parquet'.format(run)).exists()
        assert (work / 'recommendations-{}.parquet'.format(run)).exists()

    sweep.collect_results()

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    assert len(runs) == 5 * 3

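# With combine=False, MultiEval writes per-run outputs only; the intended
# workflow, sketched from the calls exercised above:
#
#     sweep = batch.MultiEval(out_dir, combine=False)
#     sweep.add_datasets(...)
#     sweep.add_algorithms(...)
#     sweep.persist_data()      # materialize folds as parquet files
#     sweep.run()               # writes run-N.json and per-run parquet files
#     sweep.collect_results()   # combines them into runs/predictions/recommendations
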
def test_als_save_load(tmp_path):
    tmp_path = lktu.norm_path(tmp_path)
    mod_file = tmp_path / 'als.npz'
    algo = als.ImplicitMF(20, iterations=5)
    ratings = lktu.ml_pandas.renamed.ratings
    algo.fit(ratings)

    algo.save(mod_file)
    assert mod_file.exists()

    restored = als.ImplicitMF(20)
    restored.load(mod_file)
    assert np.all(restored.user_features_ == algo.user_features_)
    assert np.all(restored.item_features_ == algo.item_features_)
    assert np.all(restored.item_index_ == algo.item_index_)
    assert np.all(restored.user_index_ == algo.user_index_)

def test_fallback_save_load(tmp_path):
    tmp_path = lktu.norm_path(tmp_path)
    original = basic.Fallback(basic.Memorized(simple_df), basic.Bias())
    original.fit(lktu.ml_pandas.renamed.ratings)

    fn = tmp_path / 'fallback'
    original.save(fn)

    algo = basic.Fallback(basic.Memorized(simple_df), basic.Bias())
    algo.load(fn)

    bias = algo.algorithms[1]
    assert bias.mean_ == approx(lktu.ml_pandas.ratings.rating.mean())

    def exp_val(user, item):
        v = bias.mean_
        if user is not None:
            v += bias.user_offsets_.loc[user]
        if item is not None:
            v += bias.item_offsets_.loc[item]
        return v

    # first user + item
    preds = algo.predict_for_user(10, [1])
    assert preds.loc[1] == 4.0
    # second user + first item
    preds = algo.predict_for_user(15, [1])
    assert preds.loc[1] == approx(exp_val(15, 1))

    # second item + user item
    preds = algo.predict_for_user(12, [2])
    assert preds.loc[2] == approx(exp_val(12, 2))

    # blended
    preds = algo.predict_for_user(10, [1, 5])
    assert preds.loc[1] == 4.0
    assert preds.loc[5] == approx(exp_val(10, 5))

    # blended unknown
    preds = algo.predict_for_user(10, [5, 1, -23081])
    assert len(preds) == 3
    assert preds.loc[1] == 4.0
    assert preds.loc[5] == approx(exp_val(10, 5))
    assert preds.loc[-23081] == approx(exp_val(10, None))

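# Fallback semantics, as exercised above: each item is scored by the first
# component algorithm that can score it. Memorized answers the pairs it has
# seen (e.g. user 10, item 1 -> 4.0); Bias covers everything else, including
# unknown items, which fall back to global mean + user offset only.
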
def test_ii_train_ml100k(tmp_path):
    "Test an unbounded model on ML-100K"
    tmp_path = lktu.norm_path(tmp_path)
    ratings = lktu.ml100k.load_ratings()
    algo = knn.ItemItem(30)
    _log.info('training model')
    algo.fit(ratings)

    _log.info('testing model')
    assert all(np.logical_not(np.isnan(algo.sim_matrix_.values)))
    assert all(algo.sim_matrix_.values > 0)
    # a little tolerance
    assert all(algo.sim_matrix_.values < 1 + 1.0e-6)

    assert algo.item_counts_.sum() == algo.sim_matrix_.nnz

    means = ratings.groupby('item').rating.mean()
    assert means[algo.item_index_].values == approx(algo.item_means_)

    # save
    fn = tmp_path / 'ii.mod'
    _log.info('saving model to %s', fn)
    algo.save(fn)

    _log.info('reloading model')
    restored = knn.ItemItem(30)
    restored.load(fn)
    assert all(restored.sim_matrix_.values > 0)

    r_mat = restored.sim_matrix_
    o_mat = algo.sim_matrix_
    assert all(r_mat.rowptrs == o_mat.rowptrs)

    for i in range(len(restored.item_index_)):
        sp = r_mat.rowptrs[i]
        ep = r_mat.rowptrs[i + 1]
        # everything is in decreasing order
        assert all(np.diff(r_mat.values[sp:ep]) <= 0)
        assert all(r_mat.values[sp:ep] == o_mat.values[sp:ep])

def test_sweep_persist(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')
    sweep.persist_data()
    for i in range(1, 6):
        assert (work / 'ds{}-train.parquet'.format(i)).exists()
        assert (work / 'ds{}-test.parquet'.format(i)).exists()
    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 5 partitions
    assert len(runs) == 20

def test_als_save_load(tmp_path):
    tmp_path = lktu.norm_path(tmp_path)
    mod_file = tmp_path / 'als.npz'
    original = als.BiasedMF(20, iterations=5)
    ratings = lktu.ml_pandas.renamed.ratings
    original.fit(ratings)
    assert original.global_bias_ == approx(ratings.rating.mean())

    original.save(mod_file)
    assert mod_file.exists()

    algo = als.BiasedMF(20)
    algo.load(mod_file)
    assert algo.global_bias_ == original.global_bias_
    assert np.all(algo.user_bias_ == original.user_bias_)
    assert np.all(algo.item_bias_ == original.item_bias_)
    assert np.all(algo.user_features_ == original.user_features_)
    assert np.all(algo.item_features_ == original.item_features_)
    assert np.all(algo.item_index_ == original.item_index_)
    assert np.all(algo.user_index_ == original.user_index_)

def test_pop_save_load(tmp_path):
    tmp_path = lktu.norm_path(tmp_path)
    original = basic.Popular()
    original.fit(lktu.ml_pandas.renamed.ratings)

    fn = tmp_path / 'pop.mod'
    original.save(fn)

    algo = basic.Popular()
    algo.load(fn)

    counts = lktu.ml_pandas.renamed.ratings.groupby('item').user.count()
    counts = counts.nlargest(100)

    assert algo.item_pop_.max() == counts.max()

    recs = algo.recommend(2038, 100)
    assert len(recs) == 100
    assert all(np.diff(recs.score) <= 0)
    assert recs.score.iloc[0] == counts.max()
    # the 10 most popular should be the same
    assert all(counts.index[:10] == recs.item[:10])

def test_uu_save_load(tmp_path):
    tmp_path = lktu.norm_path(tmp_path)
    orig = knn.UserUser(30)
    _log.info('training model')
    orig.fit(ml_ratings)

    fn = tmp_path / 'uu.model'
    _log.info('saving to %s', fn)
    orig.save(fn)

    _log.info('reloading model')
    algo = knn.UserUser(30)
    algo.load(fn)

    _log.info('checking model')

    # it should have computed correct means
    umeans = ml_ratings.groupby('user').rating.mean()
    mlmeans = pd.Series(algo.user_means_, index=algo.user_index_, name='mean')
    umeans, mlmeans = umeans.align(mlmeans)
    assert mlmeans.values == approx(umeans.values)

    # we should be able to reconstruct rating values
    uir = ml_ratings.set_index(['user', 'item']).rating
    r_items = matrix.csr_rowinds(algo.transpose_matrix_)
    ui_rbdf = pd.DataFrame({
        'user': algo.user_index_[algo.transpose_matrix_.colinds],
        'item': algo.item_index_[r_items],
        'nrating': algo.transpose_matrix_.values
    }).set_index(['user', 'item'])
    ui_rbdf = ui_rbdf.join(mlmeans)
    ui_rbdf['rating'] = ui_rbdf['nrating'] + ui_rbdf['mean']
    ui_rbdf['orig_rating'] = uir
    assert ui_rbdf.rating.values == approx(ui_rbdf.orig_rating.values)

def test_uu_save_load_implicit(tmp_path):
    "Save and load user-user on an implicit data set."
    tmp_path = lktu.norm_path(tmp_path)
    orig = knn.UserUser(20, center=False, aggregate='sum')
    data = ml_ratings.loc[:, ['user', 'item']]
    orig.fit(data)
    orig.save(tmp_path / 'uu.mod')

    algo = knn.UserUser(20, center=False, aggregate='sum')
    algo.load(tmp_path / 'uu.mod')
    assert algo.user_means_ is None

    assert all(algo.user_index_ == orig.user_index_)
    assert all(algo.item_index_ == orig.item_index_)

    assert all(algo.rating_matrix_.rowptrs == orig.rating_matrix_.rowptrs)
    assert all(algo.rating_matrix_.colinds == orig.rating_matrix_.colinds)
    assert all(algo.rating_matrix_.values == orig.rating_matrix_.values)
    assert all(algo.transpose_matrix_.rowptrs == orig.transpose_matrix_.rowptrs)
    assert all(algo.transpose_matrix_.colinds == orig.transpose_matrix_.colinds)
    assert algo.transpose_matrix_.values is None