def test_sample_users(): """Sampling users when dataframe has non-unique indices""" ratings = lktu.ml_test.ratings ratings = ratings.set_index('user') ##forces non-unique index with pytest.raises(ValueError): for split in xf.sample_users(ratings, 5, 100, xf.SampleN(5)): pass
def test_fill_users():
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.precision)
    rla.add_metric(topn.recall)

    algo = UserUser(20, min_nbrs=10)
    algo = Recommender.adapt(algo)

    splits = xf.sample_users(ml_test.ratings, 1, 50, xf.SampleN(5))
    train, test = next(splits)
    algo.fit(train)

    rec_users = test['user'].sample(50).unique()
    recs = batch.recommend(algo, rec_users, 25)

    scores = rla.compute(recs, test, include_missing=True)
    assert len(scores) == test['user'].nunique()
    assert scores['recall'].notna().sum() == len(rec_users)
    assert all(scores['ntruth'] == 5)

    mscores = rla.compute(recs, test)
    assert len(mscores) < len(scores)

    recall = scores.loc[scores['recall'].notna(), 'recall'].copy()
    recall, mrecall = recall.align(mscores['recall'])
    assert all(recall == mrecall)
def split_dataset(ratings, user_fraction=.1):
    """Split a dataset into train/test data."""
    n_users = len(ratings['user'].unique())

    # There are many ways to split a dataset into (train, test) data; here are two:
    # - Row separation: the test set contains users that the model knows.
    #   The performance of the model is its ability to predict "new"
    #   tastes for a known user.
    # - User separation: the test set contains users that the model has
    #   never encountered. The performance of the model is its ability to
    #   predict new users' behaviour given the behaviour of other
    #   known users.
    # See the [lkpy documentation](https://lkpy.readthedocs.io/en/stable/crossfold.html).
    # Here the sampling is as follows:
    # - Sample user_fraction * n_users users
    # - Randomly select half of their listening events for the test set
    result = list(
        xf.sample_users(ratings[['user', 'item', 'rating']],
                        partitions=1,
                        size=int(n_users * user_fraction),
                        method=xf.SampleFrac(.5)))[0]
    print(f'n test users: {len(result.test["user"].unique())}')
    return result.train, result.test
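# split_dataset above implements the user-separation strategy. For the
# row-separation alternative described in its comments, lenskit.crossfold
# also provides sample_rows, which holds out individual interactions from
# known users instead of whole users. A minimal sketch; split_dataset_by_rows
# and n_test are illustrative names, not part of the original code.
def split_dataset_by_rows(ratings, n_test=10000):
    """Hold out n_test individual (user, item) interactions."""
    result = list(
        xf.sample_rows(ratings[['user', 'item', 'rating']],
                       partitions=1,
                       size=n_test))[0]
    return result.train, result.test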
def do_prepare(opts):
    name = opts['-d']
    ml = MovieLens(f'data/{name}')

    train, test = next(sample_users(ml.ratings, 1, 10000, SampleN(5)))
    test.to_parquet(f'data/{name}-test.parquet', index=False)

    _log.info('getting popular recs')
    pop = Popular()
    pop.fit(train)
    pop_recs = recommend(pop, test['user'].unique(), 100)

    _log.info('getting ALS recs')
    als = ImplicitMF(20, iterations=10)
    als = Recommender.adapt(als)
    als.fit(train.drop(columns=['rating']))
    als_recs = recommend(als, test['user'].unique(), 100)

    _log.info('merging recs')
    recs = pd.concat({
        'Popular': pop_recs,
        'ALS': als_recs
    }, names=['Algorithm'])
    recs.reset_index('Algorithm', inplace=True)
    recs.to_parquet(f'data/{name}-recs.parquet', index=False)
def test_adv_fill_users():
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.precision)
    rla.add_metric(topn.recall)

    a_uu = UserUser(30, min_nbrs=10)
    a_uu = Recommender.adapt(a_uu)
    a_ii = ItemItem(20, min_nbrs=4)
    a_ii = Recommender.adapt(a_ii)

    splits = xf.sample_users(ml_test.ratings, 2, 50, xf.SampleN(5))
    all_recs = {}
    all_test = {}
    for i, (train, test) in enumerate(splits):
        a_uu.fit(train)
        rec_users = test['user'].sample(50).unique()
        all_recs[(i + 1, 'UU')] = batch.recommend(a_uu, rec_users, 25)

        a_ii.fit(train)
        rec_users = test['user'].sample(50).unique()
        all_recs[(i + 1, 'II')] = batch.recommend(a_ii, rec_users, 25)
        all_test[i + 1] = test

    recs = pd.concat(all_recs, names=['part', 'algo'])
    recs.reset_index(['part', 'algo'], inplace=True)
    recs.reset_index(drop=True, inplace=True)

    test = pd.concat(all_test, names=['part'])
    test.reset_index(['part'], inplace=True)
    test.reset_index(drop=True, inplace=True)

    scores = rla.compute(recs, test, include_missing=True)
    inames = scores.index.names
    scores.sort_index(inplace=True)
    assert len(scores) == 50 * 4
    assert all(scores['ntruth'] == 5)
    assert scores['recall'].isna().sum() > 0
    _log.info('scores:\n%s', scores)

    ucounts = scores.reset_index().groupby('algo')['user'].agg(['count', 'nunique'])
    assert all(ucounts['count'] == 100)
    assert all(ucounts['nunique'] == 100)

    mscores = rla.compute(recs, test)
    mscores = mscores.reset_index().set_index(inames)
    mscores.sort_index(inplace=True)
    assert len(mscores) < len(scores)
    _log.info('mscores:\n%s', mscores)

    recall = scores.loc[scores['recall'].notna(), 'recall'].copy()
    recall, mrecall = recall.align(mscores['recall'])
    assert all(recall == mrecall)
def test_batch_predict_preshared():
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf

    algo = basic.Bias()
    splits = xf.sample_users(lktu.ml_test.ratings, 1, 100, xf.SampleN(5))
    train, test = next(splits)

    ares = lkb.train_isolated(algo, train)
    preds = lkb.predict(ares, test)
    assert len(preds) == len(test)
    assert not any(preds['prediction'].isna())
def test_sample_users_frac_oversize_ndj():
    ratings = lktu.ml_test.ratings
    splits = xf.sample_users(ratings, 20, 100, xf.SampleN(5), disjoint=False)
    splits = list(splits)
    assert len(splits) == 20

    for s in splits:
        ucounts = s.test.groupby('user').agg('count')
        assert len(ucounts) == 100
        assert len(s.test) == 5 * 100
        assert all(ucounts == 5)
        assert all(s.test.index.union(s.train.index) == ratings.index)
        assert len(s.test) + len(s.train) == len(ratings)
def test_tf_isvd(ml20m):
    algo = lenskit_tf.IntegratedBiasMF(20)

    def eval(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test)

    folds = xf.sample_users(ml20m, 2, 5000, xf.SampleFrac(0.2))
    preds = pd.concat(eval(train, test) for (train, test) in folds)

    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.60, abs=0.025)

    user_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert user_rmse.mean() == approx(0.92, abs=0.05)
def test_global_metric():
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    from lenskit.algorithms.bias import Bias

    train, test = next(xf.sample_users(lktu.ml_test.ratings, 1, 200, xf.SampleFrac(0.5)))
    algo = Bias()
    algo.fit(train)
    preds = batch.predict(algo, test)

    rmse = pm.global_metric(preds)
    assert rmse == pm.rmse(preds.prediction, preds.rating)

    mae = pm.global_metric(preds, metric=pm.mae)
    assert mae == pm.mae(preds.prediction, preds.rating)
def test_save_models(tmp_path, format):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, save_models=format)
    sweep.add_algorithms(Bias(5))
    sweep.add_algorithms(Popular())
    ratings = ml_test.ratings
    sweep.add_datasets(lambda: xf.sample_users(ratings, 2, 100, xf.SampleN(5)),
                       name='ml-small')
    sweep.run()

    runs = pd.read_parquet(fspath(tmp_path / 'runs.parquet'))
    runs = runs.set_index('RunId')

    for i in range(4):
        run_id = i + 1
        fn = work / 'model-{}'.format(run_id)
        if format is True:
            fn = fn.with_suffix('.pkl')
            assert fn.exists()
            with fn.open('rb') as f:
                algo = pickle.load(f)
        elif format == 'gzip':
            fn = fn.with_suffix('.pkl.gz')
            assert fn.exists()
            with gzip.open(fspath(fn), 'rb') as f:
                algo = pickle.load(f)
        elif format == 'joblib':
            fn = fn.with_suffix('.jlpkl')
            assert fn.exists()
            algo = joblib.load(fn)
        else:
            assert False

        assert algo is not None
        algo_class = algo.__class__.__name__
        if isinstance(algo, TopN):
            algo_class = algo.predictor.__class__.__name__
        assert algo_class == runs.loc[run_id, 'AlgoClass']
def test_sample_users():
    ratings = lktu.ml_test.ratings
    splits = xf.sample_users(ratings, 5, 100, xf.SampleN(5))
    splits = list(splits)
    assert len(splits) == 5

    for s in splits:
        ucounts = s.test.groupby('user').agg('count')
        assert len(s.test) == 5 * 100
        assert len(ucounts) == 100
        assert all(ucounts == 5)
        assert all(s.test.index.union(s.train.index) == ratings.index)
        assert len(s.test) + len(s.train) == len(ratings)

    # no overlapping users
    for s1, s2 in it.product(splits, splits):
        if s1 is s2:
            continue
        us1 = s1.test.user.unique()
        us2 = s2.test.user.unique()
        assert len(np.intersect1d(us1, us2)) == 0
def test_user_metric():
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    from lenskit.algorithms.bias import Bias

    train, test = next(xf.sample_users(lktu.ml_test.ratings, 1, 200, xf.SampleFrac(0.5)))
    algo = Bias()
    algo.fit(train)
    preds = batch.predict(algo, test)

    rmse = pm.user_metric(preds)
    u_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert rmse == approx(u_rmse.mean())

    mae = pm.user_metric(preds, metric=pm.mae)
    u_mae = preds.groupby('user').apply(lambda df: pm.mae(df.prediction, df.rating))
    assert mae == approx(u_mae.mean())
def test_sample_users_frac():
    ratings = lktu.ml_test.ratings
    splits = xf.sample_users(ratings, 5, 100, xf.SampleFrac(0.2))
    splits = list(splits)
    assert len(splits) == 5
    ucounts = ratings.groupby('user').item.count()
    uss = ucounts * 0.2

    for s in splits:
        tucs = s.test.groupby('user').item.count()
        assert len(tucs) == 100
        assert all(tucs >= uss.loc[tucs.index] - 1)
        assert all(tucs <= uss.loc[tucs.index] + 1)
        assert all(s.test.index.union(s.train.index) == ratings.index)
        assert len(s.test) + len(s.train) == len(ratings)

    # no overlapping users
    for s1, s2 in it.product(splits, splits):
        if s1 is s2:
            continue
        us1 = s1.test.user.unique()
        us2 = s2.test.user.unique()
        assert len(np.intersect1d(us1, us2)) == 0
def test_sample_users_frac_oversize():
    ratings = lktu.ml_test.ratings
    splits = xf.sample_users(ratings, 20, 100, xf.SampleN(5))
    splits = list(splits)
    assert len(splits) == 20

    for s in splits:
        ucounts = s.test.groupby('user').agg('count')
        assert len(ucounts) < 100
        assert all(ucounts == 5)
        assert all(s.test.index.union(s.train.index) == ratings.index)
        assert len(s.test) + len(s.train) == len(ratings)

    users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits))
    assert len(users) == ratings.user.nunique()
    assert users == set(ratings.user)

    # no overlapping users
    for s1, s2 in it.product(splits, splits):
        if s1 is s2:
            continue
        us1 = s1.test.user.unique()
        us2 = s2.test.user.unique()
        assert len(np.intersect1d(us1, us2)) == 0
def prune(df, condition):
    """Return the users with fewer interactions than ``condition``."""
    user_n = df.loc[df['count'] < condition]
    return user_n


game_count = groupby_count(result, 'user', 'item')
user_5 = prune(game_count, 5)
user_less_5 = user_5.index

# drop users with fewer than 5 games
pruned_data_5 = result.set_index('user').drop(user_less_5)
pruned_data_5.reset_index(inplace=True)

# pairs_user = list(partition_users(pruned_data_5, 5, xf.SampleN(1)))
pairs_user = list(sample_users(pruned_data_5, 5, 12000, xf.SampleN(1)))

pickle_out = open("sample_user.pickle", "wb")
pickle.dump(pairs_user, pickle_out)
pickle_out.close()

truth = pd.concat((p.test for p in pairs_user))
# truth.to_csv(r'results/steam/pruned_5.csv')


def algo_eval(path, algo, dataset):
    evaluation = batch.MultiEval(path=path, predict=False, recommend=100)
    evaluation.add_algorithms(algos=algo)
    evaluation.add_datasets(data=dataset)
    evaluation.run()
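# A hypothetical invocation of algo_eval, reusing the pairs_user splits built
# above; the output path and the Popular baseline are illustrative assumptions,
# not part of the original script.
from lenskit.algorithms.basic import Popular

algo_eval('results/steam/popular', Popular(), pairs_user)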