def test_sweep_oneshot(tmp_path):
    """Running a single job (run 3) with combine=False writes only per-run files."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')
    sweep.add_algorithms(Bias(damping=5))

    try:
        sweep.run(3)
    finally:
        # dump the run log (if any) to aid debugging on failure
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    # combined outputs must NOT exist — combining is disabled
    for combined in ['runs.csv', 'runs.parquet',
                     'predictions.parquet', 'recommendations.parquet']:
        assert not (work / combined).exists()

    # per-run outputs for run 3 must be present
    assert (work / 'run-3.json').exists()
    assert (work / 'predictions-3.parquet').exists()
    assert (work / 'recommendations-3.parquet').exists()

    with (work / 'run-3.json').open() as f:
        run = json.load(f)
    assert run['RunId'] == 3
def test_batch_rmse():
    """Check per-user RMSE of a damped Bias predictor over 5 user partitions.

    Fix: the inner helper was named ``eval``, shadowing the builtin; renamed
    to ``eval_fold``.
    """
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    import lenskit.algorithms.basic as bl

    ratings = lktu.ml100k.ratings
    algo = bl.Bias(damping=5)

    def eval_fold(train, test):
        # fit on the train partition, predict the held-out test ratings
        algo.fit(train)
        preds = batch.predict(algo, test)
        return preds.set_index(['user', 'item'])

    results = pd.concat((eval_fold(train, test)
                         for (train, test)
                         in xf.partition_users(ratings, 5, xf.SampleN(5))))

    user_rmse = results.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))

    # we should have all users
    users = ratings.user.unique()
    assert len(user_rmse) == len(users)
    missing = np.setdiff1d(users, user_rmse.index)
    assert len(missing) == 0

    # we should not have any missing values
    assert all(user_rmse.notna())

    # we should have a reasonable mean
    assert user_rmse.mean() == approx(0.93, abs=0.05)
def test_fill_users():
    """include_missing=True pads the analysis to every test user, with NA metrics."""
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.precision)
    rla.add_metric(topn.recall)

    algo = UserUser(20, min_nbrs=10)
    algo = Recommender.adapt(algo)

    train, test = next(xf.sample_users(ml_test.ratings, 1, 50, xf.SampleN(5)))
    algo.fit(train)

    # recommend only for a subset of the test users
    rec_users = test['user'].sample(50).unique()
    recs = batch.recommend(algo, rec_users, 25)

    scores = rla.compute(recs, test, include_missing=True)
    assert len(scores) == test['user'].nunique()
    assert scores['recall'].notna().sum() == len(rec_users)
    assert all(scores['ntruth'] == 5)

    # without include_missing, users lacking recs are dropped
    mscores = rla.compute(recs, test)
    assert len(mscores) < len(scores)

    # non-missing recall values must agree between the two computations
    recall = scores.loc[scores['recall'].notna(), 'recall'].copy()
    recall, mrecall = recall.align(mscores['recall'])
    assert all(recall == mrecall)
def test_sweep_nopreds(tmp_path):
    """A sweep whose test data lacks ratings yields recommendations but no predictions.

    Fix: removed the unused local ``bias_runs``.
    """
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, eval_n_jobs=1)

    ratings = ml_test.ratings
    # drop the rating column so prediction accuracy cannot be evaluated
    folds = [(train, test.drop(columns=['rating']))
             for (train, test)
             in xf.partition_users(ratings, 5, xf.SampleN(5))]
    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms(Popular())
    sweep.add_algorithms(Bias(damping=0))

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 2 algorithms by 5 partitions
    assert len(runs) == 10
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])

    recs = pd.read_parquet(work / 'recommendations.parquet')
    assert all(recs.RunId.isin(runs.RunId))
    assert recs['score'].dtype == np.float64
def test_sweep_save(tmp_path):
    """A MultiEval with persisted data survives a pickle round-trip and still runs."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')
    sweep.add_algorithms(Bias(damping=5))

    # round-trip the sweep object through pickle
    sweep.persist_data()
    pf = work / 'sweep.dat'
    with pf.open('wb') as f:
        pickle.dump(sweep, f)
    with pf.open('rb') as f:
        sweep = pickle.load(f)

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    for output in ['runs.csv', 'runs.parquet',
                   'predictions.parquet', 'recommendations.parquet']:
        assert (work / output).exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 1 algorithm by 5 partitions
    assert len(runs) == 5
def test_sweep_filenames(tmp_path):
    """MultiEval accepts train/test folds supplied as CSV file paths."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings

    # materialize each partition as a pair of CSV files
    folds = []
    for part, (train, test) in enumerate(xf.partition_users(ratings, 2, xf.SampleN(5))):
        train_fn = work / 'p{}-train.csv'.format(part)
        test_fn = work / 'p{}-test.csv'.format(part)
        train.to_csv(train_fn)
        test.to_csv(test_fn)
        folds.append((train_fn, test_fn))
    sweep.add_datasets(folds, DataSet='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 2 partitions
    assert len(runs) == 8
def test_sweep_norecs(tmp_path):
    """With recommend=None the sweep writes predictions but no recommendations."""
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, recommend=None)

    ratings = ml_test.ratings
    folds = xf.partition_users(ratings, 5, xf.SampleN(5))
    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 5 partitions
    assert len(runs) == 20
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])

    # the damping attribute is recorded for Bias runs only
    bias_runs = runs[runs.AlgoClass == 'Bias']
    assert all(bias_runs.damping.notna())
    pop_runs = runs[runs.AlgoClass == 'Popular']
    assert all(pop_runs.damping.isna())

    # only the Bias runs (a Predictor) produce predictions
    preds = pd.read_parquet(work / 'predictions.parquet')
    assert all(preds.RunId.isin(bias_runs.RunId))
def test_partition_users():
    """partition_users rejects data frames with a non-unique index."""
    ratings = lktu.ml_test.ratings
    # indexing by user forces a non-unique index
    ratings = ratings.set_index('user')
    with pytest.raises(ValueError):
        # materialize the generator so the validation actually runs
        list(xf.partition_users(ratings, 5, xf.SampleN(5)))
def test_sweep_combine(tmp_path):
    """With combine=False, results stay per-run until collect_results() merges them."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5)], attrs=['damping'])
    sweep.add_algorithms(Popular())

    sweep.persist_data()

    # persisted data sets are materialized as train/test parquet pairs
    for i in range(1, 6):
        assert (work / 'ds{}-train.parquet'.format(i)).exists()
        assert (work / 'ds{}-test.parquet'.format(i)).exists()
    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    assert sweep.run_count() == 5 * 3

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    # nothing has been combined yet
    assert not (work / 'runs.csv').exists()
    assert not (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    # but every run has its own output files
    for i, (ds, a) in enumerate(sweep._flat_runs()):
        run = i + 1
        assert (work / 'run-{}.json'.format(run)).exists()
        if isinstance(a.algorithm, Predictor):
            # only predictors write predictions
            assert (work / 'predictions-{}.parquet'.format(run)).exists()
        assert (work / 'recommendations-{}.parquet'.format(run)).exists()

    sweep.collect_results()

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    assert len(runs) == 5 * 3
def test_sample_n():
    """SampleN(n) draws exactly n test rows per user, partitioning each user's rows.

    Fix: the original duplicated the whole check loop for sizes 5 and 10;
    folded into a single parameterized loop.
    """
    ratings = lktu.ml_test.ratings
    users = np.random.choice(ratings.user.unique(), 5, replace=False)

    for size in (5, 10):
        sample = xf.SampleN(size)
        for u in users:
            udf = ratings[ratings.user == u]
            tst = sample(udf)
            # the remainder of the user's rows forms the train side
            trn = udf.loc[udf.index.difference(tst.index), :]
            assert len(tst) == size
            assert len(tst) + len(trn) == len(udf)
def test_adv_fill_users():
    """include_missing=True behaves correctly with multiple parts and algorithms."""
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.precision)
    rla.add_metric(topn.recall)

    a_uu = UserUser(30, min_nbrs=10)
    a_uu = Recommender.adapt(a_uu)
    a_ii = ItemItem(20, min_nbrs=4)
    a_ii = Recommender.adapt(a_ii)

    splits = xf.sample_users(ml_test.ratings, 2, 50, xf.SampleN(5))
    all_recs = {}
    all_test = {}
    for i, (train, test) in enumerate(splits):
        # recommend for a fresh 50-user sample per algorithm
        a_uu.fit(train)
        rec_users = test['user'].sample(50).unique()
        all_recs[(i + 1, 'UU')] = batch.recommend(a_uu, rec_users, 25)

        a_ii.fit(train)
        rec_users = test['user'].sample(50).unique()
        all_recs[(i + 1, 'II')] = batch.recommend(a_ii, rec_users, 25)
        all_test[i + 1] = test

    # flatten recs and truth, keeping part/algo as plain columns
    recs = pd.concat(all_recs, names=['part', 'algo'])
    recs.reset_index(['part', 'algo'], inplace=True)
    recs.reset_index(drop=True, inplace=True)

    test = pd.concat(all_test, names=['part'])
    test.reset_index(['part'], inplace=True)
    test.reset_index(drop=True, inplace=True)

    scores = rla.compute(recs, test, include_missing=True)
    inames = scores.index.names
    scores.sort_index(inplace=True)
    assert len(scores) == 50 * 4
    assert all(scores['ntruth'] == 5)
    assert scores['recall'].isna().sum() > 0
    _log.info('scores:\n%s', scores)

    # every algorithm covers 100 distinct users across the 2 parts
    ucounts = scores.reset_index().groupby('algo')['user'].agg(['count', 'nunique'])
    assert all(ucounts['count'] == 100)
    assert all(ucounts['nunique'] == 100)

    mscores = rla.compute(recs, test)
    mscores = mscores.reset_index().set_index(inames)
    mscores.sort_index(inplace=True)
    assert len(mscores) < len(scores)
    _log.info('mscores:\n%s', mscores)

    # non-missing recall values agree between the two computations
    recall = scores.loc[scores['recall'].notna(), 'recall'].copy()
    recall, mrecall = recall.align(mscores['recall'])
    assert all(recall == mrecall)
def test_batch_predict_preshared():
    """Predicting with an isolated (pre-shared) trained model covers the whole test set."""
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf

    algo = basic.Bias()
    train, test = next(xf.sample_users(lktu.ml_test.ratings, 1, 100, xf.SampleN(5)))

    # train in an isolated process/memory space, then predict from the handle
    ares = lkb.train_isolated(algo, train)
    preds = lkb.predict(ares, test)

    assert len(preds) == len(test)
    assert not any(preds['prediction'].isna())
def create_train_test_rec_data(self):
    """Split the dense data into one train and one test set and save them as CSVs.

    No cross-validation for now — a single user partition is used.

    Fix: dropped the unused ``enumerate`` index.

    Returns:
        (train, test): the train/test data frames from the single partition.
    """
    # partitions=1, so this loop runs exactly once
    for tp in xf.partition_users(data=self.data_dense, partitions=1,
                                 method=xf.SampleN(5), rng_spec=1):
        train = tp.train
        test = tp.test
        train.to_csv(
            f'{conf.SYN_DATA_DIR}syn_train_{self.current_date}.csv')
        test.to_csv(f'{conf.SYN_DATA_DIR}syn_test_{self.current_date}.csv')
    return train, test
def test_sample_users_frac_oversize_ndj():
    """Non-disjoint sampling supports 20 samples of 100 users each."""
    ratings = lktu.ml_test.ratings
    splits = list(xf.sample_users(ratings, 20, 100, xf.SampleN(5), disjoint=False))
    assert len(splits) == 20

    for split in splits:
        # exactly 100 users with 5 test rows apiece
        per_user = split.test.groupby('user').agg('count')
        assert len(per_user) == 100
        assert len(split.test) == 5 * 100
        assert all(per_user == 5)
        # train and test together exactly cover the original data
        assert all(split.test.index.union(split.train.index) == ratings.index)
        assert len(split.test) + len(split.train) == len(ratings)
def create_save_train_val_test_rec_data(dense_data, fn):
    """Split ``dense_data`` into train/val/test sets, save them as CSVs, return them.

    No cross-validation for now — each split uses a single user partition.

    Fix: dropped the unused ``enumerate`` indices from both loops.

    Args:
        dense_data: ratings frame to split.
        fn: filename suffix for the saved CSVs.

    Returns:
        (train, val, test) data frames.
    """
    # first split: carve the test set off the full data (loop runs once)
    for tp in xf.partition_users(data=dense_data, partitions=1,
                                 method=xf.SampleN(5), rng_spec=1):
        train = tp.train
        test = tp.test
        test.to_csv(f'{conf.SYN_DATA_DIR}syn_test_{fn}.csv')
    print("[INFO] Train/test split created")

    # second split: carve the validation set off the training data;
    # note `train` is deliberately rebound to the smaller train set here
    for tp in xf.partition_users(data=train, partitions=1,
                                 method=xf.SampleN(5), rng_spec=1):
        train = tp.train
        val = tp.test
        train.to_csv(f'{conf.SYN_DATA_DIR}syn_train_{fn}.csv')
        val.to_csv(f'{conf.SYN_DATA_DIR}syn_val_{fn}.csv')
    print("[INFO] Train/val split created")

    return train, val, test
def test_partition_users():
    """Five user partitions jointly cover all users, 5 test rows per test user."""
    ratings = lktu.ml_pandas.renamed.ratings
    splits = list(xf.partition_users(ratings, 5, xf.SampleN(5)))
    assert len(splits) == 5

    for split in splits:
        per_user = split.test.groupby('user').agg('count')
        assert all(per_user == 5)
        # each split exactly partitions the original rows
        assert all(split.test.index.union(split.train.index) == ratings.index)
        assert len(split.test) + len(split.train) == len(ratings)

    # every user appears in some test set
    users = set()
    for split in splits:
        users |= set(split.test.user)
    assert len(users) == ratings.user.nunique()
    assert users == set(ratings.user)
def test_save_models(tmp_path, format):
    """Models are saved in the requested format and round-trip to the right class."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, save_models=format)

    sweep.add_algorithms(Bias(5))
    sweep.add_algorithms(Popular())
    ratings = ml_test.ratings
    sweep.add_datasets(lambda: xf.sample_users(ratings, 2, 100, xf.SampleN(5)),
                       name='ml-small')

    sweep.run()

    runs = pd.read_parquet(fspath(tmp_path / 'runs.parquet'))
    runs = runs.set_index('RunId')

    # 2 algorithms x 2 samples = runs 1..4
    for run_id in range(1, 5):
        fn = work / 'model-{}'.format(run_id)
        if format is True:
            # plain pickle
            fn = fn.with_suffix('.pkl')
            assert fn.exists()
            with fn.open('rb') as f:
                algo = pickle.load(f)
        elif format == 'gzip':
            # gzip-compressed pickle
            fn = fn.with_suffix('.pkl.gz')
            assert fn.exists()
            with gzip.open(fspath(fn), 'rb') as f:
                algo = pickle.load(f)
        elif format == 'joblib':
            fn = fn.with_suffix('.jlpkl')
            assert fn.exists()
            algo = joblib.load(fn)
        else:
            assert False

        assert algo is not None
        # TopN wraps the underlying predictor; compare against the inner class
        algo_class = algo.__class__.__name__
        if isinstance(algo, TopN):
            algo_class = algo.predictor.__class__.__name__
        assert algo_class == runs.loc[run_id, 'AlgoClass']
def test_sweep_persist(tmp_path):
    """persist_data() materializes data sets as parquet paths; the sweep still runs."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')
    sweep.persist_data()

    # each of the 5 partitions becomes a train/test parquet pair
    for i in range(1, 6):
        assert (work / 'ds{}-train.parquet'.format(i)).exists()
        assert (work / 'ds{}-test.parquet'.format(i)).exists()
    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 5 partitions
    assert len(runs) == 20
def test_sample_users():
    """Five disjoint 100-user samples, 5 test rows each, with no user overlap."""
    ratings = lktu.ml_test.ratings
    splits = list(xf.sample_users(ratings, 5, 100, xf.SampleN(5)))
    assert len(splits) == 5

    for split in splits:
        per_user = split.test.groupby('user').agg('count')
        assert len(split.test) == 5 * 100
        assert len(per_user) == 100
        assert all(per_user == 5)
        assert all(split.test.index.union(split.train.index) == ratings.index)
        assert len(split.test) + len(split.train) == len(ratings)

    # no overlapping users between any two samples
    for s1, s2 in it.combinations(splits, 2):
        us1 = s1.test.user.unique()
        us2 = s2.test.user.unique()
        assert len(np.intersect1d(us1, us2)) == 0
def test_sample_users_frac_oversize():
    """Requesting more disjoint samples than users allow yields undersized samples."""
    ratings = lktu.ml_test.ratings
    splits = list(xf.sample_users(ratings, 20, 100, xf.SampleN(5)))
    assert len(splits) == 20

    for split in splits:
        per_user = split.test.groupby('user').agg('count')
        # not enough users for 20 disjoint samples of 100 each
        assert len(per_user) < 100
        assert all(per_user == 5)
        assert all(split.test.index.union(split.train.index) == ratings.index)
        assert len(split.test) + len(split.train) == len(ratings)

    # together the samples still cover every user
    users = set()
    for split in splits:
        users |= set(split.test.user)
    assert len(users) == ratings.user.nunique()
    assert users == set(ratings.user)

    # and the samples remain disjoint
    for s1, s2 in it.combinations(splits, 2):
        us1 = s1.test.user.unique()
        us2 = s2.test.user.unique()
        assert len(np.intersect1d(us1, us2)) == 0
def test_partition_may_skip_train():
    """Partitioning when users may not have enough ratings to be in the train set and test set."""
    ratings = lktu.ml_test.ratings
    # subsample so that some users end up with only a single rating
    ratings = ratings.sample(frac=0.1)
    users = ratings.groupby('user')['rating'].count()
    assert users.min() == 1.0  # we should have some small users!
    users.name = 'ur_count'

    splits = list(xf.partition_users(ratings, 5, xf.SampleN(1)))
    assert len(splits) == 5

    # some users must be absent from train, and no NaN ratings anywhere
    for train, test in splits:
        # no null ratings
        assert all(train['rating'].notna())

        test = test.join(users, on='user')
        # a single-rating user's only row went to test, so they vanish from train
        assert all(~(test.loc[test['ur_count'] == 1, 'user']
                     .isin(train['user'].unique())))
        # users with more than one rating keep at least one row in train
        assert all(test.loc[test['ur_count'] > 1, 'user']
                   .isin(train['user'].unique()))
def main(args):
    """Partition the named data set by users and write each test set as a CSV.

    Args:
        args: docopt-style mapping with DATASET, -p (partitions) and -o (output dir).
    """
    dsname = args.get('DATASET')
    partitions = int(args.get('-p'))
    output = args.get('-o')

    _log.info('locating data set %s', dsname)
    data = getattr(datasets, dsname)
    _log.info('loading ratings')
    ratings = data.ratings

    path = Path(output)
    path.mkdir(exist_ok=True, parents=True)
    _log.info('writing to %s', path)

    test_rows_per_user = 5
    parts = xf.partition_users(ratings, partitions, xf.SampleN(test_rows_per_user))
    for i, tp in enumerate(parts, 1):
        # train sets are currently not written out:
        # _log.info('writing train set %d', i)
        # tp.train.to_csv(path / f'train-{i}.csv.gz', index=False)
        _log.info('writing test set %d', i)
        tp.test.index.name = 'index'
        tp.test.to_csv(path / f'test-{i}.csv.gz')
from docopt import docopt
from lkdemo import datasets, log
from pathlib import Path

import lenskit.crossfold as xf

_log = log.script(__file__)

# parse command-line arguments from the module docstring
args = docopt(__doc__)
dsname = args.get('DATASET')
partitions = int(args.get('-p'))
output = args.get('-o')

_log.info('locating data set %s', dsname)
data = getattr(datasets, dsname)
_log.info('loading ratings')
ratings = data.ratings

path = Path(output)
path.mkdir(exist_ok=True, parents=True)
_log.info('writing to %s', path)

# hold out 5 ratings per user in each partition's test set
test_rows_per_user = 5
for i, tp in enumerate(xf.partition_users(ratings, partitions,
                                          xf.SampleN(test_rows_per_user)), 1):
    tp.train.to_csv(path / f'train-{i}.csv', index=False)
    tp.test.to_csv(path / f'test-{i}.csv', index=False)
""" from docopt import docopt from lkdemo import datasets, log from pathlib import Path import lenskit.crossfold as xf _log = log.script(__file__) args = docopt(__doc__) dsname = args.get('DATASET') partitions = int(args.get('-p')) output = args.get('-o') _log.info('locating data set %s', dsname) data = getattr(datasets, dsname) _log.info('loading ratings') ratings = data.ratings path = Path(output) path.mkdir(exist_ok=True, parents=True) _log.info('writing to %s', path) testRowsPerUsers = 5 for i, tp in enumerate(xf.partition_users(ratings, partitions, xf.SampleN(testRowsPerUsers)),1): tp.train.to_csv(path / f'train-{i}.csv.gz', index=False) tp.test.to_csv(path / f'test-{i}.csv.gz' , index=False)
user_n = df.loc[df['count'] < condition] return user_n game_count = groupby_count(result, 'user', 'item') user_5 = prune(game_count, 5) user_less_5 = user_5.index user_less_5 pruned_data_5 = result.set_index('user').drop(user_less_5) pruned_data_5.reset_index(inplace=True) #pairs_user = list(partition_users(pruned_data_5, 5, xf.SampleN(1))) pairs_user = list(sample_users(pruned_data_5, 5, 12000, xf.SampleN(1))) pickle_out = open("sample_user.pickle", "wb") pickle.dump(pairs_user, pickle_out) pickle_out.close() truth = pd.concat((p.test for p in pairs_user)) #truth.to_csv(r'results/steam/pruned_5.csv') def algo_eval(path, algo, dataset): evaluation = batch.MultiEval(path=path, predict=False, recommend=100) evaluation.add_algorithms(algos=algo) evaluation.add_datasets(data=dataset) evaluation.run()
# read in the movielens 100k ratings with pandas # https://grouplens.org/datasets/movielens/100k/ ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user', 'item', 'rating', 'timestamp']) algoKNN = knn.ItemItem(30) algoFunk = funk.FunkSVD(2) algoAls = als.BiasedMF(20) # split the data in a test and a training set # for each user leave one row out for test purpose data = ratings nb_partitions = 1 splits = xf.partition_users(data, nb_partitions, xf.SampleN(1)) for (trainSet, testSet) in splits: train = trainSet test = testSet # train model modelKNN = algoKNN.fit(train) modelFunk = algoFunk.fit(train) modelALS = algoAls.fit(train) users = test.user.unique() def get_recommendations_Funk_SVD(user_id, nb_recommendations = 1): ''' Return a recommendation