示例#1
0
def test_implicit_als_batch_accuracy():
    """Cross-validated nDCG sanity check for implicit-feedback ALS.

    Fits a fresh clone of the algorithm on each of 5 user partitions and
    asserts the mean nDCG over all test users is positive.
    """
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    ratings = lktu.ml100k.ratings

    algo_t = ALS(25)

    # renamed from `eval` to avoid shadowing the builtin
    def evaluate(train, test):
        """Fit a clone on one fold and return its top-100 recommendations."""
        _log.info('running training')
        # assign() returns a new frame instead of mutating the shared fold data
        train = train.assign(rating=train.rating.astype(np.float_))
        algo = util.clone(algo_t)
        algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        recs = batch.recommend(algo, users, 100)
        return recs

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    test = pd.concat(f.test for f in folds)

    recs = pd.concat(evaluate(train, test) for (train, test) in folds)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    dcg = results.ndcg
    _log.info('nDCG for %d users is %.4f', len(dcg), dcg.mean())
    assert dcg.mean() > 0
示例#2
0
def test_als_implicit_batch_accuracy():
    """Cross-validated nDCG sanity check for ImplicitMF over 5 user folds."""
    import lenskit.crossfold as xf
    from lenskit import batch
    from lenskit import topn

    ratings = lktu.ml100k.load_ratings()

    algo = als.ImplicitMF(25, iterations=20)

    # renamed from `eval` to avoid shadowing the builtin
    def evaluate(train, test):
        """Fit on one fold and recommend 100 unrated items per test user."""
        _log.info('running training')
        # assign() returns a new frame instead of mutating the shared fold data
        train = train.assign(rating=train.rating.astype(np.float_))
        algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        candidates = topn.UnratedCandidates(train)
        recs = batch.recommend(algo, users, 100, candidates)
        return recs

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    test = pd.concat(te for (tr, te) in folds)
    recs = pd.concat(evaluate(train, test) for (train, test) in folds)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    _log.info('nDCG for users is %.4f', results.ndcg.mean())
    assert results.ndcg.mean() > 0
示例#3
0
def test_fsvd_batch_accuracy():
    """FunkSVD (with bias fallback) prediction accuracy over 5 user folds."""
    from lenskit.algorithms import basic
    from lenskit.algorithms import bias
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    svd_algo = svd.FunkSVD(25, 125, damping=10)
    algo = basic.Fallback(svd_algo, bias.Bias(damping=10))

    def eval(train, test):
        """Fit on the training fold and predict the matching test fold."""
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test)

    fold_preds = [eval(train, test)
                  for (train, test) in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))]
    preds = pd.concat(fold_preds)

    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.74, abs=0.025)

    per_user = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert per_user.mean() == approx(0.92, abs=0.05)
示例#4
0
def test_tf_bmf_batch_accuracy(tf_session):
    """TensorFlow BiasedMF (with bias fallback) prediction accuracy over 5 folds."""
    from lenskit.algorithms import basic
    from lenskit.algorithms import bias
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    mf = lktf.BiasedMF(25,
                       damping=10,
                       batch_size=1024,
                       epochs=20,
                       rng_spec=42)
    algo = basic.Fallback(mf, bias.Bias(damping=10))

    def eval(train, test):
        """Fit on the training fold and predict the matching test fold."""
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test)

    fold_preds = [eval(train, test)
                  for (train, test) in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))]
    preds = pd.concat(fold_preds)

    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.83, abs=0.025)

    per_user = preds.groupby('user').apply(
        lambda df: pm.rmse(df.prediction, df.rating))
    assert per_user.mean() == approx(1.03, abs=0.05)
示例#5
0
def test_sweep_oneshot(tmp_path):
    """Run a single evaluation (run 3) with combine=False and verify only that
    run's individual output files are produced."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')
    sweep.add_algorithms(Bias(damping=5))

    try:
        sweep.run(3)
    finally:
        # show any partial results for debugging, even on failure
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    # combine=False + single run: no combined outputs may exist
    for combined in ('runs.csv', 'runs.parquet',
                     'predictions.parquet', 'recommendations.parquet'):
        assert not (work / combined).exists()

    # run 3's per-run outputs must all exist
    assert (work / 'run-3.json').exists()
    assert (work / 'predictions-3.parquet').exists()
    assert (work / 'recommendations-3.parquet').exists()

    with (work / 'run-3.json').open() as f:
        run = json.load(f)
    assert run['RunId'] == 3
示例#6
0
def test_sweep_nopreds(tmp_path):
    """Sweep over rating-less test folds: runs and recommendations are written,
    but no predictions file is produced.

    Fix: removed the dead local `bias_runs`, which was assigned and never used.
    """
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, eval_n_jobs=1)

    ratings = ml_test.ratings
    # drop the rating column so prediction evaluation has nothing to score
    folds = [(train, test.drop(columns=['rating']))
             for (train, test) in xf.partition_users(ratings, 5, xf.SampleN(5))
             ]
    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms(Popular())
    sweep.add_algorithms(Bias(damping=0))

    try:
        sweep.run()
    finally:
        # show any partial results for debugging, even on failure
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 2 algorithms by 5 partitions
    assert len(runs) == 10
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])

    recs = pd.read_parquet(work / 'recommendations.parquet')
    assert all(recs.RunId.isin(runs.RunId))
    assert recs['score'].dtype == np.float64
示例#7
0
def test_tf_bpr_batch_accuracy(tf_session):
    """TensorFlow BPR top-N accuracy over 5 user folds (nDCG must beat 0.1)."""
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    ratings = lktu.ml100k.ratings

    algo = Recommender.adapt(lktf.BPR(20, batch_size=1024, epochs=20, rng_spec=42))

    rec_frames = []
    test_frames = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        rec_frames.append(batch.recommend(algo, np.unique(test.user), 50))
        test_frames.append(test)

    _log.info('analyzing results')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    rla.add_metric(topn.recip_rank)
    all_recs = pd.concat(rec_frames, ignore_index=True)
    all_test = pd.concat(test_frames, ignore_index=True)
    # include_missing=True keeps users with no recs; score them as 0
    scores = rla.compute(all_recs, all_test, include_missing=True)
    scores = scores.fillna(0)
    _log.info('MRR: %f', scores['recip_rank'].mean())
    _log.info('nDCG: %f', scores['ndcg'].mean())
    assert scores['ndcg'].mean() > 0.1
示例#8
0
def test_bias_batch_predict(ncpus):
    """Parallel batch prediction with the bias model; RMSE must be near 0.95."""
    from lenskit.algorithms import bias
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    algo = bias.Bias(damping=5)

    def eval(train, test):
        """Fit on one fold and predict its test ratings with ncpus workers."""
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test, n_jobs=ncpus)

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    preds = pd.concat([eval(train, test) for (train, test) in folds])

    _log.info('analyzing predictions')
    rmse = pm.rmse(preds.prediction, preds.rating)
    _log.info('RMSE is %f', rmse)
    assert rmse == pytest.approx(0.95, abs=0.1)
示例#9
0
def test_als_batch_accuracy():
    """Compare the LU and CD solvers of BiasedMF on the same 5-fold split.

    Both solvers should reach the same MAE/RMSE ballpark.
    Fix: removed the unused `basic` import and leftover commented-out code.
    """
    import lenskit.crossfold as xf
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    lu_algo = als.BiasedMF(25, iterations=20, damping=5, method='lu')
    cd_algo = als.BiasedMF(25, iterations=25, damping=5, method='cd')

    def eval(train, test):
        """Fit both solvers on one fold and attach their predictions."""
        _log.info('training LU')
        lu_algo.fit(train)
        _log.info('training CD')
        cd_algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return test.assign(lu_pred=lu_algo.predict(test), cd_pred=cd_algo.predict(test))

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    preds = pd.concat(eval(train, test) for (train, test) in folds)
    preds['abs_diff'] = np.abs(preds.lu_pred - preds.cd_pred)
    _log.info('predictions:\n%s', preds.sort_values('abs_diff', ascending=False))
    _log.info('diff summary:\n%s', preds.abs_diff.describe())

    lu_mae = pm.mae(preds.lu_pred, preds.rating)
    assert lu_mae == approx(0.73, abs=0.025)
    cd_mae = pm.mae(preds.cd_pred, preds.rating)
    assert cd_mae == approx(0.73, abs=0.025)

    user_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.lu_pred, df.rating))
    assert user_rmse.mean() == approx(0.91, abs=0.05)
    user_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.cd_pred, df.rating))
    assert user_rmse.mean() == approx(0.91, abs=0.05)
def test_bias_batch_recommend():
    """Top-N recommendation with the bias model over 5 user folds.

    Fix: `pytest.skip()` raises Skipped itself, so the previous
    `raise pytest.skip()` was redundant and obscured intent.
    """
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    if not os.path.exists('ml-100k/u.data'):
        pytest.skip('ml-100k data not available')

    ratings = pd.read_csv('ml-100k/u.data',
                          sep='\t',
                          names=['user', 'item', 'rating', 'timestamp'])

    algo = basic.Bias(damping=5)
    algo = TopN(algo)

    def eval(train, test):
        """Fit the bias recommender on one fold and recommend for its users."""
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        recs = batch.recommend(algo, test.user.unique(), 100)
        return recs

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    test = pd.concat(y for (x, y) in folds)

    recs = pd.concat(eval(train, test) for (train, test) in folds)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    dcg = results.ndcg
    _log.info('nDCG for %d users is %f (max=%f)', len(dcg), dcg.mean(),
              dcg.max())
    assert dcg.mean() > 0
示例#11
0
def test_alogrithms():
    # NOTE: the misspelled name is kept so external test selection still matches.
    """End-to-end comparison of several algorithms on ML-1M, plotting mean nDCG."""
    data = ML1M('ml-1m')
    ratings = data.ratings
    print('Initial ratings table head:')
    print(ratings.head())
    algorithms = [
        basic.Bias(damping=5),
        basic.Popular(),
        item_knn.ItemItem(20),
        user_knn.UserUser(20),
        als.BiasedMF(50),
        als.ImplicitMF(50),
        funksvd.FunkSVD(50),
    ]
    triples = ratings[['user', 'item', 'rating']]
    pairs = list(partition_users(triples, 5, SampleFrac(0.2)))
    eval_algorithms(dataset=pairs, algorithms=algorithms)
    runs = display_runs()
    recs = display_recommendations()
    truth = pd.concat((p.test for p in pairs), ignore_index=True)
    ndcg_means = check_recommendations(runs, recs, truth)
    print('NDCG means:')
    print(ndcg_means)
    plot_comparison(ndcg_means)
示例#12
0
def test_sweep_save(tmp_path):
    """Pickle a persisted MultiEval, reload it, run it, and check the outputs."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')
    sweep.add_algorithms(Bias(damping=5))

    # persist datasets so the sweep survives a pickle round-trip
    sweep.persist_data()
    pickle_file = work / 'sweep.dat'
    with pickle_file.open('wb') as f:
        pickle.dump(sweep, f)
    with pickle_file.open('rb') as f:
        sweep = pickle.load(f)

    try:
        sweep.run()
    finally:
        # show any partial results for debugging, even on failure
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    for produced in ('runs.csv', 'runs.parquet',
                     'predictions.parquet', 'recommendations.parquet'):
        assert (work / produced).exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 1 algorithm by 5 partitions
    assert len(runs) == 5
示例#13
0
def test_uu_implicit_batch_accuracy():
    """Implicit-feedback user-user kNN top-N accuracy over 5 user folds."""
    from lenskit import batch, topn
    import lenskit.crossfold as xf

    ratings = lktu.ml100k.ratings

    algo = knn.UserUser(30, center=False, aggregate='sum')

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    all_test = pd.concat(f.test for f in folds)

    fold_recs = []
    for train, test in folds:
        _log.info('running training')
        rec_algo = Recommender.adapt(algo)
        # implicit setting: train on user/item pairs only, no ratings
        rec_algo.fit(train.loc[:, ['user', 'item']])
        _log.info('testing %d users', test.user.nunique())
        fold_recs.append(batch.recommend(rec_algo, test.user.unique(), 100, n_jobs=2))
    recs = pd.concat(fold_recs)

    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, all_test)
    mean_dcg = results.ndcg.mean()
    assert mean_dcg >= 0.03
示例#14
0
def test_sweep_filenames(tmp_path):
    """Sweep with file-backed datasets: folds are saved to CSV and passed as paths."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    folds = []
    # write each partition to disk so the sweep loads train/test from files
    for part, (train, test) in enumerate(xf.partition_users(ratings, 2, xf.SampleN(5))):
        trfn = work / 'p{}-train.csv'.format(part)
        tefn = work / 'p{}-test.csv'.format(part)
        train.to_csv(trfn)
        test.to_csv(tefn)
        folds.append((trfn, tefn))

    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        # show any partial results for debugging, even if run() failed
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    # combined outputs should all be present after a full run
    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 2 partitions
    assert len(runs) == 8
示例#15
0
def test_sweep_norecs(tmp_path):
    """Sweep with recommend=None: predictions are written, recommendations are not."""
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, recommend=None)

    ratings = ml_test.ratings
    folds = xf.partition_users(ratings, 5, xf.SampleN(5))
    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        # show any partial results for debugging, even on failure
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    # recommend=None disables recommendation output entirely
    assert not (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 5 partitions
    assert len(runs) == 20
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])

    # damping attribute is recorded for Bias runs only
    bias_runs = runs[runs.AlgoClass == 'Bias']
    assert all(bias_runs.damping.notna())
    pop_runs = runs[runs.AlgoClass == 'Popular']
    assert all(pop_runs.damping.isna())

    preds = pd.read_parquet(work / 'predictions.parquet')
    assert all(preds.RunId.isin(bias_runs.RunId))
示例#16
0
def test_ii_batch_recommend(ncpus):
    """Item-item kNN top-N accuracy over 5 user folds (nDCG must beat 0.03)."""
    import lenskit.crossfold as xf
    from lenskit import topn

    ratings = lktu.ml100k.ratings

    def eval(train, test):
        """Train a fresh item-item recommender on one fold and recommend."""
        _log.info('running training')
        algo = Recommender.adapt(knn.ItemItem(30))
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.recommend(algo, test.user.unique(), 100, n_jobs=ncpus)

    all_test = []
    all_recs = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        all_test.append(test)
        all_recs.append(eval(train, test))

    test = pd.concat(all_test)
    recs = pd.concat(all_recs)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    dcg = rla.compute(recs, test).ndcg
    _log.info('nDCG for %d users is %f', len(dcg), dcg.mean())
    assert dcg.mean() > 0.03
示例#17
0
def test_batch_rmse():
    """Per-user RMSE of the bias model over a 5-fold, 5-ratings-per-user split."""
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    import lenskit.algorithms.basic as bl

    ratings = lktu.ml100k.ratings
    algo = bl.Bias(damping=5)

    def eval(train, test):
        """Fit on the training fold and index predictions by (user, item)."""
        algo.fit(train)
        return batch.predict(algo, test).set_index(['user', 'item'])

    folds = xf.partition_users(ratings, 5, xf.SampleN(5))
    results = pd.concat([eval(train, test) for (train, test) in folds])

    user_rmse = results.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))

    # every user should be present
    users = ratings.user.unique()
    assert len(user_rmse) == len(users)
    assert len(np.setdiff1d(users, user_rmse.index)) == 0

    # with no missing scores
    assert all(user_rmse.notna())

    # and a plausible average error
    assert user_rmse.mean() == approx(0.93, abs=0.05)
示例#18
0
def test_hpf_batch_accuracy():
    """Cross-validated DCG sanity check for hierarchical Poisson factorization."""
    import lenskit.crossfold as xf
    from lenskit import batch, topn
    import lenskit.metrics.topn as lm

    ratings = lktu.ml100k.load_ratings()

    algo = hpf.HPF(25)

    # renamed from `eval` to avoid shadowing the builtin
    def evaluate(train, test):
        """Fit HPF on one fold and recommend 100 unrated items per test user."""
        _log.info('running training')
        # assign() returns a new frame instead of mutating the shared fold data
        train = train.assign(rating=train.rating.astype(np.float_))
        algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        candidates = topn.UnratedCandidates(train)
        recs = batch.recommend(algo, users, 100, candidates, test)
        return recs

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    recs = pd.concat(evaluate(train, test) for (train, test) in folds)

    _log.info('analyzing recommendations')
    dcg = recs.groupby('user').rating.apply(lm.dcg)
    _log.info('dcg for users is %.4f', dcg.mean())
    assert dcg.mean() > 0
示例#19
0
def test_ii_batch_recommend(ncpus):
    """Item-item kNN top-N accuracy, loading ML-100k directly from disk.

    Fix: `pytest.skip()` raises Skipped itself, so the previous
    `raise pytest.skip()` was redundant and obscured intent.
    """
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    if not os.path.exists('ml-100k/u.data'):
        pytest.skip('ml-100k data not available')

    ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user', 'item', 'rating', 'timestamp'])

    def eval(train, test):
        """Train a fresh item-item recommender on one fold and recommend."""
        _log.info('running training')
        algo = knn.ItemItem(30)
        algo = Recommender.adapt(algo)
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        recs = batch.recommend(algo, test.user.unique(), 100, n_jobs=ncpus)
        return recs

    test_frames = []
    recs = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        test_frames.append(test)
        recs.append(eval(train, test))

    test = pd.concat(test_frames)
    recs = pd.concat(recs)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    dcg = results.ndcg
    _log.info('nDCG for %d users is %f', len(dcg), dcg.mean())
    assert dcg.mean() > 0.03
示例#20
0
def test_ii_batch_accuracy():
    """Item-item kNN (with bias fallback) prediction accuracy over 5 folds."""
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    ii_algo = knn.ItemItem(30)
    algo = basic.Fallback(ii_algo, basic.Bias())

    def eval(train, test):
        """Fit on one fold and predict its test ratings with 4 workers."""
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test, n_jobs=4)

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    preds = pd.concat([eval(train, test) for (train, test) in folds])

    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.70, abs=0.025)

    per_user = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert per_user.mean() == approx(0.90, abs=0.05)
示例#21
0
def test_partition_users():
    """Partitioning ratings when dataframe has non-unique indices"""
    ratings = lktu.ml_test.ratings.set_index('user')  # force a non-unique index
    with pytest.raises(ValueError):
        # consuming the generator triggers the validation error
        list(xf.partition_users(ratings, 5, xf.SampleN(5)))
示例#22
0
def recommend(algo_wrappers, ratings):
    """Run every algorithm wrapper over 5 user partitions.

    Returns the list of recommendation frames and the list of test folds.
    """
    recs = []
    tests = []
    triples = ratings[['user', 'item', 'rating']]
    for train, test in xf.partition_users(triples, 5, xf.SampleFrac(0.2)):
        tests.append(test)
        recs.extend(do_recommend(wrapper, train, test) for wrapper in algo_wrappers)
    return recs, tests
示例#23
0
class LegMedLensKit():
    """Script-style experiment: the body below runs at class-definition time.

    Loads MovieLens-format ratings, trains item-item kNN and ALS on one
    train/test user split, and compares their nDCG.

    Fix: the original called ``all_recs.head()`` on a plain list *before*
    ``pd.concat``, which raised AttributeError; the preview now happens after
    concatenation.
    """

    def loadData():
        # NOTE(review): sep='::' uses pandas' python engine; the path is machine-specific.
        ratings = pd.read_csv('/Users/josse/Desktop/ratings.dat',
                              sep='::',
                              names=['user', 'item', 'rating', 'timestamp'])
        print(ratings.head())
        return (ratings)

    ratings = loadData()
    # item x user matrix; missing ratings become NaN, then 0
    data_matrix = np.array(
        ratings.pivot(index='item', columns='user', values='rating'))
    print(data_matrix)
    data_matrix_rev = np.nan_to_num(data_matrix)
    print(data_matrix_rev)

    algo_ii = knn.ItemItem(20)
    algo_als = als.BiasedMF(50)

    def eval(aname, algo, train, test):
        """Clone and fit `algo`, recommend 100 items per test user, and tag
        the result frame with the algorithm name."""
        print("test")
        fittable = util.clone(algo)
        fittable = Recommender.adapt(fittable)
        fittable.fit(train)
        users = test.user.unique()
        # now we run the recommender
        recs = batch.recommend(fittable, users, 100)
        # add the algorithm name for analyzability
        recs['Algorithm'] = aname
        print("recs")
        print(recs.head())
        return recs

    all_recs = []
    test_data = []

    for train, test in xf.partition_users(ratings[['user', 'item', 'rating']],
                                          1, xf.SampleFrac(0.2)):
        test_data.append(test)
        all_recs.append(eval('ItemItem', algo_ii, train, test))
        all_recs.append(eval('ALS', algo_als, train, test))

    print("test2")
    # concatenate first — a list has no .head()
    all_recs = pd.concat(all_recs, ignore_index=True)
    print(all_recs.head())
    test_data = pd.concat(test_data, ignore_index=True)

    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(all_recs, test_data)
    results.head()

    results.groupby('Algorithm').ndcg.mean()
    results.groupby('Algorithm').ndcg.mean().plot.bar()
示例#24
0
def test_sweep_combine(tmp_path):
    """Sweep with combine=False: outputs stay per-run until collect_results()."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    sweep.persist_data()

    # persist_data should materialize each of the 5 partitions to parquet
    for i in range(1, 6):
        assert (work / 'ds{}-train.parquet'.format(i)).exists()
        assert (work / 'ds{}-test.parquet'.format(i)).exists()

    # after persisting, each dataset entry is a (train-path, test-path) tuple
    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    # 5 partitions x 3 algorithms
    assert sweep.run_count() == 5 * 3

    try:
        sweep.run()
    finally:
        # show any partial results for debugging, even if run() failed
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    # combine=False: no combined outputs may exist yet
    assert not (work / 'runs.csv').exists()
    assert not (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    # but every individual run wrote its own files
    for i, (ds, a) in enumerate(sweep._flat_runs()):
        run = i + 1
        assert (work / 'run-{}.json'.format(run)).exists()
        if isinstance(a.algorithm, Predictor):
            assert (work / 'predictions-{}.parquet'.format(run)).exists()
        assert (work / 'recommendations-{}.parquet'.format(run)).exists()

    sweep.collect_results()

    # collect_results merges the per-run outputs into the combined files
    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    assert len(runs) == 5 * 3
示例#25
0
def eval_algos(ratings, algorithms):
    """Evaluate every named algorithm over 5 user partitions.

    Returns the concatenated recommendation frame and the combined test data.
    """
    rec_frames = []
    test_frames = []
    triples = ratings[['user', 'item', 'rating']]
    for train, test in partition_users(triples, 5, SampleFrac(0.2)):
        test_frames.append(test)
        rec_frames.extend(eval(name, algo, train, test)
                          for name, algo in algorithms.items())
    all_recs = pd.concat(rec_frames, ignore_index=True)
    print('Algorithms\' results table head:')
    print(all_recs.head())
    test_data = pd.concat(test_frames, ignore_index=True)
    return all_recs, test_data
 def create_train_test_rec_data(self):
     """Split self.data_dense into one train/test pair and save both to CSV.

     Holds out 5 ratings per user via a single-partition user split with a
     fixed rng_spec for reproducibility.  Returns (train, test).
     """
     # For now, no cross-validation, just split the data into 1 train and 1 test set.
     for i, tp in enumerate(
             xf.partition_users(data=self.data_dense,
                                partitions=1,
                                method=xf.SampleN(5),
                                rng_spec=1)):
         train = tp.train
         test = tp.test
         train.to_csv(
             f'{conf.SYN_DATA_DIR}syn_train_{self.current_date}.csv')
         test.to_csv(f'{conf.SYN_DATA_DIR}syn_test_{self.current_date}.csv')
     # NOTE(review): assumes the loop runs exactly once (partitions=1);
     # train/test would be unbound if the split yielded nothing.
     return train, test
示例#27
0
    def run(self, strategy_context: RecommenderAlgorithmStrategyContext) -> np.ndarray:
        """Train a bias recommender on one user split and return recommended items.

        Reads the data set named by the context, makes a single 80/20 user
        split, fits an adapted Bias model, and returns recommended item ids
        reshaped to (n_users, number_of_recommendations).
        """
        data_set_source = strategy_context.data_set_source
        data_frame_reader: DataFrameReaderStrategy = self.data_frame_reader_factory.create(data_set_source)
        data_set: DataFrame = data_frame_reader.parse(DataFrameReaderStrategyContext(data_set_source))

        # partitions=1, so the split generator yields exactly one (train, test) pair
        partition = list(partition_users(data=data_set, partitions=1, method=crossfold.SampleFrac(0.2)))[0]
        test, train = partition.test, partition.train
        number_of_recommendations = strategy_context.number_of_recommendations
        algorithm = Recommender.adapt(Bias())
        trained_algorithm = algorithm.fit(train)
        recommendations = lenskit.batch.recommend(trained_algorithm, test['user'].unique(), number_of_recommendations)
        # NOTE(review): the reshape assumes every user received exactly
        # number_of_recommendations items — confirm upstream guarantees this.
        return recommendations.groupby('user')['item'].apply(lambda x: x).to_numpy().reshape(
            (-1, number_of_recommendations))
示例#28
0
def create_save_train_val_test_rec_data(dense_data, fn):
    """Split dense ratings into train/val/test and save each set to CSV.

    Two successive single-partition user splits (5 ratings held out per user,
    fixed rng_spec): the first carves off the test set, the second carves a
    validation set out of the remaining training data.
    Returns (train, val, test).
    """
    # For now, no cross-validation, just split the data into 1 train and 1 test set.
    for split in xf.partition_users(data=dense_data,
                                    partitions=1,
                                    method=xf.SampleN(5),
                                    rng_spec=1):
        train, test = split.train, split.test
        test.to_csv(f'{conf.SYN_DATA_DIR}syn_test_{fn}.csv')
        print("[INFO] Train/test split created")

    for split in xf.partition_users(data=train,
                                    partitions=1,
                                    method=xf.SampleN(5),
                                    rng_spec=1):
        train, val = split.train, split.test
        train.to_csv(f'{conf.SYN_DATA_DIR}syn_train_{fn}.csv')
        val.to_csv(f'{conf.SYN_DATA_DIR}syn_val_{fn}.csv')
        print("[INFO] Train/val split created")

    return train, val, test
示例#29
0
def test_partition_users():
    """partition_users yields 5 disjoint folds that cover every rating and user."""
    ratings = lktu.ml_pandas.renamed.ratings
    splits = list(xf.partition_users(ratings, 5, xf.SampleN(5)))
    assert len(splits) == 5

    for split in splits:
        # each test user contributes exactly 5 held-out ratings
        per_user = split.test.groupby('user').agg('count')
        assert all(per_user == 5)
        # train + test exactly reconstruct the original frame
        assert all(split.test.index.union(split.train.index) == ratings.index)
        assert len(split.test) + len(split.train) == len(ratings)

    # every user appears in exactly one fold's test set
    seen = set()
    for split in splits:
        seen |= set(split.test.user)
    assert len(seen) == ratings.user.nunique()
    assert seen == set(ratings.user)
示例#30
0
def test_uu_batch_accuracy():
    """User-user kNN (with bias fallback) prediction accuracy over 5 folds."""
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    uu_algo = knn.UserUser(30)
    algo = basic.Fallback(uu_algo, basic.Bias())

    fold_preds = [__batch_eval((algo, train, test))
                  for (train, test) in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))]
    preds = pd.concat(fold_preds)

    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.71, abs=0.028)

    per_user = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert per_user.mean() == approx(0.91, abs=0.055)