Example #1
def test_uu_known_preds():
    from lenskit import batch

    algo = knn.UserUser(30, min_sim=1.0e-6)
    _log.info('training %s on ml data', algo)
    algo.fit(lktu.ml_test.ratings)

    dir = Path(__file__).parent
    pred_file = dir / 'user-user-preds.csv'
    _log.info('reading known predictions from %s', pred_file)
    known_preds = pd.read_csv(str(pred_file))
    pairs = known_preds.loc[:, ['user', 'item']]
    _log.info('generating %d known predictions', len(pairs))

    preds = batch.predict(algo, pairs)
    merged = pd.merge(known_preds.rename(columns={'prediction': 'expected'}), preds)
    assert len(merged) == len(preds)
    merged['error'] = merged.expected - merged.prediction
    try:
        assert not any(merged.prediction.isna() & merged.expected.notna())
    except AssertionError as e:
        bad = merged[merged.prediction.isna() & merged.expected.notna()]
        _log.error('%d missing predictions:\n%s', len(bad), bad)
        raise e

    err = merged.error
    err = err[err.notna()]
    try:
        assert all(err.abs() < 0.01)
    except AssertionError as e:
        bad = merged[merged.error.notna() & (merged.error.abs() >= 0.01)]
        _log.error('%d erroneous predictions:\n%s', len(bad), bad)
        raise e
Example #2
def main(args):
    mod_name = args.get('-m')
    input = args.get('--splits')
    output = args.get('-o')
    n_recs = int(args.get('-n'))
    model = args.get('ALGO')

    _log.info(f'importing from module {mod_name}')
    algorithms = importlib.import_module(mod_name)

    algo = getattr(algorithms, model)
    algo = Recommender.adapt(algo)

    path = Path(input)
    dest = Path(output)
    dest.mkdir(exist_ok=True, parents=True)

    ds_def = getattr(datasets, path.name, None)

    for file in path.glob("test-*"):
        test = pd.read_csv(file, sep=',')
        suffix = file.name[5:]
        train_file = path / f'train-{suffix}'
        timer = util.Stopwatch()

        if 'index' in test.columns:
            _log.info('setting test index')
            test = test.set_index('index')
        else:
            _log.warning('no index column found in %s', file.name)

        if train_file.exists():
            _log.info('[%s] loading training data from %s', timer, train_file)
            train = pd.read_csv(train_file, sep=',')
        elif ds_def is not None:
            _log.info('[%s] extracting training data from data set %s', timer,
                      path.name)
            train = datasets.ds_diff(ds_def.ratings, test)
            train.reset_index(drop=True, inplace=True)
        else:
            _log.error('could not find training data for %s', file.name)
            continue

        _log.info('[%s] Fitting the model', timer)
        # We train isolated to manage resource use
        model = batch.train_isolated(algo, train)
        try:
            _log.info('[%s] generating recommendations for unique users',
                      timer)
            users = test.user.unique()
            recs = batch.recommend(model, users, n_recs)
            _log.info('[%s] writing recommendations to %s', timer, dest)
            recs.to_csv(dest / f'recs-{suffix}', index=False)

            if isinstance(algo, Predictor) and not args['--no-predict']:
                _log.info('[%s] generating predictions for user-item pairs', timer)
                preds = batch.predict(model, test)
                preds.to_csv(dest / f'pred-{suffix}', index=False)
        finally:
            model.close()
Example #3
    def test_predict(self):
        comp1 = self._algo.algorithms[0]
        comp2 = self._algo.algorithms[1]
        hybrid = self._algo

        pred1 = batch.predict(comp1, self._pred_tests)
        pred2 = batch.predict(comp2, self._pred_tests)
        pred_hybrid = batch.predict(hybrid, self._pred_tests)

        pred_lst = 0.5 * pred1['prediction'] + 0.5 * pred2['prediction']
        algo_lst = pred_hybrid['prediction']

        preds = zip(pred_lst, algo_lst)

        for pred, actual in preds:
            self.assertAlmostEqual(pred, actual, 3, 'Prediction does not match components.')
Example #4
def test_ii_known_preds():
    from lenskit import batch

    algo = knn.ItemItem(20, min_sim=1.0e-6)
    _log.info('training %s on ml data', algo)
    algo.fit(lktu.ml_test.ratings)
    assert algo.center
    assert algo.item_means_ is not None
    _log.info('model means: %s', algo.item_means_)

    dir = Path(__file__).parent
    pred_file = dir / 'item-item-preds.csv'
    _log.info('reading known predictions from %s', pred_file)
    known_preds = pd.read_csv(str(pred_file))
    pairs = known_preds.loc[:, ['user', 'item']]

    preds = batch.predict(algo, pairs)
    merged = pd.merge(known_preds.rename(columns={'prediction': 'expected'}), preds)
    assert len(merged) == len(preds)
    merged['error'] = merged.expected - merged.prediction
    try:
        assert not any(merged.prediction.isna() & merged.expected.notna())
    except AssertionError as e:
        bad = merged[merged.prediction.isna() & merged.expected.notna()]
        _log.error('erroneously missing or present predictions:\n%s', bad)
        raise e

    err = merged.error
    err = err[err.notna()]
    try:
        assert all(err.abs() < 0.03)  # FIXME this threshold is too high
    except AssertionError as e:
        bad = merged[merged.error.notna() & (merged.error.abs() >= 0.03)]
        _log.error('erroneous predictions:\n%s', bad)
        raise e
Example #5
def __batch_eval(job):
    from lenskit import batch
    algo, train, test = job
    _log.info('running training')
    algo.fit(train)
    _log.info('testing %d users', test.user.nunique())
    return batch.predict(algo, test)
Example #6
def _build_predict(ratings, fold):
    algo = Fallback(knn.ItemItem(20), Bias(5))
    train = ratings[ratings['partition'] != fold]
    algo.fit(train)

    test = ratings[ratings['partition'] == fold]
    preds = batch.predict(algo, test, n_jobs=1)
    return preds
Example #7
def generate_predictions(model, user_item):
    """Generate the rating predictions for each user->item pair

    :returns: pd.DataFrame. A dataframe with at least the columns 'user', 
        'item', 'prediction' (the predicted scores)
    """

    return batch.predict(model, user_item)
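A minimal, hypothetical usage sketch of generate_predictions() above; the toy ratings frame, the Bias model, and the pair values are illustrative assumptions, not part of the original example.

import pandas as pd
from lenskit.algorithms.bias import Bias

# Fit a simple bias model on a tiny made-up ratings frame (illustration only).
ratings = pd.DataFrame({'user':   [1, 1, 2, 2, 3],
                        'item':   [10, 20, 10, 30, 20],
                        'rating': [4.0, 3.5, 2.0, 5.0, 3.0]})
model = Bias()
model.fit(ratings)

# Score a couple of user->item pairs; the result has 'user', 'item', and 'prediction' columns.
user_item = pd.DataFrame({'user': [1, 3], 'item': [30, 20]})
print(generate_predictions(model, user_item))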
Example #8
def eval(aname, algo, train, test, all_preds):
    fittable = util.clone(algo)
    fittable = Recommender.adapt(fittable)
    fittable.fit(train)
    # predict ratings
    preds = batch.predict(fittable, test)
    preds['Algorithm'] = aname
    all_preds.append(preds)
Example #9
def predict(algo_wrappers, ratings):
    all_preds = []
    for algo_wrapper in algo_wrappers:
        algo_wrapper.algo.fit(ratings)
        preds = batch.predict(algo_wrapper.algo, ratings)
        preds['Algorithm'] = algo_wrapper.name
        all_preds.append(preds)
    return all_preds
Example #10
def test_predict_single(mlb):
    tf = pd.DataFrame({'user': [1], 'item': [31]})
    res = lkb.predict(mlb.algo, tf)

    assert len(res) == 1
    assert all(res.user == 1)
    assert set(res.columns) == set(['user', 'item', 'prediction'])
    assert all(res.item == 31)

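    # expected Bias prediction: global mean + per-item offset + per-user offset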
    expected = mlb.algo.mean_ + mlb.algo.item_offsets_.loc[31] + mlb.algo.user_offsets_.loc[1]
    assert res.prediction.iloc[0] == pytest.approx(expected)
Example #11
def test_batch_predict_preshared():
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf

    algo = basic.Bias()
    splits = xf.sample_users(lktu.ml_test.ratings, 1, 100, xf.SampleN(5))
    train, test = next(splits)

    ares = lkb.train_isolated(algo, train)
    preds = lkb.predict(ares, test)
    assert len(preds) == len(test)
    assert not any(preds['prediction'].isna())
Example #12
def getPredictions(algo, dataPairs, model):
    """
    Generate a recommendation for the user
    :param algo: the given algorithm
    :param model: the given trained model
    :param user: the user
    :return: recommendation
    """
    # Generate $num_recommendations for the givenuser
    predictions = batch.predict(algo, dataPairs, model)
    # https://lkpy.readthedocs.io/en/latest/batch.html?highlight=batch
    # batch.predict returns dataframe [dataPairs['all columns'], 'prediction']
    #np.savetxt("./results/full_prediction.csv", predictions, delimiter=" ")
    return predictions
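One note on the commented-out np.savetxt call above: it would drop the DataFrame's column names. If the predictions need to be written to disk, a minimal sketch using pandas' own CSV writer (reusing the same illustrative path) is:

# Illustrative only: write the prediction frame with its column headers intact.
predictions = getPredictions(algo, dataPairs, model)
predictions.to_csv("./results/full_prediction.csv", index=False)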
Example #13
def test_als_isolate(ml20m, rng):
    users = rng.choice(ml20m['user'].unique(), 5000, replace=False)
    algo = BiasedMF(20, iterations=10)
    algo = Recommender.adapt(algo)
    _log.info('training %s', algo)
    ares = batch.train_isolated(algo, ml20m)
    try:
        _log.info('recommending with %s', algo)
        recs = batch.recommend(ares, users, 10)
        assert recs['user'].nunique() == 5000
        _log.info('predicting with %s', algo)
        pairs = ml20m.sample(1000)
        preds = batch.predict(ares, pairs)
        assert len(preds) == len(pairs)
    finally:
        ares.close()
Example #14
def test_predict_two_users(mlb):
    uids = [5, 10]
    tf = None
    # make sure we get both UIDs
    while tf is None or len(set(tf.user)) < 2:
        tf = mlb.ratings[mlb.ratings.user.isin(uids)].loc[:, ('user', 'item')].sample(10)

    res = lkb.predict(mlb.algo, tf)

    assert len(res) == 10
    assert set(res.user) == set(uids)

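    # rebuild the expected Bias predictions from the model's mean and per-item/per-user offsets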
    preds = res.set_index(['user', 'item'])
    preds['rating'] = mlb.algo.mean_
    preds['rating'] += mlb.algo.item_offsets_
    preds['rating'] += mlb.algo.user_offsets_
    assert preds.prediction.values == pytest.approx(preds.rating.values)
Example #15
def test_global_metric():
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    from lenskit.algorithms.bias import Bias

    train, test = next(
        xf.sample_users(lktu.ml_test.ratings, 1, 200, xf.SampleFrac(0.5)))
    algo = Bias()
    algo.fit(train)

    preds = batch.predict(algo, test)

    rmse = pm.global_metric(preds)
    assert rmse == pm.rmse(preds.prediction, preds.rating)

    mae = pm.global_metric(preds, metric=pm.mae)
    assert mae == pm.mae(preds.prediction, preds.rating)
Example #16
def test_user_metric():
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    from lenskit.algorithms.bias import Bias

    train, test = next(
        xf.sample_users(lktu.ml_test.ratings, 1, 200, xf.SampleFrac(0.5)))
    algo = Bias()
    algo.fit(train)

    preds = batch.predict(algo, test)

    rmse = pm.user_metric(preds)
    u_rmse = preds.groupby('user').apply(
        lambda df: pm.rmse(df.prediction, df.rating))
    assert rmse == approx(u_rmse.mean())

    mae = pm.user_metric(preds, metric=pm.mae)
    u_mae = preds.groupby('user').apply(
        lambda df: pm.mae(df.prediction, df.rating))
    assert mae == approx(u_mae.mean())
Example #17
def test_predict_user(mlb):
    uid = 5
    urates = mlb.ratings[mlb.ratings.user == uid]

    test_rated = urates.item.sample(5)
    unrated = np.setdiff1d(mlb.ratings.item.unique(), urates.item.values)
    test_unrated = np.random.choice(unrated, 10, replace=False)
    test_items = pd.concat([test_rated, pd.Series(test_unrated)])

    tf = pd.DataFrame({'user': uid, 'item': test_items})
    res = lkb.predict(mlb.algo, tf)

    assert len(res) == 15
    assert set(res.columns) == set(['user', 'item', 'prediction'])
    assert all(res.user == uid)
    assert set(res.item) == set(test_items)

    # did we get the right predictions?
    preds = res.set_index(['user', 'item'])
    preds['rating'] = mlb.algo.mean_
    preds['rating'] += mlb.algo.item_offsets_
    preds['rating'] += mlb.algo.user_offsets_.loc[uid]
    assert preds.prediction.values == pytest.approx(preds.rating.values)
Example #18
def eval(train, test):
    _log.info('running training')
    algo.fit(train)
    _log.info('testing %d users', test.user.nunique())
    return batch.predict(algo, test)
Example #19

import pandas as pd
from lenskit.algorithms.als import BiasedMF
from lenskit.batch import predict
from lenskit.metrics.predict import rmse

test_bool = True

train = pd.read_pickle("../data/ml-1m-split/train.pkl")
val = pd.read_pickle("../data/ml-1m-split/val.pkl")
test = pd.read_pickle("../data/ml-1m-split/test.pkl")

num_factors = 30
num_iters = 100

model = BiasedMF(num_factors, iterations=num_iters)
print("Fitting model...")
model.fit(train)
print("Making validation predictions...")
val_preds = predict(model, val)
val_result = rmse(val_preds["prediction"], val_preds["rating"])

if test_bool:
    print("Making test predictions...")
    test_preds = predict(model, test)
    test_result = rmse(test_preds["prediction"], test_preds["rating"])
else:
    test_result = 0

print("============= RESULTS =============\nFactors: {}\nIterations: {}\nValidation RMSE: {}\nTest RMSE: {}" \
    .format(num_factors, num_iters, val_result, test_result))
Example #20
dest.mkdir(exist_ok=True, parents=True)

for file in path.glob("test-*"):
    test = pd.read_csv(file, sep=',')
    suffix = file.name[5:]

    try:
        train = pd.read_csv(path / f'train-{suffix}', sep=',')
    except FileNotFoundError:
        _log.error(f'train-{suffix} does not exist')
        continue

    _log.info('Fitting the model')

    users = test.user.unique()

    fittable = util.clone(algo)
    fittable = Recommender.adapt(fittable)
    fittable.fit(train)

    _log.info('generating recommendations for unique users')
    recs = batch.recommend(fittable, users, n_recs)
    _log.info(f'writing recommendations to {dest}')
    suffix = model + suffix
    recs.to_csv(dest / f'recs-{suffix}', index=False)

    if isinstance(fittable, Predictor):
        _log.info('generating predictions for user-item pairs')
        preds = batch.predict(fittable, test)
        preds.to_csv(dest / f'pred-{suffix}', index=False)
Example #21
def eval(train, test):
    algo.fit(train)
    preds = batch.predict(algo, test)
    return preds.set_index(['user', 'item'])