def test_uu_known_preds():
    from lenskit import batch

    algo = knn.UserUser(30, min_sim=1.0e-6)
    _log.info('training %s on ml data', algo)
    algo.fit(lktu.ml_test.ratings)
    dir = Path(__file__).parent
    pred_file = dir / 'user-user-preds.csv'
    _log.info('reading known predictions from %s', pred_file)
    known_preds = pd.read_csv(str(pred_file))
    pairs = known_preds.loc[:, ['user', 'item']]
    _log.info('generating %d known predictions', len(pairs))

    preds = batch.predict(algo, pairs)
    merged = pd.merge(known_preds.rename(columns={'prediction': 'expected'}), preds)
    assert len(merged) == len(preds)
    merged['error'] = merged.expected - merged.prediction
    try:
        assert not any(merged.prediction.isna() & merged.expected.notna())
    except AssertionError as e:
        bad = merged[merged.prediction.isna() & merged.expected.notna()]
        _log.error('%d missing predictions:\n%s', len(bad), bad)
        raise e

    err = merged.error
    err = err[err.notna()]
    try:
        assert all(err.abs() < 0.01)
    except AssertionError as e:
        bad = merged[merged.error.notna() & (merged.error.abs() >= 0.01)]
        _log.error('%d erroneous predictions:\n%s', len(bad), bad)
        raise e
def main(args):
    mod_name = args.get('-m')
    input = args.get('--splits')
    output = args.get('-o')
    n_recs = int(args.get('-n'))
    algo_name = args.get('ALGO')

    _log.info(f'importing from module {mod_name}')
    algorithms = importlib.import_module(mod_name)
    algo = getattr(algorithms, algo_name)
    algo = Recommender.adapt(algo)

    path = Path(input)
    dest = Path(output)
    dest.mkdir(exist_ok=True, parents=True)

    ds_def = getattr(datasets, path.name, None)

    for file in path.glob("test-*"):
        test = pd.read_csv(file, sep=',')
        suffix = file.name[5:]
        train_file = path / f'train-{suffix}'
        timer = util.Stopwatch()

        if 'index' in test.columns:
            _log.info('setting test index')
            test = test.set_index('index')
        else:
            _log.warning('no index column found in %s', file.name)

        if train_file.exists():
            _log.info('[%s] loading training data from %s', timer, train_file)
            train = pd.read_csv(train_file, sep=',')
        elif ds_def is not None:
            _log.info('[%s] extracting training data from data set %s', timer, path.name)
            train = datasets.ds_diff(ds_def.ratings, test)
            train.reset_index(drop=True, inplace=True)
        else:
            _log.error('could not find training data for %s', file.name)
            continue

        _log.info('[%s] Fitting the model', timer)
        # We train isolated to manage resource use
        model = batch.train_isolated(algo, train)
        try:
            _log.info('[%s] generating recommendations for unique users', timer)
            users = test.user.unique()
            recs = batch.recommend(model, users, n_recs)
            _log.info('[%s] writing recommendations to %s', timer, dest)
            recs.to_csv(dest / f'recs-{suffix}', index=False)

            if isinstance(algo, Predictor) and not args['--no-predict']:
                _log.info('[%s] generating predictions for user-item', timer)
                preds = batch.predict(model, test)
                preds.to_csv(dest / f'pred-{suffix}', index=False)
        finally:
            model.close()
def test_predict(self):
    comp1 = self._algo.algorithms[0]
    comp2 = self._algo.algorithms[1]
    hybrid = self._algo

    pred1 = batch.predict(comp1, self._pred_tests)
    pred2 = batch.predict(comp2, self._pred_tests)
    hybrid_preds = batch.predict(hybrid, self._pred_tests)

    # the hybrid's predictions should be the equal-weight blend of its components
    expected = 0.5 * pred1['prediction'] + 0.5 * pred2['prediction']
    actual = hybrid_preds['prediction']
    for exp, act in zip(expected, actual):
        self.assertAlmostEqual(exp, act, 3, 'Prediction does not match components.')
def test_ii_known_preds():
    from lenskit import batch

    algo = knn.ItemItem(20, min_sim=1.0e-6)
    _log.info('training %s on ml data', algo)
    algo.fit(lktu.ml_test.ratings)
    assert algo.center
    assert algo.item_means_ is not None
    _log.info('model means: %s', algo.item_means_)

    dir = Path(__file__).parent
    pred_file = dir / 'item-item-preds.csv'
    _log.info('reading known predictions from %s', pred_file)
    known_preds = pd.read_csv(str(pred_file))
    pairs = known_preds.loc[:, ['user', 'item']]

    preds = batch.predict(algo, pairs)
    merged = pd.merge(known_preds.rename(columns={'prediction': 'expected'}), preds)
    assert len(merged) == len(preds)
    merged['error'] = merged.expected - merged.prediction
    try:
        assert not any(merged.prediction.isna() & merged.expected.notna())
    except AssertionError as e:
        bad = merged[merged.prediction.isna() & merged.expected.notna()]
        _log.error('erroneously missing or present predictions:\n%s', bad)
        raise e

    err = merged.error
    err = err[err.notna()]
    try:
        assert all(err.abs() < 0.03)  # FIXME this threshold is too high
    except AssertionError as e:
        # report only the predictions that actually exceed the asserted threshold
        bad = merged[merged.error.notna() & (merged.error.abs() >= 0.03)]
        _log.error('erroneous predictions:\n%s', bad)
        raise e
def __batch_eval(job):
    "Run a single evaluation job: fit on the training data, then predict for the test pairs."
    from lenskit import batch

    algo, train, test = job
    _log.info('running training')
    algo.fit(train)
    _log.info('testing %d users', test.user.nunique())
    return batch.predict(algo, test)
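# A sketch of how jobs for __batch_eval might be packed, e.g. to hand off to
# a worker pool; `algo` and `splits` (an iterable of (train, test) frames)
# are assumed names, not part of the snippet above.
jobs = [(util.clone(algo), train, test) for (train, test) in splits]
results = [__batch_eval(job) for job in jobs]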
def _build_predict(ratings, fold):
    algo = Fallback(knn.ItemItem(20), Bias(5))
    train = ratings[ratings['partition'] != fold]
    algo.fit(train)

    test = ratings[ratings['partition'] == fold]
    preds = batch.predict(algo, test, n_jobs=1)
    return preds
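# A minimal driver for _build_predict (a sketch only): assuming `ratings`
# carries a 'partition' column from a crossfold assignment, evaluate every
# fold and stack the per-fold predictions.
all_preds = pd.concat(_build_predict(ratings, fold)
                      for fold in ratings['partition'].unique())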
def generate_predictions(model, user_item):
    """Generate the rating predictions for each user->item pair.

    :returns: pd.DataFrame. A data frame with at least the columns
        'user', 'item', and 'prediction' (the predicted scores).
    """
    return batch.predict(model, user_item)
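# Hypothetical usage of generate_predictions, assuming `model` is an
# already-fitted predictor; the pairs frame needs only 'user' and 'item'.
pairs = pd.DataFrame({'user': [1, 1, 2], 'item': [31, 64, 31]})
preds = generate_predictions(model, pairs)
assert {'user', 'item', 'prediction'} <= set(preds.columns)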
def eval(aname, algo, train, test, all_preds):
    fittable = util.clone(algo)
    fittable = Recommender.adapt(fittable)
    fittable.fit(train)
    # predict ratings
    preds = batch.predict(fittable, test)
    preds['Algorithm'] = aname
    all_preds.append(preds)
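# One plausible way to drive eval() over a user-based crossfold split — a
# sketch, assuming `ratings` is a ratings frame and Bias is imported from
# lenskit.algorithms.bias.
import lenskit.crossfold as xf

all_preds = []
for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
    eval('Bias', Bias(), train, test, all_preds)
preds = pd.concat(all_preds, ignore_index=True)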
def predict(algo_wrappers, ratings):
    all_preds = []
    for algo_wrapper in algo_wrappers:
        algo_wrapper.algo.fit(ratings)
        preds = batch.predict(algo_wrapper.algo, ratings)
        preds['Algorithm'] = algo_wrapper.name
        all_preds.append(preds)
    return all_preds
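# The wrappers above are assumed to expose `.algo` and `.name`; a minimal
# hypothetical stand-in for illustration only. Note that predict() fits and
# scores on the same ratings frame, so it measures fit, not generalization.
from collections import namedtuple

AlgoWrapper = namedtuple('AlgoWrapper', ['name', 'algo'])
wrappers = [AlgoWrapper('Bias', Bias()), AlgoWrapper('ItemItem', knn.ItemItem(20))]
all_preds = predict(wrappers, ratings)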
def test_predict_single(mlb):
    tf = pd.DataFrame({'user': [1], 'item': [31]})
    res = lkb.predict(mlb.algo, tf)

    assert len(res) == 1
    assert all(res.user == 1)
    assert set(res.columns) == set(['user', 'item', 'prediction'])
    assert all(res.item == 31)

    expected = mlb.algo.mean_ + mlb.algo.item_offsets_.loc[31] + mlb.algo.user_offsets_.loc[1]
    assert res.prediction.iloc[0] == pytest.approx(expected)
def test_batch_predict_preshared():
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf

    algo = basic.Bias()
    splits = xf.sample_users(lktu.ml_test.ratings, 1, 100, xf.SampleN(5))
    train, test = next(splits)

    ares = lkb.train_isolated(algo, train)
    preds = lkb.predict(ares, test)
    assert len(preds) == len(test)
    assert not any(preds['prediction'].isna())
def getPredictions(algo, dataPairs, model):
    """
    Generate rating predictions for the given user-item pairs.

    :param algo: the given algorithm
    :param dataPairs: the user-item pairs to score
    :param model: the given trained model
    :return: a dataframe of predictions
    """
    # https://lkpy.readthedocs.io/en/latest/batch.html?highlight=batch
    # batch.predict returns a dataframe [dataPairs['all columns'], 'prediction']
    predictions = batch.predict(algo, dataPairs, model)
    # np.savetxt("./results/full_prediction.csv", predictions, delimiter=" ")
    return predictions
def test_als_isolate(ml20m, rng):
    users = rng.choice(ml20m['user'].unique(), 5000, replace=False)
    algo = BiasedMF(20, iterations=10)
    algo = Recommender.adapt(algo)
    _log.info('training %s', algo)
    ares = batch.train_isolated(algo, ml20m)
    try:
        _log.info('recommending with %s', algo)
        recs = batch.recommend(ares, users, 10)
        assert recs['user'].nunique() == 5000
        _log.info('predicting with %s', algo)
        pairs = ml20m.sample(1000)
        preds = batch.predict(ares, pairs)
        assert len(preds) == len(pairs)
    finally:
        ares.close()
def test_predict_two_users(mlb):
    uids = [5, 10]
    tf = None
    # make sure we get both UIDs
    while tf is None or len(set(tf.user)) < 2:
        tf = mlb.ratings[mlb.ratings.user.isin(uids)].loc[:, ('user', 'item')].sample(10)

    res = lkb.predict(mlb.algo, tf)

    assert len(res) == 10
    assert set(res.user) == set(uids)

    preds = res.set_index(['user', 'item'])
    preds['rating'] = mlb.algo.mean_
    preds['rating'] += mlb.algo.item_offsets_
    preds['rating'] += mlb.algo.user_offsets_
    assert preds.prediction.values == pytest.approx(preds.rating.values)
def test_global_metric():
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    from lenskit.algorithms.bias import Bias

    train, test = next(xf.sample_users(lktu.ml_test.ratings, 1, 200, xf.SampleFrac(0.5)))
    algo = Bias()
    algo.fit(train)

    preds = batch.predict(algo, test)

    rmse = pm.global_metric(preds)
    assert rmse == pm.rmse(preds.prediction, preds.rating)

    mae = pm.global_metric(preds, metric=pm.mae)
    assert mae == pm.mae(preds.prediction, preds.rating)
def test_user_metric():
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    from lenskit.algorithms.bias import Bias

    train, test = next(xf.sample_users(lktu.ml_test.ratings, 1, 200, xf.SampleFrac(0.5)))
    algo = Bias()
    algo.fit(train)

    preds = batch.predict(algo, test)

    rmse = pm.user_metric(preds)
    u_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert rmse == approx(u_rmse.mean())

    mae = pm.user_metric(preds, metric=pm.mae)
    u_mae = preds.groupby('user').apply(lambda df: pm.mae(df.prediction, df.rating))
    assert mae == approx(u_mae.mean())
def test_predict_user(mlb):
    uid = 5
    urates = mlb.ratings[mlb.ratings.user == uid]

    test_rated = urates.item.sample(5)
    unrated = np.setdiff1d(mlb.ratings.item.unique(), urates.item.values)
    test_unrated = np.random.choice(unrated, 10, replace=False)
    test_items = pd.concat([test_rated, pd.Series(test_unrated)])

    tf = pd.DataFrame({'user': uid, 'item': test_items})
    res = lkb.predict(mlb.algo, tf)

    assert len(res) == 15
    assert set(res.columns) == set(['user', 'item', 'prediction'])
    assert all(res.user == uid)
    assert set(res.item) == set(test_items)

    # did we get the right predictions?
    preds = res.set_index(['user', 'item'])
    preds['rating'] = mlb.algo.mean_
    preds['rating'] += mlb.algo.item_offsets_
    preds['rating'] += mlb.algo.user_offsets_.loc[uid]
    assert preds.prediction.values == pytest.approx(preds.rating.values)
def eval(train, test):
    # `algo` is a free variable, expected to be defined in the enclosing scope
    _log.info('running training')
    algo.fit(train)
    _log.info('testing %d users', test.user.nunique())
    return batch.predict(algo, test)
# assumed imports for this script (not present in the original snippet)
import pandas as pd
from lenskit.algorithms.als import BiasedMF
from lenskit.batch import predict
from lenskit.metrics.predict import rmse

test_bool = True

train = pd.read_pickle("../data/ml-1m-split/train.pkl")
val = pd.read_pickle("../data/ml-1m-split/val.pkl")
test = pd.read_pickle("../data/ml-1m-split/test.pkl")

num_factors = 30
num_iters = 100

model = BiasedMF(num_factors, iterations=num_iters)
print("Fitting model...")
model.fit(train)

print("Making validation predictions...")
val_preds = predict(model, val)
val_result = rmse(val_preds["prediction"], val_preds["rating"])

if test_bool:
    print("Making test predictions...")
    test_preds = predict(model, test)
    test_result = rmse(test_preds["prediction"], test_preds["rating"])
else:
    test_result = 0

print("============= RESULTS =============\n"
      "Factors: {}\nIterations: {}\nValidation RMSE: {}\nTest RMSE: {}"
      .format(num_factors, num_iters, val_result, test_result))
dest.mkdir(exist_ok=True, parents=True)

for file in path.glob("test-*"):
    test = pd.read_csv(file, sep=',')
    suffix = file.name[5:]
    try:
        train = pd.read_csv(path / f'train-{suffix}', sep=',')
    except FileNotFoundError:
        _log.error('train-%s does not exist', suffix)
        continue

    _log.info('Fitting the model')
    users = test.user.unique()
    fittable = util.clone(algo)
    fittable = Recommender.adapt(fittable)
    fittable.fit(train)

    _log.info('generating recommendations for unique users')
    recs = batch.recommend(fittable, users, n_recs)
    _log.info('writing recommendations to %s', dest)
    suffix = model + suffix
    recs.to_csv(dest / f'recs-{suffix}', index=False)

    if isinstance(fittable, Predictor):
        _log.info('generating predictions for user-item pairs')
        preds = batch.predict(fittable, test)
        preds.to_csv(dest / f'pred-{suffix}', index=False)
def eval(train, test):
    algo.fit(train)
    preds = batch.predict(algo, test)
    return preds.set_index(['user', 'item'])