def test_adv_fill_users():
    """Check ``include_missing=True`` over several partitions and two algorithms.

    Recommendations are produced for only a random sample of each
    partition's test users, so the filled-in score table must contain
    rows with missing recall, while the default computation must return
    fewer rows whose recall values agree with the non-missing ones.
    """
    analysis = topn.RecListAnalysis()
    for metric in (topn.precision, topn.recall):
        analysis.add_metric(metric)

    user_knn = Recommender.adapt(UserUser(30, min_nbrs=10))
    item_knn = Recommender.adapt(ItemItem(20, min_nbrs=4))
    # Fit/recommend order matters for the random sampling sequence: UU first.
    labelled = (('UU', user_knn), ('II', item_knn))

    partitions = xf.sample_users(ml_test.ratings, 2, 50, xf.SampleN(5))
    recs_by_key = {}
    test_by_part = {}
    for part, (train, test) in enumerate(partitions, start=1):
        for label, algo in labelled:
            algo.fit(train)
            chosen = test['user'].sample(50).unique()
            recs_by_key[(part, label)] = batch.recommend(algo, chosen, 25)
        test_by_part[part] = test

    # Flatten the concat keys into ordinary columns with a fresh row index.
    recs = pd.concat(recs_by_key, names=['part', 'algo'])
    recs = recs.reset_index(['part', 'algo']).reset_index(drop=True)
    test = pd.concat(test_by_part, names=['part'])
    test = test.reset_index(['part']).reset_index(drop=True)

    scores = analysis.compute(recs, test, include_missing=True)
    index_names = scores.index.names
    scores = scores.sort_index()
    assert len(scores) == 50 * 4
    assert (scores['ntruth'] == 5).all()
    assert scores['recall'].isna().sum() > 0
    _log.info('scores:\n%s', scores)

    per_algo = scores.reset_index().groupby('algo')['user']
    ucounts = per_algo.agg(['count', 'nunique'])
    assert (ucounts['count'] == 100).all()
    assert (ucounts['nunique'] == 100).all()

    mscores = analysis.compute(recs, test)
    mscores = mscores.reset_index().set_index(index_names).sort_index()
    assert len(mscores) < len(scores)
    _log.info('mscores:\n%s', mscores)

    has_recall = scores['recall'].notna()
    filled_recall = scores.loc[has_recall, 'recall'].copy()
    filled_recall, plain_recall = filled_recall.align(mscores['recall'])
    assert (filled_recall == plain_recall).all()
def test_fill_users():
    """Check that ``include_missing=True`` yields one score row per test user.

    Only a random sample of test users receives recommendations; the
    filled-in result must cover every test user, with non-missing recall
    exactly for the users that were recommended for.
    """
    analysis = topn.RecListAnalysis()
    for metric in (topn.precision, topn.recall):
        analysis.add_metric(metric)

    recommender = Recommender.adapt(UserUser(20, min_nbrs=10))

    partitions = xf.sample_users(ml_test.ratings, 1, 50, xf.SampleN(5))
    train, test = next(partitions)
    recommender.fit(train)

    chosen = test['user'].sample(50).unique()
    recs = batch.recommend(recommender, chosen, 25)

    scores = analysis.compute(recs, test, include_missing=True)
    assert len(scores) == test['user'].nunique()
    assert scores['recall'].notna().sum() == len(chosen)
    assert (scores['ntruth'] == 5).all()

    mscores = analysis.compute(recs, test)
    assert len(mscores) < len(scores)

    has_recall = scores['recall'].notna()
    filled_recall = scores.loc[has_recall, 'recall'].copy()
    filled_recall, plain_recall = filled_recall.align(mscores['recall'])
    assert (filled_recall == plain_recall).all()
def _run_eval(self, params):
    """Train and evaluate one parameter setting.

    Instantiates an algorithm for ``params`` (through the retrainer when
    one is configured, otherwise through ``self.module``), fits it on
    ``self.train``, recommends for every user in ``self.test`` and
    computes reciprocal rank and recall.

    Returns the negated mean reciprocal rank, or ``0`` when the algorithm
    produced no recommendations at all.
    """
    clock = Stopwatch()
    _log.info('evaluating at %s', params)

    if not self.retrainer:
        algo = self.module.instantiate(params, not self.explicit)
    else:
        # The retrainer needs a one-time initial fit before it can be reused.
        if not self.retrainer.initialized:
            self.retrainer.fit_initial(self.train)
        algo = self.retrainer.instantiate(params)
    algo = Recommender.adapt(algo)

    _log.info('[%s] train %s', clock, algo)
    algo.fit(self.train)

    _log.info('[%s] recommend %s', clock, algo)
    test_users = self.test['user'].unique()
    recs = batch.recommend(algo, test_users, self.n_recs, n_jobs=self.n_jobs)
    if len(recs) == 0:
        _log.info('[%s] %s produced no recommendations', clock, algo)
        return 0

    _log.info('[%s] evaluate %s', clock, algo)
    analysis = topn.RecListAnalysis()
    for metric in (topn.recip_rank, topn.recall):
        analysis.add_metric(metric)
    scores = analysis.compute(recs, self.test, include_missing=True)
    assert len(scores) == len(self.test)

    # Users without recommendations count as zero for both metrics.
    mrr = scores['recip_rank'].fillna(0).mean()
    hit_rate = scores['recall'].fillna(0).mean()
    _log.info('%s had MRR of %.3f', algo, mrr)
    _log.info('%s had hit rate of %.3f', algo, hit_rate)
    return -mrr
def main(args):
    """Train the named algorithm on the named data set and pickle it.

    ``args`` is a mapping of command-line options: ``-m`` (module holding
    the algorithms, default ``lkdemo.algorithms``), ``ALGO``, ``DATASET``
    and the optional output ``FILE``. When no file is given the model is
    written to ``models/<dataset>-<algo>.pkl.gz``.
    """
    module_name = args.get('-m', 'lkdemo.algorithms')
    out = args.get('FILE', None)
    model = args.get('ALGO')
    dsname = args.get('DATASET')

    _log.info('importing from module %s', module_name)
    algo_module = importlib.import_module(module_name)

    _log.info('locating model %s', model)
    algo = getattr(algo_module, model)

    _log.info('locating data set %s', dsname)
    dataset = getattr(datasets, dsname)

    _log.info('loading ratings')
    ratings = dataset.ratings

    _log.info('training model')
    algo = Recommender.adapt(algo)
    clock = Stopwatch()
    algo.fit(ratings)
    clock.stop()
    _log.info('trained model in %s', clock)

    # `resource` is falsy when the module is unavailable; skip usage stats then.
    if resource:
        usage = resource.getrusage(resource.RUSAGE_SELF)
        _log.info('%.2fs user, %.2fs system, %.1fMB max RSS',
                  usage.ru_utime, usage.ru_stime, usage.ru_maxrss / 1024)

    if out is None:
        out = f'models/{dsname}-{model}.pkl.gz'
    _log.info('writing to %s', out)

    out_dir = pathlib.Path(out).parent
    out_dir.mkdir(parents=True, exist_ok=True)
    with gzip.open(out, 'wb') as stream:
        pickle.dump(algo, stream, 4)
def train_model(train, n_factors=30, n_iterations=20, regularization=.1,
                save_training_loss=False, confidence_factor=40):
    """Train (and evaluate iterations if requested) model.

    Args:
        train: ratings frame with a ``rating`` column. It is NOT modified;
            the confidence transform is applied to a copy.
        n_factors: number of latent factors passed to ``als.ImplicitMF``.
        n_iterations: number of training iterations.
        regularization: accepted for interface compatibility.
            NOTE(review): this value is not forwarded to ``ImplicitMF``
            anywhere in this function - confirm whether that is intended.
        save_training_loss: when true, train via ``fit_iters`` and record a
            loss value after every iteration.
        confidence_factor: multiplier of the confidence transform; also
            passed to ``ImplicitMF`` as ``weight``.

    Returns:
        ``(model, loss)`` where ``loss`` is an array of length
        ``n_iterations`` when ``save_training_loss`` is set, else ``None``.
    """
    # Encapsulate the model into a TopN recommender
    model = Recommender.adapt(
        als.ImplicitMF(n_factors, iterations=n_iterations,
                       weight=confidence_factor, progress=tqdm, method='cg'))

    # Compute the confidence values for user-item pairs. Use assign() so the
    # caller's frame is left untouched; writing into `train` directly made
    # repeated calls compound the transform on the same data.
    train = train.assign(rating=1 + confidence_factor * train['rating'])

    if save_training_loss:
        loss = np.zeros(n_iterations)
        for i, intermediate_model in enumerate(model.fit_iters(train)):
            predictions = generate_predictions(intermediate_model, train)
            loss[i] = evaluate_model_loss(intermediate_model, predictions)
    else:
        model.fit(train)
        loss = None

    return model, loss
def main(args):
    """Generate recommendations (and predictions) for every test split.

    ``args`` is a mapping of command-line options: ``-m`` (algorithm
    module), ``--splits`` (directory holding ``test-*`` files), ``-o``
    (output directory), ``-n`` (recommendation list length), ``ALGO`` and
    ``--no-predict``.

    For each ``test-<suffix>`` file the training data is read from the
    matching ``train-<suffix>`` file or, when that is absent, derived from
    the data set named after the splits directory. Splits without any
    training data are logged and skipped.
    """
    mod_name = args.get('-m')
    splits_dir = args.get('--splits')  # was named `input`, shadowing the builtin
    output = args.get('-o')
    n_recs = int(args.get('-n'))
    algo_name = args.get('ALGO')

    _log.info(f'importing from module {mod_name}')
    algorithms = importlib.import_module(mod_name)
    algo = getattr(algorithms, algo_name)
    algo = Recommender.adapt(algo)

    path = Path(splits_dir)
    dest = Path(output)
    dest.mkdir(exist_ok=True, parents=True)

    # Fallback source of training data when no train-* file exists.
    ds_def = getattr(datasets, path.name, None)

    for file in path.glob("test-*"):
        test = pd.read_csv(file, sep=',')
        suffix = file.name[5:]  # strip the leading "test-"
        train_file = path / f'train-{suffix}'
        timer = util.Stopwatch()

        if 'index' in test.columns:
            _log.info('setting test index')
            test = test.set_index('index')
        else:
            # Logger.warn is deprecated; warning() is the supported spelling.
            _log.warning('no index column found in %s', file.name)

        if train_file.exists():
            _log.info('[%s] loading training data from %s', timer, train_file)
            train = pd.read_csv(train_file, sep=',')
        elif ds_def is not None:
            _log.info('[%s] extracting training data from data set %s',
                      timer, path.name)
            train = datasets.ds_diff(ds_def.ratings, test)
            train.reset_index(drop=True, inplace=True)
        else:
            _log.error('could not find training data for %s', file.name)
            continue

        _log.info('[%s] Fitting the model', timer)
        # We train isolated to manage resource use
        trained = batch.train_isolated(algo, train)
        try:
            _log.info('[%s] generating recommendations for unique users', timer)
            users = test.user.unique()
            recs = batch.recommend(trained, users, n_recs)
            _log.info('[%s] writing recommendations to %s', timer, dest)
            recs.to_csv(dest / f'recs-{suffix}', index=False)

            if isinstance(algo, Predictor) and not args['--no-predict']:
                _log.info('[%s] generating predictions for user-item', timer)
                preds = batch.predict(trained, test)
                preds.to_csv(dest / f'pred-{suffix}', index=False)
        finally:
            # Always release the isolated model, even if scoring failed.
            trained.close()
def test_uu_implicit_batch_accuracy():
    """Five-fold accuracy check for user-user k-NN trained without ratings.

    Each fold is fit on the ``user``/``item`` columns only, and the mean
    nDCG over all test users must be at least 0.03.
    """
    from lenskit import batch, topn
    import lenskit.crossfold as xf

    base_algo = knn.UserUser(30, center=False, aggregate='sum')
    ratings = lktu.ml100k.ratings

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    all_test = pd.concat([fold.test for fold in folds])

    fold_recs = []
    for train, test in folds:
        _log.info('running training')
        recommender = Recommender.adapt(base_algo)
        recommender.fit(train[['user', 'item']])

        _log.info('testing %d users', test['user'].nunique())
        fold_recs.append(
            batch.recommend(recommender, test['user'].unique(), 100, n_jobs=2))

    analysis = topn.RecListAnalysis()
    analysis.add_metric(topn.ndcg)
    results = analysis.compute(pd.concat(fold_recs), all_test)

    mean_ndcg = results['ndcg'].mean()
    assert mean_ndcg >= 0.03
def do_prepare(opts):
    """Write test data and Popular/ALS recommendations for a MovieLens set.

    ``opts['-d']`` names the data set under ``data/``. The sampled test
    ratings go to ``data/<name>-test.parquet`` and the combined
    recommendations, labelled by an ``Algorithm`` column, go to
    ``data/<name>-recs.parquet``.
    """
    name = opts['-d']
    movielens = MovieLens(f'data/{name}')

    partitions = sample_users(movielens.ratings, 1, 10000, SampleN(5))
    train, test = next(partitions)
    test.to_parquet(f'data/{name}-test.parquet', index=False)

    _log.info('getting popular recs')
    popular = Popular()
    popular.fit(train)
    pop_recs = recommend(popular, test['user'].unique(), 100)

    _log.info('getting ALS recs')
    implicit_mf = Recommender.adapt(ImplicitMF(20, iterations=10))
    # The ALS model is fit without the rating column.
    implicit_mf.fit(train.drop(columns=['rating']))
    als_recs = recommend(implicit_mf, test['user'].unique(), 100)

    _log.info('merging recs')
    by_algorithm = {'Popular': pop_recs, 'ALS': als_recs}
    recs = pd.concat(by_algorithm, names=['Algorithm'])
    recs = recs.reset_index('Algorithm')
    recs.to_parquet(f'data/{name}-recs.parquet', index=False)
def test_tf_bpr_batch_accuracy(tf_session):
    """Five-fold accuracy check for the TensorFlow BPR recommender.

    Users without recommendations are scored as zero, and the mean nDCG
    over all folds must exceed 0.1.
    """
    # `basic` is imported by the original test but not referenced below.
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    ratings = lktu.ml100k.ratings
    recommender = Recommender.adapt(
        lktf.BPR(20, batch_size=1024, epochs=20, rng_spec=42))

    fold_recs = []
    fold_tests = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        _log.info('running training')
        recommender.fit(train)

        _log.info('testing %d users', test['user'].nunique())
        fold_recs.append(batch.recommend(recommender, np.unique(test['user']), 50))
        fold_tests.append(test)

    _log.info('analyzing results')
    analysis = topn.RecListAnalysis()
    for metric in (topn.ndcg, topn.recip_rank):
        analysis.add_metric(metric)

    combined_recs = pd.concat(fold_recs, ignore_index=True)
    combined_test = pd.concat(fold_tests, ignore_index=True)
    scores = analysis.compute(combined_recs, combined_test, include_missing=True)
    scores.fillna(0, inplace=True)

    mean_ndcg = scores['ndcg'].mean()
    _log.info('MRR: %f', scores['recip_rank'].mean())
    _log.info('nDCG: %f', mean_ndcg)
    assert mean_ndcg > 0.1
def eval(aname, algo, train, test, all_preds):
    """Fit a clone of ``algo`` on ``train`` and collect its predictions.

    The predictions for ``test`` are tagged with ``aname`` in an
    ``Algorithm`` column and appended to the ``all_preds`` list; nothing
    is returned.
    """
    model = Recommender.adapt(util.clone(algo))
    model.fit(train)

    # predict ratings
    predictions = batch.predict(model, test)
    predictions['Algorithm'] = aname
    all_preds.append(predictions)
def _train_algo(data, algo, ratings):
    """Adapt ``algo`` into a recommender, fit it on ``ratings`` and return it.

    ``data`` is accepted but not used by this function. Training time is
    logged.
    """
    recommender = Recommender.adapt(algo)
    _log.info('training algorithm %s', recommender)

    clock = Stopwatch()
    recommender.fit(ratings)
    clock.stop()

    _log.info('trained %s in %s', recommender, clock)
    return recommender
def eval(train, test):
    """Fit item-item k-NN on ``train`` and recommend 100 items per test user.

    Recommendation runs with ``n_jobs=ncpus`` (a name defined outside this
    function). Returns the recommendation frame.
    """
    _log.info('running training')
    recommender = Recommender.adapt(knn.ItemItem(30))
    recommender.fit(train)

    test_users = test['user']
    _log.info('testing %d users', test_users.nunique())
    return batch.recommend(recommender, test_users.unique(), 100, n_jobs=ncpus)
def eval(train, test):
    """Compare the CG and LU solvers of ``ImplicitMF`` on one split.

    Both models are fit on ``train`` with ratings converted to 64-bit
    floats, and 100 items are recommended for every user in ``test``.

    The conversion is applied to a copy, so the caller's ``train`` frame
    is left unchanged.

    Returns:
        A single frame holding both recommendation sets, distinguished by
        a ``Method`` column with the values ``'CG'`` and ``'LU'``.
    """
    # np.float_ was removed in NumPy 2.0; np.float64 is the same dtype.
    train = train.assign(rating=train['rating'].astype(np.float64))

    _log.info('training CG')
    cg_algo = als.ImplicitMF(25, iterations=20, method='cg')
    cg_algo = Recommender.adapt(cg_algo)
    cg_algo.fit(train)

    _log.info('training LU')
    lu_algo = als.ImplicitMF(25, iterations=20, method='lu')
    lu_algo = Recommender.adapt(lu_algo)
    lu_algo.fit(train)

    users = test.user.unique()
    _log.info('testing %d users', len(users))
    cg_recs = batch.recommend(cg_algo, users, 100, n_jobs=2)
    lu_recs = batch.recommend(lu_algo, users, 100, n_jobs=2)

    return pd.concat({
        'CG': cg_recs,
        'LU': lu_recs,
    }, names=['Method']).reset_index('Method')
def test_uu_train_adapt():
    """Fitting an adapted user-user model returns the adapter itself (#129)."""
    from lenskit.algorithms import Recommender

    adapted = Recommender.adapt(knn.UserUser(30))
    fitted = adapted.fit(ml_ratings)

    assert fitted is adapted
    assert isinstance(adapted.predictor, knn.UserUser)
def do_recommend(algo_wrapper, train, test):
    """Fit a clone of the wrapped algorithm and recommend for the test users.

    ``algo_wrapper`` supplies the algorithm (``.algo``) and its label
    (``.name``). The list length is the name ``N`` defined outside this
    function. Returns the recommendations with an ``Algorithm`` column.
    """
    model = Recommender.adapt(util.clone(algo_wrapper.algo))
    model.fit(train)

    # now we run the recommender
    test_users = test['user'].unique()
    recommendations = batch.recommend(model, test_users, N)

    # add the algorithm name for analyzability
    recommendations['Algorithm'] = algo_wrapper.name
    return recommendations
def batch_eval(aname, algo, train, test):
    """Fit a clone of ``algo`` and produce 10 recommendations per test user.

    Returns the recommendation frame with an ``Algorithm`` column set to
    ``aname``.
    """
    model = util.clone(algo)
    model = Recommender.adapt(model)
    model.fit(train)

    # Now we run the recommender
    recommendations = batch.recommend(model, test['user'].unique(), 10)

    # Add the algorithm name for analyzability
    recommendations['Algorithm'] = aname
    return recommendations
def eval(self, aname, algo):
    """Fit a clone of ``algo`` on ``self.train`` and recommend for test users.

    Produces ``self.num_recs`` recommendations for each unique user in
    ``self.test`` and returns them with an ``Algorithm`` column set to
    ``aname``.
    """
    model = Recommender.adapt(util.clone(algo))
    model.fit(self.train)

    test_users = self.test['user'].unique()
    recommendations = batch.recommend(model, test_users, self.num_recs)
    recommendations['Algorithm'] = aname
    return recommendations
def test_store_iknn(store_cls):
    """A fitted item-item model fetched back from a store is a distinct object."""
    original = Recommender.adapt(ItemItem(10))
    original.fit(lktu.ml_test.ratings)

    with store_cls() as store:
        key = store.put_model(original)
        fetched = store.client().get_model(key)

        assert fetched is not original
        # Drop the reference before the store is closed.
        del fetched
def user_eval(aname, algo, train, userId):
    """Fit a clone of ``algo`` and return 10 recommendations for one user.

    The result carries an ``Algorithm`` column set to ``aname``.
    """
    model = Recommender.adapt(util.clone(algo))
    model.fit(train)

    # Now we run the recommender. The original also carried a disabled
    # variant that passed the user's own ratings to recommend().
    recommendations = model.recommend(userId, 10)

    # Add the algorithm name for analyzability
    recommendations['Algorithm'] = aname
    return recommendations
def __init__(self, mysql):
    """Load movies and ratings from MySQL and fit a user-user recommender.

    ``mysql`` must provide ``get_connection()``. Both tables are read
    through ``self.read_table`` and their columns are renamed to the
    ``item``/``title``/``genres`` and ``user``/``item``/``rating`` names.
    """
    self.mysql = mysql
    self.connection = mysql.get_connection()

    movies = self.read_table(
        """select * from movielenstable WHERE title IS NOT NULL AND genres IS NOT NULL;"""
    )
    movies.columns = ['item', 'title', 'genres']
    self.movies = movies

    ratings = self.read_table(
        """select * from lensratings WHERE rating IS NOT NULL;""")
    ratings.columns = ['user', 'item', 'rating']
    self.ratings = ratings

    self.user_user = UserUser(15, min_nbrs=3)
    self.algorithm = Recommender.adapt(self.user_user)
    self.algorithm.fit(self.ratings)
def run(self, strategy_context: RecommenderAlgorithmStrategyContext) -> np.ndarray:
    """Fit a ``Bias`` recommender on one split and return its item lists.

    The data set named by the context is read, split once with a 20%
    per-user test sample, and recommendations are generated for every
    test user.

    Returns the recommended items reshaped to
    ``(-1, number_of_recommendations)``.
    """
    source = strategy_context.data_set_source
    reader: DataFrameReaderStrategy = self.data_frame_reader_factory.create(source)
    frame: DataFrame = reader.parse(DataFrameReaderStrategyContext(source))

    splits = list(partition_users(data=frame, partitions=1,
                                  method=crossfold.SampleFrac(0.2)))
    first_split = splits[0]
    test = first_split.test
    train = first_split.train

    list_size = strategy_context.number_of_recommendations
    fitted = Recommender.adapt(Bias()).fit(train)
    recommendations = lenskit.batch.recommend(fitted, test['user'].unique(), list_size)

    items_by_user = recommendations.groupby('user')['item'].apply(lambda x: x)
    return items_by_user.to_numpy().reshape((-1, list_size))
def eval(aname, algo, train, test):
    """Fit a clone of ``algo`` and produce 100 recommendations per test user.

    Prints progress markers and the head of the result, then returns the
    recommendations with an ``Algorithm`` column set to ``aname``.
    """
    print("test")
    model = Recommender.adapt(util.clone(algo))
    model.fit(train)

    # now we run the recommender
    test_users = test['user'].unique()
    recommendations = batch.recommend(model, test_users, 100)

    # add the algorithm name for analyzability
    recommendations['Algorithm'] = aname

    print("recs")
    print(recommendations.head())
    return recommendations
def test_train_isolate():
    """An isolated-trained Bias recommender can be fetched and used."""
    recommender = Recommender.adapt(Bias())
    handle = train_isolated(recommender, ml_test.ratings)
    try:
        model = handle.get()
        assert isinstance(model, TopN)

        top_items = model.recommend(10, 10)
        assert len(top_items) == 10

        # Release references before the handle is closed.
        del top_items, model
    finally:
        handle.close()
def train(options: TrainOptions):
    """Train one algorithm on one data set and save the fitted model.

    ``options.train_data`` selects the output directory and training set:
    ``'all'`` trains on every rating and writes under ``models``;
    ``'eval'`` removes the rows of ``eval/test-ratings.parquet`` (matched
    by index) first and writes under ``eval``. The model is pickled to
    ``<algo_fn>.model`` and a log is kept in ``<algo_fn>.log``.

    Raises:
        ValueError: if ``options.train_data`` is neither ``'all'`` nor
            ``'eval'``.
    """
    seed = init_rng(rng_seed(), 'train-model', options.data, options.algo)
    _log.info('using random seed %s', seed)

    ddir = data_dir / options.data
    rating_file = ddir / 'ratings.parquet'
    if options.train_data == 'all':
        mdir = ddir / 'models'
    elif options.train_data == 'eval':
        mdir = ddir / 'eval'
    else:
        raise ValueError(f'unknown training data {options.train_data}')

    mdir.mkdir(parents=True, exist_ok=True)
    mfn = mdir / f'{options.algo_fn}.model'

    if options.default:
        # Logger.warn is deprecated; warning() is the supported spelling.
        _log.warning('Using default settings')
        opt_fn = None
    else:
        opt_fn = ddir / 'tuning' / f'{options.algo_fn}.json'
        _log.info('Using algorithm optimization results %s', opt_fn)

    with LogFile(mdir / f'{options.algo_fn}.log'):
        _log.info('reading ratings from %s', rating_file)
        ratings = pd.read_parquet(rating_file)
        if options.drop_ratings and 'rating' in ratings.columns:
            _log.info('dropping rating column')
            ratings = ratings.drop(columns=['rating'])

        if options.train_data == 'eval':
            _log.info('reading test data')
            test = pd.read_parquet(ddir / 'eval' / 'test-ratings.parquet')
            # Exclude the test rows (matched by index label) from training.
            train_mask = pd.Series(True, index=ratings.index)
            train_mask[test.index] = False
            ratings = ratings[train_mask].copy().reset_index(drop=True)

        # Without a rating column the algorithm is set up for implicit feedback.
        implicit = 'rating' not in ratings.columns
        # The arguments follow the message order: algorithm first, then data
        # set (they were previously passed the other way around).
        _log.info('loading algorithm %s for %s in %s mode',
                  options.algo, options.data,
                  'implicit' if implicit else 'explicit')
        algo = get_algorithm(options.data, options.algo, opt_fn, implicit)
        algo = Recommender.adapt(algo)

        _log.info('training %s on %s ratings', algo, len(ratings))
        timer = Stopwatch()
        model = algo.fit(ratings)
        timer.stop()
        _log.info('trained in %s', timer)

        _log.info('saving model to %s', mfn)
        with open(mfn, 'wb') as f:
            p = dt.CompactingPickler(f, protocol=4)
            p.dump(model)
def test_train_isolate_file(tmp_path):
    """Isolated training to an explicit file keeps that path and stays usable."""
    target = tmp_path / 'saved.bpk'
    recommender = Recommender.adapt(Bias())
    handle = train_isolated(recommender, ml_test.ratings, file=target)
    try:
        assert handle.path == target

        model = handle.get()
        assert isinstance(model, TopN)

        top_items = model.recommend(10, 10)
        assert len(top_items) == 10

        # Release references before the handle is closed.
        del top_items, model
    finally:
        handle.close()
def test_als_isolate(ml20m, rng):
    """Batch recommend and predict work on an isolated-trained BiasedMF model."""
    all_users = ml20m['user'].unique()
    users = rng.choice(all_users, 5000, replace=False)

    algo = Recommender.adapt(BiasedMF(20, iterations=10))
    _log.info('training %s', algo)
    handle = batch.train_isolated(algo, ml20m)
    try:
        _log.info('recommending with %s', algo)
        recs = batch.recommend(handle, users, 10)
        assert recs['user'].nunique() == 5000

        _log.info('predicting with %s', algo)
        pairs = ml20m.sample(1000)
        preds = batch.predict(handle, pairs)
        assert len(preds) == len(pairs)
    finally:
        handle.close()
def predictRatingForUnseenMovies(self, userMovieRatings, predictConfigDict):
    """Recommend movies for an ad-hoc user from the ratings supplied.

    The configuration is validated first; when any of its three values
    comes back as ``None`` the call returns ``(False, None)``. Otherwise
    a user-user model is fit on the movie data set's ratings and
    ``(True, recommendations)`` is returned for ``self.NON_EXISTING_USER``
    using ``userMovieRatings`` as that user's ratings.
    """
    settings = self._validatePredictConfig(predictConfigDict)
    numOfRecom, maxNumOfNeigh, minNumOfNeigh = settings
    if None in (numOfRecom, maxNumOfNeigh, minNumOfNeigh):
        return False, None

    recommender = Recommender.adapt(
        UserUser(maxNumOfNeigh, min_nbrs=minNumOfNeigh))
    recommender.fit(self.movieDataset.ratings)

    suppliedRatings = pd.Series(userMovieRatings)
    userRecom = recommender.recommend(self.NON_EXISTING_USER, numOfRecom,
                                      ratings=suppliedRatings)
    return True, userRecom
def objective_fn(params: Dict[str, Any]):
    """Score one hyper-parameter setting of ``BiasedMF`` by mean nDCG.

    ``params`` must contain ``"features"`` and ``"iteration"``. The model
    is fit on ``train_df`` and evaluated on ``test_df`` for ``test_users``
    with lists of ``recsize`` items (all names defined outside this
    function).

    Returns a dict with the negated mean nDCG under ``"loss"`` and
    ``STATUS_OK`` under ``"status"``.
    """
    candidate = als.BiasedMF(
        features=params["features"],
        iterations=params["iteration"],
        reg=0.1,
        damping=5,
    )
    recommender = Recommender.adapt(util.clone(candidate))
    recommender.fit(train_df)

    recommendations = batch.recommend(recommender, test_users, recsize)

    analysis = topn.RecListAnalysis()
    analysis.add_metric(topn.ndcg)
    scores = analysis.compute(recommendations, test_df)

    # Negated so that a minimiser maximises nDCG.
    loss = -scores.ndcg.mean()
    return {"loss": loss, "status": STATUS_OK}
out = args.get('FILE', None) model = args.get('ALGO') dsname = args.get('DATASET') _log.info('importing from module %s', mod_name) algorithms = importlib.import_module(mod_name) _log.info('locating model %s', model) algo = getattr(algorithms, model) _log.info('locating data set %s', dsname) data = getattr(datasets, dsname) _log.info('loading ratings') ratings = data.ratings _log.info('training model') algo = Recommender.adapt(algo) timer = Stopwatch() algo.fit(ratings) timer.stop() _log.info('trained model in %s', timer) if resource: res = resource.getrusage(resource.RUSAGE_SELF) _log.info('%.2fs user, %.2fs system, %.1fMB max RSS', res.ru_utime, res.ru_stime, res.ru_maxrss / 1024) if out is None: out = f'models/{dsname}-{model}.pkl.gz' _log.info('writing to %s', out) pathlib.Path(out).parent.mkdir(parents=True, exist_ok=True) with gzip.open(out, 'wb') as f:
dest.mkdir(exist_ok=True, parents=True) for file in path.glob("test-*"): test = pd.read_csv(file, sep=',') suffix = file.name[5:] try: train = pd.read_csv(path / f'train-{suffix}', sep=',') except FileNotFoundError: _log.error(f'train-{suffix} does not exists') continue _log.info('Fitting the model') users = test.user.unique() fittable = util.clone(algo) fittable = Recommender.adapt(fittable) fittable.fit(train) _log.info(f'generating recommendations for unique users') recs = batch.recommend(fittable, users, n_recs) _log.info(f'writing recommendations to {dest}') suffix = model + suffix recs.to_csv(dest / f'recs-{suffix}', index=False) if isinstance(fittable, Predictor): _log.info(f'generating predictions for user-item') preds = batch.predict(fittable, test) preds.to_csv(dest / f'pred-{suffix}', index=False)