Exemplo n.º 1
0
def test_adv_fill_users():
    """Check ``include_missing`` scoring over several partitions and algorithms.

    Two sampled partitions are each recommended for by a user-user and an
    item-item recommender, using a random draw from the test users.  Scores
    computed with ``include_missing=True`` must contain 50 * 4 rows, and
    must agree with the default computation wherever recall is present.
    """
    analysis = topn.RecListAnalysis()
    for metric in (topn.precision, topn.recall):
        analysis.add_metric(metric)

    algos = (
        ('UU', Recommender.adapt(UserUser(30, min_nbrs=10))),
        ('II', Recommender.adapt(ItemItem(20, min_nbrs=4))),
    )

    recs_by_run = {}
    test_by_part = {}
    partitions = xf.sample_users(ml_test.ratings, 2, 50, xf.SampleN(5))
    for part, (train, test) in enumerate(partitions, start=1):
        for label, model in algos:
            model.fit(train)
            # A fresh random draw per algorithm; duplicate users collapse in
            # unique(), so the draw need not cover every test user.
            chosen = test['user'].sample(50).unique()
            recs_by_run[(part, label)] = batch.recommend(model, chosen, 25)
        test_by_part[part] = test

    # Flatten the (part, algo) keys into ordinary columns.
    recs = pd.concat(recs_by_run, names=['part', 'algo'])
    recs = recs.reset_index(['part', 'algo']).reset_index(drop=True)

    truth = pd.concat(test_by_part, names=['part'])
    truth = truth.reset_index(['part']).reset_index(drop=True)

    filled = analysis.compute(recs, truth, include_missing=True)
    index_names = filled.index.names
    filled = filled.sort_index()
    assert len(filled) == 50 * 4
    assert (filled['ntruth'] == 5).all()
    assert filled['recall'].isna().sum() > 0
    _log.info('scores:\n%s', filled)

    per_algo = filled.reset_index().groupby('algo')['user'].agg(
        ['count', 'nunique'])
    assert (per_algo['count'] == 100).all()
    assert (per_algo['nunique'] == 100).all()

    # Default computation: expected to have fewer rows than the filled one.
    plain = analysis.compute(recs, truth)
    plain = plain.reset_index().set_index(index_names).sort_index()
    assert len(plain) < len(filled)
    _log.info('mscores:\n%s', plain)

    has_recall = filled['recall'].notna()
    recall = filled.loc[has_recall, 'recall'].copy()
    recall, plain_recall = recall.align(plain['recall'])
    assert (recall == plain_recall).all()
Exemplo n.º 2
0
def test_fill_users():
    """Check that ``include_missing=True`` yields one score row per test user.

    Recommendations are produced only for a random draw of the test users;
    the filled scores must have as many rows as there are test users, with
    non-missing recall only for the users that were recommended for.
    """
    analysis = topn.RecListAnalysis()
    for metric in (topn.precision, topn.recall):
        analysis.add_metric(metric)

    model = Recommender.adapt(UserUser(20, min_nbrs=10))

    partitions = xf.sample_users(ml_test.ratings, 1, 50, xf.SampleN(5))
    train, test = next(partitions)
    model.fit(train)

    chosen = test['user'].sample(50).unique()
    recs = batch.recommend(model, chosen, 25)

    filled = analysis.compute(recs, test, include_missing=True)
    assert len(filled) == test['user'].nunique()
    assert filled['recall'].notna().sum() == len(chosen)
    assert (filled['ntruth'] == 5).all()

    plain = analysis.compute(recs, test)
    assert len(plain) < len(filled)

    has_recall = filled['recall'].notna()
    recall = filled.loc[has_recall, 'recall'].copy()
    recall, plain_recall = recall.align(plain['recall'])
    assert (recall == plain_recall).all()
Exemplo n.º 3
0
    def _run_eval(self, params):
        """Evaluate one parameter setting and return its negated MRR.

        The algorithm comes from ``self.retrainer`` when one is set,
        otherwise it is instantiated from ``self.module`` and fitted on
        ``self.train``.  Recommendations for the users in ``self.test`` are
        scored with reciprocal rank and recall; missing scores count as 0.

        Returns ``0`` when no recommendations were produced, else ``-mrr``.
        """
        watch = Stopwatch()

        _log.info('evaluating at %s', params)

        if not self.retrainer:
            algo = Recommender.adapt(
                self.module.instantiate(params, not self.explicit))
            _log.info('[%s] train %s', watch, algo)
            algo.fit(self.train)
        else:
            # The retrainer is fitted once, on first use.
            if not self.retrainer.initialized:
                self.retrainer.fit_initial(self.train)
            algo = self.retrainer.instantiate(params)

        _log.info('[%s] recommend %s', watch, algo)
        test_users = self.test['user'].unique()
        recs = batch.recommend(algo, test_users, self.n_recs,
                               n_jobs=self.n_jobs)

        if not len(recs):
            _log.info('[%s] %s produced no recommendations', watch, algo)
            return 0

        _log.info('[%s] evaluate %s', watch, algo)
        analysis = topn.RecListAnalysis()
        for metric in (topn.recip_rank, topn.recall):
            analysis.add_metric(metric)
        scores = analysis.compute(recs, self.test, include_missing=True)
        assert len(scores) == len(self.test)

        recip_ranks = scores['recip_rank'].fillna(0)
        recalls = scores['recall'].fillna(0)
        mrr = recip_ranks.mean()
        hr = recalls.mean()
        _log.info('%s had MRR of %.3f', algo, mrr)
        _log.info('%s had hit rate of %.3f', algo, hr)
        return -mrr
Exemplo n.º 4
0
def main(args):
    """Train the requested algorithm on a data set and pickle the result.

    ``args`` is a mapping read with ``.get``: ``-m`` names the module holding
    the algorithm definitions, ``ALGO`` and ``DATASET`` name attributes to
    look up, and ``FILE`` is the optional output path.
    """
    mod_name = args.get('-m', 'lkdemo.algorithms')
    out = args.get('FILE', None)
    model = args.get('ALGO')
    dsname = args.get('DATASET')

    _log.info('importing from module %s', mod_name)
    algo_module = importlib.import_module(mod_name)

    _log.info('locating model %s', model)
    definition = getattr(algo_module, model)
    _log.info('locating data set %s', dsname)
    dataset = getattr(datasets, dsname)

    _log.info('loading ratings')
    ratings = dataset.ratings
    _log.info('training model')
    recommender = Recommender.adapt(definition)
    watch = Stopwatch()
    recommender.fit(ratings)
    watch.stop()
    _log.info('trained model in %s', watch)
    if resource:
        usage = resource.getrusage(resource.RUSAGE_SELF)
        _log.info('%.2fs user, %.2fs system, %.1fMB max RSS', usage.ru_utime,
                  usage.ru_stime, usage.ru_maxrss / 1024)

    # Fall back to a path derived from the data set and model names.
    out = f'models/{dsname}-{model}.pkl.gz' if out is None else out

    _log.info('writing to %s', out)
    out_dir = pathlib.Path(out).parent
    out_dir.mkdir(parents=True, exist_ok=True)
    with gzip.open(out, 'wb') as sink:
        pickle.dump(recommender, sink, 4)
Exemplo n.º 5
0
def train_model(train,
                n_factors=30,
                n_iterations=20,
                regularization=.1,
                save_training_loss=False,
                confidence_factor=40):
    """Train (and evaluate iterations if requested) model.

    Args:
        train: frame with a ``rating`` column; it is not modified.
        n_factors: number of latent features for ``ImplicitMF``.
        n_iterations: number of training iterations.
        regularization: regularization term passed to ``ImplicitMF``.
        save_training_loss: when true, record the loss after each iteration.
        confidence_factor: weight passed to ``ImplicitMF`` and used to turn
            ratings into confidence values.

    Returns:
        ``(model, loss)`` where ``loss`` is an array of length
        ``n_iterations``, or ``None`` when ``save_training_loss`` is false.
    """

    # Encapsulate the model into a TopN recommender
    model = Recommender.adapt(
        als.ImplicitMF(n_factors,
                       iterations=n_iterations,
                       # previously accepted but never forwarded
                       reg=regularization,
                       weight=confidence_factor,
                       progress=tqdm,
                       method='cg'))

    # Compute the confidence values for user-item pairs.  Work on a new
    # frame: rescaling the caller's column in place would compound the
    # scaling every time the same frame is passed in again.
    train = train.assign(rating=1 + confidence_factor * train['rating'])

    if save_training_loss:
        loss = np.zeros(n_iterations)

        for i, intermediate_model in enumerate(model.fit_iters(train)):
            predictions = generate_predictions(intermediate_model, train)
            loss[i] = evaluate_model_loss(intermediate_model, predictions)

    else:
        model.fit(train)
        loss = None

    return model, loss
Exemplo n.º 6
0
def main(args):
    """Generate recommendations (and predictions) for every test split.

    ``args`` supplies ``-m`` (module with algorithm definitions),
    ``--splits`` (directory holding ``test-*`` files), ``-o`` (output
    directory), ``-n`` (recommendation list length), ``ALGO`` (attribute
    name of the algorithm) and ``--no-predict``.

    For each ``test-*`` file the training data is read from the matching
    ``train-*`` file, or derived from the data set named after the splits
    directory when that file is absent.  Splits with no training data are
    logged and skipped.
    """
    mod_name = args.get('-m')
    splits_dir = args.get('--splits')  # was named `input`, shadowing the builtin
    output = args.get('-o')
    n_recs = int(args.get('-n'))
    algo_name = args.get('ALGO')

    _log.info(f'importing from module {mod_name}')
    algorithms = importlib.import_module(mod_name)

    algo = getattr(algorithms, algo_name)
    algo = Recommender.adapt(algo)

    path = Path(splits_dir)
    dest = Path(output)
    dest.mkdir(exist_ok=True, parents=True)

    ds_def = getattr(datasets, path.name, None)

    for file in path.glob("test-*"):
        test = pd.read_csv(file, sep=',')
        suffix = file.name[5:]  # strip the leading "test-"
        train_file = path / f'train-{suffix}'
        timer = util.Stopwatch()

        if 'index' in test.columns:
            _log.info('setting test index')
            test = test.set_index('index')
        else:
            # Logger.warn is a deprecated alias of Logger.warning
            _log.warning('no index column found in %s', file.name)

        if train_file.exists():
            _log.info('[%s] loading training data from %s', timer, train_file)
            train = pd.read_csv(train_file, sep=',')
        elif ds_def is not None:
            _log.info('[%s] extracting training data from data set %s', timer,
                      path.name)
            train = datasets.ds_diff(ds_def.ratings, test)
            train.reset_index(drop=True, inplace=True)
        else:
            _log.error('could not find training data for %s', file.name)
            continue

        _log.info('[%s] Fitting the model', timer)
        # We train isolated to manage resource use
        trained = batch.train_isolated(algo, train)
        try:
            _log.info('[%s] generating recommendations for unique users',
                      timer)
            users = test.user.unique()
            recs = batch.recommend(trained, users, n_recs)
            _log.info('[%s] writing recommendations to %s', timer, dest)
            recs.to_csv(dest / f'recs-{suffix}', index=False)

            if isinstance(algo, Predictor) and not args['--no-predict']:
                _log.info('[%s] generating predictions for user-item', timer)
                preds = batch.predict(trained, test)
                preds.to_csv(dest / f'pred-{suffix}', index=False)
        finally:
            # Always release the isolated model, even if scoring failed.
            trained.close()
Exemplo n.º 7
0
def test_uu_implicit_batch_accuracy():
    """Check that implicit-feedback user-user reaches a mean nDCG of 0.03.

    The algorithm is fitted on only the ``user`` and ``item`` columns of
    each of five user partitions, and 100 recommendations per test user are
    scored against the combined test data.
    """
    from lenskit import batch, topn
    import lenskit.crossfold as xf

    ratings = lktu.ml100k.ratings

    base_algo = knn.UserUser(30, center=False, aggregate='sum')

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    all_test = pd.concat([fold.test for fold in folds])

    fold_recs = []
    for train, test in folds:
        _log.info('running training')
        fitted = Recommender.adapt(base_algo)
        # Drop the rating column so the model sees implicit feedback only.
        fitted.fit(train.loc[:, ['user', 'item']])
        _log.info('testing %d users', test.user.nunique())
        fold_recs.append(
            batch.recommend(fitted, test.user.unique(), 100, n_jobs=2))
    recs = pd.concat(fold_recs)

    analysis = topn.RecListAnalysis()
    analysis.add_metric(topn.ndcg)
    per_user_ndcg = analysis.compute(recs, all_test).ndcg

    mean_ndcg = per_user_ndcg.mean()
    assert mean_ndcg >= 0.03
Exemplo n.º 8
0
def do_prepare(opts):
    """Write a test split and Popular/ALS recommendations to parquet files.

    ``opts['-d']`` names the data set; inputs are read from ``data/<name>``
    and the outputs go to ``data/<name>-test.parquet`` and
    ``data/<name>-recs.parquet``.
    """
    name = opts['-d']
    ml = MovieLens(f'data/{name}')

    partitions = sample_users(ml.ratings, 1, 10000, SampleN(5))
    train, test = next(partitions)

    test.to_parquet(f'data/{name}-test.parquet', index=False)

    _log.info('getting popular recs')
    popular = Popular()
    popular.fit(train)
    pop_recs = recommend(popular, test['user'].unique(), 100)

    _log.info('getting ALS recs')
    mf = Recommender.adapt(ImplicitMF(20, iterations=10))
    # The ALS model is trained without the rating column.
    mf.fit(train.drop(columns=['rating']))
    als_recs = recommend(mf, test['user'].unique(), 100)

    _log.info('merging recs')
    by_algorithm = {'Popular': pop_recs, 'ALS': als_recs}
    recs = pd.concat(by_algorithm, names=['Algorithm'])
    recs = recs.reset_index('Algorithm')
    recs.to_parquet(f'data/{name}-recs.parquet', index=False)
Exemplo n.º 9
0
def test_tf_bpr_batch_accuracy(tf_session):
    """Check that BPR reaches a mean nDCG above 0.1 over five partitions.

    Users without a score are included and counted as 0.
    """
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    ratings = lktu.ml100k.ratings

    model = Recommender.adapt(
        lktf.BPR(20, batch_size=1024, epochs=20, rng_spec=42))

    fold_recs = []
    fold_tests = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        _log.info('running training')
        model.fit(train)
        _log.info('testing %d users', test.user.nunique())
        fold_recs.append(batch.recommend(model, np.unique(test.user), 50))
        fold_tests.append(test)

    _log.info('analyzing results')
    analysis = topn.RecListAnalysis()
    for metric in (topn.ndcg, topn.recip_rank):
        analysis.add_metric(metric)
    all_recs = pd.concat(fold_recs, ignore_index=True)
    all_test = pd.concat(fold_tests, ignore_index=True)
    scores = analysis.compute(all_recs, all_test, include_missing=True)
    scores = scores.fillna(0)
    _log.info('MRR: %f', scores['recip_rank'].mean())
    _log.info('nDCG: %f', scores['ndcg'].mean())
    assert scores['ndcg'].mean() > 0.1
 def eval(aname, algo, train, test, all_preds):
     """Fit a clone of ``algo`` and append its labelled predictions.

     Predictions for ``test`` get an ``Algorithm`` column set to ``aname``
     and are appended to ``all_preds``; nothing is returned.
     """
     model = Recommender.adapt(util.clone(algo))
     model.fit(train)
     # predict ratings
     predictions = batch.predict(model, test)
     predictions['Algorithm'] = aname
     all_preds.append(predictions)
Exemplo n.º 11
0
def _train_algo(data, algo, ratings):
    """Adapt ``algo`` to a recommender, fit it on ``ratings`` and return it.

    Training time is logged.  ``data`` is not used by this function.
    """
    recommender = Recommender.adapt(algo)
    _log.info('training algorithm %s', recommender)
    watch = Stopwatch()
    recommender.fit(ratings)
    watch.stop()
    _log.info('trained %s in %s', recommender, watch)
    return recommender
Exemplo n.º 12
0
 def eval(train, test):
     """Fit item-item k-NN on ``train`` and recommend 100 items per test user."""
     _log.info('running training')
     model = Recommender.adapt(knn.ItemItem(30))
     model.fit(train)
     _log.info('testing %d users', test.user.nunique())
     test_users = test.user.unique()
     return batch.recommend(model, test_users, 100, n_jobs=ncpus)
Exemplo n.º 13
0
 def eval(train, test):
     """Recommend with CG- and LU-trained ImplicitMF models for comparison.

     Both models are fitted on ``train`` (whose ``rating`` column is cast to
     float in place) and asked for 100 items per user in ``test``.

     Returns:
         A single frame with a ``Method`` column holding ``'CG'`` or ``'LU'``.
     """
     # np.float_ was removed in NumPy 2.0; np.float64 is the same dtype.
     train['rating'] = train.rating.astype(np.float64)
     _log.info('training CG')
     cg_algo = als.ImplicitMF(25, iterations=20, method='cg')
     cg_algo = Recommender.adapt(cg_algo)
     cg_algo.fit(train)
     _log.info('training LU')
     lu_algo = als.ImplicitMF(25, iterations=20, method='lu')
     lu_algo = Recommender.adapt(lu_algo)
     lu_algo.fit(train)
     users = test.user.unique()
     _log.info('testing %d users', len(users))
     cg_recs = batch.recommend(cg_algo, users, 100, n_jobs=2)
     lu_recs = batch.recommend(lu_algo, users, 100, n_jobs=2)
     return pd.concat({
         'CG': cg_recs,
         'LU': lu_recs
     }, names=['Method']).reset_index('Method')
Exemplo n.º 14
0
def test_uu_train_adapt():
    """Test training an adapted user-user (#129).

    ``fit`` must return the adapted recommender itself, and the wrapped
    predictor must still be a ``UserUser`` instance.
    """
    from lenskit.algorithms import Recommender

    adapted = Recommender.adapt(knn.UserUser(30))
    fitted = adapted.fit(ml_ratings)
    assert fitted is adapted
    assert isinstance(adapted.predictor, knn.UserUser)
Exemplo n.º 15
0
def do_recommend(algo_wrapper, train, test):
    """Fit a clone of the wrapped algorithm and recommend for the test users.

    Returns the recommendations with an ``Algorithm`` column set to
    ``algo_wrapper.name``; ``N`` items are requested per user.
    """
    model = Recommender.adapt(util.clone(algo_wrapper.algo))
    model.fit(train)
    test_users = test.user.unique()
    # now we run the recommender
    result = batch.recommend(model, test_users, N)
    # add the algorithm name for analyzability
    result['Algorithm'] = algo_wrapper.name
    return result
Exemplo n.º 16
0
def batch_eval(aname, algo, train, test):
    """Fit a clone of ``algo`` and recommend 10 items per test user.

    Returns the recommendations with an ``Algorithm`` column set to ``aname``.
    """
    model = Recommender.adapt(util.clone(algo))
    model.fit(train)
    test_users = test.user.unique()
    # Now we run the recommender
    result = batch.recommend(model, test_users, 10)
    # Add the algorithm name for analyzability
    result['Algorithm'] = aname
    return result
Exemplo n.º 17
0
 def eval(self, aname, algo):
     """
     Fit a clone of ``algo`` on ``self.train`` and recommend
     ``self.num_recs`` items for every user in ``self.test``.

     Returns the recommendations with an ``Algorithm`` column set to
     ``aname``.
     """
     model = Recommender.adapt(util.clone(algo))
     model.fit(self.train)
     test_users = self.test.user.unique()
     result = batch.recommend(model, test_users, self.num_recs)
     result['Algorithm'] = aname
     return result
Exemplo n.º 18
0
def test_store_iknn(store_cls):
    """Check that a stored item-item model comes back as a distinct object."""
    original = Recommender.adapt(ItemItem(10))
    original.fit(lktu.ml_test.ratings)

    with store_cls() as store:
        key = store.put_model(original)
        client = store.client()

        fetched = client.get_model(key)
        assert fetched is not original
        # Drop the reference before the store context exits.
        del fetched
Exemplo n.º 19
0
def user_eval(aname, algo, train, userId):
    """Fit a clone of ``algo`` and recommend 10 items for a single user.

    Returns the recommendations with an ``Algorithm`` column set to ``aname``.
    """
    model = Recommender.adapt(util.clone(algo))
    model.fit(train)

    # Now we run the recommender.  A variant that passed the user's own
    # reviews via ``ratings=`` was left disabled in the original code.
    result = model.recommend(userId, 10)
    # Add the algorithm name for analyzability
    result['Algorithm'] = aname
    return result
 def __init__(self, mysql):
     """Load movies and ratings through ``read_table`` and fit user-user k-NN.

     The two tables are renamed to the ``item``/``title``/``genres`` and
     ``user``/``item``/``rating`` column names before fitting.
     """
     self.mysql = mysql
     self.connection = mysql.get_connection()

     self.movies = self.read_table(
         """select * from movielenstable WHERE title IS NOT NULL AND genres IS NOT NULL;"""
     )
     self.movies.columns = ['item', 'title', 'genres']

     self.ratings = self.read_table(
         """select * from lensratings WHERE rating IS NOT NULL;""")
     self.ratings.columns = ['user', 'item', 'rating']

     knn_model = UserUser(15, min_nbrs=3)
     self.user_user = knn_model
     self.algorithm = Recommender.adapt(knn_model)
     self.algorithm.fit(self.ratings)
Exemplo n.º 21
0
    def run(self, strategy_context: RecommenderAlgorithmStrategyContext) -> np.ndarray:
        """Fit a ``Bias`` recommender and return recommended items as a 2-D array.

        The data set named by the context is read, split into one user
        partition with a 20% sample as test data, and recommendations are
        produced for every test user.  The result is reshaped to
        ``(-1, number_of_recommendations)``.
        NOTE(review): the reshape presumably relies on every user receiving
        a full list -- confirm.
        """
        source = strategy_context.data_set_source
        reader: DataFrameReaderStrategy = self.data_frame_reader_factory.create(source)
        frame: DataFrame = reader.parse(DataFrameReaderStrategyContext(source))

        folds = list(partition_users(data=frame, partitions=1, method=crossfold.SampleFrac(0.2)))
        fold = folds[0]
        test = fold.test
        train = fold.train
        n_recs = strategy_context.number_of_recommendations

        fitted = Recommender.adapt(Bias()).fit(train)
        test_users = test['user'].unique()
        recs = lenskit.batch.recommend(fitted, test_users, n_recs)

        items = recs.groupby('user')['item'].apply(lambda x: x)
        return items.to_numpy().reshape((-1, n_recs))
Exemplo n.º 22
0
 def eval(aname, algo, train, test):
     """Fit a clone of ``algo`` and recommend 100 items per test user.

     Prints progress markers and the head of the result, and returns the
     recommendations with an ``Algorithm`` column set to ``aname``.
     """
     print("test")
     model = Recommender.adapt(util.clone(algo))
     model.fit(train)
     test_users = test.user.unique()
     # now we run the recommender
     result = batch.recommend(model, test_users, 100)
     # add the algorithm name for analyzability
     result['Algorithm'] = aname
     print("recs")
     print(result.head())
     return result
Exemplo n.º 23
0
def test_train_isolate():
    """Check that an isolated-trained Bias model loads and recommends."""
    recommender = Recommender.adapt(Bias())

    handle = train_isolated(recommender, ml_test.ratings)
    try:
        model = handle.get()
        assert isinstance(model, TopN)
        top_items = model.recommend(10, 10)
        assert len(top_items) == 10
        # Release the references before the handle is closed.
        del top_items, model
    finally:
        handle.close()
Exemplo n.º 24
0
def train(options: TrainOptions):
    """Train the configured algorithm and pickle the fitted model.

    ``options.train_data`` selects the output directory: ``'all'`` writes
    under ``models`` and ``'eval'`` under ``eval``, where the rows indexed
    by the test ratings are first removed from the training data.

    Raises:
        ValueError: if ``options.train_data`` is neither ``'all'`` nor
            ``'eval'``.
    """
    seed = init_rng(rng_seed(), 'train-model', options.data, options.algo)
    _log.info('using random seed %s', seed)

    ddir = data_dir / options.data
    rating_file = ddir / 'ratings.parquet'
    if options.train_data == 'all':
        mdir = ddir / 'models'
    elif options.train_data == 'eval':
        mdir = ddir / 'eval'
    else:
        raise ValueError(f'unknown training data {options.train_data}')

    mdir.mkdir(parents=True, exist_ok=True)
    mfn = mdir / f'{options.algo_fn}.model'
    if options.default:
        # Logger.warn is a deprecated alias of Logger.warning
        _log.warning('Using default settings')
        opt_fn = None
    else:
        opt_fn = ddir / 'tuning' / f'{options.algo_fn}.json'
        _log.info('Using algorithm optimization results %s', opt_fn)

    with LogFile(mdir / f'{options.algo_fn}.log'):
        _log.info('reading ratings from %s', rating_file)
        ratings = pd.read_parquet(rating_file)
        if options.drop_ratings and 'rating' in ratings.columns:
            _log.info('dropping rating column')
            ratings = ratings.drop(columns=['rating'])
        if options.train_data == 'eval':
            _log.info('reading test data')
            test = pd.read_parquet(ddir / 'eval' / 'test-ratings.parquet')
            # Mask out the rows whose index labels appear in the test data.
            train_mask = pd.Series(True, index=ratings.index)
            train_mask[test.index] = False
            ratings = ratings[train_mask].copy().reset_index(drop=True)

        # Without a rating column the data is treated as implicit feedback.
        implicit = 'rating' not in ratings.columns

        # The message reads "algorithm X for Y": algorithm first, then data
        # (the two arguments were previously swapped).
        _log.info('loading algorithm %s for %s in %s mode', options.algo,
                  options.data, 'implicit' if implicit else 'explicit')
        algo = get_algorithm(options.data, options.algo, opt_fn, implicit)
        algo = Recommender.adapt(algo)

        _log.info('training %s on %s ratings', algo, len(ratings))
        timer = Stopwatch()
        model = algo.fit(ratings)
        timer.stop()
        _log.info('trained in %s', timer)
        _log.info('saving model to %s', mfn)
        with open(mfn, 'wb') as f:
            p = dt.CompactingPickler(f, protocol=4)
            p.dump(model)
Exemplo n.º 25
0
def test_train_isolate_file(tmp_path):
    """Check isolated training when the model file path is given explicitly."""
    target = tmp_path / 'saved.bpk'
    recommender = Recommender.adapt(Bias())

    handle = train_isolated(recommender, ml_test.ratings, file=target)
    try:
        assert handle.path == target
        model = handle.get()
        assert isinstance(model, TopN)
        top_items = model.recommend(10, 10)
        assert len(top_items) == 10
        # Release the references before the handle is closed.
        del top_items, model
    finally:
        handle.close()
Exemplo n.º 26
0
def test_als_isolate(ml20m, rng):
    """Check batch recommend and predict against an isolated-trained BiasedMF.

    5000 distinct users are drawn for recommendation and 1000 sampled rows
    are used for prediction.
    """
    chosen_users = rng.choice(ml20m['user'].unique(), 5000, replace=False)
    algo = Recommender.adapt(BiasedMF(20, iterations=10))
    _log.info('training %s', algo)
    handle = batch.train_isolated(algo, ml20m)
    try:
        _log.info('recommending with %s', algo)
        recs = batch.recommend(handle, chosen_users, 10)
        assert recs['user'].nunique() == 5000

        _log.info('predicting with %s', algo)
        sampled_pairs = ml20m.sample(1000)
        preds = batch.predict(handle, sampled_pairs)
        assert len(preds) == len(sampled_pairs)
    finally:
        handle.close()
Exemplo n.º 27
0
    def predictRatingForUnseenMovies(self, userMovieRatings,
                                     predictConfigDict):
        """Recommend movies for an ad-hoc user from the supplied ratings.

        Returns ``(False, None)`` when any validated configuration value is
        ``None``; otherwise ``(True, recommendations)`` from a user-user
        model fitted on ``self.movieDataset.ratings``.
        """
        numOfRecom, maxNumOfNeigh, minNumOfNeigh = \
            self._validatePredictConfig(predictConfigDict)

        configValues = (numOfRecom, maxNumOfNeigh, minNumOfNeigh)
        if None in configValues:
            return False, None

        recommender = Recommender.adapt(
            UserUser(maxNumOfNeigh, min_nbrs=minNumOfNeigh))
        recommender.fit(self.movieDataset.ratings)

        # The supplied ratings are passed along with the placeholder user id.
        suppliedRatings = pd.Series(userMovieRatings)
        userRecom = recommender.recommend(self.NON_EXISTING_USER,
                                          numOfRecom,
                                          ratings=suppliedRatings)

        return True, userRecom
Exemplo n.º 28
0
    def objective_fn(params: Dict[str, Any]):
        """Return the negated mean nDCG of BiasedMF for one parameter setting.

        ``params`` supplies ``"features"`` and ``"iteration"``.  The result
        is a dict with ``"loss"`` and ``"status"`` keys.
        """
        base = als.BiasedMF(
            features=params["features"],
            iterations=params["iteration"],
            reg=0.1,
            damping=5,
        )

        fitted = Recommender.adapt(util.clone(base))
        fitted.fit(train_df)

        recs = batch.recommend(fitted, test_users, recsize)

        analysis = topn.RecListAnalysis()
        analysis.add_metric(topn.ndcg)
        per_user = analysis.compute(recs, test_df)

        # Negated so that a lower loss means a higher nDCG.
        loss = -per_user.ndcg.mean()

        return {"loss": loss, "status": STATUS_OK}
Exemplo n.º 29
0
out = args.get('FILE', None)
model = args.get('ALGO')
dsname = args.get('DATASET')

_log.info('importing from module %s', mod_name)
algorithms = importlib.import_module(mod_name)

_log.info('locating model %s', model)
algo = getattr(algorithms, model)
_log.info('locating data set %s', dsname)
data = getattr(datasets, dsname)

_log.info('loading ratings')
ratings = data.ratings
_log.info('training model')
algo = Recommender.adapt(algo)
timer = Stopwatch()
algo.fit(ratings)
timer.stop()
_log.info('trained model in %s', timer)
if resource:
    res = resource.getrusage(resource.RUSAGE_SELF)
    _log.info('%.2fs user, %.2fs system, %.1fMB max RSS', res.ru_utime,
              res.ru_stime, res.ru_maxrss / 1024)

if out is None:
    out = f'models/{dsname}-{model}.pkl.gz'

_log.info('writing to %s', out)
pathlib.Path(out).parent.mkdir(parents=True, exist_ok=True)
with gzip.open(out, 'wb') as f:
Exemplo n.º 30
0
# Fragment: relies on `dest`, `path`, `algo`, `model` and `n_recs` being bound
# earlier, outside the visible code.
# Make sure the output directory exists before anything is written to it.
dest.mkdir(exist_ok=True, parents=True)

# Each test-* file is paired with the train-* file sharing its suffix.
for file in path.glob("test-*"):
    test = pd.read_csv(file, sep=',')
    suffix = file.name[5:]  # drop the leading "test-"

    try:
        train = pd.read_csv(path / f'train-{suffix}', sep=',')
    except FileNotFoundError:
        # No matching training file: report it and move on to the next split.
        _log.error(f'train-{suffix} does not exists')
        continue

    _log.info('Fitting the model')

    users = test.user.unique()

    # Fit a fresh clone for every split.
    fittable = util.clone(algo)
    fittable = Recommender.adapt(fittable)
    fittable.fit(train)

    _log.info(f'generating recommendations for unique users')
    recs = batch.recommend(fittable, users, n_recs)
    _log.info(f'writing recommendations to {dest}')
    # Prefix the suffix with `model` so the output file names include it.
    suffix = model + suffix
    recs.to_csv(dest / f'recs-{suffix}', index=False)

    # Predictions are written only when the fitted object is a Predictor.
    if isinstance(fittable, Predictor):
        _log.info(f'generating predictions for user-item')
        preds = batch.predict(fittable, test)
        preds.to_csv(dest / f'pred-{suffix}', index=False)