Пример #1
0
def recommend(algo, users, n, candidates=None, *, nprocs=None, dask_result=False, **kwargs):
    """
    Batch-recommend for multiple users.  The provided algorithm should be a
    :py:class:`algorithms.Recommender`.

    Args:
        algo: the algorithm
        users(array-like): the users to recommend for
        n(int): the number of recommendations to generate (None for unlimited)
        candidates:
            the users' candidate sets. This can be a function, in which case it will
            be passed each user ID; it can also be a dictionary, in which case user
            IDs will be looked up in it.  Pass ``None`` to use the recommender's
            built-in candidate selector (usually recommended).
        nprocs(int):
            The number of processes to use for parallel recommendations.  Passed as
            ``n_jobs`` to :class:`joblib.Parallel`.  The default, ``None``, will make
            the process sequential *unless* called inside the :func:`joblib.parallel_backend`
            context manager.
        dask_result(bool):
            Whether to return a Dask data frame instead of a Pandas one.
        kwargs:
            Accepted for backwards compatibility.  Passing ``ratings`` emits a
            :class:`DeprecationWarning`; no other entry is read by this function.

    Returns:
        A frame with at least the columns ``user``, ``rank``, and ``item``; possibly also
        ``score``, and any other columns returned by the recommender.
    """

    rec_algo = Recommender.adapt(algo)
    if candidates is None and rec_algo is not algo:
        # stacklevel=2 attributes the warning to the code that called recommend()
        warnings.warn('no candidates provided and algo is not a recommender, unlikely to work',
                      stacklevel=2)
    del algo  # don't need reference any more

    if 'ratings' in kwargs:
        # Without stacklevel=2 the warning is attributed to this module, and the
        # default filters only show DeprecationWarning raised from __main__.
        warnings.warn('Providing ratings to recommend is not supported', DeprecationWarning,
                      stacklevel=2)

    candidates = __standard_cand_fun(candidates)

    loop = Parallel(n_jobs=nprocs)

    # Temporary pickle of the algorithm; only created on the multi-job, non-Dask path.
    path = None
    try:
        with loop:
            # NOTE: _backend and _effective_n_jobs are underscore-prefixed
            # attributes of the Parallel object, read once the context is active.
            backend = loop._backend.__class__.__name__
            njobs = loop._effective_n_jobs()
            _logger.info('parallel backend %s, effective njobs %s',
                         backend, njobs)
            using_dask = backend == 'DaskDistributedBackend'
            if using_dask:
                # Scatter the algorithm once and hand the resulting future to the tasks.
                _logger.debug('pre-scattering algorithm %s', rec_algo)
                futures = loop._backend.client.scatter([rec_algo], broadcast=True, hash=False)
                rec_algo = _AlgoKey('future', futures[0])
            elif njobs > 1:
                # Serialize the algorithm to a file once and pass the tasks a
                # file-based key instead of the algorithm object itself.
                fd, path = tempfile.mkstemp(prefix='lkpy-predict', suffix='.pkl')
                path = pathlib.Path(path)
                os.close(fd)
                _logger.debug('pre-serializing algorithm %s to %s', rec_algo, path)
                dump(rec_algo, path)
                rec_algo = _AlgoKey('file', path)

            _logger.info('recommending for %d users (nprocs=%s)', len(users), nprocs)
            timer = util.Stopwatch()
            results = loop(delayed(_recommend_user)(rec_algo, user, n, candidates(user))
                           for user in users)

            if using_dask or dask_result:
                results = ddf.concat(results, interleave_partitions=True)
                if not dask_result:  # only if we're running inside dask, but don't want results
                    results = results.compute()
            else:
                results = pd.concat(results, ignore_index=True)
            _logger.info('recommended for %d users in %s', len(users), timer)
    finally:
        # Runs on success and on failure; path is None when no file was created.
        util.delete_sometime(path)

    return results
Пример #2
0
def predict(algo, pairs, *, nprocs=None):
    """
    Compute predictions for a frame of user-item pairs.  ``algo`` should be a
    :py:class:`algorithms.Predictor` or a function of two arguments (the user ID
    and a list of item IDs) returning a dictionary or a :py:class:`pandas.Series`
    that maps item IDs to predictions.

    The algorithm must already be fit before it is passed in::

        >>> from lenskit.algorithms.basic import Bias
        >>> from lenskit.metrics.predict import rmse
        >>> ratings = util.load_ml_ratings()
        >>> bias = Bias()
        >>> bias.fit(ratings[:-1000])
        <lenskit.algorithms.basic.Bias object at ...>
        >>> preds = predict(bias, ratings[-1000:])
        >>> preds.head()
               user  item  rating   timestamp  prediction
        99004   664  8361     3.0  1393891425    3.288286
        99005   664  8528     3.5  1393891047    3.559119
        99006   664  8529     4.0  1393891173    3.573008
        99007   664  8636     4.0  1393891175    3.846268
        99008   664  8641     4.5  1393890852    3.710635
        >>> rmse(preds['prediction'], preds['rating'])
        0.8326992222...

    Args:
        algo(lenskit.algorithms.Predictor):
            A rating predictor function or algorithm.
        pairs(pandas.DataFrame):
            The (``user``, ``item``) pairs to predict for.  When the frame also has
            a ``rating`` column, that column is carried into the result.
        nprocs(int):
            Number of processes for parallel batch prediction, handed to
            :class:`joblib.Parallel` as ``n_jobs``.  With the default ``None`` the
            work runs sequentially *unless* the call is made inside the
            :func:`joblib.parallel_backend` context manager.

    Returns:
        pandas.DataFrame:
            a frame with columns ``user``, ``item``, and ``prediction``; it also
            has a ``rating`` column whenever ``pairs`` does.
    """

    pool = Parallel(n_jobs=nprocs)

    # Set only when the algorithm is pickled to a temporary file below.
    model_file = None
    try:
        if pool._effective_n_jobs() > 1:
            # More than one job: write the algorithm to disk once and give the
            # tasks a file-based key in place of the object.
            handle, tmp_name = tempfile.mkstemp(prefix='lkpy-predict', suffix='.pkl')
            model_file = pathlib.Path(tmp_name)
            os.close(handle)
            _logger.debug('pre-serializing algorithm %s to %s', algo, model_file)
            dump(algo, model_file)
            algo = _AlgoKey('file', model_file)

        # One task per user, each receiving that user's rows of ``pairs``.
        per_user = pool(
            delayed(_predict_user)(algo, uid, frame)
            for uid, frame in pairs.groupby('user'))

        results = pd.concat(per_user, ignore_index=True)
    finally:
        util.delete_sometime(model_file)

    if 'rating' not in pairs:
        return results

    # Attach the predictions to the input rows, keeping the input's extra columns.
    keyed = results.set_index(['user', 'item'])
    return pairs.join(keyed, on=('user', 'item'))
Пример #3
0
def predict(algo, pairs, *, n_jobs=None, **kwargs):
    """
    Generate predictions for user-item pairs.  The provided algorithm should be a
    :py:class:`algorithms.Predictor` or a function of two arguments: the user ID and
    a list of item IDs. It should return a dictionary or a :py:class:`pandas.Series`
    mapping item IDs to predictions.

    To use this function, provide a pre-fit algorithm::

        >>> from lenskit.algorithms.basic import Bias
        >>> from lenskit.metrics.predict import rmse
        >>> from lenskit import datasets
        >>> ratings = datasets.MovieLens('ml-latest-small').ratings
        >>> bias = Bias()
        >>> bias.fit(ratings[:-1000])
        <lenskit.algorithms.basic.Bias object at ...>
        >>> preds = predict(bias, ratings[-1000:])
        >>> preds.head()
               user  item  rating   timestamp  prediction
        99004   664  8361     3.0  1393891425    3.288286
        99005   664  8528     3.5  1393891047    3.559119
        99006   664  8529     4.0  1393891173    3.573008
        99007   664  8636     4.0  1393891175    3.846268
        99008   664  8641     4.5  1393890852    3.710635
        >>> rmse(preds['prediction'], preds['rating'])
        0.8326992222...

    Args:
        algo(lenskit.algorithms.Predictor):
            A rating predictor function or algorithm.
        pairs(pandas.DataFrame):
            A data frame of (``user``, ``item``) pairs to predict for. If this frame also
            contains a ``rating`` column, it will be included in the result.
        n_jobs(int):
            The number of processes to use for parallel batch prediction.  Passed as
            ``n_jobs`` to :class:`joblib.Parallel`.  The default, ``None``, is resolved
            by calling :func:`util.proc_count` with ``None``, so the process will be
            sequential *unless* called inside the :func:`joblib.parallel_backend`
            context manager or the ``LK_NUM_PROCS`` environment variable is set.
        kwargs:
            Accepted for backwards compatibility.  ``nprocs`` is honoured as a
            deprecated alias for ``n_jobs`` (only when ``n_jobs`` is ``None``) and
            emits a :class:`DeprecationWarning`; no other entry is read by this
            function.

    Returns:
        pandas.DataFrame:
            a frame with columns ``user``, ``item``, and ``prediction`` containing
            the prediction results. If ``pairs`` contains a `rating` column, this
            result will also contain a `rating` column.
    """
    if n_jobs is None and 'nprocs' in kwargs:
        n_jobs = kwargs['nprocs']
        # stacklevel=2 attributes the warning to the caller; otherwise it is
        # attributed to this module, and the default filters only show
        # DeprecationWarning raised from __main__.
        warnings.warn('nprocs is deprecated, use n_jobs', DeprecationWarning,
                      stacklevel=2)

    if n_jobs is None:
        n_jobs = util.proc_count(None)

    loop = Parallel(n_jobs=n_jobs)

    # NOTE(review): path is never reassigned in this function, so the cleanup in
    # the finally clause always receives None -- looks like a leftover; confirm
    # against util.delete_sometime before removing.
    path = None
    try:
        # Request an in-process store when only one job will effectively run.
        store = get_store(in_process=loop._effective_n_jobs() == 1)
        _logger.info('using model store %s', store)

        with store:
            # Put the algorithm in the store and drop the local reference; the
            # tasks receive the store client and the key instead of the object.
            key = store.put_model(algo)
            del algo
            client = store.client()

            nusers = pairs['user'].nunique()
            _logger.info('generating %d predictions for %d users', len(pairs),
                         nusers)
            # One task per user; each task gets its own copy of that user's rows.
            results = loop(
                delayed(_predict_user)(client, key, user, udf.copy())
                for (user, udf) in pairs.groupby('user'))

        results = pd.concat(results, ignore_index=True, copy=False)
    finally:
        util.delete_sometime(path)

    if 'rating' in pairs:
        # Attach the predictions to the input rows, keeping the input's extra columns.
        return pairs.join(results.set_index(['user', 'item']),
                          on=('user', 'item'))
    return results
Пример #4
0
def recommend(algo, users, n, candidates=None, *, n_jobs=None, **kwargs):
    """
    Batch-recommend for multiple users.  The provided algorithm should be a
    :py:class:`algorithms.Recommender`.

    Args:
        algo: the algorithm
        users(array-like): the users to recommend for
        n(int): the number of recommendations to generate (None for unlimited)
        candidates:
            the users' candidate sets. This can be a function, in which case it will
            be passed each user ID; it can also be a dictionary, in which case user
            IDs will be looked up in it.  Pass ``None`` to use the recommender's
            built-in candidate selector (usually recommended).
        n_jobs(int):
            The number of processes to use for parallel recommendations.  Passed as
            ``n_jobs`` to :class:`joblib.Parallel`.  The default, ``None``, will make
            the process sequential *unless* called inside the :func:`joblib.parallel_backend`
            context manager.

            .. note:: ``nprocs`` is accepted as a deprecated alias.
        kwargs:
            Accepted for backwards compatibility.  ``nprocs`` is used as ``n_jobs``
            when ``n_jobs`` is ``None``, and ``ratings`` is rejected as unsupported;
            both emit a :class:`DeprecationWarning`.  No other entry is read by
            this function.

    Returns:
        A frame with at least the columns ``user``, ``rank``, and ``item``; possibly also
        ``score``, and any other columns returned by the recommender.
    """

    if n_jobs is None and 'nprocs' in kwargs:
        n_jobs = kwargs['nprocs']
        # stacklevel=2 attributes the warning to the caller; otherwise it is
        # attributed to this module, and the default filters only show
        # DeprecationWarning raised from __main__.
        warnings.warn('nprocs is deprecated, use n_jobs', DeprecationWarning,
                      stacklevel=2)

    rec_algo = Recommender.adapt(algo)
    if candidates is None and rec_algo is not algo:
        warnings.warn(
            'no candidates provided and algo is not a recommender, unlikely to work',
            stacklevel=2
        )
    del algo  # don't need reference any more

    if 'ratings' in kwargs:
        warnings.warn('Providing ratings to recommend is not supported',
                      DeprecationWarning, stacklevel=2)

    candidates = __standard_cand_fun(candidates)

    loop = Parallel(n_jobs=n_jobs)

    # Temporary pickle of the algorithm; only created when more than one job runs.
    path = None
    try:
        _logger.debug('activating recommender loop')
        with loop:
            # NOTE: _backend and _effective_n_jobs are underscore-prefixed
            # attributes of the Parallel object, read once the context is active.
            backend = loop._backend.__class__.__name__
            njobs = loop._effective_n_jobs()
            _logger.info('parallel backend %s, effective njobs %s', backend,
                         njobs)
            # Capture the description now: rec_algo may be replaced by a key below.
            astr = str(rec_algo)
            if njobs > 1:
                # Serialize the algorithm to a file once and pass the tasks a
                # file-based key instead of the algorithm object itself.
                fd, path = tempfile.mkstemp(prefix='lkpy-predict',
                                            suffix='.pkl',
                                            dir=util.scratch_dir(joblib=True))
                path = pathlib.Path(path)
                os.close(fd)
                _logger.debug('pre-serializing algorithm %s to %s', rec_algo,
                              path)
                # The dump runs inside the sharing_mode() context (defined elsewhere).
                with sharing_mode():
                    dump(rec_algo, path)
                rec_algo = _AlgoKey('file', path)

            _logger.info('recommending with %s for %d users (n_jobs=%s)', astr,
                         len(users), n_jobs)
            timer = util.Stopwatch()
            results = loop(
                delayed(_recommend_user)(rec_algo, user, n, candidates(user))
                for user in users)

            results = pd.concat(results, ignore_index=True, copy=False)
            _logger.info('recommended for %d users in %s', len(users), timer)
    finally:
        # Runs on success and on failure; path is None when no file was created.
        util.delete_sometime(path)

    return results
Пример #5
0
def recommend(algo, users, n, candidates=None, *, n_jobs=None, **kwargs):
    """
    Batch-recommend for multiple users.  The provided algorithm should be a
    :py:class:`algorithms.Recommender`.

    Args:
        algo: the algorithm
        users(array-like): the users to recommend for
        n(int): the number of recommendations to generate (None for unlimited)
        candidates:
            the users' candidate sets. This can be a function, in which case it will
            be passed each user ID; it can also be a dictionary, in which case user
            IDs will be looked up in it.  Pass ``None`` to use the recommender's
            built-in candidate selector (usually recommended).
        n_jobs(int):
            The number of processes to use for parallel recommendations.  Passed as
            ``n_jobs`` to :class:`joblib.Parallel`.  The default, ``None``, is resolved
            by calling :func:`util.proc_count` with ``None``, so the process will be
            sequential *unless* called inside the :func:`joblib.parallel_backend`
            context manager or the ``LK_NUM_PROCS`` environment variable is set.
        kwargs:
            Accepted for backwards compatibility.  ``nprocs`` is used as ``n_jobs``
            when ``n_jobs`` is ``None``, and ``ratings`` is rejected as unsupported;
            both emit a :class:`DeprecationWarning`.  No other entry is read by
            this function.

    Returns:
        A frame with at least the columns ``user``, ``rank``, and ``item``; possibly also
        ``score``, and any other columns returned by the recommender.
    """

    if n_jobs is None and 'nprocs' in kwargs:
        n_jobs = kwargs['nprocs']
        # stacklevel=2 attributes the warning to the caller; otherwise it is
        # attributed to this module, and the default filters only show
        # DeprecationWarning raised from __main__.
        warnings.warn('nprocs is deprecated, use n_jobs', DeprecationWarning,
                      stacklevel=2)

    if n_jobs is None:
        n_jobs = util.proc_count(None)

    rec_algo = Recommender.adapt(algo)
    if candidates is None and rec_algo is not algo:
        warnings.warn(
            'no candidates provided and algo is not a recommender, unlikely to work',
            stacklevel=2
        )
    del algo  # don't need reference any more

    if 'ratings' in kwargs:
        warnings.warn('Providing ratings to recommend is not supported',
                      DeprecationWarning, stacklevel=2)

    candidates = __standard_cand_fun(candidates)

    loop = Parallel(n_jobs=n_jobs)

    # NOTE(review): path is never reassigned in this function, so the cleanup in
    # the finally clause always receives None -- looks like a leftover; confirm
    # against util.delete_sometime before removing.
    path = None
    try:
        _logger.debug('activating recommender loop')
        with loop:
            # Request an in-process store when only one job will effectively run.
            store = get_store(in_process=loop._effective_n_jobs() == 1)
            _logger.info('using model store %s', store)
            # Capture the description now: rec_algo is deleted once it is stored.
            astr = str(rec_algo)

            with store:
                # Put the algorithm in the store and drop the local reference; the
                # tasks receive the store client and the key instead of the object.
                key = store.put_model(rec_algo)
                del rec_algo
                client = store.client()

                _logger.info('recommending with %s for %d users (n_jobs=%s)',
                             astr, len(users), n_jobs)
                timer = util.Stopwatch()
                results = loop(
                    delayed(_recommend_user)(client, key, user, n,
                                             candidates(user))
                    for user in users)

            results = pd.concat(results, ignore_index=True, copy=False)
            _logger.info('recommended for %d users in %s', len(users), timer)
    finally:
        util.delete_sometime(path)

    return results