예제 #1
0
def test_sparse_matrix(rng):
    """Check the matrix built by ``sparse_ratings`` against the source ratings.

    Verifies the matrix dimensions, the row pointers, and the stored values
    for 50 users drawn with ``rng.choice``.

    Args:
        rng: random generator exposing ``choice`` (presumably a pytest
            fixture; confirm against the test module's conftest).
    """
    frame = ml_test.ratings
    matrix, users, items = sparse_ratings(frame)

    # one row per distinct user, one column per distinct item
    assert matrix.nrows == len(users)
    assert matrix.nrows == frame.user.nunique()
    assert matrix.ncols == len(items)
    assert matrix.ncols == frame.item.nunique()

    # row pointers must equal the running total of per-user item counts,
    # taken in the order of the returned user index
    per_user = frame.groupby('user').item.count()
    expected_ptrs = per_user.loc[users].cumsum()
    assert all(matrix.rowptrs[1:] == expected_ptrs.values)

    # compare the stored values of sampled rows with the original ratings
    by_pair = frame.set_index(['user', 'item'])
    for user in rng.choice(users, size=50):
        row = users.get_loc(user)
        stored = pd.Series(matrix.row_vs(row), items[matrix.row_cs(row)])
        expected = by_pair.loc[user]['rating']
        # after alignment a NaN on either side means an item exists in only
        # one of the two sources
        stored, expected = stored.align(expected)
        assert not any(stored.isna())
        assert not any(expected.isna())
        assert all(stored == expected)
예제 #2
0
    def fit(self, ratings, **kwargs):
        """
        Fit the item-similarity model to rating data.

        Only ``save_nbrs`` and ``min_sim`` influence training; the other algorithm
        parameters do *not*.

        Args:
            ratings(pandas.DataFrame):
                (user,item,rating) data for computing item similarities.
            kwargs:
                accepted but not read by this method.

        Returns:
            The fitted object itself (``self``).
        """
        util.check_env()
        # Two phases:
        # 1. mean-center and unit-normalize the item vectors
        # 2. take pairwise dot products to obtain the similarities
        clock = util.Stopwatch()
        self._timer = clock

        _logger.debug('[%s] beginning fit, memory use %s', clock, util.max_memory())
        _logger.debug('[%s] using CSR kernel %s', clock, csrk.name)

        raw_matrix, user_index, item_index = sparse_ratings(ratings)
        item_total = len(item_index)
        _logger.info('[%s] made sparse matrix for %d items (%d ratings from %d users)',
                     clock, item_total, raw_matrix.nnz, len(user_index))
        _logger.debug('[%s] made matrix, memory use %s', clock, util.max_memory())

        # `work` is rebound at each stage so earlier intermediates are not
        # kept alive longer than necessary
        work, means = self._mean_center(ratings, raw_matrix, item_index)
        _logger.debug('[%s] centered, memory use %s', clock, util.max_memory())

        work = self._normalize(work)
        _logger.debug('[%s] normalized, memory use %s', clock, util.max_memory())

        _logger.info('[%s] computing similarity matrix', clock)
        sims = self._compute_similarities(work)
        _logger.debug('[%s] computed, memory use %s', clock, util.max_memory())

        # number of stored neighbors per row of the similarity matrix
        nbr_counts = np.diff(sims.rowptrs)
        _logger.info('[%s] got neighborhoods for %d of %d items', clock,
                     np.sum(nbr_counts > 0), item_total)

        _logger.info('[%s] computed %d neighbor pairs', clock, sims.nnz)

        self.item_index_ = item_index
        self.item_means_ = means
        self.item_counts_ = nbr_counts
        self.sim_matrix_ = sims
        self.user_index_ = user_index
        self.rating_matrix_ = raw_matrix
        # keep a transposed copy of the similarities for efficient scanning
        self._sim_inv_ = sims.transpose()
        _logger.info('[%s] transposed matrix for optimization', clock)
        _logger.debug('[%s] done, memory use %s', clock, util.max_memory())

        return self
예제 #3
0
def test_sparse_matrix_implicit():
    """Check ``sparse_ratings`` on a frame holding only user and item columns.

    With no rating column supplied, the resulting matrix is expected to have
    the usual dimensions and ``values`` set to ``None``.
    """
    pairs = ml_test.ratings.loc[:, ['user', 'item']]
    matrix, users, items = sparse_ratings(pairs)

    # dimensions agree with both the returned indexes and the source data
    assert matrix.nrows == len(users)
    assert matrix.nrows == pairs.user.nunique()
    assert matrix.ncols == len(items)
    assert matrix.ncols == pairs.item.nunique()

    # no rating column in the input, so no value array in the matrix
    assert matrix.values is None
예제 #4
0
def test_sparse_matrix_scipy_implicit():
    """Check the SciPy output of ``sparse_ratings`` for user/item-only input.

    The result must be a SciPy CSR matrix whose stored entries all equal 1.0,
    with indexes covering every distinct user and item.
    """
    pairs = ml_test.ratings.loc[:, ['user', 'item']]
    matrix, users, items = sparse_ratings(pairs, scipy=True)

    # matrix type
    assert sps.issparse(matrix)
    assert sps.isspmatrix_csr(matrix)

    # index sizes
    assert len(users) == pairs.user.nunique()
    assert len(items) == pairs.item.nunique()

    # every stored entry is filled with 1.0
    assert all(matrix.data == 1.0)
예제 #5
0
def test_sparse_matrix_scipy(format, sps_fmt_checker):
    """Check the SciPy output of ``sparse_ratings`` for a requested format.

    Args:
        format: value forwarded as the ``scipy`` argument of
            ``sparse_ratings``.
        sps_fmt_checker: callable returning a truthy value when the matrix is
            in the expected SciPy format.
    """
    frame = ml_test.ratings
    matrix, users, items = sparse_ratings(frame, scipy=format)

    assert sps.issparse(matrix)
    assert sps_fmt_checker(matrix)
    assert len(users) == frame.user.nunique()
    assert len(items) == frame.item.nunique()

    # row pointers must equal the running total of per-user item counts,
    # taken in the order of the returned user index
    per_user = frame.groupby('user').item.count()
    expected_ptrs = per_user.loc[users].cumsum()
    # a COO matrix is converted to CSR before its `indptr` is read
    csr = matrix.tocsr() if sps.isspmatrix_coo(matrix) else matrix
    assert all(csr.indptr[1:] == expected_ptrs.values)
예제 #6
0
    def fit(self, ratings, **kwargs):
        """
        Train the model on user-item interactions.

        Only the ``user`` and ``item`` columns of ``ratings`` are used; any
        other columns are dropped before the sparse matrix is built.

        Args:
            ratings: frame with at least ``user`` and ``item`` columns.
            kwargs: accepted but not read by this method.

        Returns:
            The fitted object itself (``self``).
        """
        clock = util.Stopwatch()
        random_state = util.rng(self.rng_spec)

        interactions = ratings[['user', 'item']]
        rmat, user_index, item_index = sparse_ratings(interactions)

        _log.info('[%s] setting up model', clock)
        # `_build_model` returns the object that is trained and the model
        # that is stored on the instance
        trainer, net = self._build_model(len(user_index), len(item_index))

        _log.info('[%s] preparing training dataset', clock)
        batches = BprInputs(
            rmat, self.batch_size, self.neg_count, self.neg_weight, random_state
        )

        _log.info('[%s] training model', clock)
        trainer.fit(batches, epochs=self.epochs)

        _log.info('[%s] model finished', clock)

        self.model = net
        self.item_index_ = item_index
        self.user_index_ = user_index

        return self
예제 #7
0
def test_sparse_matrix_indexes(rng):
    """Check that ``sparse_ratings`` honors caller-supplied user/item indexes.

    Shuffled indexes are passed in; the function must hand back those very
    objects, and the stored values must still match the original ratings for
    50 users drawn with ``rng.choice``.

    Args:
        rng: random generator exposing ``permutation`` and ``choice``
            (presumably a pytest fixture; confirm against conftest).
    """
    frame = ml_test.ratings
    # shuffle users first, then items, so the generator is consumed in a
    # fixed order
    given_users = pd.Index(rng.permutation(frame['user'].unique()))
    given_items = pd.Index(rng.permutation(frame['item'].unique()))

    matrix, users, items = sparse_ratings(frame, users=given_users, items=given_items)

    # the supplied indexes are returned as-is, not copied
    assert users is given_users
    assert items is given_items
    assert len(users) == frame.user.nunique()
    assert len(items) == frame.item.nunique()

    # compare the stored values of sampled rows with the original ratings
    by_pair = frame.set_index(['user', 'item'])
    for user in rng.choice(users, size=50):
        row = users.get_loc(user)
        stored = pd.Series(matrix.row_vs(row), items[matrix.row_cs(row)])
        expected = by_pair.loc[user]['rating']
        # after alignment a NaN on either side means an item exists in only
        # one of the two sources
        stored, expected = stored.align(expected)
        assert not any(stored.isna())
        assert not any(expected.isna())
        assert all(stored == expected)