def test_sparse_matrix(rng):
    """The CSR from sparse_ratings should faithfully encode the rating frame."""
    ratings = ml_test.ratings
    matrix, users, items = sparse_ratings(ratings)

    # dimensions must match the distinct users and items
    assert matrix.nrows == len(users)
    assert matrix.nrows == ratings.user.nunique()
    assert matrix.ncols == len(items)
    assert matrix.ncols == ratings.item.nunique()

    # row pointers should line up with cumulative per-user rating counts
    sizes = ratings.groupby('user').item.count()
    offsets = sizes.loc[users].cumsum()
    assert all(matrix.rowptrs[1:] == offsets.values)

    # spot-check stored values against the original frame
    ratings = ratings.set_index(['user', 'item'])
    for user in rng.choice(users, size=50):
        row = users.get_loc(user)
        stored = pd.Series(matrix.row_vs(row), items[matrix.row_cs(row)])
        expected = ratings.loc[user]['rating']
        stored, expected = stored.align(expected)
        assert not any(stored.isna())
        assert not any(expected.isna())
        assert all(stored == expected)
def fit(self, ratings, **kwargs):
    """
    Train a model.

    The model-training process depends on ``save_nbrs`` and ``min_sim``,
    but *not* on other algorithm parameters.

    Args:
        ratings(pandas.DataFrame):
            (user,item,rating) data for computing item similarities.
    """
    util.check_env()
    # Fitting has two stages: mean-center and unit-normalize the item
    # vectors, then compute similarities as pairwise dot products.
    self._timer = util.Stopwatch()
    _logger.debug('[%s] beginning fit, memory use %s', self._timer, util.max_memory())
    _logger.debug('[%s] using CSR kernel %s', self._timer, csrk.name)

    rate_mat, uidx, iidx = sparse_ratings(ratings)
    nitems = len(iidx)
    _logger.info(
        '[%s] made sparse matrix for %d items (%d ratings from %d users)',
        self._timer, len(iidx), rate_mat.nnz, len(uidx))
    _logger.debug('[%s] made matrix, memory use %s', self._timer, util.max_memory())

    norm_mat, means = self._mean_center(ratings, rate_mat, iidx)
    _logger.debug('[%s] centered, memory use %s', self._timer, util.max_memory())

    norm_mat = self._normalize(norm_mat)
    _logger.debug('[%s] normalized, memory use %s', self._timer, util.max_memory())

    _logger.info('[%s] computing similarity matrix', self._timer)
    sims = self._compute_similarities(norm_mat)
    _logger.debug('[%s] computed, memory use %s', self._timer, util.max_memory())

    # a row with a nonzero extent has at least one neighbor
    _logger.info('[%s] got neighborhoods for %d of %d items',
                 self._timer, np.sum(np.diff(sims.rowptrs) > 0), nitems)
    _logger.info('[%s] computed %d neighbor pairs', self._timer, sims.nnz)

    self.item_index_ = iidx
    self.item_means_ = means
    self.item_counts_ = np.diff(sims.rowptrs)
    self.sim_matrix_ = sims
    self.user_index_ = uidx
    self.rating_matrix_ = rate_mat

    # an inverted similarity matrix lets later lookups scan efficiently
    self._sim_inv_ = sims.transpose()
    _logger.info('[%s] transposed matrix for optimization', self._timer)
    _logger.debug('[%s] done, memory use %s', self._timer, util.max_memory())

    return self
def test_sparse_matrix_implicit():
    """Rating frames without a rating column should yield a value-less matrix."""
    ratings = ml_test.ratings
    ratings = ratings.loc[:, ['user', 'item']]
    matrix, users, items = sparse_ratings(ratings)

    assert matrix.nrows == len(users)
    assert matrix.nrows == ratings.user.nunique()
    assert matrix.ncols == len(items)
    assert matrix.ncols == ratings.item.nunique()
    # implicit-feedback input stores no rating values
    assert matrix.values is None
def test_sparse_matrix_scipy_implicit():
    """SciPy output for implicit data should contain all-unit values."""
    ratings = ml_test.ratings
    ratings = ratings.loc[:, ['user', 'item']]
    matrix, users, items = sparse_ratings(ratings, scipy=True)

    assert sps.issparse(matrix)
    assert sps.isspmatrix_csr(matrix)
    assert len(users) == ratings.user.nunique()
    assert len(items) == ratings.item.nunique()
    # with no rating column, every stored entry is 1.0
    assert all(matrix.data == 1.0)
def test_sparse_matrix_scipy(format, sps_fmt_checker):
    """sparse_ratings should honor the requested SciPy matrix format."""
    ratings = ml_test.ratings
    matrix, users, items = sparse_ratings(ratings, scipy=format)

    assert sps.issparse(matrix)
    assert sps_fmt_checker(matrix)
    assert len(users) == ratings.user.nunique()
    assert len(items) == ratings.item.nunique()

    # row pointers should line up with cumulative per-user rating counts
    counts = ratings.groupby('user').item.count()
    counts = counts.loc[users].cumsum()
    # COO has no indptr; convert before checking row extents
    if sps.isspmatrix_coo(matrix):
        matrix = matrix.tocsr()
    assert all(matrix.indptr[1:] == counts.values)
def fit(self, ratings, **kwargs):
    """Train the model from (user, item) interactions and return ``self``."""
    watch = util.Stopwatch()
    rand = util.rng(self.rng_spec)

    # only the interaction pairs matter; rating values are ignored
    mat, uindex, iindex = sparse_ratings(ratings[['user', 'item']])

    _log.info('[%s] setting up model', watch)
    trainer, net = self._build_model(len(uindex), len(iindex))

    _log.info('[%s] preparing training dataset', watch)
    inputs = BprInputs(mat, self.batch_size, self.neg_count,
                       self.neg_weight, rand)

    _log.info('[%s] training model', watch)
    trainer.fit(inputs, epochs=self.epochs)
    _log.info('[%s] model finished', watch)

    self.user_index_ = uindex
    self.item_index_ = iindex
    self.model = net
    return self
def test_sparse_matrix_indexes(rng):
    """Caller-supplied user/item indexes should be used verbatim."""
    ratings = ml_test.ratings
    users = pd.Index(rng.permutation(ratings['user'].unique()))
    items = pd.Index(rng.permutation(ratings['item'].unique()))
    matrix, ridx, cidx = sparse_ratings(ratings, users=users, items=items)

    # the exact index objects must come back, not copies
    assert ridx is users
    assert cidx is items
    assert len(ridx) == ratings.user.nunique()
    assert len(cidx) == ratings.item.nunique()

    # spot-check stored values against the original frame
    ratings = ratings.set_index(['user', 'item'])
    for user in rng.choice(ridx, size=50):
        row = ridx.get_loc(user)
        stored = pd.Series(matrix.row_vs(row), cidx[matrix.row_cs(row)])
        expected = ratings.loc[user]['rating']
        stored, expected = stored.align(expected)
        assert not any(stored.isna())
        assert not any(expected.isna())
        assert all(stored == expected)