def _scipy_sim_block(block, bsp, bep, rmat, min_sim, max_nbrs, nitems):
    """
    Compute a single block of the similarity matrix with SciPy.

    Args:
        block: the block of the transposed rating matrix (rows are items bsp:bep).
        bsp: the starting item index of this block.
        bep: the ending item index (exclusive) of this block.
        rmat: the full rating matrix (SciPy sparse).
        min_sim: the minimum similarity to retain.
        max_nbrs: the maximum number of neighbors to retain per item.
        nitems: the total number of items (column count of the result).

    Returns:
        the computed block of similarities.
    """
    assert block.nrows == bep - bsp
    _logger.debug('processing block %d:%d (%d nnz)', bsp, bep, block.nnz)
    if rmat.nnz == 0:
        # no ratings at all — this block of similarities is empty
        return _empty_csr(block.nrows, nitems, np.zeros(block.nrows, np.int32))

    # multiply the full matrix by this block's transpose to get raw
    # similarity scores for these items against all items
    sims = rmat @ block.to_scipy().transpose()
    sims = matrix.CSR.from_scipy(sims)

    r_sp = sims.rowptrs[:-1]
    r_ep = sims.rowptrs[1:]
    r_cs = sims.colinds
    r_vs = sims.values

    # filter by min_sim and truncate to max_nbrs neighbors per row
    block_csr = _make_sim_block(nitems, bsp, block.nrows, r_sp, r_ep, r_cs, r_vs,
                                min_sim, max_nbrs)
    # fix: was a placeholder 'umm' message; also dropped a debug line that
    # eagerly constructed matrix.CSR(N=block_csr) on every call, even with
    # debug logging disabled (log arguments are evaluated eagerly)
    _logger.debug('block result has %d rows (expected %d)',
                  block_csr.nrows, block.nrows)
    assert block_csr.nrows == block.nrows
    assert block_csr.ncols == nitems
    _logger.debug('block %d:%d has %d similarities', bsp, bep, block_csr.nnz)
    return block_csr
def _select_similarities(self, nitems, rows, cols, vals):
    """
    Assemble COO similarity data into a CSR matrix, keeping at most
    ``save_nbrs`` of the largest similarities in each row.
    """
    _logger.info('[%s] ordering similarities', self._timer)
    smat = matrix.CSR.from_coo(rows, cols, vals, shape=(nitems, nitems))
    smat.sort_values()

    if self.save_nbrs is None or self.save_nbrs <= 0:
        # no neighbor limit configured — keep everything
        return smat

    _logger.info('[%s] picking %d top similarities', self._timer, self.save_nbrs)
    sizes = smat.row_nnzs()
    _logger.debug('have %d rows in size range [%d,%d]',
                  len(sizes), np.min(sizes), np.max(sizes))
    keep = np.fmin(sizes, self.save_nbrs)
    _logger.debug('will have %d rows in size range [%d,%d]',
                  len(keep), np.min(keep), np.max(keep))
    assert np.all(keep <= self.save_nbrs)
    assert np.all(keep >= 0)

    # build the truncated CSR structure: each row keeps its first
    # keep[i] entries (rows were sorted by value above)
    total = np.sum(keep)
    new_rps = np.zeros_like(smat.rowptrs)
    new_rps[1:] = np.cumsum(keep)
    new_cis = np.zeros(total, np.int32)
    new_vs = np.zeros(total)

    for row in range(nitems):
        src = smat.rowptrs[row]
        dst = new_rps[row]
        src_end = src + keep[row]
        dst_end = dst + keep[row]
        assert src_end - src == dst_end - dst
        new_cis[dst:dst_end] = smat.colinds[src:src_end]
        new_vs[dst:dst_end] = smat.values[src:src_end]

    return matrix.CSR(smat.nrows, smat.ncols, total, new_rps, new_cis, new_vs)
def _compute_similarities_unlearn_min_centering_matrix_vectorize(
        self, rmat_scipy, items, users):
    """
    Precompute sufficient statistics for vectorized unlearning-based
    similarity computation, then learn the similarity matrix.
    """
    n_users = len(users)
    n_items = len(items)

    # binary indicator matrix: 1 wherever a rating is present
    # NOTE(review): this assumes ratings are strictly positive — confirm
    rmat_mask = rmat_scipy.copy()
    rmat_mask[rmat_scipy > 0] = 1

    # pairwise item statistics
    self.S_I_sparse = rmat_scipy.transpose() @ rmat_mask
    self.S_II_sparse = rmat_scipy.transpose() @ rmat_scipy
    self.N_II_sparse = rmat_mask.transpose() @ rmat_mask

    def _wrap(sp):
        "Wrap a SciPy CSR matrix in the project CSR type."
        return matrix.CSR(sp.shape[0], sp.shape[1], sp.nnz,
                          sp.indptr.copy(), sp.indices.copy(), sp.data)

    self.S_I_matrix = _wrap(self.S_I_sparse)
    self.S_II_matrix = _wrap(self.S_II_sparse)
    self.N_II_matrix = _wrap(self.N_II_sparse)

    # per-item rating counts, sums, and means
    self.N_I = rmat_mask.sum(axis=0)
    self.Sum_I = rmat_scipy.sum(axis=0)
    self.M_I = self.Sum_I / self.N_I
    self.M_I_sparse = sps.csr_matrix(self.M_I)
    self.N_I_sparse = sps.csr_matrix(self.N_I)
    self.Sum_I_sparse = sps.csr_matrix(self.Sum_I)

    self.N = n_users
    self.M = n_items

    self.smat_unlearn_sparse_csr = self._learn_sim_vectorize()
def _mean_center(self, ratings, rmat, items):
    """
    Subtract each item's mean rating from the rating matrix values.

    Returns:
        tuple: the centered matrix and the array of per-item means,
        or the original matrix and ``None`` when centering is disabled.
    """
    if not self.center:
        return rmat, None

    means = ratings.groupby('item').rating.mean()
    # align means with the item index; items without ratings become NaN
    means = means.reindex(items).values
    centered = rmat.values - means[rmat.colinds]
    result = matrix.CSR(rmat.nrows, rmat.ncols, rmat.nnz,
                        rmat.rowptrs.copy(), rmat.colinds.copy(), centered)
    _logger.info('[%s] computed means for %d items', self._timer, len(means))
    return result, means
def _compute_similarities(self, rmat):
    """
    Compute the item-item similarity matrix from a rating matrix.

    Args:
        rmat: the (user x item) rating matrix (project CSR type).

    Returns:
        the (item x item) similarity matrix as a CSR.
    """
    # work on the transpose so rows correspond to items
    trmat = rmat.transpose()
    nitems = trmat.nrows
    m_nbrs = self.save_nbrs
    if m_nbrs is None or m_nbrs < 0:
        # normalize "no limit" to 0 for the block routines
        m_nbrs = 0

    # split items into blocks of up to 1000 rows for block-wise computation
    bounds = _make_blocks(nitems, 1000)
    _logger.info('[%s] splitting %d items (%d ratings) into %d blocks',
                 self._timer, nitems, trmat.nnz, len(bounds))
    blocks = [trmat.subset_rows(sp, ep) for (sp, ep) in bounds]

    if self._use_mkl and _mkl_ops is not None:
        _logger.info('[%s] computing similarities with MKL', self._timer)
        ptrs = List(bounds)
        nbs = List(b.N for b in blocks)
        if not nbs:
            # oops, this is the bad place
            # in non-JIT node, List doesn't actually make the list
            nbs = [b.N for b in blocks]
            ptrs = bounds
        s_blocks = _mkl_sim_blocks(trmat.N, nbs, ptrs, self.min_sim, m_nbrs)
    else:
        s_blocks = _scipy_sim_blocks(trmat.to_scipy(), blocks, bounds, self.min_sim, m_nbrs)

    # wrap the low-level block structures in the CSR class
    s_blocks = [matrix.CSR(N=b) for b in s_blocks]
    nnz = sum(b.nnz for b in s_blocks)
    tot_rows = sum(b.nrows for b in s_blocks)
    _logger.info('[%s] computed %d similarities for %d items in %d blocks',
                 self._timer, nnz, tot_rows, len(s_blocks))
    row_nnzs = np.concatenate([b.row_nnzs() for b in s_blocks])
    assert len(row_nnzs) == nitems, \
        'only have {} rows for {} items'.format(len(row_nnzs), nitems)

    # assemble the per-block results into one similarity matrix;
    # blocks are contiguous row ranges, so we copy each into its slot
    smat = matrix.CSR.empty((nitems, nitems), row_nnzs, rpdtype=np.int64)
    start = 0
    for bi, b in enumerate(s_blocks):
        bnr = b.nrows
        end = start + bnr
        v_sp = smat.rowptrs[start]
        v_ep = smat.rowptrs[end]
        _logger.debug('block %d (%d:%d) has %d entries, storing in %d:%d',
                      bi, start, end, b.nnz, v_sp, v_ep)
        smat.colinds[v_sp:v_ep] = b.colinds
        smat.values[v_sp:v_ep] = b.values
        start = end

    _logger.info('[%s] sorting similarity matrix with %d entries', self._timer, smat.nnz)
    _sort_nbrs(smat.N)

    return smat
def _mkl_similarities(self, rmat):
    """
    Compute the item-item similarity matrix using MKL sparse routines.

    Args:
        rmat: the (user x item) rating matrix (project CSR type, with values).

    Returns:
        the (item x item) similarity matrix as a CSR.
    """
    assert rmat.values is not None
    _logger.info('[%s] multiplying matrix with MKL', self._timer)

    m_nbrs = self.save_nbrs
    if m_nbrs is None or m_nbrs < 0:
        # normalize "no limit" to 0 for the block routine
        m_nbrs = 0

    # work on the transpose so rows correspond to items
    trmat = rmat.transpose()
    nitems = trmat.nrows
    # for i in range(nitems):
    #     _logger.debug('verifying row %d', i)
    #     cs = trmat.row_cs(i)
    #     assert np.all(cs >= 0)
    #     assert np.all(cs < trmat.ncols)
    #     assert pd.Series(cs).nunique() == len(cs)
    _logger.debug('[%s] transposed, memory use %s', self._timer, util.max_memory())

    s_blocks = _mkl_sim_blocks(trmat.N, self.min_sim, m_nbrs)
    _logger.debug('[%s] computed blocks, memory use %s', self._timer, util.max_memory())
    # each result entry is a (block, start, end) triple; only the block is needed here
    s_blocks = [matrix.CSR(N=b) for (b, bs, be) in s_blocks]
    nnz = sum(b.nnz for b in s_blocks)
    tot_rows = sum(b.nrows for b in s_blocks)
    _logger.info('[%s] computed %d similarities for %d items in %d blocks',
                 self._timer, nnz, tot_rows, len(s_blocks))
    row_nnzs = np.concatenate([b.row_nnzs() for b in s_blocks])
    assert len(row_nnzs) == nitems, \
        'only have {} rows for {} items'.format(len(row_nnzs), nitems)

    # assemble the per-block results into one similarity matrix;
    # blocks are contiguous row ranges, so we copy each into its slot
    smat = matrix.CSR.empty((nitems, nitems), row_nnzs, rpdtype=np.int64)
    start = 0
    for bi, b in enumerate(s_blocks):
        bnr = b.nrows
        end = start + bnr
        v_sp = smat.rowptrs[start]
        v_ep = smat.rowptrs[end]
        _logger.debug('block %d (%d:%d) has %d entries, storing in %d:%d',
                      bi, start, end, b.nnz, v_sp, v_ep)
        smat.colinds[v_sp:v_ep] = b.colinds
        smat.values[v_sp:v_ep] = b.values
        start = end

    _logger.info('[%s] sorting similarity matrix with %d entries', self._timer, smat.nnz)
    _sort_nbrs(smat.N)

    return smat
def test_csr64_pickle(csr):
    "Round-trip a CSR with 64-bit row pointers through pickle."
    csr = lm.CSR(csr.nrows, csr.ncols, csr.nnz,
                 csr.rowptrs.astype(np.int64), csr.colinds, csr.values)

    rt = pickle.loads(pickle.dumps(csr))

    assert rt.nrows == csr.nrows
    assert rt.ncols == csr.ncols
    assert rt.nnz == csr.nnz
    assert all(rt.rowptrs == csr.rowptrs)
    assert rt.rowptrs.dtype == np.int64
    assert all(rt.colinds == csr.colinds)
    if csr.values is None:
        assert rt.values is None
    else:
        assert all(rt.values == csr.values)
def test_mkl_mabt():
    "Test MKL sparse A @ B.T against SciPy on random matrices."
    for _ in range(50):
        A = lktu.rand_csr(20, 10, nnz=50)
        B = lktu.rand_csr(5, 10, nnz=20)
        a_h = mkl_ops.SparseM.from_csr(A)
        b_h = mkl_ops.SparseM.from_csr(B)
        c_h = mkl_ops._lk_mkl_spmabt(a_h.ptr, b_h.ptr)
        C = lm.CSR(N=mkl_ops._to_csr(c_h))
        assert C.nrows == 20
        assert C.ncols == 5

        expected = (A.to_scipy() @ B.to_scipy().T).toarray()
        assert C.to_scipy().toarray() == approx(expected)
def _mean_center(self, ratings, rmat, items, users):
    """
    Subtract each item's mean rating from the rating matrix values.

    Args:
        ratings: the rating frame (with 'item' and 'rating' columns).
        items: the item index, aligned with the columns of ``rmat``.
        users: the user index; currently unused (kept for interface
            compatibility with callers).

    Returns:
        tuple: the centered matrix and the array of per-item means,
        or the original matrix and ``None`` when centering is disabled.
    """
    if not self.center:
        return rmat, None

    item_means = ratings.groupby('item').rating.mean()
    # align means with the item index; items without ratings become NaN
    item_means = item_means.reindex(items).values
    # fix: removed dead computation of user means and the global mean
    # (and the commented-out user/global centering) — they were computed
    # on every call but never used by this item-mean centering
    mcvals = rmat.values - item_means[rmat.colinds]
    nmat = matrix.CSR(rmat.nrows, rmat.ncols, rmat.nnz,
                      rmat.rowptrs.copy(), rmat.colinds.copy(), mcvals)
    _logger.info('[%s] computed means for %d items', self._timer, len(item_means))
    return nmat, item_means
def test_mkl_mabt(mkl_ops, data):
    "Property test: MKL sparse A @ B.T matches SciPy on random shapes."
    a_rows = data.draw(st.integers(5, 100))
    shared = data.draw(st.integers(5, 100))
    b_rows = data.draw(st.integers(5, 100))
    A = data.draw(lktu.csrs(nrows=a_rows, ncols=shared, values=True))
    B = data.draw(lktu.csrs(nrows=b_rows, ncols=shared, values=True))

    a_h = mkl_ops.SparseM.from_csr(A)
    b_h = mkl_ops.SparseM.from_csr(B)
    c_h = mkl_ops._lk_mkl_spmabt(a_h.ptr, b_h.ptr)
    C = lm.CSR(N=mkl_ops._to_csr(c_h))
    assert C.nrows == A.nrows
    assert C.ncols == B.nrows

    expected = (A.to_scipy() @ B.to_scipy().T).toarray()
    assert C.to_scipy().toarray() == approx(expected)
def test_csr64_pickle(values):
    "Round-trip a randomly generated 64-bit CSR through pickle."
    orig = rand_csr(100, 50, 1000, values=values)
    orig = lm.CSR(orig.nrows, orig.ncols, orig.nnz,
                  orig.rowptrs.astype(np.int64), orig.colinds, orig.values)
    assert orig.nrows == 100
    assert orig.ncols == 50
    assert orig.nnz == 1000

    rt = pickle.loads(pickle.dumps(orig))

    assert rt.nrows == orig.nrows
    assert rt.ncols == orig.ncols
    assert rt.nnz == orig.nnz
    assert all(rt.rowptrs == orig.rowptrs)
    assert rt.rowptrs.dtype == np.int64
    assert all(rt.colinds == orig.colinds)
    if values:
        assert all(rt.values == orig.values)
    else:
        assert rt.values is None