Пример #1
0
def _scipy_sim_block(block, bsp, bep, rmat, min_sim, max_nbrs, nitems):
    "Compute a single block of the similarity matrix"
    # The block is expected to cover exactly the item rows [bsp, bep).
    assert block.nrows == bep - bsp

    _logger.debug('processing block %d:%d (%d nnz)', bsp, bep, block.nnz)

    # With no ratings at all there is nothing to multiply — return an
    # all-empty block of the right shape.
    if rmat.nnz == 0:
        return _empty_csr(block.nrows, nitems, np.zeros(block.nrows, np.int32))

    # Raw similarity scores: rating matrix times the transposed block.
    raw = matrix.CSR.from_scipy(rmat @ block.to_scipy().transpose())

    starts = raw.rowptrs[:-1]
    ends = raw.rowptrs[1:]
    cols = raw.colinds
    vals = raw.values

    # Filter/truncate the raw scores into the final block.
    result = _make_sim_block(nitems, bsp, block.nrows, starts, ends, cols,
                             vals, min_sim, max_nbrs)
    _logger.debug('umm %d %d', result.nrows, block.nrows)
    assert result.nrows == block.nrows
    assert result.ncols == nitems
    _logger.debug('block %d:%d has %d similarities', bsp, bep, result.nnz)
    _logger.debug('block: %s', matrix.CSR(N=result))

    return result
Пример #2
0
    def _select_similarities(self, nitems, rows, cols, vals):
        """Assemble the similarity matrix and truncate each row to at most
        ``save_nbrs`` entries (keeping the leading, sorted values)."""
        _logger.info('[%s] ordering similarities', self._timer)
        csr = matrix.CSR.from_coo(rows, cols, vals, shape=(nitems, nitems))
        csr.sort_values()

        # No neighborhood limit configured — keep everything.
        if self.save_nbrs is None or self.save_nbrs <= 0:
            return csr

        _logger.info('[%s] picking %d top similarities', self._timer, self.save_nbrs)
        sizes = csr.row_nnzs()
        _logger.debug('have %d rows in size range [%d,%d]',
                      len(sizes), np.min(sizes), np.max(sizes))
        # Clamp each row's length at the configured neighborhood size.
        kept = np.fmin(sizes, self.save_nbrs)
        _logger.debug('will have %d rows in size range [%d,%d]',
                      len(kept), np.min(kept), np.max(kept))
        assert np.all(kept <= self.save_nbrs)
        assert np.all(kept >= 0)
        total = np.sum(kept)

        # Build the truncated CSR: new row pointers from the clamped
        # counts, then copy each row's leading (already-sorted) entries.
        new_rps = np.zeros_like(csr.rowptrs)
        new_rps[1:] = np.cumsum(kept)
        new_cis = np.zeros(total, np.int32)
        new_vs = np.zeros(total)
        for row in range(nitems):
            src = csr.rowptrs[row]
            dst = new_rps[row]
            count = kept[row]
            new_cis[dst:dst + count] = csr.colinds[src:src + count]
            new_vs[dst:dst + count] = csr.values[src:src + count]

        return matrix.CSR(csr.nrows, csr.ncols, total, new_rps, new_cis, new_vs)
Пример #3
0
    def _compute_similarities_unlearn_min_centering_matrix_vectorize(
            self, rmat_scipy, items, users):
        """Precompute the co-rating statistics used by the vectorized
        unlearning similarity computation, then learn the similarities.

        Args:
            rmat_scipy: the user-item rating matrix as a SciPy sparse matrix.
            items: the item index (only its length is stored, as ``self.M``).
            users: the user index (only its length is stored, as ``self.N``).
        """
        def _wrap_csr(sp):
            # Wrap a SciPy CSR matrix in the project CSR type.  The index
            # arrays are copied (as in the original code) so SciPy-side
            # mutation cannot corrupt the wrapper; the data array is
            # intentionally shared, preserving the original aliasing.
            return matrix.CSR(sp.shape[0], sp.shape[1], sp.nnz,
                              sp.indptr.copy(), sp.indices.copy(), sp.data)

        N = len(users)
        M = len(items)

        # Binary "was rated" indicator with the same sparsity pattern.
        rmat_mask = rmat_scipy.copy()
        rmat_mask[rmat_scipy > 0] = 1

        # Item-item co-rating statistics:
        #   S_I[i, j]  = sum of ratings of i over users who rated j
        #   S_II[i, j] = dot product of the rating columns of i and j
        #   N_II[i, j] = number of users who rated both i and j
        self.S_I_sparse = rmat_scipy.transpose() @ rmat_mask
        self.S_II_sparse = rmat_scipy.transpose() @ rmat_scipy
        self.N_II_sparse = rmat_mask.transpose() @ rmat_mask

        self.S_I_matrix = _wrap_csr(self.S_I_sparse)
        self.S_II_matrix = _wrap_csr(self.S_II_sparse)
        self.N_II_matrix = _wrap_csr(self.N_II_sparse)

        # Per-item rating counts, sums, and means.
        self.N_I = rmat_mask.sum(axis=0)
        self.Sum_I = rmat_scipy.sum(axis=0)
        self.M_I = self.Sum_I / self.N_I

        self.M_I_sparse = sps.csr_matrix(self.M_I)
        self.N_I_sparse = sps.csr_matrix(self.N_I)
        self.Sum_I_sparse = sps.csr_matrix(self.Sum_I)

        self.N = N
        self.M = M

        self.smat_unlearn_sparse_csr = self._learn_sim_vectorize()
Пример #4
0
    def _mean_center(self, ratings, rmat, items):
        if not self.center:
            return rmat, None

        item_means = ratings.groupby('item').rating.mean()
        item_means = item_means.reindex(items).values
        mcvals = rmat.values - item_means[rmat.colinds]
        nmat = matrix.CSR(rmat.nrows, rmat.ncols, rmat.nnz,
                          rmat.rowptrs.copy(), rmat.colinds.copy(), mcvals)
        _logger.info('[%s] computed means for %d items', self._timer, len(item_means))
        return nmat, item_means
Пример #5
0
    def _compute_similarities(self, rmat):
        # Compute the item-item similarity matrix from the (user x item)
        # rating matrix: split items into blocks, compute each block's
        # similarities (MKL or SciPy), and assemble one final CSR.
        trmat = rmat.transpose()
        nitems = trmat.nrows
        m_nbrs = self.save_nbrs
        # Normalize "unlimited neighbors" (None or negative) to 0 for
        # the block kernels.
        if m_nbrs is None or m_nbrs < 0:
            m_nbrs = 0

        bounds = _make_blocks(nitems, 1000)
        _logger.info('[%s] splitting %d items (%d ratings) into %d blocks',
                     self._timer, nitems, trmat.nnz, len(bounds))
        blocks = [trmat.subset_rows(sp, ep) for (sp, ep) in bounds]

        if self._use_mkl and _mkl_ops is not None:
            _logger.info('[%s] computing similarities with MKL', self._timer)
            # Wrap inputs in numba typed lists for the JIT kernel.
            ptrs = List(bounds)
            nbs = List(b.N for b in blocks)
            if not nbs:
                # oops, this is the bad place
                # in non-JIT node, List doesn't actually make the list
                nbs = [b.N for b in blocks]
                ptrs = bounds
            s_blocks = _mkl_sim_blocks(trmat.N, nbs, ptrs, self.min_sim,
                                       m_nbrs)
        else:
            s_blocks = _scipy_sim_blocks(trmat.to_scipy(), blocks, bounds,
                                         self.min_sim, m_nbrs)

        # Wrap raw block handles in CSR objects and verify the blocks
        # together cover every item row exactly once.
        s_blocks = [matrix.CSR(N=b) for b in s_blocks]
        nnz = sum(b.nnz for b in s_blocks)
        tot_rows = sum(b.nrows for b in s_blocks)
        _logger.info('[%s] computed %d similarities for %d items in %d blocks',
                     self._timer, nnz, tot_rows, len(s_blocks))
        row_nnzs = np.concatenate([b.row_nnzs() for b in s_blocks])
        assert len(row_nnzs) == nitems, \
            'only have {} rows for {} items'.format(len(row_nnzs), nitems)

        # Allocate the final matrix (64-bit row pointers — presumably
        # because the total nnz can exceed 32-bit range) and copy each
        # block's columns and values into its row range.
        smat = matrix.CSR.empty((nitems, nitems), row_nnzs, rpdtype=np.int64)
        start = 0
        for bi, b in enumerate(s_blocks):
            bnr = b.nrows
            end = start + bnr
            v_sp = smat.rowptrs[start]
            v_ep = smat.rowptrs[end]
            _logger.debug('block %d (%d:%d) has %d entries, storing in %d:%d',
                          bi, start, end, b.nnz, v_sp, v_ep)
            smat.colinds[v_sp:v_ep] = b.colinds
            smat.values[v_sp:v_ep] = b.values
            start = end

        _logger.info('[%s] sorting similarity matrix with %d entries',
                     self._timer, smat.nnz)
        # Sort each row's neighbors in place.
        _sort_nbrs(smat.N)

        return smat
Пример #6
0
    def _mkl_similarities(self, rmat):
        """Compute the item-item similarity matrix with the MKL kernels."""
        assert rmat.values is not None

        _logger.info('[%s] multiplying matrix with MKL', self._timer)
        nbr_limit = self.save_nbrs
        # Normalize "no limit" (None or negative) to 0 for the kernel.
        if nbr_limit is None or nbr_limit < 0:
            nbr_limit = 0
        trmat = rmat.transpose()
        nitems = trmat.nrows

        _logger.debug('[%s] transposed, memory use %s', self._timer,
                      util.max_memory())
        raw_blocks = _mkl_sim_blocks(trmat.N, self.min_sim, nbr_limit)
        _logger.debug('[%s] computed blocks, memory use %s', self._timer,
                      util.max_memory())
        # Keep only the block handles; positions are recovered from the
        # block sizes during assembly below.
        blocks = [matrix.CSR(N=nb) for (nb, _bs, _be) in raw_blocks]
        nnz = sum(b.nnz for b in blocks)
        tot_rows = sum(b.nrows for b in blocks)
        _logger.info('[%s] computed %d similarities for %d items in %d blocks',
                     self._timer, nnz, tot_rows, len(blocks))
        sizes = np.concatenate([b.row_nnzs() for b in blocks])
        assert len(sizes) == nitems, \
            'only have {} rows for {} items'.format(len(sizes), nitems)

        # Allocate the final matrix with the exact per-row sizes, then
        # copy each block's columns and values into its row range.
        smat = matrix.CSR.empty((nitems, nitems), sizes, rpdtype=np.int64)
        row = 0
        for bno, b in enumerate(blocks):
            next_row = row + b.nrows
            lo = smat.rowptrs[row]
            hi = smat.rowptrs[next_row]
            _logger.debug('block %d (%d:%d) has %d entries, storing in %d:%d',
                          bno, row, next_row, b.nnz, lo, hi)
            smat.colinds[lo:hi] = b.colinds
            smat.values[lo:hi] = b.values
            row = next_row

        _logger.info('[%s] sorting similarity matrix with %d entries',
                     self._timer, smat.nnz)
        _sort_nbrs(smat.N)

        return smat
Пример #7
0
def test_csr64_pickle(csr):
    """Pickle round-trip of a CSR upgraded to 64-bit row pointers."""
    csr = lm.CSR(csr.nrows, csr.ncols, csr.nnz, csr.rowptrs.astype(np.int64),
                 csr.colinds, csr.values)

    round_trip = pickle.loads(pickle.dumps(csr))

    assert round_trip.nrows == csr.nrows
    assert round_trip.ncols == csr.ncols
    assert round_trip.nnz == csr.nnz
    assert all(round_trip.rowptrs == csr.rowptrs)
    assert round_trip.rowptrs.dtype == np.int64
    assert all(round_trip.colinds == csr.colinds)
    if csr.values is None:
        assert round_trip.values is None
    else:
        assert all(round_trip.values == csr.values)
Пример #8
0
def test_mkl_mabt():
    """Check MKL A @ B^T against the SciPy product on random matrices."""
    for _ in range(50):
        A = lktu.rand_csr(20, 10, nnz=50)
        B = lktu.rand_csr(5, 10, nnz=20)

        # Keep the SparseM handles in locals so they stay alive while
        # the kernel uses their raw pointers.
        a_h = mkl_ops.SparseM.from_csr(A)
        b_h = mkl_ops.SparseM.from_csr(B)

        C = lm.CSR(N=mkl_ops._to_csr(mkl_ops._lk_mkl_spmabt(a_h.ptr, b_h.ptr)))

        assert C.nrows == 20
        assert C.ncols == 5

        expected = (A.to_scipy() @ B.to_scipy().T).toarray()
        assert C.to_scipy().toarray() == approx(expected)
Пример #9
0
    def _mean_center(self, ratings, rmat, items, users):
        if not self.center:
            return rmat, None

        item_means = ratings.groupby('item').rating.mean()
        item_means = item_means.reindex(items).values

        user_means = ratings.groupby('user').rating.mean()
        user_means = user_means.reindex(users).values

        global_mean = ratings.rating.mean()

        #mcvals = rmat.values - item_means[rmat.colinds] - user_means[rmat.rowinds()] + global_mean
        #Old Mean Centering
        mcvals = rmat.values - item_means[rmat.colinds]

        nmat = matrix.CSR(rmat.nrows, rmat.ncols, rmat.nnz,
                          rmat.rowptrs.copy(), rmat.colinds.copy(), mcvals)
        _logger.info('[%s] computed means for %d items', self._timer,
                     len(item_means))
        return nmat, item_means
Пример #10
0
def test_mkl_mabt(mkl_ops, data):
    """Property test: MKL A @ B^T matches the SciPy product."""
    n_a = data.draw(st.integers(5, 100))
    n_cols = data.draw(st.integers(5, 100))
    n_b = data.draw(st.integers(5, 100))
    A = data.draw(lktu.csrs(nrows=n_a, ncols=n_cols, values=True))
    B = data.draw(lktu.csrs(nrows=n_b, ncols=n_cols, values=True))

    # Keep the SparseM handles in locals so they stay alive while the
    # kernel uses their raw pointers.
    a_h = mkl_ops.SparseM.from_csr(A)
    b_h = mkl_ops.SparseM.from_csr(B)

    C = lm.CSR(N=mkl_ops._to_csr(mkl_ops._lk_mkl_spmabt(a_h.ptr, b_h.ptr)))

    assert C.nrows == A.nrows
    assert C.ncols == B.nrows

    expected = (A.to_scipy() @ B.to_scipy().T).toarray()
    assert C.to_scipy().toarray() == approx(expected)
Пример #11
0
def test_csr64_pickle(values):
    """Pickle round-trip of a random CSR with 64-bit row pointers."""
    base = rand_csr(100, 50, 1000, values=values)
    csr = lm.CSR(base.nrows, base.ncols, base.nnz,
                 base.rowptrs.astype(np.int64), base.colinds, base.values)
    assert csr.nrows == 100
    assert csr.ncols == 50
    assert csr.nnz == 1000

    csr2 = pickle.loads(pickle.dumps(csr))

    assert csr2.nrows == csr.nrows
    assert csr2.ncols == csr.ncols
    assert csr2.nnz == csr.nnz
    assert all(csr2.rowptrs == csr.rowptrs)
    assert csr2.rowptrs.dtype == np.int64
    assert all(csr2.colinds == csr.colinds)
    if values:
        assert all(csr2.values == csr.values)
    else:
        assert csr2.values is None