Пример #1
0
def plsa_fit(
    X,
    k,
    n_row_blocks=8,
    n_col_blocks=8,
    init="random",
    n_iter=100,
    n_iter_per_test=10,
    tolerance=0.001,
    e_step_thresh=1e-32,
    random_state=None,
):
    """Fit a pLSA model to ``X`` by partitioning it into a grid of blocks.

    The matrix is cut into ``n_row_blocks`` x ``n_col_blocks`` rectangular
    blocks. The nonzeros of each block are stored (in COO form, with
    block-local indices) in dense padded arrays, the initial P(z|d) and
    P(w|z) estimates are reshaped to match the block layout, and everything
    is handed to ``plsa_fit_inner_blockwise``. The blocked results are then
    stitched back together and trimmed to the original matrix shape.

    Parameters
    ----------
    X: sparse matrix of shape (n_docs, n_words)
        The data matrix; must support ``tocsr()``.

    k: int
        The number of topics.

    n_row_blocks: int (optional, default=8)
        Number of blocks along the row (document) axis.

    n_col_blocks: int (optional, default=8)
        Number of blocks along the column (word) axis.

    init: (optional, default="random")
        Initialization specifier, forwarded unchanged to ``plsa_init``.

    n_iter, n_iter_per_test, tolerance, e_step_thresh:
        Forwarded unchanged to ``plsa_fit_inner_blockwise``.

    random_state: (optional, default=None)
        Passed to ``check_random_state`` to build the generator given to
        ``plsa_init``.

    Returns
    -------
    p_z_given_d, p_w_given_z: arrays of shapes (n_docs, k) and (k, n_words)
        The fitted P(z|d) and P(w|z) with block padding removed.
    """
    rng = check_random_state(random_state)
    p_z_given_d_init, p_w_given_z_init = plsa_init(X, k, init=init, rng=rng)

    A = X.tocsr().astype(np.float32)
    n, m = A.shape

    # Plain Python ints: a 16-bit block size would silently wrap around for
    # any block spanning more than 65535 rows or columns.
    block_row_size = int(np.ceil(n / n_row_blocks))
    block_col_size = int(np.ceil(m / n_col_blocks))

    # Zero-pad P(z|d) so the rows divide evenly into blocks, then view it as
    # (n_row_blocks, block_row_size, k).
    p_z_given_d = np.zeros((block_row_size * n_row_blocks, k),
                           dtype=np.float32)
    p_z_given_d[:p_z_given_d_init.shape[0]] = p_z_given_d_init
    p_z_given_d = p_z_given_d.reshape(n_row_blocks, block_row_size, k)

    # Zero-pad P(w|z) along the word axis and split it into one (k,
    # block_col_size) slab per column block. The number of splits must be
    # n_col_blocks so the slabs line up with the matrix blocks built below.
    p_w_given_z = np.zeros((k, block_col_size * n_col_blocks),
                           dtype=np.float32)
    p_w_given_z[:, :p_w_given_z_init.shape[1]] = p_w_given_z_init
    p_w_given_z = np.stack(np.hsplit(p_w_given_z, n_col_blocks))

    # Slice the matrix into a grid of COO blocks and record the largest
    # number of nonzeros found in any single block.
    A_blocks = [[0] * n_col_blocks for i in range(n_row_blocks)]
    max_nnz_per_block = 0
    for i in range(n_row_blocks):
        row_start = block_row_size * i
        row_end = min(row_start + block_row_size, n)

        for j in range(n_col_blocks):
            col_start = block_col_size * j
            col_end = min(col_start + block_col_size, m)

            A_blocks[i][j] = A[row_start:row_end, col_start:col_end].tocoo()
            if A_blocks[i][j].nnz > max_nnz_per_block:
                max_nnz_per_block = A_blocks[i][j].nnz

    # Dense per-block storage. Unused trailing slots keep index -1 (rows and
    # columns) and value 0 so every block has the same length.
    padded_shape = (n_row_blocks, n_col_blocks, max_nnz_per_block)
    block_rows_ndarray = np.full(padded_shape, -1, dtype=np.int32)
    block_cols_ndarray = np.full(padded_shape, -1, dtype=np.int32)
    block_vals_ndarray = np.zeros(padded_shape, dtype=np.float32)
    for i in range(n_row_blocks):
        for j in range(n_col_blocks):
            nnz = A_blocks[i][j].nnz
            block_rows_ndarray[i, j, :nnz] = A_blocks[i][j].row
            block_cols_ndarray[i, j, :nnz] = A_blocks[i][j].col
            block_vals_ndarray[i, j, :nnz] = A_blocks[i][j].data

    p_z_given_d, p_w_given_z = plsa_fit_inner_blockwise(
        block_rows_ndarray,
        block_cols_ndarray,
        block_vals_ndarray,
        p_w_given_z,
        p_z_given_d,
        block_row_size,
        block_col_size,
        n_iter=n_iter,
        n_iter_per_test=n_iter_per_test,
        tolerance=tolerance,
        e_step_thresh=e_step_thresh,
    )

    # Undo the blocking and drop the padding rows / columns.
    p_z_given_d = np.vstack(p_z_given_d)[:n, :]
    p_w_given_z = np.hstack(p_w_given_z)[:, :m]

    return p_z_given_d, p_w_given_z
Пример #2
0
def plsa_fit(
    data,
    k,
    n_row_blocks=8,
    n_col_blocks=8,
    init="random",
    n_iter=100,
    n_iter_per_test=10,
    tolerance=0.001,
    e_step_thresh=1e-32,
    random_state=None,
):
    """Fit a pLSA model to ``data`` using blockwise EM kernels on a device.

    The matrix is cut into ``n_row_blocks`` x ``n_col_blocks`` blocks whose
    nonzeros are stored in dense padded arrays. Those arrays and the blocked
    P(z|d) / P(w|z) estimates are copied to the device, and EM iterations
    are run by launching the ``plsa_e_step``, ``plsa_partial_m_step`` and
    ``normalize_m_step_*`` kernels. Every ``n_iter_per_test`` iterations the
    estimates are copied back and the log-likelihood is recomputed on the
    host; iteration of full batches stops once its relative change drops
    below ``tolerance``.

    Parameters
    ----------
    data: sparse matrix of shape (n_docs, n_words)
        The data matrix; must support ``tocsr()``.

    k: int
        The number of topics. Also used as the thread count per block when
        launching ``plsa_partial_m_step``.

    n_row_blocks: int (optional, default=8)
        Number of blocks along the row (document) axis.

    n_col_blocks: int (optional, default=8)
        Number of blocks along the column (word) axis.

    init: (optional, default="random")
        Initialization specifier, forwarded unchanged to ``plsa_init``.

    n_iter: int (optional, default=100)
        Total number of EM iterations scheduled.

    n_iter_per_test: int (optional, default=10)
        Number of EM iterations between log-likelihood checks.

    tolerance: float (optional, default=0.001)
        Relative log-likelihood change below which the batched loop stops.

    e_step_thresh: float (optional, default=1e-32)
        Forwarded unchanged to the ``plsa_e_step`` kernel.

    random_state: (optional, default=None)
        Passed to ``check_random_state`` to build the generator given to
        ``plsa_init``.

    Returns
    -------
    p_z_given_d, p_w_given_z: arrays of shapes (n_docs, k) and (k, n_words)
        The fitted P(z|d) and P(w|z) with block padding removed.
    """
    rng = check_random_state(random_state)
    p_z_given_d_init, p_w_given_z_init = plsa_init(data, k, init=init, rng=rng)

    A = data.tocsr().astype(np.float32)
    n, m = A.shape

    # Plain Python ints: a 16-bit block size would silently wrap around for
    # any block spanning more than 65535 rows or columns.
    block_row_size = int(np.ceil(n / n_row_blocks))
    block_col_size = int(np.ceil(m / n_col_blocks))

    # Zero-pad P(z|d) so the rows divide evenly into blocks, then view it as
    # (n_row_blocks, block_row_size, k).
    p_z_given_d = np.zeros((block_row_size * n_row_blocks, k),
                           dtype=np.float32)
    p_z_given_d[:p_z_given_d_init.shape[0]] = p_z_given_d_init
    p_z_given_d = p_z_given_d.reshape(n_row_blocks, block_row_size, k)

    # Zero-pad P(w|z) along the word axis and rearrange it into a
    # C-contiguous (n_col_blocks, k, block_col_size) array.
    p_w_given_z = np.zeros((k, block_col_size * n_col_blocks),
                           dtype=np.float32)
    p_w_given_z[:, :p_w_given_z_init.shape[1]] = p_w_given_z_init
    p_w_given_z = np.transpose(
        p_w_given_z.T.reshape(n_col_blocks, block_col_size, k),
        axes=[0, 2, 1],
    ).astype(np.float32, order="C")

    # Slice the matrix into a grid of COO blocks and record the largest
    # number of nonzeros found in any single block.
    A_blocks = [[0] * n_col_blocks for i in range(n_row_blocks)]
    max_nnz_per_block = 0
    for i in range(n_row_blocks):
        row_start = block_row_size * i
        row_end = min(row_start + block_row_size, n)

        for j in range(n_col_blocks):
            col_start = block_col_size * j
            col_end = min(col_start + block_col_size, m)

            A_blocks[i][j] = A[row_start:row_end, col_start:col_end].tocoo()
            if A_blocks[i][j].nnz > max_nnz_per_block:
                max_nnz_per_block = A_blocks[i][j].nnz

    # Dense per-block storage. Unused trailing slots keep index -1 (rows and
    # columns) and value 0 so every block has the same length.
    padded_shape = (n_row_blocks, n_col_blocks, max_nnz_per_block)
    block_rows_ndarray = np.full(padded_shape, -1, dtype=np.int32)
    block_cols_ndarray = np.full(padded_shape, -1, dtype=np.int32)
    block_vals_ndarray = np.zeros(padded_shape, dtype=np.float32)
    for i in range(n_row_blocks):
        for j in range(n_col_blocks):
            nnz = A_blocks[i][j].nnz
            block_rows_ndarray[i, j, :nnz] = A_blocks[i][j].row
            block_cols_ndarray[i, j, :nnz] = A_blocks[i][j].col
            block_vals_ndarray[i, j, :nnz] = A_blocks[i][j].data

    n_d_blocks = block_rows_ndarray.shape[0]
    n_w_blocks = block_rows_ndarray.shape[1]
    block_size = block_rows_ndarray.shape[2]

    # Host-side scratch buffers; they are only allocated here so that
    # matching device buffers can be created from them below.
    p_z_given_wd_block = np.zeros((n_d_blocks, n_w_blocks, block_size, k),
                                  dtype=np.float32)
    blocked_next_p_w_given_z = np.zeros(
        (n_d_blocks, n_w_blocks, k, block_col_size),
        dtype=np.float32,
    )
    blocked_next_p_z_given_d = np.zeros(
        (n_w_blocks, n_d_blocks, block_row_size, k),
        dtype=np.float32,
    )
    norms_pwz = np.zeros((n_d_blocks, n_w_blocks, k), dtype=np.float64)

    previous_log_likelihood = log_likelihood_by_blocks(
        block_rows_ndarray,
        block_cols_ndarray,
        block_vals_ndarray,
        p_w_given_z,
        p_z_given_d,
    )

    # NOTE(review): `cuda` is presumably numba.cuda -- confirm against the
    # file's imports.
    d_block_rows_ndarray = cuda.to_device(block_rows_ndarray)
    d_block_cols_ndarray = cuda.to_device(block_cols_ndarray)
    d_block_vals_ndarray = cuda.to_device(block_vals_ndarray)
    d_blocked_next_p_w_given_z = cuda.to_device(blocked_next_p_w_given_z)
    d_blocked_next_p_z_given_d = cuda.to_device(blocked_next_p_z_given_d)
    d_p_z_given_wd_block = cuda.to_device(p_z_given_wd_block)
    d_p_w_given_z = cuda.to_device(p_w_given_z)
    d_p_z_given_d = cuda.to_device(p_z_given_d)
    d_norms_pwz = cuda.to_device(norms_pwz)

    grid = (n_d_blocks, n_w_blocks)

    def run_em_iteration():
        # One full EM iteration: E step, per-block partial M step, then the
        # two normalization kernels, synchronizing between dependent stages.
        plsa_e_step[grid, 256](
            d_block_rows_ndarray,
            d_block_cols_ndarray,
            d_p_w_given_z,
            d_p_z_given_d,
            d_p_z_given_wd_block,
            e_step_thresh,
        )
        cuda.synchronize()
        plsa_partial_m_step[grid, k](
            d_block_rows_ndarray,
            d_block_cols_ndarray,
            d_block_vals_ndarray,
            d_p_w_given_z,
            d_p_z_given_d,
            d_blocked_next_p_w_given_z,
            d_blocked_next_p_z_given_d,
            d_p_z_given_wd_block,
            d_norms_pwz,
        )
        cuda.synchronize()
        normalize_m_step_p_z_given_d[n_d_blocks, 256](
            d_blocked_next_p_z_given_d,
            d_p_z_given_d,
        )
        normalize_m_step_p_w_given_z[n_w_blocks, 256](
            d_blocked_next_p_w_given_z,
            d_p_w_given_z,
            d_norms_pwz,
        )
        cuda.synchronize()

    # Full batches of n_iter_per_test iterations, each followed by a
    # host-side convergence check on the relative log-likelihood change.
    for _ in range(n_iter // n_iter_per_test):
        for _ in range(n_iter_per_test):
            run_em_iteration()

        p_z_given_d = d_p_z_given_d.copy_to_host()
        p_w_given_z = d_p_w_given_z.copy_to_host()
        current_log_likelihood = log_likelihood_by_blocks(
            block_rows_ndarray,
            block_cols_ndarray,
            block_vals_ndarray,
            p_w_given_z,
            p_z_given_d,
        )
        change = np.abs(current_log_likelihood - previous_log_likelihood)
        if change / np.abs(current_log_likelihood) < tolerance:
            break
        previous_log_likelihood = current_log_likelihood

    # Leftover iterations when n_iter is not a multiple of n_iter_per_test.
    # NOTE(review): these also run after an early convergence break above;
    # kept as-is to preserve existing results -- confirm this is intended.
    for _ in range(n_iter % n_iter_per_test):
        run_em_iteration()

    p_z_given_d = d_p_z_given_d.copy_to_host()
    p_w_given_z = d_p_w_given_z.copy_to_host()

    # Undo the blocking and drop the padding rows / columns.
    p_z_given_d = np.vstack(p_z_given_d)[:n, :]
    p_w_given_z = np.hstack(p_w_given_z)[:, :m]

    return p_z_given_d, p_w_given_z
Пример #3
0
def plsa_fit(
    X,
    k,
    sample_weight,
    init="random",
    block_size=65536,
    n_iter=100,
    n_iter_per_test=10,
    tolerance=0.001,
    e_step_thresh=1e-32,
    random_state=None,
):
    """Fit a pLSA model with ``k`` topics to the data matrix ``X``.

    Estimates of P(z|d) and P(w|z) are initialized according to ``init`` and
    then optimized with an EM method. At most ``n_iter`` EM iterations are
    performed; every ``n_iter_per_test`` iterations the relative improvement
    of the log-likelihood of the data under the model is checked, and the
    fit stops early if that improvement is below ``tolerance``.

    Parameters
    ----------
    X: sparse matrix of shape (n_docs, n_words)
        The data matrix pLSA is attempting to fit to.

    k: int
        The number of topics for pLSA to fit with.

    sample_weight: array of shape (n_docs,)
        Input document weights.

    init: string or tuple (optional, default="random")
        The initialization method to use. This should be one of:
            * ``"random"``
            * ``"nndsvd"``
            * ``"nmf"``
        or a tuple of two ndarrays of shape (n_docs, n_topics) and
        (n_topics, n_words).

    block_size: int (optional, default=65536)
        The number of nonzero entries of X to process in a block. The larger
        this value the faster the compute may go, but at higher memory cost.

    n_iter: int
        The maximum number of EM iterations to perform.

    n_iter_per_test: int
        The number of iterations between tests for relative improvement in
        log-likelihood.

    tolerance: float
        The threshold of relative improvement in log-likelihood required to
        continue iterating.

    e_step_thresh: float (optional, default=1e-32)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E
        step falls below this threshold then a zero is written for P(z|w,d).

    random_state: int, RandomState instance or None, (optional, default: None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used in initialization.

    Returns
    -------
    p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words)
        The resulting model values of P(z|d) and P(w|z).

    """
    generator = check_random_state(random_state)
    doc_topic, topic_word = plsa_init(X, k, init=init, rng=generator)

    # The inner routine is given C-contiguous float32 arrays.
    doc_topic = doc_topic.astype(np.float32, order="C")
    topic_word = topic_word.astype(np.float32, order="C")

    # Weights are flagged as in use as soon as any entry differs from 1.
    weights_in_use = np.any(sample_weight != 1.0)

    coo = X.tocoo().astype(np.float32)

    fitted_doc_topic, fitted_topic_word = plsa_fit_inner_blockwise(
        coo.row,
        coo.col,
        coo.data,
        topic_word,
        doc_topic,
        sample_weight,
        block_size=block_size,
        n_iter=n_iter,
        n_iter_per_test=n_iter_per_test,
        tolerance=tolerance,
        e_step_thresh=e_step_thresh,
        use_sample_weights=weights_in_use,
    )

    return fitted_doc_topic, fitted_topic_word
Пример #4
0
def plsa_fit(
    X,
    k,
    n_row_blocks=8,
    n_col_blocks=8,
    init="random",
    n_iter=100,
    n_iter_per_test=10,
    tolerance=0.001,
    e_step_thresh=1e-32,
    random_state=None,
):
    """Fit a pLSA model to ``X`` using chunked block arrays.

    ``X`` is cut into an ``n_row_blocks`` x ``n_col_blocks`` grid. The
    nonzeros of each grid cell are stored (in COO form, with block-local
    indices) in dense padded arrays, which are wrapped via ``da.from_array``
    with one chunk per grid cell and passed to ``plsa_fit_inner_dask``
    together with the initial P(z|d) and P(w|z) estimates.

    Parameters
    ----------
    X: sparse matrix of shape (n_docs, n_words)
        The data matrix; must support ``tocsr()``.

    k: int
        The number of topics.

    n_row_blocks: int (optional, default=8)
        Number of blocks along the row (document) axis.

    n_col_blocks: int (optional, default=8)
        Number of blocks along the column (word) axis.

    init: (optional, default="random")
        Initialization specifier, forwarded unchanged to ``plsa_init``.

    n_iter, n_iter_per_test, tolerance, e_step_thresh:
        Forwarded unchanged to ``plsa_fit_inner_dask``.

    random_state: (optional, default=None)
        Passed to ``check_random_state`` to build the generator given to
        ``plsa_init``.

    Returns
    -------
    p_z_given_d, p_w_given_z:
        The pair returned by ``plsa_fit_inner_dask``.
    """
    generator = check_random_state(random_state)
    doc_topic, topic_word = plsa_init(X, k, init=init, rng=generator)
    doc_topic = doc_topic.astype(np.float32, order="C")
    topic_word = topic_word.astype(np.float32, order="C")

    csr = X.tocsr().astype(np.float32)
    n_docs, n_words = csr.shape

    block_row_size = np.uint32(np.ceil(n_docs / n_row_blocks))
    block_col_size = np.uint32(np.ceil(n_words / n_col_blocks))

    # Half-open [start, end) extents of every block along each axis, with
    # the end clipped to the matrix size.
    row_extents = [
        (block_row_size * i, min(block_row_size * i + block_row_size, n_docs))
        for i in range(n_row_blocks)
    ]
    col_extents = [
        (block_col_size * j, min(block_col_size * j + block_col_size, n_words))
        for j in range(n_col_blocks)
    ]

    # Grid of COO sub-matrices, one per (row block, column block) pair.
    coo_grid = [
        [csr[r0:r1, c0:c1].tocoo() for (c0, c1) in col_extents]
        for (r0, r1) in row_extents
    ]
    largest_nnz = max(
        (cell.nnz for grid_row in coo_grid for cell in grid_row),
        default=0,
    )

    # The full CSR copy is no longer needed once the grid exists.
    del csr

    # Dense per-block storage. Unused trailing slots keep index -1 (rows and
    # columns) and value 0 so every block has the same length.
    padded_shape = (n_row_blocks, n_col_blocks, largest_nnz)
    block_rows_ndarray = np.full(padded_shape, -1, dtype=np.int32)
    block_cols_ndarray = np.full(padded_shape, -1, dtype=np.int32)
    block_vals_ndarray = np.zeros(padded_shape, dtype=np.float32)
    for i, j in np.ndindex(n_row_blocks, n_col_blocks):
        filled = coo_grid[i][j].nnz
        block_rows_ndarray[i, j, :filled] = coo_grid[i][j].row
        block_cols_ndarray[i, j, :filled] = coo_grid[i][j].col
        block_vals_ndarray[i, j, :filled] = coo_grid[i][j].data

    del coo_grid

    # One chunk per grid cell, spanning that cell's whole padded entry list.
    per_block_chunks = (1, 1, largest_nnz)
    block_rows_ndarray = da.from_array(
        block_rows_ndarray, chunks=per_block_chunks
    )
    block_cols_ndarray = da.from_array(
        block_cols_ndarray, chunks=per_block_chunks
    )
    block_vals_ndarray = da.from_array(
        block_vals_ndarray, chunks=per_block_chunks
    )

    fitted_doc_topic, fitted_topic_word = plsa_fit_inner_dask(
        block_rows_ndarray,
        block_cols_ndarray,
        block_vals_ndarray,
        topic_word,
        doc_topic,
        block_row_size,
        block_col_size,
        n_iter=n_iter,
        n_iter_per_test=n_iter_per_test,
        tolerance=tolerance,
        e_step_thresh=e_step_thresh,
    )

    return fitted_doc_topic, fitted_topic_word