Example #1
def par_lauum_c_lower(A: torch.Tensor,
                      block_allocs: List[BlockAlloc],
                      my_rows: List[int],
                      barrier: threading.Barrier,
                      device_id: int,
                      cublas_handle,
                      independent_output: bool):
    N = A.shape[0]
    dts = sizeof_dtype(A.dtype)
    is_cuda = A.device.type == "cuda"

    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    s1 = torch.cuda.Stream(device=tc_device)
    s3 = torch.cuda.Stream(device=tc_device)
    s1_cuda = s1._as_parameter_

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1), cublas_stream(cublas_handle, s1_cuda):
        if not is_cuda:
            temp_bb = create_fortran((max_block_size, max_block_size), A.dtype, 'cpu', pin_memory=True).T
        # Pre-allocate the r-col, b-col, syrk-out and lauum-out buffers
        mem_needed = 2 * N * max_block_size + 2 * (max_block_size ** 2)
        f_gpu = torch.empty(size=(mem_needed,), dtype=A.dtype, device=tc_device)
        whole_col_b = f_gpu[:N * max_block_size]
        whole_col_r = f_gpu[N * max_block_size: 2 * N * max_block_size]
        syrk_out = extract_fortran(f_gpu, size=(max_block_size, max_block_size), offset=2 * N * max_block_size)
        lauum_out = extract_fortran(f_gpu, size=(max_block_size, max_block_size), offset=2 * N * max_block_size + max_block_size ** 2)
        syrk_out.fill_(0.0)

        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load column b. Instead of loading the whole column, only load the
            # rows that are actually needed: those starting at the smallest row
            # index in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                cuda_memcpy2d_async(
                    dst=whole_col_b.data_ptr(), dpitch=max_block_size * dts,
                    src=A[b_start, bb.start].data_ptr(), spitch=A.shape[1] * dts,
                    width=bb.length * dts, height=N - b_start, stream=s1_cuda)
            except ValueError:  # all of `my_rows` are smaller than `b`.
                pass
            if not independent_output:
                # Wait for the copy to the device to complete: after the barrier,
                # other threads may modify the part of col_b which we still need!
                s1.synchronize()
                barrier.wait()

            for r in my_rows:
                if r < b:
                    continue
                if r == b:
                    is_last_row = b_start + bb.length == N
                    # SYRK on the trailing rows of col_b (g_b[bb.length:, :]); the result
                    # is written into syrk_out (beta=0), not back into col_b.
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if not is_last_row:
                        syrk_fn(cublas_handle, uplo='U', trans='N',
                                n=bb.length, k=N - b_start - bb.length,
                                alpha=1.0, A=whole_col_b[bb.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                beta=0.0, C=syrk_out.data_ptr(), ldc=max_block_size)

                    with torch.cuda.stream(s3):
                        if independent_output:
                            s1.synchronize()  # we need col_b to be loaded
                        # Lower LAUUM for C-contig is equal to upper LAUUM for F-contig
                        c_lauum_in = whole_col_b[:bb.length * max_block_size].view(bb.length, max_block_size)[:, :bb.length]
                        c_lauum_out = lauum_out[:bb.length, :bb.length]

                        if independent_output:
                            c_lauum_out.copy_(c_lauum_in)
                        else:
                            c_lauum_out.copy_(c_lauum_in.T)
                        cuda_lauum(n=bb.length, A=c_lauum_in, lda=max_block_size, B=c_lauum_out, ldb=max_block_size, lower=False)

                    s1.wait_stream(s3)  # all subsequent work on s1 will need c_lauum_out
                    if not is_last_row:
                        c_lauum_out.add_(syrk_out[:bb.length, :bb.length])

                    # Copy the result back into Abb.
                    # Note: c_lauum_out is F-contig, while Abb is C-contig.
                    Abb = A[bb.start:bb.end, bb.start:bb.end]
                    if independent_output:
                        Abb.copy_(c_lauum_out)
                    else:
                        Abb.copy_(c_lauum_out.T)
                else:  # r > b
                    br = block_allocs[r]

                    # Load column r. Since r > b this column will be shorter than column b
                    cuda_memcpy2d_async(
                        dst=whole_col_r.data_ptr(), dpitch=max_block_size * dts,
                        src=A[br.start, br.start].data_ptr(), spitch=A.shape[1] * dts,
                        width=br.length * dts, height=N - br.start, stream=s1_cuda)
                    # Restrict column b to the rows from block r downwards
                    ccb = whole_col_b[(br.start - b_start) * max_block_size:]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    #         and ccb[0:br.length, :];
                    #         the output is an r*b matrix, stored in the first rows of ccb.
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(
                        handle=cublas_handle,
                        side='R', uplo='U', trans='T', diag='N',
                        m=bb.length, n=br.length,
                        alpha=1.0, A=whole_col_r.data_ptr(), lda=max_block_size,
                        B=ccb.data_ptr(), ldb=max_block_size,
                        C=ccb.data_ptr(), ldc=max_block_size)

                    # GEMM on ccb[br.length:, :] and g_r[br.length:, :].T;
                    #         the output is the same r*b matrix as before, so results are accumulated.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle, transa='N', transb='T',
                                m=bb.length, n=br.length, k=N - br.start - br.length,
                                alpha=1.0,
                                A=ccb[br.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                B=whole_col_r[br.length * max_block_size:].data_ptr(),
                                ldb=max_block_size,
                                beta=1.0, C=ccb.data_ptr(), ldc=max_block_size)

                    # Copy back to A[r, b]
                    if is_cuda:
                        ccb_square = ccb[:max_block_size * br.length].view(br.length, max_block_size)
                        if independent_output:
                            A[bb.start:bb.end, br.start:br.end].copy_(ccb_square[:br.length, :bb.length].T)
                        else:
                            A[br.start:br.end, bb.start:bb.end].copy_(ccb_square[:br.length, :bb.length])
                    elif independent_output:
                        # Copy must be transposed, copy to temp_bb first.
                        cublasGetMatrixAsync(
                            rows=bb.length, cols=br.length, elem_size=dts,
                            A=ccb.data_ptr(), lda=max_block_size,
                            B=temp_bb.data_ptr(), ldb=max_block_size, stream=s1_cuda)
                        s1.synchronize()
                        A[bb.start:bb.end, br.start:br.end].copy_(temp_bb[:br.length, :bb.length].T)
                    else:
                        cublasGetMatrixAsync(
                            rows=bb.length, cols=br.length, elem_size=dts,
                            A=ccb.data_ptr(), lda=max_block_size,
                            B=A[br.start, bb.start].data_ptr(), ldb=A.shape[0],
                            stream=s1_cuda)
            s1.synchronize()
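
# A minimal sketch (not part of the library) of how the inputs to par_lauum_c_lower could be
# built. `BlockAlloc` is assumed to be a simple record exposing `start`, `end` and `length`
# (inferred from the attribute accesses above); `BlockAllocSketch` and the helpers below are
# hypothetical names. Creating the cuBLAS handles is library-specific and omitted here.
from dataclasses import dataclass
from typing import List


@dataclass
class BlockAllocSketch:
    start: int
    end: int
    length: int


def make_block_allocs(n: int, block_size: int) -> List[BlockAllocSketch]:
    # Split [0, n) into contiguous blocks of at most `block_size` rows.
    return [BlockAllocSketch(start=s, end=min(s + block_size, n),
                             length=min(s + block_size, n) - s)
            for s in range(0, n, block_size)]


def assign_rows(num_blocks: int, num_devices: int) -> List[List[int]]:
    # Round-robin assignment of block-rows, giving each worker thread a disjoint `my_rows`.
    return [list(range(d, num_blocks, num_devices)) for d in range(num_devices)]

# Usage sketch: one thread per GPU sharing a threading.Barrier(num_devices), each thread calling
# par_lauum_c_lower(A, allocs, my_rows, barrier, device_id, cublas_handle, independent_output).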
Example #2
def sparse_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, w, out = a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    N, D = X1.shape
    M = X2.size(0)
    if v is None:
        T = w.size(1)
    else:
        T = v.size(1)

    # Memory needs (in elements of `dtype`):
    # X1_chunk   : n + 2 * D * n * density
    # X2         : D + 2 * D * M * density (because it is transposed)
    # sparse_out : n + 2 * n * M * density (assume here density = 1)
    # ker_gpu    : M * n
    # w_gpu      : n * T
    # v_gpu      : M * T
    # out_gpu    : M * T
    avail_mem = max_mem / sizeof_dtype(dtype)
    den = 2 * D * X1.density + 2 + 3 * M + T
    sub = D + 2 * D * M * X2.density + M * T
    if v is not None:
        sub += M * T
    n = (avail_mem - sub) / den
    n = min(int(n), N)
    if n < 1:
        raise MemoryError("Not enough memory to run sparse fdmmv")

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # First collect necessary memory
        mem_needed = n * M + n * T
        if not out.is_cuda:
            mem_needed += M * T
        if v is not None:
            mem_needed += M * T
        # Create flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed, ),
                                  dtype=dtype,
                                  device=ddev)
        # Extract the sub-tensors
        flat_offset = 0
        ker_gpu = extract_fortran(flat_gpu_tn, size=(n, M), offset=flat_offset)
        flat_offset += np.prod(ker_gpu.shape)
        w_gpu = extract_same_stride(flat_gpu_tn,
                                    size=(n, T),
                                    other=out,
                                    offset=flat_offset)
        flat_offset += np.prod(w_gpu.shape)
        if not out.is_cuda:
            out_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(M, T),
                                          other=out,
                                          offset=flat_offset)
            flat_offset += np.prod(out_gpu.shape)
        else:
            out_gpu = out
        out_gpu.fill_(0.0)
        if v is not None:
            v_gpu = extract_same_stride(flat_gpu_tn,
                                        size=(M, T),
                                        other=v,
                                        offset=flat_offset)
            flat_offset += np.prod(v_gpu.shape)
            copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
        # Sparse GPU data is allocated separately.
        X2_d = SparseTensor.from_scipy(
            X2.transpose_csc().to_scipy().tocsr(copy=False)) \
            .index_to_int() \
            .to(device=ddev)

        for i in range(0, N, n):
            ic = min(n, N - i)
            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)

            ker_chunk = ker_gpu[:ic]
            ker_chunk.fill_(0.0)

            # TODO: This is wasteful (X2 will be prepared many times over)
            ddd = kernel._prepare_sparse(X1_chunk, X2)
            ker_chunk = kernel._apply_sparse(X1_chunk_d, X2_d, ker_chunk)
            ker_chunk = kernel._finalize(ker_chunk, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)

            if v is not None:
                c_g_w.addmm_(ker_chunk, v_gpu)
            out_gpu.addmm_(ker_chunk.T, c_g_w)
            del ddd, X1_chunk, X1_chunk_d

        if not out.is_cuda:
            copy_to_device_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
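
# A dense, single-device reference (a sketch, not library code) for the quantity the chunked
# loop above accumulates: out = K.T @ (w + K @ v), where K is the N x M kernel matrix between
# X1 and X2, and either `v` or `w` may be None (as in the function above).
import torch


def dense_fdmmv_reference(K: torch.Tensor, v=None, w=None) -> torch.Tensor:
    T = (v if v is not None else w).shape[1]
    inner = torch.zeros(K.shape[0], T, dtype=K.dtype, device=K.device)
    if w is not None:
        inner = inner + w          # plays the role of copying w into w_gpu
    if v is not None:
        inner = inner + K @ v      # matches c_g_w.addmm_(ker_chunk, v_gpu)
    return K.T @ inner             # matches out_gpu.addmm_(ker_chunk.T, c_g_w)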
Example #3
def sparse_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()

    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, out = a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda
    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs (in elements of `dtype`), per (n, m) tile:
    # X1_chunk   : n + 2 * dtot * n * density
    # X2_chunk   : dtot + 2 * dtot * m * density (because it is transposed)
    # sparse_out : n + 2 * n * m * density (assume density = 1)
    # ker_gpu    : n * m
    # mmv_gpu    : n * T
    # v_gpu      : m * T
    # Other      : GPU buffer
    n, m = select_dim_over_nm_v2(max_n=ntot,
                                 max_m=mtot,
                                 coef_nm=3,
                                 coef_n=2 + 2 * dtot * X1.density + T,
                                 coef_m=2 * dtot * X2.density + T,
                                 rest=dtot,
                                 max_mem=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # First collect necessary memory
        mem_needed = mtot * T + n * T + n * m
        # Create flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed, ),
                                  dtype=dtype,
                                  device=ddev)
        # Extract the sub-tensors
        flat_offset = 0
        v_gpu = extract_same_stride(flat_gpu_tn,
                                    size=(mtot, T),
                                    other=v,
                                    offset=flat_offset)
        flat_offset += np.prod(v_gpu.shape)
        copy_to_device_noorder(mtot, T, v, 0, 0, v_gpu, 0, 0)
        mmv_gpu = extract_same_stride(flat_gpu_tn,
                                      size=(n, T),
                                      other=out,
                                      offset=flat_offset)
        flat_offset += np.prod(mmv_gpu.shape)
        # ker_gpu should be Fortran-ordered due to the cuSPARSE csr2dense function
        ker_gpu = extract_fortran(flat_gpu_tn, size=(n, m), offset=flat_offset)
        flat_offset += np.prod(ker_gpu.shape)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            cur_mmv_gpu = mmv_gpu[:ic]  # n x T
            cur_mmv_gpu.fill_(0.0)

            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            for j in range(0, mtot, m):
                jc = min(m, mtot - j)

                X2_chunk = X2.narrow_rows(j, jc)
                # Prepare sparse on CPU
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)

                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_chunk_d = SparseTensor.from_scipy(
                    X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                    .index_to_int() \
                    .to(device=ddev)

                cur_ker_gpu = ker_gpu[:ic, :jc]
                cur_ker_gpu.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                cur_ker_gpu = kernel._apply_sparse(X1_chunk_d, X2_chunk_d,
                                                   cur_ker_gpu)
                cur_ker_gpu = kernel._finalize(cur_ker_gpu, ddd)

                # Multiply by the vector v
                cur_mmv_gpu.addmm_(cur_ker_gpu, v_gpu.narrow(0, j, jc))
                del ddd, X2_chunk, X2_chunk_d

            # send result to CPU
            if not cuda_inputs:
                copy_to_host_noorder(ic, T, cur_mmv_gpu, 0, 0, out, i, 0)
            del X1_chunk, X1_chunk_d
    return out
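
# A dense sketch (illustrative only) of the tiling scheme used above: the (i, j) double loop
# computes out = K @ v one (n x m) kernel tile at a time, accumulating into a per-row-chunk
# buffer before writing it out. The tile sizes n, m stand in for the values chosen by
# select_dim_over_nm_v2.
import torch


def blocked_mmv(K: torch.Tensor, v: torch.Tensor, n: int, m: int) -> torch.Tensor:
    N, M = K.shape
    out = torch.zeros(N, v.shape[1], dtype=K.dtype)
    for i in range(0, N, n):
        ic = min(n, N - i)
        acc = torch.zeros(ic, v.shape[1], dtype=K.dtype)           # role of cur_mmv_gpu
        for j in range(0, M, m):
            jc = min(m, M - j)
            acc.addmm_(K[i:i + ic, j:j + jc], v.narrow(0, j, jc))  # cur_mmv_gpu.addmm_(...)
        out[i:i + ic] = acc                                        # copy_to_host_noorder(...)
    return out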
Example #4
def _ic_cholesky(A, upper, device, cusolver_handle):
    """Cholesky factorization of matrix `A` on the GPU

    Uses the cuSOLVER library for implementation of the POTRF function.

    Parameters
    ----------
    A : [n, n] CPU or GPU array (column-contiguous)
        The (positive definite) matrix which should be factorized
    upper : bool
        Whether to factorize the upper or the lower triangle of `A`. The other side
        of the matrix will not be touched.
    device : int
        The GPU device on which to run the factorization
    cusolver_handle
        Pointer to the cuSOLVER context, which needs to be initialized before calling
        the function.

    Returns
    -------
    A : [n, n] CPU or GPU array (column-contiguous)
        The factorization of A which overwrites the upper (or lower) triangular part
        of the matrix A. This is not a copy of the original matrix.
    """
    # Check library initialization
    if cusolver_handle is None:
        raise RuntimeError("CuSolver must be initialized "
                           "before running in-core Cholesky.")
    if not is_f_contig(A):
        raise RuntimeError("Cholesky input must be F-contiguous")

    uplo = 'U' if upper else 'L'
    n = A.shape[0]

    tc_device = torch.device("cuda:%d" % (device))
    tc_stream = torch.cuda.current_stream(tc_device)
    # Choose functions by dtype
    potrf_buf_size = choose_fn(A.dtype, cusolverDnDpotrf_bufferSize,
                               cusolverDnSpotrf_bufferSize,
                               "POTRF Buffer size")
    potrf_fn = choose_fn(A.dtype, cusolverDnDpotrf, cusolverDnSpotrf, "POTRF")

    with torch.cuda.device(tc_device), \
            torch.cuda.stream(tc_stream), \
            cusolver_stream(cusolver_handle, tc_stream._as_parameter_):
        # Determine necessary buffer size
        potrf_bsize = potrf_buf_size(handle=cusolver_handle,
                                     uplo=uplo,
                                     n=n,
                                     A=0,
                                     lda=n)

        # Allocate flat GPU buffer, and extract buffers
        if A.is_cuda:
            potrf_wspace = torch.empty(size=(potrf_bsize, ),
                                       dtype=A.dtype,
                                       device=tc_device)
            Agpu = A
        else:
            gpu_buf = torch.empty(size=(n * n + potrf_bsize, ),
                                  dtype=A.dtype,
                                  device=tc_device)
            potrf_wspace = gpu_buf[:potrf_bsize]
            Agpu = extract_fortran(gpu_buf, (n, n), offset=potrf_bsize)
            # Copy A to device memory
            copy_to_device(n, n, A, 0, 0, Agpu, 0, 0, s=tc_stream)

        dev_info = torch.tensor(4, dtype=torch.int32, device=tc_device)  # POTRF status output (0 on success)

        # Run the Cholesky factorization (in-place on Agpu)
        potrf_fn(handle=cusolver_handle,
                 uplo=uplo,
                 n=n,
                 A=Agpu.data_ptr(),
                 lda=n,
                 workspace=potrf_wspace.data_ptr(),
                 Lwork=potrf_bsize,
                 devInfo=dev_info)

        # Copy back to CPU
        if not A.is_cuda:
            copy_to_host(n, n, Agpu, 0, 0, A, 0, 0, s=tc_stream)
            del Agpu
        del potrf_wspace, dev_info
        tc_stream.synchronize()
    return A
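
# A hedged usage sketch for _ic_cholesky. Creating the cuSOLVER handle is specific to the
# surrounding library and is represented by the hypothetical init_cusolver_handle(); the
# points illustrated are that the input must be positive definite and F-contiguous, and that
# the factorization overwrites one triangle of `A` in place.
import torch

n = 1000
X = torch.randn(n, n, dtype=torch.float64)
# Symmetric positive-definite matrix, made column-contiguous (F-order) via the .T trick.
A = (X @ X.T + n * torch.eye(n, dtype=torch.float64)).T.contiguous().T

# handle = init_cusolver_handle()   # hypothetical: depends on how the library wraps cuSOLVER
# L = _ic_cholesky(A, upper=False, device=0, cusolver_handle=handle)
# `L` shares storage with `A`: only the lower triangle holds the Cholesky factor.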