Example #1
def fmm_cuda_sparse(X1: SparseTensor,
                    X2: SparseTensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor] = None,
                    opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((out, 'out'))
    N = X1.size(0)
    M = X2.size(0)
    if out is None:
        out = create_fortran((N, M), X1.dtype, 'cpu', pin_memory=True)
    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # If the data is float32 and the `no_single_kernel` option is set, we upcast to
    # float64 to avoid numerical precision errors in the kernel
    gpu_dtype = X1.dtype
    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        gpu_dtype = torch.float64

    # Create the arguments passed to each subprocess
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        args.append((ArgsFmm(X1=X1.narrow_rows(block_sizes[i], bwidth),
                             X2=X2,
                             out=out.narrow(0, block_sizes[i], bwidth),
                             kernel=kernel,
                             gpu_dtype=gpu_dtype,
                             max_mem=g.usable_ram), g.Id))
    _start_wait_processes(_sparse_fmm, args)
    torch.cuda.empty_cache()
    return out
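
A minimal usage sketch for fmm_cuda_sparse (not taken from the library's documentation): it builds two sparse inputs with SparseTensor.from_scipy, as done in Example #4, and computes the full N x M kernel matrix on the available GPUs. The GaussianKernel class and the exact import paths are assumptions on my part.

import numpy as np
import scipy.sparse
from falkon.kernels import GaussianKernel             # assumed import path
from falkon.sparse.sparse_tensor import SparseTensor  # assumed import path

# Two random sparse matrices with 50 features each.
X1 = SparseTensor.from_scipy(
    scipy.sparse.random(1000, 50, density=0.05, format='csr', dtype=np.float32))
X2 = SparseTensor.from_scipy(
    scipy.sparse.random(200, 50, density=0.05, format='csr', dtype=np.float32))
kernel = GaussianKernel(sigma=3.0)
K = fmm_cuda_sparse(X1, X2, kernel)  # 1000 x 200 kernel matrix, returned on the CPU
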
Example #2
def fmmv_cuda_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: torch.Tensor,
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (out, 'out'))

    N = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_fortran((N, v.size(1)), v.dtype, 'cpu', pin_memory=True)
    out.fill_(0.0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # Create the arguments passed to each subprocess
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        args.append((ArgsFmmv(X1=X1.narrow_rows(block_sizes[i], bwidth),
                              X2=X2,
                              v=v,
                              out=out.narrow(0, block_sizes[i], bwidth),
                              kernel=kernel,
                              max_mem=g.usable_ram), g.Id))

    _start_wait_processes(sparse_fmmv, args)
    return out
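
Compared with Example #1, fmmv_cuda_sparse returns the kernel-vector product K(X1, X2) @ v (an N x T matrix) rather than the full kernel, computing K only in GPU-sized blocks. A hedged usage sketch, reusing the setup assumed after Example #1:

import torch

v = torch.randn(200, 1, dtype=torch.float32)
# 1000 x 1 result, equal to fmm_cuda_sparse(X1, X2, kernel) @ v up to numerical error.
Kv = fmmv_cuda_sparse(X1, X2, v, kernel)
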
Example #3
    def init(self, X: Union[torch.Tensor, SparseTensor]):
        """Initialize the preconditioner matrix.

        This method must be called before the preconditioner can be used.

        Parameters
        ----------
        X : MxD tensor
            The matrix of Nystroem centers
        """
        dtype = X.dtype
        eps = self.params.pc_epsilon(X.dtype)

        M = X.size(0)

        with TicToc("Kernel", debug=self.params.debug):
            if isinstance(X, torch.Tensor):
                C = create_same_stride((M, M), X, dtype=dtype, device='cpu',
                                       pin_memory=self._use_cuda)
            else:  # If sparse tensor we need fortran for kernel calculation
                C = create_fortran((M, M), dtype=dtype, device='cpu', pin_memory=self._use_cuda)
            self.kernel(X, X, out=C, opt=self.params)
        self.fC = C.numpy()
        if not is_f_contig(C):
            self.fC = self.fC.T

        with TicToc("Cholesky 1", debug=self.params.debug):
            # Compute T: lower(fC) = T.T
            inplace_add_diag(self.fC, eps * M)
            self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                    use_cuda=self._use_cuda, opt=self.params)
            # Save the diagonal which will be overwritten when computing A
            self.dT = C.diag()

        with TicToc("Copy triangular", debug=self.params.debug):
            # Copy lower(fC) to upper(fC):  upper(fC) = T.
            copy_triang(self.fC, upper=False)

        if self._use_cuda:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC, upper=True, use_cuda=self._use_cuda, opt=self.params)
        else:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC, upper=False, use_cuda=self._use_cuda, opt=self.params)

        with TicToc("Cholesky 2", debug=self.params.debug):
            # lower(fC) = 1/M * T@T.T
            self.fC = mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / M)
            # lower(fC) = 1/M * T@T.T + lambda * I
            inplace_add_diag(self.fC, self._lambda)
            # Cholesky on lower(fC) : lower(fC) = A.T
            self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                    use_cuda=self._use_cuda, opt=self.params)
            self.dA = C.diag()
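
Summarizing the in-code comments above, init builds the two triangular factors of the Falkon preconditioner from the M x M Nystroem kernel matrix kMM = kernel(X, X):

    T = chol(kMM + eps*M*I)             # first potrf: lower(fC) = T.T
    A = chol(1/M * T@T.T + lambda*I)    # copy_triang + lauum + scaling + second potrf: lower(fC) = A.T

The diagonals dT and dA are saved separately because both factors share the storage of fC, so the second factorization overwrites part of the first.
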
Example #4
    def test_cuda_matmul(self, mat1, mat2, expected):
        dev = torch.device("cuda:0")
        out = create_fortran(expected.shape, expected.dtype, dev)
        mat1_csr = SparseTensor.from_scipy(
            scipy.sparse.csr_matrix(mat1)).to(device=dev)
        mat2_csr = SparseTensor.from_scipy(
            scipy.sparse.csr_matrix(mat2)).to(device=dev)
        sparse_matmul(mat1_csr, mat2_csr, out)

        torch.testing.assert_allclose(out.cpu(), expected)
Example #5
def cuda_trsm(A: torch.Tensor,
              v: torch.Tensor,
              alpha: float,
              lower: int,
              transpose: int,
              stream: Optional[torch.cuda.Stream] = None) -> torch.Tensor:
    if not is_f_contig(A, strict=False):
        raise ValueError("A must be f-contiguous for CUDA TRSM to work.")
    if not check_same_device(A, v):
        raise ValueError("A and v must be on the same CUDA device.")
    if not A.is_cuda:
        raise ValueError("A and v must be CUDA tensors!")

    device = A.device
    s = stream
    if stream is None:
        s = torch.cuda.current_stream(device=device)
    cublas_hdl = cublas_handle(device.index)
    trsm_fn = choose_fn(A.dtype, cublasDtrsm, cublasStrsm, "TRSM")

    # noinspection PyProtectedMember
    with torch.cuda.device(device), torch.cuda.stream(s), cublas_stream(
            cublas_hdl, s._as_parameter_):
        # Deal with copying v, which may not be F-contiguous.
        vF = create_fortran(v.size(), v.dtype, device)
        if is_f_contig(v, strict=False):
            # We can just make a copy of v
            vF.copy_(v)
            # Sync is necessary here for correctness. Not sure why! TODO: Is it still needed?
            s.synchronize()
        else:
            vF = cuda_transpose(input=v, output=vF.T).T

        uplo = 'L' if lower else 'U'
        trans = 'T' if transpose else 'N'
        trsm_fn(cublas_hdl,
                side='L',
                uplo=uplo,
                trans=trans,
                diag='N',
                m=vF.shape[0],
                n=vF.shape[1],
                alpha=alpha,
                A=A.data_ptr(),
                lda=A.stride(1),
                B=vF.data_ptr(),
                ldb=vF.stride(1))
        if is_f_contig(v, strict=False):
            vout = vF
        else:
            vout = create_C(v.size(), v.dtype, device)
            vout = cuda_transpose(input=vF, output=vout.T).T
    return vout
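
A small self-contained check of cuda_trsm (a sketch with made-up shapes): TRSM solves op(A) @ X = alpha * B for triangular A, so with lower=1, transpose=0 and alpha=1.0 the result should satisfy A @ x ≈ v.

import torch

dev = torch.device("cuda:0")
M = 128
# torch.triu(...) is C-contiguous and upper-triangular, so its transpose is an
# F-contiguous lower-triangular view; the large diagonal keeps the solve well conditioned.
A = (torch.triu(torch.rand(M, M, dtype=torch.float64, device=dev))
     + M * torch.eye(M, dtype=torch.float64, device=dev)).T
v = torch.rand(M, 4, dtype=torch.float64, device=dev)
x = cuda_trsm(A, v, alpha=1.0, lower=1, transpose=0)
assert torch.allclose(A @ x, v)
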
Example #6
def cuda_trsm(A: torch.Tensor, v: torch.Tensor, alpha: float, lower: int,
              transpose: int) -> torch.Tensor:
    if not is_f_contig(A, strict=False):
        raise ValueError("A must be f-contiguous for CUDA TRSM to work.")
    if not check_same_device(A, v):
        raise ValueError("A and v must be on the same CUDA device.")
    if not A.is_cuda:
        raise ValueError("A and v must be CUDA tensors!")

    s = torch.cuda.Stream(device=A.device)
    cublas_hdl = cublas_handle(A.device.index)
    trsm_fn = choose_fn(A.dtype, cublasDtrsm, cublasStrsm, "TRSM")

    with torch.cuda.device(A.device), torch.cuda.stream(s), cublas_stream(
            cublas_hdl, s._as_parameter_):
        # Deal with copying v, which may not be F-contiguous.
        vF = create_fortran(v.size(), v.dtype, v.device)
        if is_f_contig(v, strict=False):
            # We can just make a copy of v
            vF.copy_(v)
        else:
            vF = cuda_transpose(input=v, output=vF.T).T

        uplo = 'L' if lower else 'U'
        trans = 'T' if transpose else 'N'
        trsm_fn(cublas_hdl,
                side='L',
                uplo=uplo,
                trans=trans,
                diag='N',
                m=vF.shape[0],
                n=vF.shape[1],
                alpha=alpha,
                A=A.data_ptr(),
                lda=A.stride(1),
                B=vF.data_ptr(),
                ldb=vF.stride(1))
        if not is_f_contig(v, strict=False):
            vout = create_C(v.size(), v.dtype, v.device)
            vout = cuda_transpose(input=vF, output=vout.T).T
        else:
            vout = vF
        s.synchronize()
    return vout
Example #7
def par_lauum_f_lower(A: torch.Tensor, block_allocs: List[BlockAlloc],
                      my_rows: List[int], barrier: threading.Barrier,
                      device_id: int, cublas_handle, independent_output: bool):
    N = A.shape[0]

    lauum_fn = choose_fn(A.dtype, scll.dlauum, scll.slauum, "Lapack LAUUM")
    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    s1 = torch.cuda.Stream(device=tc_device)
    s2 = torch.cuda.Stream(device=tc_device)
    cublasSetStream(cublas_handle, s1._as_parameter_)

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1):
        # Preallocate 2 columns
        whole_col_b = create_fortran((A.shape[0], max_block_size), A.dtype,
                                     tc_device)
        whole_col_r = create_fortran((A.shape[0], max_block_size), A.dtype,
                                     tc_device)
        temp_bb = create_fortran((max_block_size, max_block_size),
                                 A.dtype,
                                 'cpu',
                                 pin_memory=True)

        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                col_b = copy_to_device(N - b_start, bb.length, A, b_start,
                                       bb.start, whole_col_b, 0, 0, s1)
            except ValueError:
                pass  # all of `my_rows` are smaller than `b`: no column to load here.
            if not independent_output:
                barrier.wait()

            for r in my_rows:
                if r < b:
                    continue
                if r == b:
                    # SYRK on g_b[bb.length:, :] with output replacing g_b[:bb.length, :]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if b_start + bb.length < N:
                        syrk_fn(cublas_handle,
                                uplo='L',
                                trans='T',
                                n=bb.length,
                                k=col_b.shape[0] - bb.length,
                                alpha=1.0,
                                A=col_b[bb.length:, :].data_ptr(),
                                lda=col_b.stride(1),
                                beta=0.0,
                                C=col_b.data_ptr(),
                                ldc=col_b.stride(1))
                    # CPU LAUUM on A[bb.start:bb.end, bb.start:bb.end]. This is a bit messy, should do cleanup.
                    Abb = A[bb.start:bb.end, bb.start:bb.end]  # L\U
                    if independent_output:
                        Abb_np = Abb.numpy().copy(order="F")
                        # Make symmetric: L\L
                        copy_triang(Abb_np, upper=False)
                        uu, info = lauum_fn(Abb_np, lower=1,
                                            overwrite_c=True)  # LAU\L
                        Abb.copy_(torch.from_numpy(uu.T))  # L\LAU
                    else:
                        uu, info = lauum_fn(Abb.numpy(),
                                            lower=1,
                                            overwrite_c=False)  # LAU\L
                        if b_start + bb.length < N:
                            zero_triang(uu, upper=True)
                        Abb.copy_(torch.from_numpy(uu))
                    if b_start + bb.length < N:
                        # It is IMPORTANT to do the copy on s1 and then sync it.
                        tbb = copy_to_host(bb.length, bb.length, col_b, 0, 0,
                                           temp_bb, 0, 0, s1)
                        s1.synchronize()
                        if independent_output:
                            Abb.add_(torch.triu(tbb.T))
                        else:
                            Abb.add_(tbb)
                else:  # r > b
                    br = block_allocs[r]

                    # Load column r. Since r > b this column will be shorter than column b
                    col_r = copy_to_device(N - br.start, br.length, A,
                                           br.start, br.start, whole_col_r, 0,
                                           0, s1)
                    # Restrict column b to only the last 'r' rows
                    ccb = col_b[br.start - b_start:, :]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    #         and cur_g_b[0:br.length, :]
                    #         output is a r*b matrix and should be stored in a separate g_out block
                    # Could store output in the first rows of g_b
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(handle=cublas_handle,
                            side='L',
                            uplo='L',
                            trans='T',
                            diag='N',
                            m=br.length,
                            n=bb.length,
                            alpha=1.0,
                            A=col_r.data_ptr(),
                            lda=col_r.stride(1),
                            B=ccb.data_ptr(),
                            ldb=ccb.stride(1),
                            C=ccb.data_ptr(),
                            ldc=ccb.stride(1))

                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    #         output  is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle,
                                transa='T',
                                transb='N',
                                m=br.length,
                                n=bb.length,
                                k=col_r.shape[0] - br.length,
                                alpha=1.0,
                                A=col_r[br.length:, :].data_ptr(),
                                lda=col_r.stride(1),
                                B=ccb[br.length:, :].data_ptr(),
                                ldb=ccb.stride(1),
                                beta=1.0,
                                C=ccb.data_ptr(),
                                ldc=ccb.stride(1))
                    # Copy back to A[r, b]
                    if independent_output:
                        _temp_cpu = copy_to_host(br.length, bb.length, ccb, 0,
                                                 0, temp_bb, 0, 0, s1)
                        s1.synchronize()
                        A[bb.start:bb.end, br.start:br.end].copy_(_temp_cpu.T)
                    else:
                        s1.synchronize()
                        copy_to_host(br.length, bb.length, ccb, 0, 0, A,
                                     br.start, bb.start, s2)
            s2.synchronize()
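
For intuition, the blocked SYRK / LAUUM / TRMM / GEMM choreography above computes, one GPU-sized column at a time, what the following dense reference does in one shot (a sketch for the default, non independent_output convention, in which only the lower triangle of A is overwritten):

import torch

def lauum_lower_reference(A: torch.Tensor) -> torch.Tensor:
    # With L = tril(A), overwrite lower(A) with tril(L.T @ L) and leave the
    # strict upper triangle of A untouched.
    L = torch.tril(A)
    return torch.tril(L.T @ L) + torch.triu(A, diagonal=1)
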
Example #8
def par_lauum_c_lower(A: torch.Tensor, block_allocs: List[BlockAlloc],
                      my_rows: List[int], barrier: threading.Barrier,
                      device_id: int, cublas_handle, independent_output: bool):
    N = A.shape[0]
    dts = sizeof_dtype(A.dtype)

    lauum_fn = choose_fn(A.dtype, scll.dlauum, scll.slauum, "Lapack LAUUM")
    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    s1 = torch.cuda.Stream(device=tc_device)
    s2 = torch.cuda.Stream(device=tc_device)
    s1_cuda, s2_cuda = s1._as_parameter_, s2._as_parameter_
    cublasSetStream(cublas_handle, s1_cuda)

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1):
        # Preallocate 2 block-columns on the GPU; the single square block (temp_bb) is a pinned CPU buffer
        whole_col_b = create_fortran((A.shape[0] * max_block_size, ), A.dtype,
                                     tc_device)
        whole_col_r = create_fortran((A.shape[0] * max_block_size, ), A.dtype,
                                     tc_device)
        temp_bb = create_fortran((max_block_size, max_block_size),
                                 A.dtype,
                                 'cpu',
                                 pin_memory=True).T
        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                cuda_memcpy2d_async(dst=whole_col_b.data_ptr(),
                                    dpitch=max_block_size * dts,
                                    src=A[b_start, bb.start].data_ptr(),
                                    spitch=A.shape[1] * dts,
                                    width=bb.length * dts,
                                    height=N - b_start,
                                    stream=s1_cuda)
            except ValueError:
                # all of `my_rows` are smaller than `b`.
                pass
            if not independent_output:
                barrier.wait()

            for r in my_rows:
                if r < b:
                    continue
                if r == b:
                    is_last_row = b_start + bb.length == N
                    # SYRK on g_b[bb.length:, :] with output replacing g_b[:bb.length, :]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if not is_last_row:
                        syrk_fn(cublas_handle,
                                uplo='U',
                                trans='N',
                                n=bb.length,
                                k=N - b_start - bb.length,
                                alpha=1.0,
                                A=whole_col_b[bb.length *
                                              max_block_size:].data_ptr(),
                                lda=max_block_size,
                                beta=0.0,
                                C=whole_col_b.data_ptr(),
                                ldc=max_block_size)
                    # Run LAUUM on CPU on Abb.T (transpose because LAPACK works in F-order)
                    # Result will be on upper(uu). So if we copy back to lower(A), we must copy
                    # back uu.T -- otherwise we should copy back uu directly.
                    Abb = A[bb.start:bb.end, bb.start:bb.end]
                    if independent_output:
                        Abb_np = Abb.T.numpy().copy(order="F")  # U\L
                        copy_triang(Abb_np, upper=True)  # L\L
                        uu, info = lauum_fn(Abb_np, lower=1,
                                            overwrite_c=True)  # LAU\L
                        Abb.copy_(torch.from_numpy(uu.T))  # L \ LAU
                    else:
                        uu, info = lauum_fn(Abb.T.numpy(),
                                            lower=0,
                                            overwrite_c=False)
                        # Zeroing must happen if the SYRK output is to be added: otherwise the
                        # non-processed part of Abb (i.e. upper(Abb) if not independent_output)
                        # will be multiplied by 2.
                        if not is_last_row:
                            zero_triang(uu, upper=False)
                        Abb.copy_(torch.from_numpy(uu.T))

                    if not is_last_row:
                        cuda_memcpy2d_async(dst=temp_bb.data_ptr(),
                                            dpitch=max_block_size * dts,
                                            src=whole_col_b.data_ptr(),
                                            spitch=max_block_size * dts,
                                            width=bb.length * dts,
                                            height=bb.length,
                                            stream=s1_cuda)
                        # TODO: Check whether this fails if the synchronize is removed.
                        s1.synchronize()
                        if independent_output:
                            Abb.add_(
                                torch.triu(temp_bb[:bb.length, :bb.length].T))
                        else:
                            Abb.add_(temp_bb[:bb.length, :bb.length])
                else:  # r > b
                    br = block_allocs[r]

                    # Load column r. Since r > b this column will be shorter than column b
                    cuda_memcpy2d_async(dst=whole_col_r.data_ptr(),
                                        dpitch=max_block_size * dts,
                                        src=A[br.start, br.start].data_ptr(),
                                        spitch=A.shape[1] * dts,
                                        width=br.length * dts,
                                        height=N - br.start,
                                        stream=s1_cuda)
                    #s1.synchronize()
                    # Restrict column b to only the last 'r' rows
                    ccb = whole_col_b[(br.start - b_start) * max_block_size:]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    #         and cur_g_b[0:br.length, :]
                    #         output is a r*b matrix and should be stored in a separate g_out block
                    # Could store output in the first rows of g_b
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(handle=cublas_handle,
                            side='R',
                            uplo='U',
                            trans='T',
                            diag='N',
                            m=bb.length,
                            n=br.length,
                            alpha=1.0,
                            A=whole_col_r.data_ptr(),
                            lda=max_block_size,
                            B=ccb.data_ptr(),
                            ldb=max_block_size,
                            C=ccb.data_ptr(),
                            ldc=max_block_size)

                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    #         output  is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle,
                                transa='N',
                                transb='T',
                                m=bb.length,
                                n=br.length,
                                k=N - br.start - br.length,
                                alpha=1.0,
                                A=ccb[br.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                B=whole_col_r[br.length *
                                              max_block_size:].data_ptr(),
                                ldb=max_block_size,
                                beta=1.0,
                                C=ccb.data_ptr(),
                                ldc=max_block_size)

                    # Copy back to A[r, b]
                    if independent_output:
                        # Copy must be transposed, copy to temp_bb first.
                        cublasGetMatrixAsync(rows=bb.length,
                                             cols=br.length,
                                             elem_size=dts,
                                             A=ccb.data_ptr(),
                                             lda=max_block_size,
                                             B=temp_bb.data_ptr(),
                                             ldb=max_block_size,
                                             stream=s1_cuda)
                        s1.synchronize()
                        A[bb.start:bb.end, br.start:br.end].copy_(
                            temp_bb[:br.length, :bb.length].T)
                    else:
                        s1.synchronize()
                        cublasGetMatrixAsync(rows=bb.length,
                                             cols=br.length,
                                             elem_size=dts,
                                             A=ccb.data_ptr(),
                                             lda=max_block_size,
                                             B=A[br.start,
                                                 bb.start].data_ptr(),
                                             ldb=A.shape[0],
                                             stream=s2_cuda)
            s2.synchronize()
Example #9
def sparse_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()

    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, out = a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk : N + 2*D*N*density
    # X2_chunk : D + 2*D*M*density (because it is transposed)
    # sparse_out : N + 2*N*M*(density) (assume density = 1)
    # ker_gpu  : M*N
    # mmv_gpu  : N*T
    # v_gpu    : M*T
    # Other: GPU buffer
    n, m = select_dim_over_m(
        maxM=mtot,
        maxN=ntot,
        tot=avail_mem,
        coef_nm=3,
        coef_n=2 + 2 * dtot * X1.density + T,
        coef_m=2 * dtot * X2.density + T,
        rest=dtot,
    )
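    # Assuming select_dim_over_m enforces a budget of roughly
    #   coef_nm*n*m + coef_n*n + coef_m*m + rest <= tot   (in elements of `dtype`),
    # the terms mirror the buffer list above: the n*m term covers ker_gpu plus the dense
    # sparse output (counted at density 1), the per-n terms cover X1_chunk, the N-sized
    # part of the sparse output and mmv_gpu, the per-m terms cover X2_chunk and v_gpu,
    # and `rest` is the D-sized part of X2_chunk.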

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        v_gpu = v.to(device=ddev)  # M x T
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)
        # ker_gpu should be fortran-ordered due to cusparse csr2dense function
        ker_gpu = create_fortran((n, m), dtype=dtype, device=ddev)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            cur_mmv_gpu = mmv_gpu[:ic]  # n x T
            cur_mmv_gpu.fill_(0.0)

            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            for j in range(0, mtot, m):
                jc = min(m, mtot - j)

                X2_chunk = X2.narrow_rows(j, jc)
                # Prepare sparse on CPU
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)

                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_chunk_d = SparseTensor.from_scipy(
                    X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                    .index_to_int() \
                    .to(device=ddev)

                cur_ker_gpu = ker_gpu[:ic, :jc]
                cur_ker_gpu.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                cur_ker_gpu = kernel._apply_sparse(X1_chunk_d, X2_chunk_d,
                                                   cur_ker_gpu)
                cur_ker_gpu = kernel._finalize(cur_ker_gpu, ddd)

                # Multiply by the vector v
                cur_mmv_gpu.addmm_(cur_ker_gpu, v_gpu.narrow(0, j, jc))
                del ddd, X2_chunk, X2_chunk_d

            # send result to CPU
            copy_to_host_noorder(ic, T, cur_mmv_gpu, 0, 0, out, i, 0)
            del X1_chunk, X1_chunk_d
    return out
Example #10
def sparse_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, w, out = a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    N, D = X1.shape
    M = X2.size(0)
    if v is None:
        T = w.size(1)
    else:
        T = v.size(1)

    # Memory needs:
    # X1_chunk : ntot + 2 * D * ntot * density
    # X2       : dtot + 2 * D * M * density (because it is transposed)
    # sparse_out : ntot + 2 * ntot * M * density (assume here density = 1)
    # ker_gpu  : M * ntot
    # w_gpu    : ntot * T
    # v_gpu    : M * T
    # out_gpu  : M * T
    avail_mem = max_mem / sizeof_dtype(dtype)
    den = 2 * D * X1.density + 2 + 3 * M + T
    sub = D + 2 * D * M * X2.density + M * T
    if v is not None:
        sub += M * T
    n = (avail_mem - sub) / den
    n = min(int(n), N)
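    # `den` is the per-row cost of an X1 chunk (in elements): the sparse X1_chunk rows,
    # the dense kernel / sparse-output buffers with their M columns counted at density 1,
    # and a row of w_gpu. `sub` is the fixed cost of the transposed X2, out_gpu and
    # (optionally) v_gpu; n is then the largest row-chunk that fits the remaining budget.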
    if n < 1:
        raise MemoryError("Not enough memory to run sparse dfmmv")

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Initialize GPU data
        w_gpu = create_same_stride((n, T), out, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        ker_gpu = create_fortran((n, M), dtype, ddev)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T

        X2_d = SparseTensor.from_scipy(
            X2.transpose_csc().to_scipy().tocsr(copy=False)) \
            .index_to_int() \
            .to(device=ddev)

        for i in range(0, N, n):
            ic = min(n, N - i)
            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)

            ker_chunk = ker_gpu[:ic]
            ker_chunk.fill_(0.0)

            # TODO: This is wasteful (X2 will be prepared many times over)
            ddd = kernel._prepare_sparse(X1_chunk, X2)
            ker_chunk = kernel._apply_sparse(X1_chunk_d, X2_d, ker_chunk)
            ker_chunk = kernel._finalize(ker_chunk, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)

            if v is not None:
                c_g_w.addmm_(ker_chunk, v_gpu)
            out_gpu.addmm_(ker_chunk.T, c_g_w)
            del ddd, X1_chunk, X1_chunk_d

        if not out.is_cuda:
            copy_to_device_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
Example #11
def par_lauum_f_lower(A: torch.Tensor,
                      block_allocs: List[BlockAlloc],
                      my_rows: List[int],
                      barrier: threading.Barrier,
                      device_id: int,
                      cublas_handle,
                      independent_output: bool):
    N = A.shape[0]
    is_cuda = A.device.type == "cuda"

    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    s1 = torch.cuda.Stream(device=tc_device)
    s3 = torch.cuda.Stream(device=tc_device)

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1), cublas_stream(cublas_handle, s1._as_parameter_):
        # Pre allocate b-col, syrk-out, lauum-out
        mem_needed = N * max_block_size + 2 * (max_block_size ** 2)
        if not is_cuda:
            # Also pre alloc r-col
            mem_needed += N * max_block_size
        f_gpu = torch.empty(size=(mem_needed,), dtype=A.dtype, device=tc_device)
        f_offset = 0
        whole_col_b, f_offset = _extract_flat(f_gpu, (N, max_block_size), other=A, offset=f_offset)
        syrk_out, f_offset = _extract_flat(f_gpu, (max_block_size, max_block_size), other=A, offset=f_offset)
        lauum_out, f_offset = _extract_flat(f_gpu, (max_block_size, max_block_size), other=A, offset=f_offset)
        if not is_cuda:
            temp_bb = create_fortran((max_block_size, max_block_size), A.dtype, 'cpu', pin_memory=True)
            whole_col_r, f_offset = _extract_flat(f_gpu, (N, max_block_size), other=A, offset=f_offset)
        syrk_out.fill_(0.0)

        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                if is_cuda:
                    col_b = whole_col_b[b_start:, :bb.length]
                    col_b.copy_(A[b_start:N, bb.start:bb.end])
                else:
                    col_b: torch.Tensor = copy_to_device(
                        N - b_start, bb.length, A, b_start, bb.start, whole_col_b, 0, 0, s1)
            except ValueError:
                pass  # No column here
            if not independent_output:
                # wait for copy to device to succeed. After barrier other threads may modify
                # the part of col_b which we need!
                s1.synchronize()
                barrier.wait()

            for r in my_rows:
                if r == b:
                    # SYRK on col_b[bb.length:, :] with output into syrk_out[:bb.length, :bb.length]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if b_start + bb.length < N:
                        cur_syrk_out = syrk_out[:bb.length, :bb.length]
                        syrk_fn(cublas_handle, uplo='L', trans='T',
                                n=bb.length, k=col_b.shape[0] - bb.length,
                                alpha=1.0, A=col_b[bb.length:, :].data_ptr(), lda=col_b.stride(1),
                                beta=0.0, C=cur_syrk_out.data_ptr(), ldc=syrk_out.stride(1))

                    with torch.cuda.stream(s3):
                        if independent_output:
                            s1.synchronize()  # we need col_b to be loaded
                        cur_lauum_out = lauum_out[:bb.length, :bb.length]
                        # Note that col_b[:bb.length, :bb.length] == Abb
                        if independent_output:
                            # In the independent output case we need to preserve tril(Abb) instead!
                            cur_lauum_out.copy_(col_b[:bb.length, :bb.length].T)
                        else:
                            # In normal case we need triu(Abb) to be preserved in the upper triangle of lauum_out
                            cur_lauum_out.copy_(col_b[:bb.length, :bb.length])

                        # LAUUM on col_b[:bb.length, :bb.length], into lauum_out[:bb.length, :bb.length]
                        cuda_lauum(n=bb.length, A=col_b[:bb.length, :bb.length], lda=col_b.stride(1),
                                   B=cur_lauum_out, ldb=max_block_size, lower=True)
                    s1.wait_stream(s3)  # all subsequent work will need cur_lauum_out

                    # Add outputs of SYRK and LAUUM (only if SYRK was performed)
                    if b_start + bb.length < N:
                        cur_lauum_out.add_(cur_syrk_out)

                    # Copy lauum_out into the original matrix, while preserving the other side
                    # of the triangular matrix. This depends on the `independent_output` flag.
                    Abb = A[bb.start:bb.end, bb.start:bb.end]
                    if independent_output:
                        # cuda and non-cuda cases, since we have different orderings.
                        Abb.copy_(cur_lauum_out.T)
                    elif is_cuda:
                        Abb.copy_(cur_lauum_out)
                    else:
                        copy_to_host(bb.length, bb.length, cur_lauum_out, 0, 0, Abb, 0, 0, s=s1)
                elif r > b:
                    br = block_allocs[r]

                    # Load column r. Since r > b this column will be shorter than column b
                    if is_cuda:  # If col_r is already in GPU no copy needed.
                        col_r = A[br.start:N, br.start:br.end]
                    else:
                        col_r = copy_to_device(N - br.start, br.length, A, br.start, br.start,
                                               whole_col_r, 0, 0, s1)
                    # Restrict column b to only the last 'r' rows
                    ccb = col_b[br.start - b_start:, :]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    #         and cur_g_b[0:br.length, :]
                    #         output is a r*b matrix stored in the first rows of ccb
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(
                        handle=cublas_handle,
                        side='L', uplo='L', trans='T', diag='N',
                        m=br.length, n=bb.length,
                        alpha=1.0, A=col_r.data_ptr(), lda=col_r.stride(1),
                        B=ccb.data_ptr(), ldb=ccb.stride(1),
                        C=ccb.data_ptr(), ldc=ccb.stride(1))

                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    #         output  is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle,
                                transa='T', transb='N',
                                m=br.length, n=bb.length, k=col_r.shape[0] - br.length,
                                alpha=1.0, A=col_r[br.length:, :].data_ptr(), lda=col_r.stride(1),
                                B=ccb[br.length:, :].data_ptr(), ldb=ccb.stride(1),
                                beta=1.0, C=ccb.data_ptr(), ldc=ccb.stride(1))
                    # Copy back to A[r, b]
                    if independent_output:
                        if is_cuda:
                            A[bb.start:bb.end, br.start:br.end].copy_(ccb[:br.length, :bb.length].T)
                        else:
                            _temp_cpu = copy_to_host(br.length, bb.length, ccb, 0, 0, temp_bb, 0, 0, s1)
                            s1.synchronize()  # must wait for data to be onto CPU.
                            A[bb.start:bb.end, br.start:br.end].copy_(_temp_cpu.T)
                    elif is_cuda:
                        A[br.start:br.end, bb.start:bb.end].copy_(ccb[:br.length, :bb.length])
                    else:
                        copy_to_host(br.length, bb.length, ccb, 0, 0, A, br.start, bb.start, s1)
            s1.synchronize()
Example #12
def par_lauum_c_lower(A: torch.Tensor,
                      block_allocs: List[BlockAlloc],
                      my_rows: List[int],
                      barrier: threading.Barrier,
                      device_id: int,
                      cublas_handle,
                      independent_output: bool):
    N = A.shape[0]
    dts = sizeof_dtype(A.dtype)
    is_cuda = A.device.type == "cuda"

    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    s1 = torch.cuda.Stream(device=tc_device)
    s3 = torch.cuda.Stream(device=tc_device)
    s1_cuda = s1._as_parameter_

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1), cublas_stream(cublas_handle, s1_cuda):
        if not is_cuda:
            temp_bb = create_fortran((max_block_size, max_block_size), A.dtype, 'cpu', pin_memory=True).T
        # Pre allocate r-col, b-col, syrk-out, lauum-out
        mem_needed = 2 * N * max_block_size + 2 * (max_block_size ** 2)
        f_gpu = torch.empty(size=(mem_needed,), dtype=A.dtype, device=tc_device)
        whole_col_b = f_gpu[:N * max_block_size]
        whole_col_r = f_gpu[N * max_block_size: 2 * N * max_block_size]
        syrk_out = extract_fortran(f_gpu, size=(max_block_size, max_block_size), offset=2 * N * max_block_size)
        lauum_out = extract_fortran(f_gpu, size=(max_block_size, max_block_size), offset=2 * N * max_block_size + max_block_size ** 2)
        syrk_out.fill_(0.0)

        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                cuda_memcpy2d_async(
                    dst=whole_col_b.data_ptr(), dpitch=max_block_size * dts,
                    src=A[b_start, bb.start].data_ptr(), spitch=A.shape[1] * dts,
                    width=bb.length * dts, height=N - b_start, stream=s1_cuda)
            except ValueError:  # all of `my_rows` are smaller than `b`.
                pass
            if not independent_output:
                # wait for copy to device to succeed. After barrier other threads may modify
                # the part of col_b which we need!
                s1.synchronize()
                barrier.wait()

            for r in my_rows:
                if r < b:
                    continue
                if r == b:
                    is_last_row = b_start + bb.length == N
                    # SYRK on g_b[bb.length:, :] with output replacing g_b[:bb.length, :]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if not is_last_row:
                        syrk_fn(cublas_handle, uplo='U', trans='N',
                                n=bb.length, k=N - b_start - bb.length,
                                alpha=1.0, A=whole_col_b[bb.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                beta=0.0, C=syrk_out.data_ptr(), ldc=max_block_size)

                    with torch.cuda.stream(s3):
                        if independent_output:
                            s1.synchronize()  # we need col_b to be loaded
                        # Lower LAUUM for C-contig is equal to upper LAUUM for F-contig
                        c_lauum_in = whole_col_b[:bb.length * max_block_size].view(bb.length, max_block_size)[:, :bb.length]
                        c_lauum_out = lauum_out[:bb.length, :bb.length]

                        if independent_output:
                            c_lauum_out.copy_(c_lauum_in)
                        else:
                            c_lauum_out.copy_(c_lauum_in.T)
                        cuda_lauum(n=bb.length, A=c_lauum_in, lda=max_block_size, B=c_lauum_out, ldb=max_block_size, lower=False)

                    s1.wait_stream(s3)  # all subsequent work on s1 will need cur_lauum_out
                    if not is_last_row:
                        c_lauum_out.add_(syrk_out[:bb.length, :bb.length])

                    # copy back whole_col_b into Abb
                    # Now lauum_out is F-contig, while Abb is C-contig
                    Abb = A[bb.start:bb.end, bb.start:bb.end]
                    if independent_output:
                        Abb.copy_(c_lauum_out)
                    else:
                        Abb.copy_(c_lauum_out.T)
                else:  # r > b
                    br = block_allocs[r]

                    # Load column r. Since r > b this column will be shorter than column b
                    cuda_memcpy2d_async(
                        dst=whole_col_r.data_ptr(), dpitch=max_block_size * dts,
                        src=A[br.start, br.start].data_ptr(), spitch=A.shape[1] * dts,
                        width=br.length * dts, height=N - br.start, stream=s1_cuda)
                    # Restrict column b to only the last 'r' rows
                    ccb = whole_col_b[(br.start - b_start) * max_block_size:]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    #         and cur_g_b[0:br.length, :]
                    #         output is a r*b matrix and stored in first rows of ccb
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(
                        handle=cublas_handle,
                        side='R', uplo='U', trans='T', diag='N',
                        m=bb.length, n=br.length,
                        alpha=1.0, A=whole_col_r.data_ptr(), lda=max_block_size,
                        B=ccb.data_ptr(), ldb=max_block_size,
                        C=ccb.data_ptr(), ldc=max_block_size)

                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    #         output  is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle, transa='N', transb='T',
                                m=bb.length, n=br.length, k=N - br.start - br.length,
                                alpha=1.0,
                                A=ccb[br.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                B=whole_col_r[br.length * max_block_size:].data_ptr(),
                                ldb=max_block_size,
                                beta=1.0, C=ccb.data_ptr(), ldc=max_block_size)

                    # Copy back to A[r, b]
                    if is_cuda:
                        ccb_square = ccb[:max_block_size * br.length].view(br.length, max_block_size)
                        if independent_output:
                            A[bb.start:bb.end, br.start:br.end].copy_(ccb_square[:br.length, :bb.length].T)
                        else:
                            A[br.start:br.end, bb.start:bb.end].copy_(ccb_square[:br.length, :bb.length])
                    elif independent_output:
                        # Copy must be transposed, copy to temp_bb first.
                        cublasGetMatrixAsync(
                            rows=bb.length, cols=br.length, elem_size=dts,
                            A=ccb.data_ptr(), lda=max_block_size,
                            B=temp_bb.data_ptr(), ldb=max_block_size, stream=s1_cuda)
                        s1.synchronize()
                        A[bb.start:bb.end, br.start:br.end].copy_(temp_bb[:br.length, :bb.length].T)
                    else:
                        cublasGetMatrixAsync(
                            rows=bb.length, cols=br.length, elem_size=dts,
                            A=ccb.data_ptr(), lda=max_block_size,
                            B=A[br.start, bb.start].data_ptr(), ldb=A.shape[0],
                            stream=s1_cuda)
            s1.synchronize()
Example #13
    def init(self, X: Union[torch.Tensor, SparseTensor], Y: torch.Tensor,
             alpha: torch.Tensor, penalty: float, N: int) -> None:
        """Initialize the preconditioner matrix.

        This method must be called before the preconditioner becomes usable.

        Parameters
        ----------
        X : MxD tensor
            Matrix of Nystroem centers
        Y : Mx1 tensor
            Vector of targets corresponding to the Nystroem centers `X`
        alpha : Mx1 tensor
            Parameter vector (of the same dimension as `Y`) which gives the current
            solution to the optimization problem.
        penalty : float
            Regularization amount
        N : int
            Number of points in the full data-set.

        Notes
        -----
        If `debug=True` is present in the options, this method will print a lot of extra
        information pertaining to the timings of the various preconditioner operations. This
        can be useful in understanding how the preconditioner works.
        """
        if Y.shape[1] != 1:
            raise ValueError(
                "Logistic preconditioner can only deal with 1D outputs.")

        dtype = X.dtype
        M = X.size(0)

        eps = self.params.pc_epsilon(dtype)

        if self.fC is None:
            # This is done only at the first iteration of the logistic-falkon algorithm
            # It sets the `T` variable from the paper (chol(kMM)) to the upper part of `self.fC`
            with TicToc("Kernel", debug=self.params.debug):
                if isinstance(X, torch.Tensor):
                    C = create_same_stride((M, M),
                                           X,
                                           dtype=dtype,
                                           device='cpu',
                                           pin_memory=self._use_cuda)
                else:  # If sparse tensor we need fortran for kernel calculation
                    C = create_fortran((M, M),
                                       dtype=dtype,
                                       device='cpu',
                                       pin_memory=self._use_cuda)
                self.kernel(X, X, out=C, opt=self.params)
            self.fC = C.numpy()
            if not is_f_contig(C):
                self.fC = self.fC.T

            with TicToc("Add diag", debug=self.params.debug):
                # Compute T: lower(fC) = T.T
                inplace_add_diag(self.fC, eps * M)
            with TicToc("Cholesky 1", debug=self.params.debug):
                self.fC = potrf_wrapper(self.fC,
                                        clean=True,
                                        upper=False,
                                        use_cuda=self._use_cuda,
                                        opt=self.params)
                # Save the diagonal which will be overwritten when computing A
                self.dT = C.diag()
            with TicToc("Copy triangular", debug=self.params.debug):
                # Copy lower(fC) to upper(fC):  upper(fC) = T.
                copy_triang(self.fC, upper=False)
        else:
            if not self._use_cuda:
                # Copying is not necessary for CUDA, since LAUUM will do the copying
                with TicToc("Copy triangular", debug=self.params.debug):
                    # Copy upper(fC) to lower(fC): lower(fC) = T.T
                    copy_triang(self.fC,
                                upper=True)  # does not copy the diagonal
            # Setting diagonal necessary for trmm
            inplace_set_diag(self.fC, self.dT)

        # Compute W
        with TicToc("TRMM", debug=self.params.debug):
            # T is on upper(fC). Compute T.T @ alpha
            alpha = self._trmm(alpha.clone())
        with TicToc("W (ddf)", debug=self.params.debug):
            W = self.loss.ddf(Y, alpha)
        with TicToc("W-Multiply", debug=self.params.debug):
            W.sqrt_()
            self.fC = vec_mul_triang(self.fC,
                                     W.numpy().reshape(-1),
                                     side=0,
                                     upper=False)

        if self._use_cuda:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC,
                                        upper=True,
                                        use_cuda=self._use_cuda,
                                        opt=self.params)
        else:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC,
                                        upper=False,
                                        use_cuda=self._use_cuda,
                                        opt=self.params)

        # NOTE: Here the multiplier is 1/N instead of the more common 1/M!
        mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / N)

        with TicToc("Add diag", debug=self.params.debug):
            # lower(fC) = 1/N * T@T.T + lambda * I
            inplace_add_diag(self.fC, penalty)

        with TicToc("Cholesky 2", debug=self.params.debug):
            # Cholesky on lower(fC) : lower(fC) = A.T
            self.fC = potrf_wrapper(self.fC,
                                    clean=False,
                                    upper=False,
                                    use_cuda=self._use_cuda,
                                    opt=self.params)
            self.dA = torch.from_numpy(self.fC).diag()
Example #14
    def init(self,
             X: Union[torch.Tensor, SparseTensor],
             weight_vec: Optional[torch.Tensor] = None):
        """Initialize the preconditioner matrix.

        This method must be called before the preconditioner can be used.

        Parameters
        ----------
        X : torch.Tensor
            The (M x D) matrix of Nystroem centers
        weight_vec
            An optional vector of size (M x 1) which is used for reweighted least-squares.
            This vector should contain the weights corresponding to the Nystrom centers.
        """
        if X.is_cuda and not self._use_cuda:
            raise RuntimeError(
                "use_cuda is set to False, but data is CUDA tensor. "
                "Check your options.")
        if weight_vec is not None and not check_same_device(X, weight_vec):
            raise ValueError(f"Weights and data are not on the same device "
                             f"({weight_vec.device}, {X.device})")
        if weight_vec is not None and weight_vec.shape[0] != X.shape[0]:
            raise ValueError(
                f"Weights and Nystrom centers should have the same first dimension. "
                f"Found instead {weight_vec.shape[0]}, {X.shape[0]}.")
        dtype = X.dtype
        dev = X.device
        eps = self.params.pc_epsilon(X.dtype)
        M = X.size(0)

        with TicToc("Kernel", debug=self.params.debug):
            if isinstance(X, torch.Tensor):
                C = create_same_stride((M, M),
                                       X,
                                       dtype=dtype,
                                       device=dev,
                                       pin_memory=self._use_cuda)
            else:  # If sparse tensor we need fortran for kernel calculation
                C = create_fortran((M, M),
                                   dtype=dtype,
                                   device=dev,
                                   pin_memory=self._use_cuda)
            self.kernel(X, X, out=C, opt=self.params)
        if not is_f_contig(C):
            C = C.T

        with TicToc("Cholesky 1", debug=self.params.debug):
            # Compute T: lower(fC) = T.T
            inplace_add_diag_th(C, eps * M)
            C = potrf_wrapper(C,
                              clean=False,
                              upper=False,
                              use_cuda=self._use_cuda,
                              opt=self.params)
            # Save the diagonal which will be overwritten when computing A
            self.dT = C.diag()

        with TicToc("Copy triangular", debug=self.params.debug):
            # Copy lower(fC) to upper(fC):  upper(fC) = T.
            copy_triang(C, upper=False)

        # Weighted least-squares needs to weight the A matrix. We can weigh once before LAUUM,
        # but since CUDA-LAUUM touches both sides of C, weighting before LAUUM will also modify
        # the matrix T. Therefore for CUDA inputs we weigh twice after LAUUM!
        if weight_vec is not None and not self._use_cuda:
            with TicToc("Weighting(CPU)", debug=self.params.debug):
                weight_vec.sqrt_()
                vec_mul_triang(C, weight_vec, side=1, upper=False)

        if self._use_cuda:
            with TicToc("LAUUM(CUDA)", debug=self.params.debug):
                # Product upper(fC) @ upper(fC).T, store in lower(fC) = T @ T.T
                C = lauum_wrapper(C,
                                  upper=True,
                                  use_cuda=self._use_cuda,
                                  opt=self.params)
        else:
            with TicToc("LAUUM(CPU)", debug=self.params.debug):
                # Product lower(fC).T @ lower(fC), store in lower(fC) = T @ T.T
                C = lauum_wrapper(C,
                                  upper=False,
                                  use_cuda=self._use_cuda,
                                  opt=self.params)

        if weight_vec is not None and self._use_cuda:
            with TicToc("Weighting(CUDA)", debug=self.params.debug):
                weight_vec.sqrt_()
                vec_mul_triang(C, weight_vec, side=0, upper=False)
                vec_mul_triang(C, weight_vec, side=1, upper=False)

        with TicToc("Cholesky 2", debug=self.params.debug):
            # lower(fC) = 1/M * T@T.T
            mul_triang(C, upper=False, preserve_diag=False, multiplier=1 / M)
            # lower(fC) = 1/M * T@T.T + lambda * I
            inplace_add_diag_th(C, self._lambda)
            # Cholesky on lower(fC) : lower(fC) = A.T
            C = potrf_wrapper(C,
                              clean=False,
                              upper=False,
                              use_cuda=self._use_cuda,
                              opt=self.params)
            self.dA = C.diag()

        self.fC = C
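
Following the comments in this method, a compact dense-math sketch of the factors that init builds. Here K stands for the M x M kernel matrix computed into C above, penalty plays the role of self._lambda, and the helper (its name, signature and the weighted branch, which mirrors my reading of the sqrt-weight scaling around LAUUM) is purely illustrative:

import torch
from typing import Optional

def dense_init_reference(K: torch.Tensor, penalty: float, eps: float,
                         weight_vec: Optional[torch.Tensor] = None):
    # T is the upper Cholesky factor of K + eps*M*I; the code keeps T.T in lower(fC).
    M = K.shape[0]
    I = torch.eye(M, dtype=K.dtype)
    T_t = torch.linalg.cholesky(K + eps * M * I)       # lower factor, equals T.T
    G = T_t.T @ T_t                                    # = T @ T.T (what LAUUM produces)
    if weight_vec is not None:                         # reweighted case: scale both sides by sqrt(w)
        sw = weight_vec.flatten().sqrt()
        G = sw[:, None] * G * sw[None, :]
    A_t = torch.linalg.cholesky(G / M + penalty * I)   # lower factor, equals A.T
    return T_t, A_t                                    # their diagonals match dT and dA above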
Example #15
0
def par_lauum_f_lower(A: torch.Tensor, block_allocs: List[BlockAlloc],
                      my_rows: List[int], barrier: threading.Barrier,
                      device_id: int, cublas_handle, independent_output: bool):
    N = A.shape[0]
    is_cuda = A.device.type == "cuda"

    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    s1 = torch.cuda.Stream(device=tc_device)
    s3 = torch.cuda.Stream(device=tc_device)

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1), cublas_stream(
            cublas_handle, s1._as_parameter_):
        # Preallocate 2 columns
        if not is_cuda:
            whole_col_b = create_fortran((A.shape[0], max_block_size), A.dtype,
                                         tc_device)
            whole_col_r = create_fortran((A.shape[0], max_block_size), A.dtype,
                                         tc_device)
        syrk_out = create_fortran((max_block_size, max_block_size), A.dtype,
                                  tc_device)
        lauum_out = create_fortran((max_block_size, max_block_size), A.dtype,
                                   tc_device)
        temp_bb = create_fortran((max_block_size, max_block_size),
                                 A.dtype,
                                 'cpu',
                                 pin_memory=True)

        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                if is_cuda:
                    col_b: torch.Tensor = A[b_start:N, bb.start:bb.end]
                else:
                    col_b: torch.Tensor = copy_to_device(
                        N - b_start, bb.length, A, b_start, bb.start,
                        whole_col_b, 0, 0, s1)
            except ValueError:
                pass  # No column here
            if not independent_output:
                barrier.wait()

            for r in my_rows:
                if r == b:
                    # SYRK on col_b[bb.length:, :] with output into syrk_out[:bb.length, :bb.length]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if b_start + bb.length < N:
                        cur_syrk_out = syrk_out[:bb.length, :bb.length]
                        syrk_fn(cublas_handle,
                                uplo='L',
                                trans='T',
                                n=bb.length,
                                k=col_b.shape[0] - bb.length,
                                alpha=1.0,
                                A=col_b[bb.length:, :].data_ptr(),
                                lda=col_b.stride(1),
                                beta=0.0,
                                C=cur_syrk_out.data_ptr(),
                                ldc=syrk_out.stride(1))

                    with torch.cuda.stream(s3):
                        cur_lauum_out = lauum_out[:bb.length, :bb.length]
                        # Note that col_b[:bb.length, :bb.length] == Abb
                        if independent_output:
                            # In the independent output case we need to preserve tril(Abb) instead!
                            cur_lauum_out.copy_(
                                col_b[:bb.length, :bb.length].T)
                        else:
                            # In normal case we need triu(Abb) to be preserved in the upper triangle of lauum_out
                            cur_lauum_out.copy_(col_b[:bb.length, :bb.length])

                        # LAUUM on col_b[:bb.length, :bb.length], into lauum_out[:bb.length, :bb.length]
                        cuda_lauum_lower(n=bb.length,
                                         A=col_b[:bb.length, :bb.length],
                                         lda=col_b.stride(1),
                                         B=cur_lauum_out,
                                         ldb=max_block_size)
                    s3.synchronize()

                    # Add outputs of SYRK and LAUUM (only if SYRK was performed)
                    if b_start + bb.length < N:
                        s1.synchronize()
                        cur_lauum_out.add_(cur_syrk_out)

                    # Copy lauum_out into the original matrix, while preserving the other side
                    # of the triangular matrix. This depends on the `independent_output` flag.
                    Abb = A[bb.start:bb.end, bb.start:bb.end]
                    if independent_output:
                        Abb.copy_(cur_lauum_out.T)
                    else:
                        copy_to_host(bb.length,
                                     bb.length,
                                     cur_lauum_out,
                                     0,
                                     0,
                                     Abb,
                                     0,
                                     0,
                                     s=s1)
                elif r > b:
                    br = block_allocs[r]

                    # Load column r. Since r > b this column will be shorter than column b
                    if is_cuda:
                        col_r = A[br.start:N, br.start:br.end]
                    else:
                        col_r = copy_to_device(N - br.start, br.length, A,
                                               br.start, br.start, whole_col_r,
                                               0, 0, s1)
                    # Restrict column b to only the last 'r' rows
                    ccb = col_b[br.start - b_start:, :]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    #         and cur_g_b[0:br.length, :]
                    #         output is a r*b matrix and should be stored in a separate g_out block
                    # Could store output in the first rows of g_b
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(handle=cublas_handle,
                            side='L',
                            uplo='L',
                            trans='T',
                            diag='N',
                            m=br.length,
                            n=bb.length,
                            alpha=1.0,
                            A=col_r.data_ptr(),
                            lda=col_r.stride(1),
                            B=ccb.data_ptr(),
                            ldb=ccb.stride(1),
                            C=ccb.data_ptr(),
                            ldc=ccb.stride(1))

                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    #         output  is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle,
                                transa='T',
                                transb='N',
                                m=br.length,
                                n=bb.length,
                                k=col_r.shape[0] - br.length,
                                alpha=1.0,
                                A=col_r[br.length:, :].data_ptr(),
                                lda=col_r.stride(1),
                                B=ccb[br.length:, :].data_ptr(),
                                ldb=ccb.stride(1),
                                beta=1.0,
                                C=ccb.data_ptr(),
                                ldc=ccb.stride(1))
                    # Copy back to A[r, b]
                    if independent_output:
                        if is_cuda:
                            A[bb.start:bb.end, br.start:br.end].copy_(
                                ccb[:br.length, :bb.length].T)
                        else:
                            _temp_cpu = copy_to_host(br.length, bb.length, ccb,
                                                     0, 0, temp_bb, 0, 0, s1)
                            s1.synchronize()
                            A[bb.start:bb.end,
                              br.start:br.end].copy_(_temp_cpu.T)
                    elif not is_cuda:
                        copy_to_host(br.length, bb.length, ccb, 0, 0, A,
                                     br.start, bb.start, s1)
            s1.synchronize()
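
For orientation, a single-device dense reference (illustrative only, not part of the library) for what this blocked worker computes when independent_output=False: the lower triangle of A is overwritten with the lower triangle of tril(A).T @ tril(A), while the strict upper triangle is left untouched. Roughly speaking, with independent_output=True the transposed result is written to the upper triangle instead.

import torch

def lauum_lower_reference(A: torch.Tensor) -> torch.Tensor:
    # Dense equivalent of the out-of-core routine above (independent_output=False):
    # compute L.T @ L with L = tril(A) and write its lower triangle back into A.
    L = torch.tril(A)
    prod = L.T @ L
    mask = torch.tril(torch.ones_like(A, dtype=torch.bool))
    out = A.clone()
    out[mask] = prod[mask]
    return out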
Example #16
0
def par_lauum_c_lower(A: torch.Tensor, block_allocs: List[BlockAlloc],
                      my_rows: List[int], barrier: threading.Barrier,
                      device_id: int, cublas_handle, independent_output: bool):
    N = A.shape[0]
    dts = sizeof_dtype(A.dtype)
    is_cuda = A.device.type == "cuda"

    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    s1 = torch.cuda.Stream(device=tc_device)
    s3 = torch.cuda.Stream(device=tc_device)
    s1_cuda = s1._as_parameter_

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1), cublas_stream(
            cublas_handle, s1_cuda):
        # Preallocate 2 block-columns. The single block is a CPU buffer
        whole_col_b = create_fortran((A.shape[0] * max_block_size, ), A.dtype,
                                     tc_device)
        whole_col_r = create_fortran((A.shape[0] * max_block_size, ), A.dtype,
                                     tc_device)
        syrk_out = create_fortran((max_block_size, max_block_size), A.dtype,
                                  tc_device)
        lauum_in = create_fortran((max_block_size, max_block_size), A.dtype,
                                  tc_device)
        temp_bb = create_fortran((max_block_size, max_block_size),
                                 A.dtype,
                                 'cpu',
                                 pin_memory=True).T

        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                cuda_memcpy2d_async(dst=whole_col_b.data_ptr(),
                                    dpitch=max_block_size * dts,
                                    src=A[b_start, bb.start].data_ptr(),
                                    spitch=A.shape[1] * dts,
                                    width=bb.length * dts,
                                    height=N - b_start,
                                    stream=s1_cuda)
            except ValueError:  # all of `my_rows` are smaller than `b`.
                pass
            if not independent_output:
                barrier.wait()

            for r in my_rows:
                if r < b:
                    continue
                if r == b:
                    is_last_row = b_start + bb.length == N
                    # Sync the load of whole_col_b
                    s1.synchronize()
                    # SYRK on g_b[bb.length:, :] with output replacing g_b[:bb.length, :]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if not is_last_row:
                        syrk_fn(cublas_handle,
                                uplo='U',
                                trans='N',
                                n=bb.length,
                                k=N - b_start - bb.length,
                                alpha=1.0,
                                A=whole_col_b[bb.length *
                                              max_block_size:].data_ptr(),
                                lda=max_block_size,
                                beta=0.0,
                                C=syrk_out.data_ptr(),
                                ldc=max_block_size)

                    with torch.cuda.stream(s3):
                        lauum_out = whole_col_b[:bb.length *
                                                max_block_size].view(
                                                    bb.length, max_block_size
                                                )[:, :bb.length]
                        # With the copy we go from C-contig to F-contig into lauum_in. This also transposes lauum_out so we get a correct order.
                        cur_lauum_in = lauum_in[:bb.length, :bb.length]
                        cur_lauum_in.copy_(lauum_out)
                        # Since lauum_out is supposed to also be F-contig, we must do another copy from lauum_in to lauum_out.
                        if independent_output:
                            lauum_out.copy_(cur_lauum_in)
                        else:
                            lauum_out.copy_(cur_lauum_in.T)
                        cuda_lauum_lower(n=bb.length,
                                         A=cur_lauum_in,
                                         lda=max_block_size,
                                         B=lauum_out,
                                         ldb=max_block_size)

                    s3.synchronize()
                    if not is_last_row:
                        s1.synchronize()
                        lauum_out.add_(syrk_out[:bb.length, :bb.length])

                    # copy back whole_col_b into Abb
                    # Now lauum_out is F-contig, while Abb is C-contig
                    Abb = A[bb.start:bb.end, bb.start:bb.end]
                    if independent_output:
                        Abb.copy_(lauum_out)
                    else:
                        if not is_cuda:
                            # It is not possible to directly copy lauum_out.T to Abb due to mismatch in strides.
                            # Use CPU buffer for copy, and then copy transposed into Abb
                            temp_bb[:bb.length, :bb.length].copy_(lauum_out)
                            Abb.copy_(temp_bb[:bb.length, :bb.length].T)
                        else:
                            Abb.copy_(lauum_out.T)
                else:  # r > b
                    br = block_allocs[r]

                    # Load column r. Since r > b this column will be shorter than column b
                    cuda_memcpy2d_async(dst=whole_col_r.data_ptr(),
                                        dpitch=max_block_size * dts,
                                        src=A[br.start, br.start].data_ptr(),
                                        spitch=A.shape[1] * dts,
                                        width=br.length * dts,
                                        height=N - br.start,
                                        stream=s1_cuda)
                    # Restrict column b to only the last 'r' rows
                    ccb = whole_col_b[(br.start - b_start) * max_block_size:]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    #         and cur_g_b[0:br.length, :]
                    #         output is a r*b matrix and should be stored in a separate g_out block
                    # Could store output in the first rows of g_b
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(handle=cublas_handle,
                            side='R',
                            uplo='U',
                            trans='T',
                            diag='N',
                            m=bb.length,
                            n=br.length,
                            alpha=1.0,
                            A=whole_col_r.data_ptr(),
                            lda=max_block_size,
                            B=ccb.data_ptr(),
                            ldb=max_block_size,
                            C=ccb.data_ptr(),
                            ldc=max_block_size)

                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    #         output  is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle,
                                transa='N',
                                transb='T',
                                m=bb.length,
                                n=br.length,
                                k=N - br.start - br.length,
                                alpha=1.0,
                                A=ccb[br.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                B=whole_col_r[br.length *
                                              max_block_size:].data_ptr(),
                                ldb=max_block_size,
                                beta=1.0,
                                C=ccb.data_ptr(),
                                ldc=max_block_size)

                    # Copy back to A[r, b]
                    if independent_output:
                        # Copy must be transposed, copy to temp_bb first.
                        cublasGetMatrixAsync(rows=bb.length,
                                             cols=br.length,
                                             elem_size=dts,
                                             A=ccb.data_ptr(),
                                             lda=max_block_size,
                                             B=temp_bb.data_ptr(),
                                             ldb=max_block_size,
                                             stream=s1_cuda)
                        s1.synchronize()
                        A[bb.start:bb.end, br.start:br.end].copy_(
                            temp_bb[:br.length, :bb.length].T)
                    else:
                        cublasGetMatrixAsync(rows=bb.length,
                                             cols=br.length,
                                             elem_size=dts,
                                             A=ccb.data_ptr(),
                                             lda=max_block_size,
                                             B=A[br.start,
                                                 bb.start].data_ptr(),
                                             ldb=A.shape[0],
                                             stream=s1_cuda)
            s1.synchronize()
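
The two workers differ mainly in the memory layout they assume for A (Fortran-ordered blocks in par_lauum_f_lower, C-ordered blocks handled via pitched copies here). A caller would typically pick the variant matching the storage order of A; the dispatcher below is a hypothetical sketch, not the library's actual wrapper:

import torch

def parallel_lauum_dispatch(A: torch.Tensor, **worker_kwargs):
    # worker_kwargs is assumed to carry block_allocs, my_rows, barrier,
    # device_id, cublas_handle and independent_output, as in the workers above.
    if A.T.is_contiguous():      # column-major (Fortran) layout
        return par_lauum_f_lower(A, **worker_kwargs)
    if A.is_contiguous():        # row-major (C) layout
        return par_lauum_c_lower(A, **worker_kwargs)
    raise ValueError("A must be contiguous in either C or Fortran order")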