def fmm_cuda_sparse(X1: SparseTensor,
                    X2: SparseTensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor] = None,
                    opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((out, 'out'))
    N = X1.size(0)
    M = X2.size(0)
    if out is None:
        out = create_fortran((N, M), X1.dtype, 'cpu', pin_memory=True)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # If float32 we need to upcast to float64 to avoid numerical precision errors
    # in the kernel
    gpu_dtype = X1.dtype
    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        gpu_dtype = torch.float64

    # Create the arguments passed to each subprocess
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        args.append((ArgsFmm(X1=X1.narrow_rows(block_sizes[i], bwidth),
                             X2=X2,
                             out=out.narrow(0, block_sizes[i], bwidth),
                             kernel=kernel,
                             gpu_dtype=gpu_dtype,
                             max_mem=g.usable_ram),
                     g.Id))
    _start_wait_processes(_sparse_fmm, args)
    torch.cuda.empty_cache()
    return out
def fmmv_cuda_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: torch.Tensor,
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (out, 'out'))
    N = X1.size(0)

    # Create output matrix
    if out is None:
        out = create_fortran((N, v.size(1)), v.dtype, 'cpu', pin_memory=True)
    out.fill_(0.0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # Create the arguments passed to each subprocess
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        args.append((ArgsFmmv(X1=X1.narrow_rows(block_sizes[i], bwidth),
                              X2=X2,
                              v=v,
                              out=out.narrow(0, block_sizes[i], bwidth),
                              kernel=kernel,
                              max_mem=g.usable_ram),
                     g.Id))
    _start_wait_processes(sparse_fmmv, args)
    return out
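# --- Usage sketch (illustrative only) ---
# A minimal example of driving the two sparse entry points above. The import paths and the
# GaussianKernel / FalkonOptions names are assumptions about the surrounding package layout,
# not verified here.
def _example_sparse_fmm_usage():
    import scipy.sparse
    import torch
    from falkon.sparse.sparse_tensor import SparseTensor  # assumed location
    from falkon.kernels import GaussianKernel             # assumed kernel class
    from falkon.options import FalkonOptions              # assumed options class

    X1 = SparseTensor.from_scipy(scipy.sparse.random(1000, 50, density=0.05, format="csr"))
    X2 = SparseTensor.from_scipy(scipy.sparse.random(200, 50, density=0.05, format="csr"))
    v = torch.randn(200, 1, dtype=torch.float64)

    kernel = GaussianKernel(sigma=3.0)
    opt = FalkonOptions()

    K = fmm_cuda_sparse(X1, X2, kernel, opt=opt)       # (1000, 200) kernel matrix, on CPU
    Kv = fmmv_cuda_sparse(X1, X2, v, kernel, opt=opt)  # (1000, 1) kernel-vector product
    return K, Kv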
def init(self, X: Union[torch.Tensor, SparseTensor]):
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner can be used.

    Parameters
    ----------
    X : MxD tensor
        The matrix of Nystroem centers
    """
    dtype = X.dtype
    eps = self.params.pc_epsilon(X.dtype)
    M = X.size(0)

    with TicToc("Kernel", debug=self.params.debug):
        if isinstance(X, torch.Tensor):
            C = create_same_stride((M, M), X, dtype=dtype, device='cpu',
                                   pin_memory=self._use_cuda)
        else:
            # If sparse tensor we need fortran for kernel calculation
            C = create_fortran((M, M), dtype=dtype, device='cpu',
                               pin_memory=self._use_cuda)
        self.kernel(X, X, out=C, opt=self.params)
        self.fC = C.numpy()
        if not is_f_contig(C):
            self.fC = self.fC.T

    with TicToc("Cholesky 1", debug=self.params.debug):
        # Compute T: lower(fC) = T.T
        inplace_add_diag(self.fC, eps * M)
        self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                use_cuda=self._use_cuda, opt=self.params)
        # Save the diagonal which will be overwritten when computing A
        self.dT = C.diag()

    with TicToc("Copy triangular", debug=self.params.debug):
        # Copy lower(fC) to upper(fC): upper(fC) = T.
        copy_triang(self.fC, upper=False)

    if self._use_cuda:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=True, use_cuda=self._use_cuda,
                                    opt=self.params)
    else:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=False, use_cuda=self._use_cuda,
                                    opt=self.params)

    with TicToc("Cholesky 2", debug=self.params.debug):
        # lower(fC) = 1/M * T@T.T
        self.fC = mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / M)
        # lower(fC) = 1/M * T@T.T + lambda * I
        inplace_add_diag(self.fC, self._lambda)
        # Cholesky on lower(fC) : lower(fC) = A.T
        self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                use_cuda=self._use_cuda, opt=self.params)
        self.dA = C.diag()
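# --- Dense sketch of the preconditioner recipe above (illustrative only) ---
# The helper below is made up: it restates the comments of init() with dense numpy operations
# and ignores the in-place triangular-storage tricks used by the real code.
def _naive_falkon_preconditioner(K, lam, eps):
    import numpy as np
    from scipy.linalg import cholesky
    # K: (M, M) Nystroem kernel matrix.
    # T such that K + eps*M*I = T.T @ T, then A such that 1/M * T @ T.T + lam*I = A.T @ A.
    M = K.shape[0]
    T = cholesky(K + eps * M * np.eye(M), lower=False)
    A = cholesky(T @ T.T / M + lam * np.eye(M), lower=False)
    return T, A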
def test_cuda_matmul(self, mat1, mat2, expected):
    dev = torch.device("cuda:0")
    out = create_fortran(expected.shape, expected.dtype, dev)
    mat1_csr = SparseTensor.from_scipy(scipy.sparse.csr_matrix(mat1)).to(device=dev)
    mat2_csr = SparseTensor.from_scipy(scipy.sparse.csr_matrix(mat2)).to(device=dev)
    sparse_matmul(mat1_csr, mat2_csr, out)

    torch.testing.assert_allclose(out.cpu(), expected)
def cuda_trsm(A: torch.Tensor, v: torch.Tensor, alpha: float, lower: int, transpose: int,
              stream: Optional[torch.cuda.Stream] = None) -> torch.Tensor:
    if not is_f_contig(A, strict=False):
        raise ValueError("A must be f-contiguous for CUDA TRSM to work.")
    if not check_same_device(A, v):
        raise ValueError("A and v must be on the same CUDA device.")
    if not A.is_cuda:
        raise ValueError("A and v must be CUDA tensors!")

    device = A.device
    s = stream
    if stream is None:
        s = torch.cuda.current_stream(device=device)
    cublas_hdl = cublas_handle(device.index)
    trsm_fn = choose_fn(A.dtype, cublasDtrsm, cublasStrsm, "TRSM")

    # noinspection PyProtectedMember
    with torch.cuda.device(device), torch.cuda.stream(s), cublas_stream(cublas_hdl, s._as_parameter_):
        # Deal with copying v, which may not be F-contiguous.
        vF = create_fortran(v.size(), v.dtype, device)
        if is_f_contig(v, strict=False):
            # We can just make a copy of v
            vF.copy_(v)
            s.synchronize()  # sync is necessary here for correctness. Not sure why! TODO: Is it still needed?
        else:
            vF = cuda_transpose(input=v, output=vF.T).T

        uplo = 'L' if lower else 'U'
        trans = 'T' if transpose else 'N'
        trsm_fn(cublas_hdl, side='L', uplo=uplo, trans=trans, diag='N',
                m=vF.shape[0], n=vF.shape[1], alpha=alpha,
                A=A.data_ptr(), lda=A.stride(1),
                B=vF.data_ptr(), ldb=vF.stride(1))
        if is_f_contig(v, strict=False):
            vout = vF
        else:
            vout = create_C(v.size(), v.dtype, device)
            vout = cuda_transpose(input=vF, output=vout.T).T
    return vout
def cuda_trsm(A: torch.Tensor, v: torch.Tensor, alpha: float, lower: int, transpose: int) -> torch.Tensor:
    if not is_f_contig(A, strict=False):
        raise ValueError("A must be f-contiguous for CUDA TRSM to work.")
    if not check_same_device(A, v):
        raise ValueError("A and v must be on the same CUDA device.")
    if not A.is_cuda:
        raise ValueError("A and v must be CUDA tensors!")

    s = torch.cuda.Stream(device=A.device)
    cublas_hdl = cublas_handle(A.device.index)
    trsm_fn = choose_fn(A.dtype, cublasDtrsm, cublasStrsm, "TRSM")

    with torch.cuda.device(A.device), torch.cuda.stream(s), cublas_stream(cublas_hdl, s._as_parameter_):
        # Deal with copying v, which may not be F-contiguous.
        vF = create_fortran(v.size(), v.dtype, v.device)
        if is_f_contig(v, strict=False):
            # We can just make a copy of v
            vF.copy_(v)
        else:
            vF = cuda_transpose(input=v, output=vF.T).T

        uplo = 'L' if lower else 'U'
        trans = 'T' if transpose else 'N'
        trsm_fn(cublas_hdl, side='L', uplo=uplo, trans=trans, diag='N',
                m=vF.shape[0], n=vF.shape[1], alpha=alpha,
                A=A.data_ptr(), lda=A.stride(1),
                B=vF.data_ptr(), ldb=vF.stride(1))
        if not is_f_contig(v, strict=False):
            vout = create_C(v.size(), v.dtype, v.device)
            vout = cuda_transpose(input=vF, output=vout.T).T
        else:
            vout = vF
    s.synchronize()
    return vout
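# --- Correctness sketch for cuda_trsm (illustrative only) ---
# cuda_trsm solves the triangular system op(A) @ X = alpha * v (side-left). The check below
# assumes a CUDA device is available and a PyTorch version providing
# torch.linalg.solve_triangular; it is not part of the library's test-suite.
def _check_cuda_trsm():
    import torch
    A = torch.randn(50, 50, dtype=torch.float64).tril_()
    A += 50 * torch.eye(50, dtype=torch.float64)      # keep the system well conditioned
    v = torch.randn(50, 4, dtype=torch.float64)

    A_gpu = A.cuda().T.contiguous().T                 # F-contiguous copy on the GPU
    v_gpu = v.cuda()

    out = cuda_trsm(A_gpu, v_gpu, alpha=1.0, lower=1, transpose=0)
    ref = torch.linalg.solve_triangular(A, v, upper=False)
    assert torch.allclose(out.cpu(), ref)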
def par_lauum_f_lower(A: torch.Tensor,
                      block_allocs: List[BlockAlloc],
                      my_rows: List[int],
                      barrier: threading.Barrier,
                      device_id: int,
                      cublas_handle,
                      independent_output: bool):
    N = A.shape[0]

    lauum_fn = choose_fn(A.dtype, scll.dlauum, scll.slauum, "Lapack LAUUM")
    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    s1 = torch.cuda.Stream(device=tc_device)
    s2 = torch.cuda.Stream(device=tc_device)
    cublasSetStream(cublas_handle, s1._as_parameter_)

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1):
        # Preallocate 2 columns
        whole_col_b = create_fortran((A.shape[0], max_block_size), A.dtype, tc_device)
        whole_col_r = create_fortran((A.shape[0], max_block_size), A.dtype, tc_device)
        temp_bb = create_fortran((max_block_size, max_block_size), A.dtype, 'cpu', pin_memory=True)

        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                col_b = copy_to_device(N - b_start, bb.length, A, b_start, bb.start,
                                       whole_col_b, 0, 0, s1)
            except ValueError:
                pass  # No column here
            if not independent_output:
                barrier.wait()

            for r in my_rows:
                if r < b:
                    continue
                if r == b:
                    # SYRK on g_b[bb.length:, :] with output replacing g_b[:bb.length, :]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if b_start + bb.length < N:
                        syrk_fn(cublas_handle, uplo='L', trans='T',
                                n=bb.length, k=col_b.shape[0] - bb.length,
                                alpha=1.0, A=col_b[bb.length:, :].data_ptr(), lda=col_b.stride(1),
                                beta=0.0, C=col_b.data_ptr(), ldc=col_b.stride(1))

                    # CPU LAUUM on A[bb.start:bb.end, bb.start:bb.end].
                    # This is a bit messy, should do cleanup.
                    Abb = A[bb.start:bb.end, bb.start:bb.end]  # L\U
                    if independent_output:
                        Abb_np = Abb.numpy().copy(order="F")
                        # Make symmetric: L\L
                        copy_triang(Abb_np, upper=False)
                        uu, info = lauum_fn(Abb_np, lower=1, overwrite_c=True)  # LAU\L
                        Abb.copy_(torch.from_numpy(uu.T))  # L\LAU
                    else:
                        uu, info = lauum_fn(Abb.numpy(), lower=1, overwrite_c=False)  # LAU\L
                        if b_start + bb.length < N:
                            zero_triang(uu, upper=True)
                        Abb.copy_(torch.from_numpy(uu))

                    if b_start + bb.length < N:
                        # It is IMPORTANT to do the copy on s1 and then sync it.
                        tbb = copy_to_host(bb.length, bb.length, col_b, 0, 0, temp_bb, 0, 0, s1)
                        s1.synchronize()
                        if independent_output:
                            Abb.add_(torch.triu(tbb.T))
                        else:
                            Abb.add_(tbb)
                else:  # r > b
                    br = block_allocs[r]
                    # Load column r. Since r > b this column will be shorter than column b
                    col_r = copy_to_device(N - br.start, br.length, A, br.start, br.start,
                                           whole_col_r, 0, 0, s1)
                    # Restrict column b to only the last 'r' rows
                    ccb = col_b[br.start - b_start:, :]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    # and cur_g_b[0:br.length, :]
                    # output is a r*b matrix and should be stored in a separate g_out block
                    # Could store output in the first rows of g_b
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(handle=cublas_handle, side='L', uplo='L', trans='T', diag='N',
                            m=br.length, n=bb.length, alpha=1.0,
                            A=col_r.data_ptr(), lda=col_r.stride(1),
                            B=ccb.data_ptr(), ldb=ccb.stride(1),
                            C=ccb.data_ptr(), ldc=ccb.stride(1))

                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    # output is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle, transa='T', transb='N',
                                m=br.length, n=bb.length, k=col_r.shape[0] - br.length,
                                alpha=1.0, A=col_r[br.length:, :].data_ptr(), lda=col_r.stride(1),
                                B=ccb[br.length:, :].data_ptr(), ldb=ccb.stride(1),
                                beta=1.0, C=ccb.data_ptr(), ldc=ccb.stride(1))

                    # Copy back to A[r, b]
                    if independent_output:
                        _temp_cpu = copy_to_host(br.length, bb.length, ccb, 0, 0, temp_bb, 0, 0, s1)
                        s1.synchronize()
                        A[bb.start:bb.end, br.start:br.end].copy_(_temp_cpu.T)
                    else:
                        s1.synchronize()
                        copy_to_host(br.length, bb.length, ccb, 0, 0, A, br.start, bb.start, s2)
        s2.synchronize()
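# --- What LAUUM computes (illustrative only) ---
# The routine above offloads the trailing SYRK/TRMM/GEMM updates to the GPU and runs LAPACK's
# LAUUM on the diagonal block. For a lower-triangular L, LAUUM overwrites lower(L) with L.T @ L
# (for an upper-triangular U it computes U @ U.T). A small self-check of that contract:
def _check_lapack_lauum():
    import numpy as np
    import scipy.linalg.lapack as scll
    L = np.tril(np.random.randn(6, 6))
    out, info = scll.dlauum(np.asfortranarray(L), lower=1)
    assert info == 0
    assert np.allclose(np.tril(out), np.tril(L.T @ L))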
def par_lauum_c_lower(A: torch.Tensor,
                      block_allocs: List[BlockAlloc],
                      my_rows: List[int],
                      barrier: threading.Barrier,
                      device_id: int,
                      cublas_handle,
                      independent_output: bool):
    N = A.shape[0]
    dts = sizeof_dtype(A.dtype)

    lauum_fn = choose_fn(A.dtype, scll.dlauum, scll.slauum, "Lapack LAUUM")
    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    s1 = torch.cuda.Stream(device=tc_device)
    s2 = torch.cuda.Stream(device=tc_device)
    s1_cuda, s2_cuda = s1._as_parameter_, s2._as_parameter_
    cublasSetStream(cublas_handle, s1_cuda)

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1):
        # Preallocate 2 block-columns. The single block is a CPU buffer
        whole_col_b = create_fortran((A.shape[0] * max_block_size,), A.dtype, tc_device)
        whole_col_r = create_fortran((A.shape[0] * max_block_size,), A.dtype, tc_device)
        temp_bb = create_fortran((max_block_size, max_block_size), A.dtype, 'cpu', pin_memory=True).T

        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                cuda_memcpy2d_async(
                    dst=whole_col_b.data_ptr(), dpitch=max_block_size * dts,
                    src=A[b_start, bb.start].data_ptr(), spitch=A.shape[1] * dts,
                    width=bb.length * dts, height=N - b_start, stream=s1_cuda)
            except ValueError:  # all of `my_rows` are smaller than `b`.
                pass
            if not independent_output:
                barrier.wait()

            for r in my_rows:
                if r < b:
                    continue
                if r == b:
                    is_last_row = b_start + bb.length == N
                    # SYRK on g_b[bb.length:, :] with output replacing g_b[:bb.length, :]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if not is_last_row:
                        syrk_fn(cublas_handle, uplo='U', trans='N',
                                n=bb.length, k=N - b_start - bb.length, alpha=1.0,
                                A=whole_col_b[bb.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                beta=0.0, C=whole_col_b.data_ptr(), ldc=max_block_size)

                    # Run LAUUM on CPU on Abb.T (transpose because LAPACK works in F-order)
                    # Result will be on upper(uu). So if we copy back to lower(A), we must copy
                    # back uu.T -- otherwise we should copy back uu directly.
                    Abb = A[bb.start:bb.end, bb.start:bb.end]
                    if independent_output:
                        Abb_np = Abb.T.numpy().copy(order="F")  # U\L
                        copy_triang(Abb_np, upper=True)  # L\L
                        uu, info = lauum_fn(Abb_np, lower=1, overwrite_c=True)  # LAU\L
                        Abb.copy_(torch.from_numpy(uu.T))  # L\LAU
                    else:
                        uu, info = lauum_fn(Abb.T.numpy(), lower=0, overwrite_c=False)
                        # Zeroing must happen if the SYRK output is to be added: otherwise the
                        # non-processed part of Abb (i.e. upper(Abb) if not independent_output)
                        # will be multiplied by 2.
                        if not is_last_row:
                            zero_triang(uu, upper=False)
                        Abb.copy_(torch.from_numpy(uu.T))

                    if not is_last_row:
                        cuda_memcpy2d_async(
                            dst=temp_bb.data_ptr(), dpitch=max_block_size * dts,
                            src=whole_col_b.data_ptr(), spitch=max_block_size * dts,
                            width=bb.length * dts, height=bb.length, stream=s1_cuda)
                        s1.synchronize()  # TODO: Check if failure when this commented out.
                        if independent_output:
                            Abb.add_(torch.triu(temp_bb[:bb.length, :bb.length].T))
                        else:
                            Abb.add_(temp_bb[:bb.length, :bb.length])
                else:  # r > b
                    br = block_allocs[r]
                    # Load column r. Since r > b this column will be shorter than column b
                    cuda_memcpy2d_async(
                        dst=whole_col_r.data_ptr(), dpitch=max_block_size * dts,
                        src=A[br.start, br.start].data_ptr(), spitch=A.shape[1] * dts,
                        width=br.length * dts, height=N - br.start, stream=s1_cuda)
                    # s1.synchronize()

                    # Restrict column b to only the last 'r' rows
                    ccb = whole_col_b[(br.start - b_start) * max_block_size:]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    # and cur_g_b[0:br.length, :]
                    # output is a r*b matrix and should be stored in a separate g_out block
                    # Could store output in the first rows of g_b
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(handle=cublas_handle, side='R', uplo='U', trans='T', diag='N',
                            m=bb.length, n=br.length, alpha=1.0,
                            A=whole_col_r.data_ptr(), lda=max_block_size,
                            B=ccb.data_ptr(), ldb=max_block_size,
                            C=ccb.data_ptr(), ldc=max_block_size)

                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    # output is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle, transa='N', transb='T',
                                m=bb.length, n=br.length, k=N - br.start - br.length,
                                alpha=1.0, A=ccb[br.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                B=whole_col_r[br.length * max_block_size:].data_ptr(),
                                ldb=max_block_size,
                                beta=1.0, C=ccb.data_ptr(), ldc=max_block_size)

                    # Copy back to A[r, b]
                    if independent_output:
                        # Copy must be transposed, copy to temp_bb first.
                        cublasGetMatrixAsync(
                            rows=bb.length, cols=br.length, elem_size=dts,
                            A=ccb.data_ptr(), lda=max_block_size,
                            B=temp_bb.data_ptr(), ldb=max_block_size, stream=s1_cuda)
                        s1.synchronize()
                        A[bb.start:bb.end, br.start:br.end].copy_(temp_bb[:br.length, :bb.length].T)
                    else:
                        s1.synchronize()
                        cublasGetMatrixAsync(
                            rows=bb.length, cols=br.length, elem_size=dts,
                            A=ccb.data_ptr(), lda=max_block_size,
                            B=A[br.start, bb.start].data_ptr(), ldb=A.shape[0], stream=s2_cuda)
        s2.synchronize()
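# --- C-order vs F-order LAUUM (illustrative only) ---
# The C-contiguous variant above works on the transposed buffer: a lower-LAUUM of a C-ordered
# matrix is the same as an upper-LAUUM of the same memory interpreted in Fortran order, which
# is why it calls SYRK/TRMM/GEMM with uplo='U' and swapped operands. A small demonstration:
def _check_lauum_order_equivalence():
    import numpy as np
    import scipy.linalg.lapack as scll
    L = np.tril(np.random.randn(5, 5))
    lower_res, _ = scll.dlauum(np.asfortranarray(L), lower=1)    # L.T @ L in lower triangle
    upper_res, _ = scll.dlauum(np.asfortranarray(L.T), lower=0)  # (L.T) @ (L.T).T in upper triangle
    assert np.allclose(np.tril(lower_res), np.triu(upper_res).T)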
def sparse_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, out = a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk   : N + 2*D*N*density
    # X2_chunk   : D + 2*D*M*density (because is transposed)
    # sparse_out : N + 2*N*M*density (assume density = 1)
    # ker_gpu    : M*N
    # mmv_gpu    : N*T
    # v_gpu      : M*T
    # Other: GPU buffer
    n, m = select_dim_over_m(
        maxM=mtot, maxN=ntot, tot=avail_mem,
        coef_nm=3,
        coef_n=2 + 2 * dtot * X1.density + T,
        coef_m=2 * dtot * X2.density + T,
        rest=dtot,
    )

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        v_gpu = v.to(device=ddev)  # M x T
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)
        # ker_gpu should be fortran-ordered due to cusparse csr2dense function
        ker_gpu = create_fortran((n, m), dtype=dtype, device=ddev)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)
            cur_mmv_gpu = mmv_gpu[:ic]  # n x T
            cur_mmv_gpu.fill_(0.0)

            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            for j in range(0, mtot, m):
                jc = min(m, mtot - j)

                X2_chunk = X2.narrow_rows(j, jc)
                # Prepare sparse on CPU
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)

                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_chunk_d = SparseTensor.from_scipy(
                    X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                    .index_to_int() \
                    .to(device=ddev)

                cur_ker_gpu = ker_gpu[:ic, :jc]
                cur_ker_gpu.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                cur_ker_gpu = kernel._apply_sparse(X1_chunk_d, X2_chunk_d, cur_ker_gpu)
                cur_ker_gpu = kernel._finalize(cur_ker_gpu, ddd)

                # Multiply by the vector v
                cur_mmv_gpu.addmm_(cur_ker_gpu, v_gpu.narrow(0, j, jc))
                del ddd, X2_chunk, X2_chunk_d

            # send result to CPU
            copy_to_host_noorder(ic, T, cur_mmv_gpu, 0, 0, out, i, 0)
            del X1_chunk, X1_chunk_d
    return out
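# --- Tiling pattern used by sparse_fmmv (illustrative only) ---
# The double loop above accumulates out[i:i+n] += K(X1[i:i+n], X2[j:j+m]) @ v[j:j+m], tile by
# tile, so only one (n x m) kernel block lives on the GPU at a time. A dense torch restatement
# with a hypothetical kernel_fn:
def _blockwise_kernel_mmv(X1, X2, v, kernel_fn, n=1024, m=1024):
    import torch
    out = torch.zeros(X1.shape[0], v.shape[1], dtype=v.dtype)
    for i in range(0, X1.shape[0], n):
        for j in range(0, X2.shape[0], m):
            K_blk = kernel_fn(X1[i:i + n], X2[j:j + m])
            out[i:i + n] += K_blk @ v[j:j + m]
    return out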
def sparse_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, w, out = a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    N, D = X1.shape
    M = X2.size(0)
    if v is None:
        T = w.size(1)
    else:
        T = v.size(1)

    # Memory needs:
    # X1_chunk   : ntot + 2 * D * ntot * density
    # X2         : dtot + 2 * D * M * density (because is transposed)
    # sparse_out : ntot + 2 * ntot * M * density (assume here density = 1)
    # ker_gpu    : M * ntot
    # w_gpu      : ntot * T
    # v_gpu      : M * T
    # out_gpu    : M * T
    avail_mem = max_mem / sizeof_dtype(dtype)
    den = 2 * D * X1.density + 2 + 3 * M + T
    sub = D + 2 * D * M * X2.density + M * T
    if v is not None:
        sub += M * T
    n = (avail_mem - sub) / den
    n = min(int(n), N)
    if n < 1:
        raise MemoryError("Not enough memory to run sparse dfmmv")

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Initialize GPU data
        w_gpu = create_same_stride((n, T), out, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        ker_gpu = create_fortran((n, M), dtype, ddev)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T

        X2_d = SparseTensor.from_scipy(
            X2.transpose_csc().to_scipy().tocsr(copy=False)) \
            .index_to_int() \
            .to(device=ddev)

        for i in range(0, N, n):
            ic = min(n, N - i)
            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)

            ker_chunk = ker_gpu[:ic]
            ker_chunk.fill_(0.0)

            # TODO: This is wasteful (X2 will be prepared many times over)
            ddd = kernel._prepare_sparse(X1_chunk, X2)
            ker_chunk = kernel._apply_sparse(X1_chunk_d, X2_d, ker_chunk)
            ker_chunk = kernel._finalize(ker_chunk, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)

            if v is not None:
                c_g_w.addmm_(ker_chunk, v_gpu)
            out_gpu.addmm_(ker_chunk.T, c_g_w)
            del ddd, X1_chunk, X1_chunk_d

        if not out.is_cuda:
            copy_to_device_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
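# --- Chunk-size computation used above (illustrative only) ---
# The row-chunk size n is the solution of a linear memory budget, avail_mem ~ n * den + sub,
# with the per-row (den) and fixed (sub) element counts taken from the memory-needs comment.
# Standalone restatement for reference; the function name is made up.
def _solve_chunk_rows(avail_mem_elems, N, M, D, T, density_x1, density_x2, has_v):
    den = 2 * D * density_x1 + 2 + 3 * M + T
    sub = D + 2 * D * M * density_x2 + M * T
    if has_v:
        sub += M * T
    n = int((avail_mem_elems - sub) / den)
    if n < 1:
        raise MemoryError("Not enough memory to run sparse dfmmv")
    return min(n, N)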
def par_lauum_f_lower(A: torch.Tensor,
                      block_allocs: List[BlockAlloc],
                      my_rows: List[int],
                      barrier: threading.Barrier,
                      device_id: int,
                      cublas_handle,
                      independent_output: bool):
    N = A.shape[0]
    is_cuda = A.device.type == "cuda"

    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    s1 = torch.cuda.Stream(device=tc_device)
    s3 = torch.cuda.Stream(device=tc_device)

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1), cublas_stream(cublas_handle, s1._as_parameter_):
        # Pre allocate b-col, syrk-out, lauum-out
        mem_needed = N * max_block_size + 2 * (max_block_size ** 2)
        if not is_cuda:
            # Also pre alloc r-col
            mem_needed += N * max_block_size
        f_gpu = torch.empty(size=(mem_needed,), dtype=A.dtype, device=tc_device)
        f_offset = 0
        whole_col_b, f_offset = _extract_flat(f_gpu, (N, max_block_size), other=A, offset=f_offset)
        syrk_out, f_offset = _extract_flat(f_gpu, (max_block_size, max_block_size), other=A, offset=f_offset)
        lauum_out, f_offset = _extract_flat(f_gpu, (max_block_size, max_block_size), other=A, offset=f_offset)
        if not is_cuda:
            temp_bb = create_fortran((max_block_size, max_block_size), A.dtype, 'cpu', pin_memory=True)
            whole_col_r, f_offset = _extract_flat(f_gpu, (N, max_block_size), other=A, offset=f_offset)
        syrk_out.fill_(0.0)

        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                if is_cuda:
                    col_b = whole_col_b[b_start:, :bb.length]
                    col_b.copy_(A[b_start:N, bb.start:bb.end])
                else:
                    col_b: torch.Tensor = copy_to_device(
                        N - b_start, bb.length, A, b_start, bb.start, whole_col_b, 0, 0, s1)
            except ValueError:
                pass  # No column here
            if not independent_output:
                # wait for copy to device to succeed. After barrier other threads may modify
                # the part of col_b which we need!
                s1.synchronize()
                barrier.wait()

            for r in my_rows:
                if r == b:
                    # SYRK on col_b[bb.length:, :] with output into syrk_out[:bb.length, :bb.length]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if b_start + bb.length < N:
                        cur_syrk_out = syrk_out[:bb.length, :bb.length]
                        syrk_fn(cublas_handle, uplo='L', trans='T',
                                n=bb.length, k=col_b.shape[0] - bb.length,
                                alpha=1.0, A=col_b[bb.length:, :].data_ptr(), lda=col_b.stride(1),
                                beta=0.0, C=cur_syrk_out.data_ptr(), ldc=syrk_out.stride(1))

                    with torch.cuda.stream(s3):
                        if independent_output:
                            s1.synchronize()  # we need col_b to be loaded
                        cur_lauum_out = lauum_out[:bb.length, :bb.length]
                        # Note that col_b[:bb.length, :bb.length] == Abb
                        if independent_output:
                            # In the independent output case we need to preserve tril(Abb) instead!
                            cur_lauum_out.copy_(col_b[:bb.length, :bb.length].T)
                        else:
                            # In normal case we need triu(Abb) to be preserved in the upper triangle of lauum_out
                            cur_lauum_out.copy_(col_b[:bb.length, :bb.length])

                        # LAUUM on col_b[:bb.length, :bb.length], into lauum_out[:bb.length, :bb.length]
                        cuda_lauum(n=bb.length, A=col_b[:bb.length, :bb.length], lda=col_b.stride(1),
                                   B=cur_lauum_out, ldb=max_block_size, lower=True)
                    s1.wait_stream(s3)  # all subsequent work will need cur_lauum_out

                    # Add outputs of SYRK and LAUUM (only if SYRK was performed)
                    if b_start + bb.length < N:
                        cur_lauum_out.add_(cur_syrk_out)

                    # Copy lauum_out into the original matrix, while preserving the other side
                    # of the triangular matrix. This depends on the `independent_output` flag.
                    Abb = A[bb.start:bb.end, bb.start:bb.end]
                    if independent_output:
                        # cuda and non-cuda cases, since we have different orderings.
                        Abb.copy_(cur_lauum_out.T)
                    elif is_cuda:
                        Abb.copy_(cur_lauum_out)
                    else:
                        copy_to_host(bb.length, bb.length, cur_lauum_out, 0, 0, Abb, 0, 0, s=s1)
                elif r > b:
                    br = block_allocs[r]
                    # Load column r. Since r > b this column will be shorter than column b
                    if is_cuda:
                        # If col_r is already in GPU no copy needed.
                        col_r = A[br.start:N, br.start:br.end]
                    else:
                        col_r = copy_to_device(N - br.start, br.length, A, br.start, br.start,
                                               whole_col_r, 0, 0, s1)
                    # Restrict column b to only the last 'r' rows
                    ccb = col_b[br.start - b_start:, :]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    # and cur_g_b[0:br.length, :]
                    # output is a r*b matrix stored in the first rows of ccb
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(handle=cublas_handle, side='L', uplo='L', trans='T', diag='N',
                            m=br.length, n=bb.length, alpha=1.0,
                            A=col_r.data_ptr(), lda=col_r.stride(1),
                            B=ccb.data_ptr(), ldb=ccb.stride(1),
                            C=ccb.data_ptr(), ldc=ccb.stride(1))

                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    # output is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle, transa='T', transb='N',
                                m=br.length, n=bb.length, k=col_r.shape[0] - br.length,
                                alpha=1.0, A=col_r[br.length:, :].data_ptr(), lda=col_r.stride(1),
                                B=ccb[br.length:, :].data_ptr(), ldb=ccb.stride(1),
                                beta=1.0, C=ccb.data_ptr(), ldc=ccb.stride(1))

                    # Copy back to A[r, b]
                    if independent_output:
                        if is_cuda:
                            A[bb.start:bb.end, br.start:br.end].copy_(ccb[:br.length, :bb.length].T)
                        else:
                            _temp_cpu = copy_to_host(br.length, bb.length, ccb, 0, 0, temp_bb, 0, 0, s1)
                            s1.synchronize()  # must wait for data to be onto CPU.
                            A[bb.start:bb.end, br.start:br.end].copy_(_temp_cpu.T)
                    elif is_cuda:
                        A[br.start:br.end, bb.start:bb.end].copy_(ccb[:br.length, :bb.length])
                    else:
                        copy_to_host(br.length, bb.length, ccb, 0, 0, A, br.start, bb.start, s1)
        s1.synchronize()
def par_lauum_c_lower(A: torch.Tensor,
                      block_allocs: List[BlockAlloc],
                      my_rows: List[int],
                      barrier: threading.Barrier,
                      device_id: int,
                      cublas_handle,
                      independent_output: bool):
    N = A.shape[0]
    dts = sizeof_dtype(A.dtype)
    is_cuda = A.device.type == "cuda"

    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    s1 = torch.cuda.Stream(device=tc_device)
    s3 = torch.cuda.Stream(device=tc_device)
    s1_cuda = s1._as_parameter_

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1), cublas_stream(cublas_handle, s1_cuda):
        if not is_cuda:
            temp_bb = create_fortran((max_block_size, max_block_size), A.dtype, 'cpu',
                                     pin_memory=True).T
        # Pre allocate r-col, b-col, syrk-out, lauum-out
        mem_needed = 2 * N * max_block_size + 2 * (max_block_size ** 2)
        f_gpu = torch.empty(size=(mem_needed,), dtype=A.dtype, device=tc_device)
        whole_col_b = f_gpu[:N * max_block_size]
        whole_col_r = f_gpu[N * max_block_size: 2 * N * max_block_size]
        syrk_out = extract_fortran(f_gpu, size=(max_block_size, max_block_size),
                                   offset=2 * N * max_block_size)
        lauum_out = extract_fortran(f_gpu, size=(max_block_size, max_block_size),
                                    offset=2 * N * max_block_size + max_block_size ** 2)
        syrk_out.fill_(0.0)

        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                cuda_memcpy2d_async(
                    dst=whole_col_b.data_ptr(), dpitch=max_block_size * dts,
                    src=A[b_start, bb.start].data_ptr(), spitch=A.shape[1] * dts,
                    width=bb.length * dts, height=N - b_start, stream=s1_cuda)
            except ValueError:  # all of `my_rows` are smaller than `b`.
                pass
            if not independent_output:
                # wait for copy to device to succeed. After barrier other threads may modify
                # the part of col_b which we need!
                s1.synchronize()
                barrier.wait()

            for r in my_rows:
                if r < b:
                    continue
                if r == b:
                    is_last_row = b_start + bb.length == N
                    # SYRK on g_b[bb.length:, :] with output replacing g_b[:bb.length, :]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if not is_last_row:
                        syrk_fn(cublas_handle, uplo='U', trans='N',
                                n=bb.length, k=N - b_start - bb.length, alpha=1.0,
                                A=whole_col_b[bb.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                beta=0.0, C=syrk_out.data_ptr(), ldc=max_block_size)

                    with torch.cuda.stream(s3):
                        if independent_output:
                            s1.synchronize()  # we need col_b to be loaded
                        # Lower LAUUM for C-contig is equal to upper LAUUM for F-contig
                        c_lauum_in = whole_col_b[:bb.length * max_block_size].view(
                            bb.length, max_block_size)[:, :bb.length]
                        c_lauum_out = lauum_out[:bb.length, :bb.length]
                        if independent_output:
                            c_lauum_out.copy_(c_lauum_in)
                        else:
                            c_lauum_out.copy_(c_lauum_in.T)
                        cuda_lauum(n=bb.length, A=c_lauum_in, lda=max_block_size,
                                   B=c_lauum_out, ldb=max_block_size, lower=False)
                    s1.wait_stream(s3)  # all subsequent work on s1 will need cur_lauum_out

                    if not is_last_row:
                        c_lauum_out.add_(syrk_out[:bb.length, :bb.length])

                    # copy back whole_col_b into Abb
                    # Now lauum_out is F-contig, while Abb is C-contig
                    Abb = A[bb.start:bb.end, bb.start:bb.end]
                    if independent_output:
                        Abb.copy_(c_lauum_out)
                    else:
                        Abb.copy_(c_lauum_out.T)
                else:  # r > b
                    br = block_allocs[r]
                    # Load column r. Since r > b this column will be shorter than column b
                    cuda_memcpy2d_async(
                        dst=whole_col_r.data_ptr(), dpitch=max_block_size * dts,
                        src=A[br.start, br.start].data_ptr(), spitch=A.shape[1] * dts,
                        width=br.length * dts, height=N - br.start, stream=s1_cuda)
                    # Restrict column b to only the last 'r' rows
                    ccb = whole_col_b[(br.start - b_start) * max_block_size:]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    # and cur_g_b[0:br.length, :]
                    # output is a r*b matrix and stored in first rows of ccb
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(handle=cublas_handle, side='R', uplo='U', trans='T', diag='N',
                            m=bb.length, n=br.length, alpha=1.0,
                            A=whole_col_r.data_ptr(), lda=max_block_size,
                            B=ccb.data_ptr(), ldb=max_block_size,
                            C=ccb.data_ptr(), ldc=max_block_size)

                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    # output is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle, transa='N', transb='T',
                                m=bb.length, n=br.length, k=N - br.start - br.length,
                                alpha=1.0, A=ccb[br.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                B=whole_col_r[br.length * max_block_size:].data_ptr(),
                                ldb=max_block_size,
                                beta=1.0, C=ccb.data_ptr(), ldc=max_block_size)

                    # Copy back to A[r, b]
                    if is_cuda:
                        ccb_square = ccb[:max_block_size * br.length].view(br.length, max_block_size)
                        if independent_output:
                            A[bb.start:bb.end, br.start:br.end].copy_(ccb_square[:br.length, :bb.length].T)
                        else:
                            A[br.start:br.end, bb.start:bb.end].copy_(ccb_square[:br.length, :bb.length])
                    elif independent_output:
                        # Copy must be transposed, copy to temp_bb first.
                        cublasGetMatrixAsync(
                            rows=bb.length, cols=br.length, elem_size=dts,
                            A=ccb.data_ptr(), lda=max_block_size,
                            B=temp_bb.data_ptr(), ldb=max_block_size, stream=s1_cuda)
                        s1.synchronize()
                        A[bb.start:bb.end, br.start:br.end].copy_(temp_bb[:br.length, :bb.length].T)
                    else:
                        cublasGetMatrixAsync(
                            rows=bb.length, cols=br.length, elem_size=dts,
                            A=ccb.data_ptr(), lda=max_block_size,
                            B=A[br.start, bb.start].data_ptr(), ldb=A.shape[0], stream=s1_cuda)
        s1.synchronize()
def init(self, X: Union[torch.Tensor, SparseTensor], Y: torch.Tensor,
         alpha: torch.Tensor, penalty: float, N: int) -> None:
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner becomes usable.

    Parameters
    ----------
    X : MxD tensor
        Matrix of Nystroem centers
    Y : Mx1 tensor
        Vector of targets corresponding to the Nystroem centers `X`
    alpha : Mx1 tensor
        Parameter vector (of the same dimension as `Y`) which gives the current solution
        to the optimization problem.
    penalty : float
        Regularization amount
    N : int
        Number of points in the full data-set.

    Notes
    -----
    If `debug=True` is present in the options, this method will print extra information
    pertaining to the timings of the various preconditioner operations. This can be useful
    to help understand how the preconditioner works.
    """
    if Y.shape[1] != 1:
        raise ValueError("Logistic preconditioner can only deal with 1D outputs.")

    dtype = X.dtype
    M = X.size(0)
    eps = self.params.pc_epsilon(dtype)

    if self.fC is None:
        # This is done only at the first iteration of the logistic-falkon algorithm
        # It sets the `T` variable from the paper (chol(kMM)) to the upper part of `self.fC`
        with TicToc("Kernel", debug=self.params.debug):
            if isinstance(X, torch.Tensor):
                C = create_same_stride((M, M), X, dtype=dtype, device='cpu',
                                       pin_memory=self._use_cuda)
            else:
                # If sparse tensor we need fortran for kernel calculation
                C = create_fortran((M, M), dtype=dtype, device='cpu',
                                   pin_memory=self._use_cuda)
            self.kernel(X, X, out=C, opt=self.params)
            self.fC = C.numpy()
            if not is_f_contig(C):
                self.fC = self.fC.T
        with TicToc("Add diag", debug=self.params.debug):
            # Compute T: lower(fC) = T.T
            inplace_add_diag(self.fC, eps * M)
        with TicToc("Cholesky 1", debug=self.params.debug):
            self.fC = potrf_wrapper(self.fC, clean=True, upper=False,
                                    use_cuda=self._use_cuda, opt=self.params)
            # Save the diagonal which will be overwritten when computing A
            self.dT = C.diag()
        with TicToc("Copy triangular", debug=self.params.debug):
            # Copy lower(fC) to upper(fC): upper(fC) = T.
            copy_triang(self.fC, upper=False)
    else:
        if not self._use_cuda:
            # Copy non-necessary for cuda since LAUUM will do the copying
            with TicToc("Copy triangular", debug=self.params.debug):
                # Copy upper(fC) to lower(fC): lower(fC) = T.T
                copy_triang(self.fC, upper=True)  # does not copy the diagonal
        # Setting diagonal necessary for trmm
        inplace_set_diag(self.fC, self.dT)

    # Compute W
    with TicToc("TRMM", debug=self.params.debug):
        # T is on upper(fC). Compute T.T @ alpha
        alpha = self._trmm(alpha.clone())
    with TicToc("W (ddf)", debug=self.params.debug):
        W = self.loss.ddf(Y, alpha)
    with TicToc("W-Multiply", debug=self.params.debug):
        W.sqrt_()
        self.fC = vec_mul_triang(self.fC, W.numpy().reshape(-1), side=0, upper=False)

    if self._use_cuda:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=True, use_cuda=self._use_cuda,
                                    opt=self.params)
    else:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=False, use_cuda=self._use_cuda,
                                    opt=self.params)

    # NOTE: Here the multiplier is 1/N instead of the more common 1/M!
    mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / N)

    with TicToc("Add diag", debug=self.params.debug):
        # lower(fC) = 1/N * T@T.T + lambda * I
        inplace_add_diag(self.fC, penalty)

    with TicToc("Cholesky 2", debug=self.params.debug):
        # Cholesky on lower(fC) : lower(fC) = A.T
        self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                use_cuda=self._use_cuda, opt=self.params)

    self.dA = torch.from_numpy(self.fC).diag()
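# --- Dense sketch of the logistic preconditioner recipe above (illustrative only) ---
# Made-up helper restating the comments of init() with dense numpy operations: T comes from the
# Cholesky of kMM, W is the pointwise second derivative of the loss at T.T @ alpha, and
# A satisfies 1/N * T @ diag(W) @ T.T + penalty*I = A.T @ A.
def _naive_logistic_preconditioner(K, Y, alpha, penalty, N, ddf, eps):
    import numpy as np
    from scipy.linalg import cholesky
    M = K.shape[0]
    T = cholesky(K + eps * M * np.eye(M), lower=False)   # K + eps*M*I = T.T @ T
    W = np.asarray(ddf(Y, T.T @ alpha)).reshape(-1)
    A = cholesky(T @ np.diag(W) @ T.T / N + penalty * np.eye(M), lower=False)
    return T, A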
def init(self, X: Union[torch.Tensor, SparseTensor],
         weight_vec: Optional[torch.Tensor] = None):
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner can be used.

    Parameters
    ----------
    X : torch.Tensor
        The (M x D) matrix of Nystroem centers
    weight_vec
        An optional vector of size (M x 1) which is used for reweighted least-squares.
        This vector should contain the weights corresponding to the Nystrom centers.
    """
    if X.is_cuda and not self._use_cuda:
        raise RuntimeError("use_cuda is set to False, but data is CUDA tensor. "
                           "Check your options.")
    if weight_vec is not None and not check_same_device(X, weight_vec):
        raise ValueError(f"Weights and data are not on the same device "
                         f"({weight_vec.device}, {X.device})")
    if weight_vec is not None and weight_vec.shape[0] != X.shape[0]:
        raise ValueError(f"Weights and Nystrom centers should have the same first dimension. "
                         f"Found instead {weight_vec.shape[0]}, {X.shape[0]}.")
    dtype = X.dtype
    dev = X.device
    eps = self.params.pc_epsilon(X.dtype)
    M = X.size(0)

    with TicToc("Kernel", debug=self.params.debug):
        if isinstance(X, torch.Tensor):
            C = create_same_stride((M, M), X, dtype=dtype, device=dev,
                                   pin_memory=self._use_cuda)
        else:
            # If sparse tensor we need fortran for kernel calculation
            C = create_fortran((M, M), dtype=dtype, device=dev,
                               pin_memory=self._use_cuda)
        self.kernel(X, X, out=C, opt=self.params)
        if not is_f_contig(C):
            C = C.T

    with TicToc("Cholesky 1", debug=self.params.debug):
        # Compute T: lower(fC) = T.T
        inplace_add_diag_th(C, eps * M)
        C = potrf_wrapper(C, clean=False, upper=False, use_cuda=self._use_cuda, opt=self.params)
        # Save the diagonal which will be overwritten when computing A
        self.dT = C.diag()

    with TicToc("Copy triangular", debug=self.params.debug):
        # Copy lower(fC) to upper(fC): upper(fC) = T.
        copy_triang(C, upper=False)

    # Weighted least-squares needs to weight the A matrix. We can weigh once before LAUUM,
    # but since CUDA-LAUUM touches both sides of C, weighting before LAUUM will also modify
    # the matrix T. Therefore for CUDA inputs we weigh twice after LAUUM!
    if weight_vec is not None and not self._use_cuda:
        with TicToc("Weighting(CPU)", debug=self.params.debug):
            weight_vec.sqrt_()
            vec_mul_triang(C, weight_vec, side=1, upper=False)

    if self._use_cuda:
        with TicToc("LAUUM(CUDA)", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T, store in lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=True, use_cuda=self._use_cuda, opt=self.params)
    else:
        with TicToc("LAUUM(CPU)", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC), store in lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=False, use_cuda=self._use_cuda, opt=self.params)

    if weight_vec is not None and self._use_cuda:
        with TicToc("Weighting(CUDA)", debug=self.params.debug):
            weight_vec.sqrt_()
            vec_mul_triang(C, weight_vec, side=0, upper=False)
            vec_mul_triang(C, weight_vec, side=1, upper=False)

    with TicToc("Cholesky 2", debug=self.params.debug):
        # lower(fC) = 1/M * T@T.T
        mul_triang(C, upper=False, preserve_diag=False, multiplier=1 / M)
        # lower(fC) = 1/M * T@T.T + lambda * I
        inplace_add_diag_th(C, self._lambda)
        # Cholesky on lower(fC) : lower(fC) = A.T
        C = potrf_wrapper(C, clean=False, upper=False, use_cuda=self._use_cuda, opt=self.params)
        self.dA = C.diag()

    self.fC = C
def par_lauum_f_lower(A: torch.Tensor,
                      block_allocs: List[BlockAlloc],
                      my_rows: List[int],
                      barrier: threading.Barrier,
                      device_id: int,
                      cublas_handle,
                      independent_output: bool):
    N = A.shape[0]
    is_cuda = A.device.type == "cuda"

    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    s1 = torch.cuda.Stream(device=tc_device)
    s3 = torch.cuda.Stream(device=tc_device)

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1), cublas_stream(cublas_handle, s1._as_parameter_):
        # Preallocate 2 columns
        if not is_cuda:
            whole_col_b = create_fortran((A.shape[0], max_block_size), A.dtype, tc_device)
            whole_col_r = create_fortran((A.shape[0], max_block_size), A.dtype, tc_device)
        syrk_out = create_fortran((max_block_size, max_block_size), A.dtype, tc_device)
        lauum_out = create_fortran((max_block_size, max_block_size), A.dtype, tc_device)
        temp_bb = create_fortran((max_block_size, max_block_size), A.dtype, 'cpu', pin_memory=True)

        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                if is_cuda:
                    col_b: torch.Tensor = A[b_start:N, bb.start:bb.end]
                else:
                    col_b: torch.Tensor = copy_to_device(
                        N - b_start, bb.length, A, b_start, bb.start, whole_col_b, 0, 0, s1)
            except ValueError:
                pass  # No column here
            if not independent_output:
                barrier.wait()

            for r in my_rows:
                if r == b:
                    # SYRK on col_b[bb.length:, :] with output into syrk_out[:bb.length, :bb.length]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if b_start + bb.length < N:
                        cur_syrk_out = syrk_out[:bb.length, :bb.length]
                        syrk_fn(cublas_handle, uplo='L', trans='T',
                                n=bb.length, k=col_b.shape[0] - bb.length,
                                alpha=1.0, A=col_b[bb.length:, :].data_ptr(), lda=col_b.stride(1),
                                beta=0.0, C=cur_syrk_out.data_ptr(), ldc=syrk_out.stride(1))

                    with torch.cuda.stream(s3):
                        cur_lauum_out = lauum_out[:bb.length, :bb.length]
                        # Note that col_b[:bb.length, :bb.length] == Abb
                        if independent_output:
                            # In the independent output case we need to preserve tril(Abb) instead!
                            cur_lauum_out.copy_(col_b[:bb.length, :bb.length].T)
                        else:
                            # In normal case we need triu(Abb) to be preserved in the upper triangle of lauum_out
                            cur_lauum_out.copy_(col_b[:bb.length, :bb.length])

                        # LAUUM on col_b[:bb.length, :bb.length], into lauum_out[:bb.length, :bb.length]
                        cuda_lauum_lower(n=bb.length, A=col_b[:bb.length, :bb.length],
                                         lda=col_b.stride(1), B=cur_lauum_out, ldb=max_block_size)
                    s3.synchronize()

                    # Add outputs of SYRK and LAUUM (only if SYRK was performed)
                    if b_start + bb.length < N:
                        s1.synchronize()
                        cur_lauum_out.add_(cur_syrk_out)

                    # Copy lauum_out into the original matrix, while preserving the other side
                    # of the triangular matrix. This depends on the `independent_output` flag.
                    Abb = A[bb.start:bb.end, bb.start:bb.end]
                    if independent_output:
                        Abb.copy_(cur_lauum_out.T)
                    else:
                        copy_to_host(bb.length, bb.length, cur_lauum_out, 0, 0, Abb, 0, 0, s=s1)
                elif r > b:
                    br = block_allocs[r]
                    # Load column r. Since r > b this column will be shorter than column b
                    if is_cuda:
                        col_r = A[br.start:N, br.start:br.end]
                    else:
                        col_r = copy_to_device(N - br.start, br.length, A, br.start, br.start,
                                               whole_col_r, 0, 0, s1)
                    # Restrict column b to only the last 'r' rows
                    ccb = col_b[br.start - b_start:, :]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    # and cur_g_b[0:br.length, :]
                    # output is a r*b matrix and should be stored in a separate g_out block
                    # Could store output in the first rows of g_b
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(handle=cublas_handle, side='L', uplo='L', trans='T', diag='N',
                            m=br.length, n=bb.length, alpha=1.0,
                            A=col_r.data_ptr(), lda=col_r.stride(1),
                            B=ccb.data_ptr(), ldb=ccb.stride(1),
                            C=ccb.data_ptr(), ldc=ccb.stride(1))

                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    # output is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle, transa='T', transb='N',
                                m=br.length, n=bb.length, k=col_r.shape[0] - br.length,
                                alpha=1.0, A=col_r[br.length:, :].data_ptr(), lda=col_r.stride(1),
                                B=ccb[br.length:, :].data_ptr(), ldb=ccb.stride(1),
                                beta=1.0, C=ccb.data_ptr(), ldc=ccb.stride(1))

                    # Copy back to A[r, b]
                    if independent_output:
                        if is_cuda:
                            A[bb.start:bb.end, br.start:br.end].copy_(ccb[:br.length, :bb.length].T)
                        else:
                            _temp_cpu = copy_to_host(br.length, bb.length, ccb, 0, 0, temp_bb, 0, 0, s1)
                            s1.synchronize()
                            A[bb.start:bb.end, br.start:br.end].copy_(_temp_cpu.T)
                    elif not is_cuda:
                        copy_to_host(br.length, bb.length, ccb, 0, 0, A, br.start, bb.start, s1)
        s1.synchronize()
def par_lauum_c_lower(A: torch.Tensor,
                      block_allocs: List[BlockAlloc],
                      my_rows: List[int],
                      barrier: threading.Barrier,
                      device_id: int,
                      cublas_handle,
                      independent_output: bool):
    N = A.shape[0]
    dts = sizeof_dtype(A.dtype)
    is_cuda = A.device.type == "cuda"

    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    s1 = torch.cuda.Stream(device=tc_device)
    s3 = torch.cuda.Stream(device=tc_device)
    s1_cuda = s1._as_parameter_

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1), cublas_stream(cublas_handle, s1_cuda):
        # Preallocate 2 block-columns. The single block is a CPU buffer
        whole_col_b = create_fortran((A.shape[0] * max_block_size,), A.dtype, tc_device)
        whole_col_r = create_fortran((A.shape[0] * max_block_size,), A.dtype, tc_device)
        syrk_out = create_fortran((max_block_size, max_block_size), A.dtype, tc_device)
        lauum_in = create_fortran((max_block_size, max_block_size), A.dtype, tc_device)
        temp_bb = create_fortran((max_block_size, max_block_size), A.dtype, 'cpu', pin_memory=True).T

        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                cuda_memcpy2d_async(
                    dst=whole_col_b.data_ptr(), dpitch=max_block_size * dts,
                    src=A[b_start, bb.start].data_ptr(), spitch=A.shape[1] * dts,
                    width=bb.length * dts, height=N - b_start, stream=s1_cuda)
            except ValueError:  # all of `my_rows` are smaller than `b`.
                pass
            if not independent_output:
                barrier.wait()

            for r in my_rows:
                if r < b:
                    continue
                if r == b:
                    is_last_row = b_start + bb.length == N
                    # Sync the load of whole_col_b
                    s1.synchronize()
                    # SYRK on g_b[bb.length:, :] with output replacing g_b[:bb.length, :]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if not is_last_row:
                        syrk_fn(cublas_handle, uplo='U', trans='N',
                                n=bb.length, k=N - b_start - bb.length, alpha=1.0,
                                A=whole_col_b[bb.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                beta=0.0, C=syrk_out.data_ptr(), ldc=max_block_size)

                    with torch.cuda.stream(s3):
                        lauum_out = whole_col_b[:bb.length * max_block_size].view(
                            bb.length, max_block_size)[:, :bb.length]
                        # With the copy we go from C-contig to F-contig into lauum_in.
                        # This also transposes lauum_out so we get a correct order.
                        cur_lauum_in = lauum_in[:bb.length, :bb.length]
                        cur_lauum_in.copy_(lauum_out)
                        # Since lauum_out is supposed to also be F-contig, we must do another
                        # copy from lauum_in to lauum_out.
                        if independent_output:
                            lauum_out.copy_(cur_lauum_in)
                        else:
                            lauum_out.copy_(cur_lauum_in.T)
                        cuda_lauum_lower(n=bb.length, A=cur_lauum_in, lda=max_block_size,
                                         B=lauum_out, ldb=max_block_size)
                    s3.synchronize()

                    if not is_last_row:
                        s1.synchronize()
                        lauum_out.add_(syrk_out[:bb.length, :bb.length])

                    # copy back whole_col_b into Abb
                    # Now lauum_out is F-contig, while Abb is C-contig
                    Abb = A[bb.start:bb.end, bb.start:bb.end]
                    if independent_output:
                        Abb.copy_(lauum_out)
                    else:
                        if not is_cuda:
                            # It is not possible to directly copy lauum_out.T to Abb due to
                            # mismatch in strides. Use CPU buffer for copy, and then copy
                            # transposed into Abb
                            temp_bb[:bb.length, :bb.length].copy_(lauum_out)
                            Abb.copy_(temp_bb[:bb.length, :bb.length].T)
                        else:
                            Abb.copy_(lauum_out.T)
                else:  # r > b
                    br = block_allocs[r]
                    # Load column r. Since r > b this column will be shorter than column b
                    cuda_memcpy2d_async(
                        dst=whole_col_r.data_ptr(), dpitch=max_block_size * dts,
                        src=A[br.start, br.start].data_ptr(), spitch=A.shape[1] * dts,
                        width=br.length * dts, height=N - br.start, stream=s1_cuda)
                    # Restrict column b to only the last 'r' rows
                    ccb = whole_col_b[(br.start - b_start) * max_block_size:]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    # and cur_g_b[0:br.length, :]
                    # output is a r*b matrix and should be stored in a separate g_out block
                    # Could store output in the first rows of g_b
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(handle=cublas_handle, side='R', uplo='U', trans='T', diag='N',
                            m=bb.length, n=br.length, alpha=1.0,
                            A=whole_col_r.data_ptr(), lda=max_block_size,
                            B=ccb.data_ptr(), ldb=max_block_size,
                            C=ccb.data_ptr(), ldc=max_block_size)

                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    # output is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle, transa='N', transb='T',
                                m=bb.length, n=br.length, k=N - br.start - br.length,
                                alpha=1.0, A=ccb[br.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                B=whole_col_r[br.length * max_block_size:].data_ptr(),
                                ldb=max_block_size,
                                beta=1.0, C=ccb.data_ptr(), ldc=max_block_size)

                    # Copy back to A[r, b]
                    if independent_output:
                        # Copy must be transposed, copy to temp_bb first.
                        cublasGetMatrixAsync(
                            rows=bb.length, cols=br.length, elem_size=dts,
                            A=ccb.data_ptr(), lda=max_block_size,
                            B=temp_bb.data_ptr(), ldb=max_block_size, stream=s1_cuda)
                        s1.synchronize()
                        A[bb.start:bb.end, br.start:br.end].copy_(temp_bb[:br.length, :bb.length].T)
                    else:
                        cublasGetMatrixAsync(
                            rows=bb.length, cols=br.length, elem_size=dts,
                            A=ccb.data_ptr(), lda=max_block_size,
                            B=A[br.start, bb.start].data_ptr(), ldb=A.shape[0], stream=s1_cuda)
        s1.synchronize()
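# --- Driver sketch for the parallel LAUUM workers (illustrative only) ---
# The workers above are designed to run one thread per GPU, sharing a threading.Barrier and a
# common block allocation. The round-robin row assignment and the handle list below are
# assumptions about how a caller might split the work, not the library's actual scheduler.
def _run_par_lauum(A, block_allocs, worker_fn, num_gpus, cublas_handles, independent_output=False):
    import threading
    barrier = threading.Barrier(num_gpus)
    rows_per_dev = [[i for i in range(len(block_allocs)) if i % num_gpus == d]
                    for d in range(num_gpus)]
    threads = [threading.Thread(target=worker_fn,
                                args=(A, block_allocs, rows_per_dev[d], barrier, d,
                                      cublas_handles[d], independent_output))
               for d in range(num_gpus)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()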