def cuda_trsm(A: torch.Tensor, v: torch.Tensor, alpha: float, lower: int, transpose: int,
              stream: Optional[torch.cuda.Stream] = None) -> torch.Tensor:
    if not is_f_contig(A, strict=False):
        raise ValueError("A must be f-contiguous for CUDA TRSM to work.")
    if not check_same_device(A, v):
        raise ValueError("A and v must be on the same CUDA device.")
    if not A.is_cuda:
        raise ValueError("A and v must be CUDA tensors!")

    device = A.device
    s = stream
    if stream is None:
        s = torch.cuda.current_stream(device=device)
    cublas_hdl = cublas_handle(device.index)
    trsm_fn = choose_fn(A.dtype, cublasDtrsm, cublasStrsm, "TRSM")

    # noinspection PyProtectedMember
    with torch.cuda.device(device), torch.cuda.stream(s), cublas_stream(cublas_hdl, s._as_parameter_):
        # Deal with copying v, which may not be F-contiguous.
        vF = create_fortran(v.size(), v.dtype, device)
        if is_f_contig(v, strict=False):
            # We can just make a copy of v
            vF.copy_(v)
            s.synchronize()  # sync is necessary here for correctness. Not sure why! TODO: Is it still needed?
        else:
            vF = cuda_transpose(input=v, output=vF.T).T

        uplo = 'L' if lower else 'U'
        trans = 'T' if transpose else 'N'
        trsm_fn(cublas_hdl, side='L', uplo=uplo, trans=trans, diag='N',
                m=vF.shape[0], n=vF.shape[1], alpha=alpha,
                A=A.data_ptr(), lda=A.stride(1),
                B=vF.data_ptr(), ldb=vF.stride(1))

        if is_f_contig(v, strict=False):
            vout = vF
        else:
            vout = create_C(v.size(), v.dtype, device)
            vout = cuda_transpose(input=vF, output=vout.T).T
    return vout
def _sparse_matmul_cuda(A: SparseTensor, B: SparseTensor, out: torch.Tensor):
    """Sparse*sparse matrix multiplication on CUDA, writing the result into a dense tensor.

    Typically D is very large and since `B` must be in CSR format, memory usage will be
    quite high.

    Parameters
    ----------
    A : SparseTensor
        N x D :class:`SparseTensor`. Must be in CSR format.
    B : SparseTensor
        D x M :class:`SparseTensor`. Must be in CSR format.
    out : torch.Tensor
        Dense N x M output tensor. Must be F-contiguous (column-contiguous)

    Notes
    ------
    This function runs in two steps: sparse*sparse->sparse multiplication and conversion
    of the output sparse matrix to a dense matrix.
    """
    from falkon.sparse.sparse_helpers import spspmm, csr2dense

    if not A.is_csr:
        raise ValueError("A must be CSR matrix")
    if not B.is_csr:
        raise ValueError("B must be CSR matrix")
    if not is_f_contig(out, strict=False):
        raise ValueError("out must be F-contiguous")

    # 1. MatMul
    out_indexptr, out_index, out_data = spspmm(
        A.indexptr, A.index, A.data, B.indexptr, B.index, B.data, A.shape[1])
    # 2. Convert to dense
    out = csr2dense(out_indexptr, out_index, out_data, out)
    return out
def cuda_trsm(A: torch.Tensor, v: torch.Tensor, alpha: float, lower: int, transpose: int) -> torch.Tensor:
    if not is_f_contig(A, strict=False):
        raise ValueError("A must be f-contiguous for CUDA TRSM to work.")
    if not check_same_device(A, v):
        raise ValueError("A and v must be on the same CUDA device.")
    if not A.is_cuda:
        raise ValueError("A and v must be CUDA tensors!")

    s = torch.cuda.Stream(device=A.device)
    cublas_hdl = cublas_handle(A.device.index)
    trsm_fn = choose_fn(A.dtype, cublasDtrsm, cublasStrsm, "TRSM")

    with torch.cuda.device(A.device), torch.cuda.stream(s), cublas_stream(cublas_hdl, s._as_parameter_):
        # Deal with copying v, which may not be F-contiguous.
        vF = create_fortran(v.size(), v.dtype, v.device)
        if is_f_contig(v, strict=False):
            # We can just make a copy of v
            vF.copy_(v)
        else:
            vF = cuda_transpose(input=v, output=vF.T).T

        uplo = 'L' if lower else 'U'
        trans = 'T' if transpose else 'N'
        trsm_fn(cublas_hdl, side='L', uplo=uplo, trans=trans, diag='N',
                m=vF.shape[0], n=vF.shape[1], alpha=alpha,
                A=A.data_ptr(), lda=A.stride(1),
                B=vF.data_ptr(), ldb=vF.stride(1))

        if not is_f_contig(v, strict=False):
            vout = create_C(v.size(), v.dtype, v.device)
            vout = cuda_transpose(input=vF, output=vout.T).T
        else:
            vout = vF
        s.synchronize()
    return vout
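# Usage sketch for cuda_trsm (illustrative only, not part of the library). It shows how the
# F-contiguity requirement can be satisfied before solving a lower-triangular system
# L @ x = b on the GPU; `K` and `b` are hypothetical CUDA tensors.
def _example_cuda_trsm_usage(K: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    L = torch.linalg.cholesky(K)   # lower Cholesky factor of an SPD matrix, C-contiguous
    L = L.T.contiguous().T         # re-lay it out in Fortran order, as cuda_trsm requires
    return cuda_trsm(L, b, alpha=1.0, lower=1, transpose=0)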
def init(self, X: Union[torch.Tensor, SparseTensor]):
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner can be used.

    Parameters
    ----------
    X : MxD tensor
        The matrix of Nystroem centers
    """
    dtype = X.dtype
    eps = self.params.pc_epsilon(X.dtype)
    M = X.size(0)

    with TicToc("Kernel", debug=self.params.debug):
        if isinstance(X, torch.Tensor):
            C = create_same_stride((M, M), X, dtype=dtype, device='cpu',
                                   pin_memory=self._use_cuda)
        else:  # If sparse tensor we need fortran for kernel calculation
            C = create_fortran((M, M), dtype=dtype, device='cpu', pin_memory=self._use_cuda)
        self.kernel(X, X, out=C, opt=self.params)
    self.fC = C.numpy()
    if not is_f_contig(C):
        self.fC = self.fC.T

    with TicToc("Cholesky 1", debug=self.params.debug):
        # Compute T: lower(fC) = T.T
        inplace_add_diag(self.fC, eps * M)
        self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                use_cuda=self._use_cuda, opt=self.params)
        # Save the diagonal which will be overwritten when computing A
        self.dT = C.diag()

    with TicToc("Copy triangular", debug=self.params.debug):
        # Copy lower(fC) to upper(fC): upper(fC) = T.
        copy_triang(self.fC, upper=False)

    if self._use_cuda:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=True, use_cuda=self._use_cuda, opt=self.params)
    else:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=False, use_cuda=self._use_cuda, opt=self.params)

    with TicToc("Cholesky 2", debug=self.params.debug):
        # lower(fC) = 1/M * T@T.T
        self.fC = mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / M)
        # lower(fC) = 1/M * T@T.T + lambda * I
        inplace_add_diag(self.fC, self._lambda)
        # Cholesky on lower(fC) : lower(fC) = A.T
        self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                use_cuda=self._use_cuda, opt=self.params)
        self.dA = C.diag()
def select(self, X: _tensor_type, Y: Union[torch.Tensor, None],
           M: int) -> Union[_tensor_type, Tuple[_tensor_type, torch.Tensor]]:
    """Select M rows from 2D array `X`, preserving the memory order of `X`."""
    N = X.size(0)
    if M > N:
        warnings.warn("Number of centers M greater than the "
                      "number of data-points. Setting M to %d" % (N))
        M = N
    idx = self.random_gen.choice(N, size=M, replace=False)

    if isinstance(X, SparseTensor):
        X = X.to_scipy()
        centers = X[idx, :].copy()
        Xc = SparseTensor.from_scipy(centers)
    else:
        Xnp = X.numpy()  # work on np array
        if is_f_contig(X):
            order = 'F'
        else:
            order = 'C'
        Xc_np = np.empty((M, Xnp.shape[1]), dtype=Xnp.dtype, order=order)
        Xc = torch.from_numpy(np.take(Xnp, idx, axis=0, out=Xc_np, mode='wrap'))

    if Y is not None:
        Ynp = Y.numpy()  # work on np array
        if is_f_contig(X):
            order = 'F'
        else:
            order = 'C'
        Yc_np = np.empty((M, Ynp.shape[1]), dtype=Ynp.dtype, order=order)
        Yc = torch.from_numpy(np.take(Ynp, idx, axis=0, out=Yc_np, mode='wrap'))
        return Xc, Yc
    return Xc
def to_c_contig(tensor: Optional[torch.Tensor], name: str = "", warn: bool = False) -> Optional[torch.Tensor]:
    warning_text = ("Input '%s' is F-contiguous; to ensure KeOps compatibility, C-contiguous inputs "
                    "are necessary. The data will be copied to change its order. To avoid this "
                    "unnecessary copy, either disable KeOps (passing `keops_active='no'`) or make "
                    "the input tensors C-contiguous.")
    if tensor is not None and is_f_contig(tensor):
        if warn:
            warnings.warn(warning_text % name)
        orig_device = tensor.device
        return torch.from_numpy(np.array(tensor.cpu().numpy(), order="C")).to(device=orig_device)
    return tensor
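# Usage sketch for to_c_contig (illustrative only, not part of the library). A 2D tensor built by
# transposing a contiguous tensor is F-contiguous, so to_c_contig copies it into C order; the
# tensor and the name "X1" below are hypothetical stand-ins for a KeOps kernel input.
def _example_to_c_contig_usage() -> torch.Tensor:
    f_contig_input = torch.randn(10, 50).T  # shape (50, 10), column-major layout
    return to_c_contig(f_contig_input, name="X1", warn=True)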
def _generic_fmm(proc_idx, queue, device_id):
    # Unpack the function arguments
    a: ArgsFmm = queue.get()
    X1: torch.Tensor = a.X1
    X2: torch.Tensor = a.X2
    cuda_inputs = X1.is_cuda
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem
    num_streams = a.num_streams

    # flags and local variables
    change_dtype = gpu_dtype != X1.dtype
    X1_equal_X2 = _gpu_tns_same_memory(X1, X2)
    use_gpu_bufs = change_dtype or not cuda_inputs
    stride = "F" if is_f_contig(out, strict=True) else "C"
    j_iter = 0
    dts = sizeof_dtype(gpu_dtype)
    tc_device = torch.device('cuda:%d' % (int(device_id)))
    avail_mem = max_mem / dts

    # Choose block sizes n, m such that we won't run out of GPU memory
    ntot, d = X1.shape
    mtot = X2.shape[0]
    extra_mem = kernel.extra_mem()
    if cuda_inputs and not change_dtype:
        # No allocation will be performed by us. Only in-kernel stuff.
        n, m = select_dim_over_nm(max_n=ntot, max_m=mtot, d=d,
                                  coef_nd=extra_mem.get('nd', 0),
                                  coef_md=extra_mem.get('md', 0),
                                  coef_nm=extra_mem.get('nm', 0),
                                  coef_n=extra_mem.get('n', 0),
                                  coef_m=extra_mem.get('m', 0),
                                  rest=extra_mem.get('d', 0),
                                  max_mem=avail_mem)
    else:
        n, m = select_dim_over_nm(max_n=ntot, max_m=mtot, d=d,
                                  coef_nd=num_streams * (extra_mem.get('nd', 0) + 1),
                                  coef_md=num_streams * (extra_mem.get('md', 0) + 1),
                                  coef_nm=num_streams * (extra_mem.get('nm', 0) + 1),
                                  coef_n=extra_mem.get('n', 0),
                                  coef_m=extra_mem.get('m', 0),
                                  rest=extra_mem.get('d', 0),
                                  max_mem=avail_mem)

    # Create streams
    streams = [tcd.Stream(device=tc_device) for _ in range(num_streams)]

    # Create buffers
    if use_gpu_bufs:
        gX1 = create_same_stride((n, d), X1, gpu_dtype, tc_device)
        gX2_list = [create_same_stride((m, d), X2, gpu_dtype, tc_device)
                    for _ in range(num_streams)]
        gout_list = [create_same_stride((n, m), out, gpu_dtype, tc_device)
                     for _ in range(num_streams)]
    if not cuda_inputs:
        cpu_buf_list = [create_same_stride((n, m), out, gpu_dtype, 'cpu', pin_memory=True)
                        for _ in range(num_streams)]

    # Define helpers for the copy-back operations (from cpu_buf to output)
    copy_ops = [None] * num_streams

    def wrap_copy_op(stream_idx):
        if copy_ops[stream_idx] is not None:
            copy_ops[stream_idx]()
            copy_ops[stream_idx] = None

    def do_copy_op(output, buf, i_, ic_, j_, jc_):
        # This function will also do the type conversion
        output[i_:i_ + ic_, j_:j_ + jc_].copy_(buf[:ic_, :jc_])

    # Kernel computation begin
    with tcd.device(tc_device):
        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            with tcd.stream(streams[j_iter % len(streams)]):
                X1_chunk = X1.narrow(0, i, ic)
                if use_gpu_bufs:
                    cur_gX1 = gX1.narrow(0, 0, ic)
                    cur_gX1.copy_(X1_chunk, non_blocking=True)
                else:
                    cur_gX1 = X1_chunk

            for j in range(0, mtot, m):
                jc = min(m, mtot - j)
                # Choose the buffers for this inner iteration
                stream_id = j_iter % len(streams)
                stream = streams[stream_id]
                if use_gpu_bufs:
                    gX2 = gX2_list[stream_id]
                    gout = gout_list[stream_id]
                if not cuda_inputs:
                    cpu_buf = cpu_buf_list[stream_id]

                # Sync for buffers we must use now (e.g. 2 previous iters)
                with tcd.stream(stream):  # Inner-loop
                    stream.synchronize()
                    wrap_copy_op(stream_id)

                    if X1_equal_X2 and j < i:  # Shortcut for symmetric kernels
                        jc = min(m, mtot - j)
                        out[i:i + ic, j:j + jc].copy_(out[j:j + jc, i:i + ic].T, non_blocking=True)
                        j_iter += 1
                        continue

                    # Copy (CPU->GPU)
                    X2_chunk = X2.narrow(0, j, jc)
                    if use_gpu_bufs:
                        cur_gX2 = gX2.narrow(0, 0, jc)
                        cur_gX2.copy_(X2_chunk, non_blocking=True)
                    else:
                        cur_gX2 = X2_chunk

                    if use_gpu_bufs:
                        cur_gout = gout[:ic, :jc]
                    else:
                        cur_gout = out[i:i + ic, j:j + jc]
                    cur_gout.fill_(0.0)

                    # Compute
                    ddd = kernel._prepare(cur_gX1, cur_gX2)
                    kernel._apply(cur_gX1, cur_gX2.T, cur_gout)
                    cur_gout = kernel._finalize(cur_gout, ddd)

                    # Copy Back (GPU->CPU)
                    if not cuda_inputs:
                        # copy_ does not care about the contiguity of copies, as long as it's consistent.
                        # However, in case of C-contiguous inputs it will create an intermediate array,
                        # which is undesired. We use cuda_memcpy2d_async which works well with
                        # C-contiguous arrays.
                        if stride == "F":
                            copy_to_host(ic, jc, cur_gout, 0, 0, cpu_buf, 0, 0, s=stream)
                        else:
                            cuda_memcpy2d_async(
                                dst=cpu_buf.data_ptr(), dpitch=cpu_buf.stride(0) * dts,
                                src=cur_gout.data_ptr(), spitch=cur_gout.stride(0) * dts,
                                width=jc * dts, height=ic, stream=stream._as_parameter_)
                        copy_ops[stream_id] = partial(do_copy_op, out, cpu_buf, i, ic, j, jc)
                    elif change_dtype:
                        out.narrow(0, i, ic).narrow(1, j, jc).copy_(cur_gout, non_blocking=True)
                j_iter += 1

        for i in range(num_streams):
            streams[i].synchronize()
            wrap_copy_op(i)

    return out
def _parallel_lauum_runner(A, write_opposite: bool, gpu_info):
    # Choose target:
    if is_f_contig(A):
        target = par_lauum_f_lower
    elif is_contig(A):
        target = par_lauum_c_lower
    else:
        raise NotImplementedError("Parallel LAUUM is only implemented for contiguous matrices")

    N = A.shape[0]
    dt = A.dtype
    dts = sizeof_dtype(dt)
    if A.is_cuda:
        sync_current_stream(A.device)
        gpu_info = [g for g in gpu_info if g.Id == A.device.index]
        avail_ram = gpu_info[0].actual_free_mem / dts
        if target.__name__ == "par_lauum_f_lower":
            # Each GPU should hold in memory two additional blocks (2*B^2 <= M)
            # and 1 full column.
            max_block_size = int(math.floor((-N + math.sqrt(N ** 2 + 8 * avail_ram)) / 4))
        else:
            # Same RAM requirements as the out-of-core version
            max_block_size = int(math.floor((-2 * N + math.sqrt(4 * N ** 2 + 8 * avail_ram)) / 4))
        if max_block_size < 1:
            raise RuntimeError("Cannot run parallel LAUUM with minimum "
                               "available memory of %.2fMB" % (avail_ram * dts / 2 ** 20))
        # All computations on the same device (where data is stored). No multi-GPU support!
        block_sizes = calc_block_sizes3(max_block_size, 1, N)
    else:
        avail_ram = min([g.actual_free_mem for g in gpu_info]) / dts
        # Each GPU should be able to hold in memory 2 block columns
        # plus two blocks (=> quadratic equation 2B^2 + 2BN - M <= 0).
        # An additional block is needed whenever write_opposite is True, due to
        # copying blocks between matrices with different strides!
        if write_opposite:
            max_block_size = int(math.floor((-2 * N + math.sqrt(4 * N ** 2 + 12 * avail_ram)) / 6))
        else:
            max_block_size = int(math.floor((-2 * N + math.sqrt(4 * N ** 2 + 8 * avail_ram)) / 4))
        if max_block_size < 1:
            raise RuntimeError("Cannot run parallel LAUUM with minimum "
                               "available memory of %.2fMB" % (avail_ram * dts / 2 ** 20))

        block_sizes = calc_block_sizes3(max_block_size, len(gpu_info), N)

    # Create BlockAlloc objects describing the subdivision of input
    block_allocations: List[BlockAlloc] = []
    cur_n = 0
    for bs in block_sizes:
        block_allocations.append(BlockAlloc(start=cur_n, end=cur_n + bs, length=bs))
        cur_n += bs

    num_gpus = len(gpu_info)
    if num_gpus < 1:
        raise ValueError("Parallel LAUUM can only run when a GPU is available.")
    barrier = threading.Barrier(num_gpus, timeout=1000)
    threads = []
    for _gpu_idx, g in enumerate(gpu_info):
        # Assign rows to GPUs round-robin. Use _gpu_idx instead of g.Id since the latter
        # may not contain all integers from 0.
        gid_allocs = [i for i in range(len(block_allocations)) if i % num_gpus == _gpu_idx]
        cublas_handle = initialization.cublas_handle(g.Id)
        if cublas_handle is None:
            raise RuntimeError("CUBLAS must be initialized "
                               "on device %d before running parallel LAUUM." % (g.Id))
        t = PropagatingThread(target=target, name="GPU-%d" % (g.Id),
                              args=(A, block_allocations, gid_allocs, barrier, g.Id,
                                    cublas_handle, write_opposite))
        threads.append(t)

    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return A
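# Derivation note for the block-size formulas in _parallel_lauum_runner (added for clarity,
# not part of the original code). With block size B and matrix side N, each bound comes from
# keeping the working set below `avail_ram` (counted in elements) and solving the resulting
# quadratic for its positive root:
#   * one full column plus two blocks:    2*B^2 +   B*N <= avail_ram  ->  B = (-N   + sqrt(N^2   + 8*avail_ram)) / 4
#   * two block columns plus two blocks:  2*B^2 + 2*B*N <= avail_ram  ->  B = (-2*N + sqrt(4*N^2 + 8*avail_ram)) / 4
#   * with `write_opposite`, one extra block: 3*B^2 + 2*B*N <= avail_ram  ->  B = (-2*N + sqrt(4*N^2 + 12*avail_ram)) / 6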
def init(self, X: Union[torch.Tensor, SparseTensor], Y: torch.Tensor,
         alpha: torch.Tensor, penalty: float, N: int) -> None:
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner becomes usable.

    Parameters
    ----------
    X : MxD tensor
        Matrix of Nystroem centers
    Y : Mx1 tensor
        Vector of targets corresponding to the Nystroem centers `X`
    alpha : Mx1 tensor
        Parameter vector (of the same dimension as `Y`) which gives the current solution
        to the optimization problem.
    penalty : float
        Regularization amount
    N : int
        Number of points in the full data-set.

    Notes
    -----
    If `debug=True` is present in the options, this method will print a lot of extra
    information pertaining timings of the various preconditioner operations. This can be
    useful to help understand how the preconditioner works.
    """
    if Y.shape[1] != 1:
        raise ValueError("Logistic preconditioner can only deal with 1D outputs.")

    dtype = X.dtype
    M = X.size(0)

    eps = self.params.pc_epsilon(dtype)

    if self.fC is None:
        # This is done only at the first iteration of the logistic-falkon algorithm.
        # It sets the `T` variable from the paper (chol(kMM)) to the upper part of `self.fC`
        with TicToc("Kernel", debug=self.params.debug):
            if isinstance(X, torch.Tensor):
                C = create_same_stride((M, M), X, dtype=dtype, device='cpu',
                                       pin_memory=self._use_cuda)
            else:  # If sparse tensor we need fortran for kernel calculation
                C = create_fortran((M, M), dtype=dtype, device='cpu', pin_memory=self._use_cuda)
            self.kernel(X, X, out=C, opt=self.params)
        self.fC = C.numpy()
        if not is_f_contig(C):
            self.fC = self.fC.T

        with TicToc("Add diag", debug=self.params.debug):
            # Compute T: lower(fC) = T.T
            inplace_add_diag(self.fC, eps * M)
        with TicToc("Cholesky 1", debug=self.params.debug):
            self.fC = potrf_wrapper(self.fC, clean=True, upper=False,
                                    use_cuda=self._use_cuda, opt=self.params)
            # Save the diagonal which will be overwritten when computing A
            self.dT = C.diag()
        with TicToc("Copy triangular", debug=self.params.debug):
            # Copy lower(fC) to upper(fC): upper(fC) = T.
            copy_triang(self.fC, upper=False)
    else:
        if not self._use_cuda:
            # Copy non-necessary for cuda since LAUUM will do the copying
            with TicToc("Copy triangular", debug=self.params.debug):
                # Copy upper(fC) to lower(fC): lower(fC) = T.T
                copy_triang(self.fC, upper=True)  # does not copy the diagonal
        # Setting diagonal necessary for trmm
        inplace_set_diag(self.fC, self.dT)

    # Compute W
    with TicToc("TRMM", debug=self.params.debug):
        # T is on upper(fC). Compute T.T @ alpha
        alpha = self._trmm(alpha.clone())
    with TicToc("W (ddf)", debug=self.params.debug):
        W = self.loss.ddf(Y, alpha)
    with TicToc("W-Multiply", debug=self.params.debug):
        W.sqrt_()
        self.fC = vec_mul_triang(self.fC, W.numpy().reshape(-1), side=0, upper=False)

    if self._use_cuda:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=True, use_cuda=self._use_cuda, opt=self.params)
    else:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=False, use_cuda=self._use_cuda, opt=self.params)

    # NOTE: Here the multiplier is 1/N instead of the more common 1/M!
    mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / N)

    with TicToc("Add diag", debug=self.params.debug):
        # lower(fC) = 1/N * T@T.T + lambda * I
        inplace_add_diag(self.fC, penalty)

    with TicToc("Cholesky 2", debug=self.params.debug):
        # Cholesky on lower(fC) : lower(fC) = A.T
        self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                use_cuda=self._use_cuda, opt=self.params)

    self.dA = torch.from_numpy(self.fC).diag()
def init(self, X: Union[torch.Tensor, SparseTensor], weight_vec: Optional[torch.Tensor] = None):
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner can be used.

    Parameters
    ----------
    X : torch.Tensor
        The (M x D) matrix of Nystroem centers
    weight_vec
        An optional vector of size (M x 1) which is used for reweighted least-squares.
        This vector should contain the weights corresponding to the Nystrom centers.
    """
    if X.is_cuda and not self._use_cuda:
        raise RuntimeError("use_cuda is set to False, but data is CUDA tensor. "
                           "Check your options.")
    if weight_vec is not None and not check_same_device(X, weight_vec):
        raise ValueError(f"Weights and data are not on the same device "
                         f"({weight_vec.device}, {X.device})")
    if weight_vec is not None and weight_vec.shape[0] != X.shape[0]:
        raise ValueError(f"Weights and Nystrom centers should have the same first dimension. "
                         f"Found instead {weight_vec.shape[0]}, {X.shape[0]}.")
    dtype = X.dtype
    dev = X.device

    eps = self.params.pc_epsilon(X.dtype)
    M = X.size(0)

    with TicToc("Kernel", debug=self.params.debug):
        if isinstance(X, torch.Tensor):
            C = create_same_stride((M, M), X, dtype=dtype, device=dev,
                                   pin_memory=self._use_cuda)
        else:  # If sparse tensor we need fortran for kernel calculation
            C = create_fortran((M, M), dtype=dtype, device=dev, pin_memory=self._use_cuda)
        self.kernel(X, X, out=C, opt=self.params)
    if not is_f_contig(C):
        C = C.T

    with TicToc("Cholesky 1", debug=self.params.debug):
        # Compute T: lower(fC) = T.T
        inplace_add_diag_th(C, eps * M)
        C = potrf_wrapper(C, clean=False, upper=False, use_cuda=self._use_cuda, opt=self.params)
        # Save the diagonal which will be overwritten when computing A
        self.dT = C.diag()

    with TicToc("Copy triangular", debug=self.params.debug):
        # Copy lower(fC) to upper(fC): upper(fC) = T.
        copy_triang(C, upper=False)

    # Weighted least-squares needs to weight the A matrix. We can weigh once before LAUUM,
    # but since CUDA-LAUUM touches both sides of C, weighting before LAUUM will also modify
    # the matrix T. Therefore for CUDA inputs we weigh twice after LAUUM!
    if weight_vec is not None and not self._use_cuda:
        with TicToc("Weighting(CPU)", debug=self.params.debug):
            weight_vec.sqrt_()
            vec_mul_triang(C, weight_vec, side=1, upper=False)

    if self._use_cuda:
        with TicToc("LAUUM(CUDA)", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T, store in lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=True, use_cuda=self._use_cuda, opt=self.params)
    else:
        with TicToc("LAUUM(CPU)", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC), store in lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=False, use_cuda=self._use_cuda, opt=self.params)

    if weight_vec is not None and self._use_cuda:
        with TicToc("Weighting(CUDA)", debug=self.params.debug):
            weight_vec.sqrt_()
            vec_mul_triang(C, weight_vec, side=0, upper=False)
            vec_mul_triang(C, weight_vec, side=1, upper=False)

    with TicToc("Cholesky 2", debug=self.params.debug):
        # lower(fC) = 1/M * T@T.T
        mul_triang(C, upper=False, preserve_diag=False, multiplier=1 / M)
        # lower(fC) = 1/M * T@T.T + lambda * I
        inplace_add_diag_th(C, self._lambda)
        # Cholesky on lower(fC) : lower(fC) = A.T
        C = potrf_wrapper(C, clean=False, upper=False, use_cuda=self._use_cuda, opt=self.params)

    self.dA = C.diag()
    self.fC = C
def _ic_cholesky(A, upper, device, cusolver_handle):
    """Cholesky factorization of matrix `A` on the GPU

    Uses the cuSOLVER library for implementation of the POTRF function.

    Parameters:
    -----------
    A : [n, n] CPU or GPU array (column-contiguous)
        The (positive definite) matrix which should be factorized
    upper : bool
        Whether we need to factorize the upper or lower portion of `A`. The other side
        of the matrix will not be touched.
    device : int
        The GPU device on which to run the factorization
    cusolver_handle
        Pointer to the cuSOLVER context, which needs to be initialized before calling
        the function.

    Returns:
    --------
    A : [n, n] CPU or GPU array (column-contiguous)
        The factorization of A which overwrites the upper (or lower) triangular part
        of the matrix A. This is not a copy of the original matrix.
    """
    # Check library initialization
    if cusolver_handle is None:
        raise RuntimeError("CuSolver must be initialized "
                           "before running in-core Cholesky.")
    if not is_f_contig(A):
        raise RuntimeError("Cholesky input must be F-contiguous")

    uplo = 'U' if upper else 'L'
    n = A.shape[0]
    tc_device = torch.device("cuda:%d" % (device))
    tc_stream = torch.cuda.current_stream(tc_device)

    # Choose functions by dtype
    potrf_buf_size = choose_fn(A.dtype, cusolverDnDpotrf_bufferSize,
                               cusolverDnSpotrf_bufferSize, "POTRF Buffer size")
    potrf_fn = choose_fn(A.dtype, cusolverDnDpotrf, cusolverDnSpotrf, "POTRF")

    with torch.cuda.device(tc_device), \
            torch.cuda.stream(tc_stream), \
            cusolver_stream(cusolver_handle, tc_stream._as_parameter_):
        # Determine necessary buffer size
        potrf_bsize = potrf_buf_size(handle=cusolver_handle, uplo=uplo, n=n, A=0, lda=n)

        # Allocate flat GPU buffer, and extract buffers
        if A.is_cuda:
            potrf_wspace = torch.empty(size=(potrf_bsize,), dtype=A.dtype, device=tc_device)
            Agpu = A
        else:
            gpu_buf = torch.empty(size=(n * n + potrf_bsize,), dtype=A.dtype, device=tc_device)
            potrf_wspace = gpu_buf[:potrf_bsize]
            Agpu = extract_fortran(gpu_buf, (n, n), offset=potrf_bsize)
            # Copy A to device memory
            copy_to_device(n, n, A, 0, 0, Agpu, 0, 0, s=tc_stream)
        dev_info = torch.tensor(4, dtype=torch.int32, device=tc_device)

        # Run cholesky
        potrf_fn(handle=cusolver_handle, uplo=uplo, n=n, A=Agpu.data_ptr(), lda=n,
                 workspace=potrf_wspace.data_ptr(), Lwork=potrf_bsize, devInfo=dev_info)

        # Copy back to CPU
        if not A.is_cuda:
            copy_to_host(n, n, Agpu, 0, 0, A, 0, 0, s=tc_stream)
            del Agpu
        del potrf_wspace, dev_info
    tc_stream.synchronize()
    return A
def gpu_cholesky(A: torch.Tensor, upper: bool, clean: bool, overwrite: bool,
                 opt: FalkonOptions) -> torch.Tensor:
    """Cholesky factorization of the positive-definite matrix `A`, run on the GPU.

    Parameters
    -----------
    A : torch.Tensor
        2D positive-definite matrix of size (n x n) that will be factorized as
        A = U.T @ U (if `upper` is True) or A = L @ L.T (if `upper` is False).
    upper : bool
        Whether the triangle which should be factorized is the upper or lower of `A`.
    clean : bool
        Whether the "other" triangle of the output matrix (the one that does not contain
        the factorization) will be filled with zeros or not.
    overwrite : bool
        Whether to overwrite matrix A or to output the result in a new buffer.
    opt : FalkonOptions
        Options forwarded for block calculation, and other knobs in the out-of-core
        parallel POTRF implementation. Useful options are the ones defined in
        :class:`~falkon.options.CholeskyOptions`.

    Notes
    ------
    The factorization will always be the 'lower' version of the factorization which
    could however end up on the upper-triangular part of the matrix in case A is not
    Fortran contiguous to begin with.
    """
    # Handle 'overwrite' option immediately so that its usage is reflected in memory
    # availability (in case A is on GPU).
    if not overwrite:
        # We could change the stride to be more favorable to the POTRF requirements
        # but it gets complicated. We leave such decisions to the user!
        A = copy_same_stride(A, pin_memory=True)

    # Decide which version of the algo we run: can be in-core or parallel.
    # (Note that the original OOC version is not going to run).

    # Determine GPU free RAM
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
    for g in gpu_info:
        g.actual_free_mem = min((g.free_memory - 300 * 2 ** 20) * 0.95,
                                opt.max_gpu_mem * 0.95)

    if A.is_cuda:
        try:
            device = [d for d in gpu_info if d.Id == A.device.index][0]
        except IndexError:
            # This should never happen!
            raise RuntimeError("Device of matrix A (%s) is not recognized" % (A.device))
    else:
        device = max(gpu_info, key=lambda g: g.actual_free_mem)
    ic = can_do_ic(A, device) and not opt.chol_force_ooc
    if opt.chol_force_in_core and not ic:
        raise RuntimeError("Cannot run in-core POTRF but `chol_force_in_core` was specified.")

    f_order = is_f_contig(A)
    transposed = False
    if not f_order:
        A = A.T
        upper = not upper
        transposed = True

    # Now A is always in f_order. So we can only allow upper=False (ooc)
    if upper:
        # Can do only in-core!
        if not ic:
            raise ValueError("GPU POTRF is only implemented on the "
                             "lower triangle for Fortran-ordered matrices (or on the upper "
                             "triangle for C-ordered matrices)")
    if not ic and A.is_cuda:
        _msg = "Cannot run out-of-core POTRF on CUDA matrix 'A'."
        if opt.chol_force_ooc:
            _msg += " Set the `chol_force_ooc` option to `False` to allow in-core POTRF."
        raise ValueError(_msg)

    # Handle different implementations for POTRF: in-core and out-of-core
    if ic:
        if opt.debug:
            print("Using in-core POTRF")
        _ic_cholesky(A, upper, device=device.Id,
                     cusolver_handle=initialization.cusolver_handle(device.Id))
    else:
        if opt.debug:
            print("Using parallel POTRF")
        _parallel_potrf_runner(A, opt, gpu_info)

    # Perform cleaning of the 'other side' of the matrix
    if clean:
        la_helpers.zero_triang(A, upper=not upper)

    # Undo previous matrix transformations
    if transposed:
        A = A.T

    return A
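# Usage sketch for gpu_cholesky (illustrative only, not part of the library): factorize a
# hypothetical positive-definite matrix `K` in place, keeping the lower triangle and zeroing
# the other one. It assumes the default FalkonOptions are acceptable for the run.
def _example_gpu_cholesky_usage(K: torch.Tensor) -> torch.Tensor:
    opt = FalkonOptions()
    return gpu_cholesky(K, upper=False, clean=True, overwrite=True, opt=opt)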