def _parallel_potrf_runner(A: torch.Tensor, opt: CholeskyOptions, gpu_info) -> torch.Tensor:
    num_gpus = len(gpu_info)
    N = A.shape[0]
    dt = A.dtype
    # Calculate the maximum block size such that we don't run out of GPU
    # RAM on **any** available GPU. We need a total of 2 whole columns and 1 tile:
    # block-size^2 * ((N / block-size) * 2 + 1) floats
    # (plus the cuSOLVER buffer which is small).
    # block_size < (sqrt((2*N)^2 + 4*R) - 2*N) / 2
    dts = sizeof_dtype(dt)
    avail_ram = min([g.actual_free_mem for g in gpu_info]) / dts
    max_block_size = (math.sqrt(4 * N**2 + 4 * avail_ram) - 2 * N) / 2
    max_block_size = int(math.floor(max_block_size))
    if max_block_size < 1:
        raise RuntimeError("Cannot run parallel POTRF with minimum "
                           "available memory of %.2fMB" % (avail_ram * dts / 2**20))

    block_sizes = calc_block_sizes(max_block_size, num_gpus, N, opt.chol_par_blk_multiplier)
    block_allocations = []
    cur_n = 0
    for i, bs in enumerate(block_sizes):
        block_allocations.append((cur_n, cur_n + bs, bs, i % num_gpus, i))
        cur_n += bs

    device_info = []
    for g in range(num_gpus):
        device_info.append((0.0, initialization.cusolver_handle(g), g))

    parallel_potrf(device_info, block_allocations, A)
    return A
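
# A minimal sketch (hypothetical helper, not used by the library) mirroring the
# allocation loop in `_parallel_potrf_runner` above, to clarify the layout of the
# block-allocation tuples (col_start, col_end, block_size, gpu_id, block_id) and
# their round-robin assignment to GPUs. For example, with block_sizes=[4, 3, 3]
# and num_gpus=2 it yields [(0, 4, 4, 0, 0), (4, 7, 3, 1, 1), (7, 10, 3, 0, 2)].
def _round_robin_blocks(block_sizes, num_gpus):
    allocations, start = [], 0
    for i, bs in enumerate(block_sizes):
        allocations.append((start, start + bs, bs, i % num_gpus, i))
        start += bs
    return allocations
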
def gpu_cholesky(A: torch.Tensor, upper: bool, clean: bool, overwrite: bool,
                 opt: FalkonOptions) -> torch.Tensor:
    """
    Parameters
    ----------
    A : torch.Tensor
        2D positive-definite matrix of size (n x n) that will be factorized as
        A = U.T @ U (if `upper` is True) or A = L @ L.T (if `upper` is False).
    upper : bool
        Whether the triangle which should be factorized is the upper or lower of `A`.
    clean : bool
        Whether the "other" triangle of the output matrix (the one that does not
        contain the factorization) will be filled with zeros or not.
    overwrite : bool
        Whether to overwrite matrix A or to output the result in a new buffer.
    opt : FalkonOptions
        Options forwarded for block calculation, and other knobs in the out-of-core
        parallel POTRF implementation. Useful options are the ones defined in
        :class:`~falkon.options.CholeskyOptions`.

    Notes
    -----
    The factorization will always be the 'lower' version of the factorization,
    which could however end up on the upper-triangular part of the matrix
    in case A is not Fortran contiguous to begin with.
    """
    # Handle 'overwrite' option immediately so that its usage is reflected in memory
    # availability (in case A is on GPU).
    if not overwrite:
        # We could change the stride to be more favorable to the POTRF requirements
        # but it gets complicated. We leave such decisions to the user!
        A = copy_same_stride(A, pin_memory=True)

    # Decide which version of the algorithm we run: in-core or parallel out-of-core.
    # (Note that the original OOC version is not going to run.)
    # Determine GPU free RAM
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
    for g in gpu_info:
        g.actual_free_mem = min((g.free_memory - 300 * 2**20) * 0.95,
                                opt.max_gpu_mem * 0.95)

    if A.is_cuda:
        try:
            device = [d for d in gpu_info if d.Id == A.device.index][0]
        except IndexError:
            # This should never happen!
            raise RuntimeError("Device of matrix A (%s) is not recognized" % (A.device))
    else:
        device = max(gpu_info, key=lambda g: g.actual_free_mem)
    ic = can_do_ic(A, device) and not opt.chol_force_ooc
    if opt.chol_force_in_core and not ic:
        raise RuntimeError("Cannot run in-core POTRF but `chol_force_in_core` was specified.")

    f_order = is_f_contig(A)
    transposed = False
    if not f_order:
        A = A.T
        upper = not upper
        transposed = True
    # Now A is always in f_order. So we can only allow upper=False (ooc)
    if upper:
        # Can do only in-core!
        if not ic:
            raise ValueError("GPU POTRF is only implemented on the "
                             "lower triangle for Fortran-ordered matrices (or on the upper "
                             "triangle for C-ordered matrices)")
    if not ic and A.is_cuda:
        _msg = "Cannot run out-of-core POTRF on CUDA matrix 'A'."
        if opt.chol_force_ooc:
            _msg += " Set the `chol_force_ooc` option to `False` to allow in-core POTRF."
        raise ValueError(_msg)

    # Handle different implementations for POTRF: in-core and out-of-core
    if ic:
        if opt.debug:
            print("Using in-core POTRF")
        _ic_cholesky(A, upper, device=device.Id,
                     cusolver_handle=initialization.cusolver_handle(device.Id))
    else:
        if opt.debug:
            print("Using parallel POTRF")
        _parallel_potrf_runner(A, opt, gpu_info)

    # Perform cleaning of the 'other side' of the matrix
    if clean:
        la_helpers.zero_triang(A, upper=not upper)
    # Undo previous matrix transformations
    if transposed:
        A = A.T

    return A
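
# Usage sketch (illustrative only, not part of the library API): factorize a small
# SPD matrix with the upper-triangular convention. Assumes a CUDA-capable machine;
# `FalkonOptions` is the options class already imported in this module, and the
# helper name `_example_gpu_cholesky_usage` is hypothetical.
def _example_gpu_cholesky_usage(n: int = 2000) -> None:
    # Build a well-conditioned symmetric positive-definite matrix on the CPU.
    rnd = torch.randn(n, 64, dtype=torch.float64)
    mat = rnd @ rnd.T + n * torch.eye(n, dtype=torch.float64)
    # Factorize into the upper triangle without touching the input matrix.
    upper_chol = gpu_cholesky(mat, upper=True, clean=True, overwrite=False,
                              opt=FalkonOptions())
    # With clean=True the other triangle is zeroed, so U.T @ U reconstructs mat.
    assert torch.allclose(upper_chol.T @ upper_chol, mat)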