Example #1
def _parallel_potrf_runner(A: torch.Tensor, opt: CholeskyOptions,
                           gpu_info) -> torch.Tensor:
    num_gpus = len(gpu_info)
    N = A.shape[0]
    dt = A.dtype
    # Calculate the maximum block size such that we don't run out of GPU
    # RAM on **any** available GPU. We need a total of 2 whole columns and 1 tile:
    # block-size^2 * ((N / block-size) * 2 + 1) floats
    # (plus the cuSOLVER buffer which is small).
    # block_size < (sqrt((2*N)^2 + 4R) - 2*N) / 2
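    # Illustrative check with hypothetical numbers: for N = 50_000 in float64 and
    # ~8 GiB free on the smallest GPU (R = 2**30 elements), the bound gives
    # block_size <= (sqrt(4*50_000**2 + 4*2**30) - 2*50_000) / 2 ~= 9_780, and
    # indeed 9_780**2 + 2*50_000*9_780 ~= 1.07e9 ~= R.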
    dts = sizeof_dtype(dt)
    avail_ram = min([g.actual_free_mem for g in gpu_info]) / dts
    max_block_size = (math.sqrt(4 * N**2 + 4 * avail_ram) - 2 * N) / 2
    max_block_size = int(math.floor(max_block_size))
    if max_block_size < 1:
        raise RuntimeError("Cannot run parallel POTRF with minimum "
                           "available memory of %.2fMB" %
                           (avail_ram * dts / 2**20))

    block_sizes = calc_block_sizes(max_block_size, num_gpus, N,
                                   opt.chol_par_blk_multiplier)
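    # Each allocation tuple is (block start, block end, block size, assigned GPU id,
    # block index); blocks are assigned to GPUs in round-robin order.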
    block_allocations = []
    cur_n = 0
    for i, bs in enumerate(block_sizes):
        block_allocations.append((cur_n, cur_n + bs, bs, i % num_gpus, i))
        cur_n += bs

    device_info = []
    for g in range(num_gpus):
        device_info.append((0.0, initialization.cusolver_handle(g), g))

    parallel_potrf(device_info, block_allocations, A)
    return A
Example #2
def gpu_cholesky(A: torch.Tensor, upper: bool, clean: bool, overwrite: bool,
                 opt: FalkonOptions) -> torch.Tensor:
    """
    Parameters
    -----------
    A : torch.Tensor
        2D positive-definite matrix of size (n x n) that will be factorized as
        A = U.T @ U (if `upper` is True) or A = L @ L.T (if `upper` is False).
    upper : bool
        Whether to factorize the upper or the lower triangle of `A`.
    clean : bool
        Whether the "other" triangle of the output matrix (the one that
        does not contain the factorization) will be filled with zeros or
        not.
    overwrite : bool
        Whether to overwrite matrix A or to output the result in a new
        buffer.
    opt : FalkonOptions
        Options used for block-size calculation and other knobs of the out-of-core
        parallel POTRF implementation. The relevant options are those defined in
        :class:`~falkon.options.CholeskyOptions`.

    Notes
    ------
    The factorization is always computed as the 'lower' variant; however, it may
    end up in the upper-triangular part of the matrix if A is not Fortran-contiguous
    to begin with.
    """
    # Handle 'overwrite' option immediately so that its usage is reflected in memory
    # availability (in case A is on GPU).
    if not overwrite:
        # We could change the stride to be more favorable to the POTRF requirements
        # but it gets complicated. We leave such decisions to the user!
        A = copy_same_stride(A, pin_memory=True)

    # Decide which version of the algorithm to run: in-core or parallel (out-of-core).
    # (Note that the original OOC version is never run.)

    # Determine GPU free RAM
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
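    # Keep a ~300 MiB safety margin per GPU and use at most 95% of the remaining
    # free memory (or of the user-specified `max_gpu_mem` cap).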
    for g in gpu_info:
        g.actual_free_mem = min((g.free_memory - 300 * 2**20) * 0.95,
                                opt.max_gpu_mem * 0.95)

    if A.is_cuda:
        try:
            device = [d for d in gpu_info if d.Id == A.device.index][0]
        except IndexError:
            # This should never happen!
            raise RuntimeError("Device of matrix A (%s) is not recognized" %
                               (A.device))
    else:
        device = max(gpu_info, key=lambda g: g.actual_free_mem)
    ic = can_do_ic(A, device) and not opt.chol_force_ooc
    if opt.chol_force_in_core and not ic:
        raise RuntimeError(
            "Cannot run in-core POTRF but `chol_force_in_core` was specified.")

    f_order = is_f_contig(A)
    transposed = False
    if not f_order:
        A = A.T
        upper = not upper
        transposed = True
    # A is now always F-ordered, so the out-of-core path only supports upper=False.
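    # (For a C-contiguous A, A.T is an F-contiguous view of the same storage, and the
    # upper triangle of A corresponds to the lower triangle of A.T, hence the flip of
    # `upper` above.)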
    if upper:
        # Can do only in-core!
        if not ic:
            raise ValueError(
                "GPU POTRF is only implemented on the "
                "lower triangle for Fortran-ordered matrices (or on the upper "
                "triangle for C-ordered matrices)")
    if not ic and A.is_cuda:
        _msg = "Cannot run out-of-core POTRF on CUDA matrix 'A'."
        if opt.chol_force_ooc:
            _msg += " Set the `chol_force_ooc` option to `False` in to allow in-core POTRF."
        raise ValueError(_msg)

    # Handle different implementations for POTRF: in-core and out-of-core
    if ic:
        if opt.debug:
            print("Using in-core POTRF")
        _ic_cholesky(A,
                     upper,
                     device=device.Id,
                     cusolver_handle=initialization.cusolver_handle(device.Id))
    else:
        if opt.debug:
            print("Using parallel POTRF")
        _parallel_potrf_runner(A, opt, gpu_info)

    # Perform cleaning of the 'other side' of the matrix
    if clean:
        la_helpers.zero_triang(A, upper=not upper)
    # Undo previous matrix transformations
    if transposed:
        A = A.T

    return A
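
Below is a minimal usage sketch, not part of the original source: it assumes `gpu_cholesky` is available as defined above together with `falkon.options.FalkonOptions`, and builds a small positive-definite, Fortran-ordered matrix so the factor lands on the lower triangle, as described in the Notes.

import numpy as np
import torch
from falkon.options import FalkonOptions

# Build a small positive-definite matrix and make it Fortran-contiguous so that
# the 'lower' factorization stays on the lower triangle.
n = 1000
X = torch.randn(n, n, dtype=torch.float64)
A = X @ X.T + n * torch.eye(n, dtype=torch.float64)
A = torch.from_numpy(np.asfortranarray(A.numpy()))

L = gpu_cholesky(A, upper=False, clean=True, overwrite=False,
                 opt=FalkonOptions())
# L @ L.T should reconstruct A up to numerical error.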