Example #1
def _parallel_lauum_runner(A, write_opposite: bool, opt: LauumOptions,
                           gpu_info):
    # Choose target:
    if is_f_contig(A):
        target = par_lauum_f_lower
    elif is_contig(A):
        target = par_lauum_c_lower
    else:
        raise NotImplementedError(
            "Parallel LAUUM is only implemented for contiguous matrices")

    num_gpus = len(gpu_info)
    if num_gpus < 1:
        raise ValueError(
            "Parallel LAUUM should only be run when some GPU is available.")
    N = A.shape[0]
    dt = A.dtype
    dts = sizeof_dtype(dt)
    avail_ram = min([g.actual_free_mem for g in gpu_info]) / dts
    # Each GPU should be able to hold in memory 2 block columns
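    # (i.e. 2*B*N <= avail_ram  =>  B <= avail_ram / (2*N))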
    max_block_size = int(math.floor(avail_ram / (2 * N)))
    if max_block_size < 1:
        raise RuntimeError("Cannot run parallel LAUUM with minimum "
                           "available memory of %.2fMB" %
                           (avail_ram * dts / 2**20))

    block_sizes = calc_block_sizes2(max_block_size, num_gpus, N,
                                    opt.lauum_par_blk_multiplier)
    block_allocations: List[BlockAlloc] = []
    cur_n = 0
    for bs in block_sizes:
        block_allocations.append(
            BlockAlloc(start=cur_n, end=cur_n + bs, length=bs))
        cur_n += bs

    barrier = threading.Barrier(num_gpus, timeout=1000)
    threads = []
    for g in gpu_info:
        gid_allocs = [
            i for i in range(len(block_allocations)) if i % num_gpus == g.Id
        ]
        cublas_handle = initialization.cublas_handle(g.Id)
        if cublas_handle is None:
            raise RuntimeError("CUBLAS must be initialized "
                               "on device %d before running parallel LAUUM." %
                               (g.Id))
        t = PropagatingThread(target=target,
                              name="GPU-%d" % (g.Id),
                              args=(A, block_allocations, gid_allocs, barrier,
                                    g.Id, cublas_handle, write_opposite))
        threads.append(t)

    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return A
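
As a side note, the BlockAlloc construction above simply turns a list of block sizes into contiguous row ranges. The following standalone sketch (using a hypothetical stand-in for the library's BlockAlloc class) shows the result for block sizes [3, 3, 2]:

from dataclasses import dataclass
from typing import List

@dataclass
class BlockAlloc:  # hypothetical stand-in for the library's BlockAlloc
    start: int
    end: int
    length: int

block_sizes = [3, 3, 2]
block_allocations: List[BlockAlloc] = []
cur_n = 0
for bs in block_sizes:
    block_allocations.append(BlockAlloc(start=cur_n, end=cur_n + bs, length=bs))
    cur_n += bs
print(block_allocations)
# [BlockAlloc(start=0, end=3, length=3), BlockAlloc(start=3, end=6, length=3),
#  BlockAlloc(start=6, end=8, length=2)]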
Example #2
def cuda_trsm(A: torch.Tensor,
              v: torch.Tensor,
              alpha: float,
              lower: int,
              transpose: int,
              stream: Optional[torch.cuda.Stream] = None) -> torch.Tensor:
    if not is_f_contig(A, strict=False):
        raise ValueError("A must be f-contiguous for CUDA TRSM to work.")
    if not check_same_device(A, v):
        raise ValueError("A and v must be on the same CUDA device.")
    if not A.is_cuda:
        raise ValueError("A and v must be CUDA tensors!")

    device = A.device
    s = stream
    if stream is None:
        s = torch.cuda.current_stream(device=device)
    cublas_hdl = cublas_handle(device.index)
    trsm_fn = choose_fn(A.dtype, cublasDtrsm, cublasStrsm, "TRSM")

    # noinspection PyProtectedMember
    with torch.cuda.device(device), torch.cuda.stream(s), cublas_stream(
            cublas_hdl, s._as_parameter_):
        # Deal with copying v, which may not be F-contiguous.
        vF = create_fortran(v.size(), v.dtype, device)
        if is_f_contig(v, strict=False):
            # We can just make a copy of v
            vF.copy_(v)
            # Sync is necessary here for correctness. Not sure why!
            # TODO: Is it still needed?
            s.synchronize()
        else:
            vF = cuda_transpose(input=v, output=vF.T).T

        uplo = 'L' if lower else 'U'
        trans = 'T' if transpose else 'N'
        trsm_fn(cublas_hdl,
                side='L',
                uplo=uplo,
                trans=trans,
                diag='N',
                m=vF.shape[0],
                n=vF.shape[1],
                alpha=alpha,
                A=A.data_ptr(),
                lda=A.stride(1),
                B=vF.data_ptr(),
                ldb=vF.stride(1))
        if is_f_contig(v, strict=False):
            vout = vF
        else:
            vout = create_C(v.size(), v.dtype, device)
            vout = cuda_transpose(input=vF, output=vout.T).T
    return vout
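
A minimal usage sketch (not part of the original examples; it assumes the cuda_trsm defined above is in scope and a CUDA device is available). It builds an F-contiguous lower-triangular CUDA matrix via the double-transpose trick and checks the TRSM result against torch.linalg.solve_triangular:

import torch

n, m = 512, 16
A = torch.randn(n, n, dtype=torch.float64).tril()
A += n * torch.eye(n, dtype=torch.float64)   # well-conditioned lower-triangular matrix
A_gpu = A.cuda().T.contiguous().T            # F-contiguous copy on the GPU
v_gpu = torch.randn(n, m, dtype=torch.float64, device=A_gpu.device)

x = cuda_trsm(A_gpu, v_gpu, alpha=1.0, lower=1, transpose=0)  # solves A @ x = v
expected = torch.linalg.solve_triangular(A_gpu, v_gpu, upper=False)
print(torch.allclose(x, expected))           # True, up to numerical precision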
Example #3
def cuda_trsm(A: torch.Tensor, v: torch.Tensor, alpha: float, lower: int,
              transpose: int) -> torch.Tensor:
    if not is_f_contig(A, strict=False):
        raise ValueError("A must be f-contiguous for CUDA TRSM to work.")
    if not check_same_device(A, v):
        raise ValueError("A and v must be on the same CUDA device.")
    if not A.is_cuda:
        raise ValueError("A and v must be CUDA tensors!")

    s = torch.cuda.Stream(device=A.device)
    cublas_hdl = cublas_handle(A.device.index)
    trsm_fn = choose_fn(A.dtype, cublasDtrsm, cublasStrsm, "TRSM")

    with torch.cuda.device(A.device), torch.cuda.stream(s), cublas_stream(
            cublas_hdl, s._as_parameter_):
        # Deal with copying v, which may not be F-contiguous.
        vF = create_fortran(v.size(), v.dtype, v.device)
        if is_f_contig(v, strict=False):
            # We can just make a copy of v
            vF.copy_(v)
        else:
            vF = cuda_transpose(input=v, output=vF.T).T

        uplo = 'L' if lower else 'U'
        trans = 'T' if transpose else 'N'
        trsm_fn(cublas_hdl,
                side='L',
                uplo=uplo,
                trans=trans,
                diag='N',
                m=vF.shape[0],
                n=vF.shape[1],
                alpha=alpha,
                A=A.data_ptr(),
                lda=A.stride(1),
                B=vF.data_ptr(),
                ldb=vF.stride(1))
        if not is_f_contig(v, strict=False):
            vout = create_C(v.size(), v.dtype, v.device)
            vout = cuda_transpose(input=vF, output=vout.T).T
        else:
            vout = vF
        s.synchronize()
    return vout
Example #4
def _parallel_lauum_runner(A, write_opposite: bool, gpu_info):
    # Choose target:
    if is_f_contig(A):
        target = par_lauum_f_lower
    elif is_contig(A):
        target = par_lauum_c_lower
    else:
        raise NotImplementedError(
            "Parallel LAUUM is only implemented for contiguous matrices")

    N = A.shape[0]
    dt = A.dtype
    dts = sizeof_dtype(dt)
    if A.is_cuda:
        sync_current_stream(A.device)
        gpu_info = [g for g in gpu_info if g.Id == A.device.index]
        avail_ram = gpu_info[0].actual_free_mem / dts
        if target.__name__ == "par_lauum_f_lower":
            # Each GPU should hold in memory two additional blocks (2*B^2 <= M)
            # and 1 full column.
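            # Memory constraint: 2*B^2 + B*N <= M (M = avail_ram, in elements).
            # The positive root of 2*B^2 + N*B - M = 0 gives the bound below:
            # B <= (-N + sqrt(N^2 + 8*M)) / 4.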
            max_block_size = int(
                math.floor((-N + math.sqrt(N**2 + 8 * avail_ram)) / 4))
        else:
            # Same RAM requirements as the out-of-core version
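            # Memory constraint: 2*B^2 + 2*B*N <= M (M = avail_ram), whose
            # positive root gives B <= (-2*N + sqrt(4*N^2 + 8*M)) / 4.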
            max_block_size = int(
                math.floor((-2 * N + math.sqrt(4 * N**2 + 8 * avail_ram)) / 4))
        if max_block_size < 1:
            raise RuntimeError("Cannot run parallel LAUUM with minimum "
                               "available memory of %.2fMB" %
                               (avail_ram * dts / 2**20))
        # All computations on the same device (where data is stored). No multi-GPU support!
        block_sizes = calc_block_sizes3(max_block_size, 1, N)
    else:
        avail_ram = min([g.actual_free_mem for g in gpu_info]) / dts
        # Each GPU should be able to hold in memory 2 block columns plus two
        # blocks (=> quadratic equation 2B^2 + 2BN - M <= 0).
        # An additional block is needed whenever write_opposite is True, due to
        # copying blocks between matrices with different strides!
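        # Without the extra block: 2*B^2 + 2*B*N <= M  =>  B <= (-2*N + sqrt(4*N^2 + 8*M)) / 4.
        # With the extra block:    3*B^2 + 2*B*N <= M  =>  B <= (-2*N + sqrt(4*N^2 + 12*M)) / 6.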
        if write_opposite:
            max_block_size = int(
                math.floor(
                    (-2 * N + math.sqrt(4 * N**2 + 12 * avail_ram)) / 6))
        else:
            max_block_size = int(
                math.floor((-2 * N + math.sqrt(4 * N**2 + 8 * avail_ram)) / 4))
        if max_block_size < 1:
            raise RuntimeError("Cannot run parallel LAUUM with minimum "
                               "available memory of %.2fMB" %
                               (avail_ram * dts / 2**20))

        block_sizes = calc_block_sizes3(max_block_size, len(gpu_info), N)

    # Create BlockAlloc objects describing the subdivision of input
    block_allocations: List[BlockAlloc] = []
    cur_n = 0
    for bs in block_sizes:
        block_allocations.append(
            BlockAlloc(start=cur_n, end=cur_n + bs, length=bs))
        cur_n += bs

    num_gpus = len(gpu_info)
    if num_gpus < 1:
        raise ValueError(
            "Parallel LAUUM can only run when a GPU is available.")
    barrier = threading.Barrier(num_gpus, timeout=1000)
    threads = []
    for _gpu_idx, g in enumerate(gpu_info):
        # Assign block rows to GPUs round-robin. Use _gpu_idx instead of g.Id since
        # device IDs are not guaranteed to be consecutive integers starting from 0.
        gid_allocs = [
            i for i in range(len(block_allocations))
            if i % num_gpus == _gpu_idx
        ]
        cublas_handle = initialization.cublas_handle(g.Id)
        if cublas_handle is None:
            raise RuntimeError("CUBLAS must be initialized "
                               "on device %d before running parallel LAUUM." %
                               (g.Id))
        t = PropagatingThread(target=target,
                              name="GPU-%d" % (g.Id),
                              args=(A, block_allocations, gid_allocs, barrier,
                                    g.Id, cublas_handle, write_opposite))
        threads.append(t)

    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return A
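
For illustration, the round-robin assignment of block rows to GPUs reduces to the following standalone snippet (block and GPU counts chosen arbitrarily):

num_blocks, num_gpus = 7, 2
for gpu_idx in range(num_gpus):
    gid_allocs = [i for i in range(num_blocks) if i % num_gpus == gpu_idx]
    print(f"GPU {gpu_idx} -> block rows {gid_allocs}")
# GPU 0 -> block rows [0, 2, 4, 6]
# GPU 1 -> block rows [1, 3, 5]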