Example #1
0
def fmmv_cuda_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: torch.Tensor,
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """Blocked multi-GPU kernel-vector product for sparse inputs.

    The rows of `X1` (and the matching rows of `out`) are partitioned among
    the available GPUs; each non-empty partition is handed to a separate
    worker process running :func:`sparse_fmmv`.  Results are written into
    `out` (N x T), which is created as pinned CPU memory if not supplied.
    """
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (out, 'out'))

    num_rows = X1.size(0)
    if out is None:
        # Pinned CPU memory speeds up the device-to-host copies in workers.
        out = create_fortran((num_rows, v.size(1)), v.dtype, 'cpu',
                             pin_memory=True)
    out.fill_(0.0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, num_rows)

    # Build one (args, device-id) tuple per GPU with a non-empty row block.
    proc_args = []
    for dev_idx, dev in enumerate(gpu_info):
        blk_start = block_sizes[dev_idx]
        blk_len = block_sizes[dev_idx + 1] - blk_start
        if blk_len <= 0:
            continue
        worker_args = ArgsFmmv(X1=X1.narrow_rows(blk_start, blk_len),
                               X2=X2,
                               v=v,
                               out=out.narrow(0, blk_start, blk_len),
                               kernel=kernel,
                               max_mem=dev.usable_ram)
        proc_args.append((worker_args, dev.Id))

    _start_wait_processes(sparse_fmmv, proc_args)
    return out
Example #2
0
def fdmmv_cuda_sparse(X1: SparseTensor,
                      X2: SparseTensor,
                      v: Optional[torch.Tensor],
                      w: Optional[torch.Tensor],
                      kernel,
                      out: Optional[torch.Tensor] = None,
                      opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """Blocked multi-GPU double kernel-vector product for sparse inputs.

    X1 : N x D (sparse)
    X2 : M x D (sparse)
    v  : M x T (optional)
    w  : N x T (optional)

    NOTE(review): presumably computes
    fnc(X1*X2', X1, X2)' * ( fnc(X1*X2', X1, X2) * v + w ) : M x T,
    mirroring the dense `fdmmv_cuda` — confirm against `sparse_fdmmv`.
    At least one of `v`, `w` must be given; raises ValueError otherwise.
    Returns `out` (M x T), allocated as pinned CPU memory if not supplied.
    """
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (w, 'w'), (out, 'out'))
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    # Output column count comes from whichever of v / w is present.
    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    gpu_info = _get_gpu_info(opt, slack=0.95)
    # Partition the N rows of X1 (and of w) among the GPUs.
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    if out is None:
        # Pinned CPU memory speeds up the final device-to-host copy.
        out = create_C((M, T), X1.dtype, 'cpu', pin_memory=True)

    wrlk = []  # outputs for each subprocess.
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        # Each worker accumulates into a dedicated M x T buffer on its own
        # GPU; the per-device buffers are summed after all workers finish.
        cur_out_gpu = create_C((M, T), X1.dtype,
                               f'cuda:{gpu_info[i].Id}')  # M x T
        wrlk.append(cur_out_gpu)

        cur_w = None

        # w is split row-wise like X1; v is passed whole to every worker.
        if w is not None:
            cur_w = w.narrow(0, block_sizes[i], bwidth)
        args.append((ArgsFdmmv(X1=X1.narrow_rows(block_sizes[i], bwidth),
                               X2=X2,
                               v=v,
                               w=cur_w,
                               out=cur_out_gpu,
                               kernel=kernel,
                               max_mem=g.usable_ram), g.Id))

    _start_wait_processes(sparse_fdmmv, args)

    # Sum the per-GPU partial results. With several GPUs, reduce on the
    # fastest device before copying back into `out`.
    if len(wrlk) > 1:
        # noinspection PyTypeChecker
        fastest_device: int = np.argmax([d.speed for d in gpu_info])
        out.copy_(
            torch.cuda.comm.reduce_add(
                wrlk, destination=gpu_info[fastest_device].Id))
    else:
        out.copy_(wrlk[0])
    return out
Example #3
0
def fmmv_cuda(X1: torch.Tensor,
              X2: torch.Tensor,
              v: torch.Tensor,
              kernel,
              out: Optional[torch.Tensor] = None,
              opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T

    performs  fnc(X1*X2', X1, X2) * v   : N x T
    in blocks on multiple GPUs
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (out, 'out'))

    num_rows = X1.size(0)
    if out is None:
        # Allocate the N x T result in pinned CPU memory so device-to-host
        # copies performed by the workers are fast.
        out = create_same_stride((num_rows, v.size(1)), X1, v.dtype, 'cpu',
                                 pin_memory=True)
    out.fill_(0.0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, num_rows)

    # Build one (args, device-id) tuple per GPU with a non-empty row block.
    proc_args = []
    for dev_idx, dev in enumerate(gpu_info):
        blk_start = block_sizes[dev_idx]
        blk_len = block_sizes[dev_idx + 1] - blk_start
        if blk_len <= 0:
            continue
        proc_args.append((ArgsFmmv(X1=X1.narrow(0, blk_start, blk_len),
                                   X2=X2,
                                   v=v,
                                   out=out.narrow(0, blk_start, blk_len),
                                   kernel=kernel,
                                   max_mem=dev.usable_ram), dev.Id))

    # With spawn/forkserver multiprocessing, any consumer of a queue holding
    # CUDA tensors must live on a different process than the producer
    # (https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors),
    # so no task runs on the current process.
    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        worker_fn = distk_fmmv
    else:
        worker_fn = generic_fmmv
    _start_wait_processes(worker_fn, proc_args)
    return out
Example #4
0
def fmmv_cuda(X1: torch.Tensor,
              X2: torch.Tensor,
              v: torch.Tensor,
              kernel,
              out: Optional[torch.Tensor] = None,
              opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T

    performs  fnc(X1*X2', X1, X2) * v   : N x T
    in blocks on multiple GPUs
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (out, 'out'))
    device = X1.device

    num_rows = X1.size(0)
    if out is None:
        # Pin memory only for CPU outputs; pinning has no meaning on-device.
        out = create_same_stride((num_rows, v.size(1)), X1, v.dtype,
                                 device=device,
                                 pin_memory=device.type != 'cuda')
    out.fill_(0.0)

    # Choose the worker implementation once, up front.
    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        worker_fn = distk_fmmv
    else:
        worker_fn = generic_fmmv

    gpu_info = _get_gpu_info(opt, slack=0.9)

    if device.type == 'cuda':
        # Input already lives on a GPU: run the whole computation there,
        # in the current process.
        this_gpu = [g for g in gpu_info if g.Id == device.index][0]
        single_args = ArgsFmmv(X1=X1, X2=X2, v=v, out=out, kernel=kernel,
                               max_mem=this_gpu.usable_ram)
        _call_direct(worker_fn, (single_args, device.index))
    else:
        # CPU input: split the rows across all GPUs and fan out to workers.
        block_sizes = calc_gpu_block_sizes(gpu_info, num_rows)
        proc_args = []
        for dev_idx, dev in enumerate(gpu_info):
            blk_start = block_sizes[dev_idx]
            blk_len = block_sizes[dev_idx + 1] - blk_start
            if blk_len <= 0:
                continue
            proc_args.append((ArgsFmmv(
                X1=X1.narrow(0, blk_start, blk_len),
                X2=X2, v=v,
                out=out.narrow(0, blk_start, blk_len),
                kernel=kernel, max_mem=dev.usable_ram), dev.Id))

        _start_wait_processes(worker_fn, proc_args)
    return out
Example #5
0
def fdmmv_cuda(X1: torch.Tensor,
               X2: torch.Tensor,
               v: Optional[torch.Tensor],
               w: Optional[torch.Tensor],
               kernel,
               out: Optional[torch.Tensor] = None,
               opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T
    w  : N x T

    performs fnc(X1*X2', X1, X2)' * ( fnc(X1*X2', X1, X2) * v  +  w )  : M x T
    in blocks on multiple GPUs

    Assume all inputs have the same data type

    At least one of `v`, `w` must be provided; raises ValueError otherwise.
    Returns `out` (M x T), allocated as pinned CPU memory if not supplied.
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (w, 'w'), (out, 'out'))
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    # Output column count comes from whichever of v / w is present.
    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    # Partition the N rows of X1 (and of w) among the GPUs.
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    if out is None:
        # Pinned CPU memory speeds up the final device-to-host copy.
        out = create_same_stride((M, T), X1, X1.dtype, 'cpu', pin_memory=True)

    wrlk = []  # outputs for each subprocess.
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue

        # Each worker accumulates into a dedicated M x T buffer on its own
        # GPU; the per-device buffers are summed after all workers finish.
        cur_out_gpu = create_same_stride((M, T), X1, X1.dtype,
                                         f'cuda:{gpu_info[i].Id}')  # M x T
        wrlk.append(cur_out_gpu)

        cur_w = None
        # w is split row-wise like X1; v is passed whole to every worker.
        if w is not None:
            cur_w = w.narrow(0, block_sizes[i], bwidth)
        args.append((ArgsFdmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                               X2=X2,
                               v=v,
                               w=cur_w,
                               out=cur_out_gpu,
                               kernel=kernel,
                               max_mem=g.usable_ram), g.Id))

    # If using torch multiprocessing with spawn/forkserver here we must make sure
    # that any consumer of the queues is on a different process than the queue producer.
    # This is due to passing in a CUDA tensor to the queue
    # https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
    # Thus we cannot run the first task on the current process.
    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fdmmv
    else:
        target = generic_fdmmv
    _start_wait_processes(target, args)

    # Sum the per-GPU partial results. With several GPUs, reduce on the
    # fastest device before copying back into `out`.
    if len(wrlk) > 1:
        # noinspection PyTypeChecker
        fastest_device: int = np.argmax([d.speed for d in gpu_info])
        out.copy_(
            tcd.comm.reduce_add(wrlk, destination=gpu_info[fastest_device].Id))
    else:
        out.copy_(wrlk[0])
    return out
Example #6
0
def fdmmv_cuda(X1: torch.Tensor,
               X2: torch.Tensor,
               v: Optional[torch.Tensor],
               w: Optional[torch.Tensor],
               kernel,
               out: Optional[torch.Tensor] = None,
               opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T
    w  : N x T

    performs fnc(X1*X2', X1, X2)' * ( fnc(X1*X2', X1, X2) * v  +  w )  : M x T
    in blocks on multiple GPUs

    Assume all inputs have the same data type

    At least one of `v`, `w` must be provided; raises ValueError otherwise.
    If `X1` is already on a CUDA device the whole computation runs in the
    current process on that device; otherwise rows are split across GPUs.
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (w, 'w'), (out, 'out'))
    device = X1.device
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    # Output column count comes from whichever of v / w is present.
    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    if out is None:
        # Pin memory only for CPU outputs; pinning has no meaning on-device.
        out = create_same_stride((M, T),
                                 X1,
                                 X1.dtype,
                                 device=device,
                                 pin_memory=device.type != 'cuda')

    gpu_info = _get_gpu_info(opt, slack=0.9)

    # Choose the worker implementation once, up front.
    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fdmmv
    else:
        target = generic_fdmmv

    if device.type == 'cuda':
        # Input already on a GPU: run everything there, in this process.
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFdmmv(X1=X1,
                         X2=X2,
                         v=v,
                         w=w,
                         out=out,
                         kernel=kernel,
                         max_mem=single_gpu_info.usable_ram)
        _call_direct(target, (args, device.index))
    else:
        # CPU input: partition the N rows of X1 (and of w) among the GPUs.
        block_sizes = calc_gpu_block_sizes(gpu_info, N)
        wrlk = []  # outputs for each subprocess.
        args = []
        for i, g in enumerate(gpu_info):
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue

            # Each worker accumulates into a dedicated M x T buffer on its
            # own GPU; the buffers are summed after all workers finish.
            cur_out_gpu = create_same_stride((M, T), X1, X1.dtype,
                                             f'cuda:{gpu_info[i].Id}')  # M x T
            wrlk.append(cur_out_gpu)

            cur_w = None
            # w is split row-wise like X1; v is passed whole to each worker.
            if w is not None:
                cur_w = w.narrow(0, block_sizes[i], bwidth)
            args.append((ArgsFdmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                   X2=X2,
                                   v=v,
                                   w=cur_w,
                                   out=cur_out_gpu,
                                   kernel=kernel,
                                   max_mem=g.usable_ram), g.Id))
        _start_wait_processes(target, args)
        # Sum per-GPU partial results; with several GPUs reduce on the
        # fastest device before copying back into `out`.
        if len(wrlk) > 1:
            # noinspection PyTypeChecker
            fastest_device: int = np.argmax([d.speed for d in gpu_info])
            out.copy_(
                tcd.comm.reduce_add(wrlk,
                                    destination=gpu_info[fastest_device].Id))
        else:
            out.copy_(wrlk[0])
    return out