from typing import Optional

import numpy as np
import torch
import torch.cuda as tcd

# SparseTensor, BaseOptions and the private helpers used below (_setup_opt,
# _check_contiguity, _get_gpu_info, calc_gpu_block_sizes, create_fortran,
# create_C, create_same_stride, ArgsFmmv, ArgsFdmmv, _start_wait_processes,
# _call_direct and the per-kernel worker functions) are assumed to come from
# the surrounding falkon package; their exact import paths are omitted here.


def fmmv_cuda_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: torch.Tensor,
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (out, 'out'))

    N = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_fortran((N, v.size(1)), v.dtype, 'cpu', pin_memory=True)
    out.fill_(0.0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # Create queues
    args = []  # Arguments passed to each subprocess
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        args.append((ArgsFmmv(
            X1=X1.narrow_rows(block_sizes[i], bwidth),
            X2=X2,
            v=v,
            out=out.narrow(0, block_sizes[i], bwidth),
            kernel=kernel,
            max_mem=g.usable_ram), g.Id))
    _start_wait_processes(sparse_fmmv, args)
    return out
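
# For reference, a minimal single-process sketch of the kernel-vector product
# that fmmv_cuda_sparse computes block-by-block, written with dense tensors
# and a hard-coded Gaussian kernel (an assumption; falkon's `kernel` objects
# implement their own tiled CUDA routines):
def _fmmv_reference(X1: torch.Tensor, X2: torch.Tensor, v: torch.Tensor,
                    sigma: float = 1.0, blk: int = 1024) -> torch.Tensor:
    N = X1.size(0)
    out = torch.empty(N, v.size(1), dtype=v.dtype)
    for start in range(0, N, blk):
        rows = X1[start:start + blk]
        # Gaussian kernel block: exp(-||x - y||^2 / (2 * sigma^2))
        K = torch.exp(-0.5 * torch.cdist(rows, X2).pow(2) / sigma ** 2)
        out[start:start + blk] = K @ v  # one row-block of K(X1, X2) @ v
    return out
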

def fdmmv_cuda_sparse(X1: SparseTensor,
                      X2: SparseTensor,
                      v: Optional[torch.Tensor],
                      w: Optional[torch.Tensor],
                      kernel,
                      out: Optional[torch.Tensor] = None,
                      opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (w, 'w'), (out, 'out'))
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    gpu_info = _get_gpu_info(opt, slack=0.95)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    if out is None:
        out = create_C((M, T), X1.dtype, 'cpu', pin_memory=True)

    wrlk = []  # Outputs for each subprocess
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue

        cur_out_gpu = create_C((M, T), X1.dtype, f'cuda:{gpu_info[i].Id}')  # M x T
        wrlk.append(cur_out_gpu)

        cur_w = None
        if w is not None:
            cur_w = w.narrow(0, block_sizes[i], bwidth)
        args.append((ArgsFdmmv(
            X1=X1.narrow_rows(block_sizes[i], bwidth),
            X2=X2,
            v=v,
            w=cur_w,
            out=cur_out_gpu,
            kernel=kernel,
            max_mem=g.usable_ram), g.Id))
    _start_wait_processes(sparse_fdmmv, args)

    if len(wrlk) > 1:
        # Sum the partial per-GPU results on the fastest device.
        # noinspection PyTypeChecker
        fastest_device: int = np.argmax([d.speed for d in gpu_info])
        out.copy_(tcd.comm.reduce_add(wrlk, destination=gpu_info[fastest_device].Id))
    else:
        out.copy_(wrlk[0])
    return out
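
# For reference, what fdmmv computes, ignoring blocking and devices: with
# K = fnc(X1*X2', X1, X2) of shape N x M, the result is K' @ (K @ v + w).
# A dense sketch with an assumed Gaussian kernel:
def _fdmmv_reference(X1: torch.Tensor, X2: torch.Tensor,
                     v: Optional[torch.Tensor], w: Optional[torch.Tensor],
                     sigma: float = 1.0) -> torch.Tensor:
    K = torch.exp(-0.5 * torch.cdist(X1, X2).pow(2) / sigma ** 2)  # N x M
    if v is None:
        tmp = w           # N x T
    elif w is None:
        tmp = K @ v       # N x T
    else:
        tmp = K @ v + w   # N x T
    return K.T @ tmp      # M x T
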

def fmmv_cuda(X1: torch.Tensor,
              X2: torch.Tensor,
              v: torch.Tensor,
              kernel,
              out: Optional[torch.Tensor] = None,
              opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T

    performs fnc(X1*X2', X1, X2) * v : N x T
    in blocks on multiple GPUs
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (out, 'out'))
    device = X1.device

    N = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_same_stride((N, v.size(1)), X1, v.dtype, device=device,
                                 pin_memory=device.type != 'cuda')
    out.fill_(0.0)

    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fmmv
    else:
        target = generic_fmmv

    gpu_info = _get_gpu_info(opt, slack=0.9)

    if device.type == 'cuda':
        # Data already lives on a single GPU: run the computation directly on
        # that device, without spawning worker processes.
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFmmv(X1=X1, X2=X2, v=v, out=out, kernel=kernel,
                        max_mem=single_gpu_info.usable_ram)
        _call_direct(target, (args, device.index))
    else:
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i, g in enumerate(gpu_info):
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            args.append((ArgsFmmv(
                X1=X1.narrow(0, block_sizes[i], bwidth),
                X2=X2,
                v=v,
                out=out.narrow(0, block_sizes[i], bwidth),
                kernel=kernel,
                max_mem=g.usable_ram), g.Id))
        # When using torch multiprocessing with spawn/forkserver, any consumer
        # of the queues must be on a different process than the queue producer,
        # since CUDA tensors are passed through the queue. See
        # https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
        # Thus we cannot run the first task on the current process.
        _start_wait_processes(target, args)
    return out
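
# A hypothetical call, assuming falkon's GaussianKernel and at least one
# visible GPU (names and shapes are illustrative only):
#
#   from falkon.kernels import GaussianKernel
#   X1 = torch.randn(10_000, 32)   # N x D
#   X2 = torch.randn(2_000, 32)    # M x D
#   v = torch.randn(2_000, 5)      # M x T
#   out = fmmv_cuda(X1, X2, v, kernel=GaussianKernel(sigma=3.0))  # N x T, on CPU
#
# Moving X1, X2 and v to the same 'cuda:i' device instead takes the direct
# single-GPU path above and returns a GPU tensor.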

def fdmmv_cuda(X1: torch.Tensor,
               X2: torch.Tensor,
               v: Optional[torch.Tensor],
               w: Optional[torch.Tensor],
               kernel,
               out: Optional[torch.Tensor] = None,
               opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T
    w  : N x T

    performs fnc(X1*X2', X1, X2)' * (fnc(X1*X2', X1, X2) * v + w) : M x T
    in blocks on multiple GPUs

    Assumes all inputs have the same data type.
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (w, 'w'), (out, 'out'))
    device = X1.device
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    if out is None:
        out = create_same_stride((M, T), X1, X1.dtype, device=device,
                                 pin_memory=device.type != 'cuda')

    gpu_info = _get_gpu_info(opt, slack=0.9)
    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fdmmv
    else:
        target = generic_fdmmv

    if device.type == 'cuda':
        # Data already lives on a single GPU: run directly on that device.
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFdmmv(X1=X1, X2=X2, v=v, w=w, out=out, kernel=kernel,
                         max_mem=single_gpu_info.usable_ram)
        _call_direct(target, (args, device.index))
    else:
        block_sizes = calc_gpu_block_sizes(gpu_info, N)
        wrlk = []  # Outputs for each subprocess
        args = []
        for i, g in enumerate(gpu_info):
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue

            cur_out_gpu = create_same_stride((M, T), X1, X1.dtype,
                                             f'cuda:{gpu_info[i].Id}')  # M x T
            wrlk.append(cur_out_gpu)

            cur_w = None
            if w is not None:
                cur_w = w.narrow(0, block_sizes[i], bwidth)
            args.append((ArgsFdmmv(
                X1=X1.narrow(0, block_sizes[i], bwidth),
                X2=X2,
                v=v,
                w=cur_w,
                out=cur_out_gpu,
                kernel=kernel,
                max_mem=g.usable_ram), g.Id))
        # When using torch multiprocessing with spawn/forkserver, any consumer
        # of the queues must be on a different process than the queue producer,
        # since CUDA tensors are passed through the queue. See
        # https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
        # Thus we cannot run the first task on the current process.
        _start_wait_processes(target, args)

        if len(wrlk) > 1:
            # Sum the partial per-GPU results on the fastest device.
            # noinspection PyTypeChecker
            fastest_device: int = np.argmax([d.speed for d in gpu_info])
            out.copy_(tcd.comm.reduce_add(wrlk, destination=gpu_info[fastest_device].Id))
        else:
            out.copy_(wrlk[0])
    return out
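
# The multi-GPU branch above leaves one partial M x T result per device and
# sums them with torch.cuda.comm.reduce_add. A standalone illustration of
# that primitive (assumes at least one GPU; with a single GPU the reduction
# is skipped, mirroring the len(wrlk) == 1 case above):
def _reduce_add_demo() -> torch.Tensor:
    n = torch.cuda.device_count()
    parts = [torch.ones(4, 2, device=f'cuda:{i}') for i in range(n)]
    if n == 1:
        return parts[0]
    # Element-wise sum of all partial tensors, materialized on cuda:0
    return tcd.comm.reduce_add(parts, destination=0)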