def select_indices(self,
                   X: _tensor_type,
                   Y: Optional[torch.Tensor]) -> _opt_tns_idx_tup:
    """Select M observations from 2D tensor `X`, preserving device and memory order.

    The selection strategy is uniformly at random. To control the randomness,
    pass an appropriate numpy random generator to this class's constructor.

    This method behaves the same as :meth:`select` but additionally returns a
    `LongTensor` containing the indices of the chosen points.

    Parameters
    ----------
    X
        N x D tensor containing the whole input dataset. If N is lower than the
        number of centers this class is programmed to pick, a warning will be
        raised and only N centers will be returned.
    Y
        Optional N x T tensor containing the input targets. If `Y` is provided,
        the same observations selected for `X` will also be selected from `Y`.
        Certain models (such as :class:`falkon.models.LogisticFalkon`) require
        centers to be extracted from both predictors and targets, while others
        (such as :class:`falkon.models.Falkon`) only require the centers from
        the predictors.

    Returns
    -------
    (X_M, indices)
        The randomly selected centers and the corresponding indices.
        The centers will be stored in a new, memory-contiguous tensor and all
        characteristics of the input tensor will be preserved. The index
        tensor is placed on the device of `X`.
    (X_M, Y_M, indices)
        If parameter `Y` is not `None` then the entries of `Y` corresponding
        to the selected centers of `X` will also be returned.
    """
    N = X.shape[0]
    num_centers = self.num_centers
    if num_centers > N:
        warnings.warn("Number of centers M greater than the "
                      f"number of data-points. Setting `num_centers` to {N}")
        num_centers = N
    idx = self.random_gen.choice(N, size=num_centers, replace=False)

    # `np.long` is not available in every NumPy release; int64 is the dtype
    # backing a torch LongTensor on all platforms.
    th_idx = torch.from_numpy(idx.astype(np.int64)).to(X.device)

    if isinstance(X, SparseTensor):
        X_sp = X.to_scipy()
        centers = X_sp[idx, :].copy()
        Xc = SparseTensor.from_scipy(centers)
    else:
        Xc = create_same_stride((num_centers, X.shape[1]), other=X, dtype=X.dtype,
                                device=X.device, pin_memory=False)
        torch.index_select(X, dim=0, index=th_idx, out=Xc)

    if Y is not None:
        Yc = create_same_stride((num_centers, Y.shape[1]), other=Y, dtype=Y.dtype,
                                device=Y.device, pin_memory=False)
        # Use a separate index tensor for Y so that the returned indices stay on
        # X's device whether or not targets were supplied.
        y_idx = th_idx.to(Y.device)
        torch.index_select(Y, dim=0, index=y_idx, out=Yc)
        return Xc, Yc, th_idx
    return Xc, th_idx
def fdmmv_cpu_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: Optional[torch.Tensor],
                     w: Optional[torch.Tensor],
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None):
    """Blocked CPU computation of ``K.T @ (K @ v + w)`` for sparse inputs.

    ``K`` is the kernel matrix between the rows of `X1` and the rows of `X2`,
    evaluated one row-block of `X1` at a time through the kernel's
    `_prepare_sparse` / `_apply_sparse` / `_finalize` hooks.

    Parameters
    ----------
    X1, X2
        Sparse data matrices (N x D and M x D).
    v
        Optional M x T dense tensor. When `None` the ``K @ v`` term is skipped.
    w
        Optional N x T dense tensor. When `None` it is treated as zero.
    kernel
        Kernel object exposing the sparse hooks listed above.
    out
        Optional M x T output tensor. It is zeroed before accumulation.
    opt
        Options forwarded to `_setup_opt`; used to determine available RAM.

    Returns
    -------
    The M x T result tensor (`out` if it was provided).

    Raises
    ------
    ValueError
        If both `v` and `w` are `None`.
    MemoryError
        If the memory budget does not allow even a single-row block.
    """
    opt = _setup_opt(opt, is_cpu=True)

    # Parameter validation
    if v is None and w is None:
        raise ValueError("One of v and w must be specified to run fMMV.")

    T = v.size(1) if v is not None else w.size(1)
    ntot, _ = X1.size()
    M = X2.size(0)
    dtype = X1.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(M, T, dtype=dtype)
    out.fill_(0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Narrow X1 : n
    # ker_chunk : n*M
    # w_blk     : n*T
    # NOTE(review): the itemised needs above sum to n * (M + T + 1) while the
    # estimate below divides by (M * T + 1) - confirm which one is intended.
    n = avail_mem / (M * T + 1)
    n = int(math.floor(n))
    if n < 1:
        raise MemoryError(("Available memory %.2fGB is insufficient "
                           "for blockwise fdMMv.") % (avail_mem * sizeof_dtype(dtype) / 2**30))
    # Never allocate work buffers with more rows than X1 actually has; keep the
    # step at least 1 so that `range` stays valid for an empty X1.
    n = max(1, min(n, ntot))

    # Allocate fixed arrays
    ker_chunk = create_same_stride((n, M), out, dtype, device='cpu')
    w_blk = create_same_stride((n, T), out, dtype, device='cpu')

    # Run blocked fdmmv
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)
        X1_chunk = X1.narrow_rows(i, ic)
        cur_ker_chunk = ker_chunk[:ic]
        cur_ker_chunk.fill_(0.0)
        ddd = kernel._prepare_sparse(X1_chunk, X2)
        kernel._apply_sparse(X1_chunk, X2.transpose_csc(), cur_ker_chunk)
        kernel._finalize(cur_ker_chunk, ddd)

        # Multiply by the vector v
        cur_w_blk = w_blk[:ic]  # n x T
        cur_w_blk.fill_(0.0)
        if w is not None:
            cur_w_blk.copy_(w[i:i + ic, :])
        if v is not None:
            # w_blk + c_out * v => (n x T) + (n x M)*(M x T)
            cur_w_blk.addmm_(cur_ker_chunk, v)

        # Accumulate K_blk.T @ (K_blk @ v + w_blk) into the M x T output
        out.addmm_(cur_ker_chunk.T, cur_w_blk)

    del ker_chunk, w_blk
    return out
def select(self,
           X: _tensor_type,
           Y: Union[torch.Tensor, None],
           M: int) -> Union[_tensor_type, Tuple[_tensor_type, torch.Tensor]]:
    """Select M observations from 2D tensor `X`, preserving device and memory order.

    The selection strategy is uniformly at random. To control the randomness,
    pass an appropriate numpy random generator to this class's constructor.

    Parameters
    ----------
    X
        N x D tensor containing the whole input dataset. N should be at least
        as large as `M` (see `M` below for what happens otherwise).
    Y
        Optional N x T tensor containing the input targets. If `Y` is provided,
        the same observations selected for `X` will also be selected from `Y`.
        Certain models (such as :class:`falkon.models.LogisticFalkon`) require
        centers to be extracted from both predictors and targets, while others
        (such as :class:`falkon.models.Falkon`) only require the centers from
        the predictors.
    M
        The number of observations to choose. M <= N, otherwise M is forcibly
        set to N with a warning.

    Returns
    -------
    X_M
        The randomly selected centers. They will be in a new, memory-contiguous
        tensor. All characteristics of the input tensor will be preserved.
    (X_M, Y_M)
        If `Y` was different than `None` then the entries of `Y` corresponding
        to the selected centers of `X` will also be returned.
    """
    N = X.shape[0]
    if M > N:
        warnings.warn("Number of centers M greater than the "
                      "number of data-points. Setting M to %d" % (N))
        M = N
    idx = self.random_gen.choice(N, size=M, replace=False)
    # `np.long` is not available in every NumPy release; int64 is the dtype
    # backing a torch LongTensor on all platforms.
    idx64 = idx.astype(np.int64)

    if isinstance(X, SparseTensor):
        X = X.to_scipy()
        centers = X[idx, :].copy()
        Xc = SparseTensor.from_scipy(centers)
    else:
        Xc = create_same_stride((M, X.shape[1]), other=X, dtype=X.dtype,
                                device=X.device, pin_memory=False)
        th_idx = torch.from_numpy(idx64).to(X.device)
        torch.index_select(X, dim=0, index=th_idx, out=Xc)

    if Y is not None:
        Yc = create_same_stride((M, Y.shape[1]), other=Y, dtype=Y.dtype,
                                device=Y.device, pin_memory=False)
        th_idx = torch.from_numpy(idx64).to(Y.device)
        torch.index_select(Y, dim=0, index=th_idx, out=Yc)
        return Xc, Yc
    return Xc
def generic_fmmv(proc_idx, queue, device_id):
    """Single-GPU worker: write ``kernel(X1, X2) @ v`` into `out`, block by block.

    The job description (an `ArgsFmmv`) is taken from `queue`. The kernel
    matrix is built for `n` rows of `X1` at a time, accumulating over
    feature-blocks of width `d`, multiplied by `v` on the device and the
    resulting rows are copied into `out`.

    Parameters
    ----------
    proc_idx
        Not used by this function.
    queue
        Object whose ``get()`` returns the `ArgsFmmv` for this worker.
    device_id
        Index of the CUDA device to run on.

    Returns
    -------
    The `out` tensor carried by the arguments.
    """
    args: ArgsFmmv = queue.get()
    X1, X2 = args.X1, args.X2
    v, out = args.v, args.out
    kernel = args.kernel
    max_mem = args.max_mem

    dtype = X1.dtype
    ntot, dtot = X1.size()
    M, T = v.size()

    # GPU memory budget, counted in elements of `dtype`:
    #   ker_gpu : n*M      v_gpu   : M*T
    #   X1s_gpu : n*d      X2s_gpu : M*d
    #   mmv_gpu : n*T
    #   total   : n*d + n*(M+T) + d*M + M*T
    avail_mem = max_mem / sizeof_dtype(dtype)
    n, d = select_dim_over_d(
        maxD=dtot, maxN=ntot,
        coef_nd=1, coef_n=M + T, coef_d=M, rest=M * T,
        tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Device-side work buffers, allocated once and re-used for every block
        ker_gpu = torch.empty(n, M, dtype=dtype, device=ddev)
        v_gpu = v.to(device=ddev)  # M x T
        X1s_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)

        for row0 in range(0, ntot, n):
            rows = min(n, ntot - row0)

            prepared = kernel._prepare(X1.narrow(0, row0, rows), X2)

            blk_ker = ker_gpu[:rows]
            blk_ker.fill_(0.0)

            # Accumulate the kernel block over slices of the feature dimension
            for col0 in range(0, dtot, d):
                cols = min(d, dtot - col0)
                blk_X1 = copy_to_device_noorder(rows, cols, X1, row0, col0, X1s_gpu, 0, 0)
                blk_X2 = copy_to_device_noorder(M, cols, X2, 0, col0, X2s_gpu, 0, 0)
                kernel._apply(blk_X1, blk_X2.T, blk_ker)
            kernel._finalize(blk_ker, prepared)

            # Multiply by the vector v
            blk_mmv = mmv_gpu.narrow(0, 0, rows)
            torch.mm(blk_ker, v_gpu, out=blk_mmv)  # rows x T

            # Copy back to host
            copy_to_host_noorder(rows, T, blk_mmv, 0, 0, out, row0, 0)
    return out
def test_zero(self, mat, upper, preserve_diag, order, device):
    """Check that `mul_triang` with a zero multiplier clears the chosen triangle.

    When `preserve_diag` is set, the result is additionally compared against
    what `zero_triang` produces from the same starting matrix.
    """
    inpt1 = fix_mat(mat, dtype=mat.dtype, order=order, copy=True, numpy=True)
    inpt2 = inpt1.copy(order="K")

    k = 1 if preserve_diag else 0
    if upper:
        tri_fn = partial(np.triu, k=k)
    else:
        tri_fn = partial(np.tril, k=-k)

    inpt1 = torch.from_numpy(inpt1)
    inpt1_dev = create_same_stride(inpt1.shape, inpt1, inpt1.dtype, device)
    inpt1_dev.copy_(inpt1)
    mul_triang(inpt1_dev, upper=upper, preserve_diag=preserve_diag, multiplier=0)
    # Take an independent snapshot. On a CPU device `.cpu()` hands back the same
    # tensor and `.numpy()` shares its storage, so without the copy the buffer
    # re-use below would overwrite `inpt1` and the final comparison would
    # compare an array with itself.
    inpt1 = inpt1_dev.cpu().numpy().copy()

    assert np.sum(tri_fn(inpt1)) == 0

    if preserve_diag:
        # Re-use the device buffer: reload the original data and zero it with
        # the dedicated routine, then compare with the mul_triang result.
        inpt2_dev = inpt1_dev
        inpt2_dev.copy_(torch.from_numpy(inpt2))
        zero_triang(inpt2_dev, upper=upper)
        inpt2 = inpt2_dev.cpu().numpy()
        np.testing.assert_allclose(inpt1, inpt2)
def test_with_x0(self, mat, vec_rhs, conjgrad, order, device):
    """Solve with an explicit zero starting point and check it is updated in place.

    Besides comparing against `np.linalg.solve`, the test verifies that the
    returned tensor re-uses the memory of the supplied initial solution and
    keeps device, shape and strides.
    """
    if order == "F":
        # Rebuild both operands with Fortran memory layout
        mat, vec_rhs = (
            torch.from_numpy(np.asfortranarray(t.numpy()))
            for t in (mat, vec_rhs)
        )
    mat = move_tensor(mat, device)
    vec_rhs = move_tensor(vec_rhs, device)

    x0 = create_same_stride(vec_rhs.size(), vec_rhs, vec_rhs.dtype, device)
    x0.fill_(0.0)

    def matvec(x_):
        return mat @ x_

    solution = conjgrad.solve(X0=x0, B=vec_rhs, mmv=matvec, max_iter=10, callback=None)

    assert solution.data_ptr() == x0.data_ptr(), "Initial solution vector was copied"
    assert str(solution.device) == device, "Device has changed unexpectedly"
    assert solution.shape == (self.t, vec_rhs.shape[1]), "Output shape is incorrect"
    assert solution.stride() == vec_rhs.stride(), "Stride has changed unexpectedly"

    reference = np.linalg.solve(mat.cpu().numpy(), vec_rhs.cpu().numpy())
    np.testing.assert_allclose(reference, solution.cpu().numpy(), rtol=1e-6)
def incore_fmmv(mat: torch.Tensor,
                vec: torch.Tensor,
                out: Optional[torch.Tensor] = None,
                transpose: bool = False,
                opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    """Compute ``mat @ vec`` (or ``mat.T @ vec``) entirely on the inputs' device.

    Parameters
    ----------
    mat
        Left-hand matrix.
    vec
        Right-hand matrix; its second dimension gives the output width.
    out
        Optional output tensor. When omitted a tensor with the memory layout
        of `mat` is allocated on `mat`'s device.
    transpose
        When true, `mat` is transposed before the product.
    opt
        Not used by this function.

    Returns
    -------
    The tensor holding the product (`out` when it was supplied).

    Raises
    ------
    TypeError
        If `check_same_dtype` rejects the inputs.
    RuntimeError
        If `check_same_device` rejects the inputs.
    """
    if not check_same_dtype(mat, vec, out):
        raise TypeError("Data types of input matrices must be equal.")
    if not check_same_device(mat, vec, out):
        raise RuntimeError("All input arguments to incore_fmmv must be on the same device")

    if out is None:
        n_rows = mat.shape[1] if transpose else mat.shape[0]
        out = create_same_stride((n_rows, vec.shape[1]), mat, mat.dtype,
                                 device=mat.device, pin_memory=False)
        out.fill_(0.0)

    lhs = mat.T if transpose else mat

    if not mat.is_cuda:
        # beta=0 makes addmm_ overwrite `out` with the product
        out.addmm_(lhs, vec, beta=0.0)
        return out

    # CUDA path: run the product on a dedicated stream and wait for it
    s1 = torch.cuda.Stream()
    with torch.cuda.stream(s1):
        out.addmm_(lhs, vec, beta=0.0)
        s1.synchronize()
    return out
def test_mul(self, mat, upper, preserve_diag, order, device):
    """Check that `mul_triang` scales only the requested triangle.

    After multiplying by 10**6 the selected triangle must have a large mean
    while the complementary part must keep a mean below 1.
    """
    host_np = fix_mat(mat, dtype=mat.dtype, order=order, copy=True, numpy=True)

    # Offset of the first diagonal that belongs to the scaled triangle
    k = 1 if preserve_diag else 0
    if upper:
        def scaled_part(arr):
            return np.triu(arr, k=k)

        def untouched_part(arr):
            return np.tril(arr, k=k - 1)
    else:
        def scaled_part(arr):
            return np.tril(arr, k=-k)

        def untouched_part(arr):
            return np.triu(arr, k=-k + 1)

    host_t = torch.from_numpy(host_np)
    dev_t = create_same_stride(host_t.shape, host_t, host_t.dtype, device)
    dev_t.copy_(host_t)

    mul_triang(dev_t, upper=upper, preserve_diag=preserve_diag, multiplier=10**6)
    result = dev_t.cpu().numpy()

    assert np.mean(scaled_part(result)) > 10**5
    assert np.mean(untouched_part(result)) < 1
def init(self, X: Union[torch.Tensor, SparseTensor]):
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner can be used.
    It evaluates the kernel between the centers, stores the result in
    `self.fC` and then overwrites it in place through two Cholesky
    factorizations. The diagonals read after each factorization are kept in
    `self.dT` and `self.dA`.

    Parameters
    ----------
    X : MxD tensor
        The matrix of Nystroem centers
    """
    dtype = X.dtype
    eps = self.params.pc_epsilon(X.dtype)

    M = X.size(0)

    with TicToc("Kernel", debug=self.params.debug):
        if isinstance(X, torch.Tensor):
            C = create_same_stride((M, M), X, dtype=dtype, device='cpu',
                                   pin_memory=self._use_cuda)
        else:  # If sparse tensor we need fortran for kernel calculation
            C = create_fortran((M, M), dtype=dtype, device='cpu', pin_memory=self._use_cuda)
        self.kernel(X, X, out=C, opt=self.params)
    # `self.fC` is a numpy array obtained from `C`; if `C` is not
    # Fortran-contiguous its transpose is used instead.
    self.fC = C.numpy()
    if not is_f_contig(C):
        self.fC = self.fC.T

    with TicToc("Cholesky 1", debug=self.params.debug):
        # Compute T: lower(fC) = T.T
        inplace_add_diag(self.fC, eps * M)
        self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                use_cuda=self._use_cuda, opt=self.params)
        # Save the diagonal which will be overwritten when computing A
        # NOTE(review): this reads through `C`, which presumably still shares
        # memory with `self.fC` after potrf_wrapper - confirm.
        self.dT = C.diag()

    with TicToc("Copy triangular", debug=self.params.debug):
        # Copy lower(fC) to upper(fC):  upper(fC) = T.
        copy_triang(self.fC, upper=False)

    if self._use_cuda:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=True, use_cuda=self._use_cuda,
                                    opt=self.params)
    else:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=False, use_cuda=self._use_cuda,
                                    opt=self.params)

    with TicToc("Cholesky 2", debug=self.params.debug):
        # lower(fC) = 1/M * T@T.T
        self.fC = mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / M)
        # lower(fC) = 1/M * T@T.T + lambda * I
        inplace_add_diag(self.fC, self._lambda)
        # Cholesky on lower(fC) : lower(fC) = A.T
        self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                use_cuda=self._use_cuda, opt=self.params)
        # Diagonal after the second factorization
        self.dA = C.diag()
def solve(self, X0, B, mmv, max_iter, callback=None):
    """Run conjugate-gradient iterations for the system ``mmv(X) = B``.

    Every column of `B` is treated as a separate right-hand side: step sizes
    are computed per column from column-wise sums.

    Parameters
    ----------
    X0
        Starting point, updated in place and returned. When `None` a zero
        tensor with the layout of `B` is allocated.
    B
        Right-hand side tensor.
    mmv
        Callable applying the system operator to a tensor shaped like `B`.
    max_iter
        Maximum number of iterations.
    callback
        Optional callable invoked after each completed iteration as
        ``callback(iteration, X, elapsed_seconds)``. It is not called for the
        iteration at which convergence is detected.

    Returns
    -------
    The solution tensor `X`.
    """
    tic = time.time()

    if X0 is not None:
        residual = B - mmv(X0)
        X = X0
    else:
        residual = copy_same_stride(B)
        X = create_same_stride(B.size(), B, B.dtype, B.device)
        X.fill_(0.0)

    m_eps = self.params.cg_epsilon(X.dtype)

    direction = residual
    # noinspection PyArgumentList
    rs_old = residual.pow(2).sum(dim=0)

    e_train = time.time() - tic

    for step in range(1, max_iter + 1):
        with TicToc("Chol Iter", debug=False):  # TODO: FIXME
            tic = time.time()
            A_dir = mmv(direction)
            # noinspection PyArgumentList
            alpha = rs_old / (torch.sum(direction * A_dir, dim=0) + m_eps)
            X.addmm_(direction, torch.diag(alpha))

            if step % self.params.cg_full_gradient_every != 0:
                # Cheap recursive update of the residual
                residual = residual - torch.mm(A_dir, torch.diag(alpha))
            else:
                if X.is_cuda:
                    # addmm_ may not be finished yet causing mmv to get stale inputs.
                    torch.cuda.synchronize()
                # Periodically recompute the residual from scratch
                residual = B - mmv(X)

            # noinspection PyArgumentList
            rs_new = residual.pow(2).sum(dim=0)
            if rs_new.abs().max().sqrt() < self.params.cg_tolerance:
                print("Stopping conjugate gradient descent at "
                      "iteration %d. Solution has converged." % step)
                break

            beta = rs_new / (rs_old + m_eps)
            direction = residual + torch.mm(direction, torch.diag(beta))
            if direction.is_cuda:
                # P must be synced so that it's correct for mmv in next iter.
                torch.cuda.synchronize()
            rs_old = rs_new

            e_train += time.time() - tic

        with TicToc("Chol callback", debug=False):
            if callback is not None:
                callback(step, X, e_train)

    return X
def fmm_cuda(X1: torch.Tensor,
             X2: torch.Tensor,
             kernel: 'falkon.kernels.Kernel',
             out: Optional[torch.Tensor] = None,
             opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    performs fnc(X1*X2', X1, X2) in blocks on multiple GPUs

    When `X1` already lives on a CUDA device the work is run directly on that
    device; otherwise the rows of `X1` are partitioned between the GPUs
    reported by `_get_gpu_info` and one worker is started per non-empty
    partition.

    Returns the N x M output tensor (`out` when provided, otherwise a newly
    allocated tensor on `X1`'s device).
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (out, 'out'))

    n_rows = X1.shape[0]
    n_cols = X2.shape[0]
    device = X1.device
    if out is None:
        out = create_same_stride((n_rows, n_cols), X1, X1.dtype,
                                 device=device, pin_memory=False)
    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, n_rows)

    # If float32 we need to upcast to float64 to avoid numerical precision errors
    # in the kernel
    upcast = sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel
    gpu_dtype = torch.float64 if upcast else X1.dtype

    if device.type == 'cuda':
        # In-core path: everything stays on the device holding the inputs
        sync_current_stream(device)
        matching = [g for g in gpu_info if g.Id == device.index]
        single_gpu_info = matching[0]
        direct_args = ArgsFmm(X1=X1, X2=X2, out=out, kernel=kernel,
                              gpu_dtype=gpu_dtype,
                              max_mem=single_gpu_info.usable_ram,
                              num_streams=opt.num_fmm_streams)
        _call_direct(_generic_fmm, (direct_args, device.index))
        return out

    # Out-of-core path: create the arguments passed to each subprocess
    worker_args = []
    for i, g in enumerate(gpu_info):
        start = block_sizes[i]
        bwidth = block_sizes[i + 1] - start
        if bwidth <= 0:
            continue
        job = ArgsFmm(X1=X1.narrow(0, start, bwidth),
                      X2=X2,
                      out=out.narrow(0, start, bwidth),
                      kernel=kernel,
                      gpu_dtype=gpu_dtype,
                      max_mem=g.usable_ram,
                      num_streams=opt.num_fmm_streams)
        worker_args.append((job, g.Id))
    _start_wait_processes(_generic_fmm, worker_args)
    return out
def fmmv_cuda(X1: torch.Tensor,
              X2: torch.Tensor,
              v: torch.Tensor,
              kernel,
              out: Optional[torch.Tensor] = None,
              opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T

    performs  fnc(X1*X2', X1, X2) * v   : N x T
    in blocks on multiple GPUs

    If `X1` is on a CUDA device the computation is dispatched directly on
    that device; otherwise the rows of `X1` are split between the available
    GPUs. Returns the (zero-initialised, then filled) output tensor.
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (out, 'out'))
    device = X1.device
    in_core = device.type == 'cuda'

    n_rows = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_same_stride((n_rows, v.size(1)), X1, v.dtype, device=device,
                                 pin_memory=device.type != 'cuda')
    out.fill_(0.0)

    # Pick the worker routine according to the kernel
    is_gaussian_dist = kernel.kernel_type == "l2distance" and kernel.name == "gaussian"
    target = distk_fmmv if is_gaussian_dist else generic_fmmv

    gpu_info = _get_gpu_info(opt, slack=0.9)

    if in_core:
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        direct_args = ArgsFmmv(X1=X1, X2=X2, v=v, out=out, kernel=kernel,
                               max_mem=single_gpu_info.usable_ram)
        _call_direct(target, (direct_args, device.index))
        return out

    block_sizes = calc_gpu_block_sizes(gpu_info, n_rows)
    # Arguments passed to each subprocess
    worker_args = []
    for i, g in enumerate(gpu_info):
        start = block_sizes[i]
        bwidth = block_sizes[i + 1] - start
        if bwidth <= 0:
            continue
        job = ArgsFmmv(X1=X1.narrow(0, start, bwidth),
                       X2=X2,
                       v=v,
                       out=out.narrow(0, start, bwidth),
                       kernel=kernel,
                       max_mem=g.usable_ram)
        worker_args.append((job, g.Id))
    _start_wait_processes(target, worker_args)
    return out
def fmmv_cuda(X1: torch.Tensor,
              X2: torch.Tensor,
              v: torch.Tensor,
              kernel,
              out: Optional[torch.Tensor] = None,
              opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T

    performs  fnc(X1*X2', X1, X2) * v   : N x T
    in blocks on multiple GPUs

    The rows of `X1` (and of the output) are partitioned between the GPUs
    reported by `_get_gpu_info`; one worker is started for every non-empty
    partition. When `out` is not given it is allocated in pinned CPU memory.
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (out, 'out'))

    n_rows = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_same_stride((n_rows, v.size(1)), X1, v.dtype, 'cpu', pin_memory=True)
    out.fill_(0.0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, n_rows)

    # Arguments passed to each subprocess
    worker_args = []
    for i, g in enumerate(gpu_info):
        start = block_sizes[i]
        bwidth = block_sizes[i + 1] - start
        if bwidth <= 0:
            continue
        job = ArgsFmmv(X1=X1.narrow(0, start, bwidth),
                       X2=X2,
                       v=v,
                       out=out.narrow(0, start, bwidth),
                       kernel=kernel,
                       max_mem=g.usable_ram)
        worker_args.append((job, g.Id))

    # If using torch multiprocessing with spawn/forkserver here we must make sure
    # that any consumer of the queues is on a different process than the queue producer.
    # This is due to passing in a CUDA tensor to the queue
    # https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
    # Thus we cannot run the first task on the current process.
    is_gaussian_dist = kernel.kernel_type == "l2distance" and kernel.name == "gaussian"
    target = distk_fmmv if is_gaussian_dist else generic_fmmv
    _start_wait_processes(target, worker_args)

    return out
def solve(self, X0, B, mmv, max_iter, callback=None):
    """Run conjugate-gradient iterations for the system ``mmv(X) = B``.

    Columns of `B` are handled as independent right-hand sides (all
    reductions are taken along dimension 0).

    Parameters
    ----------
    X0
        Initial solution; modified in place and returned. If `None`, a zero
        tensor with the same layout as `B` is created.
    B
        Right-hand side tensor.
    mmv
        Callable applying the system operator.
    max_iter
        Upper bound on the number of iterations.
    callback
        Optional callable invoked as ``callback(iteration, X, elapsed)`` after
        every iteration that did not trigger the convergence stop.

    Returns
    -------
    The solution tensor `X`.
    """
    def column_sq_norms(t):
        # Sum of squares of every column
        return torch.sum(t.pow(2), dim=0)

    t_start = time.time()

    if X0 is None:
        X = create_same_stride(B.size(), B, B.dtype, B.device)
        X.fill_(0.0)
        resid = copy_same_stride(B)
    else:
        X = X0
        resid = B - mmv(X0)

    m_eps = self.params.cg_epsilon(X.dtype)

    search = resid
    old_sq = column_sq_norms(resid)

    e_train = time.time() - t_start

    for i in range(max_iter):
        iteration = i + 1
        with TicToc("Chol Iter", debug=False):
            t_start = time.time()
            A_search = mmv(search)
            alpha = old_sq / (torch.sum(search * A_search, dim=0) + m_eps)
            X.addmm_(search, torch.diag(alpha))

            full_refresh = iteration % self.params.cg_full_gradient_every == 0
            if full_refresh:
                # Recompute the residual from the current solution
                resid = B - mmv(X)
            else:
                resid = resid - torch.mm(A_search, torch.diag(alpha))

            new_sq = column_sq_norms(resid)
            if new_sq.abs().max().sqrt() < self.params.cg_tolerance:
                print("Stopping conjugate gradient descent at "
                      "iteration %d. Solution has converged." % (iteration))
                break

            search = resid + torch.mm(search, torch.diag(new_sq / (old_sq + m_eps)))
            old_sq = new_sq

            e_iter = time.time() - t_start
            e_train += e_iter

        with TicToc("Chol callback", debug=False):
            if callback is not None:
                callback(iteration, X, e_train)

    return X
def fmmv_cpu_sparse(X1: SparseTensor,
                    X2: SparseTensor,
                    v: torch.Tensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor],
                    opt: BaseOptions):
    """Blocked CPU computation of ``kernel(X1, X2) @ v`` for sparse inputs.

    The kernel matrix is evaluated one (n x m) tile at a time, where the tile
    size is chosen by `select_dim_over_m` from the available RAM, and each
    tile's contribution is accumulated into the matching rows of `out`.

    Parameters
    ----------
    X1, X2
        Sparse data matrices (N x D and M x D).
    v
        Dense M x T tensor.
    kernel
        Kernel exposing `_prepare_sparse`, `_apply_sparse` and `_finalize`.
    out
        Optional N x T output tensor; it is zeroed before accumulation.
    opt
        Options forwarded to `_setup_opt`.

    Returns
    -------
    The N x T result tensor.
    """
    opt = _setup_opt(opt, is_cpu=True)

    dtype = X1.dtype
    ntot, dtot = X1.size()
    mtot, T = v.size()

    # Create output matrix
    if out is None:
        out = torch.empty(ntot, T, dtype=dtype)
    out.fill_(0.0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Narrowing X1, X2: n + m
    # Prepare - not computable, depends on kernel
    # ker_chunk : n*m
    # finalize : 0 (if can be implemented in place, kernel-dependent)
    n, m = select_dim_over_m(
        maxM=mtot, maxN=ntot,
        coef_nm=1, coef_n=1, coef_m=1, tot=avail_mem)

    tile_buf = create_same_stride((n, m), out, dtype, device='cpu')
    for row0 in range(0, ntot, n):
        rows = min(n, ntot - row0)
        out_rows = out[row0:row0 + rows, :]
        X1_rows = X1.narrow_rows(row0, rows)

        for col0 in range(0, mtot, m):
            cols = min(m, mtot - col0)
            X2_rows = X2.narrow_rows(col0, cols)

            tile = tile_buf[:rows, :cols]
            tile.fill_(0.0)

            prepared = kernel._prepare_sparse(X1_rows, X2_rows)
            kernel._apply_sparse(X1_rows, X2_rows.transpose_csc(), tile)
            kernel._finalize(tile, prepared)

            # Multiply by the vector v
            out_rows.addmm_(tile, v.narrow(0, col0, cols))
    return out
def fmm_cuda(X1: torch.Tensor,
             X2: torch.Tensor,
             kernel: 'falkon.kernels.Kernel',
             out: Optional[torch.Tensor] = None,
             opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    performs fnc(X1*X2', X1, X2) in blocks on multiple GPUs

    The rows of `X1` (and of the output) are partitioned between the GPUs
    reported by `_get_gpu_info`; one `_generic_fmm` worker is started for
    every non-empty partition. When `out` is not given it is allocated in
    pinned CPU memory. The CUDA cache is emptied before returning.
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (out, 'out'))

    n_rows = X1.size(0)
    n_cols = X2.size(0)
    if out is None:
        out = create_same_stride((n_rows, n_cols), X1, X1.dtype, 'cpu', pin_memory=True)
    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, n_rows)

    # If float32 we need to upcast to float64 to avoid numerical precision errors
    # in the kernel
    upcast = sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel
    gpu_dtype = torch.float64 if upcast else X1.dtype

    # Create the arguments passed to each subprocess
    worker_args = []
    for i, g in enumerate(gpu_info):
        start = block_sizes[i]
        bwidth = block_sizes[i + 1] - start
        if bwidth <= 0:
            continue
        job = ArgsFmm(X1=X1.narrow(0, start, bwidth),
                      X2=X2,
                      out=out.narrow(0, start, bwidth),
                      kernel=kernel,
                      gpu_dtype=gpu_dtype,
                      max_mem=g.usable_ram)
        worker_args.append((job, g.Id))
    _start_wait_processes(_generic_fmm, worker_args)
    torch.cuda.empty_cache()
    return out
def _generic_fmm(proc_idx, queue, device_id):
    """Single-GPU worker that fills `out` with kernel values, tile by tile.

    The job description (an `ArgsFmm`) is read from `queue`. The output is
    covered by (n x m) tiles; each tile is produced by the kernel's
    `_prepare` / `_apply` / `_finalize` hooks on the CUDA device `device_id`.
    Inner-loop iterations are assigned round-robin to `num_streams` CUDA
    streams, each with its own set of buffers.

    Three situations are distinguished:

    - inputs already on a CUDA device and no dtype change: tiles are written
      straight into `out`, no buffers are allocated here;
    - inputs on a CUDA device but a different `gpu_dtype`: tiles are computed
      in device buffers and copied (with conversion) into `out`;
    - inputs not on a CUDA device: tiles are computed in device buffers,
      transferred to pinned CPU buffers, and copied into `out` later, the next
      time the same stream is used (or at the very end).

    When `_gpu_tns_same_memory(X1, X2)` is true, tiles with ``j < i`` are not
    computed but copied from the transpose of the mirrored region of `out`.

    `proc_idx` is not used. Returns `out`.
    """
    # Unpack the function arguments
    a: ArgsFmm = queue.get()
    X1: torch.Tensor = a.X1
    X2: torch.Tensor = a.X2
    cuda_inputs = X1.is_cuda
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem
    num_streams = a.num_streams

    # flags and local variables
    change_dtype = gpu_dtype != X1.dtype
    X1_equal_X2 = _gpu_tns_same_memory(X1, X2)
    # Device buffers are needed whenever data must be converted or uploaded
    use_gpu_bufs = change_dtype or not cuda_inputs
    stride = "F" if is_f_contig(out, strict=True) else "C"
    # Counts inner-loop iterations; used to pick the stream round-robin
    j_iter = 0
    dts = sizeof_dtype(gpu_dtype)
    tc_device = torch.device('cuda:%d' % (int(device_id)))
    # Memory budget expressed in number of `gpu_dtype` elements
    avail_mem = max_mem / dts

    # Choose block sizes n, m such that we won't run out of GPU memory
    ntot, d = X1.shape
    mtot = X2.shape[0]
    extra_mem = kernel.extra_mem()
    if cuda_inputs and not change_dtype:
        # No allocation will be performed by us. Only in-kernel stuff.
        n, m = select_dim_over_nm(max_n=ntot, max_m=mtot, d=d,
                                  coef_nd=extra_mem.get('nd', 0),
                                  coef_md=extra_mem.get('md', 0),
                                  coef_nm=extra_mem.get('nm', 0),
                                  coef_n=extra_mem.get('n', 0),
                                  coef_m=extra_mem.get('m', 0),
                                  rest=extra_mem.get('d', 0),
                                  max_mem=avail_mem)
    else:
        # The "+ 1" terms and the `num_streams` factor add this function's own
        # buffers to the kernel's requirements.
        n, m = select_dim_over_nm(
            max_n=ntot, max_m=mtot, d=d,
            coef_nd=num_streams * (extra_mem.get('nd', 0) + 1),
            coef_md=num_streams * (extra_mem.get('md', 0) + 1),
            coef_nm=num_streams * (extra_mem.get('nm', 0) + 1),
            coef_n=extra_mem.get('n', 0),
            coef_m=extra_mem.get('m', 0),
            rest=extra_mem.get('d', 0),
            max_mem=avail_mem)

    # Create streams
    streams = [tcd.Stream(device=tc_device) for _ in range(num_streams)]

    # Create buffers
    if use_gpu_bufs:
        gX1 = create_same_stride((n, d), X1, gpu_dtype, tc_device)
        gX2_list = [
            create_same_stride((m, d), X2, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
        gout_list = [
            create_same_stride((n, m), out, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
    if not cuda_inputs:
        cpu_buf_list = [
            create_same_stride((n, m), out, gpu_dtype, 'cpu', pin_memory=True)
            for _ in range(num_streams)
        ]

    # Define helpers for the copy-back operations (from cpu_buf to output)
    # copy_ops[s] holds the pending copy for stream `s`, or None.
    copy_ops = [None] * num_streams

    def wrap_copy_op(stream_idx):
        # Run (and clear) the pending copy-back of the given stream, if any
        if copy_ops[stream_idx] is not None:
            copy_ops[stream_idx]()
            copy_ops[stream_idx] = None

    def do_copy_op(output, buf, i_, ic_, j_, jc_):
        # This function will also do the type conversion
        output[i_:i_ + ic_, j_:j_ + jc_].copy_(buf[:ic_, :jc_])

    # Kernel computation begin
    with tcd.device(tc_device):
        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            with tcd.stream(streams[j_iter % len(streams)]):
                X1_chunk = X1.narrow(0, i, ic)
                if use_gpu_bufs:
                    cur_gX1 = gX1.narrow(0, 0, ic)
                    cur_gX1.copy_(X1_chunk, non_blocking=True)
                else:
                    cur_gX1 = X1_chunk

            for j in range(0, mtot, m):
                jc = min(m, mtot - j)
                # Choose the buffers for this inner iteration
                stream_id = j_iter % len(streams)
                stream = streams[stream_id]
                if use_gpu_bufs:
                    gX2 = gX2_list[stream_id]
                    gout = gout_list[stream_id]
                if not cuda_inputs:
                    cpu_buf = cpu_buf_list[stream_id]

                # Sync for buffers we must use now (e.g. 2 previous iters)
                with tcd.stream(stream):  # Inner-loop
                    stream.synchronize()
                    # Flush the copy-back left pending on this stream before
                    # its buffers get overwritten.
                    wrap_copy_op(stream_id)

                    if X1_equal_X2 and j < i:  # Shortcut for symmetric kernels
                        jc = min(m, mtot - j)
                        out[i:i + ic, j:j + jc].copy_(out[j:j + jc, i:i + ic].T,
                                                      non_blocking=True)
                        j_iter += 1
                        continue

                    # Copy (CPU->GPU)
                    X2_chunk = X2.narrow(0, j, jc)
                    if use_gpu_bufs:
                        cur_gX2 = gX2.narrow(0, 0, jc)
                        cur_gX2.copy_(X2_chunk, non_blocking=True)
                    else:
                        cur_gX2 = X2_chunk

                    if use_gpu_bufs:
                        cur_gout = gout[:ic, :jc]
                    else:
                        cur_gout = out[i:i + ic, j:j + jc]
                    cur_gout.fill_(0.0)

                    # Compute
                    ddd = kernel._prepare(cur_gX1, cur_gX2)
                    kernel._apply(cur_gX1, cur_gX2.T, cur_gout)
                    cur_gout = kernel._finalize(cur_gout, ddd)

                    # Copy Back (GPU->CPU)
                    if not cuda_inputs:
                        # copy_ does not care about the contiguity of copies, as long as it's
                        # consistent however, in case of C-contiguous inputs it will create an
                        # intermediate array which is undesired. We use cuda_memcpy2d_async
                        # which works well with C-contiguous arrays.
                        if stride == "F":
                            copy_to_host(ic, jc, cur_gout, 0, 0, cpu_buf, 0, 0, s=stream)
                        else:
                            cuda_memcpy2d_async(dst=cpu_buf.data_ptr(),
                                                dpitch=cpu_buf.stride(0) * dts,
                                                src=cur_gout.data_ptr(),
                                                spitch=cur_gout.stride(0) * dts,
                                                width=jc * dts,
                                                height=ic,
                                                stream=stream._as_parameter_)
                        # Defer the cpu_buf -> out copy until this stream is
                        # synchronised again.
                        copy_ops[stream_id] = partial(do_copy_op, out, cpu_buf, i, ic, j, jc)
                    elif change_dtype:
                        out.narrow(0, i, ic).narrow(1, j, jc).copy_(cur_gout, non_blocking=True)
                    j_iter += 1

        # Drain every stream and run the copy-backs that are still pending
        for i in range(num_streams):
            streams[i].synchronize()
            wrap_copy_op(i)

    return out
def fdmmv_cuda(X1: torch.Tensor,
               X2: torch.Tensor,
               v: Optional[torch.Tensor],
               w: Optional[torch.Tensor],
               kernel,
               out: Optional[torch.Tensor] = None,
               opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T
    w  : N x T

    performs fnc(X1*X2', X1, X2)' * ( fnc(X1*X2', X1, X2) * v  +  w )  : M x T
    in blocks on multiple GPUs

    Assume all inputs have the same data type

    The rows of `X1` (and `w`) are partitioned between the available GPUs;
    each worker writes its partial M x T result into a tensor on its own
    device, and the partial results are summed into `out` at the end.

    Raises
    ------
    ValueError
        If both `v` and `w` are `None`.
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (w, 'w'), (out, 'out'))
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    T = w.size(1) if v is None else v.size(1)
    M = X2.size(0)
    N = X1.size(0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    if out is None:
        out = create_same_stride((M, T), X1, X1.dtype, 'cpu', pin_memory=True)

    partial_outs = []  # outputs for each subprocess.
    worker_args = []
    for i, g in enumerate(gpu_info):
        start = block_sizes[i]
        bwidth = block_sizes[i + 1] - start
        if bwidth <= 0:
            continue

        gpu_out = create_same_stride((M, T), X1, X1.dtype, f'cuda:{gpu_info[i].Id}')  # M x T
        partial_outs.append(gpu_out)

        w_rows = w.narrow(0, start, bwidth) if w is not None else None
        job = ArgsFdmmv(X1=X1.narrow(0, start, bwidth),
                        X2=X2,
                        v=v,
                        w=w_rows,
                        out=gpu_out,
                        kernel=kernel,
                        max_mem=g.usable_ram)
        worker_args.append((job, g.Id))

    # If using torch multiprocessing with spawn/forkserver here we must make sure
    # that any consumer of the queues is on a different process than the queue producer.
    # This is due to passing in a CUDA tensor to the queue
    # https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
    # Thus we cannot run the first task on the current process.
    is_gaussian_dist = kernel.kernel_type == "l2distance" and kernel.name == "gaussian"
    target = distk_fdmmv if is_gaussian_dist else generic_fdmmv
    _start_wait_processes(target, worker_args)

    if len(partial_outs) <= 1:
        out.copy_(partial_outs[0])
        return out

    # Sum the per-device results on the device reporting the highest speed
    # noinspection PyTypeChecker
    fastest_device: int = np.argmax([d.speed for d in gpu_info])
    reduced = tcd.comm.reduce_add(partial_outs, destination=gpu_info[fastest_device].Id)
    out.copy_(reduced)
    return out
def init(self,
         X: Union[torch.Tensor, SparseTensor],
         Y: torch.Tensor,
         alpha: torch.Tensor,
         penalty: float,
         N: int) -> None:
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner becomes usable.
    On the first call (`self.fC` is `None`) the kernel between the centers is
    evaluated and factorized; on later calls the stored factor is restored
    and only the loss-dependent part is recomputed.

    Parameters
    ----------
    X : MxD tensor
        Matrix of Nystroem centers
    Y : Mx1 tensor
        Vector of targets corresponding to the Nystroem centers `X`
    alpha : Mx1 tensor
        Parameter vector (of the same dimension as `Y`) which gives the current
        solution to the optimization problem.
    penalty : float
        Regularization amount
    N : int
        Number of points in the full data-set.

    Raises
    ------
    ValueError
        If `Y` does not have exactly one column.

    Notes
    -----
    If `debug=True` is present in the options, this method will print a lot of extra
    information pertaining timings of the various preconditioner operations. This can be
    useful to help understand how the preconditioner works.
    """
    if Y.shape[1] != 1:
        raise ValueError(
            "Logistic preconditioner can only deal with 1D outputs.")

    dtype = X.dtype
    M = X.size(0)

    eps = self.params.pc_epsilon(dtype)

    if self.fC is None:
        # This is done only at the first iteration of the logistic-falkon algorithm
        # It sets the `T` variable from the paper (chol(kMM)) to the upper part of `self.fC`
        with TicToc("Kernel", debug=self.params.debug):
            if isinstance(X, torch.Tensor):
                C = create_same_stride((M, M), X, dtype=dtype, device='cpu',
                                       pin_memory=self._use_cuda)
            else:  # If sparse tensor we need fortran for kernel calculation
                C = create_fortran((M, M), dtype=dtype, device='cpu',
                                   pin_memory=self._use_cuda)
            self.kernel(X, X, out=C, opt=self.params)
            # `self.fC` is a numpy array obtained from `C`; if `C` is not
            # Fortran-contiguous its transpose is used instead.
            self.fC = C.numpy()
            if not is_f_contig(C):
                self.fC = self.fC.T
        with TicToc("Add diag", debug=self.params.debug):
            # Compute T: lower(fC) = T.T
            inplace_add_diag(self.fC, eps * M)
        with TicToc("Cholesky 1", debug=self.params.debug):
            self.fC = potrf_wrapper(self.fC, clean=True, upper=False,
                                    use_cuda=self._use_cuda, opt=self.params)
            # Save the diagonal which will be overwritten when computing A
            # NOTE(review): this reads through `C`, which presumably still
            # shares memory with `self.fC` after potrf_wrapper - confirm.
            self.dT = C.diag()
        with TicToc("Copy triangular", debug=self.params.debug):
            # Copy lower(fC) to upper(fC):  upper(fC) = T.
            copy_triang(self.fC, upper=False)
    else:
        if not self._use_cuda:
            # Copy non-necessary for cuda since LAUUM will do the copying
            with TicToc("Copy triangular", debug=self.params.debug):
                # Copy upper(fC) to lower(fC): lower(fC) = T.T
                copy_triang(self.fC, upper=True)  # does not copy the diagonal
        # Setting diagonal necessary for trmm
        inplace_set_diag(self.fC, self.dT)

    # Compute W
    with TicToc("TRMM", debug=self.params.debug):
        # T is on upper(fC). Compute T.T @ alpha
        # `alpha` is cloned first, so the caller's tensor is not passed on.
        alpha = self._trmm(alpha.clone())
    with TicToc("W (ddf)", debug=self.params.debug):
        W = self.loss.ddf(Y, alpha)
    with TicToc("W-Multiply", debug=self.params.debug):
        W.sqrt_()
        self.fC = vec_mul_triang(self.fC, W.numpy().reshape(-1), side=0, upper=False)

    if self._use_cuda:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=True, use_cuda=self._use_cuda,
                                    opt=self.params)
    else:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=False, use_cuda=self._use_cuda,
                                    opt=self.params)

    # NOTE: Here the multiplier is 1/N instead of the more common 1/M!
    mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / N)

    with TicToc("Add diag", debug=self.params.debug):
        # lower(fC) = 1/N * T@T.T + lambda * I
        inplace_add_diag(self.fC, penalty)

    with TicToc("Cholesky 2", debug=self.params.debug):
        # Cholesky on lower(fC) : lower(fC) = A.T
        self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                use_cuda=self._use_cuda, opt=self.params)
        # Diagonal of the second factor, taken directly from `self.fC`
        self.dA = torch.from_numpy(self.fC).diag()
def sparse_fmmv(proc_idx, queue, device_id):
    """Single-GPU worker: write ``kernel(X1, X2) @ v`` into `out` for sparse inputs.

    The job description (an `ArgsFmmv`) is taken from `queue`. The kernel
    matrix is evaluated in (n x m) tiles on the CUDA device `device_id`; the
    products with the matching rows of `v` are accumulated per row-block and
    each finished row-block is copied into `out`.

    `proc_idx` is not used. Returns the `out` tensor carried by the arguments.
    """
    args: ArgsFmmv = queue.get()

    X1: SparseTensor = args.X1
    X2: SparseTensor = args.X2
    v = args.v
    out = args.out
    kernel = args.kernel
    max_mem = args.max_mem

    dtype = X1.dtype
    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk : N + 2*D*N*density
    # X2_chunk : D + 2*D*M*density (because is transposed)
    # sparse_out : N + 2*N*M*(density) (assume density = 1)
    # ker_gpu  : M*N
    # mmv_gpu  : N*T
    # v_gpu    : M*T
    # Other: GPU buffer
    n, m = select_dim_over_m(
        maxM=mtot, maxN=ntot, tot=avail_mem,
        coef_nm=3,
        coef_n=2 + 2 * dtot * X1.density + T,
        coef_m=2 * dtot * X2.density + T,
        rest=dtot,
    )

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        v_gpu = v.to(device=ddev)  # M x T
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)
        # ker_gpu should be fortran-ordered due to cusparse csr2dense function
        ker_gpu = create_fortran((n, m), dtype=dtype, device=ddev)

        for row0 in range(0, ntot, n):
            rows = min(n, ntot - row0)

            blk_mmv = mmv_gpu[:rows]  # rows x T
            blk_mmv.fill_(0.0)

            X1_rows = X1.narrow_rows(row0, rows)
            X1_rows_d = X1_rows.index_to_int().to(device=ddev)

            for col0 in range(0, mtot, m):
                cols = min(m, mtot - col0)
                X2_rows = X2.narrow_rows(col0, cols)

                # Prepare sparse on CPU
                prepared = kernel._prepare_sparse(X1_rows, X2_rows)

                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_t_csr = X2_rows.transpose_csc().to_scipy().tocsr(copy=False)
                X2_rows_d = SparseTensor.from_scipy(X2_t_csr).index_to_int().to(device=ddev)

                tile = ker_gpu[:rows, :cols]
                tile.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                tile = kernel._apply_sparse(X1_rows_d, X2_rows_d, tile)
                tile = kernel._finalize(tile, prepared)

                # Multiply by the vector v
                blk_mmv.addmm_(tile, v_gpu.narrow(0, col0, cols))
                del prepared, X2_rows, X2_rows_d, X2_t_csr

            # send result to CPU
            copy_to_host_noorder(rows, T, blk_mmv, 0, 0, out, row0, 0)
            del X1_rows, X1_rows_d
    return out
def distk_fdmmv(proc_idx, queue, device_id):
    """Blockwise `K.T @ (K @ v + w)` on a single GPU for L2-distance kernels.

    For every row-block of `X1` the squared-distance matrix against the whole
    of `X2` is accumulated over feature chunks as
    `|x1|^2 + |x2|^2 - 2 * x1 @ x2.T`, clamped at zero and passed through
    `kernel._transform`. The resulting kernel block is then used for the two
    matrix products, whose result is accumulated into `out`.

    Parameters
    ----------
    proc_idx
        Worker index. Not used inside the function.
    queue
        Queue whose next item is the `ArgsFdmmv` bundle (X1, X2, v, w, out,
        kernel, max_mem) for this worker.
    device_id
        Index of the CUDA device on which the computation runs.

    Returns
    -------
    out
        The output tensor taken from the argument bundle (M x T).
    """
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem
    N, D = X1.size()
    M = X2.size(0)
    T = v.size(1) if v is not None else w.size(1)
    dtype = X1.dtype

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1ss : n x d
    # X2s  : M x d
    # Kv   : n x T
    # out  : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where if we let avail_mem like it is
    #        for 32-bit data-types some copy fails. In such case we need
    #        to free up some more memory and then everything runs fine.
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_d(
        maxD=D, maxN=N,
        coef_nd=1, coef_n=M + T + 1, coef_d=M,
        rest=rest_coef + M, tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    # The streams are created explicitly on the target device: a bare
    # `Stream()` belongs to whatever device is current at call time, which is
    # not necessarily `ddev`.
    s1 = tcd.Stream(device=ddev)
    s2 = tcd.Stream(device=ddev)

    with tcd.device(ddev), tcd.stream(s1):
        if v is not None:
            v_gpu = create_same_stride((M, T), v, dtype, ddev)
            copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
        K_gpu = create_same_stride((n, M), X1, dtype, ddev)
        X1ss_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        Kv_gpu = create_same_stride((n, T), X1, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        # The result is accumulated with addmm_, so it must start from zero.
        out_gpu.fill_(0.0)
        sq1_gpu = create_same_stride((n, ), X1, dtype, ddev)
        sq2_gpu = create_same_stride((M, ), X1, dtype, ddev)

        for i in range(0, N, n):
            nb = min(N - i, n)

            cur_K_gpu = K_gpu.narrow(0, 0, nb)  # nb x M
            cur_K_gpu.fill_(0.0)

            for j in range(0, D, d):
                db = min(D - j, d)
                # Parallelize two matrix transfers (probably pointless)
                with torch.cuda.stream(s2):
                    cur_X2s_gpu = copy_to_device_noorder(M, db, X2, 0, j, X2s_gpu, 0, 0, s=s2)
                    torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True, out=sq2_gpu).pow_(2)
                cur_X1ss_gpu = copy_to_device_noorder(nb, db, X1, i, j, X1ss_gpu, 0, 0, s=s1)
                torch.norm(cur_X1ss_gpu, p=2, dim=1, keepdim=True, out=sq1_gpu).pow_(2)

                # Both chunks and their squared norms must be ready before
                # they are combined below.
                s2.synchronize()
                s1.synchronize()
                cur_K_gpu.addmm_(mat1=cur_X1ss_gpu, mat2=cur_X2s_gpu.T, alpha=-2.0)
                cur_K_gpu.add_(sq1_gpu)
                cur_K_gpu.add_(sq2_gpu.T)
                cur_K_gpu.clamp_min_(0)

            cur_K_gpu = kernel._transform(cur_K_gpu)

            if w is not None:
                # Copy split w to GPU into cur_Kv_gpu,
                cur_Kv_gpu = copy_to_device_noorder(nb, T, w, i, 0, Kv_gpu, 0, 0, s=s1)  # n x T
                if v is not None:
                    cur_Kv_gpu.addmm_(cur_K_gpu, v_gpu)
            else:
                # v cannot be None if w is None
                cur_Kv_gpu = Kv_gpu.narrow(0, 0, nb)  # n x T
                torch.mm(cur_K_gpu, v_gpu, out=cur_Kv_gpu)  # n x T

            # Multiply transposed kernel with the Kv result.
            out_gpu.addmm_(cur_K_gpu.T, cur_Kv_gpu)  # M x T
            s1.synchronize()
        s1.synchronize()

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
def distk_fmmv(proc_idx, queue, device_id):
    """Blockwise kernel-vector product on a single GPU for L2-distance kernels.

    Each (row-block of `X1`, row-block of `X2`) tile of squared distances is
    formed as `|x1|^2 + |x2|^2 - 2 * x1 @ x2.T`, clamped at zero, passed
    through `kernel._transform` and multiplied by the matching rows of `v`.
    The accumulated result for each `X1` row-block is copied into `out`.

    Parameters
    ----------
    proc_idx
        Worker index. Not used inside the function.
    queue
        Queue whose next item is the `ArgsFmmv` bundle (X1, X2, v, out,
        kernel, max_mem) for this worker.
    device_id
        Index of the CUDA device on which the computation runs.

    Returns
    -------
    out
        The output tensor taken from the argument bundle (N x T).
    """
    args: ArgsFmmv = queue.get()
    X1 = args.X1
    X2 = args.X2
    v = args.v
    out = args.out
    kernel: L2DistanceKernel = args.kernel

    dtype = X1.dtype
    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]

    # GPU memory usage:
    # X1s : n x D
    # X2s : m x D
    # vs  : m x T
    # nm  : n x m
    # out : n x T
    # -----------
    # total: n*m + n * (D + T) + m * (D + T) = R
    avail_mem = args.max_mem / sizeof_dtype(dtype)
    n, m = select_dim_over_m(
        maxM=M,
        maxN=N,
        coef_nm=1.0,
        coef_n=D + T,
        coef_m=D + T,
        tot=avail_mem,
    )

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Device buffers, reused by every block.
        nm_gpu = create_same_stride((n, m), X1, dtype, ddev)
        out_gpu = create_same_stride((n, T), out, dtype, ddev)
        X1s_gpu = create_same_stride((n, D), X1, dtype, ddev)
        X2s_gpu = create_same_stride((m, D), X2, dtype, ddev)
        vs_gpu = create_same_stride((m, T), v, dtype, ddev)

        for row_start in range(0, N, n):
            row_len = min(n, N - row_start)

            x1_blk = copy_to_device_noorder(row_len, D, X1, row_start, 0, X1s_gpu, 0, 0)
            x1_sqnorm = torch.norm(x1_blk, p=2, dim=1, keepdim=True).pow_(2)
            acc_blk = out_gpu.narrow(0, 0, row_len)  # n x T
            acc_blk.fill_(0.0)

            for col_start in range(0, M, m):
                col_len = min(m, M - col_start)

                x2_blk = copy_to_device_noorder(col_len, D, X2, col_start, 0, X2s_gpu, 0, 0)
                v_blk = copy_to_device_noorder(col_len, T, v, col_start, 0, vs_gpu, 0, 0)  # m x T
                dist_blk = nm_gpu.narrow(0, 0, row_len).narrow(1, 0, col_len)  # n x m

                x2_sqnorm = torch.norm(x2_blk, p=2, dim=1, keepdim=True).pow_(2)
                torch.mm(x1_blk, x2_blk.T, out=dist_blk)

                # Squared distances, then the kernel's own transform.
                dist_blk.mul_(-2.0).add_(x1_sqnorm).add_(x2_sqnorm.T).clamp_min_(0)
                kernel._transform(dist_blk)

                # Multiply by the vector v
                # FIXME: This is the cause of mapping errors in case of float32 calculations.
                acc_blk.addmm_(dist_blk, v_blk)  # n x T

            # send result to CPU
            copy_to_host_noorder(row_len, T, out_gpu, 0, 0, out, row_start, 0)
    return out
def generic_fdmmv(proc_idx, queue, device_id):
    """Blockwise `K.T @ (K @ v + w)` on a single GPU for a generic kernel.

    For every row-block of `X1` the kernel block against the whole of `X2`
    is built by applying the kernel over feature chunks
    (`kernel._prepare` / `_apply` / `_finalize`); it is then used for the two
    matrix products whose result is accumulated into `out`.

    Parameters
    ----------
    proc_idx
        Worker index. Not used inside the function.
    queue
        Queue whose next item is the `ArgsFdmmv` bundle (X1, X2, v, w, out,
        kernel, max_mem) for this worker.
    device_id
        Index of the CUDA device on which the computation runs.

    Returns
    -------
    out
        The output tensor taken from the argument bundle (M x T).
    """
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    N, D = X1.size()
    M = X2.size(0)
    T = w.size(1) if v is None else v.size(1)

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1d  : n x d
    # X2d  : M x d
    # Kv   : n x T
    # out2 : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where if we let avail_mem like it is
    #        for 32-bit data-types some copy fails. In such case we need
    #        to free up some more memory and then everything runs fine.
    if sizeof_dtype(dtype) == 4:
        avail_mem /= 2
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_d(
        maxD=D, maxN=N,
        coef_nd=1, coef_n=M + T + 1, coef_d=M,
        rest=rest_coef + M, tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Initialize GPU data
        ker_gpu = create_same_stride((n, M), out, dtype=dtype, device=ddev)
        X1s_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        w_gpu = create_same_stride((n, T), ker_gpu, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        # The result is accumulated with addmm_, so it must start from zero.
        out_gpu.fill_(0.0)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T

        for i in range(0, N, n):
            ic = min(n, N - i)
            ddd = kernel._prepare(X1.narrow(0, i, ic), X2)

            c_g_ker = ker_gpu.narrow(0, 0, ic)
            c_g_ker.fill_(0.0)
            for k in range(0, D, d):
                kc = min(d, D - k)
                c_g_X1s = copy_to_device_noorder(ic, kc, X1, i, k, X1s_gpu, 0, 0)
                c_g_X2s = copy_to_device_noorder(M, kc, X2, 0, k, X2s_gpu, 0, 0)
                kernel._apply(c_g_X1s, c_g_X2s.T, c_g_ker)
            kernel._finalize(c_g_ker, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)
            if v is not None:
                c_g_w.addmm_(c_g_ker, v_gpu)
            out_gpu.addmm_(c_g_ker.T, c_g_w)

        if not out.is_cuda:
            # Device -> host transfer: use the host-bound helper, as
            # distk_fdmmv does for the same final step.
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
def fdmmv_cuda(X1: torch.Tensor,
               X2: torch.Tensor,
               v: Optional[torch.Tensor],
               w: Optional[torch.Tensor],
               kernel,
               out: Optional[torch.Tensor] = None,
               opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """Compute `K.T @ (K @ v + w)` in blocks on one or more GPUs.

    X1 : N x D
    X2 : M x D
    v  : M x T
    w  : N x T

    performs fnc(X1*X2', X1, X2)' * ( fnc(X1*X2', X1, X2) * v + w ) : M x T
    in blocks on multiple GPUs

    Assume all inputs have the same data type

    Parameters
    ----------
    X1, X2
        Data matrices with the shapes given above.
    v, w
        Optional operands; at least one of them must be given.
    kernel
        Kernel object. Kernels with `kernel_type == "l2distance"` and
        `name == "gaussian"` use the specialised distance worker, all others
        the generic one.
    out
        Optional M x T output tensor; allocated here when `None`.
    opt
        Options, normalised through `_setup_opt`.

    Returns
    -------
    out
        The M x T result.

    Raises
    ------
    ValueError
        If both `v` and `w` are `None`.
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (w, 'w'), (out, 'out'))
    device = X1.device
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    if out is None:
        out = create_same_stride((M, T), X1, X1.dtype, device=device,
                                 pin_memory=device.type != 'cuda')

    gpu_info = _get_gpu_info(opt, slack=0.9)

    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fdmmv
    else:
        target = generic_fdmmv

    if device.type == 'cuda':
        # Inputs already live on a GPU: run directly on that device.
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFdmmv(X1=X1, X2=X2, v=v, w=w, out=out, kernel=kernel,
                         max_mem=single_gpu_info.usable_ram)
        _call_direct(target, (args, device.index))
    else:
        # Split the rows of X1 (and w) across the GPUs; each worker produces
        # a partial M x T result that is summed at the end.
        block_sizes = calc_gpu_block_sizes(gpu_info, N)
        wrlk = []  # outputs for each subprocess.
        used_gpus = []  # GPUs which actually received a block (parallel to wrlk).
        args = []
        for i, g in enumerate(gpu_info):
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue

            cur_out_gpu = create_same_stride((M, T), X1, X1.dtype, f'cuda:{g.Id}')  # M x T
            wrlk.append(cur_out_gpu)
            used_gpus.append(g)

            cur_w = None
            if w is not None:
                cur_w = w.narrow(0, block_sizes[i], bwidth)
            args.append((ArgsFdmmv(
                X1=X1.narrow(0, block_sizes[i], bwidth),
                X2=X2, v=v, w=cur_w,
                out=cur_out_gpu, kernel=kernel, max_mem=g.usable_ram), g.Id))

        _start_wait_processes(target, args)

        if len(wrlk) > 1:
            # reduce_add needs the destination to hold one of the inputs, so
            # the fastest device is chosen among the GPUs which were given
            # work, not among all GPUs (some may have been skipped above).
            fastest_gpu = max(used_gpus, key=lambda g: g.speed)
            out.copy_(tcd.comm.reduce_add(wrlk, destination=fastest_gpu.Id))
        else:
            out.copy_(wrlk[0])
    return out
def _generic_fmm(proc_idx, queue, device_id):
    """Blockwise computation of the kernel matrix `K(X1, X2)` on a single GPU.

    The output is produced tile by tile: column-blocks (rows of `X2`) in the
    outer loop, row-blocks (rows of `X1`) in the inner loop, and feature
    chunks innermost. Each finished tile is copied into `out`.

    When `X1` and `X2` share the same memory, tiles whose mirror image has
    already been written are filled by transposing that mirror instead of
    being recomputed.

    Parameters
    ----------
    proc_idx
        Worker index. Not used inside the function.
    queue
        Queue whose next item is the `ArgsFmm` bundle (X1, X2, out, kernel,
        gpu_dtype, max_mem) for this worker.
    device_id
        Index of the CUDA device on which the computation runs.

    Returns
    -------
    out
        The output tensor taken from the argument bundle (ntot x mtot).
    """
    a: ArgsFmm = queue.get()
    X1: torch.Tensor = a.X1
    X2: torch.Tensor = a.X2
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem

    ntot, dtot = X1.shape
    mtot = X2.shape[0]

    # This function is slightly faster if we limit the sizes
    # of the processed blocks slightly. Especially when doing
    # a cold run since pinned-memory allocation is extremely slow.
    # We don't want to do it if we're memory constrained though.
    if max_mem > 4 * 2**30:
        max_mem /= 4
    avail_mem = max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage:
    # - gOut   : n x m
    # - g_ssX1 : n x d
    # - g_sX2  : m x d
    # total : n*d + m*d + n*m
    n, d, m = select_dim_fMM(avail_mem, ntot, dtot, mtot)

    # Loop invariant: whether the output is a symmetric kernel matrix.
    symmetric = _gpu_tns_same_memory(X1, X2)

    tc_device = torch.device('cuda:%d' % (int(device_id)))
    with torch.cuda.device(tc_device):
        # Initialize GPU buffers
        g_out = create_same_stride((n, m), out, gpu_dtype, tc_device)
        g_X1d = create_same_stride((n, d), X1, gpu_dtype, tc_device)
        g_X2d = create_same_stride((m, d), X2, gpu_dtype, tc_device)
        cpu_buf = None
        if X1.dtype != gpu_dtype:
            cpu_buf = create_same_stride((n, m), out, gpu_dtype, 'cpu', pin_memory=True)

        for j in range(0, mtot, m):
            jc = min(m, mtot - j)
            X2_chunk = cast_tensor(X2.narrow(0, j, jc), dtype=gpu_dtype, warn=False).pin_memory()

            for i in range(0, ntot, n):
                ic = min(n, ntot - i)

                # Column blocks are the outer loop, so at this point only the
                # columns before `j` are complete for every row. The mirror
                # tile out[j:j+jc, i:i+ic] is therefore available exactly when
                # all of its columns (i .. i+ic) precede `j`.
                if symmetric and i + ic <= j:
                    out[i:i + ic, j:j + jc].copy_(out[j:j + jc, i:i + ic].T)
                else:
                    X1_chunk = cast_tensor(X1.narrow(0, i, ic), dtype=gpu_dtype, warn=False).pin_memory()

                    ddd = kernel._prepare(X1_chunk, X2_chunk)

                    cur_g_out = g_out.narrow(0, 0, ic).narrow(1, 0, jc)
                    cur_g_out.fill_(0.0)

                    for k in range(0, dtot, d):
                        kc = min(d, dtot - k)
                        # Move to GPU
                        cur_g_X1d = g_X1d.narrow(0, 0, ic).narrow(1, 0, kc)
                        cur_g_X1d.copy_(X1_chunk.narrow(1, k, kc))
                        cur_g_X2d = g_X2d.narrow(0, 0, jc).narrow(1, 0, kc)
                        cur_g_X2d.copy_(X2_chunk.narrow(1, k, kc))
                        # Apply
                        a.kernel._apply(cur_g_X1d, cur_g_X2d.T, cur_g_out)

                    a.kernel._finalize(cur_g_out, ddd)
                    copy_to_host_noorder(ic, jc, cur_g_out, 0, 0, out, i, j, cpu_buf)
                    del ddd
        del g_out, g_X1d, g_X2d
    return out
def _sparse_fmm(proc_idx, queue, device_id):
    """Blockwise computation of the kernel matrix on a single GPU for sparse inputs.

    Tiles are produced with column-blocks (rows of `X2`) in the outer loop
    and row-blocks (rows of `X1`) in the inner loop; each tile is built on
    the device through the kernel's sparse hooks and then copied into `out`.

    Parameters
    ----------
    proc_idx
        Worker index. Not used inside the function.
    queue
        Queue whose next item is the `ArgsFmm` bundle (X1, X2, out, kernel,
        gpu_dtype, max_mem) for this worker.
    device_id
        Index of the CUDA device on which the computation runs.

    Returns
    -------
    out
        The output tensor taken from the argument bundle (ntot x mtot).
    """
    args: ArgsFmm = queue.get()
    X1: SparseTensor = args.X1
    X2: SparseTensor = args.X2
    out = args.out
    kernel = args.kernel
    gpu_dtype = args.gpu_dtype

    ntot, dtot = X1.shape
    mtot = X2.size(0)

    # Memory budget expressed in number of elements of `gpu_dtype`.
    avail_mem = args.max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage:
    # X1_chunk : ntot + 2 * D * ntot * density
    # X2_chunk : dtot + 2 * D * mtot * density (because is transposed)
    # sparse_out : ntot + 2 * ntot * mtot * density (assume density=1 here)
    # ker_gpu  : mtot * ntot
    n, m = select_dim_over_nm_v2(
        max_n=ntot,
        max_m=mtot,
        coef_nm=3,
        coef_n=2 + 2 * dtot * X1.density,
        coef_m=2 * dtot * X2.density,
        rest=dtot,
        max_mem=avail_mem,
    )

    tc_device = torch.device('cuda:%d' % (int(device_id)))
    with torch.cuda.device(tc_device):
        # Initialize GPU buffers
        g_out = create_same_stride((n, m), out, gpu_dtype, tc_device)
        # A pinned staging buffer is allocated only when a dtype change is needed.
        if X1.dtype != gpu_dtype:
            cpu_buf = create_same_stride((n, m), out, gpu_dtype, 'cpu', pin_memory=True)
        else:
            cpu_buf = None

        for col_start in range(0, mtot, m):
            col_len = min(m, mtot - col_start)

            X2_rows = X2.narrow_rows(col_start, col_len).to(dtype=gpu_dtype)
            # Transposed chunk, converted to CSR and moved to the device.
            X2_rows_gpu = (
                SparseTensor.from_scipy(
                    X2_rows.transpose_csc().to_scipy().tocsr(copy=False))
                .index_to_int()
                .to(device=tc_device)
            )

            for row_start in range(0, ntot, n):
                row_len = min(n, ntot - row_start)

                X1_rows = X1.narrow_rows(row_start, row_len).to(dtype=gpu_dtype)
                X1_rows_gpu = X1_rows.index_to_int().to(device=tc_device)

                tile_gpu = g_out.narrow(0, 0, row_len).narrow(1, 0, col_len)
                tile_gpu.fill_(0.0)

                prep = kernel._prepare_sparse(X1_rows, X2_rows)
                tile_gpu = kernel._apply_sparse(X1_rows_gpu, X2_rows_gpu, tile_gpu)
                tile_gpu = kernel._finalize(tile_gpu, prep)

                copy_to_host_noorder(row_len, col_len, tile_gpu, 0, 0,
                                     out, row_start, col_start, cpu_buf)
                del prep, X1_rows_gpu, X1_rows
            del X2_rows, X2_rows_gpu
        del g_out
    return out
def sparse_fdmmv(proc_idx, queue, device_id):
    """Blockwise `K.T @ (K @ v + w)` on a single GPU for sparse inputs.

    The whole of `X2` is moved to the device once; `X1` is processed in
    row-blocks. For each row-block the kernel block is built through the
    kernel's sparse hooks and used for the two matrix products, whose result
    is accumulated into `out`.

    Parameters
    ----------
    proc_idx
        Worker index. Not used inside the function.
    queue
        Queue whose next item is the `ArgsFdmmv` bundle (X1, X2, v, w, out,
        kernel, max_mem) for this worker.
    device_id
        Index of the CUDA device on which the computation runs.

    Returns
    -------
    out
        The output tensor taken from the argument bundle (M x T).

    Raises
    ------
    MemoryError
        If the memory budget does not allow even a single row of `X1` to be
        processed.
    """
    a: ArgsFdmmv = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, w, out = a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    N, D = X1.shape
    M = X2.size(0)
    T = w.size(1) if v is None else v.size(1)

    # Memory needs:
    # X1_chunk : ntot + 2 * D * ntot * density
    # X2       : dtot + 2 * D * M * density (because is transposed)
    # sparse_out : ntot + 2 * ntot * M * density (assume here density = 1)
    # ker_gpu  : M * ntot
    # w_gpu    : ntot * T
    # v_gpu    : M * T
    # out_gpu  : M * T
    avail_mem = max_mem / sizeof_dtype(dtype)
    den = 2 * D * X1.density + 2 + 3 * M + T  # cost per row of X1
    sub = D + 2 * D * M * X2.density + M * T  # fixed cost
    if v is not None:
        sub += M * T
    n = (avail_mem - sub) / den
    n = min(int(n), N)
    if n < 1:
        raise MemoryError("Not enough memory to run sparse dfmmv")

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Initialize GPU data
        w_gpu = create_same_stride((n, T), out, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        # The result is accumulated with addmm_, so it must start from zero.
        out_gpu.fill_(0.0)
        ker_gpu = create_fortran((n, M), dtype, ddev)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T

        X2_d = (
            SparseTensor.from_scipy(
                X2.transpose_csc().to_scipy().tocsr(copy=False))
            .index_to_int()
            .to(device=ddev)
        )

        for i in range(0, N, n):
            ic = min(n, N - i)
            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)

            ker_chunk = ker_gpu[:ic]
            ker_chunk.fill_(0.0)

            # TODO: This is wasteful (X2 will be prepared many times over)
            ddd = kernel._prepare_sparse(X1_chunk, X2)
            ker_chunk = kernel._apply_sparse(X1_chunk_d, X2_d, ker_chunk)
            ker_chunk = kernel._finalize(ker_chunk, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)

            if v is not None:
                c_g_w.addmm_(ker_chunk, v_gpu)
            out_gpu.addmm_(ker_chunk.T, c_g_w)
            del ddd, X1_chunk, X1_chunk_d

        if not out.is_cuda:
            # Device -> host transfer: use the host-bound helper, as
            # sparse_fmmv does when sending its result to the CPU.
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
def _generic_fmm(proc_idx, queue, device_id):
    """Blockwise computation of the kernel matrix `K(X1, X2)` on a single GPU.

    Handles both host and CUDA inputs. When the inputs are already on the
    GPU and no dtype change is requested, no splitting is performed and the
    kernel writes directly into `out`; otherwise tiles are computed in
    device buffers and copied into `out`.

    When `X1` and `X2` share the same memory, tiles whose mirror image has
    already been written are filled by transposing that mirror instead of
    being recomputed.

    Parameters
    ----------
    proc_idx
        Worker index. Not used inside the function.
    queue
        Queue whose next item is the `ArgsFmm` bundle (X1, X2, out, kernel,
        gpu_dtype, max_mem) for this worker.
    device_id
        Index of the CUDA device on which the computation runs.

    Returns
    -------
    out
        The output tensor taken from the argument bundle (ntot x mtot).
    """
    a: ArgsFmm = queue.get()
    X1: torch.Tensor = a.X1
    X2: torch.Tensor = a.X2
    cuda_inputs = X1.is_cuda
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem
    change_dtype = gpu_dtype != X1.dtype

    ntot, dtot = X1.shape
    mtot = X2.shape[0]

    # This function is slightly faster if we limit the sizes
    # of the processed blocks slightly. Especially when doing
    # a cold run since pinned-memory allocation is extremely slow.
    # We don't want to do it if we're memory constrained though.
    if max_mem > 4 * 2**30:
        max_mem /= 4
    avail_mem = max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage:
    # - gOut   : n x m
    # - g_ssX1 : n x d
    # - g_sX2  : m x d
    # total : n*d + m*d + n*m
    if cuda_inputs and not change_dtype:
        # No allocation will be performed, so no need to split at all!
        n, d, m = ntot, dtot, mtot
    else:
        n, d, m = select_dim_fMM(avail_mem, ntot, dtot, mtot)

    # Loop invariants.
    symmetric = _gpu_tns_same_memory(X1, X2)
    use_buffers = (not cuda_inputs) or change_dtype

    tc_device = torch.device('cuda:%d' % (int(device_id)))

    s1 = torch.cuda.Stream(device=tc_device)
    with torch.cuda.device(tc_device), torch.cuda.stream(s1):
        # Initialize GPU buffers
        if use_buffers:
            g_X1d = create_same_stride((n, d), X1, gpu_dtype, tc_device)
            g_X2d = create_same_stride((m, d), X2, gpu_dtype, tc_device)
            g_out = create_same_stride((n, m), out, gpu_dtype, tc_device)
        if not cuda_inputs:
            cpu_buf = None
            if change_dtype:
                cpu_buf = create_same_stride((n, m), out, gpu_dtype, 'cpu', pin_memory=True)

        for j in range(0, mtot, m):
            jc = min(m, mtot - j)
            X2_chunk = X2.narrow(0, j, jc)

            for i in range(0, ntot, n):
                ic = min(n, ntot - i)

                # Column blocks are the outer loop, so at this point only the
                # columns before `j` are complete for every row. The mirror
                # tile out[j:j+jc, i:i+ic] is therefore available exactly when
                # all of its columns (i .. i+ic) precede `j`.
                if symmetric and i + ic <= j:
                    out[i:i + ic, j:j + jc].copy_(out[j:j + jc, i:i + ic].T)
                    continue

                X1_chunk = X1.narrow(0, i, ic)
                ddd = kernel._prepare(X1_chunk, X2_chunk)

                if use_buffers:
                    cur_g_out = g_out.narrow(0, 0, ic).narrow(1, 0, jc)
                else:
                    # Write straight into the (CUDA) output tile.
                    cur_g_out = out.narrow(0, i, ic).narrow(1, j, jc)
                cur_g_out.fill_(0.0)

                for k in range(0, dtot, d):
                    kc = min(d, dtot - k)
                    # Move to GPU and type-convert
                    if use_buffers:
                        cur_g_X1d = g_X1d.narrow(0, 0, ic).narrow(1, 0, kc)
                        cur_g_X1d.copy_(X1_chunk.narrow(1, k, kc))
                        cur_g_X2d = g_X2d.narrow(0, 0, jc).narrow(1, 0, kc)
                        cur_g_X2d.copy_(X2_chunk.narrow(1, k, kc))
                    else:
                        cur_g_X1d = X1_chunk.narrow(1, k, kc)
                        cur_g_X2d = X2_chunk.narrow(1, k, kc)
                    # Apply
                    a.kernel._apply(cur_g_X1d, cur_g_X2d.T, cur_g_out)

                a.kernel._finalize(cur_g_out, ddd)

                if not cuda_inputs:
                    copy_to_host_noorder(ic, jc, cur_g_out, 0, 0, out, i, j, cpu_buf, s1)
                elif change_dtype:
                    out.narrow(0, i, ic).narrow(1, j, jc).copy_(cur_g_out)
                del ddd
    return out
def init(self, X: Union[torch.Tensor, SparseTensor], weight_vec: Optional[torch.Tensor] = None):
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner can be used.
    On completion `self.fC` holds the factorised matrix, and `self.dT` /
    `self.dA` hold the diagonals obtained after the first and second
    Cholesky step respectively.

    Parameters
    ----------
    X : torch.Tensor or SparseTensor
        The (M x D) matrix of Nystroem centers
    weight_vec
        An optional vector of size (M x 1) which is used for reweighted least-squares.
        This vector should contain the weights corresponding to the Nystrom centers.
        The vector is not modified by this method.

    Raises
    ------
    RuntimeError
        If `X` is a CUDA tensor but this object was configured not to use CUDA.
    ValueError
        If `weight_vec` is on a different device than `X`, or if its first
        dimension differs from that of `X`.
    """
    if X.is_cuda and not self._use_cuda:
        raise RuntimeError(
            "use_cuda is set to False, but data is CUDA tensor. "
            "Check your options.")
    if weight_vec is not None and not check_same_device(X, weight_vec):
        raise ValueError(f"Weights and data are not on the same device "
                         f"({weight_vec.device}, {X.device})")
    if weight_vec is not None and weight_vec.shape[0] != X.shape[0]:
        raise ValueError(
            f"Weights and Nystrom centers should have the same first dimension. "
            f"Found instead {weight_vec.shape[0]}, {X.shape[0]}.")

    dtype = X.dtype
    dev = X.device
    eps = self.params.pc_epsilon(X.dtype)
    M = X.size(0)

    # Square root of the weights, computed out-of-place so that the caller's
    # tensor is left untouched (and repeated calls to `init` see the same
    # weights).
    sqrt_weight_vec = weight_vec.sqrt() if weight_vec is not None else None

    with TicToc("Kernel", debug=self.params.debug):
        if isinstance(X, torch.Tensor):
            C = create_same_stride((M, M), X, dtype=dtype, device=dev,
                                   pin_memory=self._use_cuda)
        else:  # If sparse tensor we need fortran for kernel calculation
            C = create_fortran((M, M), dtype=dtype, device=dev, pin_memory=self._use_cuda)
        self.kernel(X, X, out=C, opt=self.params)
    if not is_f_contig(C):
        C = C.T

    with TicToc("Cholesky 1", debug=self.params.debug):
        # Compute T: lower(fC) = T.T
        inplace_add_diag_th(C, eps * M)
        C = potrf_wrapper(C, clean=False, upper=False,
                          use_cuda=self._use_cuda, opt=self.params)
        # Save the diagonal which will be overwritten when computing A
        self.dT = C.diag()

    with TicToc("Copy triangular", debug=self.params.debug):
        # Copy lower(fC) to upper(fC):  upper(fC) = T.
        copy_triang(C, upper=False)

    # Weighted least-squares needs to weight the A matrix. We can weigh once before LAUUM,
    # but since CUDA-LAUUM touches both sides of C, weighting before LAUUM will also modify
    # the matrix T. Therefore for CUDA inputs we weigh twice after LAUUM!
    if sqrt_weight_vec is not None and not self._use_cuda:
        with TicToc("Weighting(CPU)", debug=self.params.debug):
            vec_mul_triang(C, sqrt_weight_vec, side=1, upper=False)

    if self._use_cuda:
        with TicToc("LAUUM(CUDA)", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T, store in lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=True, use_cuda=self._use_cuda, opt=self.params)
    else:
        with TicToc("LAUUM(CPU)", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC), store in lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=False, use_cuda=self._use_cuda, opt=self.params)

    if sqrt_weight_vec is not None and self._use_cuda:
        with TicToc("Weighting(CUDA)", debug=self.params.debug):
            vec_mul_triang(C, sqrt_weight_vec, side=0, upper=False)
            vec_mul_triang(C, sqrt_weight_vec, side=1, upper=False)

    with TicToc("Cholesky 2", debug=self.params.debug):
        # lower(fC) = 1/M * T@T.T
        mul_triang(C, upper=False, preserve_diag=False, multiplier=1 / M)
        # lower(fC) = 1/M * T@T.T + lambda * I
        inplace_add_diag_th(C, self._lambda)
        # Cholesky on lower(fC) : lower(fC) = A.T
        C = potrf_wrapper(C, clean=False, upper=False,
                          use_cuda=self._use_cuda, opt=self.params)
        self.dA = C.diag()

    self.fC = C