def test_low(self, mat, order, dtype, device):
    mat = fix_mat(mat, order=order, dtype=dtype, numpy=True)
    mat_low = mat.copy(order="K")
    # Upper triangle of mat_low is 0
    mat_low[np.triu_indices(self.t, 1)] = 0
    # Create device matrix
    mat_low = torch.from_numpy(mat_low)
    mat_low_dev = move_tensor(mat_low, device)

    # Run copy
    copy_triang(mat_low_dev, upper=False)

    # Make checks on CPU
    mat_low = mat_low_dev.cpu().numpy()
    assert np.sum(mat_low == 0) == 0
    np.testing.assert_array_equal(np.tril(mat), np.tril(mat_low))
    np.testing.assert_array_equal(np.triu(mat_low), np.tril(mat_low).T)
    np.testing.assert_array_equal(np.diag(mat), np.diag(mat_low))

    # Reset and try with `upper=True`
    mat_low[np.triu_indices(self.t, 1)] = 0
    mat_low_dev.copy_(torch.from_numpy(mat_low))
    copy_triang(mat_low_dev, upper=True)  # Only the diagonal will be set

    mat_low = mat_low_dev.cpu().numpy()
    np.testing.assert_array_equal(np.diag(mat), np.diag(mat_low))
def test_up(self, mat, order, dtype, device):
    mat = fix_mat(mat, order=order, dtype=dtype, numpy=True)
    mat_up = mat.copy(order="K")
    # Lower triangle of mat_up is 0
    mat_up[np.tril_indices(self.t, -1)] = 0
    # Create device matrix
    mat_up = torch.from_numpy(mat_up)
    mat_up_dev = move_tensor(mat_up, device)

    copy_triang(mat_up_dev, upper=True)

    mat_up = mat_up_dev.cpu().numpy()
    assert np.sum(mat_up == 0) == 0
    np.testing.assert_array_equal(np.triu(mat), np.triu(mat_up))
    np.testing.assert_array_equal(np.tril(mat_up), np.triu(mat_up).T)
    np.testing.assert_array_equal(np.diag(mat), np.diag(mat_up))

    # Reset and try with `upper=False`
    mat_up[np.tril_indices(self.t, -1)] = 0
    mat_up_dev.copy_(torch.from_numpy(mat_up))
    copy_triang(mat_up_dev, upper=False)  # Only the diagonal will be set.

    mat_up = mat_up_dev.cpu().numpy()
    np.testing.assert_array_equal(np.diag(mat), np.diag(mat_up))
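# A minimal NumPy reference for the `copy_triang` semantics exercised by the
# two tests above (an illustrative sketch, not the library implementation):
# the triangle selected by `upper` is the source, it is mirrored onto the
# opposite triangle, and the diagonal is never touched.
import numpy as np

def copy_triang_reference(mat: np.ndarray, upper: bool) -> np.ndarray:
    out = mat.copy()
    n = mat.shape[0]
    if upper:
        # Upper triangle is the source: lower(out) = upper(mat).T
        i, j = np.tril_indices(n, -1)
    else:
        # Lower triangle is the source: upper(out) = lower(mat).T
        i, j = np.triu_indices(n, 1)
    out[i, j] = mat[j, i]
    return out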
def init(self, X: Union[torch.Tensor, SparseTensor], Y: torch.Tensor,
         alpha: torch.Tensor, penalty: float, N: int) -> None:
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner becomes usable.

    Parameters
    ----------
    X : torch.Tensor
        (M x D) matrix of Nystroem centers
    Y : torch.Tensor
        (M x 1) vector of targets corresponding to the Nystroem centers `X`
    alpha : torch.Tensor
        (M x 1) parameter vector (of the same dimension as `Y`) which gives the current
        solution to the optimization problem.
    penalty : float
        Regularization amount
    N : int
        Number of points in the full data-set.

    Notes
    -----
    If `debug=True` is present in the options, this method will print extra information
    pertaining to the timings of the various preconditioner operations. This can be
    useful to help understand how the preconditioner works.
    """
    if Y.shape[1] != 1:
        raise ValueError("Logistic preconditioner can only deal with 1D outputs.")

    dtype = X.dtype
    M = X.size(0)
    eps = self.params.pc_epsilon(dtype)

    if self.fC is None:
        # This is done only at the first iteration of the logistic-falkon algorithm.
        # It sets the `T` variable from the paper (chol(kMM)) to the upper part of `self.fC`
        with TicToc("Kernel", debug=self.params.debug):
            if isinstance(X, torch.Tensor):
                C = create_same_stride((M, M), X, dtype=dtype, device='cpu',
                                       pin_memory=self._use_cuda)
            else:  # If sparse tensor we need fortran for kernel calculation
                C = create_fortran((M, M), dtype=dtype, device='cpu',
                                   pin_memory=self._use_cuda)
            self.kernel(X, X, out=C, opt=self.params)
        if not is_f_contig(C):
            C = C.T

        with TicToc("Add diag", debug=self.params.debug):
            # Compute T: lower(fC) = T.T
            inplace_add_diag_th(C, eps * M)
        with TicToc("Cholesky 1", debug=self.params.debug):
            C = potrf_wrapper(C, clean=True, upper=False,
                              use_cuda=self._use_cuda, opt=self.params)
            # Save the diagonal which will be overwritten when computing A
            self.dT = C.diag()
        with TicToc("Copy triangular", debug=self.params.debug):
            # Copy lower(fC) to upper(fC): upper(fC) = T.
            copy_triang(C, upper=False)
    else:
        C = self.fC
        if not self._use_cuda:
            # The copy is not necessary for CUDA, since LAUUM will do the copying
            with TicToc("Copy triangular", debug=self.params.debug):
                # Copy upper(fC) to lower(fC): lower(fC) = T.T
                copy_triang(C, upper=True)  # does not copy the diagonal
        # Setting the diagonal is necessary for TRMM
        C.diagonal().copy_(self.dT)

    # Compute W
    with TicToc("TRMM", debug=self.params.debug):
        # T is on upper(fC). Compute T.T @ alpha
        alpha = self._trmm(C, alpha.clone())
    with TicToc("W (ddf)", debug=self.params.debug):
        W = self.loss.ddf(Y, alpha)
    with TicToc("W-Multiply", debug=self.params.debug):
        W.sqrt_()
        vec_mul_triang(C, W.numpy().reshape(-1), side=0, upper=False)

    # The LAUUM side depends on the CUDA or CPU version, because the matrix is initially
    # symmetric and the CUDA version will write the result on the opposite side
    # (i.e. `write_opposite=True`) while the CPU version will write on the same side.
    if self._use_cuda:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=True, use_cuda=self._use_cuda, opt=self.params)
    else:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=False, use_cuda=self._use_cuda, opt=self.params)

    # NOTE: Here the multiplier is 1/N instead of the more common 1/M!
    mul_triang(C, upper=False, preserve_diag=False, multiplier=1 / N)

    with TicToc("Add diag", debug=self.params.debug):
        # lower(fC) = 1/N * T@T.T + lambda * I
        inplace_add_diag_th(C, penalty)

    with TicToc("Cholesky 2", debug=self.params.debug):
        # Cholesky on lower(fC) : lower(fC) = A.T
        C = potrf_wrapper(C, clean=False, upper=False,
                          use_cuda=self._use_cuda, opt=self.params)

    self.dA = C.diag()
    self.fC = C
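# Dense NumPy restatement of the factorization assembled by `init` above
# (a sketch following the inline comments, not the in-place implementation;
# `loss_ddf` stands in for `self.loss.ddf`, all other names are local).
import numpy as np

def dense_logistic_preconditioner_reference(kmm, y, alpha, loss_ddf, penalty, N, eps):
    M = kmm.shape[0]
    # Cholesky 1: kMM + eps*M*I = T.T @ T, with T upper triangular
    T = np.linalg.cholesky(kmm + eps * M * np.eye(M)).T
    # W: second derivative of the loss, evaluated at T.T @ alpha
    d = np.asarray(loss_ddf(y, T.T @ alpha)).reshape(-1)
    # Cholesky 2: 1/N * T @ diag(W) @ T.T + penalty*I = A.T @ A
    # (note the 1/N multiplier, rather than the usual 1/M)
    A = np.linalg.cholesky((T * d) @ T.T / N + penalty * np.eye(M)).T
    return T, A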
def init(self, X: Union[torch.Tensor, SparseTensor]):
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner can be used.

    Parameters
    ----------
    X : torch.Tensor
        The (M x D) matrix of Nystroem centers
    """
    dtype = X.dtype
    dev = X.device
    if X.is_cuda and not self._use_cuda:
        raise RuntimeError("use_cuda is set to False, but the data is a CUDA tensor. "
                           "Check your options.")

    eps = self.params.pc_epsilon(X.dtype)
    M = X.size(0)

    with TicToc("Kernel", debug=self.params.debug):
        if isinstance(X, torch.Tensor):
            C = create_same_stride((M, M), X, dtype=dtype, device=dev,
                                   pin_memory=self._use_cuda)
        else:  # If sparse tensor we need fortran for kernel calculation
            C = create_fortran((M, M), dtype=dtype, device=dev,
                               pin_memory=self._use_cuda)
        self.kernel(X, X, out=C, opt=self.params)
    if not is_f_contig(C):
        C = C.T

    with TicToc("Cholesky 1", debug=self.params.debug):
        # Compute T: lower(fC) = T.T
        inplace_add_diag_th(C, eps * M)
        C = potrf_wrapper(C, clean=False, upper=False,
                          use_cuda=self._use_cuda, opt=self.params)
        # Save the diagonal which will be overwritten when computing A
        self.dT = C.diag()

    with TicToc("Copy triangular", debug=self.params.debug):
        # Copy lower(fC) to upper(fC): upper(fC) = T.
        copy_triang(C, upper=False)

    if self._use_cuda:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=True, use_cuda=self._use_cuda, opt=self.params)
    else:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=False, use_cuda=self._use_cuda, opt=self.params)

    with TicToc("Cholesky 2", debug=self.params.debug):
        # lower(fC) = 1/M * T@T.T
        mul_triang(C, upper=False, preserve_diag=False, multiplier=1 / M)
        # lower(fC) = 1/M * T@T.T + lambda * I
        inplace_add_diag_th(C, self._lambda)
        # Cholesky on lower(fC) : lower(fC) = A.T
        C = potrf_wrapper(C, clean=False, upper=False,
                          use_cuda=self._use_cuda, opt=self.params)

    self.dA = C.diag()
    self.fC = C
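# Dense NumPy restatement of the two Cholesky factors computed above (an
# illustrative sketch: the real code works in-place on a single (M, M)
# buffer, holding T in its upper triangle and A.T in its lower triangle).
import numpy as np

def dense_falkon_preconditioner_reference(kmm, penalty, eps):
    M = kmm.shape[0]
    # Cholesky 1: kMM + eps*M*I = T.T @ T, with T upper triangular
    T = np.linalg.cholesky(kmm + eps * M * np.eye(M)).T
    # Cholesky 2: 1/M * T @ T.T + lambda*I = A.T @ A
    A = np.linalg.cholesky(T @ T.T / M + penalty * np.eye(M)).T
    return T, A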
def init(self, X: Union[torch.Tensor, SparseTensor],
         weight_vec: Optional[torch.Tensor] = None):
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner can be used.

    Parameters
    ----------
    X : torch.Tensor
        The (M x D) matrix of Nystroem centers
    weight_vec
        An optional vector of size (M x 1) which is used for reweighted least-squares.
        This vector should contain the weights corresponding to the Nystrom centers.
    """
    if X.is_cuda and not self._use_cuda:
        raise RuntimeError("use_cuda is set to False, but the data is a CUDA tensor. "
                           "Check your options.")
    if weight_vec is not None and not check_same_device(X, weight_vec):
        raise ValueError(f"Weights and data are not on the same device "
                         f"({weight_vec.device}, {X.device})")
    if weight_vec is not None and weight_vec.shape[0] != X.shape[0]:
        raise ValueError(f"Weights and Nystrom centers should have the same first dimension. "
                         f"Found instead {weight_vec.shape[0]}, {X.shape[0]}.")

    dtype = X.dtype
    dev = X.device
    eps = self.params.pc_epsilon(X.dtype)
    M = X.size(0)

    with TicToc("Kernel", debug=self.params.debug):
        if isinstance(X, torch.Tensor):
            C = create_same_stride((M, M), X, dtype=dtype, device=dev,
                                   pin_memory=self._use_cuda)
        else:  # If sparse tensor we need fortran for kernel calculation
            C = create_fortran((M, M), dtype=dtype, device=dev,
                               pin_memory=self._use_cuda)
        self.kernel(X, X, out=C, opt=self.params)
    if not is_f_contig(C):
        C = C.T

    with TicToc("Cholesky 1", debug=self.params.debug):
        # Compute T: lower(fC) = T.T
        inplace_add_diag_th(C, eps * M)
        C = potrf_wrapper(C, clean=False, upper=False,
                          use_cuda=self._use_cuda, opt=self.params)
        # Save the diagonal which will be overwritten when computing A
        self.dT = C.diag()

    with TicToc("Copy triangular", debug=self.params.debug):
        # Copy lower(fC) to upper(fC): upper(fC) = T.
        copy_triang(C, upper=False)

    # Weighted least-squares needs to weight the A matrix. We can weigh once before LAUUM,
    # but since CUDA-LAUUM touches both sides of C, weighting before LAUUM would also modify
    # the matrix T. Therefore for CUDA inputs we weigh twice after LAUUM!
    if weight_vec is not None and not self._use_cuda:
        with TicToc("Weighting(CPU)", debug=self.params.debug):
            weight_vec.sqrt_()
            vec_mul_triang(C, weight_vec, side=1, upper=False)

    if self._use_cuda:
        with TicToc("LAUUM(CUDA)", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T, store in lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=True, use_cuda=self._use_cuda, opt=self.params)
    else:
        with TicToc("LAUUM(CPU)", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC), store in lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=False, use_cuda=self._use_cuda, opt=self.params)

    if weight_vec is not None and self._use_cuda:
        with TicToc("Weighting(CUDA)", debug=self.params.debug):
            weight_vec.sqrt_()
            vec_mul_triang(C, weight_vec, side=0, upper=False)
            vec_mul_triang(C, weight_vec, side=1, upper=False)

    with TicToc("Cholesky 2", debug=self.params.debug):
        # lower(fC) = 1/M * T@T.T
        mul_triang(C, upper=False, preserve_diag=False, multiplier=1 / M)
        # lower(fC) = 1/M * T@T.T + lambda * I
        inplace_add_diag_th(C, self._lambda)
        # Cholesky on lower(fC) : lower(fC) = A.T
        C = potrf_wrapper(C, clean=False, upper=False,
                          use_cuda=self._use_cuda, opt=self.params)

    self.dA = C.diag()
    self.fC = C
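# In the weighted variant, T @ T.T is scaled symmetrically by sqrt(weight_vec)
# before the second Cholesky: on CPU via a single column-scaling of
# lower(fC) = T.T before LAUUM, on CUDA via a row- and a column-scaling after
# LAUUM. A dense sketch of the resulting factor (illustrative only):
import numpy as np

def dense_weighted_preconditioner_reference(kmm, weights, penalty, eps):
    M = kmm.shape[0]
    T = np.linalg.cholesky(kmm + eps * M * np.eye(M)).T
    sw = np.sqrt(np.asarray(weights).reshape(-1, 1))
    # 1/M * diag(sw) @ (T @ T.T) @ diag(sw) + lambda*I = A.T @ A
    A = np.linalg.cholesky(sw * (T @ T.T) * sw.T / M + penalty * np.eye(M)).T
    return T, A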