class TestKeops:
    """Tests of `kernel.mmv` with KeOps forced on (`keops_active="force"`)."""

    # Shared options; individual tests derive their own copy via `dataclasses.replace`.
    basic_options = FalkonOptions(debug=True, compute_arch_speed=False, keops_active="force",
                                  max_cpu_mem=max_mem_dense, max_gpu_mem=max_mem_dense)

    @pytest.mark.parametrize("Ao,Adt,Bo,Bdt,vo,vdt", [
        ("C", np.float32, "C", np.float32, "C", np.float32),
        ("C", np.float64, "C", np.float64, "C", np.float64),
        pytest.param("F", np.float32, "F", np.float32, "F", np.float32,
                     marks=[pytest.mark.xfail(reason="KeOps only C")]),
        pytest.param("F", np.float32, "C", np.float32, "C", np.float32,
                     marks=[pytest.mark.xfail(reason="KeOps only C")]),
    ], ids=["AC32-BC32-vC32", "AC64-BC64-vC64", "AF32-BF32-vF32", "AF32-BC32-vC32"])
    @pytest.mark.parametrize("cpu", cpu_params, ids=["cpu", "gpu"])
    def test_fmmv(self, A, B, v, Ao, Adt, Bo, Bdt, vo, vdt, kernel, expected_fmmv, cpu):
        """Check `kernel.mmv` against `expected_fmmv` for each order/dtype combination.

        The cases that use "F" order for `A` are marked as expected failures
        ("KeOps only C"). Each case runs twice: once letting `mmv` allocate
        the output and once with a preallocated `out` tensor.
        """
        A = fix_mat(A, order=Ao, dtype=Adt)
        B = fix_mat(B, order=Bo, dtype=Bdt)
        v = fix_mat(v, order=vo, dtype=vdt)
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu)
        rtol = choose_on_dtype(A.dtype)

        # Test normal
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=None, rtol=rtol, opt=opt)
        # Test with out
        out = torch.empty(A.shape[0], v.shape[1], dtype=A.dtype)
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=out, rtol=rtol, opt=opt)

    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    def test_gpu_inputs(self, A, B, v, kernel, expected_fmmv):
        """Check `kernel.mmv` when all inputs (and `out`) live on the same CUDA device."""
        A = fix_mat(A, order="C", dtype=n32).cuda()
        # B, v and `out` are placed on whatever device A ended up on.
        B = fix_mat(B, order="C", dtype=n32, device=A.device)
        v = fix_mat(v, order="C", dtype=n32, device=A.device)
        # The GPU memory limit of `basic_options` is lifted for this test.
        opt = dataclasses.replace(self.basic_options, use_cpu=False, max_gpu_mem=np.inf)
        rtol = choose_on_dtype(A.dtype)
        # Test normal
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=None, rtol=rtol, opt=opt)
        # Test with out
        out = torch.empty(A.shape[0], v.shape[1], dtype=A.dtype, device=A.device)
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=out, rtol=rtol, opt=opt)

    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    def test_gpu_inputs_fail(self, A, B, v, kernel, expected_fmmv):
        """Expect a `RuntimeError` when `A`, `B` are on "cuda:0" but `v` is on the CPU."""
        A = fix_mat(A, order="C", dtype=n32, device="cuda:0")
        B = fix_mat(B, order="C", dtype=n32, device="cuda:0")
        v = fix_mat(v, order="C", dtype=n32, device="cpu")
        opt = dataclasses.replace(self.basic_options, use_cpu=False, max_gpu_mem=np.inf)
        rtol = choose_on_dtype(A.dtype)
        # Test normal
        with pytest.raises(RuntimeError):
            _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=None, rtol=rtol, opt=opt)
def _decide_backend(opt: BaseOptions, num_dim: int) -> str:
    """Pick the KeOps backend string for the given options.

    Returns ``'GPU_1D'`` when ``decide_cuda(opt)`` is truthy and ``'CPU'``
    otherwise. ``num_dim`` is accepted but not used by this function.
    """
    return 'GPU_1D' if decide_cuda(opt) else 'CPU'
def __init__(
        self,
        kernel: falkon.kernels.Kernel,
        penalty_list: List[float],
        iter_list: List[int],
        loss: Loss,
        M: int,
        center_selection: Union[str, falkon.center_selection.NySel] = 'uniform',
        seed: Optional[int] = None,
        error_fn: Optional[callable] = None,
        error_every: Optional[int] = 1,
        options: Optional[FalkonOptions] = None,
):
    """Store the estimator configuration and derive the per-component options.

    Parameters
    ----------
    kernel
        Kernel object, stored as ``self.kernel``.
    penalty_list
        List of penalty values; must have the same length as `iter_list`.
    iter_list
        List of iteration counts; must have the same length as `penalty_list`.
    loss
        Loss object, stored as ``self.loss``.
    M
        Stored as ``self.M`` (presumably the number of centers -- confirm
        against the rest of the class).
    center_selection
        Either the string ``'uniform'`` (case-insensitive), which builds a
        ``UniformSel`` from the estimator's random state, or an
        already-constructed selector object which is stored as given.
    seed
        When not ``None`` it seeds ``torch.manual_seed``; it is always passed
        to ``check_random_generator`` to build ``self.random_state_``.
    error_fn, error_every
        Stored unchanged on the instance.
    options
        Falkon options. When ``None`` (the default) a fresh ``FalkonOptions()``
        is created for this instance.

    Raises
    ------
    ValueError
        If `iter_list` and `penalty_list` differ in length, or if
        `center_selection` is a string other than ``'uniform'``.
    """
    # A default of `FalkonOptions()` in the signature would be evaluated once,
    # at definition time, and shared by every estimator built without explicit
    # options. Creating it here gives each instance its own object.
    if options is None:
        options = FalkonOptions()

    self.kernel = kernel
    self.penalty_list = penalty_list
    self.iter_list = iter_list
    if len(self.iter_list) != len(self.penalty_list):
        raise ValueError(
            "Iteration list must be of same length as penalty list "
            "(found %d and %d)" % (len(self.iter_list), len(self.penalty_list)))
    self.M = M
    self.seed = seed
    self.loss = loss
    if self.seed is not None:
        torch.manual_seed(self.seed)  # Works for both CPU and GPU
    self.random_state_ = check_random_generator(self.seed)

    self.error_fn = error_fn
    self.error_every = error_every
    # Options
    self.options = options
    self._cg_options = options.get_conjgrad_options()
    self._keops_options = options.get_keops_options()
    self._pc_options = options.get_pc_options()
    self._cholesky_opt = options.get_chol_options()
    self._lauum_opt = options.get_lauum_options()
    self._base_opt = options.get_base_options()

    self.use_cuda_ = decide_cuda(self.options)
    # These start as None and are assigned elsewhere in the class.
    self.alpha_ = None
    self.ny_points_ = None
    self.fit_times_ = None

    if isinstance(center_selection, str):
        if center_selection.lower() == 'uniform':
            self.center_selection = falkon.center_selection.UniformSel(
                self.random_state_)
        else:
            raise ValueError(
                f'Center selection "{center_selection}" is not valid.')
    else:
        self.center_selection = center_selection

    self._init_cuda()
def _decide_dmmv_impl(self, X1, X2, v, w, opt: FalkonOptions):
    """Select the `dmmv` implementation appropriate for the given inputs.

    `dmmv` functions compute double kernel-vector products (see :meth:`dmmv`
    for an explanation of what they are).

    Parameters
    ----------
    X1 : torch.Tensor
        First data matrix, of shape (N x D)
    X2 : torch.Tensor
        Second data matrix, of shape (M x D)
    v : torch.Tensor or None
        Vector for the matrix-vector multiplication (M x T)
    w : torch.Tensor or None
        Vector for the matrix-vector multiplication (N x T)
    opt : FalkonOptions
        Falkon options. Options may be specified to force GPU or CPU usage.

    Returns
    -------
    dmmv_fn
        A function which allows to perform the `dmmv` operation.

    Raises
    ------
    ValueError
        If exactly one of `X1`, `X2` is sparse.

    Notes
    -----
    The choice depends on the inputs: sparse inputs select the sparse
    implementations and a CUDA backend selects the CUDA implementations;
    otherwise the basic CPU implementation is returned. If `X1` is a CUDA
    tensor while the options select the CPU backend, a warning is emitted
    and the CUDA implementation is used anyway.
    """
    use_cuda = decide_cuda(opt)
    sparse_flags = check_sparse(X1, X2)
    if not all(sparse_flags) and any(sparse_flags):
        raise ValueError(
            "Either all or none of 'X1', 'X2' must be sparse.")

    if (X1.device.type == 'cuda') and (not use_cuda):
        warnings.warn(
            "kernel-vector double product backend was chosen to be CPU, but GPU "
            "input tensors found. Defaulting to use the GPU (note this may "
            "cause issues later). To force usage of the CPU backend, "
            "please pass CPU tensors; to avoid this warning if the GPU backend is "
            "desired, check your options (i.e. set 'use_cpu=False').")
        use_cuda = True

    inputs_sparse = all(sparse_flags)
    if not use_cuda:
        return fdmmv_cpu_sparse if inputs_sparse else fdmmv_cpu

    # Imported lazily, only on the CUDA path.
    from falkon.mmv_ops.fmmv_cuda import fdmmv_cuda, fdmmv_cuda_sparse
    return fdmmv_cuda_sparse if inputs_sparse else fdmmv_cuda
def __init__(self, penalty: float, kernel, opt: FalkonOptions):
    """Record the penalty, kernel and options; the factors start unset.

    ``_use_cuda`` is truthy only when ``decide_cuda`` accepts the options
    and the ``cpu_preconditioner`` option is off.
    """
    super().__init__()
    self.params = opt
    cuda_selected = decide_cuda(self.params)
    self._use_cuda = cuda_selected and not self.params.cpu_preconditioner

    self._lambda = penalty
    self.kernel = kernel

    # All three start as None and are filled in outside the constructor.
    self.fC: Optional[torch.Tensor] = None
    self.dT: Optional[torch.Tensor] = None
    self.dA: Optional[torch.Tensor] = None
def __init__(self, kernel, loss, opt: FalkonOptions):
    """Record the kernel, loss and options; the factors start unset.

    ``_use_cuda`` is truthy only when ``decide_cuda`` accepts the options
    and the ``cpu_preconditioner`` option is off.
    """
    super().__init__()
    self.params = opt
    cuda_selected = decide_cuda(self.params)
    self._use_cuda = cuda_selected and not self.params.cpu_preconditioner

    self.kernel = kernel
    self.loss = loss

    # All three start as None and are filled in outside the constructor.
    self.fC = None
    self.dT = None
    self.dA = None
def _decide_mm_impl(self, X1, X2, opt: FalkonOptions):
    """Select the `mm` implementation appropriate for the given inputs.

    `mm` functions compute the kernel itself so **KeOps may not be used**.

    Parameters
    ----------
    X1 : torch.Tensor
        First data matrix, of shape (N x D)
    X2 : torch.Tensor
        Second data matrix, of shape (M x D)
    opt : FalkonOptions
        Falkon options. Options may be specified to force GPU or CPU usage.

    Returns
    -------
    mm_fn
        A function which allows to perform the `mm` operation.

    Raises
    ------
    ValueError
        If exactly one of `X1`, `X2` is sparse.

    Notes
    -----
    The choice depends on the inputs: sparse inputs select the sparse
    implementations and a CUDA backend selects the CUDA implementations;
    otherwise the basic CPU implementation is returned. If `X1` is a CUDA
    tensor while the options select the CPU backend, a warning is emitted
    and the CUDA implementation is used anyway.
    """
    use_cuda = decide_cuda(opt)
    sparse_flags = check_sparse(X1, X2)
    if not all(sparse_flags) and any(sparse_flags):
        raise ValueError(
            "Either all or none of 'X1', 'X2' must be sparse.")
    inputs_sparse = all(sparse_flags)

    if (X1.device.type == 'cuda') and (not use_cuda):
        warnings.warn(
            "kernel backend was chosen to be CPU, but GPU input tensors found. "
            "Defaulting to use the GPU (note this may cause issues later). "
            "To force usage of the CPU backend, please pass CPU tensors; "
            "to avoid this warning if the GPU backend is "
            "desired, check your options (i.e. set 'use_cpu=False').")
        use_cuda = True

    if not use_cuda:
        return fmm_cpu_sparse if inputs_sparse else fmm_cpu

    # Imported lazily, only on the CUDA path.
    from falkon.mmv_ops.fmm_cuda import fmm_cuda, fmm_cuda_sparse
    return fmm_cuda_sparse if inputs_sparse else fmm_cuda
def __init__(self,
             kernel: falkon.kernels.Kernel,
             penalty: float,
             M: int,
             center_selection: Union[str, falkon.center_selection.NySel] = 'uniform',
             maxiter: int = 20,
             seed: Optional[int] = None,
             error_fn: Optional[callable] = None,
             error_every: Optional[int] = 1,
             options: Optional[FalkonOptions] = None,
             ):
    """Store the estimator configuration and derive the per-component options.

    Parameters
    ----------
    kernel
        Kernel object, stored as ``self.kernel``.
    penalty
        Stored as ``self.penalty``.
    M
        Stored as ``self.M`` (presumably the number of centers -- confirm
        against the rest of the class).
    center_selection
        Either the string ``'uniform'`` (case-insensitive), which builds a
        ``UniformSel`` from the estimator's random state, or an
        already-constructed selector object which is stored as given.
    maxiter
        Stored as ``self.maxiter``.
    seed
        When not ``None`` it seeds ``torch.manual_seed``; it is always passed
        to ``check_random_generator`` to build ``self.random_state_``.
    error_fn, error_every
        Stored unchanged on the instance.
    options
        Falkon options. When ``None`` (the default) a fresh ``FalkonOptions()``
        is created for this instance.

    Raises
    ------
    ValueError
        If `center_selection` is a string other than ``'uniform'``.
    """
    # A default of `FalkonOptions()` in the signature would be evaluated once,
    # at definition time, and shared by every estimator built without explicit
    # options. Creating it here gives each instance its own object.
    if options is None:
        options = FalkonOptions()

    self.kernel = kernel
    self.penalty = penalty
    self.M = M
    self.maxiter = maxiter
    self.seed = seed
    if self.seed is not None:
        torch.manual_seed(self.seed)  # Works for both CPU and GPU
    self.random_state_ = check_random_generator(self.seed)

    self.error_fn = error_fn
    self.error_every = error_every
    # Options
    self.options = options
    self._cg_options = options.get_conjgrad_options()
    self._keops_options = options.get_keops_options()
    self._pc_options = options.get_pc_options()
    self._cholesky_opt = options.get_chol_options()
    self._lauum_opt = options.get_lauum_options()
    self._base_opt = options.get_base_options()

    self.use_cuda_ = decide_cuda(self.options)
    # These start as None and are assigned elsewhere in the class.
    self.alpha_ = None
    self.ny_points_ = None
    self.fit_times_ = None

    if isinstance(center_selection, str):
        if center_selection.lower() == 'uniform':
            self.center_selection = falkon.center_selection.UniformSel(
                self.random_state_)
        else:
            raise ValueError(f'Center selection "{center_selection}" is not valid.')
    else:
        self.center_selection = center_selection

    self._init_cuda()
def _decide_dmmv_impl(self, X1, X2, v, w, opt: FalkonOptions):
    """Select the `dmmv` implementation from input sparsity and the backend.

    Returns one of the CUDA or CPU `fdmmv` functions (sparse or dense).
    Raises ``ValueError`` when exactly one of `X1`, `X2` is sparse.
    `v` and `w` are accepted but do not influence the choice.
    """
    use_cuda = decide_cuda(opt)
    sparse_flags = check_sparse(X1, X2)
    if not all(sparse_flags) and any(sparse_flags):
        raise ValueError(
            "Either all or none of 'X1', 'X2' must be sparse.")
    inputs_sparse = all(sparse_flags)

    if not use_cuda:
        return fdmmv_cpu_sparse if inputs_sparse else fdmmv_cpu

    # Imported lazily, only on the CUDA path.
    from falkon.mmv_ops.fmmv_cuda import fdmmv_cuda, fdmmv_cuda_sparse
    return fdmmv_cuda_sparse if inputs_sparse else fdmmv_cuda
def __init__(self,
             kernel: falkon.kernels.Kernel,
             M: Optional[int],
             center_selection: Union[str, falkon.center_selection.CenterSelector] = 'uniform',
             seed: Optional[int] = None,
             error_fn: Optional[callable] = None,
             error_every: Optional[int] = 1,
             options: Optional[FalkonOptions] = None,
             ):
    """Store the shared estimator configuration and derive component options.

    `center_selection` may be the string ``'uniform'`` (case-insensitive),
    in which case `M` is required and a ``UniformSelector`` with
    ``num_centers=M`` is created, or an already-built selector object which
    is stored as given. A falsy `options` is replaced by ``FalkonOptions()``.

    Raises
    ------
    ValueError
        If `center_selection` is a string other than ``'uniform'``, or if it
        is ``'uniform'`` and `M` is ``None``.
    """
    self.kernel = kernel
    self.M = M
    self.seed = seed
    if self.seed is not None:
        torch.manual_seed(self.seed)  # Works for both CPU and GPU
    self.random_state_ = check_random_generator(self.seed)

    self.error_fn = error_fn
    self.error_every = error_every

    # Options
    self.options = options or FalkonOptions()
    resolved_options = self.options
    self._cg_options = resolved_options.get_conjgrad_options()
    self._keops_options = resolved_options.get_keops_options()
    self._pc_options = resolved_options.get_pc_options()
    self._cholesky_opt = resolved_options.get_chol_options()
    self._base_opt = resolved_options.get_base_options()

    self.use_cuda_ = decide_cuda(self.options)
    self.num_gpus = 0
    self.alpha_ = None
    self.ny_points_ = None
    self.fit_times_ = None

    if not isinstance(center_selection, str):
        # A ready-made selector object: keep it untouched.
        self.center_selection: falkon.center_selection.CenterSelector = center_selection
    elif center_selection.lower() != 'uniform':
        raise ValueError(f'Center selection "{center_selection}" is not valid.')
    elif M is None:
        raise ValueError(
            "M must be specified when no `CenterSelector` object is provided. "
            "Specify an integer value for `M` or a `CenterSelector` object.")
    else:
        self.center_selection = falkon.center_selection.UniformSelector(
            self.random_state_, num_centers=M)
def _decide_dmmv_impl(self, X1, X2, v, w, opt: FalkonOptions):
    """Select the `dmmv` implementation from input sparsity, device and backend.

    Returns one of the CUDA or CPU `fdmmv` functions (sparse or dense).
    Raises ``ValueError`` when exactly one of `X1`, `X2` is sparse. If `X1`
    is a CUDA tensor while the options select the CPU backend, a warning is
    emitted and the CUDA implementation is used anyway. `v` and `w` are
    accepted but do not influence the choice.
    """
    use_cuda = decide_cuda(opt)
    sparse_flags = check_sparse(X1, X2)
    if not all(sparse_flags) and any(sparse_flags):
        raise ValueError("Either all or none of 'X1', 'X2' must be sparse.")

    if (X1.device.type == 'cuda') and (not use_cuda):
        warnings.warn("kernel-vector double product backend was chosen to be CPU, but GPU "
                      "input tensors found. Defaulting to use the GPU (note this may "
                      "cause issues later). To force usage of the CPU backend, "
                      "please pass CPU tensors; to avoid this warning if the GPU backend is "
                      "desired, check your options (i.e. set 'use_cpu=False').")
        use_cuda = True

    inputs_sparse = all(sparse_flags)
    if not use_cuda:
        return fdmmv_cpu_sparse if inputs_sparse else fdmmv_cpu

    # Imported lazily, only on the CUDA path.
    from falkon.mmv_ops.fmmv_cuda import fdmmv_cuda, fdmmv_cuda_sparse
    return fdmmv_cuda_sparse if inputs_sparse else fdmmv_cuda
class TestSparse:
    """Tests of `kernel.mmv` and `kernel.dmmv` on sparse inputs.

    Expected values are computed from a kernel matrix evaluated on the
    densified versions of the sparse inputs (see `s_gram`).
    """

    # Shared options; individual tests derive their own copy via `dataclasses.replace`.
    basic_options = FalkonOptions(debug=True, compute_arch_speed=False,
                                  max_cpu_mem=max_mem_sparse, max_gpu_mem=max_mem_sparse)
    # sparse_dim and sparse_density result in sparse matrices with m and n non-zero entries.
    sparse_dim = 10_000
    sparse_density = 1e-4

    @pytest.fixture(scope="class")
    def s_A(self):
        """Return the pair (sparse matrix with `n` rows, its dense torch copy)."""
        A = gen_sparse_matrix(n, self.sparse_dim, np.float64, density=self.sparse_density, seed=14)
        Ad = torch.from_numpy(A.to_scipy().todense())
        return A, Ad

    @pytest.fixture(scope="class")
    def s_B(self):
        """Return the pair (sparse matrix with `m` rows, its dense torch copy)."""
        B = gen_sparse_matrix(m, self.sparse_dim, np.float64, density=self.sparse_density, seed=14)
        Bd = torch.from_numpy(B.to_scipy().todense())
        return B, Bd

    @pytest.fixture(scope="class")
    def s_gram(self, kernel, s_A, s_B):
        """Evaluate the kernel on the dense copies of `s_A` and `s_B`, on the CPU."""
        opt = FalkonOptions(use_cpu=True, compute_arch_speed=False)
        return kernel(s_A[1], s_B[1], opt=opt)  # n x m kernel

    @pytest.fixture(scope="class")
    def s_expected_fmmv(self, s_gram, v):
        """Expected `mmv` result: the kernel matrix times `v`."""
        return s_gram @ v

    @pytest.fixture(scope="class")
    def s_e_dfmmv1(self, s_gram, v, w):
        """Expected `dmmv` result when both `v` and `w` are given."""
        return s_gram.T @ (s_gram @ v + w)

    @pytest.fixture(scope="class")
    def s_e_dfmmv2(self, s_gram, v):
        """Expected `dmmv` result when only `v` is given."""
        return s_gram.T @ (s_gram @ v)

    @pytest.fixture(scope="class")
    def s_e_dfmmv3(self, s_gram, w):
        """Expected `dmmv` result when only `w` is given."""
        return s_gram.T @ w

    @pytest.fixture(scope="class")
    def s_e_dfmmv(self, request):
        """Indirect fixture: resolve the fixture whose name is passed as the parameter."""
        return request.getfixturevalue(request.param)

    @pytest.mark.parametrize("cpu", cpu_params, ids=["cpu", "gpu"])
    @pytest.mark.parametrize(
        "Adt,Bdt,vo,vdt", [
            (np.float32, np.float32, "F", np.float32),
            (np.float32, np.float32, "C", np.float32),
            (np.float64, np.float64, "F", np.float64),
            (np.float64, np.float64, "C", np.float64),
        ], ids=["A32-B32-vF32", "A32-B32-vC32", "A64-B64-vF64", "A64-B64-vC64"])
    def test_fmmv(self, s_A, s_B, v, Adt, Bdt, vo, vdt, kernel, s_expected_fmmv, cpu):
        """Check `kernel.mmv` on sparse `A`, `B` for each dtype and `v` order.

        Each case runs twice: with `out=None` and with a preallocated `out`.
        """
        A = fix_sparse_mat(s_A[0], dtype=Adt)
        B = fix_sparse_mat(s_B[0], dtype=Bdt)
        v = fix_mat(v, dtype=vdt, order=vo, copy=True)
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu)
        rtol = choose_on_dtype(A.dtype)

        # Test normal
        _run_fmmv_test(kernel.mmv, s_expected_fmmv, (A, B, v), out=None, rtol=rtol, opt=opt)
        # Test with out
        out = torch.empty(A.shape[0], v.shape[1], dtype=A.dtype)
        _run_fmmv_test(kernel.mmv, s_expected_fmmv, (A, B, v), out=out, rtol=rtol, opt=opt)

    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    @pytest.mark.parametrize("Adt,Bdt,vo,vdt",
                             [(np.float32, np.float32, "F", np.float32)],
                             ids=["A32-B32-vF32"])
    @pytest.mark.xfail(reason="Squared-norm not implemented for CUDA tensors", run=True)
    def test_fmmv_input_device(self, s_A, s_B, v, Adt, Bdt, vo, vdt, kernel, s_expected_fmmv):
        """Run `kernel.mmv` with all inputs on "cuda:0"; currently an expected failure."""
        input_device = "cuda:0"
        A = fix_sparse_mat(s_A[0], dtype=Adt, device=input_device)
        B = fix_sparse_mat(s_B[0], dtype=Bdt, device=input_device)
        v = fix_mat(v, dtype=vdt, order=vo, copy=True, device=input_device)
        opt = dataclasses.replace(self.basic_options, use_cpu=False)
        rtol = choose_on_dtype(A.dtype)

        # Test normal
        _run_fmmv_test(kernel.mmv, s_expected_fmmv, (A, B, v), out=None, rtol=rtol, opt=opt)
        # Test with out
        out = torch.empty(A.shape[0], v.shape[1], dtype=A.dtype, device=input_device)
        _run_fmmv_test(kernel.mmv, s_expected_fmmv, (A, B, v), out=out, rtol=rtol, opt=opt)

    # The last parameter names the fixture holding the expected value; it is
    # resolved through the indirect `s_e_dfmmv` fixture. A `None` order/dtype
    # pair is used in the cases whose expected value does not involve that vector.
    @pytest.mark.parametrize("cpu", cpu_params, ids=["cpu", "gpu"])
    @pytest.mark.parametrize(
        "Adt,Bdt,vo,vdt,wo,wdt,s_e_dfmmv", [
            pytest.param(n32, n32, "F", n32, "F", n32, "s_e_dfmmv1", marks=mark.usefixtures("s_e_dfmmv1")),
            pytest.param(n32, n32, "C", n32, "C", n32, "s_e_dfmmv1", marks=mark.usefixtures("s_e_dfmmv1")),
            pytest.param(n64, n64, "F", n64, "F", n64, "s_e_dfmmv1", marks=mark.usefixtures("s_e_dfmmv1")),
            pytest.param(n64, n64, "C", n64, "C", n64, "s_e_dfmmv1", marks=mark.usefixtures("s_e_dfmmv1")),
            pytest.param(n32, n32, "F", n32, None, None, "s_e_dfmmv2", marks=mark.usefixtures("s_e_dfmmv2")),
            pytest.param(n32, n32, "C", n32, None, None, "s_e_dfmmv2", marks=mark.usefixtures("s_e_dfmmv2")),
            pytest.param(n64, n64, "F", n64, None, None, "s_e_dfmmv2", marks=mark.usefixtures("s_e_dfmmv2")),
            pytest.param(n64, n64, "C", n64, None, None, "s_e_dfmmv2", marks=mark.usefixtures("s_e_dfmmv2")),
            pytest.param(n32, n32, None, None, "F", n32, "s_e_dfmmv3", marks=mark.usefixtures("s_e_dfmmv3")),
            pytest.param(n32, n32, None, None, "C", n32, "s_e_dfmmv3", marks=mark.usefixtures("s_e_dfmmv3")),
            pytest.param(n64, n64, None, None, "F", n64, "s_e_dfmmv3", marks=mark.usefixtures("s_e_dfmmv3")),
            pytest.param(n64, n64, None, None, "C", n64, "s_e_dfmmv3", marks=mark.usefixtures("s_e_dfmmv3")),
            # A few mixed-contiguity examples
            pytest.param(n32, n32, "C", n32, "F", n32, "s_e_dfmmv1", marks=mark.usefixtures("s_e_dfmmv1")),
        ], ids=[
            "32-32-vF32-wF32", "32-32-vC32-wC32", "64-64-vF64-wF64", "64-64-vC64-wC64",
            "32-32-vF32", "32-32-vC32", "64-64-vF64", "64-64-vC64",
            "32-32-wF32", "32-32-wC32", "64-64-wF64", "64-64-wC64",
            "32-32-vC32-wF32"
        ], indirect=["s_e_dfmmv"])
    def test_dfmmv(self, s_A, s_B, v, w, Adt, Bdt, vo, vdt, wo, wdt, kernel, s_e_dfmmv, cpu):
        """Check `kernel.dmmv` on sparse `A`, `B` against the selected expected value.

        Each case runs twice: with `out=None` and with a preallocated `out`.
        """
        A = fix_sparse_mat(s_A[0], dtype=Adt)
        B = fix_sparse_mat(s_B[0], dtype=Bdt)
        v = fix_mat(v, order=vo, dtype=vdt)
        w = fix_mat(w, order=wo, dtype=wdt)
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu)
        rtol = choose_on_dtype(A.dtype)

        # Test normal
        _run_fmmv_test(kernel.dmmv, s_e_dfmmv, (A, B, v, w), out=None, rtol=rtol, opt=opt)
        # Test with out
        out = torch.empty(m, t, dtype=A.dtype)
        _run_fmmv_test(kernel.dmmv, s_e_dfmmv, (A, B, v, w), out=out, rtol=rtol, opt=opt)

    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    @pytest.mark.xfail(reason="Squared-norm not implemented for CUDA tensors", run=True)
    @pytest.mark.parametrize(
        "Adt,Bdt,vo,vdt,wo,wdt,s_e_dfmmv", [
            pytest.param(n32, n32, "F", n32, "F", n32, "s_e_dfmmv1", marks=mark.usefixtures("s_e_dfmmv1")),
            pytest.param(n32, n32, "F", n32, None, None, "s_e_dfmmv2", marks=mark.usefixtures("s_e_dfmmv2")),
            pytest.param(n32, n32, None, None, "F", n32, "s_e_dfmmv3", marks=mark.usefixtures("s_e_dfmmv3")),
        ], ids=["32-32-vF32-wF32", "32-32-vF32", "32-32-wF32"], indirect=["s_e_dfmmv"])
    def test_dfmmv_input_devices(self, s_A, s_B, v, w, Adt, Bdt, vo, vdt, wo, wdt, kernel, s_e_dfmmv):
        """Run `kernel.dmmv` with all inputs on "cuda:0"; currently an expected failure."""
        input_device = "cuda:0"
        A = fix_sparse_mat(s_A[0], dtype=Adt, device=input_device)
        B = fix_sparse_mat(s_B[0], dtype=Bdt, device=input_device)
        v = fix_mat(v, order=vo, dtype=vdt, device=input_device)
        w = fix_mat(w, order=wo, dtype=wdt, device=input_device)
        opt = dataclasses.replace(self.basic_options, use_cpu=False)
        rtol = choose_on_dtype(A.dtype)

        # Test normal
        _run_fmmv_test(kernel.dmmv, s_e_dfmmv, (A, B, v, w), out=None, rtol=rtol, opt=opt)
        # Test with out
        out = torch.empty(m, t, dtype=A.dtype, device=input_device)
        _run_fmmv_test(kernel.dmmv, s_e_dfmmv, (A, B, v, w), out=out, rtol=rtol, opt=opt)
class TestDense:
    """Tests of `kernel.mmv` and `kernel.dmmv` on dense inputs with KeOps disabled."""

    # Shared options; individual tests derive their own copy via `dataclasses.replace`.
    basic_options = FalkonOptions(debug=True, compute_arch_speed=False, keops_active="no",
                                  max_gpu_mem=max_mem_dense, max_cpu_mem=max_mem_dense)

    @pytest.mark.parametrize(
        "Ao,Adt,Bo,Bdt,vo,vdt", [
            ("F", np.float32, "F", np.float32, "F", np.float32),
            ("C", np.float32, "C", np.float32, "C", np.float32),
            ("F", np.float64, "F", np.float64, "F", np.float64),
            ("C", np.float64, "C", np.float64, "C", np.float64),
            # A few mixed-contiguity examples
            ("F", np.float32, "C", np.float32, "F", np.float32),
            ("F", np.float32, "C", np.float32, "C", np.float32),
        ], ids=[
            "AF32-BF32-vF32", "AC32-BC32-vC32", "AF64-BF64-vF64", "AC64-BC64-vC64",
            "AF32-BC32-vF32", "AF32-BC32-vC32"
        ])
    @pytest.mark.parametrize("cpu", cpu_params, ids=["cpu", "gpu"])
    def test_fmmv(self, A, B, v, Ao, Adt, Bo, Bdt, vo, vdt, kernel, expected_fmmv, cpu):
        """Check `kernel.mmv` against `expected_fmmv` for each order/dtype combination.

        Each case runs twice: with `out=None` and with a preallocated `out`.
        """
        A = fix_mat(A, order=Ao, dtype=Adt)
        B = fix_mat(B, order=Bo, dtype=Bdt)
        v = fix_mat(v, order=vo, dtype=vdt)
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu)
        rtol = choose_on_dtype(A.dtype)

        # Test normal
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=None, rtol=rtol, opt=opt)
        # Test with out
        out = torch.empty(A.shape[0], v.shape[1], dtype=A.dtype)
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=out, rtol=rtol, opt=opt)

    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    @pytest.mark.parametrize("Ao,Adt,Bo,Bdt,vo,vdt", [
        ("F", np.float32, "F", np.float32, "F", np.float32),
    ], ids=["AF32-BF32-vF32"])
    def test_fmmv_input_device(self, A, B, v, Ao, Adt, Bo, Bdt, vo, vdt, kernel, expected_fmmv):
        """Check `kernel.mmv` when all inputs (and `out`) are on "cuda:0"."""
        input_device = "cuda:0"
        A = fix_mat(A, order=Ao, dtype=Adt, device=input_device)
        B = fix_mat(B, order=Bo, dtype=Bdt, device=input_device)
        v = fix_mat(v, order=vo, dtype=vdt, device=input_device)
        opt = dataclasses.replace(self.basic_options, use_cpu=False)
        rtol = choose_on_dtype(A.dtype)

        # Test normal
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=None, rtol=rtol, opt=opt)
        # Test with out
        out = torch.empty(A.shape[0], v.shape[1], dtype=A.dtype, device=input_device)
        _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=out, rtol=rtol, opt=opt)

    # The last parameter names the fixture holding the expected value; it is
    # passed indirectly to the `e_dfmmv` fixture (defined outside this class).
    # A `None` order/dtype pair is used in the `e_dfmmv2` / `e_dfmmv3` cases.
    @pytest.mark.parametrize("cpu", cpu_params, ids=["cpu", "gpu"])
    @pytest.mark.parametrize(
        "Ao,Adt,Bo,Bdt,vo,vdt,wo,wdt,e_dfmmv", [
            pytest.param("F", n32, "F", n32, "F", n32, "F", n32, "e_dfmmv1", marks=mark.usefixtures("e_dfmmv1")),
            pytest.param("C", n32, "C", n32, "C", n32, "C", n32, "e_dfmmv1", marks=mark.usefixtures("e_dfmmv1")),
            pytest.param("F", n64, "F", n64, "F", n64, "F", n64, "e_dfmmv1", marks=mark.usefixtures("e_dfmmv1")),
            pytest.param("C", n64, "C", n64, "C", n64, "C", n64, "e_dfmmv1", marks=mark.usefixtures("e_dfmmv1")),
            pytest.param("F", n32, "F", n32, "F", n32, None, None, "e_dfmmv2", marks=mark.usefixtures("e_dfmmv2")),
            pytest.param("C", n32, "C", n32, "C", n32, None, None, "e_dfmmv2", marks=mark.usefixtures("e_dfmmv2")),
            pytest.param("F", n64, "F", n64, "F", n64, None, None, "e_dfmmv2", marks=mark.usefixtures("e_dfmmv2")),
            pytest.param("C", n64, "C", n64, "C", n64, None, None, "e_dfmmv2", marks=mark.usefixtures("e_dfmmv2")),
            pytest.param("F", n32, "F", n32, None, None, "F", n32, "e_dfmmv3", marks=mark.usefixtures("e_dfmmv3")),
            pytest.param("C", n32, "C", n32, None, None, "C", n32, "e_dfmmv3", marks=mark.usefixtures("e_dfmmv3")),
            pytest.param("F", n64, "F", n64, None, None, "F", n64, "e_dfmmv3", marks=mark.usefixtures("e_dfmmv3")),
            pytest.param("C", n64, "C", n64, None, None, "C", n64, "e_dfmmv3", marks=mark.usefixtures("e_dfmmv3")),
            # A few mixed-contiguity examples
            pytest.param("F", n32, "C", n32, "C", n32, "F", n32, "e_dfmmv1", marks=mark.usefixtures("e_dfmmv1")),
        ], ids=[
            "F32-F32-vF32-wF32", "C32-C32-vC32-wC32", "F64-F64-vF64-wF64", "C64-C64-vC64-wC64",
            "F32-F32-vF32", "C32-C32-vC32", "F64-F64-vF64", "C64-C64-vC64",
            "F32-F32-wF32", "C32-C32-wC32", "F64-F64-wF64", "C64-C64-wC64",
            "F32-C32-vC32-wF32"
        ], indirect=["e_dfmmv"])
    def test_dfmmv(self, A, B, v, w, Ao, Adt, Bo, Bdt, vo, vdt, wo, wdt, kernel, e_dfmmv, cpu):
        """Check `kernel.dmmv` against the selected expected value.

        Each case runs twice: with `out=None` and with a preallocated `out`.
        """
        A = fix_mat(A, order=Ao, dtype=Adt)
        B = fix_mat(B, order=Bo, dtype=Bdt)
        v = fix_mat(v, order=vo, dtype=vdt)
        w = fix_mat(w, order=wo, dtype=wdt)
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu)
        rtol = choose_on_dtype(A.dtype)

        # Test normal
        _run_fmmv_test(kernel.dmmv, e_dfmmv, (A, B, v, w), out=None, rtol=rtol, opt=opt)
        # Test with out
        out = torch.empty(m, t, dtype=A.dtype)
        _run_fmmv_test(kernel.dmmv, e_dfmmv, (A, B, v, w), out=out, rtol=rtol, opt=opt)

    @pytest.mark.parametrize(
        "Ao,Adt,Bo,Bdt,vo,vdt,wo,wdt,e_dfmmv", [
            pytest.param("F", n32, "F", n32, "F", n32, "F", n32, "e_dfmmv1", marks=mark.usefixtures("e_dfmmv1")),
            pytest.param("F", n32, "F", n32, "F", n32, None, None, "e_dfmmv2", marks=mark.usefixtures("e_dfmmv2")),
            pytest.param("F", n32, "F", n32, None, None, "F", n32, "e_dfmmv3", marks=mark.usefixtures("e_dfmmv3"))
        ], ids=["F32-F32-vF32-wF32", "F32-F32-vF32", "F32-F32-wF32"], indirect=["e_dfmmv"])
    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    def test_dfmmv_input_device(self, A, B, v, w, Ao, Adt, Bo, Bdt, vo, vdt, wo, wdt, kernel, e_dfmmv):
        """Check `kernel.dmmv` when all inputs (and `out`) are on "cuda:0"."""
        input_device = "cuda:0"
        A = fix_mat(A, order=Ao, dtype=Adt, device=input_device)
        B = fix_mat(B, order=Bo, dtype=Bdt, device=input_device)
        v = fix_mat(v, order=vo, dtype=vdt, device=input_device)
        w = fix_mat(w, order=wo, dtype=wdt, device=input_device)
        opt = dataclasses.replace(self.basic_options, use_cpu=False)
        rtol = choose_on_dtype(A.dtype)

        # Test normal
        _run_fmmv_test(kernel.dmmv, e_dfmmv, (A, B, v, w), out=None, rtol=rtol, opt=opt)
        # Test with out
        out = torch.empty(m, t, dtype=A.dtype, device=input_device)
        _run_fmmv_test(kernel.dmmv, e_dfmmv, (A, B, v, w), out=out, rtol=rtol, opt=opt)

    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    def test_incorrect_dev_setting(self, A, B, v, w, kernel, e_dfmmv1, expected_fmmv):
        """Expect a `UserWarning` from both `dmmv` and `mmv` when the options
        request the CPU (`use_cpu=True`) but the inputs are CUDA tensors."""
        # tests when use_cpu = True, but CUDA input tensors
        A = A.cuda()
        B = B.cuda()
        v = v.cuda()
        w = w.cuda()
        opt = dataclasses.replace(self.basic_options, use_cpu=True)
        rtol = choose_on_dtype(A.dtype)

        with pytest.warns(
                UserWarning,
                match='backend was chosen to be CPU, but GPU input tensors found'):
            _run_fmmv_test(kernel.dmmv, e_dfmmv1, (A, B, v, w), out=None, rtol=rtol, opt=opt)

        with pytest.warns(
                UserWarning,
                match='backend was chosen to be CPU, but GPU input tensors found'):
            _run_fmmv_test(kernel.mmv, expected_fmmv, (A, B, v), out=None, rtol=rtol, opt=opt)
from falkon.tests.gen_random import gen_random, gen_sparse_matrix
from falkon.utils import decide_cuda

# Short aliases for the two floating-point dtypes used in the parametrizations.
n32 = np.float32
n64 = np.float64

# Global dimensions
n = 1000
m = 850
d = 10
t = 5

# Memory limits handed to FalkonOptions by the dense and sparse test classes.
max_mem_dense = 0.5 * 2**20
max_mem_sparse = 0.5 * 2**20

# Values for the `cpu` parameter; the False entry is skipped when
# `decide_cuda()` is falsy.
_gpu_unavailable = mark.skipif(not decide_cuda(), reason="No GPU found.")
cpu_params = [
    pytest.param(True),
    pytest.param(False, marks=[_gpu_unavailable]),
]


def choose_on_dtype(dtype):
    """Return the relative tolerance for `dtype`.

    ``1e-12`` for ``np.float64`` / ``torch.float64``, ``1e-4`` for anything else.
    """
    is_double = dtype == np.float64 or dtype == torch.float64
    return 1e-12 if is_double else 1e-4


def numpy_to_torch_type(dt):
    """Map ``np.float32`` / ``np.float64`` to the matching torch dtype.

    Any other input yields ``None``.
    """
    if dt == np.float32:
        return torch.float32
    if dt == np.float64:
        return torch.float64
    return None
class TestWeightedFalkon:
    """Compare a weighted estimator against an unweighted one on classification data."""

    @pytest.mark.parametrize("cuda_usage", [
        pytest.param("incore", marks=[
            pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
        ]),
        pytest.param("mixed", marks=[
            pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
        ]),
        "cpu_only",
    ])
    def test_classif(self, cls_data, cuda_usage):
        """Fit with and without `weight_fn` and compare the per-class errors.

        With the label -1 weighted twice as much as +1, the weighted model
        must have a strictly lower error on the -1 class and an error on the
        +1 class that is not lower than the unweighted model's.
        """
        X, Y = cls_data
        # "incore" moves the data to the GPU and uses InCoreFalkon; the other
        # modes keep the data where it is and use Falkon.
        if cuda_usage == "incore":
            X, Y = X.cuda(), Y.cuda()
            flk_cls = InCoreFalkon
        else:
            flk_cls = Falkon
        kernel = kernels.GaussianKernel(2.0)

        def error_fn(t, p):
            # Percentage of entries where `t * p <= 0` (i.e. the signs disagree
            # or one of them is zero), together with the error's label.
            return 100 * torch.sum(t * p <= 0).to(
                torch.float32) / t.shape[0], "c-err"

        def weight_fn(y):
            # Weight 1 where y == 1 and weight 2 where y == -1.
            # Entries with any other value keep whatever `empty_like` produced.
            weight = torch.empty_like(y)
            weight[y == 1] = 1
            weight[y == -1] = 2
            return weight

        opt = FalkonOptions(use_cpu=cuda_usage == "cpu_only", keops_active="no", debug=False)

        # Weighted model.
        flk_weight = flk_cls(kernel=kernel, penalty=1e-6, M=500, seed=10,
                             options=opt, error_fn=error_fn, weight_fn=weight_fn)
        flk_weight.fit(X, Y)
        preds_weight = flk_weight.predict(X)
        preds_weight_m1 = preds_weight[Y == -1]
        preds_weight_p1 = preds_weight[Y == 1]
        err_weight_m1 = error_fn(preds_weight_m1, Y[Y == -1])[0]
        err_weight_p1 = error_fn(preds_weight_p1, Y[Y == 1])[0]

        # Unweighted model with otherwise identical settings.
        flk = flk_cls(kernel=kernel, penalty=1e-6, M=500, seed=10,
                      options=opt, error_fn=error_fn, weight_fn=None)
        flk.fit(X, Y)
        preds = flk.predict(X)
        preds_m1 = preds[Y == -1]
        preds_p1 = preds[Y == 1]
        err_m1 = error_fn(preds_m1, Y[Y == -1])[0]
        err_p1 = error_fn(preds_p1, Y[Y == 1])[0]

        print(
            "Weighted errors: -1 (%f) +1 (%f) -- Normal errors: -1 (%f) +1 (%f)"
            % (err_weight_m1, err_weight_p1, err_m1, err_p1))

        assert err_weight_m1 < err_m1, "Error of weighted class is higher than without weighting"
        assert err_weight_p1 >= err_p1, "Error of unweighted class is lower than in flk with no weights"
import dataclasses

import numpy as np
import pytest
import scipy.linalg.lapack as scll
import torch

from falkon.options import FalkonOptions
from falkon.tests.conftest import memory_checker, fix_mat
from falkon.utils import decide_cuda
from falkon.utils.helpers import sizeof_dtype
from falkon.utils.tensor_helpers import move_tensor
from falkon.ooc_ops.ooc_utils import calc_block_sizes3

# The GPU LAUUM modules are imported only when `decide_cuda()` is truthy.
if decide_cuda():
    from falkon.ooc_ops.ooc_lauum import gpu_lauum
    # noinspection PyUnresolvedReferences
    from falkon.ooc_ops.cuda import cuda_lauum


class TestBlockSizeCalculator:
    """Checks of `calc_block_sizes3` on small inputs."""

    def test_small_edge(self):
        """With `max_block_size=1` every row gets its own block, even when
        there are more devices than rows."""
        assert calc_block_sizes3(max_block_size=1, num_devices=4, num_rows=3) == [1, 1, 1]
        assert calc_block_sizes3(max_block_size=1, num_devices=5, num_rows=1) == [1]

    def test_small(self):
        """100 rows with `max_block_size=10000` and two devices give one block of 100."""
        assert calc_block_sizes3(max_block_size=10000, num_devices=2, num_rows=100) == [100]
def initialize_cuda():
    """Run ``initialization.init`` with GPU-enabled options when CUDA is usable.

    Does nothing beyond building the options when ``decide_cuda()`` is falsy.
    """
    # your setup code goes here, executed ahead of first test
    cuda_opt = BaseOptions(compute_arch_speed=False, use_cpu=False)
    if not decide_cuda():
        return
    initialization.init(cuda_opt)
import dataclasses
from contextlib import contextmanager

import numpy as np
import pytest
import torch
import torch.cuda as tcd

from falkon.utils.tensor_helpers import move_tensor
from falkon.options import BaseOptions, FalkonOptions
from falkon.utils import decide_cuda
from falkon.utils.devices import _cpu_used_mem

# `initialization` is only imported (and only needed) when CUDA is usable.
if decide_cuda():
    from falkon.cuda import initialization


@pytest.fixture(scope="session", autouse=True)
def initialize_cuda():
    """Session-wide, automatically used fixture: call ``initialization.init``
    with GPU-enabled options when ``decide_cuda()`` is truthy."""
    # your setup code goes here, executed ahead of first test
    opt = BaseOptions(compute_arch_speed=False, use_cpu=False)
    if decide_cuda():
        initialization.init(opt)


@contextmanager
def memory_checker(opt: FalkonOptions, extra_mem=0):
    is_cpu = opt.use_cpu
    mem_check = False
    # Checking is switched on only when the device selected by `use_cpu`
    # has a finite memory limit configured.
    if (is_cpu and opt.max_cpu_mem < np.inf) or (not is_cpu and opt.max_gpu_mem < np.inf):
        mem_check = True
class TestFalkonPreconditioner:
    """Invariant checks for `FalkonPreconditioner` on CPU and GPU."""

    # Relative tolerance per dtype.
    rtol = {np.float64: 1e-10, np.float32: 1e-2}
    # Shared options; individual tests derive their own copy via `dataclasses.replace`.
    basic_opt = FalkonOptions(compute_arch_speed=False, no_single_kernel=True)

    @pytest.mark.parametrize("order", ["C", "F"])
    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    @pytest.mark.parametrize("cpu", [
        pytest.param(True),
        pytest.param(False, marks=[
            pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
        ])
    ], ids=["cpu", "gpu"])
    def test_simple(self, mat, kernel, gram, cpu, dtype, order):
        """Initialise a preconditioner with penalty 100 and check all four invariants."""
        # `use_cpu` and `cpu_preconditioner` are switched together.
        opt = dataclasses.replace(self.basic_opt, use_cpu=cpu, cpu_preconditioner=cpu)
        rtol = self.rtol[dtype]

        mat = fix_mat(mat, dtype=dtype, order=order, copy=True)
        gram = fix_mat(gram, dtype=dtype, order=order, copy=True)

        la = 100
        prec = FalkonPreconditioner(la, kernel, opt)
        prec.init(mat)
        assert_invariant_on_TT(prec, gram, tol=rtol)
        assert_invariant_on_AT(prec, gram, la, tol=rtol)
        # The last two checks use a tolerance ten times looser.
        assert_invariant_on_T(prec, gram, tol=rtol * 10)
        assert_invariant_on_prec(prec, N, gram, la, tol=rtol * 10)

    @pytest.mark.parametrize("cpu", [
        pytest.param(True),
        pytest.param(False, marks=[
            pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
        ])
    ], ids=["cpu", "gpu"])
    def test_zero_lambda(self, mat, kernel, gram, cpu):
        """Same invariants as `test_simple`, in float64 with a penalty of zero."""
        opt = dataclasses.replace(self.basic_opt, use_cpu=cpu, cpu_preconditioner=cpu)
        mat = fix_mat(mat, dtype=np.float64, order="K", copy=True)
        gram = fix_mat(gram, dtype=np.float64, order="K", copy=True)

        la = 0
        prec = FalkonPreconditioner(la, kernel, opt)
        prec.init(mat)
        assert_invariant_on_TT(prec, gram, tol=1e-10)
        assert_invariant_on_AT(prec, gram, la, tol=1e-10)
        assert_invariant_on_T(prec, gram, tol=1e-9)
        assert_invariant_on_prec(prec, N, gram, la, tol=1e-8)

    @pytest.mark.parametrize("order", ["C", "F"])
    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    def test_cuda_start(self, mat, kernel, gram, dtype, order):
        """Initialise one preconditioner from a CPU matrix and one from a
        "cuda:0" copy, then check that their factors agree and that the GPU
        one keeps its factor on the input's device."""
        opt = dataclasses.replace(self.basic_opt, use_cpu=False, cpu_preconditioner=False)
        rtol = self.rtol[dtype]

        mat = fix_mat(mat, dtype=dtype, order=order, copy=True)
        gpu_mat = move_tensor(mat, "cuda:0")
        gram = fix_mat(gram, dtype=dtype, order=order, copy=True)
        gpu_gram = move_tensor(gram, "cuda:0")

        la = 1

        prec = FalkonPreconditioner(la, kernel, opt)
        prec.init(mat)

        gpu_prec = FalkonPreconditioner(la, kernel, opt)
        gpu_prec.init(gpu_mat)

        # Factors computed from CPU and GPU inputs must match.
        np.testing.assert_allclose(prec.dT.numpy(), gpu_prec.dT.cpu().numpy(), rtol=rtol)
        np.testing.assert_allclose(prec.dA.numpy(), gpu_prec.dA.cpu().numpy(), rtol=rtol)
        np.testing.assert_allclose(prec.fC.numpy(), gpu_prec.fC.cpu().numpy(), rtol=rtol * 10)
        assert gpu_prec.fC.device == gpu_mat.device, "Device changed unexpectedly"

        assert_invariant_on_TT(gpu_prec, gpu_gram, tol=rtol)
        # NOTE(review): the three checks below run on the CPU `prec`/`gram`,
        # not on `gpu_prec`/`gpu_gram` -- confirm this is intentional.
        assert_invariant_on_AT(prec, gram, la, tol=rtol)
        assert_invariant_on_T(prec, gram, tol=rtol * 10)
        assert_invariant_on_prec(prec, N, gram, la, tol=rtol * 10)
@pytest.fixture
def colmaj_arr() -> torch.Tensor:
    """Tensor wrapping ``gen_random(M, D, 'float64', True)``."""
    random_data = gen_random(M, D, "float64", True)
    return torch.from_numpy(random_data)


@pytest.fixture
def uniform_sel() -> UniformSelector:
    """Uniform selector driven by a NumPy generator seeded with 0."""
    seeded_rng = np.random.default_rng(0)
    return UniformSelector(seeded_rng)


@pytest.mark.parametrize(
    "device",
    [
        pytest.param("cpu"),
        pytest.param(
            "cuda:0",
            marks=[pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")],
        ),
    ],
)
def test_c_order(uniform_sel, rowmaj_arr, device):
    """Selecting 100 rows must keep stride, dtype and device of the input."""
    data = rowmaj_arr.to(device=device)
    selected = uniform_sel.select(data, None, 100)

    assert selected.stride() == (D, 1), "UniformSel changed input stride"
    assert selected.size() == (100, D), "UniformSel did not output correct size"
    assert selected.dtype == data.dtype
    assert selected.device == data.device


def test_cuda(uniform_sel, rowmaj_arr):
    """Select 100 rows from the fixture tensor and check stride, size, dtype."""
    # NOTE(review): despite its name this test never moves the input to a
    # CUDA device and carries no GPU skip marker -- confirm the intent.
    selected = uniform_sel.select(rowmaj_arr, None, 100)

    assert selected.stride() == (D, 1), "UniformSel changed input stride"
    assert selected.size() == (100, D), "UniformSel did not output correct size"
    assert selected.dtype == rowmaj_arr.dtype
rtol=rtol) np.testing.assert_allclose(prec.dA.numpy(), gpu_prec.dA.cpu().numpy(), rtol=rtol) np.testing.assert_allclose(prec.fC.numpy(), gpu_prec.fC.cpu().numpy(), rtol=rtol * 10) assert gpu_prec.fC.device == gpu_mat.device, "Device changed unexpectedly" assert_invariant_on_TT(gpu_prec, gpu_gram, tol=rtol) assert_invariant_on_AT(prec, gram, la, tol=rtol) assert_invariant_on_T(prec, gram, tol=rtol * 10) assert_invariant_on_prec(prec, N, gram, la, tol=rtol * 10) @unittest.skipIf(not decide_cuda(), "No GPU found.") def test_cpu_gpu_equality(mat, kernel, gram): la = 12.3 mat = fix_mat(mat, dtype=np.float64, order="F", copy=True) opt = FalkonOptions(compute_arch_speed=False, use_cpu=False, cpu_preconditioner=False) prec_gpu = FalkonPreconditioner(la, kernel, opt) prec_gpu.init(mat) opt = dataclasses.replace(opt, use_cpu=True, cpu_preconditioner=True) prec_cpu = FalkonPreconditioner(la, kernel, opt) prec_cpu.init(mat)
def mat():
    """Random float64 data from ``gen_random(M, M, ...)`` with seed 10."""
    square = gen_random(M, M, "float64", F=True, seed=10)
    return square


@pytest.fixture(scope="module")
def arr():
    """Random float64 data from ``gen_random(M, T, ...)`` with seed 12."""
    rhs = gen_random(M, T, "float64", F=True, seed=12)
    return rhs


@pytest.mark.parametrize("order", ["C", "F"])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("lower", [True, False], ids=["lower", "upper"])
@pytest.mark.parametrize(
    "transpose", [True, False], ids=["transpose", "no_transpose"])
@pytest.mark.parametrize(
    "device",
    [
        pytest.param("cpu"),
        pytest.param(
            "cuda:0",
            marks=[pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")],
        ),
    ],
)
def test_trsm_wrapper(mat, arr, dtype, order, device, lower, transpose):
    """Compare ``trsm`` with SciPy's BLAS ``dtrsm`` on the same inputs.

    C-ordered inputs on a CUDA device are expected to be rejected with a
    ``ValueError``; every other combination must match the reference.
    """
    if dtype == np.float32:
        rtol = 1e-2
    else:
        rtol = 1e-11

    dev_mat = move_tensor(
        fix_mat(mat, dtype=dtype, order=order, copy=True), device=device)
    dev_arr = move_tensor(
        fix_mat(arr, dtype=dtype, order=order, copy=True), device=device)

    # The reference is always computed from the unconverted fixture data.
    reference = sclb.dtrsm(
        1e-2, mat, arr, side=0, lower=lower, trans_a=transpose, overwrite_b=0)

    wants_error = device.startswith("cuda") and order == "C"
    if wants_error:
        with pytest.raises(ValueError):
            trsm(dev_arr, dev_mat, alpha=1e-2, lower=lower, transpose=transpose)
        return

    solved = trsm(dev_arr, dev_mat, alpha=1e-2, lower=lower, transpose=transpose)
    np.testing.assert_allclose(reference, solved.cpu().numpy(), rtol=rtol)
@pytest.fixture(scope="module") def w() -> torch.Tensor: return torch.from_numpy(gen_random(n, t, 'float64', False, seed=92)) @pytest.fixture(scope="module") def rtol() -> dict: return { torch.float32: 1e-5, torch.float64: 1e-12, } @pytest.mark.parametrize("cpu", [ pytest.param(True), pytest.param(False, marks=[pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")]) ], ids=["cpu", "gpu"]) class AbstractKernelTester(abc.ABC): max_mem = 2 * 2**20 basic_options = FalkonOptions(debug=True, compute_arch_speed=False, max_cpu_mem=max_mem, max_gpu_mem=max_mem) @pytest.fixture(scope="class") def exp_v(self, exp_k: np.ndarray, v: torch.Tensor) -> np.ndarray: return exp_k @ v.numpy() @pytest.fixture(scope="class") def exp_dv(self, exp_k: np.ndarray, v: torch.Tensor) -> np.ndarray: return exp_k.T @ (exp_k @ v.numpy()) @pytest.fixture(scope="class")
class TestMatMul:
    """Tests of ``sparse_matmul`` on CPU and CUDA inputs.

    Judging by the asserted error messages, the CPU path wants a CSR left
    operand with a CSC right operand, while the CUDA path wants both
    operands in CSR format.
    """

    @pytest.fixture(scope="class")
    def mat1(self):
        """Dense 200x10 left operand."""
        return torch.randn(200, 10)

    @pytest.fixture(scope="class")
    def mat2(self):
        """Dense 10x100 right operand."""
        return torch.randn(10, 100)

    @pytest.fixture(scope="class")
    def expected(self, mat1, mat2):
        """Dense reference product of the two operands."""
        return mat1 @ mat2

    @pytest.mark.parametrize(
        "device",
        [
            "cpu",
            pytest.param(
                "cuda:0",
                marks=pytest.mark.skipif(not decide_cuda(), reason="No GPU found."),
            ),
        ],
    )
    def test_matmul_zeros(self, mat1, mat2, expected, device):
        """An all-zero operand on either side must give an all-zero output."""
        # Zero left operand, regular right operand.
        zero_left = scipy.sparse.csr_matrix(torch.zeros_like(mat1).numpy())
        zero_left_csr = SparseTensor.from_scipy(zero_left).to(device=device)
        right = scipy.sparse.csc_matrix(mat2.numpy())
        right_csc = SparseTensor.from_scipy(right).to(device=device)
        result = torch.empty_like(expected).to(device)
        sparse_matmul(zero_left_csr, right_csc, result)
        assert torch.all(result == 0.0)

        # Regular left operand, zero right operand.
        left = scipy.sparse.csr_matrix(mat1.numpy())
        left_csr = SparseTensor.from_scipy(left).to(device=device)
        zero_right = scipy.sparse.csc_matrix(torch.zeros_like(mat2).numpy())
        zero_right_csc = SparseTensor.from_scipy(zero_right).to(device=device)
        result = torch.empty_like(expected).to(device=device)
        sparse_matmul(left_csr, zero_right_csc, result)
        assert torch.all(result == 0.0)

    def test_cpu_matmul_wrong_format(self, mat1, mat2, expected):
        """On CPU a CSR right operand or a CSC left operand is rejected."""
        result = torch.empty_like(expected)
        left_csr = SparseTensor.from_scipy(scipy.sparse.csr_matrix(mat1))
        right_csr = SparseTensor.from_scipy(scipy.sparse.csr_matrix(mat2))

        with pytest.raises(ValueError) as raised:
            sparse_matmul(left_csr, right_csr, result)
        assert str(raised.value).startswith("B must be CSC matrix")

        left_csc = SparseTensor.from_scipy(scipy.sparse.csc_matrix(mat1))
        with pytest.raises(ValueError) as raised:
            sparse_matmul(left_csc, right_csr, result)
        assert str(raised.value).startswith("A must be CSR matrix")

    def test_cpu_matmul(self, mat1, mat2, expected):
        """CSR @ CSC on CPU must reproduce the dense product."""
        result = torch.empty_like(expected)
        left_csr = SparseTensor.from_scipy(scipy.sparse.csr_matrix(mat1))
        right_csc = SparseTensor.from_scipy(scipy.sparse.csc_matrix(mat2))

        sparse_matmul(left_csr, right_csc, result)

        torch.testing.assert_allclose(result, expected)

    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    def test_cuda_matmul_wrong_format(self, mat1, mat2, expected):
        """On CUDA a CSC right operand or a CSC left operand is rejected."""
        gpu = torch.device("cuda:0")
        result = torch.empty_like(expected).to(device=gpu)
        left_csr = SparseTensor.from_scipy(
            scipy.sparse.csr_matrix(mat1)).to(device=gpu)
        right_csc = SparseTensor.from_scipy(
            scipy.sparse.csc_matrix(mat2)).to(device=gpu)

        with pytest.raises(ValueError) as raised:
            sparse_matmul(left_csr, right_csc, result)
        assert str(raised.value).startswith("B must be CSR matrix")

        # NOTE(review): this operand is not moved to the CUDA device before
        # the call -- confirm the format check is meant to trigger first.
        left_csc = SparseTensor.from_scipy(scipy.sparse.csc_matrix(mat1))
        with pytest.raises(ValueError) as raised:
            sparse_matmul(left_csc, right_csc, result)
        assert str(raised.value).startswith("A must be CSR matrix")

    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    def test_cuda_matmul(self, mat1, mat2, expected):
        """CSR @ CSR on CUDA must reproduce the dense product."""
        gpu = torch.device("cuda:0")
        # Output buffer allocated through ``create_fortran`` on the GPU.
        result = create_fortran(expected.shape, expected.dtype, gpu)
        left_csr = SparseTensor.from_scipy(
            scipy.sparse.csr_matrix(mat1)).to(device=gpu)
        right_csr = SparseTensor.from_scipy(
            scipy.sparse.csr_matrix(mat2)).to(device=gpu)

        sparse_matmul(left_csr, right_csr, result)

        torch.testing.assert_allclose(result.cpu(), expected)
torch.from_numpy(exp).to(dtype=act.dtype)) out = torch.empty(csr_mat.shape[0], dtype=csr_mat.dtype, device=csr_mat.device) act = function(csr_mat, out=out) assert out.data_ptr() == act.data_ptr() torch.testing.assert_allclose( act, torch.from_numpy(exp).to(dtype=act.dtype).reshape(-1)) @pytest.mark.parametrize("device", [ "cpu", pytest.param("cuda:0", marks=pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")) ]) class TestMyTranspose(): def test_simple_transpose(self, device, csr_mat): arr = csr_mat.to(device=device) tr_arr = arr.transpose_csc() assert tr_arr.shape == ( 2, 3), "expected transpose shape to be %s, but found %s" % ( (2, 3), tr_arr.shape) tr_mat = tr_arr.to_scipy().tocoo() assert tr_mat.row.tolist() == [ 1, 0, 1, 0 ], "expected rows %s, but found %s" % ([1, 0, 1, 0 ], tr_mat.row.tolist()) assert tr_mat.col.tolist() == [
class TestFalkon:
    """Fit/predict tests of the ``Falkon`` estimator with a Gaussian kernel."""

    def test_classif(self, cls_data):
        """Binary classification: training error must stay below 5 percent."""
        X, Y = cls_data
        kernel = kernels.GaussianKernel(2.0)
        torch.manual_seed(13)
        np.random.seed(13)

        def error_fn(t, p):
            # A sample is wrong when target and prediction disagree in sign
            # (a zero product also counts as wrong).
            n_wrong = torch.sum(t * p <= 0).to(torch.float32)
            return 100 * n_wrong / t.shape[0], "c-err"

        options = FalkonOptions(use_cpu=True, keops_active="no", debug=True)
        model = Falkon(
            kernel=kernel,
            penalty=1e-6,
            M=500,
            seed=10,
            options=options,
            error_fn=error_fn,
        )
        model.fit(X, Y)
        predictions = model.predict(X)

        train_err, _ = error_fn(predictions, Y)
        assert train_err < 5

    def test_multiclass(self, multicls_data):
        """Multi-class problem: training misclassification rate below 0.23."""
        X, Y = multicls_data
        kernel = kernels.GaussianKernel(10.0)

        def error_fn(t, p):
            # Compare the arg-max class of targets and predictions.
            true_cls = torch.argmax(t, dim=1).reshape(-1)
            pred_cls = torch.argmax(p, dim=1).reshape(-1)
            mismatches = (true_cls != pred_cls).to(torch.float64)
            return torch.mean(mismatches), "multic-err"

        options = FalkonOptions(use_cpu=True, keops_active="no", debug=True)
        model = Falkon(
            kernel=kernel,
            penalty=1e-6,
            M=500,
            seed=10,
            options=options,
            error_fn=error_fn,
        )
        model.fit(X, Y)
        predictions = model.predict(X)

        train_err, _ = error_fn(predictions, Y)
        assert train_err < 0.23

    def test_regression(self, reg_data):
        """Regression on CPU: check prediction shape and the RMSE bounds."""
        Xtr, Ytr, Xts, Yts = reg_data
        kernel = kernels.GaussianKernel(20.0)

        def error_fn(t, p):
            squared = (t - p)**2
            return torch.sqrt(torch.mean(squared)), "RMSE"

        options = FalkonOptions(use_cpu=True, keops_active="no", debug=True)
        model = Falkon(
            kernel=kernel,
            penalty=1e-6,
            M=500,
            seed=10,
            options=options,
            error_fn=error_fn,
        )
        model.fit(Xtr, Ytr, Xts=Xts, Yts=Yts)

        assert model.predict(Xts).shape == (Yts.shape[0], 1)
        test_rmse, _ = error_fn(model.predict(Xts), Yts)
        train_rmse, _ = error_fn(model.predict(Xtr), Ytr)
        assert train_rmse < test_rmse
        assert test_rmse < 2.5

    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    def test_cuda_predict(self, reg_data):
        """Regression with GPU options: predict on CUDA inputs after ``to``."""
        Xtr, Ytr, Xts, Yts = reg_data
        kernel = kernels.GaussianKernel(20.0)

        def error_fn(t, p):
            squared = (t - p)**2
            return torch.sqrt(torch.mean(squared)), "RMSE"

        options = FalkonOptions(
            use_cpu=False,
            keops_active="no",
            debug=True,
            min_cuda_pc_size_64=1,
            min_cuda_iter_size_64=1,
        )
        model = Falkon(
            kernel=kernel,
            penalty=1e-6,
            M=500,
            seed=10,
            options=options,
            error_fn=error_fn,
        )
        model.fit(Xtr, Ytr, Xts=Xts, Yts=Yts)
        model.to("cuda:0")

        gpu_test_preds = model.predict(Xts.to("cuda:0"))
        gpu_train_preds = model.predict(Xtr.to("cuda:0"))

        # Predictions must stay on the GPU and be a single column.
        assert gpu_test_preds.device.type == "cuda"
        assert gpu_test_preds.shape == (Yts.shape[0], 1)

        test_rmse, _ = error_fn(gpu_test_preds.cpu(), Yts)
        train_rmse, _ = error_fn(gpu_train_preds.cpu(), Ytr)
        assert train_rmse < test_rmse
        assert test_rmse < 2.5
class TestVecMulTriang:
    """Tests of ``vec_mul_triang`` against a reference built with PyTorch."""

    # Side length of the square test matrix (and length of the multiplier).
    MAT_SIZE = 120

    @pytest.fixture(scope="class")
    def mat(self):
        """Tensor from ``gen_random(MAT_SIZE, MAT_SIZE, 'float64', ...)``."""
        return torch.from_numpy(
            gen_random(TestVecMulTriang.MAT_SIZE, TestVecMulTriang.MAT_SIZE,
                       'float64', False, seed=91))

    @pytest.fixture(scope="class")
    def vec(self):
        """Tensor from ``gen_random(MAT_SIZE, 1, 'float64', ...)``."""
        return torch.from_numpy(
            gen_random(TestVecMulTriang.MAT_SIZE, 1, 'float64', False, seed=91))

    @staticmethod
    def exp_vec_mul_triang(mat, vec, upper, side):
        """Build the expected result without touching ``mat``.

        One triangle of ``mat`` (diagonal included) is multiplied by ``vec``;
        every entry outside that triangle is copied unchanged.

        Args:
            mat: Square input tensor.
            vec: Multipliers, in any shape that reshapes to a vector.
            upper: Scale the upper triangle if true, else the lower one.
            side: With 0 the multipliers are broadcast as a column (entry
                ``i`` scales row ``i``); otherwise as a row (entry ``j``
                scales column ``j``).

        Returns:
            A new tensor holding the expected output.
        """
        if side == 0:
            vec = vec.reshape(-1, 1)
        else:
            vec = vec.reshape(1, -1)
        if upper:
            tri_mat = torch.triu(mat, diagonal=0)
            tri_idx = torch.triu_indices(mat.shape[0], mat.shape[1], offset=0)
        else:
            tri_mat = torch.tril(mat, diagonal=0)
            tri_idx = torch.tril_indices(mat.shape[0], mat.shape[1], offset=0)
        # ``tri_mat`` is a fresh tensor, so the in-place product leaves
        # ``mat`` untouched.
        tri_mat *= vec
        exp = mat.clone()
        exp[tri_idx[0], tri_idx[1]] = tri_mat[tri_idx[0], tri_idx[1]]
        return exp

    @pytest.mark.parametrize("order", ["F", "C"])
    @pytest.mark.parametrize("upper", [True, False], ids=["upper", "lower"])
    @pytest.mark.parametrize("side", [0, 1], ids=["side0", "side1"])
    @pytest.mark.parametrize("device", [
        "cpu",
        pytest.param("cuda:0", marks=[
            pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
        ])
    ])
    def test_all_combos(self, mat, vec, order, device, upper, side):
        """Check every order / triangle / side / device combination, with the
        multipliers passed as a column, as a row and as a 1-D tensor."""
        exp_output = self.exp_vec_mul_triang(mat, vec, upper, side)

        vec = fix_mat(vec, order=order, dtype=np.float64, numpy=False,
                      device=device)
        # A fresh copy of the matrix is made before every call.
        mat2 = fix_mat(mat, order=order, dtype=np.float64, numpy=False,
                       device=device, copy=True)
        out = vec_mul_triang(mat2, upper=upper, side=side,
                             multipliers=vec).cpu().numpy()
        np.testing.assert_allclose(exp_output.numpy(), out)
        assert out.flags["%s_CONTIGUOUS" % order] is True, "Output is not %s-contiguous" % (
            order)

        # Test with different vec orderings
        vec = vec.reshape(1, -1)
        mat2 = fix_mat(mat, order=order, dtype=np.float64, numpy=False,
                       device=device, copy=True)
        out = vec_mul_triang(mat2, upper=upper, side=side,
                             multipliers=vec).cpu().numpy()
        np.testing.assert_allclose(exp_output.numpy(), out,
                                   err_msg="Vec row ordering failed")

        vec = vec.reshape(-1)
        mat2 = fix_mat(mat, order=order, dtype=np.float64, numpy=False,
                       device=device, copy=True)
        out = vec_mul_triang(mat2, upper=upper, side=side,
                             multipliers=vec).cpu().numpy()
        np.testing.assert_allclose(exp_output.numpy(), out,
                                   err_msg="Vec 1D ordering failed")

    @pytest.mark.benchmark
    @pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")
    def test_large(self):
        """Time ``vec_mul_triang`` on a 20000x20000 float32 matrix on CPU and
        GPU, print the best timings and check that both results agree."""
        t = 20_000
        num_rep = 5
        mat = torch.from_numpy(gen_random(t, t, np.float32, F=False, seed=123))
        vec = torch.from_numpy(
            gen_random(t, 1, np.float32, F=False, seed=124).reshape((-1, )))

        mat_cuda = mat.cuda()
        vec_cuda = vec.cuda()

        # ``perf_counter`` is the clock intended for measuring short
        # durations; ``time.time`` can jump when the system clock changes.
        cpu_times = []
        for _ in range(num_rep):
            t_s = time.perf_counter()
            out_cpu = vec_mul_triang(mat, vec, True, 1)
            cpu_times.append(time.perf_counter() - t_s)

        # Flush any pending GPU work so it is not billed to the first
        # timed repetition.
        torch.cuda.synchronize()
        gpu_times = []
        for _ in range(num_rep):
            t_s = time.perf_counter()
            out_cuda = vec_mul_triang(mat_cuda, vec_cuda, True, 1)
            # Kernel launches are asynchronous: wait before reading the clock.
            torch.cuda.synchronize()
            gpu_times.append(time.perf_counter() - t_s)

        print("mat size %d - t_cpu: %.4fs -- t_cuda: %.4fs" %
              (t, np.min(cpu_times), np.min(gpu_times)))
        np.testing.assert_allclose(out_cpu, out_cuda.cpu().numpy())