Example No. 1
    def test_with_x0(self, mat, vec_rhs, conjgrad, order, device):
        if order == "F":
            mat = torch.from_numpy(np.asfortranarray(mat.numpy()))
            vec_rhs = torch.from_numpy(np.asfortranarray(vec_rhs.numpy()))
        mat = move_tensor(mat, device)
        vec_rhs = move_tensor(vec_rhs, device)
        init_sol = create_same_stride(vec_rhs.size(), vec_rhs, vec_rhs.dtype,
                                      device)
        init_sol.fill_(0.0)

        x = conjgrad.solve(X0=init_sol,
                           B=vec_rhs,
                           mmv=lambda x_: mat @ x_,
                           max_iter=10,
                           callback=None)

        assert x.data_ptr() == init_sol.data_ptr(), \
            "Initial solution vector was copied"
        assert str(x.device) == device, "Device has changed unexpectedly"
        assert x.shape == (self.t, vec_rhs.shape[1]), \
            "Output shape is incorrect"
        assert x.stride() == vec_rhs.stride(), \
            "Stride has changed unexpectedly"
        expected = np.linalg.solve(mat.cpu().numpy(), vec_rhs.cpu().numpy())
        np.testing.assert_allclose(expected, x.cpu().numpy(), rtol=1e-6)
Example No. 2
    def test_lauum(self, dtype, get_mat, expected_lower, expected_upper,
                   lower):
        device = torch.device("cuda:0")

        mat = get_mat(order="F", dtype=dtype)
        gpu_in = move_tensor(mat, device)
        gpu_out = move_tensor(mat, device)
        gpu_out.fill_(0.0)

        # Run on the GPU
        cuda_lauum(n=mat.shape[0],
                   A=gpu_in,
                   lda=gpu_in.stride(1),
                   B=gpu_out,
                   ldb=gpu_out.stride(1),
                   lower=lower)
        torch.cuda.synchronize(device)

        # Compare outputs against the reference triangle
        if lower:
            np.testing.assert_allclose(np.tril(expected_lower),
                                       gpu_out.cpu().numpy(),
                                       rtol=self.rtol[dtype])
        else:
            np.testing.assert_allclose(np.triu(expected_upper),
                                       gpu_out.cpu().numpy(),
                                       rtol=self.rtol[dtype])
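
The reference triangles above come from test fixtures. As a rough guide to what they contain, here is a NumPy construction that would be consistent with the comparison, under the assumption (not confirmed by this snippet) that cuda_lauum follows the LAPACK ?lauum convention of computing L^T L for a lower factor and U U^T for an upper one:

# Sketch only, under the assumption stated above; `expected_lower` and
# `expected_upper` are fixtures of the original test suite, not produced here.
import numpy as np

rng = np.random.default_rng(0)
full = rng.standard_normal((4, 4))
L, U = np.tril(full), np.triu(full)

expected_lower_sketch = np.tril(L.T @ L)   # lower triangle of L^T @ L
expected_upper_sketch = np.triu(U @ U.T)   # upper triangle of U @ U^T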
Example No. 3
    def test_flk_cg(self, data, centers, kernel, preconditioner, knm, kmm,
                    vec_rhs, device):
        preconditioner = preconditioner.to(device)
        options = dataclasses.replace(self.basic_opt, use_cpu=device == "cpu")
        opt = FalkonConjugateGradient(kernel, preconditioner, opt=options)

        # Solve (knm.T @ knm + lambda*n*kmm) x = knm.T @ b
        rhs = knm.T @ vec_rhs
        lhs = knm.T @ knm + self.penalty * self.N * kmm
        expected = np.linalg.solve(lhs.numpy(), rhs.numpy())

        data = move_tensor(data, device)
        centers = move_tensor(centers, device)
        vec_rhs = move_tensor(vec_rhs, device)

        beta = opt.solve(X=data,
                         M=centers,
                         Y=vec_rhs,
                         _lambda=self.penalty,
                         initial_solution=None,
                         max_iter=200)
        alpha = preconditioner.apply(beta)

        assert str(beta.device) == device, "Device has changed unexpectedly"
        np.testing.assert_allclose(expected, alpha.cpu().numpy(), rtol=1e-5)
Example No. 4
    def test_cuda_start(self, mat, kernel, gram, dtype, order):
        opt = dataclasses.replace(self.basic_opt,
                                  use_cpu=False,
                                  cpu_preconditioner=False)
        rtol = self.rtol[dtype]

        mat = fix_mat(mat, dtype=dtype, order=order, copy=True)
        gpu_mat = move_tensor(mat, "cuda:0")
        gram = fix_mat(gram, dtype=dtype, order=order, copy=True)
        gpu_gram = move_tensor(gram, "cuda:0")

        la = 1

        prec = FalkonPreconditioner(la, kernel, opt)
        prec.init(mat)

        gpu_prec = FalkonPreconditioner(la, kernel, opt)
        gpu_prec.init(gpu_mat)

        np.testing.assert_allclose(prec.dT.numpy(),
                                   gpu_prec.dT.cpu().numpy(),
                                   rtol=rtol)
        np.testing.assert_allclose(prec.dA.numpy(),
                                   gpu_prec.dA.cpu().numpy(),
                                   rtol=rtol)
        np.testing.assert_allclose(prec.fC.numpy(),
                                   gpu_prec.fC.cpu().numpy(),
                                   rtol=rtol * 10)
        assert gpu_prec.fC.device == gpu_mat.device, "Device changed unexpectedly"

        assert_invariant_on_TT(gpu_prec, gpu_gram, tol=rtol)
        assert_invariant_on_AT(prec, gram, la, tol=rtol)
        assert_invariant_on_T(prec, gram, tol=rtol * 10)
        assert_invariant_on_prec(prec, N, gram, la, tol=rtol * 10)
Example No. 5
    def test_rect(self, rect, order, dtype):
        from falkon.la_helpers.cuda_la_helpers import cuda_transpose
        mat = fix_mat(rect, order=order, dtype=dtype, copy=True, numpy=True)
        exp_mat_out = np.copy(mat.T, order=order)

        mat = move_tensor(torch.from_numpy(mat), "cuda:0")
        mat_out = move_tensor(torch.from_numpy(exp_mat_out), "cuda:0")
        mat_out.fill_(0.0)

        cuda_transpose(input=mat, output=mat_out)

        mat_out = move_tensor(mat_out, "cpu").numpy()
        assert mat_out.strides == exp_mat_out.strides
        np.testing.assert_allclose(exp_mat_out, mat_out)
Example No. 6
def test_trsm_wrapper(mat, arr, dtype, order, device, lower, transpose):
    rtol = 1e-2 if dtype == np.float32 else 1e-11

    n_mat = move_tensor(fix_mat(mat, dtype=dtype, order=order, copy=True), device=device)
    n_arr = move_tensor(fix_mat(arr, dtype=dtype, order=order, copy=True), device=device)

    expected = sclb.dtrsm(1e-2, mat, arr, side=0, lower=lower, trans_a=transpose, overwrite_b=0)

    if device.startswith("cuda") and order == "C":
        with pytest.raises(ValueError):
            actual = trsm(n_arr, n_mat, alpha=1e-2, lower=lower, transpose=transpose)
    else:
        actual = trsm(n_arr, n_mat, alpha=1e-2, lower=lower, transpose=transpose)
        np.testing.assert_allclose(expected, actual.cpu().numpy(), rtol=rtol)
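
The reference above relies on BLAS ?trsm semantics: with side=0 it solves op(A) X = alpha * B for X, where op applies a transpose when trans_a is set. As an independent cross-check (a sketch, assuming mat is triangular as TRSM requires; not part of the original test), the same value can be obtained with scipy.linalg.solve_triangular:

import scipy.linalg

def trsm_reference(mat, arr, alpha, lower, transpose):
    # X = alpha * op(A)^{-1} @ B, which is what ?trsm computes for side=0.
    return alpha * scipy.linalg.solve_triangular(
        mat, arr, lower=lower, trans=1 if transpose else 0)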
Example No. 7
def run_potrf_test(np_data, dtype, order, opt, input_device, upper, clean,
                   overwrite):
    # Convert np_data to the appropriate form
    data = np.array(np_data, order=order, dtype=dtype, copy=True)
    lapack_fn, rtol = choose_on_dtype(dtype)
    A = move_tensor(torch.from_numpy(data.copy(order="K")), input_device)

    orig_stride = A.stride()
    orig_ptr = A.data_ptr()

    with memory_checker(opt) as new_opt:
        C_gpu = gpu_cholesky(A,
                             upper=upper,
                             clean=clean,
                             overwrite=overwrite,
                             opt=new_opt)

    assert orig_stride == C_gpu.stride(), "gpu_potrf modified matrix stride."
    if overwrite:
        assert orig_ptr == C_gpu.data_ptr(), \
            "Data-pointer changed although overwrite is True."

    C_cpu = lapack_fn(data,
                      lower=int(not upper),
                      clean=int(clean),
                      overwrite_a=int(overwrite))[0]
    np.testing.assert_allclose(C_cpu,
                               C_gpu.cpu().numpy(),
                               rtol=rtol,
                               verbose=True)
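
choose_on_dtype is a helper from the original test suite; only its call pattern is visible here (it returns a LAPACK potrf-style function plus a relative tolerance). A hypothetical sketch consistent with that usage, built on SciPy's potrf wrappers, might look like this:

# Hypothetical helper sketch, not the suite's actual implementation;
# the tolerance values are illustrative assumptions.
import numpy as np
from scipy.linalg import lapack

def choose_on_dtype_sketch(dtype):
    if dtype == np.float64:
        return lapack.dpotrf, 1e-12
    return lapack.spotrf, 1e-5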
Example No. 8
    def test_low(self, mat, order, dtype, device):
        mat = fix_mat(mat, order=order, dtype=dtype, numpy=True)
        mat_low = mat.copy(order="K")
        # Upper triangle of mat_low is 0
        mat_low[np.triu_indices(self.t, 1)] = 0

        # Create device matrix
        mat_low = torch.from_numpy(mat_low)
        mat_low_dev = move_tensor(mat_low, device)

        # Run copy
        copy_triang(mat_low_dev, upper=False)

        # Make checks on CPU
        mat_low = mat_low_dev.cpu().numpy()
        assert np.sum(mat_low == 0) == 0
        np.testing.assert_array_equal(np.tril(mat), np.tril(mat_low))
        np.testing.assert_array_equal(np.triu(mat_low), np.tril(mat_low).T)
        np.testing.assert_array_equal(np.diag(mat), np.diag(mat_low))

        # Reset and try with `upper=True`
        mat_low[np.triu_indices(self.t, 1)] = 0
        mat_low_dev.copy_(torch.from_numpy(mat_low))

        copy_triang(mat_low_dev, upper=True)  # Only the diagonal will be set

        mat_low = mat_low_dev.cpu().numpy()
        np.testing.assert_array_equal(np.diag(mat), np.diag(mat_low))
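
The assertions above pin down copy_triang's semantics: with upper=False the lower triangle is the source and is mirrored into the upper triangle, while the lower triangle and diagonal are left untouched. A NumPy reference consistent with those assertions (a sketch, not the library's implementation):

import numpy as np

def copy_triang_reference(x, upper):
    x = np.array(x, copy=True)
    if upper:
        # Mirror the upper triangle below the diagonal.
        return np.triu(x) + np.triu(x, k=1).T
    # Mirror the lower triangle above the diagonal.
    return np.tril(x) + np.tril(x, k=-1).T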
Example No. 9
    def test_up(self, mat, order, dtype, device):
        mat = fix_mat(mat, order=order, dtype=dtype, numpy=True)
        mat_up = mat.copy(order="K")
        # Lower triangle of mat_up is 0
        mat_up[np.tril_indices(self.t, -1)] = 0
        # Create device matrix
        mat_up = torch.from_numpy(mat_up)
        mat_up_dev = move_tensor(mat_up, device)

        copy_triang(mat_up_dev, upper=True)
        mat_up = mat_up_dev.cpu().numpy()

        assert np.sum(mat_up == 0) == 0
        np.testing.assert_array_equal(np.triu(mat), np.triu(mat_up))
        np.testing.assert_array_equal(np.tril(mat_up), np.triu(mat_up).T)
        np.testing.assert_array_equal(np.diag(mat), np.diag(mat_up))

        # Reset and try with `upper=False`
        mat_up[np.tril_indices(self.t, -1)] = 0
        mat_up_dev.copy_(torch.from_numpy(mat_up))

        copy_triang(mat_up_dev, upper=False)  # Only the diagonal will be set.

        mat_up = mat_up_dev.cpu().numpy()
        np.testing.assert_array_equal(np.diag(mat), np.diag(mat_up))
Example No. 10
    def test(self, A, B, k_class, k_exp, dtype, cpu, input_device):
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu)
        if input_device.startswith("cuda"):
            # For fMM there is nothing we can do about CUDA memory usage!
            opt = dataclasses.replace(opt, max_gpu_mem=np.inf)
        A = move_tensor(torch.from_numpy(A), input_device)
        B = move_tensor(torch.from_numpy(B), input_device)

        _run_fmm_test(k_class,
                      k_exp,
                      A,
                      B,
                      out=None,
                      dtype=dtype,
                      opt=opt,
                      rtol=self._RTOL[A.dtype])
Example No. 11
    def test_trsm(self, mat, vec, solution, alpha, dtype, order_v, order_A,
                  device):
        mat = move_tensor(fix_mat(mat, dtype, order_A, copy=True, numpy=False),
                          device=device)
        vec = move_tensor(fix_mat(vec, dtype, order_v, copy=True, numpy=False),
                          device=device)

        sol_vec, lower, trans = solution
        out = trsm(vec, mat, alpha, lower=int(lower), transpose=int(trans))

        assert out.data_ptr() != vec.data_ptr(), "Vec was overwritten."
        assert out.device == vec.device, "Output device is incorrect."
        assert out.stride() == vec.stride(), "Stride was modified."
        assert out.dtype == vec.dtype, "Dtype was modified."
        np.testing.assert_allclose(sol_vec,
                                   out.cpu().numpy(),
                                   rtol=self.rtol[dtype])
Example No. 12
    def test_strided(self, dtype, get_mat, expected_lower):
        device = torch.device("cuda:0")

        mat = get_mat(order="F", dtype=dtype)
        gpu_in = move_tensor(mat, device)
        gpu_in_strided = torch.cat(
            [gpu_in,
             torch.zeros(gpu_in.shape[0], 10, device=device, dtype=gpu_in.dtype)],
            dim=1).T
        gpu_in_strided = gpu_in_strided[:gpu_in.shape[0], :gpu_in.shape[0]]
        gpu_in_strided.copy_(gpu_in)
        gpu_out = move_tensor(mat, device)
        gpu_out_strided = torch.cat(
            [gpu_out,
             torch.zeros(gpu_out.shape[0], 10, device=device, dtype=gpu_in.dtype)],
            dim=1).T
        gpu_out_strided = gpu_out_strided[:gpu_out.shape[0], :gpu_out.shape[0]]
        gpu_out_strided.fill_(0.0)

        # Run on the GPU
        cuda_lauum_lower(n=gpu_in.shape[0],
                         A=gpu_in_strided,
                         lda=gpu_in_strided.stride(1),
                         B=gpu_out_strided,
                         ldb=gpu_out_strided.stride(1))
        torch.cuda.synchronize(device)

        # Compare the strided GPU output against the reference triangle
        np.testing.assert_allclose(np.tril(expected_lower),
                                   gpu_out_strided.cpu().numpy(),
                                   rtol=self.rtol[dtype])
Example No. 13
    def test_with_out(self, Ac: np.ndarray, Bc: np.ndarray, k_class, k_exp,
                      dtype, cpu, input_device):
        opt = dataclasses.replace(self.basic_options, use_cpu=cpu)

        Ac = move_tensor(torch.from_numpy(Ac.astype(dtype)), input_device)
        Bc = move_tensor(torch.from_numpy(Bc.astype(dtype)), input_device)
        out = torch.empty(Ac.shape[0],
                          Bc.shape[0],
                          dtype=Ac.dtype,
                          device=input_device)

        _run_fmm_test(k_class,
                      k_exp,
                      Ac,
                      Bc,
                      out=out,
                      dtype=dtype,
                      opt=opt,
                      rtol=self._RTOL[Ac.dtype])
Example No. 14
def fix_mat(t, dtype, order, device="cpu", copy=False, numpy=False):
    if dtype is None or order is None:
        return None
    if isinstance(t, torch.Tensor):
        t = t.numpy()
    if isinstance(t, np.ndarray):
        t = np.array(t, dtype=dtype, order=order, copy=copy)
        if numpy:
            return t
        return move_tensor(torch.from_numpy(t), device)
    return t
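
A short usage sketch for the helper above (assuming move_tensor simply moves a tensor to the requested device, as in the other examples; the array name is illustrative):

import numpy as np
import torch

a = np.random.randn(5, 3)

# Fortran-ordered float32 NumPy copy.
a_np = fix_mat(a, dtype=np.float32, order="F", copy=True, numpy=True)
assert a_np.flags["F_CONTIGUOUS"] and a_np.dtype == np.float32

# Same conversion, returned as a torch tensor on the CPU.
a_t = fix_mat(a, dtype=np.float32, order="F", copy=True, numpy=False)
assert isinstance(a_t, torch.Tensor) and a_t.dtype == torch.float32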
Example No. 15
    def test_one_rhs(self, mat, vec_rhs, conjgrad, order, device):
        if order == "F":
            mat = torch.from_numpy(np.asfortranarray(mat.numpy()))
            vec_rhs = torch.from_numpy(np.asfortranarray(vec_rhs.numpy()))
        mat = move_tensor(mat, device)
        vec_rhs = move_tensor(vec_rhs, device)

        x = conjgrad.solve(X0=None,
                           B=vec_rhs,
                           mmv=lambda x_: mat @ x_,
                           max_iter=10,
                           callback=None)

        assert str(x.device) == device, "Device has changed unexpectedly"
        assert x.stride() == vec_rhs.stride(), \
            "Stride has changed unexpectedly"
        assert x.shape == (self.t, vec_rhs.shape[1]), \
            "Output shape is incorrect"
        expected = np.linalg.solve(mat.cpu().numpy(), vec_rhs.cpu().numpy())
        np.testing.assert_allclose(expected, x.cpu().numpy(), rtol=1e-6)
Example No. 16
    def test_precise_kernel(self, A, B, k_class, k_exp, cpu, input_device):
        opt = dataclasses.replace(self.basic_options,
                                  use_cpu=cpu,
                                  no_single_kernel=True)

        A = move_tensor(torch.from_numpy(A), input_device)
        B = move_tensor(torch.from_numpy(B), input_device)
        out = torch.empty(A.shape[0],
                          B.shape[0],
                          dtype=A.dtype,
                          device=input_device)
        # Note rtol is 10x lower than in the other tests
        _run_fmm_test(k_class,
                      k_exp,
                      A,
                      B,
                      out=out,
                      dtype=np.float32,
                      opt=opt,
                      rtol=1e-6)
Example No. 17
    def test_no_overwrite(self, dtype, order, get_mat, expected_lower, expected_upper, device):
        mat = get_mat(order=order, dtype=dtype)
        mat = move_tensor(mat, device)

        # For cuda inputs we must add to available GPU memory the amount used by the
        # input matrix, since overwrite=False and a full copy must be performed.
        mgpu_slack = 0
        if device.startswith("cuda"):
            mgpu_slack = self.basic_opt.max_gpu_mem + mat.shape[0]**2 * sizeof_dtype(mat.dtype)

        with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
            act_up = gpu_lauum(mat, upper=True, overwrite=False, opt=new_opt)
            torch.cuda.synchronize()
        np.testing.assert_allclose(expected_upper, act_up.cpu().numpy(), rtol=self.rtol[dtype])

        with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
            act_lo = gpu_lauum(mat, upper=False, overwrite=False, opt=new_opt)
            torch.cuda.synchronize()
        np.testing.assert_allclose(expected_lower, act_lo.cpu().numpy(), rtol=self.rtol[dtype])
Example No. 18
    def test_write_opposite(self, dtype, order, get_mat, expected_lower,
                            expected_upper, device):
        omat = get_mat(order=order, dtype=dtype)
        mat = get_mat(order=order, dtype=dtype, device=device)

        with memory_checker(self.basic_opt) as new_opt:
            act_up = gpu_lauum(mat,
                               upper=True,
                               overwrite=True,
                               write_opposite=True,
                               opt=new_opt)
            torch.cuda.synchronize()
        act_up = act_up.cpu()
        np.testing.assert_allclose(np.triu(omat, k=1),
                                   np.triu(act_up.numpy(), k=1),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(np.tril(act_up.numpy()),
                                   np.triu(expected_upper).T,
                                   rtol=self.rtol[dtype])

        mat = get_mat(order=order, dtype=dtype)
        mat = move_tensor(mat, device)
        with memory_checker(self.basic_opt) as new_opt:
            act_lo = gpu_lauum(mat,
                               upper=False,
                               overwrite=True,
                               write_opposite=True,
                               opt=new_opt)
            torch.cuda.synchronize()
        act_lo = act_lo.cpu()
        np.testing.assert_allclose(np.tril(omat, k=-1),
                                   np.tril(act_lo.numpy(), k=-1),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(np.triu(act_lo.numpy()),
                                   np.tril(expected_lower).T,
                                   rtol=self.rtol[dtype])
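
Taken together, the two assertions in each branch describe the write_opposite layout: the LAUUM result is written, transposed, into the opposite triangle, while the strict input triangle is preserved. For the upper=True branch, a sketch of the expected full matrix, derived from the assertions above and using the same omat and expected_upper fixtures (valid up to the test tolerance):

import numpy as np

# Strict upper triangle of the input, plus the transposed LAUUM result below it.
expected_act_up_sketch = np.triu(omat, k=1) + np.triu(expected_upper).T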