Example #1
    def select_indices(self,
                       X: _tensor_type,
                       Y: Optional[torch.Tensor]) -> _opt_tns_idx_tup:
        """Select M observations from 2D tensor `X`, preserving device and memory order.

        The selection strategy is uniformly at random. To control the randomness,
        pass an appropriate numpy random generator to this class's constructor.

        This method behaves the same as :meth:`select` but additionally returns a `LongTensor`
        containing the indices of the chosen points.

        Parameters
        ----------
        X
            N x D tensor containing the whole input dataset. If N is lower than the number of
            centers this class is programmed to pick, a warning will be raised and only N centers
            will be returned.
        Y
            Optional N x T tensor containing the input targets. If `Y` is provided,
            the same observations selected for `X` will also be selected from `Y`.
            Certain models (such as :class:`falkon.models.LogisticFalkon`) require centers to be
            extracted from both predictors and targets, while others (such as
            :class:`falkon.models.Falkon`) only require the centers from the predictors.

        Returns
        -------
        (X_M, indices)
            The randomly selected centers and the corresponding indices.
            The centers will be stored in a new, memory-contiguous tensor and all
            characteristics of the input tensor will be preserved.
        (X_M, Y_M, indices)
            If parameter `Y` is not `None`, the entries of `Y` corresponding to the
            selected centers of `X` will also be returned.
        """
        N = X.shape[0]
        num_centers = self.num_centers
        if num_centers > N:
            warnings.warn("Number of centers M greater than the "
                          f"number of data-points. Setting `num_centers` to {N}")
            num_centers = N
        idx = self.random_gen.choice(N, size=num_centers, replace=False)

        if isinstance(X, SparseTensor):
            X_sp = X.to_scipy()
            centers = X_sp[idx, :].copy()
            Xc = SparseTensor.from_scipy(centers)
            th_idx = torch.from_numpy(idx.astype(np.int64)).to(X.device)
        else:
            Xc = create_same_stride((num_centers, X.shape[1]), other=X, dtype=X.dtype,
                                    device=X.device, pin_memory=False)
            th_idx = torch.from_numpy(idx.astype(np.int64)).to(X.device)
            torch.index_select(X, dim=0, index=th_idx, out=Xc)

        if Y is not None:
            Yc = create_same_stride((num_centers, Y.shape[1]), other=Y, dtype=Y.dtype,
                                    device=Y.device, pin_memory=False)
            th_idx = torch.from_numpy(idx.astype(np.int64)).to(Y.device)
            torch.index_select(Y, dim=0, index=th_idx, out=Yc)
            return Xc, Yc, th_idx
        return Xc, th_idx
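
A minimal standalone sketch of the same uniform selection (the shapes, the numpy generator and the helper name below are illustrative assumptions, not falkon's API):

import numpy as np
import torch

def pick_centers(X: torch.Tensor, num_centers: int, rng: np.random.Generator):
    # Choose distinct row indices uniformly at random, capped at the number of rows.
    idx = rng.choice(X.shape[0], size=min(num_centers, X.shape[0]), replace=False)
    th_idx = torch.from_numpy(idx.astype(np.int64)).to(X.device)
    # index_select copies the chosen rows into a new contiguous tensor on the same device.
    return torch.index_select(X, dim=0, index=th_idx), th_idx

X = torch.randn(100, 5)
Xc, th_idx = pick_centers(X, 10, np.random.default_rng(0))
assert Xc.shape == (10, 5) and th_idx.shape == (10,)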
Example #2
File: fmmv_cpu.py  Project: gpleiss/falkon
def fdmmv_cpu_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: Optional[torch.Tensor],
                     w: Optional[torch.Tensor],
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None):
    opt = _setup_opt(opt, is_cpu=True)

    # Parameter validation
    if v is None and w is None:
        raise ValueError("One of v and w must be specified to run fMMV.")
    T = v.size(1) if v is not None else w.size(1)
    ntot, dtot = X1.size()
    M = X2.size(0)
    dtype = X1.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(M, T, dtype=dtype)
    out.fill_(0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Narrow X1 : n
    # ker_chunk : n*M
    # w_blk     : n*T
    n = avail_mem / (M * T + 1)
    n = int(math.floor(n))
    if n < 1:
        raise MemoryError(("Available memory %.2fGB is insufficient "
                           "for blockwise fdMMv.") %
                          (avail_mem * sizeof_dtype(dtype) / 2**30))

    # Allocate fixed arrays
    ker_chunk = create_same_stride((n, M), out, dtype, device='cpu')
    w_blk = create_same_stride((n, T), out, dtype, device='cpu')
    # Run blocked fdmmv
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)
        X1_chunk = X1.narrow_rows(i, ic)
        cur_ker_chunk = ker_chunk[:ic]
        cur_ker_chunk.fill_(0.0)
        ddd = kernel._prepare_sparse(X1_chunk, X2)
        kernel._apply_sparse(X1_chunk, X2.transpose_csc(), cur_ker_chunk)
        kernel._finalize(cur_ker_chunk, ddd)

        # Multiply by the vector v
        cur_w_blk = w_blk[:ic]  # n x T
        cur_w_blk.fill_(0.0)
        if w is not None:
            cur_w_blk.copy_(w[i:i + ic, :])
        if v is not None:
            # w_blk += ker_chunk @ v : (n x T) += (n x M) @ (M x T)
            cur_w_blk.addmm_(cur_ker_chunk, v)
        out.addmm_(cur_ker_chunk.T, cur_w_blk)
    del ker_chunk, w_blk
    return out
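
The blocked loop above accumulates K.T @ (K @ v + w) without ever materializing the full N x M kernel. A dense, unblocked reference of the same quantity (with a made-up matrix K standing in for kernel(X1, X2)):

import torch

N, M, T = 50, 8, 3
K = torch.randn(N, M)              # stand-in for the full kernel matrix
v = torch.randn(M, T)
w = torch.randn(N, T)

out_reference = K.T @ (K @ v + w)  # M x T, what the chunked loop computes piecewise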
Example #3
    def select(self,
               X: _tensor_type,
               Y: Union[torch.Tensor, None],
               M: int) -> Union[_tensor_type, Tuple[_tensor_type, torch.Tensor]]:
        """Select M observations from 2D tensor `X`, preserving device and memory order.

        The selection strategy is uniformly at random. To control the randomness,
        pass an appropriate numpy random generator to this class's constructor.

        Parameters
        ----------
        X
            N x D tensor containing the whole input dataset.
        Y
            Optional N x T tensor containing the input targets. If `Y` is provided,
            the same observations selected for `X` will also be selected from `Y`.
            Certain models (such as :class:`falkon.models.LogisticFalkon`) require centers to be
            extracted from both predictors and targets, while others (such as
            :class:`falkon.models.Falkon`) only require the centers from the predictors.
        M
            The number of observations to choose. M <= N, otherwise M is forcibly set to N
            with a warning.

        Returns
        -------
        X_M
            The randomly selected centers. They will be in a new, memory-contiguous tensor.
            All characteristics of the input tensor will be preserved.
        (X_M, Y_M)
            If `Y` is not `None`, the entries of `Y` corresponding to the
            selected centers of `X` will also be returned.
        """
        N = X.shape[0]
        if M > N:
            warnings.warn("Number of centers M greater than the "
                          "number of data-points. Setting M to %d" % (N))
            M = N
        idx = self.random_gen.choice(N, size=M, replace=False)

        if isinstance(X, SparseTensor):
            X = X.to_scipy()
            centers = X[idx, :].copy()
            Xc = SparseTensor.from_scipy(centers)
        else:
            Xc = create_same_stride((M, X.shape[1]), other=X, dtype=X.dtype, device=X.device,
                                    pin_memory=False)
            th_idx = torch.from_numpy(idx.astype(np.int64)).to(X.device)
            torch.index_select(X, dim=0, index=th_idx, out=Xc)

        if Y is not None:
            Yc = create_same_stride((M, Y.shape[1]), other=Y, dtype=Y.dtype, device=Y.device,
                                    pin_memory=False)
            th_idx = torch.from_numpy(idx.astype(np.int64)).to(Y.device)
            torch.index_select(Y, dim=0, index=th_idx, out=Yc)
            return Xc, Yc
        return Xc
Example #4
def generic_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()

    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    ntot, dtot = X1.size()
    M, T = v.size()

    # GPU Memory Usage:
    # ker_gpu  : n*M
    # v_gpu    : M*T
    # X1s_gpu  : n*d
    # X2s_gpu  : M*d
    # mmv_gpu  : n*T
    # ----------
    # total : n*d + n*(M+T) + d*M + M*T
    avail_mem = max_mem / sizeof_dtype(dtype)
    n, d = select_dim_over_d(maxD=dtot,
                             maxN=ntot,
                             coef_nd=1,
                             coef_n=M + T,
                             coef_d=M,
                             rest=M * T,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        ker_gpu = torch.empty(n, M, dtype=dtype, device=ddev)
        v_gpu = v.to(device=ddev)  # M x T
        X1s_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)
            ddd = kernel._prepare(X1.narrow(0, i, ic), X2)
            c_g_ker = ker_gpu.narrow(0, 0, ic)
            c_g_ker.fill_(0.0)
            for k in range(0, dtot, d):
                kc = min(d, dtot - k)
                c_g_X1s = copy_to_device_noorder(ic, kc, X1, i, k, X1s_gpu, 0,
                                                 0)
                c_g_X2s = copy_to_device_noorder(M, kc, X2, 0, k, X2s_gpu, 0,
                                                 0)
                kernel._apply(c_g_X1s, c_g_X2s.T, c_g_ker)
            kernel._finalize(c_g_ker, ddd)
            # Multiply by the vector v
            c_g_mmv = mmv_gpu[:ic, :]
            torch.mm(c_g_ker, v_gpu, out=c_g_mmv)  # n x T
            # Copy back to host
            copy_to_host_noorder(ic, T, c_g_mmv, 0, 0, out, i, 0)
    return out
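
The block sizes returned by select_dim_over_d satisfy the budget listed in the comment above, n*d + n*(M + T) + d*M + M*T <= avail_mem. As a rough sketch of that arithmetic with d fixed to the full dimension (falkon's helper also trades off d; this special case is only illustrative):

import math

def rows_per_block(avail_elems: float, D: int, M: int, T: int) -> int:
    # Solve n*D + n*(M + T) + D*M + M*T <= avail_elems for n.
    n = (avail_elems - D * M - M * T) / (D + M + T)
    return max(1, int(math.floor(n)))

# e.g. ~1 GiB worth of float64 elements, D=100 features, M=1000 centers, T=5 columns
print(rows_per_block(2**30 / 8, D=100, M=1000, T=5))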
Example #5
    def test_zero(self, mat, upper, preserve_diag, order, device):
        inpt1 = fix_mat(mat,
                        dtype=mat.dtype,
                        order=order,
                        copy=True,
                        numpy=True)
        inpt2 = inpt1.copy(order="K")

        k = 1 if preserve_diag else 0
        if upper:
            tri_fn = partial(np.triu, k=k)
        else:
            tri_fn = partial(np.tril, k=-k)

        inpt1 = torch.from_numpy(inpt1)
        inpt1_dev = create_same_stride(inpt1.shape, inpt1, inpt1.dtype, device)
        inpt1_dev.copy_(inpt1)
        mul_triang(inpt1_dev,
                   upper=upper,
                   preserve_diag=preserve_diag,
                   multiplier=0)
        inpt1 = inpt1_dev.cpu().numpy()

        assert np.sum(tri_fn(inpt1)) == 0

        if preserve_diag:
            inpt2_dev = inpt1_dev
            inpt2_dev.copy_(torch.from_numpy(inpt2))
            zero_triang(inpt2_dev, upper=upper)
            inpt2 = inpt2_dev.cpu().numpy()
            np.testing.assert_allclose(inpt1, inpt2)
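
The property checked by test_zero can be restated in plain NumPy: multiplying one triangle by zero leaves only the complementary triangle (and, optionally, the diagonal). A small sketch with an arbitrary matrix:

import numpy as np

A = np.arange(16, dtype=np.float64).reshape(4, 4)
preserve_diag = True
k = 1 if preserve_diag else 0

zeroed = A.copy()
zeroed[np.triu_indices_from(zeroed, k=k)] = 0.0  # zero the (strictly) upper triangle

assert np.sum(np.triu(zeroed, k=k)) == 0         # same assertion as in the test above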
Example #6
    def test_with_x0(self, mat, vec_rhs, conjgrad, order, device):
        if order == "F":
            mat = torch.from_numpy(np.asfortranarray(mat.numpy()))
            vec_rhs = torch.from_numpy(np.asfortranarray(vec_rhs.numpy()))
        mat = move_tensor(mat, device)
        vec_rhs = move_tensor(vec_rhs, device)
        init_sol = create_same_stride(vec_rhs.size(), vec_rhs, vec_rhs.dtype,
                                      device)
        init_sol.fill_(0.0)

        x = conjgrad.solve(X0=init_sol,
                           B=vec_rhs,
                           mmv=lambda x_: mat @ x_,
                           max_iter=10,
                           callback=None)

        assert x.data_ptr() == init_sol.data_ptr(), "Initial solution vector was copied"
        assert str(x.device) == device, "Device has changed unexpectedly"
        assert x.shape == (self.t, vec_rhs.shape[1]), "Output shape is incorrect"
        assert x.stride() == vec_rhs.stride(), "Stride has changed unexpectedly"
        expected = np.linalg.solve(mat.cpu().numpy(), vec_rhs.cpu().numpy())
        np.testing.assert_allclose(expected, x.cpu().numpy(), rtol=1e-6)
Example #7
def incore_fmmv(mat: torch.Tensor,
                vec: torch.Tensor,
                out: Optional[torch.Tensor] = None,
                transpose: bool = False,
                opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if not check_same_dtype(mat, vec, out):
        raise TypeError("Data types of input matrices must be equal.")
    if not check_same_device(mat, vec, out):
        raise RuntimeError("All input arguments to incore_fmmv must be on the same device")

    if out is None:
        if transpose:
            out_shape = (mat.shape[1], vec.shape[1])
        else:
            out_shape = (mat.shape[0], vec.shape[1])
        out = create_same_stride(out_shape, mat, mat.dtype, device=mat.device, pin_memory=False)
    out.fill_(0.0)

    if mat.is_cuda:
        s1 = torch.cuda.Stream()
        with torch.cuda.stream(s1):
            if transpose:
                out.addmm_(mat.T, vec, beta=0.0)
            else:
                out.addmm_(mat, vec, beta=0.0)
            s1.synchronize()
    else:
        if transpose:
            out.addmm_(mat.T, vec, beta=0.0)
        else:
            out.addmm_(mat, vec, beta=0.0)
    return out
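
Functionally incore_fmmv reduces to a single, optionally transposed, matrix product. A quick usage sketch on CPU tensors (assuming the falkon helpers used inside the function are importable):

import torch

mat = torch.randn(6, 4)
vec = torch.randn(4, 2)
assert torch.allclose(incore_fmmv(mat, vec), mat @ vec)

# With transpose=True the product is mat.T @ vec, so vec must have mat.shape[0] rows.
vec_t = torch.randn(6, 2)
assert torch.allclose(incore_fmmv(mat, vec_t, transpose=True), mat.T @ vec_t)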
Example #8
    def test_mul(self, mat, upper, preserve_diag, order, device):
        inpt1 = fix_mat(mat,
                        dtype=mat.dtype,
                        order=order,
                        copy=True,
                        numpy=True)

        k = 1 if preserve_diag else 0
        if upper:
            tri_fn = partial(np.triu, k=k)
            other_tri_fn = partial(np.tril, k=k - 1)
        else:
            tri_fn = partial(np.tril, k=-k)
            other_tri_fn = partial(np.triu, k=-k + 1)

        inpt1 = torch.from_numpy(inpt1)
        inpt1_dev = create_same_stride(inpt1.shape, inpt1, inpt1.dtype, device)
        inpt1_dev.copy_(inpt1)
        mul_triang(inpt1_dev,
                   upper=upper,
                   preserve_diag=preserve_diag,
                   multiplier=10**6)
        inpt1 = inpt1_dev.cpu().numpy()

        assert np.mean(tri_fn(inpt1)) > 10**5
        assert np.mean(other_tri_fn(inpt1)) < 1
Example #9
    def init(self, X: Union[torch.Tensor, SparseTensor]):
        """Initialize the preconditioner matrix.

        This method must be called before the preconditioner can be used.

        Parameters
        ----------
        X : MxD tensor
            The matrix of Nystroem centers
        """
        dtype = X.dtype
        eps = self.params.pc_epsilon(X.dtype)

        M = X.size(0)

        with TicToc("Kernel", debug=self.params.debug):
            if isinstance(X, torch.Tensor):
                C = create_same_stride((M, M), X, dtype=dtype, device='cpu',
                                       pin_memory=self._use_cuda)
            else:  # If sparse tensor we need fortran for kernel calculation
                C = create_fortran((M, M), dtype=dtype, device='cpu', pin_memory=self._use_cuda)
            self.kernel(X, X, out=C, opt=self.params)
        self.fC = C.numpy()
        if not is_f_contig(C):
            self.fC = self.fC.T

        with TicToc("Cholesky 1", debug=self.params.debug):
            # Compute T: lower(fC) = T.T
            inplace_add_diag(self.fC, eps * M)
            self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                    use_cuda=self._use_cuda, opt=self.params)
            # Save the diagonal which will be overwritten when computing A
            self.dT = C.diag()

        with TicToc("Copy triangular", debug=self.params.debug):
            # Copy lower(fC) to upper(fC):  upper(fC) = T.
            copy_triang(self.fC, upper=False)

        if self._use_cuda:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC, upper=True, use_cuda=self._use_cuda, opt=self.params)
        else:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC, upper=False, use_cuda=self._use_cuda, opt=self.params)

        with TicToc("Cholesky 2", debug=self.params.debug):
            # lower(fC) = 1/M * T@T.T
            self.fC = mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / M)
            # lower(fC) = 1/M * T@T.T + lambda * I
            inplace_add_diag(self.fC, self._lambda)
            # Cholesky on lower(fC) : lower(fC) = A.T
            self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                    use_cuda=self._use_cuda, opt=self.params)
            self.dA = C.diag()
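
In matrix terms, the two Cholesky steps above build the preconditioner factors T and A from the M x M kernel matrix. A compact dense sketch of the same algebra (NumPy only, no in-place storage tricks; K_MM, lam and eps are assumed inputs):

import numpy as np

def preconditioner_factors(K_MM: np.ndarray, lam: float, eps: float):
    M = K_MM.shape[0]
    # T is upper triangular with T.T @ T = K_MM + eps*M*I  (first Cholesky)
    T = np.linalg.cholesky(K_MM + eps * M * np.eye(M)).T
    # A is upper triangular with A.T @ A = (1/M) * T @ T.T + lam*I  (second Cholesky)
    A = np.linalg.cholesky(T @ T.T / M + lam * np.eye(M)).T
    return T, A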
Example #10
    def solve(self, X0, B, mmv, max_iter, callback=None):
        t_start = time.time()

        if X0 is None:
            R = copy_same_stride(B)
            X = create_same_stride(B.size(), B, B.dtype, B.device)
            X.fill_(0.0)
        else:
            R = B - mmv(X0)
            X = X0

        m_eps = self.params.cg_epsilon(X.dtype)

        P = R
        # noinspection PyArgumentList
        Rsold = torch.sum(R.pow(2), dim=0)

        e_train = time.time() - t_start

        for i in range(max_iter):
            with TicToc("Chol Iter", debug=False):  # TODO: FIXME
                t_start = time.time()
                AP = mmv(P)
                # noinspection PyArgumentList
                alpha = Rsold / (torch.sum(P * AP, dim=0) + m_eps)
                X.addmm_(P, torch.diag(alpha))

                if (i + 1) % self.params.cg_full_gradient_every == 0:
                    if (X.is_cuda):
                        # addmm_ may not be finished yet causing mmv to get stale inputs.
                        torch.cuda.synchronize()
                    R = B - mmv(X)
                else:
                    R = R - torch.mm(AP, torch.diag(alpha))
                    # R.addmm_(mat1=AP, mat2=torch.diag(alpha), alpha=-1.0)

                # noinspection PyArgumentList
                Rsnew = torch.sum(R.pow(2), dim=0)
                if Rsnew.abs().max().sqrt() < self.params.cg_tolerance:
                    print("Stopping conjugate gradient descent at "
                          "iteration %d. Solution has converged." % (i + 1))
                    break

                P = R + torch.mm(P, torch.diag(Rsnew / (Rsold + m_eps)))
                if P.is_cuda:
                    # P must be synced so that it's correct for mmv in next iter.
                    torch.cuda.synchronize()
                Rsold = Rsnew

                e_iter = time.time() - t_start
                e_train += e_iter
            with TicToc("Chol callback", debug=False):
                if callback is not None:
                    callback(i + 1, X, e_train)

        return X
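
The solver above is conjugate gradient run on several right-hand sides at once. A minimal single-vector NumPy version of the same recurrences (no timing, callbacks or periodic full-gradient refresh):

import numpy as np

def cg(A: np.ndarray, b: np.ndarray, max_iter: int = 100, tol: float = 1e-8) -> np.ndarray:
    x = np.zeros_like(b)
    r = b - A @ x
    p = r.copy()
    rs_old = r @ r
    for _ in range(max_iter):
        Ap = A @ p
        alpha = rs_old / (p @ Ap)
        x += alpha * p
        r -= alpha * Ap
        rs_new = r @ r
        if np.sqrt(rs_new) < tol:
            break
        p = r + (rs_new / rs_old) * p
        rs_old = rs_new
    return x

# Quick check against a direct solve on a small SPD system
rng = np.random.default_rng(0)
Q = rng.standard_normal((5, 5))
A = Q @ Q.T + 5 * np.eye(5)
b = rng.standard_normal(5)
assert np.allclose(cg(A, b), np.linalg.solve(A, b), atol=1e-6)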
Example #11
def fmm_cuda(X1: torch.Tensor,
             X2: torch.Tensor,
             kernel: 'falkon.kernels.Kernel',
             out: Optional[torch.Tensor] = None,
             opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    performs fnc(X1*X2', X1, X2) in blocks on multiple GPUs
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (out, 'out'))

    N = X1.shape[0]
    M = X2.shape[0]
    device = X1.device
    if out is None:
        out = create_same_stride((N, M),
                                 X1,
                                 X1.dtype,
                                 device=device,
                                 pin_memory=False)
    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # If float32 we need to upcast to float64 to avoid numerical precision errors
    # in the kernel
    gpu_dtype = X1.dtype
    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        gpu_dtype = torch.float64

    if device.type == 'cuda':
        sync_current_stream(device)
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFmm(X1=X1,
                       X2=X2,
                       out=out,
                       kernel=kernel,
                       gpu_dtype=gpu_dtype,
                       max_mem=single_gpu_info.usable_ram,
                       num_streams=opt.num_fmm_streams)
        _call_direct(_generic_fmm, (args, device.index))
    else:
        # Create the arguments passed to each subprocess
        args = []
        for i, g in enumerate(gpu_info):
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            args.append((ArgsFmm(X1=X1.narrow(0, block_sizes[i], bwidth),
                                 X2=X2,
                                 out=out.narrow(0, block_sizes[i], bwidth),
                                 kernel=kernel,
                                 gpu_dtype=gpu_dtype,
                                 max_mem=g.usable_ram,
                                 num_streams=opt.num_fmm_streams), g.Id))
        _start_wait_processes(_generic_fmm, args)
    return out
Example #12
def fmmv_cuda(X1: torch.Tensor,
              X2: torch.Tensor,
              v: torch.Tensor,
              kernel,
              out: Optional[torch.Tensor] = None,
              opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T

    performs  fnc(X1*X2', X1, X2) * v   : N x T
    in blocks on multiple GPUs
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (out, 'out'))
    device = X1.device

    N = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_same_stride((N, v.size(1)), X1, v.dtype, device=device,
                                 pin_memory=device.type != 'cuda')
    out.fill_(0.0)

    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fmmv
    else:
        target = generic_fmmv

    gpu_info = _get_gpu_info(opt, slack=0.9)

    if device.type == 'cuda':
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFmmv(X1=X1, X2=X2, v=v, out=out, kernel=kernel,
                        max_mem=single_gpu_info.usable_ram)
        _call_direct(target, (args, device.index))
    else:
        block_sizes = calc_gpu_block_sizes(gpu_info, N)
        # Create queues
        args = []  # Arguments passed to each subprocess
        for i, g in enumerate(gpu_info):
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            args.append((ArgsFmmv(
                X1=X1.narrow(0, block_sizes[i], bwidth),
                X2=X2, v=v,
                out=out.narrow(0, block_sizes[i], bwidth),
                kernel=kernel, max_mem=g.usable_ram), g.Id))

        _start_wait_processes(target, args)
    return out
Example #13
def fmmv_cuda(X1: torch.Tensor,
              X2: torch.Tensor,
              v: torch.Tensor,
              kernel,
              out: Optional[torch.Tensor] = None,
              opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T

    performs  fnc(X1*X2', X1, X2) * v   : N x T
    in blocks on multiple GPUs
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (out, 'out'))

    N = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_same_stride((N, v.size(1)),
                                 X1,
                                 v.dtype,
                                 'cpu',
                                 pin_memory=True)
    out.fill_(0.0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # Create queues
    args = []  # Arguments passed to each subprocess
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0: continue
        args.append((ArgsFmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                              X2=X2,
                              v=v,
                              out=out.narrow(0, block_sizes[i], bwidth),
                              kernel=kernel,
                              max_mem=g.usable_ram), g.Id))

    # If using torch multiprocessing with spawn/forkserver here we must make sure
    # that any consumer of the queues is on a different process than the queue producer.
    # This is due to passing in a CUDA tensor to the queue
    # https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
    # Thus we cannot run the first task on the current process.
    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fmmv
    else:
        target = generic_fmmv
    _start_wait_processes(target, args)
    return out
Example #14
    def solve(self, X0, B, mmv, max_iter, callback=None):
        t_start = time.time()

        if X0 is None:
            R = copy_same_stride(B)
            X = create_same_stride(B.size(), B, B.dtype, B.device)
            X.fill_(0.0)
        else:
            R = B - mmv(X0)
            X = X0

        m_eps = self.params.cg_epsilon(X.dtype)

        P = R
        Rsold = torch.sum(R.pow(2), dim=0)

        e_train = time.time() - t_start

        for i in range(max_iter):
            with TicToc("Chol Iter", debug=False):
                t_start = time.time()
                AP = mmv(P)
                alpha = Rsold / (torch.sum(P * AP, dim=0) + m_eps)
                X.addmm_(P, torch.diag(alpha))

                if (i + 1) % self.params.cg_full_gradient_every == 0:
                    R = B - mmv(X)
                else:
                    R = R - torch.mm(AP, torch.diag(alpha))
                    # R.addmm_(mat1=AP, mat2=torch.diag(alpha), alpha=-1.0)

                Rsnew = torch.sum(R.pow(2), dim=0)
                if Rsnew.abs().max().sqrt() < self.params.cg_tolerance:
                    print("Stopping conjugate gradient descent at "
                          "iteration %d. Solution has converged." % (i + 1))
                    break

                P = R + torch.mm(P, torch.diag(Rsnew / (Rsold + m_eps)))
                Rsold = Rsnew

                e_iter = time.time() - t_start
                e_train += e_iter
            with TicToc("Chol callback", debug=False):
                if callback is not None:
                    callback(i + 1, X, e_train)

        return X
Example #15
File: fmmv_cpu.py  Project: gpleiss/falkon
def fmmv_cpu_sparse(X1: SparseTensor, X2: SparseTensor, v: torch.Tensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor], opt: BaseOptions):
    opt = _setup_opt(opt, is_cpu=True)

    dtype = X1.dtype
    ntot, dtot = X1.size()
    mtot, T = v.size()

    # Create output matrix
    if out is None:
        out = torch.empty(ntot, T, dtype=dtype)
    out.fill_(0.0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Narrowing X1, X2: n + m
    # Prepare - not computable, depends on kernel
    # ker_chunk : n*m
    # finalize : 0 (if can be implemented in place, kernel-dependent)
    n, m = select_dim_over_m(maxM=mtot,
                             maxN=ntot,
                             coef_nm=1,
                             coef_n=1,
                             coef_m=1,
                             tot=avail_mem)

    ker_chunk = create_same_stride((n, m), out, dtype, device='cpu')
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)
        cur_out = out[i:i + ic, :]
        X1_chunk = X1.narrow_rows(i, ic)
        for j in range(0, mtot, m):
            jc = min(m, mtot - j)
            X2_chunk = X2.narrow_rows(j, jc)
            cur_ker_chunk = ker_chunk[:ic, :jc]
            cur_ker_chunk.fill_(0.0)

            ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
            kernel._apply_sparse(X1_chunk, X2_chunk.transpose_csc(),
                                 cur_ker_chunk)
            kernel._finalize(cur_ker_chunk, ddd)

            # Multiply by the vector v
            cur_out.addmm_(cur_ker_chunk, v.narrow(0, j, jc))
    return out
Example #16
def fmm_cuda(X1: torch.Tensor,
             X2: torch.Tensor,
             kernel: 'falkon.kernels.Kernel',
             out: Optional[torch.Tensor] = None,
             opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    performs fnc(X1*X2', X1, X2) in blocks on multiple GPUs
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (out, 'out'))

    N = X1.size(0)
    M = X2.size(0)
    if out is None:
        out = create_same_stride((N, M), X1, X1.dtype, 'cpu', pin_memory=True)
    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # If float32 we need to upcast to float64 to avoid numerical precision errors
    # in the kernel
    gpu_dtype = X1.dtype
    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        gpu_dtype = torch.float64

    # Create the arguments passed to each subprocess
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0: continue
        args.append((ArgsFmm(X1=X1.narrow(0, block_sizes[i], bwidth),
                             X2=X2,
                             out=out.narrow(0, block_sizes[i], bwidth),
                             kernel=kernel,
                             gpu_dtype=gpu_dtype,
                             max_mem=g.usable_ram), g.Id))
    _start_wait_processes(_generic_fmm, args)
    torch.cuda.empty_cache()
    return out
Example #17
def _generic_fmm(proc_idx, queue, device_id):
    # Unpack the function arguments
    a: ArgsFmm = queue.get()
    X1: torch.Tensor = a.X1
    X2: torch.Tensor = a.X2
    cuda_inputs = X1.is_cuda
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem
    num_streams = a.num_streams

    # flags and local variables
    change_dtype = gpu_dtype != X1.dtype
    X1_equal_X2 = _gpu_tns_same_memory(X1, X2)
    use_gpu_bufs = change_dtype or not cuda_inputs
    stride = "F" if is_f_contig(out, strict=True) else "C"
    j_iter = 0
    dts = sizeof_dtype(gpu_dtype)
    tc_device = torch.device('cuda:%d' % (int(device_id)))
    avail_mem = max_mem / dts

    # Choose block sizes n, m such that we won't run out of GPU memory
    ntot, d = X1.shape
    mtot = X2.shape[0]
    extra_mem = kernel.extra_mem()
    if cuda_inputs and not change_dtype:
        # No allocation will be performed by us. Only in-kernel stuff.
        n, m = select_dim_over_nm(max_n=ntot,
                                  max_m=mtot,
                                  d=d,
                                  coef_nd=extra_mem.get('nd', 0),
                                  coef_md=extra_mem.get('md', 0),
                                  coef_nm=extra_mem.get('nm', 0),
                                  coef_n=extra_mem.get('n', 0),
                                  coef_m=extra_mem.get('m', 0),
                                  rest=extra_mem.get('d', 0),
                                  max_mem=avail_mem)
    else:
        n, m = select_dim_over_nm(
            max_n=ntot,
            max_m=mtot,
            d=d,
            coef_nd=num_streams * (extra_mem.get('nd', 0) + 1),
            coef_md=num_streams * (extra_mem.get('md', 0) + 1),
            coef_nm=num_streams * (extra_mem.get('nm', 0) + 1),
            coef_n=extra_mem.get('n', 0),
            coef_m=extra_mem.get('m', 0),
            rest=extra_mem.get('d', 0),
            max_mem=avail_mem)

    # Create streams
    streams = [tcd.Stream(device=tc_device) for _ in range(num_streams)]

    # Create buffers
    if use_gpu_bufs:
        gX1 = create_same_stride((n, d), X1, gpu_dtype, tc_device)
        gX2_list = [
            create_same_stride((m, d), X2, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
        gout_list = [
            create_same_stride((n, m), out, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
    if not cuda_inputs:
        cpu_buf_list = [
            create_same_stride((n, m), out, gpu_dtype, 'cpu', pin_memory=True)
            for _ in range(num_streams)
        ]

    # Define helpers for the copy-back operations (from cpu_buf to output)
    copy_ops = [None] * num_streams

    def wrap_copy_op(stream_idx):
        if copy_ops[stream_idx] is not None:
            copy_ops[stream_idx]()
            copy_ops[stream_idx] = None

    def do_copy_op(output, buf, i_, ic_, j_, jc_):
        # This function will also do the type conversion
        output[i_:i_ + ic_, j_:j_ + jc_].copy_(buf[:ic_, :jc_])

    # Kernel computation begin
    with tcd.device(tc_device):
        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            with tcd.stream(streams[j_iter % len(streams)]):
                X1_chunk = X1.narrow(0, i, ic)
                if use_gpu_bufs:
                    cur_gX1 = gX1.narrow(0, 0, ic)
                    cur_gX1.copy_(X1_chunk, non_blocking=True)
                else:
                    cur_gX1 = X1_chunk

            for j in range(0, mtot, m):
                jc = min(m, mtot - j)
                # Choose the buffers for this inner iteration
                stream_id = j_iter % len(streams)
                stream = streams[stream_id]
                if use_gpu_bufs:
                    gX2 = gX2_list[stream_id]
                    gout = gout_list[stream_id]
                if not cuda_inputs:
                    cpu_buf = cpu_buf_list[stream_id]

                # Sync for buffers we must use now (e.g. 2 previous iters)
                with tcd.stream(stream):  # Inner-loop
                    stream.synchronize()
                    wrap_copy_op(stream_id)

                    if X1_equal_X2 and j < i:  # Shortcut for symmetric kernels
                        jc = min(m, mtot - j)
                        out[i:i + ic, j:j + jc].copy_(out[j:j + jc,
                                                          i:i + ic].T,
                                                      non_blocking=True)
                        j_iter += 1
                        continue

                    # Copy (CPU->GPU)
                    X2_chunk = X2.narrow(0, j, jc)
                    if use_gpu_bufs:
                        cur_gX2 = gX2.narrow(0, 0, jc)
                        cur_gX2.copy_(X2_chunk, non_blocking=True)
                    else:
                        cur_gX2 = X2_chunk

                    if use_gpu_bufs:
                        cur_gout = gout[:ic, :jc]
                    else:
                        cur_gout = out[i:i + ic, j:j + jc]
                    cur_gout.fill_(0.0)

                    # Compute
                    ddd = kernel._prepare(cur_gX1, cur_gX2)
                    kernel._apply(cur_gX1, cur_gX2.T, cur_gout)
                    cur_gout = kernel._finalize(cur_gout, ddd)

                    # Copy Back (GPU->CPU)
                    if not cuda_inputs:
                        # copy_ does not care about the contiguity of copies, as long as it's consistent.
                        # However, for C-contiguous inputs it creates an undesired intermediate array,
                        # so we use cuda_memcpy2d_async, which works well with C-contiguous arrays.
                        if stride == "F":
                            copy_to_host(ic,
                                         jc,
                                         cur_gout,
                                         0,
                                         0,
                                         cpu_buf,
                                         0,
                                         0,
                                         s=stream)
                        else:
                            cuda_memcpy2d_async(dst=cpu_buf.data_ptr(),
                                                dpitch=cpu_buf.stride(0) * dts,
                                                src=cur_gout.data_ptr(),
                                                spitch=cur_gout.stride(0) *
                                                dts,
                                                width=jc * dts,
                                                height=ic,
                                                stream=stream._as_parameter_)
                        copy_ops[stream_id] = partial(do_copy_op, out, cpu_buf,
                                                      i, ic, j, jc)
                    elif change_dtype:
                        out.narrow(0, i,
                                   ic).narrow(1, j,
                                              jc).copy_(cur_gout,
                                                        non_blocking=True)
                j_iter += 1

            for i in range(num_streams):
                streams[i].synchronize()
                wrap_copy_op(i)

    return out
Example #18
def fdmmv_cuda(X1: torch.Tensor,
               X2: torch.Tensor,
               v: Optional[torch.Tensor],
               w: Optional[torch.Tensor],
               kernel,
               out: Optional[torch.Tensor] = None,
               opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T
    w  : N x T

    performs fnc(X1*X2', X1, X2)' * ( fnc(X1*X2', X1, X2) * v  +  w )  : M x T
    in blocks on multiple GPUs

    Assume all inputs have the same data type
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (w, 'w'), (out, 'out'))
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    if out is None:
        out = create_same_stride((M, T), X1, X1.dtype, 'cpu', pin_memory=True)

    wrlk = []  # outputs for each subprocess.
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue

        cur_out_gpu = create_same_stride((M, T), X1, X1.dtype,
                                         f'cuda:{gpu_info[i].Id}')  # M x T
        wrlk.append(cur_out_gpu)

        cur_w = None
        if w is not None:
            cur_w = w.narrow(0, block_sizes[i], bwidth)
        args.append((ArgsFdmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                               X2=X2,
                               v=v,
                               w=cur_w,
                               out=cur_out_gpu,
                               kernel=kernel,
                               max_mem=g.usable_ram), g.Id))

    # If using torch multiprocessing with spawn/forkserver here we must make sure
    # that any consumer of the queues is on a different process than the queue producer.
    # This is due to passing in a CUDA tensor to the queue
    # https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
    # Thus we cannot run the first task on the current process.
    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fdmmv
    else:
        target = generic_fdmmv
    _start_wait_processes(target, args)

    if len(wrlk) > 1:
        # noinspection PyTypeChecker
        fastest_device: int = np.argmax([d.speed for d in gpu_info])
        out.copy_(
            tcd.comm.reduce_add(wrlk, destination=gpu_info[fastest_device].Id))
    else:
        out.copy_(wrlk[0])
    return out
Example #19
    def init(self, X: Union[torch.Tensor, SparseTensor], Y: torch.Tensor,
             alpha: torch.Tensor, penalty: float, N: int) -> None:
        """Initialize the preconditioner matrix.

        This method must be called before the preconditioner becomes usable.

        Parameters
        ----------
        X : MxD tensor
            Matrix of Nystroem centers
        Y : Mx1 tensor
            Vector of targets corresponding to the Nystroem centers `X`
        alpha : Mx1 tensor
            Parameter vector (of the same dimension as `Y`) which gives the current
            solution to the optimization problem.
        penalty : float
            Regularization amount
        N : int
            Number of points in the full data-set.

        Notes
        -----
        If `debug=True` is present in the options, this method will print a lot of extra
        information pertaining to the timings of the various preconditioner operations. This can be
        useful to help understand how the preconditioner works.
        """
        if Y.shape[1] != 1:
            raise ValueError(
                "Logistic preconditioner can only deal with 1D outputs.")

        dtype = X.dtype
        M = X.size(0)

        eps = self.params.pc_epsilon(dtype)

        if self.fC is None:
            # This is done only at the first iteration of the logistic-falkon algorithm
            # It sets the `T` variable from the paper (chol(kMM)) to the upper part of `self.fC`
            with TicToc("Kernel", debug=self.params.debug):
                if isinstance(X, torch.Tensor):
                    C = create_same_stride((M, M),
                                           X,
                                           dtype=dtype,
                                           device='cpu',
                                           pin_memory=self._use_cuda)
                else:  # If sparse tensor we need fortran for kernel calculation
                    C = create_fortran((M, M),
                                       dtype=dtype,
                                       device='cpu',
                                       pin_memory=self._use_cuda)
                self.kernel(X, X, out=C, opt=self.params)
            self.fC = C.numpy()
            if not is_f_contig(C):
                self.fC = self.fC.T

            with TicToc("Add diag", debug=self.params.debug):
                # Compute T: lower(fC) = T.T
                inplace_add_diag(self.fC, eps * M)
            with TicToc("Cholesky 1", debug=self.params.debug):
                self.fC = potrf_wrapper(self.fC,
                                        clean=True,
                                        upper=False,
                                        use_cuda=self._use_cuda,
                                        opt=self.params)
                # Save the diagonal which will be overwritten when computing A
                self.dT = C.diag()
            with TicToc("Copy triangular", debug=self.params.debug):
                # Copy lower(fC) to upper(fC):  upper(fC) = T.
                copy_triang(self.fC, upper=False)
        else:
            if not self._use_cuda:
                # Copy non-necessary for cuda since LAUUM will do the copying
                with TicToc("Copy triangular", debug=self.params.debug):
                    # Copy upper(fC) to lower(fC): lower(fC) = T.T
                    copy_triang(self.fC,
                                upper=True)  # does not copy the diagonal
            # Setting diagonal necessary for trmm
            inplace_set_diag(self.fC, self.dT)

        # Compute W
        with TicToc("TRMM", debug=self.params.debug):
            # T is on upper(fC). Compute T.T @ alpha
            alpha = self._trmm(alpha.clone())
        with TicToc("W (ddf)", debug=self.params.debug):
            W = self.loss.ddf(Y, alpha)
        with TicToc("W-Multiply", debug=self.params.debug):
            W.sqrt_()
            self.fC = vec_mul_triang(self.fC,
                                     W.numpy().reshape(-1),
                                     side=0,
                                     upper=False)

        if self._use_cuda:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC,
                                        upper=True,
                                        use_cuda=self._use_cuda,
                                        opt=self.params)
        else:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC,
                                        upper=False,
                                        use_cuda=self._use_cuda,
                                        opt=self.params)

        # NOTE: Here the multiplier is 1/N instead of the more common 1/M!
        mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / N)

        with TicToc("Add diag", debug=self.params.debug):
            # lower(fC) = 1/N * T@T.T + lambda * I
            inplace_add_diag(self.fC, penalty)

        with TicToc("Cholesky 2", debug=self.params.debug):
            # Cholesky on lower(fC) : lower(fC) = A.T
            self.fC = potrf_wrapper(self.fC,
                                    clean=False,
                                    upper=False,
                                    use_cuda=self._use_cuda,
                                    opt=self.params)
            self.dA = torch.from_numpy(self.fC).diag()
Example #20
def sparse_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()

    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, out = a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk : N + 2*D*N*density
    # X2_chunk : D + 2*D*M*density (because is transposed)
    # sparse_out : N + 2*N*M*(density) (assume density = 1)
    # ker_gpu  : M*N
    # mmv_gpu  : N*T
    # v_gpu    : M*T
    # Other: GPU buffer
    n, m = select_dim_over_m(
        maxM=mtot,
        maxN=ntot,
        tot=avail_mem,
        coef_nm=3,
        coef_n=2 + 2 * dtot * X1.density + T,
        coef_m=2 * dtot * X2.density + T,
        rest=dtot,
    )

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        v_gpu = v.to(device=ddev)  # M x T
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)
        # ker_gpu should be fortran-ordered due to cusparse csr2dense function
        ker_gpu = create_fortran((n, m), dtype=dtype, device=ddev)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            cur_mmv_gpu = mmv_gpu[:ic]  # n x T
            cur_mmv_gpu.fill_(0.0)

            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            for j in range(0, mtot, m):
                jc = min(m, mtot - j)

                X2_chunk = X2.narrow_rows(j, jc)
                # Prepare sparse on CPU
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)

                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_chunk_d = SparseTensor.from_scipy(
                    X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                    .index_to_int() \
                    .to(device=ddev)

                cur_ker_gpu = ker_gpu[:ic, :jc]
                cur_ker_gpu.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                cur_ker_gpu = kernel._apply_sparse(X1_chunk_d, X2_chunk_d,
                                                   cur_ker_gpu)
                cur_ker_gpu = kernel._finalize(cur_ker_gpu, ddd)

                # Multiply by the vector v
                cur_mmv_gpu.addmm_(cur_ker_gpu, v_gpu.narrow(0, j, jc))
                del ddd, X2_chunk, X2_chunk_d

            # send result to CPU
            copy_to_host_noorder(ic, T, cur_mmv_gpu, 0, 0, out, i, 0)
            del X1_chunk, X1_chunk_d
    return out
Example #21
def distk_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem
    N, D = X1.size()
    M = X2.size(0)
    T = v.size(1) if v is not None else w.size(1)
    dtype = X1.dtype

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1ss : n x d
    # X2s  : M x d
    # Kv   : n x T
    # out  : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where if we let avail_mem like it is
    #        for 32-bit data-types some copy fails. In such case we need
    #        to free up some more memory and then everything runs fine.
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_d(maxD=D,
                             maxN=N,
                             coef_nd=1,
                             coef_n=M + T + 1,
                             coef_d=M,
                             rest=rest_coef + M,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    s1 = tcd.Stream()
    s2 = tcd.Stream()

    with tcd.device(ddev), tcd.stream(s1):
        if v is not None:
            v_gpu = create_same_stride((M, T), v, dtype, ddev)
            copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
        K_gpu = create_same_stride((n, M), X1, dtype, ddev)
        X1ss_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        Kv_gpu = create_same_stride((n, T), X1, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        sq1_gpu = create_same_stride((n, ), X1, dtype, ddev)
        sq2_gpu = create_same_stride((M, ), X1, dtype, ddev)

        #if (d == D):
        #    with torch.cuda.stream(s2):
        #        cur_X2s_gpu = copy_to_device_noorder(M, d, X2, 0, 0, X2s_gpu, 0, 0, s=s2)
        #        torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True, out=sq2_gpu).pow_(2)

        for i in range(0, N, n):
            nb = min(N - i, n)

            cur_K_gpu = K_gpu.narrow(0, 0, nb)  # nb x M
            cur_K_gpu.fill_(0.0)

            for j in range(0, D, d):
                db = min(D - j, d)
                # Parallelize two matrix transfers (probably pointless)
                #if d < D:
                with torch.cuda.stream(s2):
                    cur_X2s_gpu = copy_to_device_noorder(M,
                                                         db,
                                                         X2,
                                                         0,
                                                         j,
                                                         X2s_gpu,
                                                         0,
                                                         0,
                                                         s=s2)
                    torch.norm(cur_X2s_gpu,
                               p=2,
                               dim=1,
                               keepdim=True,
                               out=sq2_gpu).pow_(2)
                cur_X1ss_gpu = copy_to_device_noorder(nb,
                                                      db,
                                                      X1,
                                                      i,
                                                      j,
                                                      X1ss_gpu,
                                                      0,
                                                      0,
                                                      s=s1)
                torch.norm(cur_X1ss_gpu, p=2, dim=1, keepdim=True,
                           out=sq1_gpu).pow_(2)

                s2.synchronize()
                s1.synchronize()
                cur_K_gpu.addmm_(mat1=cur_X1ss_gpu,
                                 mat2=cur_X2s_gpu.T,
                                 alpha=-2.0)
                cur_K_gpu.add_(sq1_gpu)
                cur_K_gpu.add_(sq2_gpu.T)
                cur_K_gpu.clamp_min_(0)

            cur_K_gpu = kernel._transform(cur_K_gpu)

            if w is not None:
                # Copy split w to GPU into cur_Kv_gpu,
                cur_Kv_gpu = copy_to_device_noorder(nb,
                                                    T,
                                                    w,
                                                    i,
                                                    0,
                                                    Kv_gpu,
                                                    0,
                                                    0,
                                                    s=s1)  # n x T
                if v is not None:
                    cur_Kv_gpu.addmm_(cur_K_gpu, v_gpu)
            else:
                # v cannot be None if w is None
                cur_Kv_gpu = Kv_gpu.narrow(0, 0, nb)  # n x T
                torch.mm(cur_K_gpu, v_gpu, out=cur_Kv_gpu)  # n x T

            # Multiply transposed kernel with the Kv result.
            out_gpu.addmm_(cur_K_gpu.T, cur_Kv_gpu)  # M x T
            s1.synchronize()
        s1.synchronize()

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
Example #22
def distk_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()
    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem

    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]
    dtype = X1.dtype

    # GPU memory usage:
    # X1s : n x D
    # X2s : m x D
    # vs  : m x T
    # nm  : n x m
    # out : n x T
    # -----------
    # total: n*m + n * (D + T) + m * (D + T) = R
    avail_mem = max_mem / sizeof_dtype(dtype)
    #if sizeof_dtype(dtype) == 4:
    #    avail_mem /= 2
    n, m = select_dim_over_m(maxM=M,
                             maxN=N,
                             coef_nm=1.0,
                             coef_n=D + T,
                             coef_m=D + T,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        nm_gpu = create_same_stride((n, m), X1, dtype, ddev)
        out_gpu = create_same_stride((n, T), out, dtype, ddev)
        X1s_gpu = create_same_stride((n, D), X1, dtype, ddev)
        X2s_gpu = create_same_stride((m, D), X2, dtype, ddev)
        vs_gpu = create_same_stride((m, T), v, dtype, ddev)

        for i in range(0, N, n):
            nb = min(n, N - i)
            cur_X1s_gpu = copy_to_device_noorder(nb, D, X1, i, 0, X1s_gpu, 0,
                                                 0)
            sq1 = torch.norm(cur_X1s_gpu, p=2, dim=1, keepdim=True).pow_(2)
            cur_out_gpu = out_gpu.narrow(0, 0, nb)  # n x T
            cur_out_gpu.fill_(0.0)

            for j in range(0, M, m):
                mb = min(m, M - j)
                cur_X2s_gpu = copy_to_device_noorder(mb, D, X2, j, 0, X2s_gpu,
                                                     0, 0)
                cur_vs_gpu = copy_to_device_noorder(mb, T, v, j, 0, vs_gpu, 0,
                                                    0)  # m x T
                cur_nm_gpu = nm_gpu[:nb, :mb]  # n x m

                sq2 = torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True).pow_(2)
                torch.mm(cur_X1s_gpu, cur_X2s_gpu.T, out=cur_nm_gpu)

                cur_nm_gpu.mul_(-2.0)
                cur_nm_gpu.add_(sq1)
                cur_nm_gpu.add_(sq2.T)
                cur_nm_gpu.clamp_min_(0)
                kernel._transform(cur_nm_gpu)

                # Multiply by the vector v
                # FIXME: This is the cause of mapping errors in case of float32 calculations.
                cur_out_gpu.addmm_(cur_nm_gpu, cur_vs_gpu)  # n x T
            # send result to CPU
            copy_to_host_noorder(nb, T, out_gpu, 0, 0, out, i, 0)

    return out
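
The inner loop assembles squared Euclidean distances through the identity ||x - y||^2 = ||x||^2 + ||y||^2 - 2*x.y before the kernel transform is applied. A dense sketch of that expansion (shapes invented for illustration):

import torch

X1 = torch.randn(7, 3)
X2 = torch.randn(4, 3)

sq1 = X1.pow(2).sum(dim=1, keepdim=True)           # n x 1
sq2 = X2.pow(2).sum(dim=1, keepdim=True)           # m x 1
D2 = (sq1 + sq2.T - 2.0 * X1 @ X2.T).clamp_min(0)  # n x m squared distances

assert torch.allclose(D2, torch.cdist(X1, X2).pow(2), atol=1e-4)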
Example #23
def generic_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    N, D = X1.size()
    M = X2.size(0)
    if v is None:
        T = w.size(1)
    else:
        T = v.size(1)

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1d  : n x d
    # X2d  : M x d
    # Kv   : n x T
    # out2 : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where if we let avail_mem like it is
    #        for 32-bit data-types some copy fails. In such case we need
    #        to free up some more memory and then everything runs fine.
    if sizeof_dtype(dtype) == 4:
        avail_mem /= 2
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_d(maxD=D,
                             maxN=N,
                             coef_nd=1,
                             coef_n=M + T + 1,
                             coef_d=M,
                             rest=rest_coef + M,
                             tot=avail_mem)
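    # select_dim_over_d then picks n and d so that, roughly,
    # n*d + n*(M + T + 1) + d*M + rest <= avail_mem, matching the breakdown above.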

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Initialize GPU data
        ker_gpu = create_same_stride((n, M), out, dtype=dtype, device=ddev)
        X1s_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        w_gpu = create_same_stride((n, T), ker_gpu, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T

        for i in range(0, N, n):
            ic = min(n, N - i)
            ddd = kernel._prepare(X1.narrow(0, i, ic), X2)

            c_g_ker = ker_gpu.narrow(0, 0, ic)
            c_g_ker.fill_(0.0)
            for k in range(0, D, d):
                kc = min(d, D - k)
                c_g_X1s = copy_to_device_noorder(ic, kc, X1, i, k, X1s_gpu, 0,
                                                 0)
                c_g_X2s = copy_to_device_noorder(M, kc, X2, 0, k, X2s_gpu, 0,
                                                 0)
                kernel._apply(c_g_X1s, c_g_X2s.T, c_g_ker)
            kernel._finalize(c_g_ker, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)
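            # c_g_w now holds the w block (or zeros); below it accumulates
            # K_block @ v, and the final addmm_ adds K_block.T @ (K_block @ v + w)
            # into the (M x T) output.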
            if v is not None:
                c_g_w.addmm_(c_g_ker, v_gpu)
            out_gpu.addmm_(c_g_ker.T, c_g_w)

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
Example #24
File: fmmv_cuda.py Project: ymohit/falkon
def fdmmv_cuda(X1: torch.Tensor,
               X2: torch.Tensor,
               v: Optional[torch.Tensor],
               w: Optional[torch.Tensor],
               kernel,
               out: Optional[torch.Tensor] = None,
               opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T
    w  : N x T

    performs fnc(X1*X2', X1, X2)' * ( fnc(X1*X2', X1, X2) * v  +  w )  : M x T
    in blocks on multiple GPUs

    Assume all inputs have the same data type
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (w, 'w'), (out, 'out'))
    device = X1.device
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    if out is None:
        out = create_same_stride((M, T),
                                 X1,
                                 X1.dtype,
                                 device=device,
                                 pin_memory=device.type != 'cuda')

    gpu_info = _get_gpu_info(opt, slack=0.9)

    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fdmmv
    else:
        target = generic_fdmmv

    if device.type == 'cuda':
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFdmmv(X1=X1,
                         X2=X2,
                         v=v,
                         w=w,
                         out=out,
                         kernel=kernel,
                         max_mem=single_gpu_info.usable_ram)
        _call_direct(target, (args, device.index))
    else:
        block_sizes = calc_gpu_block_sizes(gpu_info, N)
        wrlk = []  # outputs for each subprocess.
        args = []
        for i, g in enumerate(gpu_info):
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue

            cur_out_gpu = create_same_stride((M, T), X1, X1.dtype,
                                             f'cuda:{gpu_info[i].Id}')  # M x T
            wrlk.append(cur_out_gpu)

            cur_w = None
            if w is not None:
                cur_w = w.narrow(0, block_sizes[i], bwidth)
            args.append((ArgsFdmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                   X2=X2,
                                   v=v,
                                   w=cur_w,
                                   out=cur_out_gpu,
                                   kernel=kernel,
                                   max_mem=g.usable_ram), g.Id))
        _start_wait_processes(target, args)
        if len(wrlk) > 1:
            # noinspection PyTypeChecker
            fastest_device: int = np.argmax([d.speed for d in gpu_info])
            out.copy_(
                tcd.comm.reduce_add(wrlk,
                                    destination=gpu_info[fastest_device].Id))
        else:
            out.copy_(wrlk[0])
    return out
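Stripped of blocking and multi-GPU dispatch, the operation described in the docstring reduces to the sketch below; `K` is assumed to be the dense (N x M) kernel matrix between X1 and X2, which falkon never materialises in full:

import torch
from typing import Optional

def fdmmv_reference(K: torch.Tensor,
                    v: Optional[torch.Tensor] = None,
                    w: Optional[torch.Tensor] = None) -> torch.Tensor:
    # out = K.T @ (K @ v + w), with a missing v or w treated as zero.
    T = (v if v is not None else w).shape[1]
    acc = torch.zeros(K.shape[0], T, dtype=K.dtype, device=K.device)
    if w is not None:
        acc = acc + w
    if v is not None:
        acc = acc + K @ v
    return K.T @ acc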
Example #25
def _generic_fmm(proc_idx, queue, device_id):
    a: ArgsFmm = queue.get()
    X1: torch.Tensor = a.X1
    X2: torch.Tensor = a.X2
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem

    ntot, dtot = X1.shape
    mtot = X2.shape[0]

    # This function is slightly faster if we limit the sizes of the processed
    # blocks a bit, especially on a cold run, since pinned-memory allocation is
    # extremely slow. We don't do this when memory-constrained though.
    if max_mem > 4 * 2**30:
        max_mem /= 4
    avail_mem = max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage:
    # - gOut    : n x m
    # - g_ssX1  : n x d
    # - g_sX2   : m x d
    # total : n*d + m*d + n*m
    n, d, m = select_dim_fMM(avail_mem, ntot, dtot, mtot)

    tc_device = torch.device('cuda:%d' % (int(device_id)))
    with torch.cuda.device(tc_device):
        # Initialize GPU buffers
        g_out = create_same_stride((n, m), out, gpu_dtype, tc_device)
        g_X1d = create_same_stride((n, d), X1, gpu_dtype, tc_device)
        g_X2d = create_same_stride((m, d), X2, gpu_dtype, tc_device)
        cpu_buf = None
        if X1.dtype != gpu_dtype:
            cpu_buf = create_same_stride((n, m),
                                         out,
                                         gpu_dtype,
                                         'cpu',
                                         pin_memory=True)

        for j in range(0, mtot, m):
            jc = min(m, mtot - j)
            X2_chunk = cast_tensor(X2.narrow(0, j, jc),
                                   dtype=gpu_dtype,
                                   warn=False).pin_memory()

            for i in range(0, ntot, n):
                ic = min(n, ntot - i)

                if _gpu_tns_same_memory(X1, X2) and j < i:
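                    # X1 and X2 share storage, so the kernel matrix is
                    # symmetric: reuse the transposed block computed earlier
                    # instead of recomputing it.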
                    out[i:i + ic, j:j + jc].copy_(out[j:j + jc, i:i + ic].T)
                else:
                    X1_chunk = cast_tensor(X1.narrow(0, i, ic),
                                           dtype=gpu_dtype,
                                           warn=False).pin_memory()

                    ddd = kernel._prepare(X1_chunk, X2_chunk)

                    cur_g_out = g_out.narrow(0, 0, ic).narrow(1, 0, jc)
                    cur_g_out.fill_(0.0)

                    for k in range(0, dtot, d):
                        kc = min(d, dtot - k)
                        # Move to GPU
                        cur_g_X1d = g_X1d.narrow(0, 0, ic).narrow(1, 0, kc)
                        cur_g_X1d.copy_(X1_chunk.narrow(1, k, kc))
                        cur_g_X2d = g_X2d.narrow(0, 0, jc).narrow(1, 0, kc)
                        cur_g_X2d.copy_(X2_chunk.narrow(1, k, kc))
                        # Apply
                        a.kernel._apply(cur_g_X1d, cur_g_X2d.T, cur_g_out)

                    a.kernel._finalize(cur_g_out, ddd)
                    copy_to_host_noorder(ic, jc, cur_g_out, 0, 0, out, i, j,
                                         cpu_buf)
                    del ddd
        del g_out, g_X1d, g_X2d
    return out
Example #26
def _sparse_fmm(proc_idx, queue, device_id):
    a: ArgsFmm = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem

    ntot, dtot = X1.shape
    mtot = X2.size(0)

    avail_mem = max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage:
    # X1_chunk : ntot + 2 * D * ntot * density
    # X2_chunk : dtot + 2 * D * mtot * density (because it is transposed)
    # sparse_out : ntot + 2 * ntot * mtot * density (assume density=1 here)
    # ker_gpu  : mtot * ntot
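    # (in CSR form each non-zero stores one index and one value, hence the
    # 2 * density factors; the extra ntot / dtot terms account for row pointers)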
    n, m = select_dim_over_nm_v2(max_n=ntot,
                                 max_m=mtot,
                                 coef_nm=3,
                                 coef_n=2 + 2 * dtot * X1.density,
                                 coef_m=2 * dtot * X2.density,
                                 rest=dtot,
                                 max_mem=avail_mem)

    tc_device = torch.device('cuda:%d' % (int(device_id)))
    with torch.cuda.device(tc_device):
        # Initialize GPU buffers
        g_out = create_same_stride((n, m), out, gpu_dtype, tc_device)
        cpu_buf = None
        if X1.dtype != gpu_dtype:
            cpu_buf = create_same_stride((n, m),
                                         out,
                                         gpu_dtype,
                                         'cpu',
                                         pin_memory=True)

        for j in range(0, mtot, m):
            jc = min(m, mtot - j)

            X2_chunk = X2.narrow_rows(j, jc).to(dtype=gpu_dtype)
            X2_chunk_d = SparseTensor.from_scipy(
                X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                .index_to_int() \
                .to(device=tc_device)
            for i in range(0, ntot, n):
                ic = min(n, ntot - i)

                X1_chunk = X1.narrow_rows(i, ic).to(dtype=gpu_dtype)
                X1_chunk_d = X1_chunk.index_to_int().to(device=tc_device)
                cur_g_out = g_out.narrow(0, 0, ic).narrow(1, 0, jc)
                cur_g_out.fill_(0.0)

                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
                cur_g_out = kernel._apply_sparse(X1_chunk_d, X2_chunk_d,
                                                 cur_g_out)
                cur_g_out = kernel._finalize(cur_g_out, ddd)
                copy_to_host_noorder(ic, jc, cur_g_out, 0, 0, out, i, j,
                                     cpu_buf)
                del ddd, X1_chunk_d, X1_chunk
            del X2_chunk, X2_chunk_d
        del g_out
    return out
Example #27
def sparse_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, w, out = a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    N, D = X1.shape
    M = X2.size(0)
    if v is None:
        T = w.size(1)
    else:
        T = v.size(1)

    # Memory needs:
    # X1_chunk : ntot + 2 * D * ntot * density
    # X2       : dtot + 2 * D * M * density (because it is transposed)
    # sparse_out : ntot + 2 * ntot * M * density (assume here density = 1)
    # ker_gpu  : M * ntot
    # w_gpu    : ntot * T
    # v_gpu    : M * T
    # out_gpu  : M * T
    avail_mem = max_mem / sizeof_dtype(dtype)
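    # Per the breakdown above, each row of the X1 block costs `den` elements and
    # the fixed buffers cost `sub`; n is then the largest row count with
    # n * den + sub <= avail_mem.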
    den = 2 * D * X1.density + 2 + 3 * M + T
    sub = D + 2 * D * M * X2.density + M * T
    if v is not None:
        sub += M * T
    n = (avail_mem - sub) / den
    n = min(int(n), N)
    if n < 1:
        raise MemoryError("Not enough memory to run sparse fdmmv")

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Initialize GPU data
        w_gpu = create_same_stride((n, T), out, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        ker_gpu = create_fortran((n, M), dtype, ddev)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T

        X2_d = SparseTensor.from_scipy(
            X2.transpose_csc().to_scipy().tocsr(copy=False)) \
            .index_to_int() \
            .to(device=ddev)

        for i in range(0, N, n):
            ic = min(n, N - i)
            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)

            ker_chunk = ker_gpu[:ic]
            ker_chunk.fill_(0.0)

            # TODO: This is wasteful (X2 will be prepared many times over)
            ddd = kernel._prepare_sparse(X1_chunk, X2)
            ker_chunk = kernel._apply_sparse(X1_chunk_d, X2_d, ker_chunk)
            ker_chunk = kernel._finalize(ker_chunk, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)

            if v is not None:
                c_g_w.addmm_(ker_chunk, v_gpu)
            out_gpu.addmm_(ker_chunk.T, c_g_w)
            del ddd, X1_chunk, X1_chunk_d

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
Example #28
def _generic_fmm(proc_idx, queue, device_id):
    a: ArgsFmm = queue.get()
    X1: torch.Tensor = a.X1
    X2: torch.Tensor = a.X2
    cuda_inputs = X1.is_cuda
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem
    change_dtype = gpu_dtype != X1.dtype

    ntot, dtot = X1.shape
    mtot = X2.shape[0]

    # This function is slightly faster if we limit the sizes of the processed
    # blocks a bit, especially on a cold run, since pinned-memory allocation is
    # extremely slow. We don't do this when memory-constrained though.
    if max_mem > 4 * 2**30:
        max_mem /= 4
    avail_mem = max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage:
    # - gOut    : n x m
    # - g_ssX1  : n x d
    # - g_sX2   : m x d
    # total : n*d + m*d + n*m
    if cuda_inputs and not change_dtype:
        # No allocation will be performed, so no need to split at all!
        n, d, m = ntot, dtot, mtot
    else:
        n, d, m = select_dim_fMM(avail_mem, ntot, dtot, mtot)

    tc_device = torch.device('cuda:%d' % (int(device_id)))
    s1 = torch.cuda.Stream(device=tc_device)
    with torch.cuda.device(tc_device), torch.cuda.stream(s1):
        # Initialize GPU buffers
        if not cuda_inputs or change_dtype:
            g_X1d = create_same_stride((n, d), X1, gpu_dtype, tc_device)
            g_X2d = create_same_stride((m, d), X2, gpu_dtype, tc_device)
            g_out = create_same_stride((n, m), out, gpu_dtype, tc_device)
        if not cuda_inputs:
            cpu_buf = None
            if change_dtype:
                cpu_buf = create_same_stride((n, m),
                                             out,
                                             gpu_dtype,
                                             'cpu',
                                             pin_memory=True)

        for j in range(0, mtot, m):
            jc = min(m, mtot - j)
            X2_chunk = X2.narrow(0, j, jc)

            for i in range(0, ntot, n):
                ic = min(n, ntot - i)

                if _gpu_tns_same_memory(X1, X2) and j < i:
                    out[i:i + ic, j:j + jc].copy_(out[j:j + jc, i:i + ic].T)
                    continue

                X1_chunk = X1.narrow(0, i, ic)
                ddd = kernel._prepare(X1_chunk, X2_chunk)
                if not cuda_inputs or change_dtype:
                    cur_g_out = g_out.narrow(0, 0, ic).narrow(1, 0, jc)
                else:
                    cur_g_out = out.narrow(0, i, ic).narrow(1, j, jc)
                cur_g_out.fill_(0.0)

                for k in range(0, dtot, d):
                    kc = min(d, dtot - k)
                    # Move to GPU and type-convert
                    if (not cuda_inputs) or change_dtype:
                        cur_g_X1d = g_X1d.narrow(0, 0, ic).narrow(1, 0, kc)
                        cur_g_X1d.copy_(X1_chunk.narrow(1, k, kc))
                        cur_g_X2d = g_X2d.narrow(0, 0, jc).narrow(1, 0, kc)
                        cur_g_X2d.copy_(X2_chunk.narrow(1, k, kc))
                    else:
                        cur_g_X1d = X1_chunk.narrow(1, k, kc)
                        cur_g_X2d = X2_chunk.narrow(1, k, kc)

                    # Apply
                    a.kernel._apply(cur_g_X1d, cur_g_X2d.T, cur_g_out)

                a.kernel._finalize(cur_g_out, ddd)
                if not cuda_inputs:
                    copy_to_host_noorder(ic, jc, cur_g_out, 0, 0, out, i, j,
                                         cpu_buf, s1)
                elif change_dtype:
                    out.narrow(0, i, ic).narrow(1, j, jc).copy_(cur_g_out)
                del ddd
    return out
Example #29
    def init(self,
             X: Union[torch.Tensor, SparseTensor],
             weight_vec: Optional[torch.Tensor] = None):
        """Initialize the preconditioner matrix.

        This method must be called before the preconditioner can be used.

        Parameters
        ----------
        X : torch.Tensor or SparseTensor
            The (M x D) matrix of Nystroem centers.
        weight_vec
            An optional vector of size (M x 1) used for reweighted least-squares.
            It should contain the weights corresponding to the Nystroem centers.
        """
        if X.is_cuda and not self._use_cuda:
            raise RuntimeError(
                "use_cuda is set to False, but data is CUDA tensor. "
                "Check your options.")
        if weight_vec is not None and not check_same_device(X, weight_vec):
            raise ValueError(f"Weights and data are not on the same device "
                             f"({weight_vec.device}, {X.device})")
        if weight_vec is not None and weight_vec.shape[0] != X.shape[0]:
            raise ValueError(
                f"Weights and Nystrom centers should have the same first dimension. "
                f"Found instead {weight_vec.shape[0]}, {X.shape[0]}.")
        dtype = X.dtype
        dev = X.device
        eps = self.params.pc_epsilon(X.dtype)
        M = X.size(0)

        with TicToc("Kernel", debug=self.params.debug):
            if isinstance(X, torch.Tensor):
                C = create_same_stride((M, M),
                                       X,
                                       dtype=dtype,
                                       device=dev,
                                       pin_memory=self._use_cuda)
            else:  # If sparse tensor we need fortran for kernel calculation
                C = create_fortran((M, M),
                                   dtype=dtype,
                                   device=dev,
                                   pin_memory=self._use_cuda)
            self.kernel(X, X, out=C, opt=self.params)
        if not is_f_contig(C):
            C = C.T

        with TicToc("Cholesky 1", debug=self.params.debug):
            # Compute T: lower(fC) = T.T
            inplace_add_diag_th(C, eps * M)
            C = potrf_wrapper(C,
                              clean=False,
                              upper=False,
                              use_cuda=self._use_cuda,
                              opt=self.params)
            # Save the diagonal which will be overwritten when computing A
            self.dT = C.diag()

        with TicToc("Copy triangular", debug=self.params.debug):
            # Copy lower(fC) to upper(fC):  upper(fC) = T.
            copy_triang(C, upper=False)

        # Weighted least-squares needs to weight the A matrix. We could weight it once before
        # LAUUM, but since CUDA-LAUUM touches both sides of C, weighting before LAUUM would also
        # modify the matrix T. Therefore, for CUDA inputs we weight twice, after LAUUM.
        if weight_vec is not None and not self._use_cuda:
            with TicToc("Weighting(CPU)", debug=self.params.debug):
                weight_vec.sqrt_()
                vec_mul_triang(C, weight_vec, side=1, upper=False)

        if self._use_cuda:
            with TicToc("LAUUM(CUDA)", debug=self.params.debug):
                # Product upper(fC) @ upper(fC).T, store in lower(fC) = T @ T.T
                C = lauum_wrapper(C,
                                  upper=True,
                                  use_cuda=self._use_cuda,
                                  opt=self.params)
        else:
            with TicToc("LAUUM(CPU)", debug=self.params.debug):
                # Product lower(fC).T @ lower(fC), store in lower(fC) = T @ T.T
                C = lauum_wrapper(C,
                                  upper=False,
                                  use_cuda=self._use_cuda,
                                  opt=self.params)

        if weight_vec is not None and self._use_cuda:
            with TicToc("Weighting(CUDA)", debug=self.params.debug):
                weight_vec.sqrt_()
                vec_mul_triang(C, weight_vec, side=0, upper=False)
                vec_mul_triang(C, weight_vec, side=1, upper=False)

        with TicToc("Cholesky 2", debug=self.params.debug):
            # lower(fC) = 1/M * T @ T.T
            mul_triang(C, upper=False, preserve_diag=False, multiplier=1 / M)
            # lower(fC) = 1/M * T @ T.T + lambda * I
            inplace_add_diag_th(C, self._lambda)
            # Cholesky on lower(fC) : lower(fC) = A.T
            C = potrf_wrapper(C,
                              clean=False,
                              upper=False,
                              use_cuda=self._use_cuda,
                              opt=self.params)
            self.dA = C.diag()

        self.fC = C
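        # Summary of the factorization above (K denotes self.kernel(X, X), the
        # M x M kernel matrix; the optional weight_vec re-weighting is ignored):
        #   K + eps * M * I              = T.T @ T    (T upper triangular, "Cholesky 1")
        #   (1/M) * T @ T.T + lambda * I = A.T @ A    (A upper triangular, "Cholesky 2")
        # Both factors are packed into the single M x M matrix self.fC, with
        # their diagonals saved separately in self.dT and self.dA.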