Example #1
    def solve(self, X0, B, mmv, max_iter, callback=None):
        t_start = time.time()

        if X0 is None:
            R = copy_same_stride(B)
            X = create_same_stride(B.size(), B, B.dtype, B.device)
            X.fill_(0.0)
        else:
            R = B - mmv(X0)
            X = X0

        m_eps = self.params.cg_epsilon(X.dtype)

        P = R
        # noinspection PyArgumentList
        Rsold = torch.sum(R.pow(2), dim=0)

        e_train = time.time() - t_start

        for i in range(max_iter):
            with TicToc("Chol Iter", debug=False):  # TODO: FIXME
                t_start = time.time()
                AP = mmv(P)
                # noinspection PyArgumentList
                alpha = Rsold / (torch.sum(P * AP, dim=0) + m_eps)
                X.addmm_(P, torch.diag(alpha))

                if (i + 1) % self.params.cg_full_gradient_every == 0:
                    if X.is_cuda:
                        # addmm_ may not be finished yet, causing mmv to get stale inputs.
                        torch.cuda.synchronize()
                    R = B - mmv(X)
                else:
                    R = R - torch.mm(AP, torch.diag(alpha))
                    # R.addmm_(mat1=AP, mat2=torch.diag(alpha), alpha=-1.0)

                # noinspection PyArgumentList
                Rsnew = torch.sum(R.pow(2), dim=0)
                if Rsnew.abs().max().sqrt() < self.params.cg_tolerance:
                    print("Stopping conjugate gradient descent at "
                          "iteration %d. Solution has converged." % (i + 1))
                    break

                P = R + torch.mm(P, torch.diag(Rsnew / (Rsold + m_eps)))
                if P.is_cuda:
                    # P must be synced so that it's correct for mmv in next iter.
                    torch.cuda.synchronize()
                Rsold = Rsnew

                e_iter = time.time() - t_start
                e_train += e_iter
            with TicToc("Chol callback", debug=False):
                if callback is not None:
                    callback(i + 1, X, e_train)

        return X
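
A minimal usage sketch (not part of the examples above) of the solve() method shown in Example #1, assuming it belongs to the ConjugateGradient optimizer that Example #12 constructs as ConjugateGradient(opt=...). The matrix A, the right-hand side B and the lambda passed as mmv are made up for illustration; any symmetric positive-definite operator works.

import torch
from falkon import FalkonOptions
from falkon.optim import ConjugateGradient

torch.manual_seed(0)
A = torch.randn(100, 100, dtype=torch.float64)
A = A @ A.T + 1e-3 * torch.eye(100, dtype=torch.float64)   # SPD system matrix
B = torch.randn(100, 3, dtype=torch.float64)                # three right-hand sides

cg = ConjugateGradient(opt=FalkonOptions())
X = cg.solve(X0=None, B=B, mmv=lambda v: A @ v, max_iter=200)
print(torch.norm(A @ X - B))  # residual should be close to zero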
Example #2
    def solve(self,
              X,
              M,
              Y,
              _lambda,
              initial_solution,
              max_iter,
              callback=None):
        n = X.size(0)
        if M is None:
            Knm = X
        else:
            Knm = None

        cuda_inputs: bool = X.is_cuda
        device = X.device

        stream = None
        if cuda_inputs:
            stream = get_non_default_stream(device)

        # Note that if we don't have CUDA this still works with stream=None.
        with torch.cuda.stream(stream):
            with TicToc("ConjGrad preparation", False):
                y_over_n = Y / n  # Cannot be in-place since Y needs to be preserved

                if self.is_weighted:
                    y_weights = self.weight_fn(Y)
                    y_over_n.mul_(
                        y_weights
                    )  # This can be in-place since we own y_over_n

                # Compute the right hand side
                if Knm is not None:
                    B = incore_fmmv(Knm,
                                    y_over_n,
                                    None,
                                    transpose=True,
                                    opt=self.params)
                else:
                    B = self.kernel.dmmv(X, M, None, y_over_n, opt=self.params)
                B = self.preconditioner.apply_t(B)

                if self.is_weighted:
                    mmv = functools.partial(self.weighted_falkon_mmv,
                                            penalty=_lambda,
                                            X=X,
                                            M=M,
                                            Knm=Knm,
                                            y_weights=y_weights)
                else:
                    mmv = functools.partial(self.falkon_mmv,
                                            penalty=_lambda,
                                            X=X,
                                            M=M,
                                            Knm=Knm)
            # Run the conjugate gradient solver
            beta = self.optimizer.solve(initial_solution, B, mmv, max_iter,
                                        callback)
        return beta
Example #3
    def solve(self, X0, B, mmv, max_iter, callback=None):
        t_start = time.time()

        if X0 is None:
            R = copy_same_stride(B)
            X = create_same_stride(B.size(), B, B.dtype, B.device)
            X.fill_(0.0)
        else:
            R = B - mmv(X0)
            X = X0

        m_eps = self.params.cg_epsilon(X.dtype)

        P = R
        Rsold = torch.sum(R.pow(2), dim=0)

        e_train = time.time() - t_start

        for i in range(max_iter):
            with TicToc("Chol Iter", debug=False):
                t_start = time.time()
                AP = mmv(P)
                alpha = Rsold / (torch.sum(P * AP, dim=0) + m_eps)
                X.addmm_(P, torch.diag(alpha))

                if (i + 1) % self.params.cg_full_gradient_every == 0:
                    R = B - mmv(X)
                else:
                    R = R - torch.mm(AP, torch.diag(alpha))
                    # R.addmm_(mat1=AP, mat2=torch.diag(alpha), alpha=-1.0)

                Rsnew = torch.sum(R.pow(2), dim=0)
                if Rsnew.abs().max().sqrt() < self.params.cg_tolerance:
                    print("Stopping conjugate gradient descent at "
                          "iteration %d. Solution has converged." % (i + 1))
                    break

                P = R + torch.mm(P, torch.diag(Rsnew / (Rsold + m_eps)))
                Rsold = Rsnew

                e_iter = time.time() - t_start
                e_train += e_iter
            with TicToc("Chol callback", debug=False):
                if callback is not None:
                    callback(i + 1, X, e_train)

        return X
Example #4
    def solve(self,
              X,
              M,
              Y,
              _lambda,
              initial_solution,
              max_iter,
              callback=None):
        n = X.size(0)
        prec = self.preconditioner

        with TicToc("ConjGrad preparation", False):
            if M is None:
                Knm = X
            else:
                Knm = None
            # Compute the right hand side
            if Knm is not None:
                B = incore_fmmv(Knm,
                                Y / n,
                                None,
                                transpose=True,
                                opt=self.params)
            else:
                B = self.kernel.dmmv(X, M, None, Y / n, opt=self.params)

            B = prec.apply_t(B)

            # Define the Matrix-vector product iteration
            if X.is_cuda:
                s1 = torch.cuda.Stream(X.device)

            def mmv(sol):
                with TicToc("MMV", False):
                    v = prec.invA(sol)
                    v_t = prec.invT(v)

                    if Knm is not None:
                        cc = incore_fdmmv(Knm, v_t, None, opt=self.params)
                    else:
                        cc = self.kernel.dmmv(X, M, v_t, None, opt=self.params)

                    if X.is_cuda:
                        with torch.cuda.stream(s1), torch.cuda.device(
                                X.device):
                            # We must sync before calls to prec.inv* which use a different stream
                            cc_ = cc.div_(n)
                            v_ = v.mul_(_lambda)
                            s1.synchronize()
                            cc_ = prec.invTt(cc_).add_(v_)
                            s1.synchronize()
                            return prec.invAt(cc_)
                    else:
                        return prec.invAt(prec.invTt(cc / n) + _lambda * v)

        # Run the conjugate gradient solver
        beta = self.optimizer.solve(initial_solution, B, mmv, max_iter,
                                    callback)
        return beta
Example #5
def run_logistic_falkon(dset: Dataset, algorithm: Algorithm,
                        dtype: Optional[DataType], iter_list: List[int],
                        penalty_list: List[float], num_centers: int,
                        kernel_sigma: float, kernel: str, seed: int):
    import torch
    import falkon
    from falkon import kernels
    from falkon.models import logistic_falkon
    from falkon.gsc_losses import LogisticLoss
    from falkon.utils import TicToc
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Data types
    if dtype is None:
        dtype = DataType.float64
    # Arguments
    if kernel.lower() == 'gaussian':
        k = kernels.GaussianKernel(kernel_sigma)
    elif kernel.lower() == 'laplacian':
        k = kernels.LaplacianKernel(kernel_sigma)
    elif kernel.lower() == 'linear':
        k = kernels.LinearKernel(beta=1.0, sigma=kernel_sigma)
    else:
        raise ValueError("Kernel %s not understood for algorithm %s" %
                         (kernel, algorithm))
    opt = falkon.FalkonOptions(compute_arch_speed=False,
                               no_single_kernel=True,
                               pc_epsilon_32=1e-6,
                               pc_epsilon_64=1e-13,
                               debug=True)
    loss = LogisticLoss(kernel=k)
    flk = logistic_falkon.LogisticFalkon(kernel=k,
                                         loss=loss,
                                         penalty_list=penalty_list,
                                         iter_list=iter_list,
                                         M=num_centers,
                                         seed=seed,
                                         error_fn=None,
                                         error_every=1,
                                         options=opt)

    # Error metrics
    err_fns = get_err_fns(dset)
    # Load data
    load_fn = get_load_fn(dset)
    Xtr, Ytr, Xts, Yts, kwargs = load_fn(dtype=dtype.to_numpy_dtype(),
                                         as_torch=True)
    Xtr = Xtr.pin_memory()
    Ytr = Ytr.pin_memory()
    err_fns = [functools.partial(fn, **kwargs) for fn in err_fns]
    with TicToc("LOGISTIC FALKON ALGORITHM"):
        flk.error_fn = err_fns[0]
        print("Starting to train model %s on data %s" % (flk, dset),
              flush=True)
        flk.fit(Xtr, Ytr, Xts, Yts)
    test_model(flk, f"{algorithm} on {dset}", Xts, Yts, Xtr, Ytr, err_fns)
Example #6
    def init(self, X: Union[torch.Tensor, SparseTensor]):
        """Initialize the preconditioner matrix.

        This method must be called before the preconditioner can be used.

        Parameters
        ----------
        X : MxD tensor
            The matrix of Nystroem centers
        """
        dtype = X.dtype
        eps = self.params.pc_epsilon(X.dtype)

        M = X.size(0)

        with TicToc("Kernel", debug=self.params.debug):
            if isinstance(X, torch.Tensor):
                C = create_same_stride((M, M), X, dtype=dtype, device='cpu',
                                       pin_memory=self._use_cuda)
            else:  # If sparse tensor we need fortran for kernel calculation
                C = create_fortran((M, M), dtype=dtype, device='cpu', pin_memory=self._use_cuda)
            self.kernel(X, X, out=C, opt=self.params)
        self.fC = C.numpy()
        if not is_f_contig(C):
            self.fC = self.fC.T

        with TicToc("Cholesky 1", debug=self.params.debug):
            # Compute T: lower(fC) = T.T
            inplace_add_diag(self.fC, eps * M)
            self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                    use_cuda=self._use_cuda, opt=self.params)
            # Save the diagonal which will be overwritten when computing A
            self.dT = C.diag()

        with TicToc("Copy triangular", debug=self.params.debug):
            # Copy lower(fC) to upper(fC):  upper(fC) = T.
            copy_triang(self.fC, upper=False)

        if self._use_cuda:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC, upper=True, use_cuda=self._use_cuda, opt=self.params)
        else:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC, upper=False, use_cuda=self._use_cuda, opt=self.params)

        with TicToc("Cholesky 2", debug=self.params.debug):
            # lower(fC) = 1/M * T @ T.T
            self.fC = mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / M)
            # lower(fC) = 1/M * T @ T.T + lambda * I
            inplace_add_diag(self.fC, self._lambda)
            # Cholesky on lower(fC) : lower(fC) = A.T
            self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                    use_cuda=self._use_cuda, opt=self.params)
            self.dA = C.diag()
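
The in-place routines above (potrf_wrapper, lauum_wrapper, mul_triang) hide the underlying math a little. Below is a dense, out-of-place sketch of the same two factorizations for a tiny M using plain PyTorch; it is an illustration of the comments in the code, not the library's implementation, and K, lam and eps are made-up stand-ins.

import torch

M, lam, eps = 5, 1e-3, 1e-10
K = torch.randn(M, M, dtype=torch.float64)
K = K @ K.T / M + torch.eye(M, dtype=torch.float64)  # stand-in for kernel(X, X)

# "lower(fC) = T.T": T is upper-triangular with T.T @ T = K + eps*M*I
T = torch.linalg.cholesky(K + eps * M * torch.eye(M, dtype=torch.float64)).T
# "lower(fC) = A.T": A is upper-triangular with A.T @ A = (1/M) * T @ T.T + lambda*I
A = torch.linalg.cholesky(T @ T.T / M + lam * torch.eye(M, dtype=torch.float64)).T

dT, dA = T.diag(), A.diag()  # the diagonals that init() saves as self.dT and self.dA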
Example #7
    def falkon_mmv(self, sol, penalty, X, M, Knm):
        n = Knm.shape[0] if Knm is not None else X.shape[0]
        prec = self.preconditioner

        with TicToc("MMV", False):
            v = prec.invA(sol)
            v_t = prec.invT(v)

            if Knm is not None:
                cc = incore_fdmmv(Knm, v_t, None, opt=self.params)
            else:
                cc = self.kernel.dmmv(X, M, v_t, None, opt=self.params)

            # AT^-1 @ (TT^-1 @ (cc / n) + penalty * v)
            cc_ = cc.div_(n)
            v_ = v.mul_(penalty)
            cc_ = prec.invTt(cc_).add_(v_)
            out = prec.invAt(cc_)
            return out
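
In formulas (an assumption inferred from the comment above and from the factors T and A built by the preconditioner in Example #6, with prec.invA, prec.invT, prec.invTt and prec.invAt presumably solving against A, T, T^T and A^T), this function applies the preconditioned Falkon operator to a candidate solution beta:

    \mathrm{mmv}(\beta) \;=\; A^{-\top}\Big(\tfrac{1}{n}\, T^{-\top} K_{nm}^{\top} K_{nm}\, T^{-1} A^{-1}\beta \;+\; \lambda\, A^{-1}\beta\Big)

where lambda is the penalty argument and n is the number of training points.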
Example #8
            def mmv(sol):
                with TicToc("MMV", False):
                    v = prec.invA(sol)
                    v_t = prec.invT(v)
                    if Knm is not None:
                        cc = incore_fdmmv(Knm, v_t, None, opt=self.params)
                    else:
                        cc = self.kernel.dmmv(X, M, v_t, None, opt=self.params)

                    if X.is_cuda:
                        with torch.cuda.stream(s1):
                            cc_ = cc.div_(n)
                            v_ = v.mul_(_lambda)
                            s1.synchronize()
                            cc_ = prec.invTt(cc_).add_(v_)
                            s1.synchronize()
                            out = prec.invAt(cc_)
                            s1.synchronize()
                            return out
                    else:
                        return prec.invAt(prec.invTt(cc / n) + _lambda * v)
Example #9
            def mmv(sol):
                with TicToc("MMV", False):
                    v = prec.invA(sol)
                    v_t = prec.invT(v)

                    if Knm is not None:
                        cc = incore_fdmmv(Knm, v_t, None, opt=self.params)
                    else:
                        cc = self.kernel.dmmv(X, M, v_t, None, opt=self.params)

                    if X.is_cuda:
                        with torch.cuda.stream(s1), torch.cuda.device(
                                X.device):
                            # We must sync before calls to prec.inv* which use a different stream
                            cc_ = cc.div_(n)
                            v_ = v.mul_(_lambda)
                            s1.synchronize()
                            cc_ = prec.invTt(cc_).add_(v_)
                            s1.synchronize()
                            return prec.invAt(cc_)
                    else:
                        return prec.invAt(prec.invTt(cc / n) + _lambda * v)
Example #10
def run_falkon(dset: Dataset, algorithm: Algorithm, dtype: Optional[DataType],
               num_iter: int, num_centers: int, kernel_sigma: float,
               penalty: float, kernel: str, kfold: int, seed: int):
    import torch
    from falkon import kernels
    from falkon.models import falkon
    from falkon.utils import TicToc
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Data types
    if dtype is None:
        dtype = DataType.float64
    # Arguments
    if kernel.lower() == 'gaussian':
        k = kernels.GaussianKernel(kernel_sigma)
    elif kernel.lower() == 'laplacian':
        k = kernels.LaplacianKernel(kernel_sigma)
    elif kernel.lower() == 'linear':
        k = kernels.LinearKernel(beta=1.0, sigma=kernel_sigma)
    else:
        raise ValueError("Kernel %s not understood for algorithm %s" %
                         (kernel, algorithm))

    opt = falkon.FalkonOptions(compute_arch_speed=False,
                               no_single_kernel=True,
                               pc_epsilon_32=1e-6,
                               pc_epsilon_64=1e-13,
                               debug=True)
    flk = falkon.Falkon(kernel=k,
                        penalty=penalty,
                        M=num_centers,
                        maxiter=num_iter,
                        seed=seed,
                        error_fn=None,
                        error_every=1,
                        options=opt)

    # Error metrics
    err_fns = get_err_fns(dset)
    if kfold == 1:
        # Load data
        load_fn = get_load_fn(dset)
        Xtr, Ytr, Xts, Yts, kwargs = load_fn(dtype=dtype.to_numpy_dtype(),
                                             as_torch=True)
        Xtr = Xtr.pin_memory()
        Ytr = Ytr.pin_memory()
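        # Presumably this allocates and frees a small CUDA tensor just to force
        # CUDA context initialization before the timed training section below.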
        temp_test = torch.empty(3, 3).cuda()
        del temp_test
        err_fns = [functools.partial(fn, **kwargs) for fn in err_fns]
        with TicToc("FALKON ALGORITHM"):
            flk.error_fn = err_fns[0]
            print("Starting to train model %s on data %s" % (flk, dset),
                  flush=True)
            flk.fit(Xtr, Ytr, Xts, Yts)
        test_model(flk, f"{algorithm} on {dset}", Xts, Yts, Xtr, Ytr, err_fns)
    else:
        print("Will train model %s on data %s with %d-fold CV" %
              (flk, dset, kfold),
              flush=True)
        load_fn = get_cv_fn(dset)
        iteration = 0
        test_errs, train_errs = [], []

        for Xtr, Ytr, Xts, Yts, kwargs in load_fn(k=kfold,
                                                  dtype=dtype.to_numpy_dtype(),
                                                  as_torch=True):
            err_fns = [functools.partial(fn, **kwargs) for fn in err_fns]
            with TicToc("FALKON ALGORITHM (fold %d)" % (iteration)):
                flk.error_fn = err_fns[0]
                flk.fit(Xtr, Ytr, Xts, Yts)
            iteration += 1
            c_test_errs, c_train_errs = test_model(flk,
                                                   f"{algorithm} on {dset}",
                                                   Xts, Yts, Xtr, Ytr, err_fns)
            train_errs.append(c_train_errs)
            test_errs.append(c_test_errs)

        print("Full errors: Test %s - Train %s" % (test_errs, train_errs))
        print()
        print("%d-Fold Error Report" % (kfold))
        for err_fn_i in range(len(err_fns)):
            print("Final test errors: %.4f +- %.4f" %
                  (np.mean([e[err_fn_i] for e in test_errs]),
                   np.std([e[err_fn_i] for e in test_errs])))
            print("Final train errors: %.4f +- %.4f" %
                  (np.mean([e[err_fn_i] for e in train_errs]),
                   np.std([e[err_fn_i] for e in train_errs])))
            print()
Example #11
    def fit(self,
            X: torch.Tensor,
            Y: torch.Tensor,
            Xts: Optional[torch.Tensor] = None,
            Yts: Optional[torch.Tensor] = None):
        """Fits the Falkon KRR model.

        Parameters
        -----------
        X : torch.Tensor (2D)
            The tensor of training data, of shape [num_samples, num_dimensions].
            If X is in Fortran order (i.e. column-contiguous) then we can avoid
            an extra copy of the data.
        Y : torch.Tensor (1D or 2D)
            The tensor of training targets, of shape [num_samples, num_outputs].
            If X and Y represent a classification problem, Y can be encoded as a one-hot
            vector.
            If Y is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data.
        Xts : torch.Tensor (2D) or None
            Tensor of validation data, of shape [num_test_samples, num_dimensions].
            If validation data is provided and `error_fn` was specified when
            creating the model, they will be used to print the validation error
            during the optimization iterations.
            If Xts is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data.
        Yts : torch.Tensor (1D or 2D) or None
            Tensor of validation targets, of shape [num_test_samples, num_outputs].
            If validation data is provided and `error_fn` was specified when
            creating the model, they will be used to print the validation error
            during the optimization iterations.
            If Yts is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data.

        Returns
        --------
        model: Falkon
            The fitted model
        """
        if X.size(0) != Y.size(0):
            raise ValueError("X and Y must have the same number of "
                             "samples (found %d and %d)" %
                             (X.size(0), Y.size(0)))
        if Y.dim() == 1:
            Y = torch.unsqueeze(Y, 1)
        if Y.dim() != 2:
            raise ValueError("Y is expected 1D or 2D. Found %dD." % (Y.dim()))
        if not check_same_dtype(X, Y):
            raise TypeError("X and Y must have the same data-type.")

        dtype = X.dtype
        # Decide whether to use CUDA for preconditioning based on M
        _use_cuda_preconditioner = (
                self.use_cuda_ and
                (not self.options.cpu_preconditioner) and
                self.M >= get_min_cuda_preconditioner_size(dtype)
        )
        _use_cuda_mmv = (
                self.use_cuda_ and
                X.shape[0] * X.shape[1] * self.M / self.num_gpus >= get_min_cuda_mmv_size(dtype)
        )

        self.fit_times_ = []
        self.ny_points_ = None
        self.alpha_ = None

        t_s = time.time()
        ny_points = self.center_selection.select(X, None, self.M)
        if self.use_cuda_:
            ny_points = ny_points.pin_memory()

        with TicToc("Calculating Preconditioner of size %d" % (self.M), debug=self.options.debug):
            pc_opt: FalkonOptions = dataclasses.replace(self.options,
                                                        use_cpu=not _use_cuda_preconditioner)
            if pc_opt.debug:
                print("Preconditioner will run on %s" %
                      ("CPU" if pc_opt.use_cpu else ("%d GPUs" % self.num_gpus)))
            precond = falkon.preconditioner.FalkonPreconditioner(self.penalty, self.kernel, pc_opt)
            precond.init(ny_points)

        if _use_cuda_mmv:
            # Cache must be emptied to ensure enough memory is visible to the optimizer
            torch.cuda.empty_cache()
            X = X.pin_memory()

        # Decide whether it's worthwhile to pre-compute the K_NM kernel.
        # Given that a single kernel evaluation between two D-dimensional
        # vectors costs D, at each CG iteration we must perform N*M kernel
        # evaluations. Other than the kernel evaluations we must perform two
        # matrix-vector products 2*(N*M*T) and a bunch of triangular solves.
        #
        # So if we precompute we have 2*(N*M*T), otherwise we also have N*M*D,
        # but precomputing costs us N*M memory.
        # So the heuristic is the following:
        #  - If D is large (e.g. > 100) check if RAM is sufficient
        #  - If RAM is sufficient precompute
        #  - Otherwise do not precompute
        Knm = None
        if X.size(1) > 1200:
            necessary_ram = X.size(0) * ny_points.size(0) * sizeof_dtype(dtype)
            k_opt = dataclasses.replace(self.options, use_cpu=True)
            cpu_info = get_device_info(k_opt)
            available_ram = min(k_opt.max_cpu_mem, cpu_info[-1].free_memory) * 0.9
            del k_opt

            if available_ram > necessary_ram:
                if self.options.debug:
                    print("%d*%d Kernel matrix will be stored" %
                          (X.size(0), ny_points.size(0)))
                Knm = self.kernel(X, ny_points, opt=self.options)
                # TODO: Maybe we should do the same for Kts, but this complicates
                #       checks for fitting in memory
            elif self.options.debug:
                print(
                    "Cannot store full kernel matrix: not enough memory (have %.2fGB, need %.2fGB)" %
                    (available_ram / 2 ** 30, necessary_ram / 2 ** 30))
        self.fit_times_.append(time.time() - t_s)  # Preparation time

        # Here we define the callback function which will run at the end
        # of conjugate gradient iterations. This function computes and
        # displays the validation error.
        val_cback = None
        if self.error_fn is not None and self.error_every is not None:
            def val_cback(it, beta, train_time):
                self.fit_times_.append(self.fit_times_[0] + train_time)
                if it % self.error_every != 0:
                    print("Iteration %3d - Elapsed %.1fs" % (it, self.fit_times_[-1]), flush=True)
                    return
                err_str = "training" if Xts is None or Yts is None else "validation"
                alpha = precond.apply(beta)
                # Compute error: can be train or test;
                if Xts is not None and Yts is not None:
                    pred = self._predict(Xts, ny_points, alpha)
                    err = self.error_fn(Yts, pred)
                else:
                    pred = self._predict(X, ny_points, alpha)
                    err = self.error_fn(Y, pred)
                err_name = "error"
                if isinstance(err, tuple) and len(err) == 2:
                    err, err_name = err
                print("Iteration %3d - Elapsed %.1fs - %s %s: %.4f" %
                      (it, self.fit_times_[-1], err_str, err_name, err), flush=True)

        # Start with the falkon algorithm
        with TicToc('Computing Falkon iterations', debug=self.options.debug):
            o_opt: FalkonOptions = dataclasses.replace(self.options, use_cpu=not _use_cuda_mmv)
            if o_opt.debug:
                print("Optimizer will run on %s" %
                      ("CPU" if o_opt.use_cpu else ("%d GPUs" % self.num_gpus)), flush=True)
            optim = falkon.optim.FalkonConjugateGradient(self.kernel, precond, o_opt)
            if Knm is not None:
                beta = optim.solve(
                    Knm, None, Y, self.penalty, initial_solution=None,
                    max_iter=self.maxiter, callback=val_cback)
            else:
                beta = optim.solve(
                    X, ny_points, Y, self.penalty, initial_solution=None,
                    max_iter=self.maxiter, callback=val_cback)

            self.alpha_ = precond.apply(beta)
            self.ny_points_ = ny_points
        return self
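
For context, a minimal end-to-end sketch of how the estimator whose fit() is shown above is typically driven. The data and the option values are placeholders; Falkon, FalkonOptions and GaussianKernel are the classes used throughout these examples.

import torch
import falkon
from falkon import kernels

X = torch.randn(1000, 10, dtype=torch.float64)
Y = (X.sum(dim=1, keepdim=True) > 0).to(X.dtype)

opt = falkon.FalkonOptions(use_cpu=True, debug=False)
model = falkon.Falkon(kernel=kernels.GaussianKernel(3.0), penalty=1e-6,
                      M=200, maxiter=10, options=opt)
model.fit(X, Y)
preds = model.predict(X)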
Example #12
    def fit(self,
            X: torch.Tensor,
            Y: torch.Tensor,
            Xts: Optional[torch.Tensor] = None,
            Yts: Optional[torch.Tensor] = None):
        """Fits the Falkon Kernel Logistic Regression model.

        Parameters
        -----------
        X : torch.Tensor (2D)
            The tensor of training data, of shape [num_samples, num_dimensions].
            If X is in Fortran order (i.e. column-contiguous) then we can avoid
            an extra copy of the data.
        Y : torch.Tensor (1D or 2D)
            The tensor of training targets, of shape [num_samples, num_outputs].
            If X and Y represent a classification problem, Y can be encoded as a one-hot
            vector.
            If Y is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data.
        Xts : torch.Tensor (2D) or None
            Tensor of validation data, of shape [num_test_samples, num_dimensions].
            If validation data is provided and `error_fn` was specified when
            creating the model, they will be used to print the validation error
            during the optimization iterations.
            If Xts is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data.
        Yts : torch.Tensor (1D or 2D) or None
            Tensor of validation targets, of shape [num_test_samples, num_outputs].
            If validation data is provided and `error_fn` was specified when
            creating the model, they will be used to print the validation error
            during the optimization iterations.
            If Yts is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data.

        Returns
        --------
        model: LogisticFalkon
            The fitted model
        """
        X, Y, Xts, Yts = self._check_fit_inputs(X, Y, Xts, Yts)

        dtype = X.dtype
        self.fit_times_ = []

        t_s = time.time()
        ny_X, ny_Y = self.center_selection.select(X, Y, self.M)
        if self.use_cuda_:
            ny_X = ny_X.pin_memory()

        # beta is the temporary iterative solution
        beta = torch.zeros(ny_X.shape[0], 1, dtype=dtype)
        optim = ConjugateGradient(opt=self.options)
        validation_cback = None
        precond = None
        if self.error_fn is not None and self.error_every is not None:

            def validation_cback(iteration, x, pc, train_time):
                self.fit_times_.append(train_time)
                if iteration % self.error_every != 0:
                    print("Iteration %3d - Elapsed %.1fs" %
                          (iteration, self.fit_times_[-1]),
                          flush=True)
                    return
                err_str = "training" if Xts is None or Yts is None else "validation"
                coeff = pc.invT(x)
                # Compute error: can be train or test;
                if Xts is not None and Yts is not None:
                    pred = self._predict(Xts, ny_X, coeff)
                    err = self.error_fn(Yts, pred)
                    loss = torch.mean(self.loss(Yts, pred)).item()
                else:
                    pred = self._predict(X, ny_X, coeff)
                    err = self.error_fn(Y, pred)
                    loss = torch.mean(self.loss(Y, pred)).item()
                err_name = "error"
                if isinstance(err, tuple) and len(err) == 2:
                    err, err_name = err
                print(
                    f"Iteration {iteration:3d} - Elapsed {self.fit_times_[-1]:.2f}s - "
                    f"{err_str} loss {loss:.4f} - "
                    f"{err_str} {err_name} {err:.4f} ",
                    flush=True)

        t_elapsed = 0.0
        for it, penalty in enumerate(self.penalty_list):
            max_iter = self.iter_list[it]
            print("Iteration %d - penalty %e - sub-iterations %d" %
                  (it, penalty, max_iter),
                  flush=True)

            with TicToc("Preconditioner", self.options.debug):
                if precond is None:
                    precond = falkon.preconditioner.LogisticPreconditioner(
                        self.kernel, self.loss, self.options)
                precond.init(ny_X, ny_Y, beta, penalty, X.shape[0])
            if self.use_cuda_:
                torch.cuda.empty_cache()

            with TicToc("Gradient", self.options.debug):
                # Gradient computation
                knmp_grad, inner_mmv = self.loss.knmp_grad(X,
                                                           ny_X,
                                                           Y,
                                                           precond.invT(beta),
                                                           opt=self.options)
                grad_p = precond.invAt(
                    precond.invTt(knmp_grad).add_(penalty * beta))

            with TicToc("Optim", self.options.debug):
                # MMV operation for CG
                def mmv(sol):
                    sol_a = precond.invA(sol)
                    knmp_hess = self.loss.knmp_hess(X,
                                                    ny_X,
                                                    Y,
                                                    inner_mmv,
                                                    precond.invT(sol_a),
                                                    opt=self.options)
                    return precond.invAt(
                        precond.invTt(knmp_hess).add_(sol_a.mul_(penalty)))

                optim_out = optim.solve(X0=None,
                                        B=grad_p,
                                        mmv=mmv,
                                        max_iter=max_iter,
                                        callback=None)
                beta -= precond.invA(optim_out)

            t_elapsed += time.time() - t_s
            if validation_cback is not None:
                validation_cback(it, beta, precond, train_time=t_elapsed)
            t_s = time.time()
        t_elapsed += time.time() - t_s

        if validation_cback is not None:
            validation_cback(len(self.penalty_list),
                             beta,
                             precond,
                             train_time=t_elapsed)
        self.alpha_ = precond.invT(beta)
        self.ny_points_ = ny_X
        return self
Example #13
    def fit(self,
            X: torch.Tensor,
            Y: torch.Tensor,
            Xts: Optional[torch.Tensor] = None,
            Yts: Optional[torch.Tensor] = None):
        """Fits the Falkon KRR model.

        Parameters
        -----------
        X : torch.Tensor
            The tensor of training data, of shape [num_samples, num_dimensions].
            If X is in Fortran order (i.e. column-contiguous) then we can avoid
            an extra copy of the data.
        Y : torch.Tensor
            The tensor of training targets, of shape [num_samples, num_outputs].
            If X and Y represent a classification problem, Y can be encoded as a one-hot
            vector.
            If Y is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data.
        Xts : torch.Tensor or None
            Tensor of validation data, of shape [num_test_samples, num_dimensions].
            If validation data is provided and `error_fn` was specified when
            creating the model, they will be used to print the validation error
            during the optimization iterations.
            If Xts is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data.
        Yts : torch.Tensor or None
            Tensor of validation targets, of shape [num_test_samples, num_outputs].
            If validation data is provided and `error_fn` was specified when
            creating the model, they will be used to print the validation error
            during the optimization iterations.
            If Yts is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data.

        Returns
        --------
        model: Falkon
            The fitted model
        """
        X, Y, Xts, Yts = self._check_fit_inputs(X, Y, Xts, Yts)
        dtype = X.dtype
        self.fit_times_ = []
        self.ny_points_ = None
        self.alpha_ = None

        # Start training timer
        t_s = time.time()

        # Pick Nystrom centers
        if self.weight_fn is not None:
            # noinspection PyTupleAssignmentBalance
            ny_points, ny_indices = self.center_selection.select_indices(
                X, None)
        else:
            # noinspection PyTypeChecker
            ny_points: Union[
                torch.Tensor,
                falkon.sparse.SparseTensor] = self.center_selection.select(
                    X, None)
            ny_indices = None
        num_centers = ny_points.shape[0]

        # Decide whether to use CUDA for preconditioning and iterations, based on number of centers
        _use_cuda_preconditioner = (
            self.use_cuda_ and (not self.options.cpu_preconditioner)
            and num_centers >= get_min_cuda_preconditioner_size(
                dtype, self.options))
        _use_cuda_mmv = (self.use_cuda_ and
                         X.shape[0] * X.shape[1] * num_centers / self.num_gpus
                         >= get_min_cuda_mmv_size(dtype, self.options))

        if self.use_cuda_:
            ny_points = ny_points.pin_memory()

        with TicToc("Calculating Preconditioner of size %d" % (num_centers),
                    debug=self.options.debug):
            pc_opt: FalkonOptions = dataclasses.replace(
                self.options, use_cpu=not _use_cuda_preconditioner)
            if pc_opt.debug:
                print("Preconditioner will run on %s" %
                      ("CPU" if pc_opt.use_cpu else
                       ("%d GPUs" % self.num_gpus)))
            precond = falkon.preconditioner.FalkonPreconditioner(
                self.penalty, self.kernel, pc_opt)
            ny_weight_vec = None
            if self.weight_fn is not None:
                ny_weight_vec = self.weight_fn(Y[ny_indices])
            precond.init(ny_points, weight_vec=ny_weight_vec)

        if _use_cuda_mmv:
            # Cache must be emptied to ensure enough memory is visible to the optimizer
            torch.cuda.empty_cache()
            X = X.pin_memory()

        # K_NM storage decision
        k_opt = dataclasses.replace(self.options, use_cpu=True)
        cpu_info = get_device_info(k_opt)
        available_ram = min(k_opt.max_cpu_mem, cpu_info[-1].free_memory) * 0.9
        if self._can_store_knm(X, ny_points, available_ram):
            Knm = self.kernel(X, ny_points, opt=self.options)
        else:
            Knm = None
        self.fit_times_.append(time.time() - t_s)  # Preparation time

        # Here we define the callback function which will run at the end
        # of conjugate gradient iterations. This function computes and
        # displays the validation error.
        validation_cback = None
        if self.error_fn is not None and self.error_every is not None:
            validation_cback = self._get_callback_fn(X, Y, Xts, Yts, ny_points,
                                                     precond)

        # Start with the falkon algorithm
        with TicToc('Computing Falkon iterations', debug=self.options.debug):
            o_opt: FalkonOptions = dataclasses.replace(
                self.options, use_cpu=not _use_cuda_mmv)
            if o_opt.debug:
                print("Optimizer will run on %s" %
                      ("CPU" if o_opt.use_cpu else
                       ("%d GPUs" % self.num_gpus)),
                      flush=True)
            optim = falkon.optim.FalkonConjugateGradient(
                self.kernel, precond, o_opt, weight_fn=self.weight_fn)
            if Knm is not None:
                beta = optim.solve(Knm,
                                   None,
                                   Y,
                                   self.penalty,
                                   initial_solution=None,
                                   max_iter=self.maxiter,
                                   callback=validation_cback)
            else:
                beta = optim.solve(X,
                                   ny_points,
                                   Y,
                                   self.penalty,
                                   initial_solution=None,
                                   max_iter=self.maxiter,
                                   callback=validation_cback)

            self.alpha_ = precond.apply(beta)
            self.ny_points_ = ny_points
        return self
Example #14
    def init(self,
             X: Union[torch.Tensor, SparseTensor],
             weight_vec: Optional[torch.Tensor] = None):
        """Initialize the preconditioner matrix.

        This method must be called before the preconditioner can be used.

        Parameters
        ----------
        X : torch.Tensor
            The (M x D) matrix of Nystroem centers
        weight_vec
            An optional vector of size (M x 1) which is used for reweighted least-squares.
            This vector should contain the weights corresponding to the Nystrom centers.
        """
        if X.is_cuda and not self._use_cuda:
            raise RuntimeError(
                "use_cuda is set to False, but data is CUDA tensor. "
                "Check your options.")
        if weight_vec is not None and not check_same_device(X, weight_vec):
            raise ValueError(f"Weights and data are not on the same device "
                             f"({weight_vec.device}, {X.device})")
        if weight_vec is not None and weight_vec.shape[0] != X.shape[0]:
            raise ValueError(
                f"Weights and Nystrom centers should have the same first dimension. "
                f"Found instead {weight_vec.shape[0]}, {X.shape[0]}.")
        dtype = X.dtype
        dev = X.device
        eps = self.params.pc_epsilon(X.dtype)
        M = X.size(0)

        with TicToc("Kernel", debug=self.params.debug):
            if isinstance(X, torch.Tensor):
                C = create_same_stride((M, M),
                                       X,
                                       dtype=dtype,
                                       device=dev,
                                       pin_memory=self._use_cuda)
            else:  # If sparse tensor we need fortran for kernel calculation
                C = create_fortran((M, M),
                                   dtype=dtype,
                                   device=dev,
                                   pin_memory=self._use_cuda)
            self.kernel(X, X, out=C, opt=self.params)
        if not is_f_contig(C):
            C = C.T

        with TicToc("Cholesky 1", debug=self.params.debug):
            # Compute T: lower(fC) = T.T
            inplace_add_diag_th(C, eps * M)
            C = potrf_wrapper(C,
                              clean=False,
                              upper=False,
                              use_cuda=self._use_cuda,
                              opt=self.params)
            # Save the diagonal which will be overwritten when computing A
            self.dT = C.diag()

        with TicToc("Copy triangular", debug=self.params.debug):
            # Copy lower(fC) to upper(fC):  upper(fC) = T.
            copy_triang(C, upper=False)

        # Weighted least-squares needs to weight the A matrix. We can weigh once before LAUUM,
        # but since CUDA-LAUUM touches both sides of C, weighting before LAUUM will also modify
        # the matrix T. Therefore for CUDA inputs we weigh twice after LAUUM!
        if weight_vec is not None and not self._use_cuda:
            with TicToc("Weighting(CPU)", debug=self.params.debug):
                weight_vec.sqrt_()
                vec_mul_triang(C, weight_vec, side=1, upper=False)

        if self._use_cuda:
            with TicToc("LAUUM(CUDA)", debug=self.params.debug):
                # Product upper(fC) @ upper(fC).T, store in lower(fC) = T @ T.T
                C = lauum_wrapper(C,
                                  upper=True,
                                  use_cuda=self._use_cuda,
                                  opt=self.params)
        else:
            with TicToc("LAUUM(CPU)", debug=self.params.debug):
                # Product lower(fC).T @ lower(fC), store in lower(fC) = T @ T.T
                C = lauum_wrapper(C,
                                  upper=False,
                                  use_cuda=self._use_cuda,
                                  opt=self.params)

        if weight_vec is not None and self._use_cuda:
            with TicToc("Weighting(CUDA)", debug=self.params.debug):
                weight_vec.sqrt_()
                vec_mul_triang(C, weight_vec, side=0, upper=False)
                vec_mul_triang(C, weight_vec, side=1, upper=False)

        with TicToc("Cholesky 2", debug=self.params.debug):
            # lower(fC) = 1/M * T @ T.T
            mul_triang(C, upper=False, preserve_diag=False, multiplier=1 / M)
            # lower(fC) = 1/M * T @ T.T + lambda * I
            inplace_add_diag_th(C, self._lambda)
            # Cholesky on lower(fC) : lower(fC) = A.T
            C = potrf_wrapper(C,
                              clean=False,
                              upper=False,
                              use_cuda=self._use_cuda,
                              opt=self.params)
            self.dA = C.diag()

        self.fC = C
Example #15
    def fit(self,
            X: torch.Tensor,
            Y: torch.Tensor,
            Xts: Optional[torch.Tensor] = None,
            Yts: Optional[torch.Tensor] = None):
        """Fits the Falkon KRR model.

        Parameters
        -----------
        X : torch.Tensor
            The tensor of training data, of shape [num_samples, num_dimensions].
            If X is in Fortran order (i.e. column-contiguous) then we can avoid
            an extra copy of the data. Must be a CUDA tensor.
        Y : torch.Tensor
            The tensor of training targets, of shape [num_samples, num_outputs].
            If X and Y represent a classification problem, Y can be encoded as a one-hot
            vector.
            If Y is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data. Must be a CUDA tensor.
        Xts : torch.Tensor or None
            Tensor of validation data, of shape [num_test_samples, num_dimensions].
            If validation data is provided and `error_fn` was specified when
            creating the model, they will be used to print the validation error
            during the optimization iterations.
            If Xts is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data. Must be a CUDA tensor.
        Yts : torch.Tensor or None
            Tensor of validation targets, of shape [num_test_samples, num_outputs].
            If validation data is provided and `error_fn` was specified when
            creating the model, they will be used to print the validation error
            during the optimization iterations.
            If Yts is in Fortran order (i.e. column-contiguous) then we can avoid an
            extra copy of the data. Must be a CUDA tensor.

        Returns
        --------
        model: InCoreFalkon
            The fitted model
        """
        # Fix a synchronization bug which occurs when re-using center selector.
        torch.cuda.synchronize()
        X, Y, Xts, Yts = self._check_fit_inputs(X, Y, Xts, Yts)

        self.fit_times_ = []
        self.ny_points_ = None
        self.alpha_ = None

        # Start training timer
        t_s = time.time()

        # Pick Nystrom centers
        if self.weight_fn is not None:
            # noinspection PyTupleAssignmentBalance
            ny_points, ny_indices = self.center_selection.select_indices(
                X, None)
        else:
            # noinspection PyTypeChecker
            ny_points: Union[
                torch.Tensor,
                falkon.sparse.SparseTensor] = self.center_selection.select(
                    X, None)
            ny_indices = None
        num_centers = ny_points.shape[0]

        pc_stream = torch.cuda.Stream(X.device)
        with TicToc("Calculating Preconditioner of size %d" % (num_centers),
                    debug=self.options.debug), torch.cuda.stream(pc_stream):
            precond = falkon.preconditioner.FalkonPreconditioner(
                self.penalty, self.kernel, self.options)
            ny_weight_vec = None
            if self.weight_fn is not None:
                ny_weight_vec = self.weight_fn(Y[ny_indices])
            precond.init(ny_points, weight_vec=ny_weight_vec)
        pc_stream.synchronize()

        # Cache must be emptied to ensure enough memory is visible to the optimizer
        torch.cuda.empty_cache()

        # K_NM storage decision
        gpu_info = get_device_info(self.options)[X.device.index]
        available_ram = min(self.options.max_gpu_mem,
                            gpu_info.free_memory) * 0.9
        if self._can_store_knm(X, ny_points, available_ram):
            Knm = self.kernel(X, ny_points, opt=self.options)
        else:
            Knm = None
        self.fit_times_.append(time.time() - t_s)  # Preparation time

        # Here we define the callback function which will run at the end
        # of conjugate gradient iterations. This function computes and
        # displays the validation error.
        validation_cback = None
        if self.error_fn is not None and self.error_every is not None:
            validation_cback = self._get_callback_fn(X, Y, Xts, Yts, ny_points,
                                                     precond)

        # Start with the falkon algorithm
        with TicToc('Computing Falkon iterations', debug=self.options.debug):
            optim = falkon.optim.FalkonConjugateGradient(
                self.kernel, precond, self.options, weight_fn=self.weight_fn)
            if Knm is not None:
                beta = optim.solve(Knm,
                                   None,
                                   Y,
                                   self.penalty,
                                   initial_solution=None,
                                   max_iter=self.maxiter,
                                   callback=validation_cback)
            else:
                beta = optim.solve(X,
                                   ny_points,
                                   Y,
                                   self.penalty,
                                   initial_solution=None,
                                   max_iter=self.maxiter,
                                   callback=validation_cback)

            self.alpha_ = precond.apply(beta)
            self.ny_points_ = ny_points
        return self
Example #16
    def fit(self,
            X: torch.Tensor,
            Y: torch.Tensor,
            Xts: Optional[torch.Tensor] = None,
            Yts: Optional[torch.Tensor] = None):
        if X.size(0) != Y.size(0):
            raise ValueError("X and Y must have the same number of "
                             "samples (found %d and %d)" %
                             (X.size(0), Y.size(0)))
        if Y.dim() == 1:
            Y = torch.unsqueeze(Y, 1)
        if Y.dim() != 2:
            raise ValueError("Y is expected 1D or 2D. Found %dD." % (Y.dim()))
        if not check_same_dtype(X, Y):
            raise TypeError("X and Y must have the same data-type.")

        dtype = X.dtype
        self.fit_times_ = []

        t_s = time.time()
        ny_X, ny_Y = self.center_selection.select(X, Y, self.M)
        if self.use_cuda_:
            ny_X = ny_X.pin_memory()

        # beta is the temporary iterative solution
        beta = torch.zeros(ny_X.shape[0], 1, dtype=dtype)
        optim = ConjugateGradient(opt=self.options)
        cback = None
        precond = None
        if self.error_fn is not None and self.error_every is not None:

            def cback(it, x, pc, train_time):
                self.fit_times_.append(train_time)
                if it % self.error_every != 0:
                    print("Iteration %3d - Elapsed %.1fs" %
                          (it, self.fit_times_[-1]),
                          flush=True)
                    return
                err_str = "training" if Xts is None or Yts is None else "validation"
                coeff = pc.invT(x)
                # Compute error: can be train or test;
                if Xts is not None and Yts is not None:
                    pred = self._predict(Xts, ny_X, coeff)
                    err = self.error_fn(Yts, pred)
                    loss = torch.mean(self.loss(Yts, pred)).item()
                else:
                    pred = self._predict(X, ny_X, coeff)
                    err = self.error_fn(Y, pred)
                    loss = torch.mean(self.loss(Y, pred)).item()
                err_name = "error"
                if isinstance(err, tuple) and len(err) == 2:
                    err, err_name = err
                print(
                    f"Iteration {it:3d} - Elapsed {self.fit_times_[-1]:.2f}s - "
                    f"{err_str} loss {loss:.4f} - "
                    f"{err_str} {err_name} {err:.4f} ",
                    flush=True)

        t_elapsed = 0.0
        for it, penalty in enumerate(self.penalty_list):
            max_iter = self.iter_list[it]
            print("Iteration %d - penalty %e - sub-iterations %d" %
                  (it, penalty, max_iter),
                  flush=True)

            with TicToc("Preconditioner", self.options.debug):
                if precond is None:
                    precond = falkon.preconditioner.LogisticPreconditioner(
                        self.kernel, self.loss, self.options)
                precond.init(ny_X, ny_Y, beta, penalty, X.shape[0])
            if self.use_cuda_:
                torch.cuda.empty_cache()

            with TicToc("Gradient", self.options.debug):
                # Gradient computation
                knmp_grad, inner_mmv = self.loss.knmp_grad(X,
                                                           ny_X,
                                                           Y,
                                                           precond.invT(beta),
                                                           opt=self.options)
                grad_p = precond.invAt(
                    precond.invTt(knmp_grad).add_(penalty * beta))

            # Callback
            def mmv(sol):
                sol_a = precond.invA(sol)
                knmp_hess = self.loss.knmp_hess(X,
                                                ny_X,
                                                Y,
                                                inner_mmv,
                                                precond.invT(sol_a),
                                                opt=self.options)
                return precond.invAt(
                    precond.invTt(knmp_hess).add_(sol_a.mul_(penalty)))

            with TicToc("Optim", self.options.debug):
                optim_out = optim.solve(X0=None,
                                        B=grad_p,
                                        mmv=mmv,
                                        max_iter=max_iter,
                                        callback=None)
                beta -= precond.invA(optim_out)

            t_elapsed += time.time() - t_s
            if cback is not None:
                cback(it, beta, precond, train_time=t_elapsed)
            t_s = time.time()
        t_elapsed += time.time() - t_s

        if cback is not None:
            cback(len(self.penalty_list), beta, precond, train_time=t_elapsed)
        self.alpha_ = precond.invT(beta)
        self.ny_points_ = ny_X
Example #17
    def init(self, X: Union[torch.Tensor, SparseTensor], Y: torch.Tensor,
             alpha: torch.Tensor, penalty: float, N: int) -> None:
        """Initialize the preconditioner matrix.

        This method must be called before the preconditioner becomes usable.

        Parameters
        ----------
        X : MxD tensor
            Matrix of Nystroem centers
        Y : Mx1 tensor
            Vector of targets corresponding to the Nystroem centers `X`
        alpha : Mx1 tensor
            Parameter vector (of the same dimension as `Y`) which gives the current
            solution to the optimization problem.
        penalty : float
            Regularization amount
        N : int
            Number of points in the full data-set.

        Notes
        -----
        If `debug=True` is present in the options, this method will print a lot of extra
        information pertaining to the timings of the various preconditioner operations. This can be
        useful to help understand how the preconditioner works.
        """
        if Y.shape[1] != 1:
            raise ValueError(
                "Logistic preconditioner can only deal with 1D outputs.")

        dtype = X.dtype
        M = X.size(0)

        eps = self.params.pc_epsilon(dtype)

        if self.fC is None:
            # This is done only at the first iteration of the logistic-falkon algorithm
            # It sets the `T` variable from the paper (chol(kMM)) to the upper part of `self.fC`
            with TicToc("Kernel", debug=self.params.debug):
                if isinstance(X, torch.Tensor):
                    C = create_same_stride((M, M),
                                           X,
                                           dtype=dtype,
                                           device='cpu',
                                           pin_memory=self._use_cuda)
                else:  # If sparse tensor we need fortran for kernel calculation
                    C = create_fortran((M, M),
                                       dtype=dtype,
                                       device='cpu',
                                       pin_memory=self._use_cuda)
                self.kernel(X, X, out=C, opt=self.params)
            self.fC = C.numpy()
            if not is_f_contig(C):
                self.fC = self.fC.T

            with TicToc("Add diag", debug=self.params.debug):
                # Compute T: lower(fC) = T.T
                inplace_add_diag(self.fC, eps * M)
            with TicToc("Cholesky 1", debug=self.params.debug):
                self.fC = potrf_wrapper(self.fC,
                                        clean=True,
                                        upper=False,
                                        use_cuda=self._use_cuda,
                                        opt=self.params)
                # Save the diagonal which will be overwritten when computing A
                self.dT = C.diag()
            with TicToc("Copy triangular", debug=self.params.debug):
                # Copy lower(fC) to upper(fC):  upper(fC) = T.
                copy_triang(self.fC, upper=False)
        else:
            if not self._use_cuda:
                # Copy non-necessary for cuda since LAUUM will do the copying
                with TicToc("Copy triangular", debug=self.params.debug):
                    # Copy upper(fC) to lower(fC): lower(fC) = T.T
                    copy_triang(self.fC,
                                upper=True)  # does not copy the diagonal
            # Setting diagonal necessary for trmm
            inplace_set_diag(self.fC, self.dT)

        # Compute W
        with TicToc("TRMM", debug=self.params.debug):
            # T is on upper(fC). Compute T.T @ alpha
            alpha = self._trmm(alpha.clone())
        with TicToc("W (ddf)", debug=self.params.debug):
            W = self.loss.ddf(Y, alpha)
        with TicToc("W-Multiply", debug=self.params.debug):
            W.sqrt_()
            self.fC = vec_mul_triang(self.fC,
                                     W.numpy().reshape(-1),
                                     side=0,
                                     upper=False)

        if self._use_cuda:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC,
                                        upper=True,
                                        use_cuda=self._use_cuda,
                                        opt=self.params)
        else:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC,
                                        upper=False,
                                        use_cuda=self._use_cuda,
                                        opt=self.params)

        # NOTE: Here the multiplier is 1/N instead of the more common 1/M!
        mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / N)

        with TicToc("Add diag", debug=self.params.debug):
            # lower(fC) = 1/N * T @ T.T + lambda * I
            inplace_add_diag(self.fC, penalty)

        with TicToc("Cholesky 2", debug=self.params.debug):
            # Cholesky on lower(fC) : lower(fC) = A.T
            self.fC = potrf_wrapper(self.fC,
                                    clean=False,
                                    upper=False,
                                    use_cuda=self._use_cuda,
                                    opt=self.params)
            self.dA = torch.from_numpy(self.fC).diag()
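
Summarizing the comments above in formulas (an assumption, since the code overwrites triangles of fC in place): with \ell'' = self.loss.ddf, \lambda the penalty argument and N the full data-set size, the logistic preconditioner appears to build

    T^\top T = K_{MM} + \varepsilon M I, \qquad W = \mathrm{diag}\big(\ell''(Y,\, T^\top\alpha)\big), \qquad A^\top A = \tfrac{1}{N}\, T\, W\, T^\top + \lambda I

and, as in the plain Falkon preconditioner of Example #6, keeps the triangular factors T and A (plus their diagonals dT and dA) for the later triangular solves performed by the invT/invA family of methods.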