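These snippets are excerpted from the hessian-eigenthings modules (power iteration, the Hessian-vector-product operator, and the Lanczos wrapper). They assume roughly the following imports; the exact `utils` and `Operator` paths are an assumption about the package layout, not verified against a specific release:

from typing import Tuple
from warnings import warn

import numpy as np
import torch
from scipy.sparse.linalg import LinearOperator as ScipyLinearOperator
from scipy.sparse.linalg import eigsh

from hessian_eigenthings import utils            # progress_bar, maybe_fp16, ...
from hessian_eigenthings.operator import Operator  # assumed module path
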
def power_iteration(
    operator: Operator,
    steps: int = 20,
    error_threshold: float = 1e-4,
    momentum: float = 0.0,
    use_gpu: bool = True,
    fp16: bool = False,
    init_vec: torch.Tensor = None,
) -> Tuple[float, torch.Tensor]:
    """
    Compute dominant eigenvalue/eigenvector of a matrix
    operator: linear Operator giving us matrix-vector product access
    steps: number of update steps to take
    returns: (principal eigenvalue, principal eigenvector) pair
    """
    vector_size = operator.size  # input dimension of operator
    if init_vec is None:
        vec = torch.rand(vector_size)
    else:
        vec = init_vec

    vec = utils.maybe_fp16(vec, fp16)

    if use_gpu:
        vec = vec.cuda()

    prev_lambda = 0.0
    prev_vec = utils.maybe_fp16(torch.randn_like(vec), fp16)
    for i in range(steps):
        # keep a normalized copy of the current iterate, then apply the operator
        # and subtract the (optional) momentum term
        prev_vec = vec / (torch.norm(vec) + 1e-6)
        new_vec = utils.maybe_fp16(operator.apply(vec),
                                   fp16) - momentum * prev_vec
        # need to handle case where we end up in the nullspace of the operator.
        # in this case, we are done.
        if torch.norm(new_vec).item() == 0.0:
            return 0.0, new_vec
        lambda_estimate = vec.dot(new_vec).item()
        diff = lambda_estimate - prev_lambda
        vec = new_vec.detach() / torch.norm(new_vec)
        if lambda_estimate == 0.0:  # for low-rank
            error = 1.0
        else:
            error = np.abs(diff / lambda_estimate)
        utils.progress_bar(i, steps, "power iter error: %.4f" % error)
        if error < error_threshold:
            break
        prev_lambda = lambda_estimate
    return lambda_estimate, vec
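
A quick, illustrative usage sketch of power_iteration on an explicit matrix; the MatrixOperator wrapper below is a hypothetical stand-in (any object exposing `.size` and `.apply(vec)` works), not the library's Operator class:

import torch

class MatrixOperator:
    """Duck-typed operator wrapping an explicit symmetric matrix (illustrative)."""
    def __init__(self, mat):
        self.mat = mat
        self.size = mat.shape[0]

    def apply(self, vec):
        return self.mat @ vec

A = torch.diag(torch.tensor([3.0, 2.0, 1.0]))  # symmetric, eigenvalues 3, 2, 1
eigval, eigvec = power_iteration(MatrixOperator(A), steps=100, use_gpu=False)
# eigval should approach 3.0 and eigvec should align with [1, 0, 0] up to sign
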
Example #3
    def _prepare_grad(self):
        """
        Compute gradient w.r.t loss over all parameters and vectorize
        """
        try:
            all_inputs, all_targets = next(self.dataloader_iter)
        except StopIteration:
            self.dataloader_iter = iter(self.dataloader)
            all_inputs, all_targets = next(self.dataloader_iter)

        num_chunks = max(1, len(all_inputs) // self.max_samples)

        grad_vec = None

        input_chunks = all_inputs.chunk(num_chunks)
        target_chunks = all_targets.chunk(num_chunks)
        for input, target in zip(input_chunks, target_chunks):
            if self.use_gpu:
                input = input.cuda()
                target = target.cuda()

            output = self.model(input)
            loss = self.criterion(output, target)
            grad_dict = torch.autograd.grad(
                loss, self.model.parameters(), create_graph=True
            )
            if grad_vec is not None:
                grad_vec += torch.cat([g.contiguous().view(-1) for g in grad_dict])
            else:
                grad_vec = torch.cat([g.contiguous().view(-1) for g in grad_dict])
            grad_vec = utils.maybe_fp16(grad_vec, self.fp16)
        grad_vec /= num_chunks
        self.grad_vec = grad_vec
        return self.grad_vec
Example #4
    def _apply_batch(self, vec):
        # compute original gradient, tracking computation graph
        self.zero_grad()
        grad_vec = self._prepare_grad()
        self.zero_grad()
        # take the second gradient
        grad_grad = torch.autograd.grad(
            grad_vec, self.model.parameters(), grad_outputs=vec, only_inputs=True
        )
        # concatenate the results over the different components of the network
        hessian_vec_prod = torch.cat([g.contiguous().view(-1) for g in grad_grad])
        hessian_vec_prod = utils.maybe_fp16(hessian_vec_prod, self.fp16)
        return hessian_vec_prod
    def _apply_batch(self, vec: torch.Tensor) -> torch.Tensor:
        """
        Computes the Hessian-vector product for a mini-batch from the dataset.
        """
        # compute original gradient, tracking computation graph
        self._zero_grad()
        grad_vec = self._prepare_grad()
        self._zero_grad()
        # take the second gradient
        # this is the derivative of <grad_vec, v> where <,> is an inner product.
        hessian_vec_prod_dict = torch.autograd.grad(grad_vec,
                                                    self.model.parameters(),
                                                    grad_outputs=vec,
                                                    only_inputs=True)
        # concatenate the results over the different components of the network
        hessian_vec_prod = torch.cat(
            [g.contiguous().view(-1) for g in hessian_vec_prod_dict])
        hessian_vec_prod = utils.maybe_fp16(hessian_vec_prod, self.fp16)
        return hessian_vec_prod
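
The second torch.autograd.grad call above is the standard double-backprop trick: with grad_outputs=vec it differentiates the scalar <grad_vec, vec>, which yields the Hessian-vector product H @ vec without ever forming H. A self-contained sketch of the same trick on a toy model (all names here are illustrative, not the library's API):

import torch

model = torch.nn.Linear(4, 1)
criterion = torch.nn.MSELoss()
inputs, targets = torch.randn(8, 4), torch.randn(8, 1)

loss = criterion(model(inputs), targets)
params = list(model.parameters())
# first backward pass, keeping the graph so the gradient itself can be differentiated
grads = torch.autograd.grad(loss, params, create_graph=True)
flat_grad = torch.cat([g.contiguous().view(-1) for g in grads])

vec = torch.randn_like(flat_grad)  # direction to multiply the Hessian by
# second backward pass: gradient of <flat_grad, vec> w.r.t. the parameters
hvp = torch.autograd.grad(flat_grad, params, grad_outputs=vec)
flat_hvp = torch.cat([g.contiguous().view(-1) for g in hvp])  # H @ vec, flattened
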
    def _prepare_grad(self) -> torch.Tensor:
        """
        Compute gradient w.r.t loss over all parameters and vectorize
        """
        try:
            all_inputs, all_targets = next(self.dataloader_iter)
        except StopIteration:
            self.dataloader_iter = iter(self.dataloader)
            all_inputs, all_targets = next(self.dataloader_iter)

        num_chunks = max(1, len(all_inputs) // self.max_possible_gpu_samples)

        grad_vec = None

        # This will do the "gradient chunking trick" to create micro-batches
        # when the batch size is larger than what will fit in memory.
        # WARNING: this may interact poorly with batch normalization.

        input_microbatches = all_inputs.chunk(num_chunks)
        target_microbatches = all_targets.chunk(num_chunks)
        for input, target in zip(input_microbatches, target_microbatches):
            if self.use_gpu:
                input = input.cuda()
                target = target.cuda()

            output = self.model(input)
            loss = self.criterion(output, target)
            grad_dict = torch.autograd.grad(loss,
                                            self.model.parameters(),
                                            create_graph=True)
            if grad_vec is not None:
                grad_vec += torch.cat(
                    [g.contiguous().view(-1) for g in grad_dict])
            else:
                grad_vec = torch.cat(
                    [g.contiguous().view(-1) for g in grad_dict])
            grad_vec = utils.maybe_fp16(grad_vec, self.fp16)
        grad_vec /= num_chunks
        self.grad_vec = grad_vec
        return self.grad_vec
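
The micro-batching above is just Tensor.chunk along the batch dimension: the batch is split into num_chunks pieces, each piece's (mean) gradient is accumulated, and the sum is divided by num_chunks at the end. A tiny illustration of the splitting step, with made-up sizes:

import torch

all_inputs = torch.randn(12, 3)
max_possible_gpu_samples = 4               # pretend only 4 samples fit on the GPU at once
num_chunks = max(1, len(all_inputs) // max_possible_gpu_samples)
micro_batches = all_inputs.chunk(num_chunks)
print([m.shape[0] for m in micro_batches])  # [4, 4, 4]: three micro-batches of 4 samples
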
def lanczos(
    operator,
    num_eigenthings=10,
    which="LM",
    max_steps=20,
    tol=1e-6,
    num_lanczos_vectors=None,
    init_vec=None,
    use_gpu=False,
    fp16=False,
):
    """
    Use the scipy.sparse.linalg.eigsh hook into the ARPACK Lanczos algorithm
    to find the top k eigenvalues/eigenvectors.

    Parameters
    -------------
    operator: power_iter.Operator
        linear operator to solve.
    num_eigenthings : int
        number of eigenvalue/eigenvector pairs to compute
    which : str ['LM', 'SM', 'LA', 'SA']
        L, S = largest, smallest; M, A = in magnitude, algebraic.
        e.g. SM = smallest in magnitude, LA = largest algebraic.
    max_steps : int
        maximum number of Arnoldi update iterations
    tol : float
        relative accuracy of eigenvalues / stopping criterion
    num_lanczos_vectors : int
        number of Lanczos vectors to compute. if None, defaults to
        min(2 * num_eigenthings, size - 1)
    init_vec: [torch.Tensor, torch.cuda.Tensor]
        if None, use random tensor. this is the init vec for arnoldi updates.
    use_gpu: bool
        if true, use cuda tensors.
    fp16: bool
        if true, keep operator input/output in fp16 instead of fp32.

    Returns
    ----------------
    eigenvalues : np.ndarray
        array containing `num_eigenthings` eigenvalues of the operator
    eigenvectors : np.ndarray
        array containing `num_eigenthings` eigenvectors of the operator
    """
    if isinstance(operator.size, int):
        size = operator.size
    else:
        size = operator.size[0]
    shape = (size, size)

    if num_lanczos_vectors is None:
        num_lanczos_vectors = min(2 * num_eigenthings, size - 1)
    if num_lanczos_vectors < 2 * num_eigenthings:
        warn(
            "[lanczos] number of lanczos vectors should usually be > 2*num_eigenthings"
        )

    def _scipy_apply(x):
        x = torch.from_numpy(x)
        x = utils.maybe_fp16(x, fp16)
        if use_gpu:
            x = x.cuda()
        out = operator.apply(x)
        out = utils.maybe_fp16(out, fp16)
        out = out.cpu().numpy()
        return out

    scipy_op = ScipyLinearOperator(shape, _scipy_apply)
    if init_vec is None:
        init_vec = np.random.rand(size)
    elif isinstance(init_vec, torch.Tensor):
        init_vec = init_vec.cpu().numpy()

    init_vec = utils.maybe_fp16(init_vec, fp16)
    eigenvals, eigenvecs = eigsh(
        A=scipy_op,
        k=num_eigenthings,
        which=which,
        maxiter=max_steps,
        tol=tol,
        ncv=num_lanczos_vectors,
        v0=init_vec,
        return_eigenvectors=True,
    )
    return eigenvals, eigenvecs.T
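
For reference, this is the underlying SciPy machinery that lanczos wraps (a LinearOperator built from a matrix-vector product, handed to eigsh), applied directly to a small diagonal operator. This is a standalone sketch, not a call into the library:

import numpy as np
from scipy.sparse.linalg import LinearOperator, eigsh

n = 20
diag = np.arange(1.0, n + 1.0)          # operator with eigenvalues 1..20

def matvec(x):
    return diag * x                     # apply the diagonal operator to x

op = LinearOperator((n, n), matvec=matvec)
vals, vecs = eigsh(op, k=5, which="LM", ncv=10, tol=1e-6, return_eigenvectors=True)
# vals holds the five largest eigenvalues (16..20); vecs has shape (20, 5)
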
    # deflation closure: subtract the already-found eigenpair's component
    # (val * <vec, x> * vec) from the operator's matrix-vector product
    def _new_op_fn(x, op=current_op, val=eigenval, vec=eigenvec):
        return utils.maybe_fp16(op.apply(x), fp16) - _deflate(x, val, vec)
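
For context, this closure is the deflation step used when extracting several eigenpairs by repeated power iteration: after each dominant pair (val, vec) is found, the operator is wrapped so that val * <vec, x> * vec is removed from every subsequent matrix-vector product. A minimal sketch of that loop, assuming the power_iteration and utils names from the snippets above; the function name and the _WrappedOperator class are illustrative assumptions, not the library's API:

def deflated_power_iteration_sketch(operator, num_eigenthings=10, fp16=False, **kwargs):
    eigenvals, eigenvecs = [], []
    current_op = operator

    def _deflate(x, val, vec):
        # remove the found eigen-component: val * <vec, x> * vec
        return val * vec.dot(x) * vec

    class _WrappedOperator:
        # hypothetical duck-typed stand-in exposing .apply and .size
        def __init__(self, fn, size):
            self.apply = fn
            self.size = size

    for _ in range(num_eigenthings):
        eigenval, eigenvec = power_iteration(current_op, fp16=fp16, **kwargs)

        # bind the current values as defaults so the closure is unaffected
        # by later iterations rebinding these names
        def _new_op_fn(x, op=current_op, val=eigenval, vec=eigenvec):
            return utils.maybe_fp16(op.apply(x), fp16) - _deflate(x, val, vec)

        current_op = _WrappedOperator(_new_op_fn, operator.size)
        eigenvals.append(eigenval)
        eigenvecs.append(eigenvec)
    return eigenvals, eigenvecs
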