示例#1
0
def _multi_tensor_adadelta(params: List[Tensor],
                           grads: List[Tensor],
                           square_avgs: List[Tensor],
                           acc_deltas: List[Tensor],
                           *,
                           lr: float,
                           weight_decay: float,
                           rho: float,
                           eps: float,
                           maximize: bool):

    if len(params) == 0:
        return

    if maximize:
        grads = torch._foreach_neg(grads)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(square_avgs, rho)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

    std = torch._foreach_add(square_avgs, eps)
    torch._foreach_sqrt_(std)

    deltas = torch._foreach_add(acc_deltas, eps)
    torch._foreach_sqrt_(deltas)
    torch._foreach_div_(deltas, std)
    torch._foreach_mul_(deltas, grads)

    torch._foreach_add_(params, deltas, alpha=-lr)

    torch._foreach_mul_(acc_deltas, rho)
    torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)
示例#2
0
def _multi_tensor_rmsprop(params: List[Tensor], grads: List[Tensor],
                          square_avgs: List[Tensor], grad_avgs: List[Tensor],
                          momentum_buffer_list: List[Tensor], *, lr: float,
                          alpha: float, eps: float, weight_decay: float,
                          momentum: float, centered: bool):

    if len(params) == 0:
        return

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(square_avgs, alpha)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - alpha)

    if centered:
        torch._foreach_mul_(grad_avgs, alpha)
        torch._foreach_add_(grad_avgs, grads, alpha=1 - alpha)
        avg = torch._foreach_addcmul(square_avgs,
                                     grad_avgs,
                                     grad_avgs,
                                     value=-1)
        torch._foreach_sqrt_(avg)
        torch._foreach_add_(avg, eps)
    else:
        avg = torch._foreach_sqrt(square_avgs)
        torch._foreach_add_(avg, eps)

    if momentum > 0:
        torch._foreach_mul_(momentum_buffer_list, momentum)
        torch._foreach_addcdiv_(momentum_buffer_list, grads, avg)
        torch._foreach_add_(params, momentum_buffer_list, alpha=-lr)
    else:
        torch._foreach_addcdiv_(params, grads, avg, value=-lr)
示例#3
0
def adadelta(params: List[Tensor], grads: List[Tensor],
             square_avgs: List[Tensor], acc_deltas: List[Tensor], *, lr: float,
             weight_decay: float, rho: float, eps: float):
    r"""Functional API that performs Adadelta algorithm computation.

    Updates ``params``, ``square_avgs`` and ``acc_deltas`` in place; the
    caller's ``grads`` tensors are never mutated.

    See :class:`~torch.optim.Adadelta` for details.
    """
    # Robustness: foreach kernels reject empty tensor lists.
    if len(params) == 0:
        return

    if weight_decay != 0:
        # BUG FIX: out-of-place add — the original in-place _foreach_add_
        # silently mutated the caller's ``.grad`` tensors.
        grads = torch._foreach_add(grads, params, alpha=weight_decay)

    # E[g^2] <- rho * E[g^2] + (1 - rho) * g^2
    torch._foreach_mul_(square_avgs, rho)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

    std = torch._foreach_add(square_avgs, eps)
    torch._foreach_sqrt_(std)

    # delta = sqrt(E[dx^2] + eps) / sqrt(E[g^2] + eps) * g
    deltas = torch._foreach_add(acc_deltas, eps)
    torch._foreach_sqrt_(deltas)
    torch._foreach_div_(deltas, std)
    torch._foreach_mul_(deltas, grads)

    torch._foreach_add_(params, deltas, alpha=-lr)

    # E[dx^2] <- rho * E[dx^2] + (1 - rho) * delta^2
    torch._foreach_mul_(acc_deltas, rho)
    torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)
示例#4
0
def _multi_tensor_rmsprop(params: List[Tensor], grads: List[Tensor],
                          square_avgs: List[Tensor], grad_avgs: List[Tensor],
                          momentum_buffer_list: List[Tensor], *, lr: float,
                          alpha: float, eps: float, weight_decay: float,
                          momentum: float, centered: bool, maximize: bool,
                          differentiable: bool):

    if len(params) == 0:
        return

    assert not differentiable, "_foreach ops don't support autograd"

    if maximize:
        grads = torch._foreach_neg(grads)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    def _view_complex_as_real(tensor_list):
        return [
            torch.view_as_real(t) if torch.is_complex(t) else t
            for t in tensor_list
        ]

    grads = _view_complex_as_real(grads)
    params = _view_complex_as_real(params)
    square_avgs = _view_complex_as_real(square_avgs)

    torch._foreach_mul_(square_avgs, alpha)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - alpha)

    if centered:
        grad_avgs = _view_complex_as_real(grad_avgs)
        torch._foreach_mul_(grad_avgs, alpha)
        torch._foreach_add_(grad_avgs, grads, alpha=1 - alpha)
        avg = torch._foreach_addcmul(square_avgs,
                                     grad_avgs,
                                     grad_avgs,
                                     value=-1)
        torch._foreach_sqrt_(avg)
        torch._foreach_add_(avg, eps)
    else:
        avg = torch._foreach_sqrt(square_avgs)
        torch._foreach_add_(avg, eps)

    if momentum > 0:
        momentum_buffer_list = _view_complex_as_real(momentum_buffer_list)
        torch._foreach_mul_(momentum_buffer_list, momentum)
        torch._foreach_addcdiv_(momentum_buffer_list, grads, avg)
        torch._foreach_add_(params, momentum_buffer_list, alpha=-lr)
    else:
        torch._foreach_addcdiv_(params, grads, avg, value=-lr)
示例#5
0
    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The loss returned by ``closure``, or ``None`` if no closure
            was given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            grads = []
            params_with_grad = []
            states = []
            alpha = group['alpha']
            square_avg = []

            for p in group['params']:
                if p.grad is not None:
                    if p.grad.is_sparse:
                        raise RuntimeError(
                            'RMSprop does not support sparse gradients')

                    grads.append(p.grad)
                    params_with_grad.append(p)

                    state = self.state[p]
                    # Lazy state initialization on the first step.
                    if len(state) == 0:
                        state['step'] = 0
                        state['square_avg'] = torch.zeros_like(
                            p, memory_format=torch.preserve_format)
                        if group['momentum'] > 0:
                            state['momentum_buffer'] = torch.zeros_like(
                                p, memory_format=torch.preserve_format)
                        if group['centered']:
                            state['grad_avg'] = torch.zeros_like(
                                p, memory_format=torch.preserve_format)

                    # BUG FIX: increment on every step.  Previously this was
                    # nested inside the init branch above, so the counter
                    # stayed at 1 forever.
                    state['step'] += 1

                    states.append(state)
                    square_avg.append(state['square_avg'])

            if group['weight_decay'] != 0:
                # BUG FIX: out-of-place add — the original in-place
                # _foreach_add_ silently mutated the params' ``.grad``
                # tensors.
                grads = torch._foreach_add(grads,
                                           params_with_grad,
                                           alpha=group['weight_decay'])

            # E[g^2] <- alpha * E[g^2] + (1 - alpha) * g^2
            torch._foreach_mul_(square_avg, alpha)
            torch._foreach_addcmul_(square_avg, grads, grads, value=1 - alpha)

            if group['centered']:
                grad_avgs = [s['grad_avg'] for s in states]
                torch._foreach_mul_(grad_avgs, alpha)
                torch._foreach_add_(grad_avgs, grads, alpha=1 - alpha)
                # Variance estimate: E[g^2] - E[g]^2
                avg = torch._foreach_addcmul(square_avg,
                                             grad_avgs,
                                             grad_avgs,
                                             value=-1)
                torch._foreach_sqrt_(avg)
                torch._foreach_add_(avg, group['eps'])
            else:
                avg = torch._foreach_sqrt(square_avg)
                torch._foreach_add_(avg, group['eps'])

            if group['momentum'] > 0:
                buf = [s['momentum_buffer'] for s in states]
                torch._foreach_mul_(buf, group['momentum'])
                torch._foreach_addcdiv_(buf, grads, avg)
                torch._foreach_add_(params_with_grad, buf, alpha=-group['lr'])
            else:
                torch._foreach_addcdiv_(params_with_grad,
                                        grads,
                                        avg,
                                        value=-group['lr'])

        return loss
示例#6
0
    def step(self, closure=None):
        """Perform a single Adadelta optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            rho, eps = group['rho'], group['eps']
            params_with_grad, grads = [], []
            square_avgs, acc_deltas = [], []

            for param in group['params']:
                grad = param.grad
                if grad is None:
                    continue
                if grad.is_sparse:
                    raise RuntimeError(
                        'Adadelta does not support sparse gradients')

                grads.append(grad)
                params_with_grad.append(param)

                state = self.state[param]
                # Lazily create per-parameter state on the first step.
                if len(state) == 0:
                    state['step'] = 0
                    state['square_avg'] = torch.zeros_like(
                        param, memory_format=torch.preserve_format)
                    state['acc_delta'] = torch.zeros_like(
                        param, memory_format=torch.preserve_format)

                square_avgs.append(state['square_avg'])
                acc_deltas.append(state['acc_delta'])
                state['step'] += 1

            if group['weight_decay'] != 0:
                torch._foreach_add_(grads, params_with_grad,
                                    alpha=group['weight_decay'])

            # E[g^2] <- rho * E[g^2] + (1 - rho) * g^2
            torch._foreach_mul_(square_avgs, rho)
            torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

            std = torch._foreach_add(square_avgs, eps)
            torch._foreach_sqrt_(std)

            # delta = sqrt(E[dx^2] + eps) / sqrt(E[g^2] + eps) * g
            deltas = torch._foreach_add(acc_deltas, eps)
            torch._foreach_sqrt_(deltas)
            torch._foreach_div_(deltas, std)
            torch._foreach_mul_(deltas, grads)

            torch._foreach_add_(params_with_grad, deltas,
                                alpha=-group['lr'])

            # E[dx^2] <- rho * E[dx^2] + (1 - rho) * delta^2
            torch._foreach_mul_(acc_deltas, rho)
            torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)

        return loss