Example #1
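All of the snippets below are multi-tensor ("foreach") optimizer implementations in the style of torch/optim: each one updates an entire list of parameters with batched torch._foreach_* kernels instead of looping over tensors in Python. Every listing assumes the same imports, which the original excerpts omit:

import math
from typing import List, Optional

import torch
from torch import Tensor
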
def _multi_tensor_adadelta(params: List[Tensor],
                           grads: List[Tensor],
                           square_avgs: List[Tensor],
                           acc_deltas: List[Tensor],
                           *,
                           lr: float,
                           weight_decay: float,
                           rho: float,
                           eps: float,
                           maximize: bool):

    if len(params) == 0:
        return

    if maximize:
        grads = torch._foreach_neg(grads)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(square_avgs, rho)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - rho)

    std = torch._foreach_add(square_avgs, eps)
    torch._foreach_sqrt_(std)

    deltas = torch._foreach_add(acc_deltas, eps)
    torch._foreach_sqrt_(deltas)
    torch._foreach_div_(deltas, std)
    torch._foreach_mul_(deltas, grads)

    torch._foreach_add_(params, deltas, alpha=-lr)

    torch._foreach_mul_(acc_deltas, rho)
    torch._foreach_addcmul_(acc_deltas, deltas, deltas, value=1 - rho)
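
For reference, this is the standard Adadelta rule (Zeiler, 2012), with v = square_avgs and u = acc_deltas:

    v_t = \rho v_{t-1} + (1 - \rho) g_t^2
    \Delta_t = \frac{\sqrt{u_{t-1} + \epsilon}}{\sqrt{v_t + \epsilon}} \, g_t
    x_t = x_{t-1} - \mathrm{lr} \cdot \Delta_t
    u_t = \rho u_{t-1} + (1 - \rho) \Delta_t^2

Note that the code reads acc_deltas to form \Delta_t before overwriting it with the new running average, which is why the deltas buffer is built out-of-place with _foreach_add.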
Example #2
def _multi_tensor_sgd(params: List[Tensor],
                      grads: List[Tensor],
                      momentum_buffer_list: List[Optional[Tensor]],
                      *,
                      weight_decay: float,
                      momentum: float,
                      lr: float,
                      dampening: float,
                      nesterov: bool,
                      maximize: bool,
                      has_sparse_grad: Optional[bool]):

    if len(params) == 0:
        return

    if has_sparse_grad is None:
        has_sparse_grad = any(grad.is_sparse for grad in grads)

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    if weight_decay != 0:
        grads = torch._foreach_add(grads, params, alpha=weight_decay)

    if momentum != 0:
        bufs = []

        all_states_with_momentum_buffer = True
        for i in range(len(momentum_buffer_list)):
            if momentum_buffer_list[i] is None:
                all_states_with_momentum_buffer = False
                break
            else:
                bufs.append(momentum_buffer_list[i])

        if all_states_with_momentum_buffer:
            torch._foreach_mul_(bufs, momentum)
            torch._foreach_add_(bufs, grads, alpha=1 - dampening)
        else:
            bufs = []
            for i in range(len(momentum_buffer_list)):
                if momentum_buffer_list[i] is None:
                    buf = momentum_buffer_list[i] = torch.clone(grads[i]).detach()
                else:
                    buf = momentum_buffer_list[i]
                    buf.mul_(momentum).add_(grads[i], alpha=1 - dampening)

                bufs.append(buf)

        if nesterov:
            torch._foreach_add_(grads, bufs, alpha=momentum)
        else:
            grads = bufs

    if not has_sparse_grad:
        torch._foreach_add_(params, grads, alpha=-lr)
    else:
        # foreach APIs don't support sparse gradients; fall back to a per-tensor loop
        for i in range(len(params)):
            params[i].add_(grads[i], alpha=-lr)
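
A minimal smoke test for the function above, with arbitrary shapes and values (not from the original source):

params = [torch.zeros(3), torch.zeros(2, 2)]
grads = [torch.ones_like(p) for p in params]
momentum_buffers = [None, None]  # populated in-place on the first call

_multi_tensor_sgd(params, grads, momentum_buffers,
                  weight_decay=0.0, momentum=0.9, lr=0.1,
                  dampening=0.0, nesterov=False, maximize=False,
                  has_sparse_grad=False)
# First step from zero-initialized buffers: params[i] == -lr * grads[i]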
Example #3
def _multi_tensor_adam(params: List[Tensor],
                       grads: List[Tensor],
                       exp_avgs: List[Tensor],
                       exp_avg_sqs: List[Tensor],
                       max_exp_avg_sqs: List[Tensor],
                       state_steps: List[Tensor],
                       *,
                       amsgrad: bool,
                       beta1: float,
                       beta2: float,
                       lr: float,
                       weight_decay: float,
                       eps: float,
                       maximize: bool):

    if len(params) == 0:
        return

    # update steps
    torch._foreach_add_(state_steps, 1)

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    bias_correction1 = [1 - beta1 ** step.item() for step in state_steps]
    bias_correction2 = [1 - beta2 ** step.item() for step in state_steps]
    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, value=1 - beta2)

    if amsgrad:
        # Maintains the maximum of all 2nd moment running avg. till now.
        # In-place, so the AMSGrad state tensors are actually updated between steps.
        torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)

        # Use the max. for normalizing running avg. of gradient
        max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
        bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
        torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction_sqrt)
        denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps)
    else:
        exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
        bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
        torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt)
        denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

    step_size = [(lr / bc) * -1 for bc in bias_correction1]
    torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)
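
This is the bias-corrected Adam update (Kingma & Ba, 2015); the code folds the 1/(1 - \beta_1^t) factor into step_size and the \sqrt{1 - \beta_2^t} factor into denom instead of materializing the hatted moments:

    m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
    v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
    x_t = x_{t-1} - \mathrm{lr} \cdot \frac{m_t / (1 - \beta_1^t)}{\sqrt{v_t / (1 - \beta_2^t)} + \epsilon}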
Example #4
def _multi_tensor_rmsprop(params: List[Tensor], grads: List[Tensor],
                          square_avgs: List[Tensor], grad_avgs: List[Tensor],
                          momentum_buffer_list: List[Tensor], *, lr: float,
                          alpha: float, eps: float, weight_decay: float,
                          momentum: float, centered: bool, maximize: bool,
                          differentiable: bool):

    if len(params) == 0:
        return

    assert not differentiable, "_foreach ops don't support autograd"

    if maximize:
        grads = torch._foreach_neg(grads)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    def _view_complex_as_real(tensor_list):
        return [
            torch.view_as_real(t) if torch.is_complex(t) else t
            for t in tensor_list
        ]

    grads = _view_complex_as_real(grads)
    params = _view_complex_as_real(params)
    square_avgs = _view_complex_as_real(square_avgs)

    torch._foreach_mul_(square_avgs, alpha)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - alpha)

    if centered:
        grad_avgs = _view_complex_as_real(grad_avgs)
        torch._foreach_mul_(grad_avgs, alpha)
        torch._foreach_add_(grad_avgs, grads, alpha=1 - alpha)
        avg = torch._foreach_addcmul(square_avgs,
                                     grad_avgs,
                                     grad_avgs,
                                     value=-1)
        torch._foreach_sqrt_(avg)
        torch._foreach_add_(avg, eps)
    else:
        avg = torch._foreach_sqrt(square_avgs)
        torch._foreach_add_(avg, eps)

    if momentum > 0:
        momentum_buffer_list = _view_complex_as_real(momentum_buffer_list)
        torch._foreach_mul_(momentum_buffer_list, momentum)
        torch._foreach_addcdiv_(momentum_buffer_list, grads, avg)
        torch._foreach_add_(params, momentum_buffer_list, alpha=-lr)
    else:
        torch._foreach_addcdiv_(params, grads, avg, value=-lr)
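
In equations, the centered branch estimates the gradient variance by subtracting the squared running mean before the square root; the uncentered branch uses the raw second moment:

    v_t = \alpha v_{t-1} + (1 - \alpha) g_t^2
    \bar{g}_t = \alpha \bar{g}_{t-1} + (1 - \alpha) g_t
    \mathrm{avg}_t = \sqrt{v_t - \bar{g}_t^2} + \epsilon \quad (\text{centered}), \qquad \mathrm{avg}_t = \sqrt{v_t} + \epsilon \quad (\text{uncentered})
    x_t = x_{t-1} - \mathrm{lr} \cdot g_t / \mathrm{avg}_t \quad (\text{momentum} = 0)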
Example #5
def _multi_tensor_rprop(params: List[Tensor],
                        grads: List[Tensor],
                        prevs: List[Tensor],
                        step_sizes: List[Tensor],
                        *,
                        step_size_min: float,
                        step_size_max: float,
                        etaminus: float,
                        etaplus: float,
                        maximize: bool):

    if len(params) == 0:
        return

    # Handle complex params
    def _view_complex_as_real(tensor_list):
        return [torch.view_as_real(t) if torch.is_complex(t) else t for t in tensor_list]

    grads = _view_complex_as_real(grads)
    prevs = _view_complex_as_real(prevs)
    params = _view_complex_as_real(params)
    step_sizes = _view_complex_as_real(step_sizes)

    if maximize:
        grads = torch._foreach_neg(grads)

    signs = torch._foreach_mul(grads, prevs)
    signs = [s.sign() for s in signs]
    for sign in signs:
        sign[sign.gt(0)] = etaplus
        sign[sign.lt(0)] = etaminus
        sign[sign.eq(0)] = 1

    # update stepsizes with step size updates
    torch._foreach_mul_(step_sizes, signs)
    for step_size in step_sizes:
        step_size.clamp_(step_size_min, step_size_max)

    # for dir<0, dfdx=0
    # for dir>=0 dfdx=dfdx
    grads = list(grads)
    for i in range(len(grads)):
        grads[i] = grads[i].clone(memory_format=torch.preserve_format)
        grads[i][signs[i].eq(etaminus)] = 0

    # update parameters
    grad_signs = [grad.sign() for grad in grads]
    torch._foreach_addcmul_(params, grad_signs, step_sizes, value=-1)

    for i in range(len(prevs)):
        prevs[i].copy_(grads[i])
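
Rprop adapts a per-weight step size from the sign agreement of consecutive gradients and then steps by sign alone; gradient magnitude never enters the update:

    \eta_t = \mathrm{clamp}(c_t \, \eta_{t-1}, \eta_{\min}, \eta_{\max}), \qquad
    c_t = \begin{cases} \eta^+ & g_t \, g_{t-1} > 0 \\ \eta^- & g_t \, g_{t-1} < 0 \\ 1 & \text{otherwise} \end{cases}
    x_t = x_{t-1} - \mathrm{sign}(g_t) \, \eta_t

When the sign flips (the \eta^- case), the code also zeroes that gradient entry before copying it into prevs, so the next comparison is treated as "no change".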
Example #6
def _multi_tensor_adamax(params: List[Tensor],
                         grads: List[Tensor],
                         exp_avgs: List[Tensor],
                         exp_infs: List[Tensor],
                         state_steps: List[Tensor],
                         *,
                         beta1: float,
                         beta2: float,
                         lr: float,
                         weight_decay: float,
                         eps: float,
                         maximize: bool):

    if len(params) == 0:
        return

    if maximize:
        grads = torch._foreach_neg(grads)

    params = [torch.view_as_real(x) if torch.is_complex(x) else x for x in params]
    grads = [torch.view_as_real(x) if torch.is_complex(x) else x for x in grads]
    exp_avgs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in exp_avgs]
    exp_infs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in exp_infs]

    # Update steps
    torch._foreach_add_(state_steps, 1)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # Update biased first moment estimate.
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    # Update the exponentially weighted infinity norm.
    torch._foreach_mul_(exp_infs, beta2)

    for exp_inf, grad in zip(exp_infs, grads):
        # Elementwise max of the decayed infinity norm and |grad| + eps,
        # computed by stacking both tensors and reducing along dim 0.
        norm_buf = torch.cat([
            exp_inf.unsqueeze(0),
            grad.abs().add_(eps).unsqueeze_(0)
        ], 0)
        torch.max(norm_buf, 0, keepdim=False, out=(exp_inf, exp_inf.new().long()))

    bias_corrections = [1 - beta1 ** step.item() for step in state_steps]
    clr = [-1 * (lr / bias_correction) for bias_correction in bias_corrections]
    torch._foreach_addcdiv_(params, exp_avgs, exp_infs, clr)
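
Adamax (Kingma & Ba, 2015) replaces Adam's second moment with an exponentially weighted infinity norm, which is exactly what the cat/max loop computes per tensor:

    m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
    u_t = \max(\beta_2 u_{t-1}, |g_t| + \epsilon)
    x_t = x_{t-1} - \frac{\mathrm{lr}}{1 - \beta_1^t} \cdot m_t / u_t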
Example #7
def _multi_tensor_rmsprop(params: List[Tensor],
                          grads: List[Tensor],
                          square_avgs: List[Tensor],
                          grad_avgs: List[Tensor],
                          momentum_buffer_list: List[Tensor],
                          *,
                          lr: float,
                          alpha: float,
                          eps: float,
                          weight_decay: float,
                          momentum: float,
                          centered: bool,
                          maximize: bool):

    if len(params) == 0:
        return

    if maximize:
        grads = torch._foreach_neg(grads)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    torch._foreach_mul_(square_avgs, alpha)
    torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - alpha)

    if centered:
        torch._foreach_mul_(grad_avgs, alpha)
        torch._foreach_add_(grad_avgs, grads, alpha=1 - alpha)
        avg = torch._foreach_addcmul(square_avgs, grad_avgs, grad_avgs, value=-1)
        torch._foreach_sqrt_(avg)
        torch._foreach_add_(avg, eps)
    else:
        avg = torch._foreach_sqrt(square_avgs)
        torch._foreach_add_(avg, eps)

    if momentum > 0:
        torch._foreach_mul_(momentum_buffer_list, momentum)
        torch._foreach_addcdiv_(momentum_buffer_list, grads, avg)
        torch._foreach_add_(params, momentum_buffer_list, alpha=-lr)
    else:
        torch._foreach_addcdiv_(params, grads, avg, value=-lr)
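
This variant matches Example #4 apart from the complex-tensor handling and the differentiable assert. With momentum enabled, the scaled gradient accumulates into a buffer that is then applied as the step:

    b_t = \mu \, b_{t-1} + g_t / \mathrm{avg}_t
    x_t = x_{t-1} - \mathrm{lr} \cdot b_t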
Example #8
File: asgd.py Project: huaxz1986/pytorch
def _multi_tensor_asgd(params: List[Tensor], grads: List[Tensor],
                       axs: List[Tensor], mus: List[Tensor],
                       etas: List[Tensor], state_steps: List[Tensor], *,
                       lambd: float, lr: float, t0: float, alpha: float,
                       weight_decay: float, maximize: bool):

    if len(params) == 0:
        return

    if maximize:
        grads = torch._foreach_neg(grads)

    # update step
    torch._foreach_add_(state_steps, 1)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # decay term (uses the first eta; in the common case every eta in the
    # group carries the same schedule value)
    eta = etas[0].item()
    torch._foreach_mul_(params, 1 - lambd * eta)

    # update parameter
    torch._foreach_add_(params, grads, alpha=-eta)

    # averaging
    for i in range(len(axs)):
        if mus[i].item() != 1:
            axs[i].add_(params[i].sub(axs[i]).mul(mus[i]))
        else:
            axs[i].copy_(params[i])

    # update eta and mu
    for i in range(len(mus)):
        new_eta = torch.tensor(lr / math.pow(
            (1 + lambd * lr * state_steps[i].item()), alpha))
        etas[i].copy_(new_eta)
        new_mu = torch.tensor(1 / max(1, state_steps[i].item() - t0))
        mus[i].copy_(new_mu)
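
The tail of the function implements Polyak–Ruppert averaging with the classic ASGD schedules, where t is the step count:

    x_t = (1 - \lambda \eta_{t-1}) \, x_{t-1} - \eta_{t-1} \, g_t
    a_t = a_{t-1} + \mu_t (x_t - a_{t-1})
    \eta_t = \frac{\mathrm{lr}}{(1 + \lambda \, \mathrm{lr} \, t)^{\alpha}}, \qquad
    \mu_t = \frac{1}{\max(1, t - t_0)}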
Example #9
def _multi_tensor_adagrad(
    params: List[Tensor],
    grads: List[Tensor],
    state_sums: List[Tensor],
    state_steps: List[Tensor],
    *,
    lr: float,
    weight_decay: float,
    lr_decay: float,
    eps: float,
    has_sparse_grad: Optional[bool],
    maximize: bool,
):

    # Foreach functions will throw errors if given empty lists
    if len(params) == 0:
        return

    if maximize:
        grads = torch._foreach_neg(grads)

    if has_sparse_grad is None:
        has_sparse_grad = any(grad.is_sparse for grad in grads)

    if has_sparse_grad:
        return _single_tensor_adagrad(
            params,
            grads,
            state_sums,
            state_steps,
            lr=lr,
            weight_decay=weight_decay,
            lr_decay=lr_decay,
            eps=eps,
            has_sparse_grad=has_sparse_grad,
            maximize=False,
        )

    # Update steps
    torch._foreach_add_(state_steps, 1)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # per-step effective learning rate; .item() extracts the Python scalar
    minus_clr = [-lr / (1 + (step.item() - 1) * lr_decay) for step in state_steps]

    grads = [torch.view_as_real(x) if torch.is_complex(x) else x for x in grads]
    state_sums = [
        torch.view_as_real(x) if torch.is_complex(x) else x for x in state_sums
    ]
    torch._foreach_addcmul_(state_sums, grads, grads, value=1)
    std = torch._foreach_add(torch._foreach_sqrt(state_sums), eps)
    toAdd = torch._foreach_div(torch._foreach_mul(grads, minus_clr), std)
    toAdd = [
        torch.view_as_complex(x) if torch.is_complex(params[i]) else x
        for i, x in enumerate(toAdd)
    ]
    torch._foreach_add_(params, toAdd)
    state_sums = [
        torch.view_as_complex(x) if torch.is_complex(params[i]) else x
        for i, x in enumerate(state_sums)
    ]
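
Aside from the round trip through torch.view_as_real for complex parameters, this is plain Adagrad with learning-rate decay:

    s_t = s_{t-1} + g_t^2
    x_t = x_{t-1} - \frac{\mathrm{lr}}{1 + (t - 1)\,\mathrm{lr\_decay}} \cdot \frac{g_t}{\sqrt{s_t} + \epsilon}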
Example #10
def _multi_tensor_adamw(params: List[Tensor], grads: List[Tensor],
                        exp_avgs: List[Tensor], exp_avg_sqs: List[Tensor],
                        max_exp_avg_sqs: List[Tensor],
                        state_steps: List[Tensor], *, amsgrad: bool,
                        beta1: float, beta2: float, lr: float,
                        weight_decay: float, eps: float, maximize: bool,
                        capturable: bool):
    if len(params) == 0:
        return

    if capturable:
        assert all(p.is_cuda and step.is_cuda for p, step in zip(params, state_steps)), \
            "If capturable=True, params and state_steps must be CUDA tensors."

    if maximize:
        grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]

    grads = [
        torch.view_as_real(x) if torch.is_complex(x) else x for x in grads
    ]
    exp_avgs = [
        torch.view_as_real(x) if torch.is_complex(x) else x for x in exp_avgs
    ]
    exp_avg_sqs = [
        torch.view_as_real(x) if torch.is_complex(x) else x
        for x in exp_avg_sqs
    ]
    params = [
        torch.view_as_real(x) if torch.is_complex(x) else x for x in params
    ]

    # update steps
    torch._foreach_add_(state_steps, 1)

    # Perform stepweight decay
    torch._foreach_mul_(params, 1 - lr * weight_decay)

    # Decay the first and second moment running average coefficient
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, value=1 - beta2)

    if capturable:
        # TODO: use foreach_pow if/when foreach_pow is added
        bias_correction1 = [torch.pow(beta1, step) for step in state_steps]
        bias_correction2 = [torch.pow(beta2, step) for step in state_steps]
        # foreach_sub doesn't allow a scalar as the first arg
        torch._foreach_sub_(bias_correction1, 1)
        torch._foreach_sub_(bias_correction2, 1)
        torch._foreach_neg_(bias_correction1)
        torch._foreach_neg_(bias_correction2)

        # foreach_div doesn't allow a scalar as the first arg
        step_size = torch._foreach_div(bias_correction1, lr)
        torch._foreach_reciprocal_(step_size)
        torch._foreach_neg_(step_size)

        bias_correction2_sqrt = torch._foreach_sqrt(bias_correction2)

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)

            # Use the max. for normalizing running avg. of gradient
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
            # Folds in (admittedly ugly) 1-elem step_size math here to avoid extra param-set-sized read+write
            # (can't fold it into addcdiv_ below because addcdiv_ requires value is a Number, not a Tensor)
            torch._foreach_div_(
                max_exp_avg_sq_sqrt,
                torch._foreach_mul(bias_correction2_sqrt, step_size))
            eps_over_step_size = torch._foreach_div(step_size, eps)
            torch._foreach_reciprocal_(eps_over_step_size)
            denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps_over_step_size)
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
            torch._foreach_div_(
                exp_avg_sq_sqrt,
                torch._foreach_mul(bias_correction2_sqrt, step_size))
            eps_over_step_size = torch._foreach_div(step_size, eps)
            torch._foreach_reciprocal_(eps_over_step_size)
            denom = torch._foreach_add(exp_avg_sq_sqrt, eps_over_step_size)

        torch._foreach_addcdiv_(params, exp_avgs, denom)
    else:
        bias_correction1 = [1 - beta1**step.item() for step in state_steps]
        bias_correction2 = [1 - beta2**step.item() for step in state_steps]

        step_size = [(lr / bc) * -1 for bc in bias_correction1]

        bias_correction2_sqrt = [math.sqrt(bc) for bc in bias_correction2]

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)

            # Use the max. for normalizing running avg. of gradient
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
            torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction2_sqrt)
            denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps)
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
            torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt)
            denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

        torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)
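
The difference from Example #3 is AdamW's decoupled weight decay (Loshchilov & Hutter, 2019): decay multiplies the parameters directly rather than being added to the gradient, and the rest is the bias-corrected Adam step:

    x_t \leftarrow (1 - \mathrm{lr} \cdot \lambda) \, x_{t-1}
    x_t \leftarrow x_t - \mathrm{lr} \cdot \frac{m_t / (1 - \beta_1^t)}{\sqrt{v_t / (1 - \beta_2^t)} + \epsilon}

The capturable branch computes the same quantities but keeps every scalar as a tensor op, so the whole step can be recorded in a CUDA graph without .item() synchronizations.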
Example #11
    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            amsgrad = group['amsgrad']

            grads = []
            states = []
            exp_avg = []
            exp_avg_sq = []
            max_exp_avg_sq = []
            params_with_grad = []

            for p in group['params']:
                if p.grad is not None:
                    if p.grad.is_sparse:
                        raise RuntimeError('AdamW does not support sparse gradients')

                    # Perform stepweight decay
                    p.mul_(1 - group['lr'] * group['weight_decay'])

                    params_with_grad.append(p)
                    grads.append(p.grad)

            if group['maximize']:
                grads = torch._foreach_neg(tuple(grads))

            for p in params_with_grad:
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                exp_avg.append(state['exp_avg'])
                exp_avg_sq.append(state['exp_avg_sq'])

                if amsgrad:
                    max_exp_avg_sq.append(state['max_exp_avg_sq'])

                state['step'] += 1
                states.append(state)

            beta1, beta2 = group['betas']

            bias_correction1 = [1 - beta1 ** state['step'] for state in states]
            bias_correction2 = [1 - beta2 ** state['step'] for state in states]

            #
            # Decay the first and second moment running average coefficient
            #
            torch._foreach_mul_(exp_avg, beta1)
            torch._foreach_add_(exp_avg, grads, alpha=1 - beta1)

            torch._foreach_mul_(exp_avg_sq, beta2)
            torch._foreach_addcmul_(exp_avg_sq, grads, grads, value=1 - beta2)

            if amsgrad:
                # Maintains the maximum of all 2nd moment running avg. till now.
                # In-place, so the per-parameter 'max_exp_avg_sq' state is updated.
                torch._foreach_maximum_(max_exp_avg_sq, exp_avg_sq)

                # Use the max. for normalizing running avg. of gradient
                max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sq)
                bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
                torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction_sqrt)
                denom = torch._foreach_add(max_exp_avg_sq_sqrt, group['eps'])
            else:
                exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sq)
                bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
                torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt)
                denom = torch._foreach_add(exp_avg_sq_sqrt, group['eps'])

            step_size = [-1 * (group['lr'] / bc) for bc in bias_correction1]
            torch._foreach_addcdiv_(params_with_grad, exp_avg, denom, step_size)

        return loss
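
A sketch of the closure protocol the docstring describes; opt stands for an instance of the optimizer class this step method belongs to, and model, loss_fn, inputs, targets are hypothetical stand-ins:

def closure():
    opt.zero_grad()                         # clear stale gradients
    loss = loss_fn(model(inputs), targets)  # recompute the loss
    loss.backward()                         # repopulate p.grad
    return loss

loss = opt.step(closure)  # step() re-enables grad around the closure call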