def DistributedOptimizer(optimizer, named_parameters=None,
                         compression=Compression.none,
                         backward_passes_per_step=1,
                         op=Average,
                         gradient_predivide_factor=1.0,
                         num_groups=0, groups=None,
                         sparse_as_dense=False):
    """
    An optimizer that wraps another torch.optim.Optimizer, using an allreduce to
    combine gradient values before applying gradients to model weights.

    Allreduce operations are executed after each gradient is computed by ``loss.backward()``
    in parallel with each other. The ``step()`` method ensures that all allreduce operations are
    finished before applying gradients to the model.

    DistributedOptimizer exposes the ``synchronize()`` method, which forces allreduce operations
    to finish before continuing the execution. It's useful in conjunction with gradient clipping
    or other operations that modify gradients in place before ``step()`` is executed.
    Make sure to use ``optimizer.skip_synchronize()`` if you're calling ``synchronize()``
    in your code.

    Example of gradient clipping:

    .. code-block:: python

        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.synchronize()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        with optimizer.skip_synchronize():
            optimizer.step()

    Arguments:
        optimizer: Optimizer to use for computing gradients and applying updates.
        named_parameters: A mapping between parameter names and values. Used for naming of
                          allreduce operations. Typically just ``model.named_parameters()``.
        compression: Compression algorithm used during allreduce to reduce the amount
                     of data sent during each parameter update step. Defaults to
                     not using compression.
        backward_passes_per_step: Number of expected backward passes to perform
                                  before calling step()/synchronize(). This
                                  allows accumulating gradients over multiple
                                  mini-batches before reducing and applying them.
        op: The reduction operation to use when combining gradients across different ranks.
        gradient_predivide_factor: If op == Average, gradient_predivide_factor splits the averaging
                                   before and after the sum. Gradients are scaled by
                                   1.0 / gradient_predivide_factor before the sum and
                                   gradient_predivide_factor / size after the sum.
        num_groups: Number of groups to assign gradient allreduce ops to for explicit
                    grouping. Defaults to no explicit groups.
        groups: The parameter to group the gradient allreduce ops. Accepted values are
                a non-negative integer or a list of lists of torch.Tensor.
                If groups is a non-negative integer, it is the number of groups to assign
                gradient allreduce ops to for explicit grouping.
                If groups is a list of lists of torch.Tensor, tensors in the same
                inner list will be assigned to the same group, while a parameter that
                does not appear in any list will form a group by itself.
                Defaults to None, which means no explicit groups.
        sparse_as_dense: If set to True, convert all sparse gradients to dense and perform
                         allreduce, then convert back to sparse before applying the update.
    """
    # We dynamically create a new class that inherits from the optimizer that was passed in.
    # The goal is to override the `step()` method with an allreduce implementation.
    if gradient_predivide_factor != 1.0:
        if rocm_built():
            raise ValueError(
                'gradient_predivide_factor not supported yet with ROCm')
        if op != Average:
            raise ValueError(
                'gradient_predivide_factor not supported with op != Average')

    if num_groups != 0:
        warnings.warn(
            'Parameter `num_groups` has been replaced by `groups` '
            'and will be removed in v0.23.0.', DeprecationWarning)
        if groups is None:
            groups = num_groups

    if op != Adasum or size() == 1:
        cls = type(optimizer.__class__.__name__, (optimizer.__class__,),
                   dict(_DistributedOptimizer.__dict__))
        return cls(optimizer.param_groups, named_parameters, compression,
                   backward_passes_per_step, op, gradient_predivide_factor,
                   groups, sparse_as_dense)
    else:
        cls = type(optimizer.__class__.__name__, (optimizer.__class__,),
                   dict(_DistributedAdasumOptimizer.__dict__))
        return cls(optimizer.param_groups, named_parameters, compression,
                   backward_passes_per_step)
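

# --- Illustrative sketch of the dynamic subclassing used above -------------
# `type(name, bases, namespace)` builds a class at runtime. The hypothetical
# helper below (not part of Horovod) mirrors the pattern: it derives a new
# class from the wrapped optimizer's own class and overrides step(), just as
# the code above does with dict(_DistributedOptimizer.__dict__). `_wrap_with_timing`
# and `_last_step_seconds` are made-up names used only for this sketch.
def _wrap_with_timing(optimizer):
    import time

    def step(self, closure=None):
        # Delegate to the original optimizer class (e.g. torch.optim.SGD) and
        # record how long the update took.
        start = time.time()
        result = super(self.__class__, self).step(closure)
        self._last_step_seconds = time.time() - start
        return result

    # The new class keeps the original class name and inherits its behavior;
    # only the methods supplied in the namespace dict are overridden.
    cls = type(optimizer.__class__.__name__, (optimizer.__class__,),
               {'step': step})
    return cls(optimizer.param_groups)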
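

# --- Illustrative usage sketch (hypothetical, not part of the library) -----
# Shows how DistributedOptimizer is typically wrapped around a stock
# torch.optim optimizer, with `backward_passes_per_step` used to accumulate
# gradients over several mini-batches before each allreduce/step. The names
# `model`, `data_loader`, and `loss_fn` are assumed placeholders supplied by
# the caller; they are not defined in this module.
def _example_usage(model, data_loader, loss_fn, passes_per_step=2):
    import torch

    base_optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    # Allreduce ops are launched in the background while loss.backward()
    # produces gradients; step() waits for them to finish before applying
    # the combined gradients.
    optimizer = DistributedOptimizer(
        base_optimizer,
        named_parameters=model.named_parameters(),
        backward_passes_per_step=passes_per_step)

    optimizer.zero_grad()
    for i, (data, target) in enumerate(data_loader):
        loss = loss_fn(model(data), target) / passes_per_step
        loss.backward()
        if (i + 1) % passes_per_step == 0:
            optimizer.step()
            optimizer.zero_grad()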