def _allreduce_grads(self):
    """Allreduce all trainable parameters' gradients, grouped when enabled.

    When ``self._num_groups > 0`` gradients are batched into groups and each
    group is further partitioned by dtype before calling grouped allreduce;
    otherwise each gradient is allreduced individually.
    """
    if size() == 1:
        return

    if self._num_groups <= 0:
        # In MXNet 2.0, param.name is no longer unique.
        # Meanwhile, since horovod requires Python 3.6, there is no need to sort
        # self._params as enumerating a python dict is always deterministic.
        for idx, param in enumerate(self._params):
            if param.grad_req == 'null':
                continue
            allreduce_(param.list_grad()[0], average=False,
                       name=self._prefix + str(idx), priority=-idx,
                       prescale_factor=1.0 / self._gradient_predivide_factor)
        return

    # Collect (gradient, unique positional name) pairs for trainable params.
    pending = [(param.list_grad()[0], self._prefix + str(idx))
               for idx, param in enumerate(self._params)
               if param.grad_req != 'null']
    all_grads = [g for g, _ in pending]
    all_names = [n for _, n in pending]

    grouped = zip(split_list(all_grads, self._num_groups),
                  split_list(all_names, self._num_groups))
    for group_idx, (group_grads, group_names) in enumerate(grouped):
        # For better performance, enqueue groups in separate grouped_allreduce
        # calls by dtype.
        by_dtype = defaultdict(list)
        for g, n in zip(group_grads, group_names):
            by_dtype[g.dtype].append((g, n))
        for entries in by_dtype.values():
            tensors, tensor_names = zip(*entries)
            grouped_allreduce_(tensors=tensors, average=False,
                               name="{}:{}".format(tensor_names[0],
                                                   tensor_names[-1]),
                               priority=-group_idx,
                               prescale_factor=1.0 / self._gradient_predivide_factor)
def __init__(self, params, optimizer, optimizer_params=None,
             compression=Compression.none,
             gradient_predivide_factor=1.0, prefix=None, num_groups=0):
    """Create a distributed Gluon trainer.

    Arguments:
        params: dict / list / tuple of parameters to optimize, forwarded to
            the base ``Trainer``.
        optimizer: optimizer to use; a ``DistributedOptimizer`` is unwrapped
            to its inner optimizer with a warning.
        optimizer_params: optional dict forwarded to the base ``Trainer``.
        compression: gradient compression algorithm (default: none).
        gradient_predivide_factor: factor folded into the optimizer scale;
            not supported on ROCm builds.
        prefix: optional string prefix used to build unique tensor names.
        num_groups: number of groups for grouped allreduce (0 disables).
    """
    self._compression = compression

    # Prescaling support required by gradient_predivide_factor is not
    # available in ROCm builds.
    if gradient_predivide_factor != 1.0 and rocm_built():
        raise ValueError('gradient_predivide_factor not supported yet with ROCm')

    if isinstance(optimizer, DistributedOptimizer):
        optimizer = optimizer._optimizer
        warnings.warn("DistributedTrainer does not take DistributedOptimizer "
                      "as its optimizer. We have unwrapped it for you.")

    # To ensure consistent parameter ordering across workers, sort params before
    # passing to base Trainer constructor. This logic is consistent with trainer.py
    # since v1.6 but we do it here for backwards compatability
    # NOTE(review): OrderedDict(params) only preserves the dict's existing
    # insertion order — it does not sort keys; confirm whether
    # OrderedDict(sorted(params.items())) was intended given the comment above.
    if isinstance(params, dict):
        params = OrderedDict(params)
    elif isinstance(params, (list, tuple)):
        params = sorted(params)

    super(DistributedTrainer, self).__init__(
        params, optimizer,
        optimizer_params=optimizer_params, kvstore=None)

    # _scale is used to check and set rescale_grad for optimizer in Trainer.step()
    # function. Normalizing it by Horovod size, which is equivalent to performing
    # average in allreduce, has better performance.
    self._scale *= (gradient_predivide_factor / size())
    self._gradient_predivide_factor = gradient_predivide_factor
    assert prefix is None or isinstance(prefix, str)
    self._prefix = prefix if prefix else ""
    self._num_groups = num_groups
def _do_allreduce(self, index, grad):
    """Allreduce (optionally compressed) gradients in place.

    Arguments:
        index: a single key or a tuple/list of keys naming the gradients.
        grad: a single gradient tensor or a matching list of tensors; list
            entries are replaced with the decompressed allreduced results.
    """
    if size() == 1:
        return

    if isinstance(index, (tuple, list)):
        if (self._num_groups > 0):
            grad_split = split_list(grad, self._num_groups)
            index_split = split_list(index, self._num_groups)

            offset = 0  # position of the current group's first entry in `grad`
            for i, (grads, indices) in enumerate(zip(grad_split, index_split)):
                tensors_compressed, ctxs = zip(
                    *[self._compression.compress(g) for g in grads])

                # BUG FIX: the group name was built from an undefined `names`
                # variable (NameError at runtime); use the gradient indices,
                # matching the uncompressed code path.
                grouped_allreduce_(tensors=tensors_compressed, average=False,
                                   name="{}:{}".format(indices[0], indices[-1]),
                                   priority=-i,
                                   prescale_factor=1.0 / self._gradient_predivide_factor)

                # BUG FIX: decompressed tensors were previously bound to a
                # local list and discarded; write them back into `grad`, as
                # the non-grouped branch below does.
                for j, (t, ctx) in enumerate(zip(tensors_compressed, ctxs)):
                    grad[offset + j] = self._compression.decompress(t, ctx)
                offset += len(grads)
        else:
            for i in range(len(index)):
                tensor_compressed, ctx = self._compression.compress(grad[i])
                allreduce_(tensor_compressed, average=False, name=str(index[i]),
                           priority=-i,
                           prescale_factor=1.0 / self._gradient_predivide_factor)
                grad[i] = self._compression.decompress(tensor_compressed, ctx)
    else:
        tensor_compressed, ctx = self._compression.compress(grad)
        allreduce_(tensor_compressed, average=False, name=str(index),
                   prescale_factor=1.0 / self._gradient_predivide_factor)
        # NOTE(review): rebinding the local cannot propagate to the caller;
        # presumably allreduce_ mutates the tensor in place — confirm.
        grad = self._compression.decompress(tensor_compressed, ctx)
def _allreduce_grads(self):
    """Allreduce every trainable parameter's gradient, named by param.name."""
    if size() == 1:
        return
    for position, param in enumerate(self._params):
        if param.grad_req == 'null':
            continue
        allreduce_(param.list_grad()[0], average=False,
                   name=param.name, priority=-position)
def allgather_object(obj, name=None):
    """
    Serializes and allgathers an object from all other processes.

    Arguments:
        obj: An object capable of being serialized without losing any context.
        name: Optional name to use during allgather, will default to the class
              type.

    Returns:
        The list of objects that were allgathered across all ranks.
    """
    if name is None:
        name = type(obj).__name__

    def load(byte_array):
        # Deserialize one rank's payload from its raw bytes.
        buf = io.BytesIO(byte_array.tobytes())
        return cloudpickle.load(buf)

    b = io.BytesIO()
    cloudpickle.dump(obj, b)

    t = mx.nd.array(bytearray(b.getvalue()), dtype='byte')
    sz = mx.nd.array([t.size], dtype='int')
    # sizes[i] is the byte length of rank i's payload; gathered is the
    # concatenation of all payloads in rank order.
    sizes = allgather(sz, name=name + '.sz').asnumpy()
    gathered = allgather(t, name=name + '.t').asnumpy()

    def select(i):
        # BUG FIX: rank i's payload starts at the sum of ALL preceding sizes,
        # not just sizes[i - 1]; the old code returned wrong slices whenever
        # more than two ranks participated.
        start = sizes[:i].sum()
        end = start + sizes[i]
        return gathered[start:end]

    return [load(select(i)) for i in range(size())]
def _do_allreduce(self, index, grad):
    """Allreduce gradient(s), grouping them when ``self._num_groups > 0``."""
    if size() == 1:
        return

    prescale = 1.0 / self._gradient_predivide_factor

    if not isinstance(index, (tuple, list)):
        allreduce_(grad, average=False, name=str(index),
                   prescale_factor=prescale)
        return

    if self._num_groups > 0:
        groups = zip(split_list(grad, self._num_groups),
                     split_list(index, self._num_groups))
        for group_id, (group_grads, group_indices) in enumerate(groups):
            grouped_allreduce_(
                tensors=group_grads, average=False,
                name="{}:{}".format(group_indices[0], group_indices[-1]),
                priority=-group_id,
                prescale_factor=prescale)
    else:
        for position, idx in enumerate(index):
            allreduce_(grad[position], average=False, name=str(idx),
                       priority=-position, prescale_factor=prescale)
def __init__(self, params, optimizer, optimizer_params=None,
             gradient_predivide_factor=1.0, prefix=None):
    """Create a distributed Gluon trainer.

    Arguments:
        params: parameters to optimize, forwarded to the base ``Trainer``.
        optimizer: optimizer to use; a ``DistributedOptimizer`` is unwrapped
            to its inner optimizer with a warning.
        optimizer_params: optional dict forwarded to the base ``Trainer``.
        gradient_predivide_factor: factor folded into the optimizer scale;
            not supported on ROCm builds.
        prefix: optional string prefix used when naming gradient tensors.
    """
    # Prescaling support required by gradient_predivide_factor is not
    # available in ROCm builds.
    if gradient_predivide_factor != 1.0 and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    if isinstance(optimizer, DistributedOptimizer):
        optimizer = optimizer._optimizer
        warnings.warn(
            "DistributedTrainer does not take DistributedOptimizer "
            "as its optimizer. We have unwrapped it for you.")

    super(DistributedTrainer, self).__init__(
        params, optimizer,
        optimizer_params=optimizer_params, kvstore=None)

    # _scale is used to check and set rescale_grad for optimizer in Trainer.step()
    # function. Normalizing it by Horovod size, which is equivalent to performing
    # average in allreduce, has better performance.
    self._scale *= (gradient_predivide_factor / size())
    self._gradient_predivide_factor = gradient_predivide_factor
    assert prefix is None or isinstance(prefix, str)
    self._prefix = prefix if prefix else ""
def broadcast_parameters(params, root_rank=0):
    """
    Broadcasts the parameters from root rank to all other processes.
    Typical usage is to broadcast the `Module.get_params()` or the
    `Block.collect_params()`.

    Arguments:
        params: One of the following:
            - dict of parameters to broadcast
            - ParameterDict to broadcast
        root_rank: The rank of the process from which parameters will be
            broadcasted to all other processes.
    """
    if size() == 1:
        return

    tensors = []
    names = []
    if isinstance(params, dict):
        names, tensors = zip(*params.items())
    elif isinstance(params, mx.gluon.parameter.ParameterDict):
        for key, param in sorted(params.items()):
            try:
                data = param.data()
            except mx.gluon.parameter.DeferredInitializationError:
                # Parameter not initialized yet: wrap its init method so the
                # broadcast happens right after initialization completes.
                wrapped_init = _append_broadcast_init(param, root_rank)
                param._init_impl = types.MethodType(wrapped_init, param)
            else:
                tensors.append(data)
                names.append(key)
    else:
        raise ValueError('invalid params of type: %s' % type(params))

    # Run broadcasts.
    for name, tensor in zip(names, tensors):
        broadcast_(tensor, root_rank, name=str(name))
def _do_allreduce(self, index, grad):
    """Allreduce one gradient, or a list of gradients keyed by index."""
    if size() == 1:
        return

    if not isinstance(index, (tuple, list)):
        allreduce_(grad, average=False, name=str(index))
        return

    for position, idx in enumerate(index):
        allreduce_(grad[position], average=False, name=str(idx),
                   priority=-position)
def _allreduce_grads(self):
    """Allreduce gradients of all trainable parameters in a fixed order."""
    if size() == 1:
        return
    # sort needed for Python < 3.6 is not guaranteed
    ordered_params = sorted(self._params, key=lambda p: p.name)
    for position, param in enumerate(ordered_params):
        if param.grad_req == 'null':
            continue
        allreduce_(param.list_grad()[0], average=False,
                   name=str(position), priority=-position)
def __init__(self, optimizer, gradient_predivide_factor=1.0):
    """Wrap an MXNet optimizer, folding the predivide factor and worker
    count into its gradient rescaling.

    Arguments:
        optimizer: the MXNet optimizer to wrap.
        gradient_predivide_factor: factor applied to rescale_grad; not
            supported on ROCm builds.

    Raises:
        ValueError: if gradient_predivide_factor != 1.0 on a ROCm build.
    """
    if gradient_predivide_factor != 1.0 and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    self._optimizer = optimizer
    # Normalizing rescale_grad by Horovod size, which is equivalent to
    # performing average in allreduce, has better performance.
    self._optimizer.rescale_grad *= (gradient_predivide_factor / size())
    self._gradient_predivide_factor = gradient_predivide_factor
def _allreduce_grads(self):
    """Allreduce all trainable gradients, prescaled by the predivide factor."""
    if size() == 1:
        return
    prescale = 1.0 / self._gradient_predivide_factor
    for position, param in enumerate(self._params):
        if param.grad_req == 'null':
            continue
        allreduce_(param.list_grad()[0], average=False, name=param.name,
                   priority=-position, prescale_factor=prescale)
def __init__(self, params, optimizer, optimizer_params=None):
    """Create a distributed Gluon trainer.

    Arguments:
        params: parameters to optimize, forwarded to the base ``Trainer``.
        optimizer: optimizer to use; a ``DistributedOptimizer`` is unwrapped
            to its inner optimizer with a warning.
        optimizer_params: optional dict forwarded to the base ``Trainer``.
    """
    if isinstance(optimizer, DistributedOptimizer):
        optimizer = optimizer._optimizer
        warnings.warn("DistributedTrainer does not take DistributedOptimizer "
                      "as its optimizer. We have unwrapped it for you.")

    super(DistributedTrainer, self).__init__(
        params, optimizer, optimizer_params=optimizer_params, kvstore=None)

    # _scale is used to check and set rescale_grad for optimizer in Trainer.step()
    # function. Normalizing it by Horovod size, which is equivalent to performing
    # average in allreduce, has better performance.
    self._scale /= size()
def _allreduce_grads(self):
    """Allreduce trainable gradients, naming tensors by prefix + position."""
    if size() == 1:
        return
    # In MXNet 2.0, param.name is no longer unique. Since Horovod requires
    # Python 3.6, enumerating self._params is deterministic, so positional
    # indices (with the optional user prefix) serve as stable tensor names
    # without sorting.
    prescale = 1.0 / self._gradient_predivide_factor
    for position, param in enumerate(self._params):
        if param.grad_req == 'null':
            continue
        allreduce_(param.list_grad()[0], average=False,
                   name=self._prefix + str(position), priority=-position,
                   prescale_factor=prescale)
def broadcast_parameters(params, root_rank=0, prefix=None):
    """Broadcasts the parameters from root rank to all other processes.
    Typical usage is to broadcast the `Module.get_params()` or the
    `Block.collect_params()`.

    Arguments:
        params: One of the following:
            - dict of parameters to broadcast
            - ParameterDict to broadcast
        root_rank: The rank of the process from which parameters will be
            broadcasted to all other processes.
        prefix: The prefix of the parameters to broadcast. If multiple
            `broadcast_parameters` are called in the same program, they must
            be specified by different prefixes to avoid tensor name collision.
    """
    if size() == 1:
        return

    tensors = []
    names = []
    assert prefix is None or isinstance(prefix, str)
    prefix = prefix if prefix else ""

    # ParameterDict was removed in MXNet 2.0; accept it only when available.
    try:
        from mxnet.gluon.parameter import ParameterDict
        valid_types = (dict, ParameterDict)
    except ImportError:
        valid_types = (dict, )

    if not isinstance(params, valid_types):
        raise ValueError('invalid params of type: %s' % type(params))

    for key, value in sorted(params.items()):
        # Use the dict key (not param.name) since param.name is no longer
        # unique in MXNet 2.0.
        tensor_name = prefix + str(key)
        try:
            if isinstance(value, mx.gluon.parameter.Parameter):
                tensors.append(value.data())
            else:
                tensors.append(value)
            names.append(tensor_name)
        except mx.gluon.parameter.DeferredInitializationError:
            # Inject wrapper method with post-initialization broadcast to
            # handle parameters with deferred initialization.
            new_init = _append_broadcast_init(value, root_rank, tensor_name)
            value._init_impl = types.MethodType(new_init, value)

    # Run broadcasts.
    for tensor, name in zip(tensors, names):
        broadcast_(tensor, root_rank, name=name)
def _do_allreduce(self, index, grad):
    """Allreduce gradient(s) with prescaling by the predivide factor."""
    if size() == 1:
        return

    prescale = 1.0 / self._gradient_predivide_factor
    if not isinstance(index, (tuple, list)):
        allreduce_(grad, average=False, name=str(index),
                   prescale_factor=prescale)
        return

    for position, idx in enumerate(index):
        allreduce_(grad[position], average=False, name=str(idx),
                   priority=-position, prescale_factor=prescale)
def broadcast_parameters(params, root_rank=0):
    """
    Broadcasts the parameters from root rank to all other processes.
    Typical usage is to broadcast the `Module.get_params()` or the
    `Block.collect_params()`.

    Arguments:
        params: One of the following:
            - dict of parameters to broadcast
            - ParameterDict to broadcast
        root_rank: The rank of the process from which parameters will be
            broadcasted to all other processes.
    """
    if size() == 1:
        return

    tensors = []
    if isinstance(params, dict):
        tensors = [value for _, value in sorted(params.items())]
    elif isinstance(params, mx.gluon.parameter.ParameterDict):
        for _, param in sorted(params.items()):
            try:
                tensors.append(param.data())
            except mx.gluon.parameter.DeferredInitializationError:
                # Inject wrapper method with post-initialization broadcast to
                # handle parameters with deferred initialization.
                new_init = _append_broadcast_init(param, root_rank)
                param._init_impl = types.MethodType(new_init, param)
    else:
        raise ValueError('invalid params of type: %s' % type(params))

    # Run broadcasts.
    for position, tensor in enumerate(tensors):
        broadcast_(tensor, root_rank, str(position))

    # Make sure tensors pushed to MXNet engine get processed such that all
    # workers are synced before starting training.
    for tensor in tensors:
        tensor.wait_to_read()
def __init__(self, optimizer):
    """Wrap ``optimizer``, dividing its rescale_grad by the worker count.

    Dividing rescale_grad by the Horovod size is equivalent to averaging in
    allreduce but performs better.
    """
    optimizer.rescale_grad /= size()
    self._optimizer = optimizer
def __init__(self, *args, **kwargs):
    """Cache the Horovod world size and rank, then delegate to the base
    class initializer with all arguments unchanged."""
    self._size = size()
    self._rank = rank()
    # Idiom fix: __init__ must not `return` a value; previously the base
    # initializer's None return was propagated via `return`, which is
    # harmless at runtime but misleading (and a SyntaxError-in-waiting if
    # the base ever returned non-None semantics were assumed).
    super().__init__(*args, **kwargs)