def _allreduce_grads(self):
    if size() == 1: return

    if (self._num_groups > 0):
        grads = []
        names = []
        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                grads.append(param.list_grad()[0])
                names.append(self._prefix + str(i))

        grads_split = split_list(grads, self._num_groups)
        names_split = split_list(names, self._num_groups)

        for i, (group_grads, group_names) in enumerate(zip(grads_split, names_split)):
            # For better performance, enqueue groups in separate grouped_allreduce
            # calls by dtype.
            entries_by_dtype = defaultdict(list)
            for grad, name in zip(group_grads, group_names):
                entries_by_dtype[grad.dtype].append((grad, name))

            for entries in entries_by_dtype.values():
                grads, names = zip(*entries)
                grouped_allreduce_(tensors=grads, average=False,
                                   name="{}:{}".format(names[0], names[-1]),
                                   priority=-i,
                                   prescale_factor=1.0 / self._gradient_predivide_factor)
    else:
        # In MXNet 2.0, param.name is no longer unique.
        # Meanwhile, since horovod requires Python 3.6, there is no need to sort
        # self._params as enumerating a python dict is always deterministic.
        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                allreduce_(param.list_grad()[0], average=False,
                           name=self._prefix + str(i), priority=-i,
                           prescale_factor=1.0 / self._gradient_predivide_factor)
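
# Illustration (hedged, not part of the module): `split_list` above is assumed to
# behave like the helper imported from horovod.common.util, i.e. it splits a flat
# list into `num_groups` contiguous, order-preserving chunks of near-equal size.
# The sketch below only shows that assumed behaviour; the real implementation may
# differ.
def _split_list_sketch(items, num_groups):
    """Split `items` into `num_groups` contiguous chunks of near-equal size."""
    quotient, remainder = divmod(len(items), num_groups)
    chunks, start = [], 0
    for g in range(num_groups):
        # The first `remainder` chunks absorb one extra element each.
        end = start + quotient + (1 if g < remainder else 0)
        chunks.append(items[start:end])
        start = end
    return chunks

# Example: _split_list_sketch(list(range(7)), 3) -> [[0, 1, 2], [3, 4], [5, 6]]
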
def _do_allreduce(self, index, grad):
    if size() == 1: return

    if isinstance(index, (tuple, list)):
        if (self._num_groups > 0):
            grad_split = split_list(grad, self._num_groups)
            index_split = split_list(index, self._num_groups)

            offset = 0
            for i, (grads, indices) in enumerate(zip(grad_split, index_split)):
                tensors_compressed, ctxs = zip(
                    *[self._compression.compress(g) for g in grads])
                grouped_allreduce_(tensors=tensors_compressed, average=False,
                                   name="{}:{}".format(indices[0], indices[-1]),
                                   priority=-i,
                                   prescale_factor=1.0 / self._gradient_predivide_factor)
                # Write the decompressed results back into the caller's grad list so
                # the optimizer update sees the reduced gradients.
                grad[offset:offset + len(grads)] = [
                    self._compression.decompress(t, ctx)
                    for t, ctx in zip(tensors_compressed, ctxs)]
                offset += len(grads)
        else:
            for i in range(len(index)):
                tensor_compressed, ctx = self._compression.compress(grad[i])
                allreduce_(tensor_compressed, average=False,
                           name=str(index[i]), priority=-i,
                           prescale_factor=1.0 / self._gradient_predivide_factor)
                grad[i] = self._compression.decompress(tensor_compressed, ctx)
    else:
        tensor_compressed, ctx = self._compression.compress(grad)
        allreduce_(tensor_compressed, average=False, name=str(index),
                   prescale_factor=1.0 / self._gradient_predivide_factor)
        decompressed = self._compression.decompress(tensor_compressed, ctx)
        if decompressed is not grad:
            # Copy back in place; rebinding the local name `grad` would not be
            # visible to the caller.
            grad[:] = decompressed
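
# Illustration (hedged, not part of the module): the compression object used by
# _do_allreduce above is assumed to follow Horovod's Compression contract:
# compress() returns a (possibly lower-precision) tensor plus an opaque context,
# and decompress() restores the original representation from that context. The
# FP16-style compressor below is a minimal sketch of that contract, not Horovod's
# implementation.
import numpy as np

class _FP16CompressionSketch:
    @staticmethod
    def compress(tensor):
        # The "context" is simply the original dtype needed to restore it.
        ctx = tensor.dtype
        if tensor.dtype == np.float32:
            tensor = tensor.astype(np.float16)
        return tensor, ctx

    @staticmethod
    def decompress(tensor, ctx):
        # Cast back to the dtype recorded at compression time, if it changed.
        return tensor.astype(ctx) if tensor.dtype != ctx else tensor

# Usage sketch (requires MXNet):
#   import mxnet as mx
#   g = mx.nd.ones((2, 2))                          # float32 gradient
#   c, ctx = _FP16CompressionSketch.compress(g)     # float16 on the wire
#   g = _FP16CompressionSketch.decompress(c, ctx)   # back to float32
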
def _do_allreduce(self, index, grad):
    if self._process_set.size() == 1: return

    if isinstance(index, (tuple, list)):
        if (self._num_groups > 0):
            grad_split = split_list(grad, self._num_groups)
            index_split = split_list(index, self._num_groups)

            for i, (grads, indices) in enumerate(zip(grad_split, index_split)):
                grouped_allreduce_(tensors=grads, average=False,
                                   name="{}:{}".format(indices[0], indices[-1]),
                                   priority=-i,
                                   prescale_factor=1.0 / self._gradient_predivide_factor,
                                   process_set=self._process_set)
        else:
            for i in range(len(index)):
                allreduce_(grad[i], average=False,
                           name=str(index[i]), priority=-i,
                           prescale_factor=1.0 / self._gradient_predivide_factor,
                           process_set=self._process_set)
    else:
        allreduce_(grad, average=False, name=str(index),
                   prescale_factor=1.0 / self._gradient_predivide_factor,
                   process_set=self._process_set)
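
# Usage sketch (hedged, not part of the module): the process_set-aware path above
# restricts the reduction to a subset of ranks. The commented snippet below
# assumes the process-set API described in Horovod's documentation
# (hvd.ProcessSet, hvd.init(process_sets=...)) and that DistributedOptimizer
# accepts a process_set argument; verify both against the installed Horovod
# version.
#
#   import mxnet as mx
#   import horovod.mxnet as hvd
#
#   even_set = hvd.ProcessSet([0, 2])
#   odd_set = hvd.ProcessSet([1, 3])
#   hvd.init(process_sets=[even_set, odd_set])
#
#   opt = mx.optimizer.create('sgd', learning_rate=0.01)
#   # Gradients are then summed only across the ranks in even_set;
#   # _do_allreduce forwards process_set=even_set to allreduce_/grouped_allreduce_.
#   opt = hvd.DistributedOptimizer(opt, process_set=even_set)
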