Example #1
    def _do_allreduce(self, index, grad):
        if size() == 1: return

        if isinstance(index, (tuple, list)):
            if (self._num_groups > 0):
                grad_split = split_list(grad, self._num_groups)
                index_split = split_list(index, self._num_groups)

                offset = 0
                for i, (grads, indices) in enumerate(zip(grad_split, index_split)):
                    # Compress every gradient in the group, allreduce the whole group in
                    # one call, then write the decompressed results back into `grad`.
                    tensors_compressed, ctxs = zip(*[self._compression.compress(g) for g in grads])
                    grouped_allreduce_(tensors=tensors_compressed, average=False,
                                       name="{}:{}".format(indices[0], indices[-1]), priority=-i,
                                       prescale_factor=1.0 / self._gradient_predivide_factor)
                    for j, (t, ctx) in enumerate(zip(tensors_compressed, ctxs)):
                        grad[offset + j] = self._compression.decompress(t, ctx)
                    offset += len(grads)
            else:
                for i in range(len(index)):
                    tensor_compressed, ctx = self._compression.compress(grad[i])
                    allreduce_(tensor_compressed, average=False,
                               name=str(index[i]), priority=-i,
                               prescale_factor=1.0 / self._gradient_predivide_factor)
                    grad[i] = self._compression.decompress(tensor_compressed, ctx)
        else:
            tensor_compressed, ctx = self._compression.compress(grad)
            allreduce_(tensor_compressed, average=False, name=str(index),
                       prescale_factor=1.0 / self._gradient_predivide_factor)
            grad = self._compression.decompress(tensor_compressed, ctx)
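
Note on the split_list helper: every grouped branch in these examples relies on split_list to carve the flat list of gradients (and their names/indices) into self._num_groups buckets. The sketch below shows only the behaviour the examples assume, i.e. contiguous chunks of near-equal size in the original order; the actual Horovod helper (horovod.common.util.split_list) may differ in detail.

    def split_list(items, num_groups):
        """Split items into num_groups contiguous chunks of near-equal size."""
        quotient, remainder = divmod(len(items), num_groups)
        chunks, start = [], 0
        for i in range(num_groups):
            end = start + quotient + (1 if i < remainder else 0)
            chunks.append(items[start:end])
            start = end
        return chunks

    print(split_list(list(range(5)), 2))   # -> [[0, 1, 2], [3, 4]]
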
Example #2
    def _allreduce_grads(self):
        if size() == 1: return

        if (self._num_groups > 0):
            grads = []
            names = []

            for i, param in enumerate(self._params):
                if param.grad_req != 'null':
                    grads.append(param.list_grad()[0])
                    names.append(self._prefix + str(i))

            grads_split = split_list(grads, self._num_groups)
            names_split = split_list(names, self._num_groups)

            for i, (group_grads, group_names) in enumerate(zip(grads_split, names_split)):
                # For better performance, enqueue groups in separate grouped_allreduce calls by dtype.
                entries_by_dtype = defaultdict(list)
                for grad, name in zip(group_grads, group_names):
                    entries_by_dtype[grad.dtype].append((grad, name))

                for entries in entries_by_dtype.values():
                    dtype_grads, dtype_names = zip(*entries)
                    grouped_allreduce_(tensors=dtype_grads, average=False,
                                       name="{}:{}".format(dtype_names[0], dtype_names[-1]), priority=-i,
                                       prescale_factor=1.0 / self._gradient_predivide_factor)
        else:
            # In MXNet 2.0, param.name is no longer unique.
            # Since Horovod requires Python >= 3.6, enumerating self._params (a dict)
            # is deterministic, so there is no need to sort it.
            for i, param in enumerate(self._params):
                if param.grad_req != 'null':
                    allreduce_(param.list_grad()[0], average=False,
                               name=self._prefix + str(i), priority=-i,
                               prescale_factor=1.0 / self._gradient_predivide_factor)
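
The grouped branch in Example #2 buckets gradients by dtype before calling grouped_allreduce_, per the inline comment. A minimal, self-contained sketch of that bucketing step, using a namedtuple as a stand-in for an MXNet NDArray (the Grad type and the sample dtypes are illustrative only):

    from collections import defaultdict, namedtuple

    Grad = namedtuple('Grad', ['dtype', 'name'])   # stand-in for an NDArray
    grads = [Grad('float32', 'w0'), Grad('float16', 'w1'),
             Grad('float32', 'w2'), Grad('float16', 'w3')]

    entries_by_dtype = defaultdict(list)
    for g in grads:
        entries_by_dtype[g.dtype].append((g, g.name))

    for dtype, entries in entries_by_dtype.items():
        group_grads, group_names = zip(*entries)
        # Each dtype bucket would go to its own grouped_allreduce_ call.
        print(dtype, list(group_names))
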
Example #3
    def _do_allreduce(self, index, grad):
        if self._process_set.size() == 1: return

        if isinstance(index, (tuple, list)):
            if (self._num_groups > 0):
                grad_split = split_list(grad, self._num_groups)
                index_split = split_list(index, self._num_groups)

                for i, (grads, indices) in enumerate(zip(grad_split, index_split)):
                    grouped_allreduce_(
                        tensors=grads,
                        average=False,
                        name="{}:{}".format(indices[0], indices[-1]),
                        priority=-i,
                        prescale_factor=1.0 / self._gradient_predivide_factor,
                        process_set=self._process_set)
            else:
                for i in range(len(index)):
                    allreduce_(grad[i],
                               average=False,
                               name=str(index[i]),
                               priority=-i,
                               prescale_factor=1.0 / self._gradient_predivide_factor,
                               process_set=self._process_set)
        else:
            allreduce_(grad,
                       average=False,
                       name=str(index),
                       prescale_factor=1.0 / self._gradient_predivide_factor,
                       process_set=self._process_set)
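
Relative to Example #1, this variant drops the compression step and threads a process_set through every call, so the (grouped) allreduce runs only over the ranks belonging to that Horovod process set rather than the global communicator; the early-exit check accordingly uses self._process_set.size() instead of the global size().
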
Example #4
    def allreduce_grads(grads):
        with tf.name_scope(name + "_Allreduce"):
            if sparse_as_dense:
                grads = [
                    tf.convert_to_tensor(grad) if grad is not None
                    and isinstance(grad, tf.IndexedSlices) else grad
                    for grad in grads
                ]

            if num_groups > 0:
                grads_clean = [grad for grad in grads if grad is not None]
                grads_split = split_list(grads_clean, num_groups)

                reduce_ops = []
                for group in grads_split:
                    reduce_ops += _grouped_allreduce_cond(
                        group,
                        device_dense=device_dense,
                        device_sparse=device_sparse,
                        compression=compression,
                        op=op,
                        prescale_factor=prescale_factor,
                        postscale_factor=postscale_factor)
                return reduce_ops

            return [
                _allreduce_cond(grad,
                                device_dense=device_dense,
                                device_sparse=device_sparse,
                                compression=compression,
                                op=op,
                                prescale_factor=prescale_factor,
                                postscale_factor=postscale_factor)
                if grad is not None else grad for grad in grads
            ]
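
Note that in this variant the None gradients are filtered out before grouping, so the returned list of ops can be shorter than grads and loses positional correspondence; the variant in Example #6 below keeps the original indices and scatters the grouped results back into a list of the original length. A small plain-Python sketch of that index-preserving bookkeeping (all values are illustrative stand-ins):

    grads = ['g0', None, 'g2', 'g3', None]

    # Keep (original_index, grad) pairs so results can be scattered back in place.
    grads_clean = [(i, g) for i, g in enumerate(grads) if g is not None]
    groups = [grads_clean[:2], grads_clean[2:]]          # stand-in for split_list

    reduce_ops = [None] * len(grads)
    for group in groups:
        indices, group_grads = zip(*group)
        results = ['reduced_' + g for g in group_grads]  # stand-in for the allreduce
        for idx, res in zip(indices, results):
            reduce_ops[idx] = res

    print(reduce_ops)   # ['reduced_g0', None, 'reduced_g2', 'reduced_g3', None]
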
Example #5
    def _register_hooks(self):

        if self._num_groups > 0:
            p_list = []
            # Get list of parameters with grads
            for param_group in self.param_groups:
                for p in param_group['params']:
                    if p.requires_grad:
                        p_list.append(p)

            # To ensure parameter order and group formation is consistent, broadcast p_list order
            # from rank 0 and use for every worker
            p_list_names = [self._parameter_names.get(p) for p in p_list]
            p_list_names = broadcast_object(p_list_names, root_rank=0)
            p_list = sorted(
                p_list,
                key=lambda p: p_list_names.index(self._parameter_names.get(p)))

            # Form groups
            p_groups = split_list(p_list, self._num_groups)
            p_groups = [tuple(p) for p in p_groups]
            for group in p_groups:
                for p in group:
                    self._p_to_group[p] = group
                self._group_counts[group] = 0

        for param_group in self.param_groups:
            for p in param_group['params']:
                if p.requires_grad:
                    p.grad = p.data.new(p.size()).zero_()
                    self._requires_update.add(p)
                    p_tmp = p.expand_as(p)
                    grad_acc = p_tmp.grad_fn.next_functions[0][0]
                    grad_acc.register_hook(self._make_hook(p))
                    self._grad_accs.append(grad_acc)
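
The p.expand_as(p).grad_fn.next_functions[0][0] idiom above fetches the autograd AccumulateGrad node of a leaf parameter so a hook can fire once its gradient has been accumulated; in the optimizer that hook is where the (grouped) allreduce is enqueued. A minimal standalone sketch of the same trick (assumes PyTorch is installed; the hook body is illustrative):

    import torch

    p = torch.nn.Parameter(torch.randn(3))

    def on_grad_ready(*unused):
        # In the optimizer above this is where the allreduce would be enqueued.
        print("gradient ready for parameter of shape", tuple(p.shape))

    # expand_as(p) creates a non-leaf view whose grad_fn chain points back at the
    # AccumulateGrad node of the leaf parameter p.
    p_tmp = p.expand_as(p)
    grad_acc = p_tmp.grad_fn.next_functions[0][0]
    grad_acc.register_hook(on_grad_ready)

    (p * 2.0).sum().backward()   # fires the hook once p.grad is accumulated
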
Example #6
    def allreduce_grads(grads, vars=None):
        with tf.name_scope(name + "_Allreduce"):
            if sparse_as_dense:
                grads = [
                    tf.convert_to_tensor(grad) if grad is not None
                    and isinstance(grad, tf.IndexedSlices) else grad
                    for grad in grads
                ]

            if groups is not None:
                if isinstance(groups, list):
                    var_name2grad = {}
                    for i in range(len(vars)):
                        var = vars[i]
                        grad = grads[i]
                        if grad is not None:
                            var_name2grad[var.name] = (i, grad)
                    grads_split = []
                    for group in groups:
                        grad_group = []
                        for var in group:
                            if var.name in var_name2grad:
                                grad_group.append(var_name2grad[var.name])
                                del var_name2grad[var.name]
                        grads_split.append(grad_group)
                    for _, grad in var_name2grad.items():
                        grads_split.append([grad])
                elif groups > 0:
                    grads_clean = [(i, grad) for i, grad in enumerate(grads)
                                   if grad is not None]
                    grads_split = split_list(grads_clean, groups)

                reduce_ops = [None] * len(vars)
                for group in grads_split:
                    index_group, grad_group = [list(t) for t in zip(*group)]
                    reduce_ops_group = _grouped_allreduce_cond(
                        grad_group,
                        device_dense=device_dense,
                        device_sparse=device_sparse,
                        compression=compression,
                        op=op,
                        prescale_factor=prescale_factor,
                        postscale_factor=postscale_factor,
                        process_set=process_set)
                    for i in range(len(index_group)):
                        reduce_ops[index_group[i]] = reduce_ops_group[i]
                return reduce_ops

            return [
                _allreduce_cond(grad,
                                device_dense=device_dense,
                                device_sparse=device_sparse,
                                compression=compression,
                                op=op,
                                prescale_factor=prescale_factor,
                                postscale_factor=postscale_factor,
                                process_set=process_set)
                if grad is not None else grad for grad in grads
            ]
Example #7
    def _register_hooks(self):
        if self._groups is not None:
            p_list = []
            # Get list of parameters with grads
            for param_group in self.param_groups:
                for p in param_group['params']:
                    if p.requires_grad:
                        p_list.append(p)

            # To ensure parameter order and group formation is consistent, broadcast p_list order
            # from rank 0 and use for every worker
            p_list_names = [self._parameter_names.get(p) for p in p_list]
            p_list_names = broadcast_object(p_list_names,
                                            root_rank=0,
                                            process_set=self.process_set)
            p_list = sorted(
                p_list,
                key=lambda p: p_list_names.index(self._parameter_names.get(p)))

            # Form groups
            if isinstance(self._groups, list):
                p_groups = []
                grouped_id = set()
                p_list_ids = [id(p) for p in p_list]
                for group in self._groups:
                    p_groups.append([p for p in group if id(p) in p_list_ids])
                    for p in p_groups[-1]:
                        grouped_id.add(id(p))
                for p in p_list:
                    if id(p) not in grouped_id:
                        p_groups.append([p])
            else:
                p_groups = split_list(p_list, self._groups)

            p_groups = [tuple(p) for p in p_groups]
            for group in p_groups:
                for p in group:
                    self._p_to_group[p] = group
                self._group_counts[group] = 0

        for param_group in self.param_groups:
            for p in param_group['params']:
                if p.requires_grad:
                    self._requires_update.add(p)
                    p_tmp = p.expand_as(p)
                    grad_acc = p_tmp.grad_fn.next_functions[0][0]
                    grad_acc.register_hook(self._make_hook(p))
                    self._grad_accs.append(grad_acc)
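
The group bookkeeping built in _register_hooks (self._p_to_group mapping each parameter to its group tuple, self._group_counts counting ready gradients) is what lets the per-parameter hooks defer communication until a whole group is ready. Below is a hedged sketch of that consumption pattern, not Horovod's actual hook code, with strings standing in for parameters and a print standing in for the grouped allreduce:

    class GroupedHookSketch:
        def __init__(self, groups):
            self._p_to_group = {p: g for g in groups for p in g}
            self._group_counts = {g: 0 for g in groups}

        def on_grad_ready(self, p):
            # Count ready members; launch one grouped allreduce when the group completes.
            group = self._p_to_group[p]
            self._group_counts[group] += 1
            if self._group_counts[group] == len(group):
                self._group_counts[group] = 0
                print("grouped allreduce for", group)

    sketch = GroupedHookSketch([('w0', 'w1'), ('w2',)])
    for param in ('w0', 'w2', 'w1'):   # gradients become ready in arbitrary order
        sketch.on_grad_ready(param)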