def all_reduce(tensor, op='SUM', group=None):
    """Reduce the tensor across all nodes in a group.

    Parameters
    ----------
    tensor : Sequence[dragon.vm.torch.Tensor]
        The tensor(s) to reduce.
    op : {'SUM', 'MEAN'}, optional
        The reduce operation.
    group : ProcessGroup, optional
        The group for communication.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    if group is None:
        group = distributed.get_group()
    if group is None:
        raise ValueError('<group> is required.')
    if op not in ('MEAN', 'SUM'):
        raise ValueError('Unsupported reduce op: ' + op)
    tensors = nest.flatten(tensor)
    return _functions.Collective \
        .instantiate(
            tensors[0].device,
            operation=op,
            communication='ALLREDUCE',
            group=group,
        ).apply(tensors)

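# Illustrative sketch (plain Python, not the library kernel): what the
# ALLREDUCE above computes. Every node contributes its local values and every
# node receives the same reduced result; op='MEAN' divides the sum by the
# group size. The helper name below is hypothetical.
def _simulate_all_reduce(node_values, op='SUM'):
    reduced = [sum(col) for col in zip(*node_values)]
    if op == 'MEAN':
        reduced = [v / len(node_values) for v in reduced]
    return [list(reduced) for _ in node_values]

# Two nodes holding [1, 2] and [3, 4] both end up with [2.0, 3.0] under MEAN.
assert _simulate_all_reduce([[1, 2], [3, 4]], op='MEAN') == [[2.0, 3.0], [2.0, 3.0]]
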
def broadcast(inputs, root=0, group=None, **kwargs):
    """Broadcast the input from root node in a group.

    Parameters
    ----------
    inputs : dragon.Tensor
        The tensor to broadcast.
    root : int, optional, default=0
        The node index in the group.
    group : ProcessGroup, optional
        The communication group.

    Returns
    -------
    dragon.Tensor
        The output tensor.

    """
    args = OpSchema.parse_args(locals())
    if group is None:
        group = distributed.get_group()
    if group is None:
        raise ValueError('<group> is required.')
    coll_args = group.arguments.copy()
    coll_args['root'] = root
    coll_args['operation'] = 'BROADCAST'
    if context.executing_eagerly():
        return OpLib.execute('Collective', inputs, **coll_args)
    kwargs.update(coll_args)
    return OpLib.add('Collective', inputs, **kwargs)

def broadcast(tensor, src=0, group=None):
    """Broadcast the tensor from source node in a group.

    Parameters
    ----------
    tensor : Sequence[dragon.vm.torch.Tensor]
        The tensor(s) to broadcast.
    src : int
        The rank of the source node.
    group : ProcessGroup, optional
        The group for communication.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    if group is None:
        group = distributed.get_group()
    if group is None:
        raise ValueError('<group> is required.')
    tensors = nest.flatten(tensor)
    return _functions.Collective \
        .instantiate(
            tensors[0].device,
            root=src,
            communication='BROADCAST',
            group=group,
        ).apply(tensors)

def broadcast(tensor, src=0, group=None):
    """Broadcast the tensor from source node in a group.

    Parameters
    ----------
    tensor : dragon.vm.torch.Tensor
        The tensor to be sent.
    src : int
        The rank of the source node.
    group : ProcessGroup, optional
        The group for communication.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    group = group or distributed.get_group()
    if group is None:
        return tensor
    return Function.apply(
        'Collective', tensor.device, [tensor], outputs=[tensor],
        operation='BROADCAST', root=src, **group.arguments)

def all_reduce(inputs, reduction='mean', group=None, **kwargs):
    """Reduce the input across all nodes in a group.

    Parameters
    ----------
    inputs : dragon.Tensor
        The input tensor.
    reduction : str, optional
        The reduction method.
    group : ProcessGroup, optional
        The group for communication.

    Returns
    -------
    dragon.Tensor
        The output tensor.

    """
    reduction = reduction.upper()
    if group is None:
        group = distributed.get_group()
    if group is None:
        raise ValueError('<group> is required.')
    if reduction not in ('MEAN', 'SUM'):
        raise ValueError('Unsupported reduction: ' + reduction)
    coll_args = group.arguments.copy()
    coll_args['operation'] = 'ALLREDUCE'
    coll_args['reduction'] = reduction
    if context.executing_eagerly():
        return OpLib.execute('Collective', inputs, **coll_args)
    kwargs.update(coll_args)
    return OpLib.add('Collective', inputs, **kwargs)

def all_reduce(tensor, op='sum', group=None):
    """Reduce the tensor across all nodes in a group.

    Parameters
    ----------
    tensor : dragon.vm.torch.Tensor
        The tensor to reduce.
    op : str, optional
        The reduction op.
    group : ProcessGroup, optional
        The group for communication.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    group = group or distributed.get_group()
    if group is None:
        return tensor
    op = op.upper()
    if op not in ('MEAN', 'SUM'):
        raise ValueError('Unsupported reduction: ' + op)
    return Function.apply(
        'Collective', tensor.device, [tensor], outputs=[tensor],
        operation='ALLREDUCE', reduction=op, **group.arguments)

def all_gather(tensor_list, tensor, group=None):
    """Gather the tensor across all nodes in a group.

    Parameters
    ----------
    tensor_list : Sequence[dragon.vm.torch.Tensor]
        The output tensor list.
    tensor : dragon.vm.torch.Tensor
        The tensor to be sent.
    group : ProcessGroup, optional
        The group for communication.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    group = group or distributed.get_group()
    if group is None:
        return tensor
    output_tensor = Function.apply(
        'Collective', tensor.device, [tensor],
        operation='ALLGATHER', **group.arguments)
    if len(tensor_list) > 0:
        return Function.apply(
            'Split', output_tensor.device, [output_tensor],
            outputs=[None] * len(tensor_list),
            axis=0, size_split=None, copy=True)
    return output_tensor

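# Illustrative sketch (plain Python): ALLGATHER concatenates the per-rank
# chunks along axis 0, and the trailing 'Split' hands each rank-sized piece
# back to a slot of `tensor_list`. The helper name below is hypothetical.
def _simulate_all_gather(chunks):
    gathered = [x for chunk in chunks for x in chunk]
    size = len(gathered) // len(chunks)
    return gathered, [gathered[i * size:(i + 1) * size] for i in range(len(chunks))]

gathered, tensor_list = _simulate_all_gather([[1, 2], [3, 4]])
assert gathered == [1, 2, 3, 4] and tensor_list == [[1, 2], [3, 4]]
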
def _add_updates(graph_def, grads_and_vars, optimizer):
    """Add the update operators into the graph definition."""
    # Bucket variables and gradients by their explicit weight decay.
    group_vars = collections.defaultdict(list)
    group_grads = collections.defaultdict(list)
    for grad, var in grads_and_vars:
        weight_decay = getattr(var, '_weight_decay', None)
        if weight_decay is not None:
            weight_decay = float(weight_decay)
        group_vars[weight_decay].append(var.id)
        group_grads[weight_decay].append(grad.id)
    op_defs = []
    # Reduce all gradients in the process group before updating.
    process_group = distributed.get_group()
    if process_group:
        grads = list(itertools.chain(*group_grads.values()))
        op_defs.append(proto_util.make_operator_def(
            op_type='Collective',
            inputs=grads,
            outputs=grads,
            name=optimizer._name,
            operation='ALLREDUCE',
            reduction='MEAN',
            **process_group.arguments))
    # Emit one update operator per weight-decay bucket.
    for weight_decay, vars in group_vars.items():
        grads = group_grads[weight_decay]
        op_defs.append(proto_util.make_operator_def(
            op_type=optimizer._op_type,
            inputs=grads,
            outputs=vars,
            name=optimizer._name,
            weight_decay=weight_decay))
    graph_def.op.extend(op_defs)

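# Illustrative sketch (plain Python, hypothetical values): the bucketing
# pattern used above. Gradients are grouped by each variable's explicit weight
# decay so one update operator is emitted per bucket, while a single ALLREDUCE
# still covers every gradient via itertools.chain.
import collections
import itertools

pairs = [('g1', 'w1', None), ('g2', 'w2', 1e-4), ('g3', 'w3', 1e-4)]
group_grads = collections.defaultdict(list)
for grad, var, weight_decay in pairs:
    group_grads[weight_decay].append(grad)

assert list(itertools.chain(*group_grads.values())) == ['g1', 'g2', 'g3']
assert group_grads[1e-4] == ['g2', 'g3']
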
def __init__(self, params, defaults):
    """Create an ``Optimizer``.

    Parameters
    ----------
    params : Sequence[dragon.vm.torch.nn.Parameter]
        The parameters to optimize.
    defaults : dict
        The pre-defined default hyper-parameters.

    """
    self.defaults = defaults
    if isinstance(params, Tensor):
        raise TypeError('<params> should be a sequence of tensors.')
    self.state = defaultdict(dict)
    self.param_groups = []
    param_groups = list(params)
    if len(param_groups) == 0:
        raise ValueError('Got an empty parameter list')
    if not isinstance(param_groups[0], dict):
        param_groups = [{'params': param_groups}]
    for param_group in param_groups:
        self.add_param_group(param_group)
    self._op_type = self.__class__.__name__ + 'Update'
    self._process_group = distributed.get_group()
    self._shared_args = {}

def broadcast(inputs, root=0, group=None, **kwargs):
    """Broadcast the input from root node in a group.

    Parameters
    ----------
    inputs : dragon.Tensor
        The tensor to broadcast.
    root : int, optional, default=0
        The node index in the group.
    group : ProcessGroup, optional
        The group for communication.

    Returns
    -------
    dragon.Tensor
        The output tensor.

    """
    args = ArgHelper.parse(locals())
    if group is None:
        group = distributed.get_group()
    if group is None:
        raise ValueError('<group> is required.')
    args.update(group.arguments)
    args.pop('group')
    op_lib = distributed_ops_lib.Collective
    if context.executing_eagerly():
        return op_lib \
            .instantiate(
                root=root,
                communication='BROADCAST',
                group=group,
            ).apply(inputs)
    else:
        return op_lib.blend(communication='BROADCAST', **args)

def sync_batch_norm(
    inputs,
    axis=-1,
    momentum=0.9,
    epsilon=1e-5,
    use_stats=-1,
    process_group=None,
    **kwargs
):
    r"""Apply the batch normalization with synced statistics.
    `[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.

    The normalization is defined as:

    .. math:: y = \frac{x - \mathrm{E}[x]}
                       {\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The running average of statistics is calculated as:

    .. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} +
                                   (1 - \text{momentum}) * x_{\text{batch}}

    Parameters
    ----------
    inputs : Sequence[dragon.Tensor]
        The tensor ``x``, ``gamma``, ``beta``, ``mean`` and ``var``.
    axis : int, optional, default=-1
        The channel axis.
    momentum : Union[float, dragon.Tensor], optional
        The value to :math:`\text{momentum}`.
    epsilon : float, optional, default=1e-5
        The value to :math:`\epsilon`.
    use_stats : int, optional, default=-1
        Whether to use estimated statistics or not.
    process_group : ProcessGroup, optional
        The group for communication.

    Returns
    -------
    dragon.Tensor
        The output tensor.

    """
    args = OpSchema.parse_args(locals())
    args['epsilon'] = float(epsilon)
    if process_group is None:
        process_group = distributed.get_group()
    if process_group is None:
        raise ValueError('<process_group> is required.')
    if context.executing_eagerly():
        return OpLib.execute(
            'SyncBatchNorm', inputs,
            axis=axis, epsilon=args['epsilon'], use_stats=use_stats,
            momentum=args['momentum'], **process_group.arguments)
    args.pop('process_group')
    args.update(process_group.arguments)
    return OpLib.add('SyncBatchNorm', **args)

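# Illustrative sketch (plain Python): the running-average rule from the
# docstring above, x_running = momentum * x_running + (1 - momentum) * x_batch.
# The helper name below is hypothetical.
def _update_running_stat(running, batch, momentum=0.9):
    return momentum * running + (1.0 - momentum) * batch

assert abs(_update_running_stat(0.0, 1.0, momentum=0.9) - 0.1) < 1e-8
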
def __init__(self, **kwargs):
    """Create an ``Optimizer``."""
    self._name = workspace.get_workspace().create_handle('Optimizer')
    self._op_type = self.__class__.__name__
    self._process_group = distributed.get_group()
    self._hyper = {}
    self._set_hyper('grad_scale', kwargs.pop('grad_scale', 1))
    self._set_hyper('weight_decay', kwargs.pop('weight_decay', 0))
    self._set_hyper('clip_norm', kwargs.pop('clip_norm', 0))
    self._set_hyper('clip_value', kwargs.pop('clip_value', 0))
    if kwargs:
        raise ValueError('Unexpected arguments: ' + ','.join(v for v in kwargs))

def get_distributed_info(allowed=True):
    """Return the rank and size of the current nesting group.

    Parameters
    ----------
    allowed : bool, optional, default=True
        Whether the distributed utilities are allowed.

    Returns
    -------
    Tuple[int]
        The node rank and group size.

    """
    if allowed:
        group = distributed.get_group()
        if group is not None:
            return distributed.get_rank(group), group.size
    return 0, 1

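# Illustrative sketch (plain Python): a typical use of the (rank, size) pair,
# e.g. striding a global sample list so each node reads a disjoint shard.
# The helper name and data below are hypothetical.
def _shard(samples, rank, size):
    return samples[rank::size]

rank, size = 1, 2  # e.g. values returned by get_distributed_info()
assert _shard(list(range(6)), rank, size) == [1, 3, 5]
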
def apply_gradients(self, grads_and_vars):
    """Apply the gradients on variables.

    Parameters
    ----------
    grads_and_vars : Sequence[Sequence[dragon.Tensor]]
        The sequence of update pairs.

    """
    # Create execution context for graph mode.
    if not context.executing_eagerly():
        return GraphLib.from_updates(grads_and_vars, self)
    # Separate variables by explicit weight decay.
    group_vars = collections.defaultdict(list)
    group_grads = collections.defaultdict(list)
    for grad, var in grads_and_vars:
        if grad is not None:
            weight_decay = getattr(var, '_weight_decay', None)
            if weight_decay is not None:
                weight_decay = float(weight_decay)
            group_vars[weight_decay].append(var)
            group_grads[weight_decay].append(grad)
    # Reduce grads in the process group.
    process_group = distributed.get_group()
    if process_group is not None:
        grads = list(itertools.chain(*group_grads.values()))
        OpLib.execute(
            'Collective', grads, outputs=grads,
            operation='ALLREDUCE', reduction='MEAN',
            **process_group.arguments)
    # Apply updates.
    for weight_decay, vars in group_vars.items():
        grads = group_grads[weight_decay]
        # Skip if grads are all missing.
        if len(grads) == 0:
            continue
        OpLib.execute(
            self._op_type, grads, outputs=vars,
            name=self._name, weight_decay=weight_decay)

def __init__(
    self,
    num_features,
    eps=1e-5,
    momentum=0.1,
    affine=True,
    track_running_stats=True,
    process_group=None,
):
    r"""Create a ``SyncBatchNorm`` module.

    Parameters
    ----------
    num_features : int
        The number of channels.
    eps : float, optional, default=1e-5
        The value to :math:`\epsilon`.
    momentum : float, optional, default=0.1
        The value to :math:`\text{momentum}`.
    affine : bool, optional, default=True
        ``True`` to apply an affine transformation.
    track_running_stats : bool, optional, default=True
        ``True`` to use the tracked running stats when switching to ``eval``.
    process_group : ProcessGroup, optional
        The group for communication.

    """
    super(SyncBatchNorm, self).__init__(
        num_features,
        eps,
        momentum,
        affine,
        track_running_stats,
    )
    if process_group is None:
        process_group = distributed.get_group()
    self.process_group = process_group

def all_reduce(inputs, operation='MEAN', group=None, **kwargs):
    """Reduce the input across all nodes in a group.

    Parameters
    ----------
    inputs : dragon.Tensor
        The input tensor.
    operation : {'MEAN', 'SUM'}, optional
        The reduce operation.
    group : ProcessGroup, optional
        The group for communication.

    Returns
    -------
    dragon.Tensor
        The output tensor.

    """
    args = ArgHelper.parse(locals())
    if group is None:
        group = distributed.get_group()
    if group is None:
        raise ValueError('<group> is required.')
    if operation not in ('MEAN', 'SUM'):
        raise ValueError('Unsupported reduce op: ' + operation)
    args.update(group.arguments)
    args.pop('group')
    op_lib = distributed_ops_lib.Collective
    if context.executing_eagerly():
        return op_lib \
            .instantiate(
                operation=operation,
                communication='ALLREDUCE',
                group=group,
            ).apply(inputs)
    else:
        return op_lib.blend(communication='ALLREDUCE', **args)

def _update_group(self, group):
    """Update parameters for the group."""
    execute_ws = workspace.get_workspace()
    # Collect params and grads.
    params_with_grad, grads = [], []
    for p in group['params']:
        g = self._get_grad(execute_ws, p, self._sums_grad)
        if g is not None:
            params_with_grad.append(p)
            grads.append(g)
    # Skip if grads are all missing.
    if len(params_with_grad) == 0:
        return
    # Update hyper from group values.
    for name in self._hyper.keys():
        group_name = group['name']
        impl_name, group_dict = self._hyper[name]
        if group_name not in group_dict:
            impl_name = group_name + '/' + impl_name
            group_dict[group_name] = execute_ws.create_tensor(impl_name)
        impl = group_dict[group_name]
        impl.FromNumpy(numpy.array(group[name], 'float32'), False)
    # Reduce grads in the process group.
    process_group = distributed.get_group()
    if process_group is not None:
        Function.apply(
            'Collective', grads[0].device, grads, outputs=grads,
            operation='ALLREDUCE', reduction='MEAN',
            **process_group.arguments)
    # Apply updates.
    Function.apply(
        self._op_type, params_with_grad[0].device, grads,
        outputs=params_with_grad, name=group['name'], weight_decay=None)

def __init__(self, **kwargs): """Create a ``DataIterator``. Parameters ---------- dataset : class The dataset class to load examples. source : str The path of data source. shuffle : bool, optional, default=False Whether to shuffle the data. initial_fill : int, optional, default=1024 The length of sampling sequence for shuffle. resize : int, optional, default=0 The size for the shortest edge. padding : int, optional, default=0 The size for the zero padding on two sides. fill_value : Union[int, Sequence], optional, default=127 The value(s) to fill for padding or cutout. crop_size : int, optional, default=0 The size for random-or-center cropping. random_crop_size: int, optional, default=0 The size for sampling-based random cropping. cutout_size : int, optional, default=0 The square size for the cutout algorithm. mirror : bool, optional, default=False Whether to apply the mirror (flip horizontally). random_scales : Sequence[float], optional, default=(0.08, 1.) The range of scales to sample a crop randomly. random_aspect_ratios : Sequence[float], optional, default=(0.75, 1.33) The range of aspect ratios to sample a crop randomly. distort_color : bool, optional, default=False Whether to apply color distortion. inverse_color : bool, option, default=False Whether to inverse channels for color images. training : optional, default=True Whether to enable the training randoms. batch_size : int, optional, default=128 The size of a mini-batch. prefetch_depth : int, optional, default=4 The number of prefetching queues. num_transformers : int, optional, default=-1 The number of transformers to process image. seed : int, optional The random seed to use instead. """ super(DataIterator, self).__init__(daemon=True) # Distributed settings. rank, group_size = 0, 1 process_group = distributed.get_group() if process_group is not None and kwargs.get('training', True): group_size = process_group.size rank = distributed.get_rank(process_group) # Configuration. self._prefetch_depth = kwargs.get('prefetch_depth', 4) self._num_readers = kwargs.get('num_readers', 1) self._num_workers = kwargs.get('num_workers', -1) self._batch_size = kwargs.get('batch_size', 128) # Io-Aware Policy. if self._num_workers == -1: self._num_workers = 1 # Add a transformer for cropping. if kwargs.get('random_crop_size', 0) > 0: self._num_workers += 1 # Add a transformer for distortion. if kwargs.get('distort_color', False): self._num_workers += 1 # Initialize queues. num_batches = self._prefetch_depth * self._num_readers self._reader_queue = mp.Queue(num_batches * self._batch_size) self._worker_queue = mp.Queue(num_batches * self._batch_size) self._batch_queue = queue.Queue(num_batches) # Initialize readers. self._readers = [] for i in range(self._num_readers): part_idx, num_parts = i, self._num_readers num_parts *= group_size part_idx += rank * self._num_readers self._readers.append( reader.DataReader(part_idx=part_idx, num_parts=num_parts, **kwargs)) self._readers[i]._seed += part_idx self._readers[i]._reader_queue = self._reader_queue self._readers[i].start() time.sleep(0.1) # Initialize transformers. self._workers = [] for i in range(self._num_workers): p = data_worker.DataWorker(**kwargs) p._seed += (i + rank * self._num_workers) p._reader_queue = self._reader_queue p._worker_queue = self._worker_queue p.start() self._workers.append(p) time.sleep(0.1) # Register cleanup callbacks. 
    def cleanup():
        def terminate(processes):
            for p in processes:
                p.terminate()
                p.join()
        terminate(self._workers)
        if rank == 0:
            logging.info('Terminate DataWorker.')
        terminate(self._readers)
        if rank == 0:
            logging.info('Terminate DataReader.')

    import atexit
    atexit.register(cleanup)
    # Start batch prefetching.
    self.start()

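# Illustrative sketch (plain Python, hypothetical helper): how the reader
# partitions above are laid out across the process group. With `num_readers`
# readers per node, node `rank` owns the contiguous partitions
# [rank * num_readers, (rank + 1) * num_readers) out of num_readers * group_size.
def _reader_partitions(rank, num_readers, group_size):
    num_parts = num_readers * group_size
    return [(rank * num_readers + i, num_parts) for i in range(num_readers)]

# Two nodes with two readers each split the data source into four parts.
assert _reader_partitions(rank=1, num_readers=2, group_size=2) == [(2, 4), (3, 4)]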