def run_extensions(self, *, completed: bool = True,
                   only_iterations: bool = True) -> None:
    if completed:
        # Check if the model is available for the iteration just
        # completed, i.e., the iteration number is already incremented.
        self._model_available = self.needs_model_state(self.iteration)
    else:
        self._model_available = False
    to_run = []
    for name, entry in self.extensions:
        # When iterations are deferred we only launch the extensions
        # that do not need the training status to advance, i.e.,
        # extensions set to execute at a given interval of executions.
        is_async = (hasattr(entry.extension, 'is_async')
                    and entry.extension.is_async)
        if ((not completed and not is_async)
                or (completed and is_async and only_iterations)):
            continue
        manager: _manager_protocol.ExtensionsManagerProtocol = self
        if is_async:
            manager = self._get_proxy_for_trigger(entry.trigger)
        if entry.trigger(manager):
            # Execution of snapshot extensions is deferred until all the
            # triggers are evaluated.
            # If we didn't do this, when two (or more) snapshot extensions
            # are registered and their triggers are stateful, the first
            # snapshot extension would save the state of the second
            # trigger before it is invoked, even though it will be
            # executed later in this iteration, making them fire again
            # just after resuming from the snapshot saved by the first
            # snapshot extension.
            # Non-snapshot extensions are executed right away (note that
            # the order is already sorted by priority) as they may report
            # values needed by other triggers, e.g., a trigger based on a
            # value reported by an evaluator.
            if entry.priority == extension_module.PRIORITY_SNAPSHOT:
                to_run.append((name, entry.extension))
            else:
                with record(
                        f'pytorch_pfn_extras.training.ExtensionsManager'
                        f'.run_extensions:{name}',
                        enable=self._enable_profile,
                ):
                    entry.extension(self)
    for name, extension in to_run:
        with record(
                f'pytorch_pfn_extras.training.ExtensionsManager'
                f'.run_extensions:{name}',
                enable=self._enable_profile,
        ):
            extension(self)
    self._model_available = True
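# --- Illustration (not part of the library source) ----------------------
# A minimal sketch of how extensions feed into run_extensions above,
# assuming `model` and `optimizer` are already defined. LogReport runs
# immediately when its trigger fires, while snapshot() carries
# PRIORITY_SNAPSHOT and is therefore deferred until every trigger has
# been evaluated, as explained in the comments above.
import pytorch_pfn_extras as ppe
from pytorch_pfn_extras.training import extensions as ppe_extensions

manager = ppe.training.ExtensionsManager(
    model, optimizer, max_epochs=10, iters_per_epoch=100)
manager.extend(ppe_extensions.LogReport(), trigger=(1, 'epoch'))
manager.extend(ppe_extensions.snapshot(), trigger=(1, 'epoch'))
# -------------------------------------------------------------------------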
def _reduce(
        values: Sequence[torch.Tensor],
        group: Optional[dist.ProcessGroup],
) -> None:
    size = sum([v.numel() for v in values])
    # Flatten the values to improve the runtime performance of all-reduce.
    coalesced = torch.empty(
        size, device=values[0].device, dtype=values[0].dtype)
    coalesced_views = get_foreach_wrapper().unflatten(  # type: ignore[no-untyped-call]
        coalesced, values)
    get_foreach_wrapper().multi_tensor_scale(values, coalesced_views, 1.0)
    with record("torch.distributed.all_reduce",
                use_cuda=torch.cuda.is_available()):
        dist.all_reduce(coalesced, group=group)  # type: ignore[no-untyped-call]
    # Unflatten the values.
    get_foreach_wrapper().multi_tensor_scale(
        coalesced_views, values,
        1.0 / dist.get_world_size(group)  # type: ignore[no-untyped-call]
    )
def _broadcast(
        values: Sequence[torch.Tensor],
        group: Optional[dist.ProcessGroup],
) -> None:
    with torch.no_grad():  # type: ignore[no-untyped-call]
        coalesced = get_foreach_wrapper().flatten(values)
        with record("torch.distributed.broadcast",
                    use_cuda=torch.cuda.is_available()):
            dist.broadcast(coalesced, 0, group=group)  # type: ignore[no-untyped-call]
        src = get_foreach_wrapper().unflatten(coalesced, values)
        get_foreach_wrapper().multi_tensor_scale(src, values, 1.0)
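# --- Illustration (not part of the library source) ----------------------
# A plain-PyTorch sketch of the coalesce -> all_reduce -> scatter-back
# pattern that _reduce and _broadcast implement via the foreach wrapper.
# `naive_allreduce_mean` is a hypothetical helper for illustration only;
# it assumes torch.distributed is already initialized and that all
# tensors share one device and dtype.
import torch
import torch.distributed as dist


def naive_allreduce_mean(values):
    # One flat buffer means a single collective call instead of one
    # all_reduce per tensor.
    coalesced = torch.cat([v.reshape(-1) for v in values])
    dist.all_reduce(coalesced)
    coalesced /= dist.get_world_size()
    # Copy the averaged values back into the original tensors.
    offset = 0
    for v in values:
        n = v.numel()
        v.copy_(coalesced[offset:offset + n].view_as(v))
        offset += n
# -------------------------------------------------------------------------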
def run_extensions(self) -> None:
    self._model_available = self.needs_model_state(self.iteration)
    to_run = []
    for name, entry in self.extensions:
        # When iterations are deferred we only launch the extensions
        # that do not need the training status to advance, i.e.,
        # extensions set to execute at a given interval of executions.
        if entry.trigger(self):
            # Execution of snapshot extensions is deferred until all the
            # triggers are evaluated.
            # If we didn't do this, when two (or more) snapshot extensions
            # are registered and their triggers are stateful, the first
            # snapshot extension would save the state of the second
            # trigger before it is invoked, even though it will be
            # executed later in this iteration, making them fire again
            # just after resuming from the snapshot saved by the first
            # snapshot extension.
            # Non-snapshot extensions are executed right away (note that
            # the order is already sorted by priority) as they may report
            # values needed by other triggers, e.g., a trigger based on a
            # value reported by an evaluator.
            if entry.priority == extension_module.PRIORITY_SNAPSHOT:
                to_run.append((name, entry.extension))
            else:
                with record(
                        f'pytorch_pfn_extras.training.ExtensionsManager'
                        f'.run_extensions:{name}',
                        enable=self._enable_profile,
                ):
                    entry.extension(self)
    for name, extension in to_run:
        with record(
                f'pytorch_pfn_extras.training.ExtensionsManager'
                f'.run_extensions:{name}',
                enable=self._enable_profile,
        ):
            extension(self)
    self._model_available = True
def run(self,
        train_loader: torch.utils.data.DataLoader,
        val_loader: Optional[torch.utils.data.DataLoader] = None,
        *,
        train_len: Optional[int] = None,
        eval_len: Optional[int] = None):
    """Executes the training loop.

    Args:
        train_loader (torch.utils.data.DataLoader):
            A data loader for training.
        val_loader (torch.utils.data.DataLoader, optional):
            A data loader passed to ``Evaluator.run()``.
        train_len (int, optional):
            The number of iterations per one training epoch. The default
            value is inferred from the size of the training data loader.
        eval_len (int, optional):
            The number of iterations per one evaluation epoch, passed
            to ``Evaluator.run()``.

    .. seealso::
        - :meth:`pytorch_pfn_extras.training._evaluator._Evaluator`
    """
    if train_len is None:
        train_len = len(train_loader)
    self._val_loader = val_loader
    self._eval_len = eval_len

    class _EvaluatorExt:
        def __init__(self, trainer):
            self.name = 'Evaluator'
            self.needs_model_state = True
            self._trainer = trainer

        def __call__(self, manager):
            self._trainer._run_evaluator()

    if self._manager is None:
        self._setup_manager(train_len)
        if self.evaluator is not None:
            # Register the evaluator as an extension to the manager
            # so that it is triggered with the correct timing.
            self._manager.extend(
                _EvaluatorExt(self),
                trigger=self.evaluator_trigger,
                priority=extension.PRIORITY_WRITER,
            )
        self.handler.train_setup(self, train_loader)
        if self.evaluator is not None:
            self.evaluator.handler.eval_setup(self.evaluator, val_loader)

    while not self.manager.stop_trigger:
        self.handler.train_epoch_begin(self, train_loader)

        # When iterations are completed in the callback, these queues
        # avoid having to constantly pass the parameters around.
        self._idxs = queue.Queue()
        self._inputs = queue.Queue()
        self._times = queue.Queue()
        self._observed = queue.Queue()

        # The iterator must be created after `train_epoch_begin` as it
        # may be using a DistributedSampler.
        loader_iter = iter(train_loader)
        self._profile_records = queue.Queue()
        for idx in range(train_len):
            with record(
                "pytorch_pfn_extras.training.Trainer:iteration",
                use_cuda=torch.cuda.is_available()
            ) as ntf0:
                try:
                    with record(
                        "pytorch_pfn_extras.training.Trainer:get_data"
                    ):
                        x = next(loader_iter)
                except StopIteration:
                    loader_iter = iter(train_loader)
                    with record(
                        "pytorch_pfn_extras.training.Trainer:get_data"
                    ):
                        x = next(loader_iter)
                begin = time.time()
                self._idxs.put(idx)
                self._inputs.put(x)
                self._times.put(begin)
                self._deferred = True
                with record(
                    "pytorch_pfn_extras.training.Trainer:run_iteration",
                    use_cuda=torch.cuda.is_available()
                ) as ntf1, \
                        self.manager.run_iteration() as iter_notifier:
                    self._observed.put(self.manager.observation)
                    with record(
                        "pytorch_pfn_extras.training.Trainer:train_step",
                        use_cuda=torch.cuda.is_available(),
                    ) as ntf2:
                        self._profile_records.put([ntf0, ntf1, ntf2])
                        self.handler.train_step(
                            self, idx, x, complete_fn=self._complete_step)
                        # Check if the callback was called
                        if self._deferred:
                            # The iteration will be completed later
                            ntf0.defer()
                            ntf1.defer()
                            ntf2.defer()
                            iter_notifier.defer()
            # In some cases, DataLoaders are continuous and will keep
            # yielding results even if the epoch is completed. We
            # forcefully exit at the end of every epoch.
            if (
                self.is_epoch_last_iter(idx)
                or self.manager.stop_trigger
            ):
                break
        # For handlers that support a completely asynchronous model,
        # train_epoch_end will take care of completing pending work.
        self.handler.train_epoch_end(self)
def run(self,
        train_loader: Iterable[Any],
        val_loader: Optional[Iterable[Any]] = None,
        *,
        train_len: Optional[int] = None,
        eval_len: Optional[int] = None) -> None:
    """Executes the training loop.

    Args:
        train_loader (torch.utils.data.DataLoader):
            A data loader for training.
        val_loader (torch.utils.data.DataLoader, optional):
            A data loader passed to ``Evaluator.run()``.
        train_len (int, optional):
            The number of iterations per one training epoch. The default
            value is inferred from the size of the training data loader.
        eval_len (int, optional):
            The number of iterations per one evaluation epoch, passed
            to ``Evaluator.run()``.

    .. seealso::
        - :meth:`pytorch_pfn_extras.training._evaluator.Evaluator`
    """
    if train_len is None:
        train_len = len(train_loader)  # type: ignore[arg-type]
    if eval_len is None and val_loader is not None:
        eval_len = len(val_loader)  # type: ignore[arg-type]
    self._train_len = train_len
    self._eval_len = eval_len

    class _EvaluatorExt:
        def __init__(
                self,
                trainer: 'Trainer',
                evaluator: 'Evaluator',
                val_loader: Optional[Iterable[Any]],
                eval_len: Optional[int],
        ) -> None:
            self.needs_model_state = True
            self._trainer = trainer
            self._evaluator = evaluator
            self._val_loader = val_loader
            self._eval_len = eval_len

        def __call__(self, manager: ExtensionsManagerProtocol) -> None:
            evaluator = self._evaluator
            if self._val_loader is None:
                raise ValueError('"val_loader" is not given.')
            evaluator.handler.train_validation_begin(
                self._trainer, evaluator)
            evaluator.run(self._val_loader, eval_len=self._eval_len)
            evaluator.handler.train_validation_end(self._trainer, evaluator)

    if self._manager is None:
        self._manager = self._setup_manager(train_len)
        for name, (evaluator, trigger) in self._evaluators.items():
            # Register the evaluator as an extension to the manager
            # so that it is triggered with the correct timing.
            self._manager.extend(
                _EvaluatorExt(self, evaluator, val_loader, eval_len),
                name=name,
                trigger=trigger_module.get_trigger(trigger),
                priority=extension.PRIORITY_WRITER,
            )
        self.handler.train_setup(self, train_loader)
        if len(self._evaluators) == 0:
            if val_loader is not None:
                warnings.warn(
                    '`val_loader` is given whereas the evaluator is '
                    'missing.', UserWarning)
        else:
            if val_loader is None:
                raise ValueError('`val_loader` is required')
            for _, (evaluator, _) in self._evaluators.items():
                evaluator.handler.eval_setup(evaluator, val_loader)

    with self._profile or _nullcontext() as prof:
        while not self.manager.stop_trigger:
            self.handler.train_epoch_begin(self, train_loader)

            # When iterations are completed in the callback, these
            # queues avoid having to constantly pass the parameters
            # around.
            self._idxs: 'queue.Queue[int]' = queue.Queue()
            self._inputs: 'queue.Queue[Any]' = queue.Queue()
            self._times: 'queue.Queue[float]' = queue.Queue()
            self._observed: 'queue.Queue[reporting.Observation]' = \
                queue.Queue()

            # The iterator must be created after `train_epoch_begin` as
            # it may be using a DistributedSampler.
            loader_iter = iter(train_loader)
            self._profile_records: \
                'queue.Queue[List[_ReportNotification]]' = queue.Queue()
            for idx in range(train_len):
                with record(
                    "pytorch_pfn_extras.training.Trainer:iteration",
                    use_cuda=torch.cuda.is_available(),
                    enable=self._enable_profile
                ) as ntf0:
                    try:
                        with record(
                            "pytorch_pfn_extras.training.Trainer:get_data",
                            enable=self._enable_profile
                        ):
                            x = next(loader_iter)
                    except StopIteration:
                        loader_iter = iter(train_loader)
                        with record(
                            "pytorch_pfn_extras.training.Trainer:get_data",
                            enable=self._enable_profile
                        ):
                            x = next(loader_iter)
                    begin = time.time()
                    self._idxs.put(idx)
                    self._inputs.put(x)
                    self._times.put(begin)
                    with record(
                        "pytorch_pfn_extras.training.Trainer:run_iteration",
                        use_cuda=torch.cuda.is_available(),
                        enable=self._enable_profile
                    ) as ntf1, \
                            self.manager.run_iteration():
                        self._observed.put(self.manager.observation)
                        with record(
                            "pytorch_pfn_extras.training.Trainer:train_step",
                            use_cuda=torch.cuda.is_available(),
                            enable=self._enable_profile
                        ) as ntf2:
                            self._profile_records.put([ntf0, ntf1, ntf2])
                            self.handler.train_step(
                                self, idx, x,
                                complete_fn=self._complete_step)
                            # Check if the callback was called
                if prof is not None:
                    prof.step()  # type: ignore[no-untyped-call]
                # In some cases, DataLoaders are continuous and will keep
                # yielding results even if the epoch is completed. We
                # forcefully exit at the end of every epoch.
                if self.is_epoch_last_iter(idx) or self.manager.stop_trigger:
                    break
            # For handlers that support a completely asynchronous model,
            # train_epoch_end will take care of completing pending work.
            self.handler.train_epoch_end(self)
            if prof is not None:
                prof.on_trace_ready = None
    self.handler.train_cleanup(self)
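# --- Illustration (not part of the library source) ----------------------
# A hedged usage sketch of the training loop above via the public engine
# helpers. `train_loader` and `val_loader` are assumed to exist and to
# yield batches in the format expected by the configured handler/logic,
# and the exact keyword names of create_trainer may differ between
# releases; check your installed version.
import torch
import pytorch_pfn_extras as ppe

model = torch.nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
trainer = ppe.engine.create_trainer(
    model, optimizer, max_epochs=5,
    evaluator=ppe.engine.create_evaluator(model, device=device),
    device=device)
trainer.run(train_loader, val_loader)
# -------------------------------------------------------------------------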
def _synchronize(self) -> None:
    if not self._require_sync:
        return
    for hook in self._comm_hooks.values():
        hook(self)

    with record_function(
            "ppe.nn.parallel.DistributedDataParallel.synchronize"):
        params = dict(self.named_parameters())
        if self._negotiate_grads:
            # Find the parameters that have gradients.
            has_grads = torch.tensor([
                params[name].grad is not None
                for name in self._sorted_param_keys
            ], device=self._device)

            # Cast to long because bool may not be usable in all_reduce.
            has_grads = has_grads.long()
            with record(
                    "pytorch_pfn_extras.nn.parallel."
                    "DistributedDataParallel:coordinate",
                    use_cuda=torch.cuda.is_available(),
            ):
                dist.all_reduce(  # type: ignore[no-untyped-call]
                    has_grads, op=dist.ReduceOp.MAX)

            for name, has_grad in zip(self._sorted_param_keys,
                                      has_grads.bool().cpu()):
                # Create a zero tensor as the gradient if this parameter
                # has no gradient but other processes require it to be
                # synchronized.
                if has_grad and params[name].grad is None:
                    params[name].grad = torch.zeros_like(params[name].data)

        grads = [params[name].grad for name in self._sorted_param_keys
                 if params[name].grad is not None]
        groups = _group_by_type(grads)
        with record(
                "pytorch_pfn_extras.nn.parallel."
                "DistributedDataParallel:reduce_gradient",
                use_cuda=torch.cuda.is_available(),
        ):
            for group in groups:
                self._reduce_function(group, self._process_group)

        if self._broadcast_buffers:
            buffers = dict(self.named_buffers())
            bufs = [buffers[name] for name in self._sorted_buffer_keys]
            groups = _group_by_type(bufs)
            with record(
                    "pytorch_pfn_extras.nn.parallel."
                    "DistributedDataParallel:broadcast_buffer",
                    use_cuda=torch.cuda.is_available(),
            ):
                for group in groups:
                    self._broadcast_function(group, self._process_group)
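# --- Illustration (not part of the library source) ----------------------
# A standalone sketch of the gradient "negotiation" step performed above:
# each rank reports which parameters received a gradient, the flags are
# max-reduced across ranks, and ranks missing a negotiated gradient fill
# in zeros so the later all-reduce sees the same tensor list everywhere.
# `negotiate_grads` is a hypothetical helper; it assumes an initialized
# process group and that `params` is ordered identically on every rank.
import torch
import torch.distributed as dist


def negotiate_grads(params, device):
    has_grads = torch.tensor(
        [p.grad is not None for p in params], device=device).long()
    dist.all_reduce(has_grads, op=dist.ReduceOp.MAX)
    for p, flag in zip(params, has_grads.bool().cpu()):
        if flag and p.grad is None:
            p.grad = torch.zeros_like(p)
    return [p.grad for p in params if p.grad is not None]
# -------------------------------------------------------------------------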