def get_batch_length(self, batch: Any) -> int:
    """Count the number of records in a given batch.

    Override this method when you are using custom batch types, as produced
    when iterating over the `DataLoader`.
    For example, when using `pytorch_geometric`:

    .. code-block:: python

        # Extra imports:
        from determined.pytorch import DataLoader
        from torch_geometric.data.dataloader import Collater

        # Trial methods:
        def build_training_data_loader(self):
            return DataLoader(
                self.train_subset,
                batch_size=self.context.get_per_slot_batch_size(),
                collate_fn=Collater([], []),
            )

        def get_batch_length(self, batch):
            # `batch` is `torch_geometric.data.batch.Batch`.
            return batch.num_graphs

    Arguments:
        batch (Any): input training or validation data batch object.
    """
    return pytorch.data_length(batch)

def _records_in_batch(self, batch):
    """Count the number of records in a batch. Only needs overriding for unusual datasets."""
    return pytorch.data_length(batch)

def _compute_validation_metrics(self) -> workload.Response:
    self.context.experimental.reset_reducers()
    # Set the behavior of certain layers (e.g., dropout) that are
    # different between training and inference.
    for model in self.context.models:
        model.eval()

    for callback in self.callbacks.values():
        logging.warning(
            "on_validation_step_start is now deprecated, please use on_validation_start instead"
        )
        callback.on_validation_step_start()

    for callback in self.callbacks.values():
        callback.on_validation_start()

    num_inputs = 0
    metrics = {}  # type: Dict[str, Any]

    if self._evaluate_batch_defined():
        keys = None
        batch_metrics = []

        self.validation_loader = cast(torch.utils.data.DataLoader, self.validation_loader)
        check.gt(len(self.validation_loader), 0)
        for batch in self.validation_loader:
            batch = self.context.to_device(batch)
            num_inputs += pytorch.data_length(batch)

            vld_metrics = self.trial.evaluate_batch(batch=batch)
            # Verify validation metric names are the same across batches.
            if keys is None:
                keys = vld_metrics.keys()
            else:
                check.eq(
                    keys,
                    vld_metrics.keys(),
                    "Validation metric names must match across all batches of data.",
                )
            check.is_instance(
                vld_metrics,
                dict,
                "validation_metrics() must return a "
                "dictionary of string names to Tensor "
                "metrics",
            )
            # TODO: For performance perform -> cpu() only at the end of validation.
            batch_metrics.append(self._convert_metrics_to_numpy(vld_metrics))
            if self.env.test_mode:
                break

        metrics = self._reduce_metrics(
            batch_metrics=batch_metrics,
            keys=keys,
            metrics_reducers=self._prepare_metrics_reducers(keys=keys),
        )

        if self.hvd_config.use:
            num_inputs *= hvd.size()

    else:
        check.true(self._evaluate_full_dataset_defined())
        self.validation_loader = cast(torch.utils.data.DataLoader, self.validation_loader)
        if self.is_chief:
            metrics = self.trial.evaluate_full_dataset(data_loader=self.validation_loader)

            check.is_instance(
                metrics, dict, f"eval() must return a dictionary, got {type(metrics)}."
            )

            metrics = self._convert_metrics_to_numpy(metrics)
            num_inputs = self.context.get_per_slot_batch_size() * len(self.validation_loader)

    metrics.update(
        self._convert_metrics_to_numpy(
            self.context.experimental.reduce_metrics(for_training=False)
        )
    )

    if self.hvd_config.use and any(
        map(
            lambda c: util.is_overridden(c.on_validation_end, pytorch.PyTorchCallback)
            or util.is_overridden(c.on_validation_step_end, pytorch.PyTorchCallback),
            self.callbacks.values(),
        )
    ):
        logging.debug(
            "Broadcasting metrics to all worker processes to execute a "
            "validation step end callback"
        )
        metrics = hvd.broadcast_object(metrics, root_rank=0)

    for callback in self.callbacks.values():
        logging.warning(
            "on_validation_step_end is now deprecated, please use on_validation_end instead"
        )
        callback.on_validation_step_end(metrics)

    for callback in self.callbacks.values():
        callback.on_validation_end(metrics)

    if not self.is_chief:
        return workload.Skipped()

    return {"num_inputs": num_inputs, "validation_metrics": metrics}

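# Illustrative sketch (an assumption, not code from this repository): a trial's
# `evaluate_batch` that satisfies the contract enforced by
# `_compute_validation_metrics` above. It must return a dict mapping string names
# to metric values, with identical keys on every validation batch. Only the
# relevant method of the hypothetical `_ExampleTrial` is shown; `self.model` is
# assumed to have been set up elsewhere (e.g. in `__init__`).
import torch
from determined.pytorch import PyTorchTrial


class _ExampleTrial(PyTorchTrial):
    def evaluate_batch(self, batch):
        features, labels = batch
        output = self.model(features)
        loss = torch.nn.functional.cross_entropy(output, labels)
        accuracy = (output.argmax(dim=1) == labels).to(torch.float32).mean()
        # The same keys must be returned for every batch.
        return {"validation_loss": loss, "accuracy": accuracy}
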
def _train_for_step(
    self, step_id: int, num_batches: int, total_batches_processed: int
) -> workload.Response:
    check.gt(step_id, 0)
    self.context.experimental.reset_reducers()

    # Set the behavior of certain layers (e.g., dropout) that are different
    # between training and inference.
    for model in self.context.models:
        model.train()

    start = total_batches_processed
    end = start + num_batches

    per_batch_metrics = []  # type: List[Dict]
    num_inputs = 0

    for batch_idx in range(start, end):
        batch = next(self.training_iterator)
        num_inputs += pytorch.data_length(batch)
        batch = self.context.to_device(batch)

        self.context._current_batch_idx = batch_idx
        if self.context.is_epoch_start():
            for callback in self.callbacks.values():
                callback.on_training_epoch_start()
        self.context._loss_ids = {}
        tr_metrics = self.trial.train_batch(
            batch=batch,
            epoch_idx=self.get_epoch_idx(batch_idx),
            batch_idx=batch_idx,
        )
        if self._should_update_scaler():
            self.context._scaler.update()
        if isinstance(tr_metrics, torch.Tensor):
            tr_metrics = {"loss": tr_metrics}
        check.is_instance(
            tr_metrics,
            dict,
            "train_batch() must return a dictionary "
            f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
        )

        # Step learning rate of a pytorch.LRScheduler.
        for lr_scheduler in self.context.lr_schedulers:
            self._auto_step_lr_scheduler_per_batch(batch_idx, lr_scheduler)

        for name, metric in tr_metrics.items():
            # Convert PyTorch metric values to NumPy, so that
            # `det.util.encode_json` handles them properly without
            # needing a dependency on PyTorch.
            if isinstance(metric, torch.Tensor):
                metric = metric.cpu().detach().numpy()
            tr_metrics[name] = metric

        per_batch_metrics.append(tr_metrics)

    # Aggregate and reduce training metrics from all the training processes.
    if self.hvd_config.use and self.hvd_config.average_training_metrics:
        per_batch_metrics = self._average_training_metrics(per_batch_metrics)
    if self.hvd_config.use:
        num_inputs *= hvd.size()
    metrics = det.util.make_metrics(num_inputs, per_batch_metrics)

    # Ignore batch_metrics entirely for custom reducers; there's no guarantee that per-batch
    # metrics are even logical for a custom reducer.
    metrics["avg_metrics"].update(
        self._convert_metrics_to_numpy(
            self.context.experimental.reduce_metrics(for_training=True)
        )
    )

    if not self.is_chief:
        # The training metrics are reported only in the chief process.
        return workload.Skipped()

    logging.debug(f"Done training step: {num_inputs} records in {num_batches} batches.")

    return metrics

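# Training-side counterpart of the `_ExampleTrial` sketch above (again an
# illustration, not code from this repository; the class line is repeated only so
# the snippet stands alone). It matches the contract checked by `_train_for_step`:
# returning a bare Tensor is treated as {"loss": tensor}, while returning a dict
# lets the trial report extra metrics, which the harness later detaches and
# converts to NumPy. `self.model` and `self.optimizer` are assumed to have been
# wrapped via the trial context in `__init__`.
class _ExampleTrial(PyTorchTrial):
    def train_batch(self, batch, epoch_idx, batch_idx):
        features, labels = batch
        loss = torch.nn.functional.cross_entropy(self.model(features), labels)

        # In this version of the API, the context owns backward() and optimizer steps.
        self.context.backward(loss)
        self.context.step_optimizer(self.optimizer)

        # Either `return loss` or a dict of named metrics is acceptable here.
        return {"loss": loss}
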
def test_data_type_error(data: typing.Any, error: typing.Any) -> None:
    with pytest.raises(error):
        data_length(data)

def test_data_length(data: Data, length: int) -> None:
    assert data_length(data) == length

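# Example parametrization for the tests above. It is based on the assumption that
# `data_length` (imported in this test module from determined.pytorch) returns the
# first-dimension size of a Tensor/array and recurses into lists, tuples, and dict
# values, while rejecting plain Python scalars. The concrete cases below are
# illustrative, not the library's documented behavior.
import pytest
import torch


@pytest.mark.parametrize(
    "data,length",
    [
        (torch.zeros(4, 3), 4),                    # Tensor: length of the first dimension.
        ([torch.zeros(4, 3), torch.zeros(4)], 4),  # List of Tensors.
        ({"features": torch.zeros(8, 2)}, 8),      # Dict of Tensors.
    ],
)
def test_data_length_examples(data, length):
    assert data_length(data) == length
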
def _train_for_step(
    self, step_id: int, num_batches: int, total_batches_processed: int
) -> workload.Response:
    check.gt(step_id, 0)

    # Set the behavior of certain layers (e.g., dropout) that are different
    # between training and inference.
    for model in self.context.models:
        model.train()

    start = total_batches_processed
    end = start + num_batches

    per_batch_metrics = []  # type: List[Dict]
    num_inputs = 0

    for batch_idx in range(start, end):
        batch = next(self.training_iterator)
        num_inputs += data_length(batch)
        batch = self.context._to_device(batch)

        self.context._current_batch_idx = batch_idx
        self.context._loss_ids = {}
        tr_metrics = self.trial.train_batch(
            batch=batch,
            model=self.context.models[0],
            epoch_idx=self.get_epoch_idx(batch_idx),
            batch_idx=batch_idx,
        )
        if isinstance(tr_metrics, torch.Tensor):
            tr_metrics = {"loss": tr_metrics}
        check.is_instance(
            tr_metrics,
            dict,
            "train_batch() must return a dictionary "
            f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
        )
        check.is_in("loss", tr_metrics.keys(), 'Please include "loss" in your training metrics.')

        # Step learning rate of a LRScheduler.
        for lr_scheduler in self.context.lr_schedulers:
            self._auto_step_lr_scheduler_per_batch(batch_idx, lr_scheduler)

        for name, metric in tr_metrics.items():
            # Convert PyTorch metric values to NumPy, so that
            # `det.util.encode_json` handles them properly without
            # needing a dependency on PyTorch.
            if isinstance(metric, torch.Tensor):
                metric = metric.cpu().detach().numpy()
            tr_metrics[name] = metric

        check.is_in("loss", tr_metrics, 'Please include "loss" in your training metrics.')
        per_batch_metrics.append(tr_metrics)

    # Aggregate and reduce training metrics from all the training processes.
    if self.hvd_config.use and self.hvd_config.average_training_metrics:
        per_batch_metrics = self._average_training_metrics(per_batch_metrics)
    if self.hvd_config.use:
        num_inputs *= hvd.size()
    metrics = det.util.make_metrics(num_inputs, per_batch_metrics)

    if not self.is_chief:
        # The training metrics are reported only in the chief process.
        return workload.Skipped()

    logging.debug(f"Done training step: {num_inputs} records in {num_batches} batches.")

    return metrics

def _compute_validation_metrics(self) -> workload.Response:
    # Set the behavior of certain layers (e.g., dropout) that are
    # different between training and inference.
    self.model.eval()
    num_inputs = 0
    metrics = {}  # type: Optional[Dict[str, Any]]

    if self._evaluate_batch_defined():
        keys = None
        batch_metrics = []

        self.validation_loader = cast(torch.utils.data.DataLoader, self.validation_loader)
        check.gt(len(self.validation_loader), 0)
        for batch in self.validation_loader:
            batch = self._to_device(batch)
            num_inputs += data_length(batch)

            vld_metrics = self.trial.evaluate_batch(batch=batch, model=self.model)
            # Verify validation metric names are the same across batches.
            if keys is None:
                keys = vld_metrics.keys()
            else:
                check.eq(
                    keys,
                    vld_metrics.keys(),
                    "Validation metric names must match across all batches of data.",
                )
            check.is_instance(
                vld_metrics,
                dict,
                "validation_metrics() must return a "
                "dictionary of string names to Tensor "
                "metrics",
            )
            # TODO: For performance perform -> cpu() only at the end of validation.
            batch_metrics.append(self._convert_metrics_to_numpy(vld_metrics))

        keys = cast(Any, keys)
        metrics = self._reduce_metrics(
            batch_metrics=batch_metrics,
            keys=keys,
            metrics_reducers=self._prepare_metrics_reducers(keys=keys),
        )

        if self.hvd_config.use:
            num_inputs *= hvd.size()

    else:
        check.true(self._evaluate_full_dataset_defined())
        self.validation_loader = cast(torch.utils.data.DataLoader, self.validation_loader)
        if self.is_chief:
            metrics = self.trial.evaluate_full_dataset(
                data_loader=self.validation_loader, model=self.model
            )

            check.is_instance(
                metrics, dict, f"eval() must return a dictionary, got {type(metrics)}."
            )

            metrics = self._convert_metrics_to_numpy(metrics)
            num_inputs = self.context.get_per_slot_batch_size() * len(self.validation_loader)

    if not self.is_chief:
        return workload.Skipped()

    return {"num_inputs": num_inputs, "validation_metrics": metrics}

def _train_for_step(self, step_id: int, batches_per_step: int) -> workload.Response:
    check.gt(step_id, 0)

    step_idx = step_id - 1
    start = step_idx * batches_per_step
    end = start + batches_per_step

    # Set the behavior of certain layers (e.g., dropout) that are different
    # between training and inference.
    self.model.train()

    per_batch_metrics = []  # type: List[Dict]
    num_inputs = 0

    for batch_idx in range(start, end):
        batch = next(self.training_iterator)
        num_inputs += data_length(batch)
        batch = self._to_device(batch)

        # Forward pass.
        tr_metrics = self.trial.train_batch(
            batch=batch,
            model=self.model,
            epoch_idx=self.get_epoch_idx(batch_idx),
            batch_idx=batch_idx,
        )

        if isinstance(tr_metrics, torch.Tensor):
            tr_metrics = {"loss": tr_metrics}
        check.is_instance(
            tr_metrics,
            dict,
            "train_batch() must return a dictionary "
            f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
        )
        check.is_in("loss", tr_metrics.keys(), 'Please include "loss" in your training metrics.')

        # Backwards pass.
        loss = tr_metrics["loss"]
        communicate_and_update = (batch_idx + 1) % self.hvd_config.aggregation_frequency == 0
        if self.use_amp():
            with apex.amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
                if self.hvd_config.use and communicate_and_update:
                    self.optimizer.synchronize()
        else:
            loss.backward()

        if communicate_and_update:
            parameters = (
                self.model.parameters()
                if not self.use_amp()
                else apex.amp.master_params(self.optimizer)
            )

            if self.hvd_config.average_aggregated_gradients:
                self._average_gradients(
                    parameters=parameters, divisor=self.hvd_config.aggregation_frequency
                )

            self._clip_grads(parameters)

            if self.hvd_config.use and self.use_amp():
                with self.optimizer.skip_synchronize():
                    self.optimizer.step()
            else:
                self.optimizer.step()
            self.optimizer.zero_grad()

            if self.lr_helper.should_step_lr(
                batches_completed=batch_idx + 1,
                epoch_length=len(self.training_loader),
                aggregation_frequency=self.hvd_config.aggregation_frequency,
            ):
                self.lr_helper.step()

        for name, metric in tr_metrics.items():
            # Convert PyTorch metric values to NumPy, so that
            # `det.util.encode_json` handles them properly without
            # needing a dependency on PyTorch.
            if isinstance(metric, torch.Tensor):
                metric = metric.cpu().detach().numpy()
            tr_metrics[name] = metric

        check.is_in("loss", tr_metrics, 'Please include "loss" in your training metrics.')
        per_batch_metrics.append(tr_metrics)

    if self.hvd_config.use and self.hvd_config.average_training_metrics:
        per_batch_metrics = self._average_training_metrics(per_batch_metrics)

    if not self.is_chief:
        return workload.Skipped()

    if self.hvd_config.use:
        num_inputs *= hvd.size()

    logging.debug(f"Done training step: {num_inputs} records in {batches_per_step} batches.")

    return det.util.make_metrics(num_inputs, per_batch_metrics)

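# Minimal plain-PyTorch sketch of the gradient-aggregation pattern used above:
# gradients accumulate across `aggregation_frequency` consecutive batches, and the
# optimizer only steps (and zeroes gradients) on the last batch of each group.
# `model`, `optimizer`, `loader`, `loss_fn`, and `aggregation_frequency` are
# placeholders for the example; the real harness additionally handles Horovod
# synchronization, apex AMP loss scaling, and gradient clipping.
def accumulate_and_step(model, optimizer, loader, loss_fn, aggregation_frequency):
    model.train()
    for batch_idx, (features, labels) in enumerate(loader):
        loss = loss_fn(model(features), labels)
        # Optionally divide so the effective update averages over the group,
        # analogous to `_average_gradients(..., divisor=aggregation_frequency)`.
        (loss / aggregation_frequency).backward()
        if (batch_idx + 1) % aggregation_frequency == 0:
            optimizer.step()
            optimizer.zero_grad()
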
def _train_for_step(self, step_id: int, batches_per_step: int) -> workload.Response:
    check.gt(step_id, 0)

    # Set the behavior of certain layers (e.g., dropout) that are different
    # between training and inference.
    self.context.model.train()

    for callback in self.callbacks.values():
        callback.on_train_step_start(step_id)

    step_idx = step_id - 1
    start = step_idx * batches_per_step
    end = start + batches_per_step

    per_batch_metrics = []  # type: List[Dict]
    num_inputs = 0

    for batch_idx in range(start, end):
        batch = next(self.training_iterator)
        num_inputs += data_length(batch)
        batch = self._to_device(batch)

        # Forward pass.
        tr_metrics = self.trial.train_batch(
            batch=batch,
            model=self.context.model,
            epoch_idx=self.get_epoch_idx(batch_idx),
            batch_idx=batch_idx,
        )

        if isinstance(tr_metrics, torch.Tensor):
            tr_metrics = {"loss": tr_metrics}
        check.is_instance(
            tr_metrics,
            dict,
            "train_batch() must return a dictionary "
            f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
        )
        check.is_in("loss", tr_metrics.keys(), 'Please include "loss" in your training metrics.')

        # Backwards pass.
        loss = tr_metrics["loss"]
        communicate_and_update = (batch_idx + 1) % self.hvd_config.aggregation_frequency == 0
        if self.use_amp():
            with apex.amp.scale_loss(loss, self.context.optimizer) as scaled_loss:
                scaled_loss.backward()
                if self.hvd_config.use and communicate_and_update:
                    # When using horovod, we need to finish communicating gradient
                    # updates before they are unscaled, which happens when we exit
                    # this context manager.
                    self.context.optimizer.synchronize()
        else:
            loss.backward()

            # Communication needs to be synchronized so that it is completed
            # before we apply gradient clipping and `step()`.
            if communicate_and_update and self.hvd_config.use:
                self.context.optimizer.synchronize()

        if communicate_and_update:
            parameters = (
                self.context.model.parameters()
                if not self.use_amp()
                else apex.amp.master_params(self.context.optimizer)
            )

            if self.hvd_config.average_aggregated_gradients:
                self._average_gradients(
                    parameters=parameters, divisor=self.hvd_config.aggregation_frequency
                )

            # TODO: Remove this check in v0.12.8.
            check.false(
                self.env.hparams.get("clip_grad_l2_norm", None)
                or self.env.hparams.get("clip_grad_val", None),
                "Please specify gradient clipping via callbacks.",
            )

            for callback in self.callbacks.values():
                callback.on_before_optimizer_step(parameters)

            if self.hvd_config.use:
                with self.context.optimizer.skip_synchronize():
                    self.context.optimizer.step()
            else:
                self.context.optimizer.step()
            self.context.optimizer.zero_grad()

        # Step learning rate of a LRScheduler.
        if self.context.lr_scheduler is not None:
            self._auto_step_lr_scheduler_per_batch(batch_idx, self.context.lr_scheduler)

        for name, metric in tr_metrics.items():
            # Convert PyTorch metric values to NumPy, so that
            # `det.util.encode_json` handles them properly without
            # needing a dependency on PyTorch.
            if isinstance(metric, torch.Tensor):
                metric = metric.cpu().detach().numpy()
            tr_metrics[name] = metric

        check.is_in("loss", tr_metrics, 'Please include "loss" in your training metrics.')
        per_batch_metrics.append(tr_metrics)

    if self.hvd_config.use and self.hvd_config.average_training_metrics:
        per_batch_metrics = self._average_training_metrics(per_batch_metrics)

    if self.hvd_config.use:
        num_inputs *= hvd.size()

    metrics = det.util.make_metrics(num_inputs, per_batch_metrics)

    for callback in self.callbacks.values():
        callback.on_train_step_end(step_id, metrics)

    if not self.is_chief:
        return workload.Skipped()

    logging.debug(f"Done training step: {num_inputs} records in {batches_per_step} batches.")

    return metrics