def _compute_validation_metrics(self) -> workload.Response:
    validation_start_time = time.time()
    metrics = self._launch_evaluate()
    num_inputs, num_batches = self.multiplexer.get_test_inputs()

    if self.context.distributed.size > 1:
        # Use a global ZMQ barrier here because we have observed cases where hvd.allreduce
        # may hang when called minutes apart by different workers which may happen if
        # workers complete evaluation at different speeds.
        _ = self.context.distributed.gather(None)

        num_inputs = hvd.allreduce(num_inputs, average=False, name="validation_num_inputs")
        if isinstance(num_inputs, EagerTensor):
            # Horovod will promote an int to a tensor in eager mode.
            num_inputs = num_inputs.numpy()
        num_batches = hvd.allreduce(num_batches, average=False, name="validation_num_batches")
        if isinstance(num_batches, EagerTensor):
            num_batches = num_batches.numpy()

    metrics = self._allreduce_logs(metrics)
    check.gt(len(metrics), 0)

    self.multiplexer._test_end(metrics)

    if not self.is_chief:
        return {}

    step_duration = time.time() - validation_start_time
    logging.info(det.util.make_timing_log("validated", step_duration, num_inputs, num_batches))

    self.metric_writer.on_validation_step_end(self.steps_completed, metrics)
    self.upload_tb_files()
    return {"num_inputs": num_inputs, "validation_metrics": metrics}

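# A minimal standalone sketch of the count-summing pattern used above: with
# average=False, hvd.allreduce sums the per-worker counts, and in TF eager mode
# Horovod promotes the Python int to an EagerTensor that must be unwrapped.
# Assumes an initialized horovod.tensorflow setup; the count of 128 is a
# hypothetical per-worker value, not from the original source.
import horovod.tensorflow as hvd
from tensorflow.python.framework.ops import EagerTensor

hvd.init()
local_num_inputs = 128  # hypothetical per-worker input count
total_inputs = hvd.allreduce(local_num_inputs, average=False, name="num_inputs_demo")
if isinstance(total_inputs, EagerTensor):
    total_inputs = total_inputs.numpy()  # back to a plain scalar
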
def compute_validation_metrics(self) -> workload.Response:
    (
        validation_data,
        validation_steps,
    ) = self._validation_input_manager.get_validation_input_and_num_batches()

    metrics_values = self.model.evaluate(validation_data, steps=validation_steps, verbose=0)

    # If the model was compiled with metrics=None, metrics_values will be a single value.
    # Wrap it in a list (not a tuple) so the in-place allreduce below can assign into it.
    if not isinstance(metrics_values, (tuple, list)):
        metrics_values = [metrics_values]

    if self.hvd_config.use:
        for index, metric_value in enumerate(metrics_values):
            metrics_values[index] = np.array(hvd.allreduce(metric_value))

    num_inputs = self._validation_input_manager.stop_validation_input_and_get_num_inputs()

    if not self.is_chief:
        return workload.Skipped()

    metrics = make_logs(self.model, {}, metrics_values, ModeKeys.TEST, prefix="val_")
    check.gt(len(metrics), 0)

    return {"num_inputs": num_inputs, "validation_metrics": metrics}

def _compute_validation_metrics(self) -> workload.Response:
    metrics = self._launch_evaluate()
    num_inputs = self.multiplexer.get_test_inputs()

    if self.hvd_config.use:
        # Use a global ZMQ barrier here because we have observed cases where hvd.allreduce
        # may hang when called minutes apart by different workers which may happen if
        # workers complete evaluation at different speeds.
        self._global_barrier()

        num_inputs = hvd.allreduce(num_inputs, average=False, name="validation_num_inputs")
        if isinstance(num_inputs, EagerTensor):
            # Horovod will promote an int to a tensor in eager mode.
            num_inputs = num_inputs.numpy()

    metrics = self._allreduce_logs(metrics)
    check.gt(len(metrics), 0)

    self.multiplexer._test_end(metrics)

    if not self.is_chief:
        return workload.Skipped()

    return {"num_inputs": num_inputs, "validation_metrics": metrics}

def _allreduce_logs(self, logs: Dict) -> Dict:
    if not self.hvd_config.use:
        return logs
    # Reduce logs in key-sorted order to be deterministic across workers.
    keys = sorted(logs)
    logging.debug(f"all-reducing logs on worker {hvd.rank()} for {len(keys)} keys {keys}.")
    return {key: np.array(hvd.allreduce(logs[key], name=key)) for key in keys}

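# Why the sorted() above matters: Horovod collectives must be issued in the same
# order on every rank, and a logs dict built in a different order on each worker
# would otherwise mismatch the allreduce calls and stall. A minimal sketch,
# assuming hvd.init() has already been called; the metric values are hypothetical.
import numpy as np
import horovod.tensorflow as hvd

logs = {"val_accuracy": 0.91, "val_loss": 0.34}  # hypothetical per-worker metrics
reduced = {key: np.array(hvd.allreduce(logs[key], name=key)) for key in sorted(logs)}
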
def _post_train_batch_end(self, num_inputs: int, logs: Dict) -> None:
    # Remove default keras metrics we aren't interested in like "batch" and "size".
    self.train_workload_metrics.append(
        {k: v for k, v in logs.items() if k not in {"batch", "size"}}
    )
    self.train_workload_inputs += num_inputs
    self.train_workload_batches += 1
    if self.train_workload_batches != self.train_workload_len:
        return

    if self.train_response_func is None:
        raise AssertionError(
            "Callback should avoid calling model.predict(), "
            "as this will affect Determined training behavior",
        )

    if self.hvd_config.use:
        num_inputs = hvd.allreduce(num_inputs, average=False, name="train_num_inputs")
        if isinstance(num_inputs, EagerTensor):
            # Horovod will promote an int to a tensor in eager mode.
            num_inputs = num_inputs.numpy()

    # Return only the latest metrics, which is the running average for all trained batches in
    # the step (Keras does not report individual logs, only running averages at any point).
    final_metrics = self.train_workload_metrics[-1]
    if self.env.experiment_config.averaging_training_metrics_enabled():
        final_metrics = self._allreduce_logs(final_metrics)

    self.multiplexer._train_workload_end(final_metrics)
    self._stop_training_check()

    if self.is_chief:
        # Don't use det.util.make_metrics, because our batch metrics are not raw metrics.
        response = {
            "metrics": {
                "num_inputs": num_inputs,
                "batch_metrics": self.train_workload_metrics,
                "avg_metrics": final_metrics,
            },
            "stop_requested": self.context.get_stop_requested(),
        }
        self.train_response_func(response)
    else:
        self.train_response_func(workload.Skipped())

    self.train_response_func = None

    self._control_loop()

    # Always reset metrics before starting a new training step.
    assert self.model is not None
    self.model.reset_metrics()

def forward(self, outputs, targets, eval=False):
    """Perform the loss computation.

    Parameters:
        outputs: dict of tensors; see the output specification of the model for the format.
        targets: list of dicts, such that len(targets) == batch_size. The expected keys in
                 each dict depend on the losses applied; see each loss' doc.
    """
    outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}

    # Retrieve the matching between the outputs of the last layer and the targets.
    indices = self.matcher(outputs_without_aux, targets)

    # Compute the average number of target boxes across all nodes, for normalization purposes.
    num_boxes = sum(len(t["labels"]) for t in targets)
    num_boxes = torch.as_tensor(
        [num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device
    )
    if self.world_size > 1 and not eval:
        # Horovod allreduce defaults to averaging, while torch.distributed.all_reduce
        # defaults to summing.
        num_boxes = hvd.allreduce(num_boxes, name="num_boxes")
    num_boxes = torch.clamp(num_boxes, min=1).item()

    # Compute all the requested losses.
    losses = {}
    for loss in self.losses:
        losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))

    # In case of auxiliary losses, repeat this process with the output of each
    # intermediate layer.
    if "aux_outputs" in outputs:
        for i, aux_outputs in enumerate(outputs["aux_outputs"]):
            indices = self.matcher(aux_outputs, targets)
            for loss in self.losses:
                if loss == "masks":
                    # Intermediate mask losses are too costly to compute, so we skip them.
                    continue
                kwargs = {}
                if loss == "labels":
                    # Logging is enabled only for the last layer.
                    kwargs = {"log": False}
                l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
                l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
                losses.update(l_dict)

    return losses

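# A sketch of the default-op difference noted in the comment above: Horovod's
# allreduce averages by default, while torch.distributed.all_reduce defaults to
# ReduceOp.SUM, so a sum must be requested explicitly when porting between the
# two. Assumes horovod.torch with hvd.init() already called; the tensor value
# is hypothetical.
import horovod.torch as hvd
import torch

hvd.init()
x = torch.tensor([4.0])
averaged = hvd.allreduce(x, name="num_boxes_avg")            # Horovod default: average
summed = hvd.allreduce(x, op=hvd.Sum, name="num_boxes_sum")  # matches torch.distributed's SUM default
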
def _hvd_allreduce(self, value: Any, average: bool, name: str) -> Any:
    # The signature of our horovod allreduce changed after we rebased onto 0.21.
    hvd_sig = inspect.signature(hvd.allreduce)
    horovod_kwargs = {
        "value": value,
        "name": name,
    }  # type: Dict[str, Any]

    if "op" in hvd_sig.parameters:
        horovod_kwargs["op"] = hvd.Average if average else hvd.Sum

        # `average` has not yet been removed, but it is deprecated. It defaults to
        # True, and horovod does not support specifying an op while average is not
        # None.
        if "average" in hvd_sig.parameters:
            horovod_kwargs["average"] = None
    else:
        horovod_kwargs["average"] = average

    return hvd.allreduce(**horovod_kwargs)

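# Hedged usage sketch for the shim above; `controller` is a hypothetical object
# exposing _hvd_allreduce, not from the original source. On Horovod >= 0.21 the
# call resolves to op=hvd.Sum/hvd.Average (with average=None where the
# deprecated keyword still exists); on older builds it falls back to
# average=True/False.
num_inputs = 128   # hypothetical per-worker count
loss_value = 0.25  # hypothetical per-worker metric
summed = controller._hvd_allreduce(value=num_inputs, average=False, name="train_num_inputs")
averaged = controller._hvd_allreduce(value=loss_value, average=True, name="mean_loss")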