Example #1
    def _compute_validation_metrics(self) -> workload.Response:
        validation_start_time = time.time()
        metrics = self._launch_evaluate()
        num_inputs, num_batches = self.multiplexer.get_test_inputs()

        if self.context.distributed.size > 1:
            # Use a global ZMQ barrier here because we have observed cases where hvd.allreduce
            # may hang when called minutes apart by different workers, which can happen if
            # workers complete evaluation at different speeds.
            _ = self.context.distributed.gather(None)

            num_inputs = hvd.allreduce(num_inputs, average=False, name="validation_num_inputs")
            if isinstance(num_inputs, EagerTensor):
                # Horovod will promote an int to a tensor in eager mode.
                num_inputs = num_inputs.numpy()
            num_batches = hvd.allreduce(num_batches, average=False, name="validation_num_batches")
            if isinstance(num_batches, EagerTensor):
                num_batches = num_batches.numpy()

        metrics = self._allreduce_logs(metrics)
        check.gt(len(metrics), 0)

        self.multiplexer._test_end(metrics)

        if not self.is_chief:
            return {}

        step_duration = time.time() - validation_start_time
        logging.info(det.util.make_timing_log("validated", step_duration, num_inputs, num_batches))

        self.metric_writer.on_validation_step_end(self.steps_completed, metrics)
        self.upload_tb_files()
        return {"num_inputs": num_inputs, "validation_metrics": metrics}
Example #2
    def compute_validation_metrics(self) -> workload.Response:
        (
            validation_data,
            validation_steps,
        ) = self._validation_input_manager.get_validation_input_and_num_batches()

        metrics_values = self.model.evaluate(validation_data,
                                             steps=validation_steps,
                                             verbose=0)

        # If the model was compiled with metrics=None, metrics_values will be a single value.
        if not isinstance(metrics_values, (tuple, list)):
            metrics_values = [metrics_values]

        if self.hvd_config.use:
            # Index assignment below requires a mutable sequence, not a tuple.
            metrics_values = list(metrics_values)
            for index, metric_value in enumerate(metrics_values):
                metrics_values[index] = np.array(hvd.allreduce(metric_value))

        num_inputs = self._validation_input_manager.stop_validation_input_and_get_num_inputs()

        if not self.is_chief:
            return workload.Skipped()

        metrics = make_logs(self.model, {},
                            metrics_values,
                            ModeKeys.TEST,
                            prefix="val_")
        check.gt(len(metrics), 0)

        return {"num_inputs": num_inputs, "validation_metrics": metrics}
Example #3
    def _compute_validation_metrics(self) -> workload.Response:
        metrics = self._launch_evaluate()
        num_inputs = self.multiplexer.get_test_inputs()

        if self.hvd_config.use:
            # Use a global ZMQ barrier here because we have observed cases where hvd.allreduce
            # may hang when called minutes apart by different workers, which can happen if
            # workers complete evaluation at different speeds.
            self._global_barrier()

            num_inputs = hvd.allreduce(num_inputs,
                                       average=False,
                                       name="validation_num_inputs")
            if isinstance(num_inputs, EagerTensor):
                # Horovod will promote an int to a tensor in eager mode.
                num_inputs = num_inputs.numpy()

        metrics = self._allreduce_logs(metrics)
        check.gt(len(metrics), 0)

        self.multiplexer._test_end(metrics)

        if not self.is_chief:
            return workload.Skipped()

        return {"num_inputs": num_inputs, "validation_metrics": metrics}
Example #4
    def _allreduce_logs(self, logs: Dict) -> Dict:
        if not self.hvd_config.use:
            return logs
        # Reduce logs in key-sorted order so every worker issues the collective
        # calls in the same deterministic sequence.
        keys = sorted(logs)
        logging.debug(f"all-reducing logs on worker {hvd.rank()} for {len(keys)} keys {keys}.")
        return {key: np.array(hvd.allreduce(logs[key], name=key)) for key in keys}
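
The key-sorted iteration matters because Horovod collectives must be issued in the same order on every worker; iterating a dict in arbitrary order could pair mismatched tensors or deadlock. A minimal usage sketch with illustrative log values:

    import numpy as np
    import horovod.tensorflow as hvd

    hvd.init()

    logs = {"loss": 0.37, "accuracy": 0.88}  # hypothetical per-worker logs

    # Sorting guarantees every worker issues the collective calls in the
    # same sequence, pairing the right tensors across workers.
    reduced = {k: np.array(hvd.allreduce(logs[k], name=k)) for k in sorted(logs)}
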
Example #5
    def _post_train_batch_end(self, num_inputs: int, logs: Dict) -> None:
        # Remove default Keras metrics we aren't interested in, like "batch" and "size".
        self.train_workload_metrics.append(
            {k: v for k, v in logs.items() if k not in {"batch", "size"}}
        )
        self.train_workload_inputs += num_inputs
        self.train_workload_batches += 1
        if self.train_workload_batches != self.train_workload_len:
            return

        if self.train_response_func is None:
            raise AssertionError(
                "Callback should avoid calling model.predict(), "
                "as this will affect Determined training behavior"
            )

        if self.hvd_config.use:
            num_inputs = hvd.allreduce(num_inputs,
                                       average=False,
                                       name="train_num_inputs")
            if isinstance(num_inputs, EagerTensor):
                # Horovod will promote an int to a tensor in eager mode.
                num_inputs = num_inputs.numpy()

        # Return only the latest metrics, which are the running averages over all batches
        # trained in the step (Keras reports only running averages, not per-batch logs).
        final_metrics = self.train_workload_metrics[-1]
        if self.env.experiment_config.averaging_training_metrics_enabled():
            final_metrics = self._allreduce_logs(final_metrics)

        self.multiplexer._train_workload_end(final_metrics)
        self._stop_training_check()

        if self.is_chief:
            # Don't use det.util.make_metrics, because our batch metrics are not raw metrics.
            response = {
                "metrics": {
                    "num_inputs": num_inputs,
                    "batch_metrics": self.train_workload_metrics,
                    "avg_metrics": final_metrics,
                },
                "stop_requested": self.context.get_stop_requested(),
            }
            self.train_response_func(response)
        else:
            self.train_response_func(workload.Skipped())

        self.train_response_func = None

        self._control_loop()

        # Always reset metrics before starting a new training step.
        assert self.model is not None
        self.model.reset_metrics()
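
Note that the workload-end reduction uses two different ops: example counts are summed (average=False), while metric values are averaged (Horovod's default). A minimal sketch contrasting the two, with illustrative values:

    import horovod.tensorflow.keras as hvd

    hvd.init()

    num_inputs = 32    # hypothetical per-worker example count
    final_loss = 0.53  # hypothetical running-average loss on this worker

    # Counts are summed; metrics use the default averaging.
    total_inputs = hvd.allreduce(num_inputs, average=False, name="train_num_inputs")
    mean_loss = hvd.allreduce(final_loss, name="train_loss")
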
Example #6
    def forward(self, outputs, targets, eval=False):
        """This performs the loss computation.
        Parameters:
             outputs: dict of tensors; see the output specification of the model for the format
             targets: list of dicts, such that len(targets) == batch_size.
                      The expected keys in each dict depend on the losses applied; see each loss's doc
        """
        outputs_without_aux = {
            k: v
            for k, v in outputs.items() if k != "aux_outputs"
        }

        # Retrieve the matching between the outputs of the last layer and the targets
        indices = self.matcher(outputs_without_aux, targets)

        # Compute the average number of target boxes across all nodes, for normalization purposes
        num_boxes = sum(len(t["labels"]) for t in targets)
        num_boxes = torch.as_tensor([num_boxes],
                                    dtype=torch.float,
                                    device=next(iter(outputs.values())).device)
        if self.world_size > 1 and not eval:
            # Horovod allreduce defaults to avg while torch allreduce defaults to sum.
            num_boxes = hvd.allreduce(num_boxes, name="num_boxes")
        num_boxes = torch.clamp(num_boxes, min=1).item()

        # Compute all the requested losses
        losses = {}
        for loss in self.losses:
            losses.update(
                self.get_loss(loss, outputs, targets, indices, num_boxes))

        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
        if "aux_outputs" in outputs:
            for i, aux_outputs in enumerate(outputs["aux_outputs"]):
                indices = self.matcher(aux_outputs, targets)
                for loss in self.losses:
                    if loss == "masks":
                        # Intermediate mask losses are too costly to compute, so we ignore them.
                        continue
                    kwargs = {}
                    if loss == "labels":
                        # Logging is enabled only for the last layer
                        kwargs = {"log": False}
                    l_dict = self.get_loss(loss, aux_outputs, targets, indices,
                                           num_boxes, **kwargs)
                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
                    losses.update(l_dict)

        return losses
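
The comment about defaults matters because this criterion normalizes every loss by the global box count: horovod.torch's allreduce averages by default, which matches DETR's original torch.distributed sum followed by division by the world size. A minimal sketch of just that reduction, with an illustrative count:

    import torch
    import horovod.torch as hvd

    hvd.init()

    num_boxes = torch.as_tensor([17.0])  # hypothetical per-worker target count

    # horovod.torch averages by default, so this is already the cross-worker
    # mean; no division by the world size is needed afterwards.
    num_boxes = hvd.allreduce(num_boxes, name="num_boxes")
    num_boxes = torch.clamp(num_boxes, min=1).item()
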
Example #7
    def _hvd_allreduce(self, value: Any, average: bool, name: str) -> Any:
        # The signature of our horovod allreduce changed after we rebased onto 0.21.
        hvd_sig = inspect.signature(hvd.allreduce)
        horovod_kwargs = {
            "value": value,
            "name": name,
        }  # type: Dict[str, Any]

        if "op" in hvd_sig.parameters:
            horovod_kwargs["op"] = hvd.Average if average else hvd.Sum

            # `average` has not been removed yet, but it is deprecated. It
            # defaults to True, and Horovod does not support specifying an op
            # while `average` is not None.
            if "average" in hvd_sig.parameters:
                horovod_kwargs["average"] = None
        else:
            horovod_kwargs["average"] = average

        return hvd.allreduce(**horovod_kwargs)
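
A standalone variant of the same compatibility shim, probing the installed Horovod once at import time rather than on every call (allreduce_sum is a hypothetical helper name, not part of the code above):

    import inspect
    from typing import Any, Dict

    import horovod.tensorflow as hvd

    # Newer Horovod exposes `op`; older versions only accept `average`.
    _PARAMS = inspect.signature(hvd.allreduce).parameters

    def allreduce_sum(value: Any, name: str) -> Any:
        kwargs = {"name": name}  # type: Dict[str, Any]
        if "op" in _PARAMS:
            kwargs["op"] = hvd.Sum
            if "average" in _PARAMS:
                # An op and a non-None average are mutually exclusive.
                kwargs["average"] = None
        else:
            kwargs["average"] = False
        return hvd.allreduce(value, **kwargs)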