    def _compute_validation_metrics(self) -> workload.Response:
        self.context.reset_reducers()
        # Set the behavior of certain layers (e.g., dropout) that are
        # different between training and inference.
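        # In eval mode, for example, dropout layers are disabled and batch normalization
        # layers use their running statistics instead of per-batch statistics.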
        for model in self.context.models:
            model.eval()

        step_start_time = time.time()

        for callback in self.callbacks.values():
            if util.is_overridden(callback.on_validation_step_start,
                                  pytorch.PyTorchCallback):
                logging.warning("on_validation_step_start is now deprecated, "
                                "please use on_validation_start instead")
                callback.on_validation_step_start()

        for callback in self.callbacks.values():
            callback.on_validation_start()

        num_inputs = 0
        metrics = {}  # type: Dict[str, Any]

        if self._evaluate_batch_defined():
            keys = None
            batch_metrics = []

            self.validation_loader = cast(torch.utils.data.DataLoader,
                                          self.validation_loader)
            check.gt(len(self.validation_loader), 0)
            for callback in self.callbacks.values():
                callback.on_validation_epoch_start()
            for idx, batch in enumerate(self.validation_loader):
                if self.context.experimental._auto_to_device:
                    batch = self.context.to_device(batch)
                num_inputs += self.trial.get_batch_length(batch)

                if has_param(self.trial.evaluate_batch, "batch_idx", 2):
                    vld_metrics = self.trial.evaluate_batch(batch=batch,
                                                            batch_idx=idx)
                else:
                    vld_metrics = self.trial.evaluate_batch(
                        batch=batch)  # type: ignore
                check.is_instance(
                    vld_metrics,
                    dict,
                    "evaluate_batch() must return a "
                    "dictionary of string names to Tensor "
                    "metrics",
                )
                # Verify validation metric names are the same across batches.
                if keys is None:
                    keys = vld_metrics.keys()
                else:
                    check.eq(
                        keys,
                        vld_metrics.keys(),
                        "Validation metric names must match across all batches of data.",
                    )
                # TODO: For performance, perform the -> cpu() transfer only at the end of validation.
                batch_metrics.append(
                    pytorch._convert_metrics_to_numpy(vld_metrics))
                if self.env.test_mode:
                    break

            for callback in self.callbacks.values():
                callback.on_validation_epoch_end(batch_metrics)

            metrics = pytorch._reduce_metrics(
                self.context.distributed,
                batch_metrics=batch_metrics,
                keys=keys,
                metrics_reducers=pytorch._prepare_metrics_reducers(
                    self.trial.evaluation_reducer(), keys=keys),
            )

            # Gather a list of per-worker (num_inputs, num_batches) tuples.
            input_counts = self.context.distributed.gather(
                (num_inputs, idx + 1))
            if self.context.distributed.rank == 0:
                assert input_counts is not None
                # Reshape and sum.
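                # For example (illustrative values): gathering from two workers might yield
                # [(512, 16), (480, 15)]; zip(*...) regroups these into (512, 480) and
                # (16, 15), which sum to num_inputs=992 and num_batches=31.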
                num_inputs, num_batches = [sum(n) for n in zip(*input_counts)]

        else:
            check.true(self._evaluate_full_dataset_defined())
            self.validation_loader = cast(torch.utils.data.DataLoader,
                                          self.validation_loader)
            if self.is_chief:
                metrics = self.trial.evaluate_full_dataset(
                    data_loader=self.validation_loader)

                check.is_instance(
                    metrics, dict,
                    f"eval() must return a dictionary, got {type(metrics)}.")

                metrics = pytorch._convert_metrics_to_numpy(metrics)
                num_inputs = self.context.get_per_slot_batch_size() * len(
                    self.validation_loader)

        metrics.update(
            pytorch._convert_metrics_to_numpy(
                self.context.reduce_metrics(for_training=False)))

        if self.context.distributed.size > 1 and any(
                util.is_overridden(c.on_validation_end, pytorch.PyTorchCallback)
                or util.is_overridden(c.on_validation_step_end, pytorch.PyTorchCallback)
                for c in self.callbacks.values()):
            logging.debug(
                "Broadcasting metrics to all worker processes to execute a "
                "validation step end callback")
            metrics = hvd.broadcast_object(metrics, root_rank=0)

        for callback in self.callbacks.values():
            if util.is_overridden(callback.on_validation_step_end,
                                  pytorch.PyTorchCallback):
                logging.warning(
                    "on_validation_step_end is now deprecated, please use on_validation_end instead"
                )
                callback.on_validation_step_end(metrics)

        for callback in self.callbacks.values():
            callback.on_validation_end(metrics)

        if not self.is_chief:
            return {}

        # Skip reporting timings if evaluate_full_dataset() was defined.  This is far less common
        # than evaluate_batch() and we can't know how the user processed their validation data.
        if self._evaluate_batch_defined():
            step_duration = time.time() - step_start_time
            logging.info(
                det.util.make_timing_log("validated", step_duration,
                                         num_inputs, num_batches))

        return {"num_inputs": num_inputs, "validation_metrics": metrics}
Example #2
    def _compute_validation_metrics(self) -> workload.Response:
        self.context.reset_reducers()
        # Set the behavior of certain layers (e.g., dropout) that are
        # different between training and inference.
        for model in self.context.models:
            model.eval()

        step_start_time = time.time()

        for callback in self.callbacks.values():
            if util.is_overridden(callback.on_validation_step_start,
                                  pytorch.PyTorchCallback):
                logging.warning("on_validation_step_start is now deprecated, "
                                "please use on_validation_start instead")
                callback.on_validation_step_start()

        for callback in self.callbacks.values():
            callback.on_validation_start()

        num_inputs = 0
        keys = None
        batch_metrics = []

        for callback in self.callbacks.values():
            callback.on_validation_epoch_start()

        validation_iterator = iter(
            self.validation_loader) if self.validation_loader else None
        for idx in range(cast(int, self.num_validation_batches)):
            num_inputs += cast(int, self.validation_batch_size)
            # Note that when using pipeline parallelism, each call to evaluate_batch will request
            # self.context.num_micro_batches_per_slot batches from the validation iterator.
            # This is why we set self.num_validation_batches differently for pipeline-parallel
            # and non-pipeline-parallel runs when building the data loaders.
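            # For example (illustrative numbers only): with 64 micro batches from the loader
            # and num_micro_batches_per_slot == 4, only 16 evaluate_batch calls are needed
            # under pipeline parallelism.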
            vld_metrics = self.trial.evaluate_batch(validation_iterator, idx)
            if self.context._mpu.should_report_metrics:
                if not isinstance(vld_metrics, dict):
                    raise det.errors.InvalidExperimentException(
                        "evaluate_batch must return a dictionary of string names "
                        "to Tensor metrics", )
                # Verify validation metric names are the same across batches.
                if keys is None:
                    keys = vld_metrics.keys()
                else:
                    if keys != vld_metrics.keys():
                        raise det.errors.InvalidExperimentException(
                            "Validation metric names must match across all batches of data.",
                        )
                # TODO: For performance, perform the -> cpu() transfer only at the end of validation.
                batch_metrics.append(
                    pytorch._convert_metrics_to_numpy(vld_metrics))
            if self.env.test_mode:
                break

        # `keys and list(keys)` does not cover all cases because it evaluates to the dict_keys
        # object itself when keys is empty. That breaks when passed to zmq_broadcast, which does
        # not know how to serialize the dict_keys type.
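        # For example, a plain list like ["loss", "accuracy"] serializes cleanly, whereas a
        # dict_keys view does not.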
        all_keys = self.context.distributed.gather(
            keys if keys is None else list(keys))
        if self.is_chief:
            all_keys = [k for k in all_keys if k is not None]
            keys = all_keys[0]
        keys = self.context.distributed.broadcast(keys)

        for callback in self.callbacks.values():
            callback.on_validation_epoch_end(batch_metrics)

        metrics = pytorch._reduce_metrics(
            self.context.distributed,
            batch_metrics=batch_metrics,
            keys=keys,
            metrics_reducers=pytorch._prepare_metrics_reducers(
                pytorch.Reducer.AVG, keys=keys),
        )
        metrics.update(
            pytorch._convert_metrics_to_numpy(
                self.context.reduce_metrics(for_training=False)))

        if self.context.distributed.size > 1 and any(
                util.is_overridden(c.on_validation_end,
                                   pytorch.PyTorchCallback)
                or util.is_overridden(c.on_validation_step_end,
                                      pytorch.PyTorchCallback)
                for c in self.callbacks.values()):
            logging.debug(
                "Broadcasting metrics to all worker processes to execute a "
                "validation step end callback")
            metrics = self.context.distributed.broadcast(metrics)

        for callback in self.callbacks.values():
            if util.is_overridden(callback.on_validation_step_end,
                                  pytorch.PyTorchCallback):
                logging.warning(
                    "on_validation_step_end is now deprecated, please use on_validation_end instead"
                )
                callback.on_validation_step_end(metrics)

        for callback in self.callbacks.values():
            callback.on_validation_end(metrics)

        if not self.is_chief:
            return {}

        num_inputs *= self.context._mpu.data_parallel_world_size
        step_duration = time.time() - step_start_time
        logging.info(
            det.util.make_timing_log("validated", step_duration, num_inputs,
                                     cast(int, self.num_validation_batches)))

        self.metric_writer.on_validation_step_end(self.steps_completed,
                                                  metrics)
        return {"num_inputs": num_inputs, "validation_metrics": metrics}

    def _train_for_step(self, step_id: int, num_batches: int,
                        total_batches_processed: int) -> workload.Response:
        self.prof.set_training(True)
        check.gt(step_id, 0)
        step_start_time = time.time()
        self.context.reset_reducers()

        # Set the behavior of certain layers (e.g., dropout) that are different
        # between training and inference.
        for model in self.context.models:
            model.train()

        start = total_batches_processed
        end = start + num_batches

        per_batch_metrics = []  # type: List[Dict]
        num_inputs = 0

        for batch_idx in range(start, end):
            self.steps_completed += 1
            batch_start_time = time.time()
            self.prof.update_batch_idx(batch_idx)
            with self.prof.record_timing("dataloader_next",
                                         requires_sync=False):
                batch = next(self.training_iterator)
            batch_inputs = self.trial.get_batch_length(batch)
            num_inputs += batch_inputs

            if self.context.experimental._auto_to_device:
                with self.prof.record_timing("to_device", accumulate=True):
                    batch = self.context.to_device(batch)

            self.context._current_batch_idx = batch_idx
            epoch_idx = self.get_epoch_idx(batch_idx)
            if self.context.is_epoch_start():
                for callback in self.callbacks.values():
                    with self.prof.record_timing(
                            f"callbacks.{callback.__class__.__name__}.on_training_epoch_start"
                    ):
                        sig = signature(callback.on_training_epoch_start)
                        if sig.parameters:
                            callback.on_training_epoch_start(epoch_idx)
                        else:
                            logging.warning(
                                "on_training_epoch_start() without parameters is deprecated"
                                " since 0.17.8. Please add epoch_idx parameter."
                            )
                            callback.on_training_epoch_start(
                            )  # type: ignore[call-arg]

            self.context._loss_ids = {}

            with self.prof.record_timing("train_batch", requires_sync=False):
                if self.context.profiler:
                    with self.context.profiler as torch_profiler:
                        tr_metrics = self.trial.train_batch(
                            batch=batch,
                            epoch_idx=epoch_idx,
                            batch_idx=batch_idx,
                        )
                        torch_profiler.step()
                else:
                    tr_metrics = self.trial.train_batch(
                        batch=batch,
                        epoch_idx=epoch_idx,
                        batch_idx=batch_idx,
                    )
            if self._should_update_scaler():
                self.context._scaler.update()
            if isinstance(tr_metrics, torch.Tensor):
                tr_metrics = {"loss": tr_metrics}
            check.is_instance(
                tr_metrics,
                dict,
                "train_batch() must return a dictionary "
                f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
            )

            # Step the learning rate of each pytorch.LRScheduler.
            with self.prof.record_timing("step_lr_schedulers"):
                for lr_scheduler in self.context.lr_schedulers:
                    self._auto_step_lr_scheduler_per_batch(
                        batch_idx, lr_scheduler)

            with self.prof.record_timing("from_device"):
                for name, metric in tr_metrics.items():
                    # Convert PyTorch metric values to NumPy, so that
                    # `det.util.encode_json` handles them properly without
                    # needing a dependency on PyTorch.
                    if isinstance(metric, torch.Tensor):
                        metric = metric.cpu().detach().numpy()
                    tr_metrics[name] = metric

            batch_dur = time.time() - batch_start_time
            samples_per_second = batch_inputs / batch_dur
            samples_per_second *= self.context.distributed.size
            self.prof.record_metric("samples_per_second", samples_per_second)
            per_batch_metrics.append(tr_metrics)

            if self.context.is_epoch_end():
                for callback in self.callbacks.values():
                    with self.prof.record_timing(
                            f"callbacks.{callback.__class__.__name__}.on_training_epoch_end"
                    ):
                        callback.on_training_epoch_end(epoch_idx)

        # Aggregate and reduce training metrics from all the training processes.
        if self.context.distributed.size > 1 and self.context._average_training_metrics:
            with self.prof.record_timing("average_training_metrics"):
                per_batch_metrics = pytorch._combine_and_average_training_metrics(
                    self.context.distributed, per_batch_metrics)
        num_inputs *= self.context.distributed.size
        metrics = det.util.make_metrics(num_inputs, per_batch_metrics)

        # Ignore batch_metrics entirely for custom reducers; there's no guarantee that per-batch
        # metrics are even logical for a custom reducer.
        with self.prof.record_timing("reduce_metrics"):
            metrics["avg_metrics"].update(
                pytorch._convert_metrics_to_numpy(
                    self.context.reduce_metrics(for_training=True)))

        if not self.is_chief:
            # The training metrics are reported only in the chief process.
            return {}

        step_duration = time.time() - step_start_time
        logging.info(
            det.util.make_timing_log("trained", step_duration, num_inputs,
                                     num_batches))

        return metrics
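
The per-batch scheduler stepping above (_auto_step_lr_scheduler_per_batch) follows the standard
PyTorch pattern of calling scheduler.step() after each optimizer step. A minimal, self-contained
sketch with a plain torch.optim scheduler (not Determined's pytorch.LRScheduler wrapper; the toy
model and step_size are illustrative assumptions):

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# Decay the learning rate by 10x every 100 batches.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

for batch_idx in range(300):
    optimizer.zero_grad()
    loss = model(torch.randn(8, 4)).sum()
    loss.backward()
    optimizer.step()
    scheduler.step()  # stepped once per batch, mirroring the controller loop above

print(optimizer.param_groups[0]["lr"])  # ~1e-4 after three decays
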
Example #4
    def _train_for_step(self, step_id: int, num_batches: int,
                        total_batches_processed: int) -> workload.Response:
        """
        DeepSpeed allows specifying train_batch_size, train_micro_batch_size_per_gpu, and
        gradient_accumulation_steps. The three are related as follows:
        train_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps
        * data_parallel_world_size. Hence, if two of them are specified, the third can be inferred.

        For pipeline parallel training, DeepSpeed will automatically interleave
        gradient_accumulation_steps worth of micro batches in one train_batch/eval_batch call.

        With the default DeepSpeed model engine (no pipeline parallel training), the backward
        and optimizer step calls track micro batches and will automatically update model weights
        and lr scheduler if micro batches % gradient_accumulation_steps == 0.

        Comparing training with and without pipeline parallel is a common goal.  Since DeepSpeed's
        PipelineEngine trains on a number of micro batches equal to gradient accumulation steps,
        we automatically perform gradient accumulation by default when pipeline parallelism is not
        enabled.  This makes it fair to compare training with and without pipeline parallelism
        at a given batch idx. This can be turned off by setting
        context.disable_auto_grad_accumulation.
        """
        self.prof.set_training(True)
        assert step_id > 0, "step_id should be greater than 0"
        step_start_time = time.time()
        self.context.reset_reducers()

        # Set the behavior of certain layers (e.g., dropout) that are different
        # between training and inference.
        for model in self.context.models:
            model.train()

        start = total_batches_processed
        end = start + num_batches

        per_batch_metrics = []  # type: List[Dict]
        num_inputs = 0

        for batch_idx in range(start, end):
            self.steps_completed += 1
            self.prof.update_batch_idx(batch_idx)
            batch_start_time = time.time()
            self.context._current_batch_idx = batch_idx
            if self.context.is_epoch_start():
                for callback in self.callbacks.values():
                    with self.prof.record_timing(
                            f"callbacks.{callback.__class__.__name__}.on_training_epoch_start"
                    ):
                        callback.on_training_epoch_start(
                            self.get_epoch_idx(batch_idx))
            # This can be inaccurate if the user's data loader does not return batches with
            # the micro batch size.  It is also slightly inaccurate if the data loader can return
            # partial batches.  The same sort of assumptions are made in the DeepSpeed
            # model engine's accounting and profiling computations.
            batch_inputs = (self.context.train_micro_batch_size_per_gpu *
                            self.context.num_micro_batches_per_slot)
            num_inputs += batch_inputs
            num_train_batch_calls = self.context.num_micro_batches_per_slot
            if self.context.use_pipeline_parallel or self.context._manual_grad_accumulation:
                num_train_batch_calls = 1
            self.context._loss_ids = {}
            for _ in range(num_train_batch_calls):
                with self.prof.record_timing("train_batch",
                                             requires_sync=False,
                                             accumulate=True):
                    tr_metrics = self.trial.train_batch(
                        self.training_iterator,
                        self.get_epoch_idx(batch_idx),
                        batch_idx,
                    )
                if self.context._mpu.should_report_metrics:
                    if isinstance(tr_metrics, torch.Tensor):
                        tr_metrics = {"loss": tr_metrics}
                    if not isinstance(tr_metrics, dict):
                        raise det.errors.InvalidExperimentException(
                            "train_batch must return a dictionary "
                            f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
                        )

                    for name, metric in tr_metrics.items():
                        # Convert PyTorch metric values to NumPy, so that
                        # `det.util.encode_json` handles them properly without
                        # needing a dependency on PyTorch.
                        if isinstance(metric, torch.Tensor):
                            metric = metric.cpu().detach().numpy()
                        tr_metrics[name] = metric
                    per_batch_metrics.append(tr_metrics)
            # We do a check here to make sure that we do indeed process `num_micro_batches_per_slot`
            # micro batches when training a batch for models that do not use pipeline parallelism.
            model0 = self.context.models[0]
            if not isinstance(model0, deepspeed.PipelineEngine):
                assert (model0.micro_steps %
                        self.context.num_micro_batches_per_slot == 0
                        ), "did not train for gradient accumulation steps"

            batch_dur = time.time() - batch_start_time
            samples_per_second = batch_inputs / batch_dur
            samples_per_second *= self.context._mpu.data_parallel_world_size
            self.prof.record_metric("samples_per_second", samples_per_second)

            if self.context.is_epoch_end():
                for callback in self.callbacks.values():
                    with self.prof.record_timing(
                            f"callbacks.{callback.__class__.__name__}.on_training_epoch_end"
                    ):
                        callback.on_training_epoch_end(
                            self.get_epoch_idx(batch_idx))

        # Aggregate and reduce training metrics from all the training processes.
        if self.context.distributed.size > 1 and self.context._average_training_metrics:
            with self.prof.record_timing("average_training_metrics"):
                per_batch_metrics = pytorch._combine_and_average_training_metrics(
                    self.context.distributed, per_batch_metrics)
        num_inputs *= self.context._mpu.data_parallel_world_size
        metrics = det.util.make_metrics(num_inputs, per_batch_metrics)

        # Ignore batch_metrics entirely for custom reducers; there's no guarantee that per-batch
        # metrics are even logical for a custom reducer.
        with self.prof.record_timing("reduce_metrics"):
            metrics["avg_metrics"].update(
                pytorch._convert_metrics_to_numpy(
                    self.context.reduce_metrics(for_training=True)))

        if not self.is_chief:
            # The training metrics are reported only in the chief process.
            return {}

        step_duration = time.time() - step_start_time
        logging.info(
            det.util.make_timing_log("trained", step_duration, num_inputs,
                                     num_batches))
        self.prof.set_training(False)

        self.metric_writer.on_train_step_end(
            self.steps_completed,
            metrics["avg_metrics"],
            metrics["batch_metrics"],
        )
        return metrics
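
The automatic gradient accumulation described in the docstring above amounts to the standard
PyTorch pattern sketched below: accumulate gradients over several micro batches, then take a
single optimizer step. This is a generic, self-contained illustration with made-up sizes, not
DeepSpeed's engine or Determined's wrapper:

import torch

model = torch.nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
gradient_accumulation_steps = 8  # illustrative; DeepSpeed reads this from its config
micro_batch_size = 4             # illustrative train_micro_batch_size_per_gpu

optimizer.zero_grad()
for micro_step in range(gradient_accumulation_steps):
    inputs = torch.randn(micro_batch_size, 10)
    targets = torch.randn(micro_batch_size, 1)
    # Scale each micro-batch loss so the accumulated gradient matches one large batch.
    loss = torch.nn.functional.mse_loss(model(inputs), targets) / gradient_accumulation_steps
    loss.backward()  # gradients accumulate across micro batches

optimizer.step()       # one weight update per effective (accumulated) batch
optimizer.zero_grad()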