def test_multi_gpu_model_dp(tmpdir):
    """
    Make sure DP works
    :return:
    """
    tutils.reset_seed()

    if not tutils.can_run_gpu_test():
        return

    model, hparams = tutils.get_model()
    trainer_options = dict(
        default_save_path=tmpdir,
        show_progress_bar=False,
        distributed_backend='dp',
        max_nb_epochs=1,
        train_percent_check=0.1,
        val_percent_check=0.1,
        gpus='-1'
    )

    tutils.run_model_test(trainer_options, model, hparams)

    # test memory helper functions
    memory.get_memory_profile('min_max')
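The final line above only smoke-tests memory.get_memory_profile. As a rough illustration of what a 'min_max' profile could compute, here is a minimal sketch built on torch.cuda; the function name and the min_gpu_mem/max_gpu_mem keys are assumptions of the sketch, not Lightning's actual implementation:

import torch

def get_memory_profile_sketch(mode):
    # Per-GPU allocated memory in MiB, keyed by device index.
    usage = {
        f'gpu_{i}': torch.cuda.memory_allocated(i) / 2**20
        for i in range(torch.cuda.device_count())
    }
    if not usage:
        return {}
    if mode == 'min_max':
        # Reduce the per-GPU map to just the two extremes.
        return {'min_gpu_mem': min(usage.values()),
                'max_gpu_mem': max(usage.values())}
    return usage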
Example No. 2
    def log_metrics(self, metrics, grad_norm_dic, step=None):
        """Logs the metric dict passed in.
        If the `step` parameter is None and a `step` key is present in `metrics`,
        uses metrics["step"] as the step.

        Args:
            metrics (dict): Metric values
            grad_norm_dic (dict): Gradient norms
            step (int): Step for which metrics should be logged. Default value corresponds to `self.global_step`
        """
        # add gpu memory
        if self.on_gpu and self.log_gpu_memory:
            mem_map = memory.get_memory_profile(self.log_gpu_memory)
            metrics.update(mem_map)

        # add norms
        metrics.update(grad_norm_dic)

        # turn all tensors to scalars
        scalar_metrics = self.metrics_to_scalars(metrics)

        if "step" in scalar_metrics and step is None:
            step = scalar_metrics.pop("step")
        else:
            # added metrics by Lightning for convenience
            scalar_metrics['epoch'] = self.current_epoch
            step = step if step is not None else self.global_step
        # log actual metrics
        if self.proc_rank == 0 and self.logger is not None:
            self.logger.log_metrics(scalar_metrics, step=step)
            self.logger.save()
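A hypothetical call site (the connector name is assumed) showing the two ways a step can reach this method:

# 'step' is popped out of the metrics dict, so this logs {'loss': 0.25}
# at step 10 without adding an 'epoch' entry.
connector.log_metrics({'loss': 0.25, 'step': 10}, grad_norm_dic={})

# An explicit step takes the else branch, so 'epoch' is added alongside 'loss'.
connector.log_metrics({'loss': 0.25}, grad_norm_dic={}, step=10)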
Example No. 3
    def log_metrics(self, metrics, grad_norm_dic, step=None):
        """Logs the metric dict passed in.

        :param metrics:
        :param grad_norm_dic:
        """
        # added metrics by Lightning for convenience
        metrics['epoch'] = self.current_epoch

        # add gpu memory
        if self.on_gpu and self.log_gpu_memory:
            mem_map = memory.get_memory_profile(self.log_gpu_memory)
            metrics.update(mem_map)

        # add norms
        metrics.update(grad_norm_dic)

        # turn all tensors to scalars
        scalar_metrics = self.metrics_to_scalars(metrics)

        step = step if step is not None else self.global_step
        # log actual metrics
        if self.proc_rank == 0 and self.logger is not None:
            self.logger.log_metrics(scalar_metrics, step=step)
            self.logger.save()
Example No. 4
    def log_metrics(self, metrics: Dict[str, _METRIC], step: Optional[int] = None) -> None:
        """Logs the metric dict passed in.
        If the `step` parameter is None and a `step` key is present in `metrics`,
        uses metrics["step"] as the step.

        Args:
            metrics: Metric values
            step: Step for which metrics should be logged. Default value is `self.global_step` during training or
                the total validation / test log step count during validation and testing.
        """
        if self.trainer.logger is None or not metrics:
            return

        # add gpu memory
        if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
            mem_map = memory.get_memory_profile(self.log_gpu_memory)
            metrics.update(mem_map)

        # turn all tensors to scalars
        scalar_metrics = metrics_to_scalars(metrics)

        if step is None:
            step = scalar_metrics.pop("step", None)
        if step is None:
            # added metrics for convenience
            scalar_metrics.setdefault("epoch", self.trainer.current_epoch)
            step = self.trainer.global_step

        # log actual metrics
        if self.trainer.is_global_zero:
            self.trainer.logger.agg_and_log_metrics(scalar_metrics, step=step)
            self.trainer.logger.save()

        self._logged_metrics.update(scalar_metrics)
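The tensor-to-scalar conversion that metrics_to_scalars performs is what lets tensor-valued metrics reach logger backends. A minimal sketch of the flat-dict case (the real helper presumably also handles nested collections):

import torch

def metrics_to_scalars_sketch(metrics):
    # Replace every single-element tensor with a plain Python number so
    # the logger receives JSON-serializable values.
    out = {}
    for name, value in metrics.items():
        if isinstance(value, torch.Tensor):
            value = value.item()  # raises for tensors with more than one element
        out[name] = value
    return out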
Example No. 5
def test_multi_gpu_model_ddp_spawn(tmpdir):
    """Make sure DDP (spawn) works."""
    tutils.set_random_master_port()

    trainer_options = dict(default_root_dir=tmpdir,
                           max_epochs=1,
                           limit_train_batches=10,
                           limit_val_batches=10,
                           gpus=[0, 1],
                           distributed_backend='ddp_spawn',
                           progress_bar_refresh_rate=0)

    model = EvalModelTemplate()

    tpipes.run_model_test(trainer_options, model)

    # test memory helper functions
    memory.get_memory_profile('min_max')
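tpipes.run_model_test is a shared test helper; its core is the same build-fit-assert pattern that example No. 8 below inlines. A rough stand-in (the real helper presumably performs extra checks):

from pytorch_lightning import Trainer

def run_model_test_sketch(trainer_options, model):
    # Build a Trainer from the options dict, fit, and assert success
    # (fit returned a truthy result in these Lightning versions).
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result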
Example No. 6
def test_multi_gpu_model_dp(tmpdir):
    """Make sure DP works."""
    tutils.reset_seed()

    model, hparams = tutils.get_default_model()
    trainer_options = dict(default_save_path=tmpdir,
                           progress_bar_refresh_rate=0,
                           distributed_backend='dp',
                           max_epochs=1,
                           train_percent_check=0.1,
                           val_percent_check=0.1,
                           gpus='-1')

    tutils.run_model_test(trainer_options, model)

    # test memory helper functions
    memory.get_memory_profile('min_max')
Example No. 7
def test_multi_gpu_model_dp(tmpdir):
    """Make sure DP works."""
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        accelerator='dp',
        progress_bar_refresh_rate=0,
    )

    model = BoringModel()

    tpipes.run_model_test(trainer_options, model)

    # test memory helper functions
    memory.get_memory_profile('min_max')
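The tests select GPUs in two different ways. In the Lightning versions used here, the gpus argument accepts '-1' (or -1) for all visible GPUs as well as an explicit list of device indices; a side-by-side sketch (both require visible GPUs at runtime):

from pytorch_lightning import Trainer

# '-1' means "train on every visible GPU" (examples No. 1 and No. 6).
trainer_all = Trainer(gpus='-1', accelerator='dp', max_epochs=1)

# A list pins the run to specific device indices (examples No. 5 and No. 7).
trainer_two = Trainer(gpus=[0, 1], accelerator='dp', max_epochs=1)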
Example No. 8
def test_multi_gpu_model(tmpdir, backend):
    """Make sure DDP works."""
    tutils.set_random_master_port()

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        train_percent_check=0.4,
        val_percent_check=0.2,
        gpus=[0, 1],
        distributed_backend=backend,
    )

    model = EvalModelTemplate()
    # tutils.run_model_test(trainer_options, model)
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result

    # test memory helper functions
    memory.get_memory_profile('min_max')
Example No. 9
    def log_metrics(self,
                    metrics,
                    grad_norm_dic,
                    step=None,
                    log_train_step_metrics=False):
        """Logs the metric dict passed in.
        If `step` parameter is None and `step` key is presented is metrics,
        uses metrics["step"] as a step

        Args:
            metrics (dict): Metric values
            grad_norm_dic (dict): Gradient norms
            step (int): Step for which metrics should be logged. Default value corresponds to `self.global_step`
            log_train_step_metrics (bool): Used to track if log_metrics function is being called in during training steps.
                In training steps, we will log metrics on step: total_nb_idx (for accumulated gradients) and global_step for the rest.
        """
        # add gpu memory
        if self.trainer.on_gpu and self.trainer.log_gpu_memory:
            mem_map = memory.get_memory_profile(self.trainer.log_gpu_memory)
            metrics.update(mem_map)

        # add norms
        metrics.update(grad_norm_dic)

        # turn all tensors to scalars
        scalar_metrics = self.trainer.metrics_to_scalars(metrics)

        if "step" in scalar_metrics and step is None:
            step = scalar_metrics.pop("step")

        elif step is None:
            # added metrics by Lightning for convenience
            if log_train_step_metrics:
                step = self.trainer.total_batch_idx
            else:
                scalar_metrics['epoch'] = self.trainer.current_epoch
                step = self.trainer.global_step

        # log actual metrics
        if self.trainer.logger is not None:
            if self.trainer.is_global_zero:
                self.trainer.logger.agg_and_log_metrics(scalar_metrics,
                                                        step=step)
                self.trainer.logger.save()

            # track the logged metrics
            self.logged_metrics.update(scalar_metrics)
            self.trainer.dev_debugger.track_logged_metrics_history(
                scalar_metrics)
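Hypothetical call sites (the connector name is assumed) showing how the log_train_step_metrics flag switches the step source:

# During a training step: logs at trainer.total_batch_idx, which keeps
# advancing even while gradients are being accumulated.
connector.log_metrics({'train_loss': 0.5}, {}, log_train_step_metrics=True)

# Outside training steps: logs at trainer.global_step and adds an
# 'epoch' entry for convenience.
connector.log_metrics({'val_loss': 0.4}, {})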
Example No. 10
    def log_metrics(self, metrics, grad_norm_dict, step=None):
        """Logs the metric dict passed in.
        If the `step` parameter is None and a `step` key is present in `metrics`,
        uses metrics["step"] as the step.

        Args:
            metrics (dict): Metric values
            grad_norm_dict (dict): Gradient norms
            step (int): Step for which metrics should be logged. Default value is `self.global_step` during training or
                the total validation / test log step count during validation and testing.
        """
        # add gpu memory
        if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
            mem_map = memory.get_memory_profile(self.log_gpu_memory)
            metrics.update(mem_map)

        # add norms
        metrics.update(grad_norm_dict)

        # turn all tensors to scalars
        scalar_metrics = metrics_to_scalars(metrics)

        if "step" in scalar_metrics and step is None:
            step = scalar_metrics.pop("step")

        elif step is None:
            # added metrics by Lightning for convenience
            scalar_metrics['epoch'] = self.trainer.current_epoch
            step = self.trainer.global_step

        # log actual metrics
        if self.trainer.logger is not None:
            if self.trainer.is_global_zero:
                self.trainer.logger.agg_and_log_metrics(scalar_metrics,
                                                        step=step)
                self.trainer.logger.save()

            # track the logged metrics
            self.logged_metrics.update(scalar_metrics)
            self.trainer.dev_debugger.track_logged_metrics_history(
                scalar_metrics)
Example No. 11
    def gpus_metrics(self) -> Dict[str, str]:
        """Return the cached GPU memory metrics, refreshing them when running on GPU."""
        if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
            mem_map = memory.get_memory_profile(self.log_gpu_memory)
            self._gpus_metrics.update(mem_map)
        return self._gpus_metrics
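Assuming the accessor is exposed as a plain method here (in some versions it is a property), a hypothetical consumer looks like this; on CPU it simply returns whatever was cached before, which is initially empty:

gpu_stats = connector.gpus_metrics()
for name, value in gpu_stats.items():
    print(f'{name}: {value}')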