示例#1
0
def test_result_gather_different_shapes():
    """ Test that tensors of varying shape get gathered into a list. """
    outputs = [
        {"foo": torch.tensor(1)},
        {"foo": torch.zeros(2, 3)},
        {"foo": torch.zeros(1, 2, 3)},
    ]
    result = Result.gather(outputs)
    gathered = result["foo"]
    expected = [torch.tensor(1), torch.zeros(2, 3), torch.zeros(1, 2, 3)]

    assert isinstance(gathered, list)
    # zip() silently stops at the shorter sequence, so pin the length first
    assert len(gathered) == len(expected)
    # torch.equal requires identical shapes; torch.eq would broadcast and let
    # e.g. a (2, 3) tensor of zeros compare equal to a (1, 2, 3) one
    assert all(torch.equal(r, e) for r, e in zip(gathered, expected))
示例#2
0
    def cache_result(self) -> None:
        """
        Store the result object of the hook that just ran.

        Called after every hook. The hook result is cached only when it holds
        more than one entry; the model is reset in every case.
        """
        hook_result = self.trainer.get_model()._results

        # name of the hook function and the dataloader it ran for
        fx_name, dataloader_idx = self.current_model_info()

        # cache only if anything has been logged
        # (default len is 1 due to _internals)
        something_logged = len(hook_result) > 1
        if something_logged:
            if fx_name not in self._internals:
                self._internals[fx_name] = HookResultStore(fx_name)

            extra_info = self.extra_info if self.has_split_and_opt_idx else {}

            # attach the captured batch_size to the result
            Result.attach_batch_size(self._batch_size, hook_result)

            hook_result.detach()
            if self.trainer.move_metrics_to_cpu:
                hook_result.cpu()

            self._internals[fx_name].append(
                hook_result, dataloader_idx=dataloader_idx, extra_info=extra_info
            )

            # update logged_metrics, progress_bar_metrics, callback_metrics
            self.update_logger_connector(fx_name)

        # reset _results, fx_name
        self.reset_model()
    def cache_result(self) -> None:
        """
        Store the result object of the hook that just ran.

        Called after every hook. When the hook result holds a single entry,
        only the model's current hook names are cleared and nothing is cached.
        """
        trainer = self.trainer
        with trainer.profiler.profile("cache_result"):
            model_ref = trainer.lightning_module

            # result object filled in by the hook
            hook_result = model_ref._results

            nothing_logged = len(hook_result) == 1
            if nothing_logged:
                model_ref._current_hook_fx_name = None
                model_ref._current_fx_name = ''
                return

            info = self.info
            fx_name = info["fx_name"]

            all_gather_fn = trainer.lightning_module.all_gather
            new_store = HookResultStore(fx_name, all_gather_fn, self._should_warn)
            self._internals.setdefault(fx_name, new_store)

            # attach the captured batch_size to the result
            Result.attach_batch_size(self._batch_size, hook_result)

            hook_result = hook_result.detach()
            if trainer.move_metrics_to_cpu:
                hook_result = hook_result.cpu()
            elif trainer._distrib_type == DistributedType.DP:
                target_device = torch.device("cuda", trainer.root_gpu)
                hook_result = hook_result.to(target_device)

            self._internals[fx_name].append(hook_result, info)

            # update logged_metrics, progress_bar_metrics, callback_metrics
            if "epoch_end" in fx_name:
                self.update_logger_connector()

            self.reset_model()
    def cache_result(self) -> None:
        """
        Store the result object of the hook that just ran.

        Called after every hook. When the hook result holds a single entry,
        only the model's current hook names are cleared and nothing is cached.
        """
        trainer = self.trainer
        with trainer.profiler.profile("cache_result"):
            model_ref = trainer.get_model()

            # result object filled in by the hook
            hook_result = model_ref._results

            nothing_logged = len(hook_result) == 1
            if nothing_logged:
                model_ref._current_hook_fx_name = None
                model_ref._current_fx_name = ''
                return

            # name of the hook function and the dataloader it ran for
            fx_name, dataloader_idx = self.current_model_info()

            self._internals.setdefault(fx_name, HookResultStore(fx_name))

            if self.has_split_and_opt_idx:
                extra_info = self.extra_info
            else:
                extra_info = {}

            # attach the captured batch_size to the result
            Result.attach_batch_size(self._batch_size, hook_result)

            hook_result.detach()
            if trainer.move_metrics_to_cpu:
                hook_result.cpu()
            elif trainer.use_dp:
                target_device = torch.device("cuda", trainer.root_gpu)
                hook_result.to(target_device)

            self._internals[fx_name].append(
                hook_result, dataloader_idx=dataloader_idx, extra_info=extra_info
            )

            # update logged_metrics, progress_bar_metrics, callback_metrics
            if "epoch_end" in fx_name:
                self.update_logger_connector()

            self.reset_model()
示例#5
0
def test_result_metric_integration():
    """
    Log three metrics with different ``on_step``/``on_epoch`` flags and check
    the batch-level and epoch-level log metrics produced by ``Result``.
    """
    metrics = {name: DummyMetric() for name in ('a', 'b', 'c')}
    # (on_step, on_epoch) flags used when logging each metric
    log_flags = {'a': (True, True), 'b': (False, True), 'c': (True, False)}
    steps = range(5)

    result = Result()

    for _ in range(3):
        for i in steps:
            for metric in metrics.values():
                metric(i)

            for name, (on_step, on_epoch) in log_flags.items():
                result.log(name, metrics[name], on_step=on_step, on_epoch=on_epoch)

            batch_log = result.get_batch_log_metrics()
            batch_expected = {"a_step": i, "a": i, "c": i}
            assert set(batch_log.keys()) == set(batch_expected)
            for key, value in batch_expected.items():
                assert value == batch_log[key]

        # sum of every value fed to the metrics during this epoch
        cumulative_sum = sum(steps)

        epoch_log = result.get_epoch_log_metrics()
        result.reset()

        # assert metric state reset to default values
        for metric in metrics.values():
            assert metric.x == metric._defaults['x']

        epoch_expected = {"b": cumulative_sum, "a_epoch": cumulative_sum}

        assert set(epoch_log.keys()) == set(epoch_expected)
        for key, value in epoch_expected.items():
            assert value == epoch_log[key]
    def training_step(self, split_batch, batch_idx, opt_idx, hiddens):
        """
        Run one training step and package its outputs.

        Returns ``None`` when the processed step output for epoch end is
        ``None``; otherwise an ``AttributeDict`` holding the closure loss, a
        detached copy of it, both step outputs and the hiddens. Both loss
        entries stay ``None`` when automatic optimization is disabled.
        """
        trainer = self.trainer
        # give the PL module a result for logging
        model_ref = trainer.get_model()

        with trainer.profiler.profile("model_forward"):
            args = self.build_train_args(split_batch, batch_idx, opt_idx, hiddens)

            # manually capture logged metrics
            model_ref._current_fx_name = 'training_step'
            model_ref._results = Result()
            step_output = trainer.accelerator_backend.training_step(args)
            trainer.logger_connector.cache_logged_metrics()

            self._check_training_step_output(step_output)

            step_output = trainer.call_hook("training_step_end", step_output)

            epoch_end_output, step_output = self._process_training_step_output(step_output, split_batch)
            if epoch_end_output is None:
                return None

        # enable empty loss when using manual opt
        closure_loss = untouched_loss = None

        if trainer.train_loop.automatic_optimization:
            if isinstance(step_output, Result):
                raw_loss = step_output.minimize
            else:
                raw_loss = step_output.batch_loss

            # accumulate loss
            # (if accumulate_grad_batches = 1 no effect)
            closure_loss = raw_loss / trainer.accumulate_grad_batches

            # the loss will get scaled for amp. avoid any modifications to it
            untouched_loss = closure_loss.detach().clone()

        return AttributeDict(
            closure_loss=closure_loss,
            loss=untouched_loss,
            training_step_output=step_output,
            training_step_output_for_epoch_end=epoch_end_output,
            hiddens=step_output.hiddens,
        )
def _ddp_test_fn(rank, worldsize):
    """
    Per-process body of the DDP metric test.

    Logs three metrics with different ``on_step``/``on_epoch`` flags and
    checks the batch-level values as well as the epoch-level values, which
    are expected to be scaled by ``worldsize``.
    """
    _setup_ddp(rank, worldsize)
    tensor = torch.tensor([1.0])  # not used below

    metrics = {name: DummyMetric() for name in ('a', 'b', 'c')}
    # (on_step, on_epoch) flags used when logging each metric
    log_flags = {'a': (True, True), 'b': (False, True), 'c': (True, False)}
    steps = range(5)

    # ddp_sync_on_step is False by default
    result = Result()

    for _ in range(3):
        for i in steps:
            for metric in metrics.values():
                metric(i)

            for name, (on_step, on_epoch) in log_flags.items():
                result.log(name, metrics[name], on_step=on_step, on_epoch=on_epoch)

            batch_log = result.get_batch_log_metrics()
            batch_expected = {"a_step": i, "a": i, "c": i}
            assert set(batch_log.keys()) == set(batch_expected)
            for key, value in batch_expected.items():
                assert value == batch_log[key]

        # sum of every value fed to the metrics during this epoch
        cumulative_sum = sum(steps)

        epoch_log = result.get_epoch_log_metrics()

        # assert metric state reset to default values
        for metric in metrics.values():
            assert metric.x == metric._defaults['x']

        synced_total = cumulative_sum * worldsize
        epoch_expected = {"b": synced_total, "a": synced_total, "a_epoch": synced_total}

        assert set(epoch_log.keys()) == set(epoch_expected)
        for key, value in epoch_expected.items():
            assert value == epoch_log[key]
示例#8
0
    def __run_eval_epoch_end(self, num_dataloaders, using_eval_result):
        """
        Run the user's ``test_epoch_end`` / ``validation_epoch_end`` hook, if
        overridden, on the collected evaluation outputs.

        Args:
            num_dataloaders: number of evaluation dataloaders; with exactly one,
                the hook receives ``self.outputs[0]`` instead of the full list.
            using_eval_result: whether the step outputs are result objects that
                need gathering (hook overridden) or auto-reduction (no hook).

        Returns:
            The evaluation results, always wrapped in a list.
        """
        model = self.trainer.get_model()

        # reset results
        model._results = Result()

        outputs = self.outputs
        # with a single dataloader don't pass an array
        eval_results = outputs[0] if num_dataloaders == 1 else outputs

        # the hook to run depends only on the current mode
        hook_name = 'test_epoch_end' if self.testing else 'validation_epoch_end'

        user_reduced = bool(is_overridden(hook_name, model=model))
        if user_reduced:
            model._current_fx_name = hook_name
            if using_eval_result:
                eval_results = self.__gather_epoch_end_eval_results(outputs)

            eval_results = getattr(model, hook_name)(eval_results)

        # deprecation warning: report the real hook name (`test_epoch_end`,
        # not `testing_epoch_end`) so users can find the method to change
        if eval_results is not None and user_reduced:
            self.warning_cache.warn(
                f'The {hook_name} should not return anything as of 9.1.'
                ' To log, use self.log(...) or self.write(...) directly in the LightningModule'
            )

        if using_eval_result and not user_reduced:
            eval_results = self.__auto_reduce_result_objs(outputs)

        if not isinstance(eval_results, list):
            eval_results = [eval_results]

        return eval_results
示例#9
0
def test_result_gather_stack():
    """ Test that tensors get concatenated when they all have the same shape. """
    outputs = [{"foo": torch.zeros(4, 5)} for _ in range(3)]

    gathered = Result.gather(outputs)["foo"]

    assert isinstance(gathered, torch.Tensor)
    # three (4, 5) tensors joined along the first dimension
    assert list(gathered.shape) == [12, 5]
示例#10
0
def test_result_gather_concatenate():
    """ Test that tensors get concatenated when they have varying size in first dimension. """
    first_dims = (4, 8, 3)
    outputs = [{"foo": torch.zeros(rows, 5)} for rows in first_dims]

    gathered = Result.gather(outputs)["foo"]

    assert isinstance(gathered, torch.Tensor)
    # 4 + 8 + 3 rows in total
    assert list(gathered.shape) == [15, 5]
示例#11
0
def test_result_gather_scalar():
    """ Test that 0-dim tensors get gathered and stacked correctly. """
    outputs = [{"foo": torch.tensor(value)} for value in (1, 2, 3)]

    gathered = Result.gather(outputs)["foo"]

    assert isinstance(gathered, torch.Tensor)
    # one entry per gathered 0-dim tensor
    assert list(gathered.shape) == [3]
示例#12
0
    def on_evaluation_batch_start(self, batch, batch_idx, dataloader_idx):
        """
        Prepare the model for an evaluation batch and fire the matching
        ``on_test_batch_start`` / ``on_validation_batch_start`` hook.
        """
        trainer = self.trainer

        # reset the result of the PL module
        model = trainer.get_model()
        model._results = Result()
        model._current_fx_name = 'evaluation_step'

        # set dataloader_idx and track batch_size
        trainer.logger_connector.on_evaluation_batch_start(
            self.testing, batch, dataloader_idx, self.num_dataloaders
        )

        hook_name = 'on_test_batch_start' if self.testing else 'on_validation_batch_start'
        trainer.call_hook(hook_name, batch, batch_idx, dataloader_idx)
 def log_metrics(self, result: Result, step_results: Dict[str, float],
                 step_type: str) -> None:
     """
     Log the overall loss, then for the "verb" and "noun" tasks the task loss
     followed by accuracy@1 and accuracy@5, each under ``<key>/<step_type>``.
     """
     # keys of step_results, in the order they are logged
     metric_keys = ["loss"]
     for task in ("verb", "noun"):
         metric_keys.append(f"{task}_loss")
         metric_keys.extend(f"{task}_accuracy@{k}" for k in (1, 5))

     for key in metric_keys:
         result.log(f"{key}/{step_type}", step_results[key])
示例#14
0
def test_result_gather_mixed_types():
    """ Test that a collection of mixed types gets gathered into a list. """
    values = (1.2, ["bar", None], torch.tensor(1))
    outputs = [{"foo": value} for value in values]

    gathered = Result.gather(outputs)["foo"]

    # built independently of the inputs so the comparison is by value
    expected = [1.2, ["bar", None], torch.tensor(1)]
    assert isinstance(gathered, list)
    assert gathered == expected
    def training_step(self, split_batch, batch_idx, opt_idx, hiddens):
        """
        Run one training step and package its outputs.

        Returns ``None`` when the processed step output for epoch end is
        ``None``; otherwise an ``AttributeDict`` holding the closure loss, a
        detached copy of it, both step outputs and the hiddens.
        """
        trainer = self.trainer

        # give the PL module a result for logging
        model = trainer.get_model()
        model._results = Result()
        model._current_fx_name = 'training_step'

        with trainer.profiler.profile('model_forward'):
            args = self.build_train_args(split_batch, batch_idx, opt_idx, hiddens)
            step_output = trainer.accelerator_backend.training_step(args)
            step_output = trainer.call_hook('training_step_end', step_output)

            epoch_end_output, step_output = self._process_training_step_output(step_output, split_batch)
            if epoch_end_output is None:
                return None

        if isinstance(step_output, Result):
            raw_loss = step_output.minimize
        else:
            raw_loss = step_output.batch_loss

        # accumulate loss
        # (if accumulate_grad_batches = 1 no effect)
        closure_loss = raw_loss / trainer.accumulate_grad_batches

        # the loss will get scaled for amp. avoid any modifications to it
        untouched_loss = closure_loss.detach().clone()

        return AttributeDict(
            closure_loss=closure_loss,
            loss=untouched_loss,
            training_step_output=step_output,
            training_step_output_for_epoch_end=epoch_end_output,
            hiddens=step_output.hiddens,
        )
示例#16
0
def test_sample_metadata_field() -> None:
    """
    Test that the string constant we use to identify the metadata field is really matching the
    field name in SampleWithMetadata
    """
    batch_size = 5
    spatial_shape = (6, 7, 8)
    image_like = torch.zeros((batch_size, *spatial_shape))
    labels = torch.zeros((batch_size, 2, *spatial_shape))

    sample = Sample(metadata=DummyPatientMetadata, image=image_like, mask=image_like, labels=labels)
    fields = vars(sample)

    assert len(fields) == 4
    assert SAMPLE_METADATA_FIELD in fields
    # Lightning attempts to determine the batch size by trying to find a tensor field in the sample.
    # This only works if any field other than Metadata is first.
    assert Result.unpack_batch_size(fields) == batch_size
示例#17
0
    def evaluation_step(self, batch: Any, batch_idx: int, dataloader_idx: int) -> Optional[STEP_OUTPUT]:
        """
        Run a single test or validation step through the accelerator and
        return its output.
        """
        # configure step_kwargs
        step_kwargs = self._build_kwargs(batch, batch_idx, dataloader_idx)

        model_ref = self.trainer.lightning_module
        model_ref._results = Result()

        # the step name doubles as the profiler action and accelerator method
        step_name = "test_step" if self.trainer.testing else "validation_step"
        model_ref._current_fx_name = step_name
        with self.trainer.profiler.profile(step_name):
            output = getattr(self.trainer.accelerator, step_name)(step_kwargs)

        # capture any logged information
        self.trainer.logger_connector.cache_logged_metrics()

        # track batch size for weighted average
        if isinstance(output, Result):
            output.track_batch_size(batch)

        return output
示例#18
0
 def on_train_split_start(self, split_idx: int, opt_idx: int, split_batch) -> None:
     """Record the split index, optimizer index and batch size of the split on the cached results."""
     cache = self.cached_results
     cache._split_idx = split_idx
     cache._opt_idx = opt_idx
     # batch size is extracted from the split batch itself
     cache._batch_size = Result.extract_batch_size(split_batch)
示例#19
0
 def on_evaluation_batch_start(self, testing, batch, dataloader_idx, num_dataloaders):
     """Set the model's current dataloader index and record the batch size of ``batch``."""
     model = self.trainer.get_model()

     # set dataloader_idx only if multiple ones
     if num_dataloaders > 1:
         model._current_dataloader_idx = dataloader_idx
     else:
         model._current_dataloader_idx = None

     # track batch_size
     self.cached_results._batch_size = Result.extract_batch_size(batch)
 def check_dataloader_idx(self, result: Result) -> bool:
     """Return whether the meta entry of the last key in ``result`` has a ``dataloader_idx`` set."""
     last_key = tuple(result.keys())[-1]
     key_meta = result["meta"][last_key]
     return key_meta["dataloader_idx"] is not None
示例#21
0
def test_result_retrieve_last_logged_item():
    """Check that a value logged on step and on epoch is retrievable under all three keys."""
    result = Result()
    result.log('a', 5., on_step=True, on_epoch=True)

    for key in ('a_epoch', 'a_step', 'a'):
        assert result[key] == 5.
示例#22
0
 def _reset_result_and_set_hook_fx_name(self, hook_name):
     model_ref = self.get_model()
     if model_ref is not None:
         # used to track current hook name called
         model_ref._results = Result()
         model_ref._current_hook_fx_name = hook_name
示例#23
0
 def check_dataloader_idx(self, result: Result) -> bool:
     """Return whether the meta entry of the last key in ``result`` has a ``dataloader_idx`` set."""
     last_key = list(result.keys())[-1]
     dataloader_idx = result["meta"][last_key]["dataloader_idx"]
     return dataloader_idx is not None