import os

import torch

from pytorch_lightning import Trainer
# Import paths below follow the PyTorch Lightning 0.9-era test suite these
# examples come from; they may differ in other versions.
from pytorch_lightning.core.step_result import TrainResult
from tests.base.deterministic_model import DeterministicModel


def test_training_step_result_log_step_and_epoch(tmpdir):
    """
    Tests that training_step alone can be used with TrainResult
    Makes sure metrics are routed to the pbar, loggers and loss accordingly

    Makes sure pbar and log metrics are reported on both step and epoch when requested
    """
    # enable internal debugging actions
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step
    model.training_step_end = None
    model.training_epoch_end = None
    model.val_dataloader = None

    epochs = 3
    batches = 2
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=batches,
        limit_val_batches=batches,
        row_log_interval=1,
        max_epochs=epochs,
        weights_summary=None,
    )
    trainer.fit(model)

    # make sure correct steps were called
    assert model.training_step_called
    assert not model.training_step_end_called
    assert not model.training_epoch_end_called

    # make sure the correct metrics were logged (one per step, plus one per-epoch aggregate)
    assert len(trainer.dev_debugger.logged_metrics) == (epochs * batches) + epochs
    epoch_metrics = trainer.dev_debugger.logged_metrics
    epoch_idx = -1
    for i_start in range(0, len(epoch_metrics), batches + 1):
        epoch_idx += 1
        epoch_outputs = epoch_metrics[i_start:i_start + batches + 1]
        mean_vals = {
            'epoch_step_epoch_log_and_pbar_acc1': [],
            'epoch_step_epoch_log_acc2': []
        }

        # make sure each batch logged the expected value
        for batch_idx in range(len(epoch_outputs) - 1):
            logged_metrics = epoch_outputs[batch_idx]

            expected_val_1 = (5 + batch_idx) * (epoch_idx + 1)
            expected_val_2 = (6 + batch_idx) * (epoch_idx + 1)
            mean_vals['epoch_step_epoch_log_and_pbar_acc1'].append(
                torch.tensor(expected_val_1).float())
            mean_vals['epoch_step_epoch_log_acc2'].append(
                torch.tensor(expected_val_2).float())

            assert logged_metrics['step_step_epoch_log_and_pbar_acc1'] == expected_val_1
            assert logged_metrics['step_step_epoch_log_acc2'] == expected_val_2
            assert 'step_epoch_pbar_acc3' not in logged_metrics
            assert len(logged_metrics) == 4

        # make sure the metrics for the epoch end are actual means (the default reduce fx) of all the batches
        epoch_end_metrics = epoch_outputs[-1]
        eval_1 = torch.stack(mean_vals['epoch_step_epoch_log_and_pbar_acc1']).mean()
        eval_2 = torch.stack(mean_vals['epoch_step_epoch_log_acc2']).mean()
        assert epoch_end_metrics['epoch_step_epoch_log_and_pbar_acc1'] == eval_1
        assert epoch_end_metrics['epoch_step_epoch_log_acc2'] == eval_2
        assert 'step_epoch_pbar_acc3' not in epoch_end_metrics
        assert len(epoch_end_metrics) == 4

    # make sure we are using the correct metrics for callbacks
    assert trainer.logger_connector.callback_metrics['checkpoint_on'] == 171

    # -------------------------------
    # VERIFY PBAR METRICS
    # -------------------------------
    # make sure pbar metrics are correct and log-only metrics did not leak in
    all_pbar_metrics = trainer.dev_debugger.pbar_added_metrics
    assert len(all_pbar_metrics) == (epochs * batches) + epochs

    epoch_idx = -1
    for i_start in range(0, len(all_pbar_metrics), batches + 1):
        epoch_idx += 1
        epoch_outputs = all_pbar_metrics[i_start:i_start + batches + 1]
        mean_vals = {
            'epoch_step_epoch_log_and_pbar_acc1': [],
            'epoch_step_epoch_pbar_acc3': []
        }

        # make sure each batch logged the expected value
        for batch_idx in range(len(epoch_outputs) - 1):
            logged_metrics = epoch_outputs[batch_idx]

            expected_val_1 = (5 + batch_idx) * (epoch_idx + 1)
            expected_val_2 = (7 + batch_idx) * (epoch_idx + 1)
            mean_vals['epoch_step_epoch_log_and_pbar_acc1'].append(
                torch.tensor(expected_val_1).float())
            mean_vals['epoch_step_epoch_pbar_acc3'].append(
                torch.tensor(expected_val_2).float())
            assert logged_metrics['step_step_epoch_log_and_pbar_acc1'] == expected_val_1
            assert logged_metrics['step_step_epoch_pbar_acc3'] == expected_val_2
            assert 'step_epoch_log_acc2' not in logged_metrics
            assert len(logged_metrics) == 3

        # make sure the metrics for the epoch end are actual means (the default reduce fx) of all the batches
        epoch_end_metrics = epoch_outputs[-1]
        eval_1 = torch.stack(mean_vals['epoch_step_epoch_log_and_pbar_acc1']).mean()
        eval_2 = torch.stack(mean_vals['epoch_step_epoch_pbar_acc3']).mean()
        assert epoch_end_metrics['epoch_step_epoch_log_and_pbar_acc1'] == eval_1
        assert epoch_end_metrics['epoch_step_epoch_pbar_acc3'] == eval_2
        assert 'step_epoch_log_acc2' not in epoch_end_metrics
        assert len(epoch_end_metrics) == 3

    # -----------------------------------------
    # make sure training outputs what is expected
    # -----------------------------------------
    # grab just the first batch (and its index) from the train dataloader
    for batch_idx, batch in enumerate(model.train_dataloader()):
        break

    out = trainer.train_loop.run_training_batch(batch, batch_idx, 0)
    assert out.signal == 0
    assert len(out.batch_log_metrics) == 2

    train_step_out = out.training_step_output_for_epoch_end
    assert len(train_step_out) == 1
    train_step_out = train_step_out[0][0]
    assert isinstance(train_step_out, TrainResult)

    assert 'minimize' in train_step_out
    assert 'step_step_epoch_log_and_pbar_acc1' in train_step_out
    assert 'step_step_epoch_log_acc2' in train_step_out
    assert 'epoch_step_epoch_log_and_pbar_acc1' in train_step_out
    assert 'epoch_step_epoch_log_acc2' in train_step_out

    # make sure the optimizer closure returns the correct things
    opt_closure_result = trainer.train_loop.training_step_and_backward(
        batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens)
    assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3)  # = 171, matching checkpoint_on above
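

# ---------------------------------------------------------------------------
# For reference: a minimal sketch of the kind of training_step the test above
# exercises. This is NOT the real DeterministicModel hook (its source is not
# shown here); the self.step() helper and the 5/6/7 base values are inferred
# from the assertions. With on_step=True and on_epoch=True, the 0.9-era
# TrainResult logs each value under a 'step_'-prefixed key every batch and
# under an 'epoch_'-prefixed, mean-reduced key at epoch end.
# ---------------------------------------------------------------------------
def _sketch_training_step_log_epoch_and_step(self, batch, batch_idx):
    acc = self.step(batch, batch_idx)  # assumed deterministic loss helper
    result = TrainResult(minimize=acc)
    val_1 = torch.tensor((5.0 + batch_idx) * (self.current_epoch + 1))
    val_2 = torch.tensor((6.0 + batch_idx) * (self.current_epoch + 1))
    val_3 = torch.tensor((7.0 + batch_idx) * (self.current_epoch + 1))
    # logger + pbar -> appears in both logged and pbar metrics
    result.log('step_epoch_log_and_pbar_acc1', val_1,
               on_step=True, on_epoch=True, prog_bar=True, logger=True)
    # logger only -> must never leak into the pbar metrics
    result.log('step_epoch_log_acc2', val_2, on_step=True, on_epoch=True)
    # pbar only -> must never leak into the logged metrics
    result.log('step_epoch_pbar_acc3', val_3,
               on_step=True, on_epoch=True, logger=False, prog_bar=True)
    return result
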
def test_training_step_result_log_epoch_only(tmpdir):
    """
    Tests that training_step alone can be used with TrainResult
    Makes sure metrics are routed to the pbar, loggers and loss accordingly

    Makes sure pbar and log metrics happen on epoch only when requested
    """
    # enable internal debugging actions
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_only
    model.training_step_end = None
    model.training_epoch_end = None
    model.val_dataloader = None

    epochs = 3
    batches = 2
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=batches,
        limit_val_batches=batches,
        row_log_interval=1,
        max_epochs=epochs,
        weights_summary=None,
    )
    trainer.fit(model)

    # make sure correct steps were called
    assert model.training_step_called
    assert not model.training_step_end_called
    assert not model.training_epoch_end_called

    assert len(trainer.logger_connector.callback_metrics) == 12

    # make sure the correct number of metrics was logged (one per epoch, as requested)
    assert len(trainer.dev_debugger.logged_metrics) == epochs
    epoch_metrics = trainer.dev_debugger.logged_metrics
    for epoch_idx, logged_metrics in enumerate(epoch_metrics):
        assert logged_metrics[f'epoch_log_and_pbar_acc1_e{epoch_idx}'] == 14.0
        assert logged_metrics[f'epoch_log_acc2_e{epoch_idx}'] == 15.0
        assert f'epoch_pbar_acc3_e{epoch_idx}' not in logged_metrics
        assert len(logged_metrics) == 4

    # make sure we are using the correct metrics for callbacks
    assert trainer.logger_connector.callback_metrics['checkpoint_on'] == 171

    # make sure pbar metrics are correct and log-only metrics did not leak in
    pbar_metrics = trainer.logger_connector.progress_bar_metrics
    for epoch_idx in range(epochs):
        assert pbar_metrics[f'epoch_log_and_pbar_acc1_e{epoch_idx}'] == 14
        assert pbar_metrics[f'epoch_pbar_acc3_e{epoch_idx}'] == 16
        assert f'epoch_log_acc2_e{epoch_idx}' not in pbar_metrics

    # make sure training outputs what is expected
    # grab just the first batch (and its index) from the train dataloader
    for batch_idx, batch in enumerate(model.train_dataloader()):
        break

    out = trainer.train_loop.run_training_batch(batch, batch_idx, 0)
    assert out.signal == 0
    assert len(out.batch_log_metrics) == 0

    train_step_out = out.training_step_output_for_epoch_end
    assert len(train_step_out) == 1
    train_step_out = train_step_out[0][0]
    assert isinstance(train_step_out, TrainResult)

    assert 'minimize' in train_step_out
    assert f'epoch_log_and_pbar_acc1_e{trainer.current_epoch}' in train_step_out
    assert f'epoch_log_acc2_e{trainer.current_epoch}' in train_step_out

    # make sure the optimizer closure returns the correct things
    opt_closure_result = trainer.train_loop.training_step_and_backward(
        batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens)
    assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3)
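

# ---------------------------------------------------------------------------
# For reference: a hedged sketch of an epoch-only training_step (again, not
# the real DeterministicModel hook). With on_step=False and on_epoch=True,
# nothing is logged per batch; a single mean-reduced value is reported per
# epoch. Embedding self.current_epoch in the key reproduces the
# f'..._e{epoch}' names asserted above; the 14/15/16 values come from the
# assertions.
# ---------------------------------------------------------------------------
def _sketch_training_step_log_epoch_only(self, batch, batch_idx):
    acc = self.step(batch, batch_idx)  # assumed deterministic loss helper
    result = TrainResult(minimize=acc)
    result.log(f'epoch_log_and_pbar_acc1_e{self.current_epoch}', torch.tensor(14.0),
               on_step=False, on_epoch=True, prog_bar=True, logger=True)
    result.log(f'epoch_log_acc2_e{self.current_epoch}', torch.tensor(15.0),
               on_step=False, on_epoch=True, logger=True)
    result.log(f'epoch_pbar_acc3_e{self.current_epoch}', torch.tensor(16.0),
               on_step=False, on_epoch=True, logger=False, prog_bar=True)
    return result
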
def test_val_step_epoch_step_metrics(tmpdir):
    """
    Make sure the logged + pbar metrics are routed to the logger and progress bar correctly at every step when requested
    """
    # enable internal debugging actions
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks
    model.training_step_end = None
    model.training_epoch_end = None
    model.validation_step = model.validation_step_result_epoch_step_metrics
    model.validation_step_end = None
    model.validation_epoch_end = None

    batches = 3
    epochs = 3
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        row_log_interval=1,
        limit_train_batches=batches,
        limit_val_batches=batches,
        weights_summary=None,
    )
    trainer.fit(model)

    # make sure correct steps were called
    assert model.validation_step_called
    assert not model.validation_step_end_called
    assert not model.validation_epoch_end_called

    # no early stopping
    assert len(trainer.dev_debugger.early_stopping_history) == 0

    # make sure we logged the exact number of metrics
    assert len(trainer.dev_debugger.logged_metrics) == epochs * batches + epochs
    assert len(trainer.dev_debugger.pbar_added_metrics) == epochs * batches + epochs

    # make sure we logged the correct epoch metrics
    logged = trainer.dev_debugger.logged_metrics
    for metric_idx in range(0, len(logged), batches + 1):
        batch_metrics = logged[metric_idx:metric_idx + batches]
        epoch_metric = logged[metric_idx + batches]
        epoch = epoch_metric['epoch']

        # make sure the metric was split
        for batch_metric in batch_metrics:
            assert f'step_val_step_log_acc/epoch_{epoch}' in batch_metric
            assert f'step_val_step_log_pbar_acc/epoch_{epoch}' in batch_metric

        # make sure the epoch split was correct
        assert 'epoch_val_step_log_acc' in epoch_metric
        assert 'epoch_val_step_log_pbar_acc' in epoch_metric

    # make sure we logged the correct pbar metrics
    pbar_added = trainer.dev_debugger.pbar_added_metrics
    for metric_idx in range(0, len(pbar_added), batches + 1):
        batch_metrics = pbar_added[metric_idx:metric_idx + batches]
        epoch_metric = pbar_added[metric_idx + batches]

        # make sure the metric was split
        for batch_metric in batch_metrics:
            assert 'step_val_step_pbar_acc' in batch_metric
            assert 'step_val_step_log_pbar_acc' in batch_metric

        # make sure the epoch split was correct
        assert 'epoch_val_step_pbar_acc' in epoch_metric
        assert 'epoch_val_step_log_pbar_acc' in epoch_metric

    # only 1 checkpoint expected, since the monitored value never changed after the first epoch
    assert len(trainer.dev_debugger.checkpoint_callback_history) == 1

    # make sure the last known metric is correct
    assert trainer.callback_metrics['val_checkpoint_on'] == 171
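

# ---------------------------------------------------------------------------
# For reference: a hedged sketch of the validation_step exercised above (not
# the real DeterministicModel hook; the 11/12/13 values are placeholders). An
# EvalResult with checkpoint_on set is what feeds the 'val_checkpoint_on'
# callback metric; leaving early_stop_on unset matches the empty
# early-stopping history asserted above. Step+epoch logging yields the
# 'step_<name>/epoch_{n}' and 'epoch_<name>' keys the test checks.
# ---------------------------------------------------------------------------
from pytorch_lightning.core.step_result import EvalResult  # assumed 0.9-era path


def _sketch_validation_step_epoch_step_metrics(self, batch, batch_idx):
    acc = self.step(batch, batch_idx)  # assumed deterministic loss helper
    result = EvalResult(checkpoint_on=acc)
    result.log('val_step_log_acc', torch.tensor(11.0), on_step=True, on_epoch=True)
    result.log('val_step_log_pbar_acc', torch.tensor(12.0),
               on_step=True, on_epoch=True, prog_bar=True)
    result.log('val_step_pbar_acc', torch.tensor(13.0),
               on_step=True, on_epoch=True, logger=False, prog_bar=True)
    return result
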
def test_training_step_epoch_end_result(tmpdir):
    """
    Makes sure training_step and training_epoch_end can be used with Results (without training_step_end)
    """
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step
    model.training_epoch_end = model.training_epoch_end_return_for_log_epoch_and_step
    model.val_dataloader = None

    batches = 3
    epochs = 1
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        row_log_interval=1,
        limit_train_batches=batches,
        weights_summary=None,
    )
    trainer.fit(model)

    assert len(trainer.logger_connector.callback_metrics) == 11

    # make sure correct steps were called
    assert model.training_step_called
    assert not model.training_step_end_called
    assert model.training_epoch_end_called

    # make sure correct metrics were logged
    logged_metrics = trainer.dev_debugger.logged_metrics
    assert len(logged_metrics) == (epochs * batches) + epochs
    last_logged = logged_metrics[-1]

    assert last_logged['epoch_step_epoch_log_and_pbar_acc1'] == 210.0
    assert last_logged['epoch_step_epoch_log_acc2'] == 336.0
    assert last_logged['epoch_epoch_end_log_acc'] == 1212.0
    assert last_logged['epoch_epoch_end_log_pbar_acc'] == 1214.0
    assert 'epoch_end_pbar_acc' not in last_logged

    # make sure pbar metrics are correct
    logged_pbar = trainer.dev_debugger.pbar_added_metrics
    assert len(logged_pbar) == (epochs * batches) + epochs

    pbar_metrics = trainer.logger_connector.progress_bar_metrics
    assert pbar_metrics['epoch_step_epoch_log_and_pbar_acc1'] == 210.0
    assert pbar_metrics['step_step_epoch_log_and_pbar_acc1'] == 7.0
    assert pbar_metrics['epoch_step_epoch_pbar_acc3'] == 504.0
    assert pbar_metrics['epoch_epoch_end_pbar_acc'] == 1213.0
    assert pbar_metrics['epoch_epoch_end_log_pbar_acc'] == 1214.0
    assert 'epoch_end_log_acc' not in pbar_metrics
    assert 'log_acc2' not in pbar_metrics

    # make sure callback metrics didn't change
    assert trainer.logger_connector.callback_metrics['checkpoint_on'] == 171

    # -----------------------------------------
    # make sure training outputs what is expected
    # -----------------------------------------
    # grab just the first batch (and its index) from the train dataloader
    for batch_idx, batch in enumerate(model.train_dataloader()):
        break

    out = trainer.train_loop.run_training_batch(batch, batch_idx, 0)
    assert out.signal == 0
    assert len(out.batch_log_metrics) == 2

    train_step_out = out.training_step_output_for_epoch_end
    assert len(train_step_out) == 1
    train_step_out = train_step_out[0][0]
    assert isinstance(train_step_out, TrainResult)

    assert 'minimize' in train_step_out
    assert 'step_step_epoch_log_and_pbar_acc1' in train_step_out
    assert 'epoch_step_epoch_log_and_pbar_acc1' in train_step_out
    assert 'step_step_epoch_log_acc2' in train_step_out
    assert 'epoch_step_epoch_log_acc2' in train_step_out

    # make sure the optimizer closure returns the correct things
    opt_closure_result = trainer.train_loop.training_step_and_backward(
        batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens)
    assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3)
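

# ---------------------------------------------------------------------------
# For reference: a hedged sketch of a training_epoch_end hook that logs with
# Results (not the real DeterministicModel hook). The `result` argument is the
# TrainResult reduced over the epoch's steps; metrics logged here are reported
# once per epoch under 'epoch_'-prefixed keys. The 1212/1213/1214 values come
# from the assertions above.
# ---------------------------------------------------------------------------
def _sketch_training_epoch_end(self, result):
    result.log('epoch_end_log_acc', torch.tensor(1212.0), logger=True, prog_bar=False)
    result.log('epoch_end_pbar_acc', torch.tensor(1213.0), logger=False, prog_bar=True)
    result.log('epoch_end_log_pbar_acc', torch.tensor(1214.0), logger=True, prog_bar=True)
    return result
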
def test_val_step_only_step_metrics(tmpdir):
    """
    Make sure the logged + pbar metrics are routed to the logger and progress bar correctly at every step when requested
    """
    # enable internal debugging actions
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks
    model.training_step_end = None
    model.training_epoch_end = None
    model.validation_step = model.validation_step_result_only_step_metrics
    model.validation_step_end = None
    model.validation_epoch_end = None

    batches = 3
    epochs = 3
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        row_log_interval=1,
        limit_train_batches=batches,
        limit_val_batches=batches,
        weights_summary=None,
    )
    trainer.fit(model)

    # make sure correct steps were called
    assert model.validation_step_called
    assert not model.validation_step_end_called
    assert not model.validation_epoch_end_called

    # no early stopping
    assert len(trainer.dev_debugger.early_stopping_history) == 0

    # make sure we logged the exact number of metrics
    assert len(trainer.dev_debugger.logged_metrics) == epochs * batches + epochs
    assert len(trainer.dev_debugger.pbar_added_metrics) == epochs * batches + epochs

    # make sure we logged the correct step metrics; the near-empty entries
    # containing an 'epoch' key are the epoch-end boundary records
    total_empty_epoch_metrics = 0
    epoch = 0
    for metric in trainer.dev_debugger.logged_metrics:
        if 'epoch' in metric:
            epoch += 1
        if len(metric) > 2:
            assert 'no_val_no_pbar' not in metric
            assert 'val_step_pbar_acc' not in metric
            assert metric[f'val_step_log_acc/epoch_{epoch}']
            assert metric[f'val_step_log_pbar_acc/epoch_{epoch}']
        else:
            total_empty_epoch_metrics += 1

    assert total_empty_epoch_metrics == 3

    # make sure we logged the correct step pbar metrics
    total_empty_epoch_metrics = 0
    # (pbar keys carry no epoch suffix, so no epoch counter is needed here)
    for metric in trainer.dev_debugger.pbar_added_metrics:
        if len(metric) > 2:
            assert 'no_val_no_pbar' not in metric
            assert 'val_step_log_acc' not in metric
            assert metric['val_step_log_pbar_acc']
            assert metric['val_step_pbar_acc']
        else:
            total_empty_epoch_metrics += 1

    assert total_empty_epoch_metrics == 3

    # only 1 checkpoint expected, since the monitored value never changed after the first epoch
    assert len(trainer.dev_debugger.checkpoint_callback_history) == 1

    # make sure the last known metric is correct
    assert trainer.callback_metrics['val_checkpoint_on'] == 171
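

# ---------------------------------------------------------------------------
# For reference: a hedged sketch of the step-metrics-only validation_step
# exercised above (not the real DeterministicModel hook; values are
# placeholders). With on_step=True and on_epoch=False, values are logged per
# batch under '<name>/epoch_{n}' keys and no per-metric epoch aggregate is
# produced (the epoch-boundary entries asserted above stay empty). A metric
# routed to neither logger nor pbar ('no_val_no_pbar') must never surface.
# ---------------------------------------------------------------------------
def _sketch_validation_step_only_step_metrics(self, batch, batch_idx):
    acc = self.step(batch, batch_idx)  # assumed deterministic loss helper
    result = EvalResult(checkpoint_on=acc)  # EvalResult imported in the sketch above
    result.log('val_step_log_acc', torch.tensor(11.0), on_step=True, on_epoch=False)
    result.log('val_step_log_pbar_acc', torch.tensor(12.0),
               on_step=True, on_epoch=False, prog_bar=True)
    result.log('val_step_pbar_acc', torch.tensor(13.0),
               on_step=True, on_epoch=False, logger=False, prog_bar=True)
    result.log('no_val_no_pbar', torch.tensor(14.0),
               on_step=True, on_epoch=False, logger=False, prog_bar=False)
    return result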