def test_train_step_epoch_end(tmpdir):
    """
    Checks train_step + training_epoch_end (NO training_step_end)
    """
    model = DeterministicModel()
    model.training_step = model.training_step_dict_return
    model.training_step_end = None
    model.training_epoch_end = model.training_epoch_end_dict
    model.val_dataloader = None

    # route trainer artifacts into pytest's temp dir instead of the CWD
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        weights_summary=None,
    )
    trainer.fit(model)

    # make sure correct steps were called
    assert model.training_step_called
    assert not model.training_step_end_called
    assert model.training_epoch_end_called

    # assert epoch end metrics were added
    assert trainer.callback_metrics['epoch_end_log_1'] == 178
    assert trainer.progress_bar_metrics['epoch_end_pbar_1'] == 234

    # make sure training outputs what is expected (first batch only)
    batch_idx, batch = 0, next(iter(model.train_dataloader()))

    out = trainer.run_training_batch(batch, batch_idx)
    assert out.signal == 0
    assert out.batch_log_metrics['log_acc1'] == 12.0
    assert out.batch_log_metrics['log_acc2'] == 7.0

    train_step_end_out = out.training_step_output_for_epoch_end
    pbar_metrics = train_step_end_out['progress_bar']
    assert pbar_metrics['pbar_acc1'] == 17.0
    assert pbar_metrics['pbar_acc2'] == 19.0
def test_no_callbacks_with_train_loop_only(tmpdir):
    """
    Train-loop-only run (no val dataloader, no training_epoch_end):
    losses are recorded for every step, checkpoints are recorded,
    and no early stopping is configured or triggered.
    """
    os.environ['PL_DEV_DEBUG'] = '1'

    batches = 3
    epochs = 3

    model = DeterministicModel()
    model.training_step = model.training_step_no_callbacks_result_obj
    model.training_epoch_end = None
    model.val_dataloader = None

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        log_every_n_steps=1,
        limit_train_batches=batches,
        weights_summary=None,
    )
    trainer = Trainer(**trainer_options)
    trainer.fit(model)

    # one saved loss per training step
    recorded_losses = trainer.dev_debugger.saved_train_losses
    assert len(recorded_losses) == batches * epochs

    # early stopping is off; checkpoint history has 3 entries
    assert trainer.early_stop_callback is None
    debugger = trainer.dev_debugger
    assert len(debugger.checkpoint_callback_history) == 3
    assert len(debugger.early_stopping_history) == 0
def test_full_training_loop_dict(tmpdir):
    """
    Checks train_step + training_step_end + training_epoch_end
    """
    model = DeterministicModel()
    model.training_step = model.training_step_for_step_end_dict
    model.training_step_end = model.training_step_end_dict
    model.training_epoch_end = model.training_epoch_end_dict
    model.val_dataloader = None

    # route trainer artifacts into pytest's temp dir instead of the CWD
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        weights_summary=None,
    )
    trainer.fit(model)

    # make sure correct steps were called
    assert model.training_step_called
    assert model.training_step_end_called
    assert model.training_epoch_end_called

    # assert epoch end metrics were added
    assert trainer.callback_metrics['epoch_end_log_1'] == 178
    assert trainer.progress_bar_metrics['epoch_end_pbar_1'] == 234

    # make sure training outputs what is expected (first batch only)
    batch_idx, batch = 0, next(iter(model.train_dataloader()))

    out = trainer.run_training_batch(batch, batch_idx)

    # the batch result unpacks into four values; the grad-norm dict is not checked here
    signal, _, all_log_metrics, training_step_output_for_epoch_end = out
    assert signal == 0
    assert all_log_metrics['log_acc1'] == 12.0
    assert all_log_metrics['log_acc2'] == 7.0

    pbar_metrics = training_step_output_for_epoch_end['pbar_on_batch_end']
    assert pbar_metrics['pbar_acc1'] == 17.0
    assert pbar_metrics['pbar_acc2'] == 19.0
def test_val_step_only_epoch_metrics(tmpdir):
    """
    With only a validation_step (no step_end / epoch_end hooks), verify that the
    epoch-reduced metrics end up in the right place: logger vs. progress bar.
    """
    # turn on the internal debugger so histories are recorded
    os.environ['PL_DEV_DEBUG'] = '1'

    batches = 3
    epochs = 3

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks
    model.training_step_end = None
    model.training_epoch_end = None
    model.validation_step = model.validation_step_result_only_epoch_metrics
    model.validation_step_end = None
    model.validation_epoch_end = None

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        row_log_interval=1,
        limit_train_batches=batches,
        weights_summary=None,
    )
    trainer = Trainer(**trainer_options)
    trainer.fit(model)

    # only validation_step should have run in the val loop
    assert model.validation_step_called
    assert not model.validation_step_end_called
    assert not model.validation_epoch_end_called

    debugger = trainer.dev_debugger

    # early stopping never fired
    assert len(debugger.early_stopping_history) == 0

    # exactly one logger entry and one pbar entry per epoch
    assert len(debugger.logged_metrics) == epochs
    assert len(debugger.pbar_added_metrics) == epochs

    # logger entries: log-only and log+pbar metrics present, pbar-only absent
    for logged in debugger.logged_metrics:
        assert 'no_val_no_pbar' not in logged
        assert 'val_step_pbar_acc' not in logged
        assert logged['val_step_log_acc'] == (12 + 13) / 2
        assert logged['val_step_log_pbar_acc'] == (13 + 14) / 2

    # pbar entries: pbar-only and log+pbar metrics present, log-only absent
    for pbar_entry in debugger.pbar_added_metrics:
        assert 'no_val_no_pbar' not in pbar_entry
        assert 'val_step_log_acc' not in pbar_entry
        assert pbar_entry['val_step_log_pbar_acc'] == (13 + 14) / 2
        assert pbar_entry['val_step_pbar_acc'] == (14 + 15) / 2

    # a single checkpoint: the monitored value did not change afterwards
    assert len(debugger.checkpoint_callback_history) == 1

    # last known callback metric
    callback_metrics = trainer.logger_connector.callback_metrics
    assert callback_metrics['val_checkpoint_on'] == 171
def test_val_step_epoch_end_result(tmpdir):
    """
    validation_step combined with validation_epoch_end (result-object variants):
    check hook calls, logged/pbar metrics, checkpoint count and final metric.
    """
    os.environ['PL_DEV_DEBUG'] = '1'

    batches = 3
    epochs = 3

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks
    model.training_step_end = None
    model.training_epoch_end = None
    model.validation_step = model.validation_step_for_epoch_end_result
    model.validation_step_end = None
    model.validation_epoch_end = model.validation_epoch_end_result

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        log_every_n_steps=1,
        limit_train_batches=batches,
        limit_val_batches=batches,
        weights_summary=None,
    )
    trainer = Trainer(**trainer_options)
    trainer.fit(model)

    assert len(trainer.logger_connector.callback_metrics) == 6

    # validation_step and validation_epoch_end ran; validation_step_end did not
    assert model.validation_step_called
    assert not model.validation_step_end_called
    assert model.validation_epoch_end_called

    debugger = trainer.dev_debugger

    # early stopping never fired
    assert len(debugger.early_stopping_history) == 0

    # one logger entry and one pbar entry per epoch
    assert len(debugger.logged_metrics) == epochs
    assert len(debugger.pbar_added_metrics) == epochs

    # logger entries carry both the epoch-end metric and the step metric
    for logged in debugger.logged_metrics:
        assert logged['val_epoch_end_metric'] == 189
        assert 'val_step_metric' in logged

    # pbar entries carry the same two metrics
    for pbar_entry in debugger.pbar_added_metrics:
        assert pbar_entry['val_epoch_end_metric'] == 189
        assert 'val_step_metric' in pbar_entry

    # a single checkpoint: the monitored value did not change afterwards
    assert len(debugger.checkpoint_callback_history) == 1

    # last known callback metric
    assert trainer.logger_connector.callback_metrics['checkpoint_on'] == 189
def test_val_step_result_callbacks(tmpdir):
    """
    Validation loop made of validation_step only, with a train loop present.
    The callback values (early stop / checkpoint) are expected to come from the
    val loop rather than the train loop.
    """
    # turn on the internal debugger so histories are recorded
    os.environ['PL_DEV_DEBUG'] = '1'

    batches = 3
    epochs = 300

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks
    model.training_step_end = None
    model.training_epoch_end = None
    model.validation_step = model.validation_step_result_callbacks
    model.validation_step_end = None
    model.validation_epoch_end = None

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        early_stop_callback=True,
        log_every_n_steps=1,
        limit_train_batches=batches,
        weights_summary=None,
    )
    trainer = Trainer(**trainer_options)
    trainer.fit(model)

    # only validation_step should have run in the val loop
    assert model.validation_step_called
    assert not model.validation_step_end_called
    assert not model.validation_epoch_end_called

    debugger = trainer.dev_debugger

    # early stopping is driven by the val loop here;
    # had the train step been used instead, the history would not have 5 entries
    assert len(debugger.early_stopping_history) == 5

    # exactly 2 checkpoints
    assert len(debugger.checkpoint_callback_history) == 2

    # last known callback metric
    connector = trainer.logger_connector
    assert connector.callback_metrics['checkpoint_on'] == 171 + 15

    # nothing was requested for logging (apart from the metrics tracking the epoch)
    assert len(connector.progress_bar_metrics) == 0
    assert len(debugger.logged_metrics) == 0
def test_full_training_loop_scalar(tmpdir):
    """
    Checks train_step + training_step_end + training_epoch_end
    (all with scalar return from train_step)
    """
    model = DeterministicModel()
    model.training_step = model.training_step_scalar_return
    model.training_step_end = model.training_step_end_scalar
    model.training_epoch_end = model.training_epoch_end_scalar
    model.val_dataloader = None

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        weights_summary=None,
    )
    trainer.fit(model)

    # make sure correct steps were called
    assert model.training_step_called
    assert model.training_step_end_called
    assert model.training_epoch_end_called

    # only the 'epoch' entry is expected in the callback metrics
    # (separate asserts so a failure pinpoints which condition broke)
    assert 'epoch' in trainer.callback_metrics
    assert len(trainer.callback_metrics) == 1
    assert len(trainer.progress_bar_metrics) == 0

    # make sure training outputs what is expected (first batch only)
    batch_idx, batch = 0, next(iter(model.train_dataloader()))

    out = trainer.run_training_batch(batch, batch_idx)
    assert out.signal == 0
    assert len(out.batch_log_metrics) == 0
    assert isinstance(out.batch_log_metrics, dict)
    assert len(out.grad_norm_dic) == 0
    assert isinstance(out.grad_norm_dic, dict)

    # outer list has one entry; [0][0] picks the first element of that entry
    train_step_out = out.training_step_output_for_epoch_end
    assert len(train_step_out) == 1
    train_step_out = train_step_out[0][0]
    assert isinstance(train_step_out, torch.Tensor)
    assert train_step_out.item() == 171

    # make sure the optimizer closure returns the correct things
    opt_closure_result = trainer.optimizer_closure(
        batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens)
    assert opt_closure_result['loss'].item() == 171
def test_val_step_using_train_callbacks(tmpdir):
    """
    Early stopping is conditioned in the train loop,
    checkpointing is conditioned in the val loop.
    """
    # turn on the internal debugger so histories are recorded
    os.environ['PL_DEV_DEBUG'] = '1'

    batches = 3
    epochs = 300
    expected_epochs = 10

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks
    model.training_step_end = None
    model.training_epoch_end = None
    model.validation_step = model.validation_step_result_no_callbacks
    model.validation_step_end = None
    model.validation_epoch_end = None

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        early_stop_callback=True,
        log_every_n_steps=1,
        limit_train_batches=batches,
        weights_summary=None,
    )
    trainer = Trainer(**trainer_options)
    trainer.fit(model)

    # only validation_step should have run in the val loop
    assert model.validation_step_called
    assert not model.validation_step_end_called
    assert not model.validation_epoch_end_called

    debugger = trainer.dev_debugger

    # early stopping came from the train loop, not the val loop
    assert len(debugger.early_stopping_history) == expected_epochs

    # exactly 2 checkpoints
    assert len(debugger.checkpoint_callback_history) == 2

    # last known callback metric
    connector = trainer.logger_connector
    assert connector.callback_metrics['checkpoint_on'] == 171 + 20

    # nothing was requested for logging (apart from the metrics tracking the epoch)
    assert len(connector.progress_bar_metrics) == 0
    assert len(debugger.logged_metrics) == 0
def test_use_callbacks_with_train_loop_only(tmpdir):
    """
    Checks early stopping + checkpointing histories when only a train loop
    is defined (no val dataloader, no training_epoch_end).
    """
    # kept local: the module-level import block is not visible from this chunk
    from collections import Counter

    os.environ['PL_DEV_DEBUG'] = '1'

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks
    model.training_epoch_end = None
    model.val_dataloader = None

    batches = 3
    epochs = 300
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        early_stop_callback=True,
        log_every_n_steps=1,
        limit_train_batches=batches,
        weights_summary=None,
    )
    trainer.fit(model)

    num_expected_epochs = 10

    # ----------------------------------
    # VERIFY EARLY STOPPING BEHAVIOR
    # ----------------------------------
    # with train loop only it happens on every epoch
    early_stop_vals = trainer.dev_debugger.early_stopping_history
    assert len(early_stop_vals) == num_expected_epochs
    min_val = min(x['best'] for x in early_stop_vals)
    assert min_val == 171 + 9

    # every batch index (0..2) must have been seen once per epoch
    all_losses = trainer.dev_debugger.saved_train_losses
    batch_idxs = Counter(x['batch_idx'] for x in all_losses)
    for i, val in batch_idxs.items():
        assert val == num_expected_epochs
        assert i in [0, 1, 2]

    # ----------------------------------
    # VERIFY CHECKPOINTING BEHAVIOR
    # ----------------------------------
    ckpt_vals = trainer.dev_debugger.checkpoint_callback_history
    assert len(ckpt_vals) == 5, '5 ckpts should have been saved'
    for ckpt_val, expected_epoch in zip(ckpt_vals, [0, 1, 2, 3, 6]):
        assert ckpt_val['epoch'] == expected_epoch
        assert ckpt_val['monitor'] == 'checkpoint_on'
def test_val_step_only_step_metrics(tmpdir):
    """
    With only a validation_step (no step_end / epoch_end hooks), verify the
    number of logger and progress-bar entries produced when metrics are
    requested at every step.
    """
    # turn on the internal debugger so histories are recorded
    os.environ['PL_DEV_DEBUG'] = '1'

    batches = 3
    epochs = 3

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks
    model.training_step_end = None
    model.training_epoch_end = None
    model.validation_step = model.validation_step_result_only_step_metrics
    model.validation_step_end = None
    model.validation_epoch_end = None

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        log_every_n_steps=1,
        limit_train_batches=batches,
        limit_val_batches=batches,
        weights_summary=None,
    )
    trainer = Trainer(**trainer_options)
    trainer.fit(model)

    # only validation_step should have run in the val loop
    assert model.validation_step_called
    assert not model.validation_step_end_called
    assert not model.validation_epoch_end_called

    debugger = trainer.dev_debugger

    # early stopping never fired
    assert len(debugger.early_stopping_history) == 0

    # logger: one entry per step; pbar: one per step plus one per epoch
    assert len(debugger.logged_metrics) == epochs * batches
    assert len(debugger.pbar_added_metrics) == epochs * batches + (epochs)

    # a single checkpoint: the monitored value did not change afterwards
    assert len(debugger.checkpoint_callback_history) == 1

    # last known callback metric
    assert trainer.logger_connector.callback_metrics['checkpoint_on'] == 189
def test_result_obj_lr_scheduler_step(tmpdir):
    """
    test that the LR scheduler was called at the correct time with the correct metrics
    """
    # The assertion below reads the dev debugger's recorded history, so enable
    # it explicitly like the other debugger-based tests in this file do.
    # NOTE(review): presumably the debugger records nothing unless PL_DEV_DEBUG
    # is '1' when the Trainer is built; without this line the test depends on
    # whatever value an earlier test left in the environment - confirm.
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DeterministicModel()
    model.training_step = model.training_step_for_step_end_dict
    model.training_step_end = model.training_step_end_dict
    model.training_epoch_end = model.training_epoch_end_dict
    model.val_dataloader = None
    model.configure_optimizers = model.configure_optimizers__lr_on_plateau_step

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        weights_summary=None,
    )
    trainer.fit(model)

    # 8 scheduler updates are expected over the 2 epochs
    assert len(trainer.dev_debugger.saved_lr_scheduler_updates) == 8
def test_train_step_epoch_end(tmpdir):
    """
    training_step (dict return) followed by training_epoch_end,
    with training_step_end disabled.
    """
    model = DeterministicModel()
    model.training_step = model.training_step_dict_return
    model.training_step_end = None
    model.training_epoch_end = model.training_epoch_end_dict
    model.val_dataloader = None

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        weights_summary=None,
    )
    trainer = Trainer(**trainer_options)
    trainer.fit(model)

    # training_step and training_epoch_end ran; training_step_end did not
    assert model.training_step_called
    assert not model.training_step_end_called
    assert model.training_epoch_end_called

    # metrics produced by the epoch-end hook
    connector = trainer.logger_connector
    assert connector.callback_metrics['epoch_end_log_1'] == 178
    assert connector.progress_bar_metrics['epoch_end_pbar_1'] == 234

    # run the first batch by hand and inspect what comes back
    first_batch = next(iter(model.train_dataloader()))
    out = trainer.train_loop.run_training_batch(first_batch, 0, 0)
    assert out.signal == 0
    assert trainer.logger_connector.logged_metrics['log_acc1'] == 12.0
    assert trainer.logger_connector.logged_metrics['log_acc2'] == 7.0

    # outputs are for 1 optimizer and no tbptt
    epoch_end_inputs = out.training_step_output_for_epoch_end
    assert len(epoch_end_inputs) == 1
    step_output = epoch_end_inputs[0][0]

    pbar_metrics = step_output['progress_bar']
    assert pbar_metrics['pbar_acc1'] == 17.0
    assert pbar_metrics['pbar_acc2'] == 19.0
def test_train_step_epoch_end_scalar(tmpdir):
    """
    Checks train_step + training_epoch_end (NO training_step_end)
    (with scalar return)
    """
    # debugger explicitly disabled for this test
    os.environ['PL_DEV_DEBUG'] = '0'

    model = DeterministicModel()
    model.training_step = model.training_step_scalar_return
    model.training_step_end = None
    model.training_epoch_end = model.training_epoch_end_scalar
    model.val_dataloader = None

    # route trainer artifacts into pytest's temp dir instead of the CWD
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        weights_summary=None,
    )
    trainer.fit(model)

    # make sure correct steps were called
    assert model.training_step_called
    assert not model.training_step_end_called
    assert model.training_epoch_end_called

    # a scalar return adds no callback or progress-bar metrics
    assert len(trainer.logger_connector.callback_metrics) == 0
    assert len(trainer.logger_connector.progress_bar_metrics) == 0

    # make sure training outputs what is expected (first batch only)
    batch_idx, batch = 0, next(iter(model.train_dataloader()))

    out = trainer.train_loop.run_training_batch(batch, batch_idx, 0)
    assert out.signal == 0
    assert len(out.grad_norm_dic) == 0
    assert isinstance(out.grad_norm_dic, dict)

    # outer list has one entry; [0][0] picks the first element of that entry
    train_step_out = out.training_step_output_for_epoch_end
    assert len(train_step_out) == 1
    train_step_out = train_step_out[0][0]
    assert isinstance(train_step_out['minimize'], torch.Tensor)
    assert train_step_out['minimize'].item() == 171

    # make sure the optimizer closure returns the correct things
    opt_closure_result = trainer.train_loop.training_step_and_backward(
        batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens)
    assert opt_closure_result['loss'].item() == 171
def test_no_auto_callbacks_with_train_loop_only(tmpdir):
    """
    Train-loop-only model fitted twice: first without early stopping, then with
    early_stop_callback=True. Checks the monitors the callbacks end up with.
    """
    os.environ['PL_DEV_DEBUG'] = '1'

    batches = 3
    epochs = 3

    model = DeterministicModel()
    model.training_step = model.training_step_no_default_callbacks_for_train_loop
    model.training_epoch_end = None
    model.val_dataloader = None

    # --- first run: no early stopping requested ---
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        row_log_interval=1,
        limit_train_batches=batches,
        weights_summary=None,
    )
    trainer.fit(model)

    assert len(trainer.logger_connector.callback_metrics) == 1

    # one saved loss per training step
    recorded_losses = trainer.dev_debugger.saved_train_losses
    assert len(recorded_losses) == batches * epochs

    assert trainer.checkpoint_callback.monitor == 'checkpoint_on'
    assert trainer.early_stop_callback is None

    # --- second run: same model, early stopping switched on ---
    trainer = Trainer(
        default_root_dir=tmpdir,
        early_stop_callback=True,
        max_epochs=epochs,
        row_log_interval=1,
        limit_train_batches=batches,
        weights_summary=None,
    )
    trainer.fit(model)

    assert trainer.early_stop_callback.monitor == 'early_stop_on'
def test_full_training_loop_dict(tmpdir):
    """
    Full train loop with dict returns:
    training_step -> training_step_end -> training_epoch_end.
    """
    model = DeterministicModel()
    model.training_step = model.training_step_for_step_end_dict
    model.training_step_end = model.training_step_end_dict
    model.training_epoch_end = model.training_epoch_end_dict
    model.val_dataloader = None

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        weights_summary=None,
    )
    trainer = Trainer(**trainer_options)
    trainer.fit(model)

    # all three training hooks must have run
    assert model.training_step_called
    assert model.training_step_end_called
    assert model.training_epoch_end_called

    # metrics produced by the epoch-end hook
    assert trainer.callback_metrics['epoch_end_log_1'] == 178
    assert trainer.progress_bar_metrics['epoch_end_pbar_1'] == 234

    # run the first batch by hand and inspect what comes back
    first_batch = next(iter(model.train_dataloader()))
    out = trainer.run_training_batch(first_batch, 0)
    assert out.signal == 0

    batch_log_metrics = out.batch_log_metrics
    assert batch_log_metrics['log_acc1'] == 14.0
    assert batch_log_metrics['log_acc2'] == 9.0

    # get the output of the first optimizer
    epoch_end_inputs = out.training_step_output_for_epoch_end
    assert len(epoch_end_inputs) == 1
    step_output = epoch_end_inputs[0][0]

    pbar_metrics = step_output['progress_bar']
    assert pbar_metrics['pbar_acc1'] == 19.0
    assert pbar_metrics['pbar_acc2'] == 21.0
def test_training_step_result_log_epoch_only(tmpdir):
    """
    Tests that only training_step can be used with TrainResult
    Makes sure that things are routed to pbar, loggers and loss accordingly

    Makes sure pbar and logs happen on epoch only when requested
    """
    # enable internal debugging actions
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_only
    model.training_step_end = None
    model.training_epoch_end = None
    model.val_dataloader = None

    epochs = 3
    batches = 2
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=batches,
        limit_val_batches=batches,
        log_every_n_steps=1,
        max_epochs=epochs,
        weights_summary=None,
    )
    trainer.fit(model)

    # make sure correct steps were called
    assert model.training_step_called
    assert not model.training_step_end_called
    assert not model.training_epoch_end_called

    assert len(trainer.logger_connector.callback_metrics) == 11

    # make sure correct metrics are logged (one entry per epoch as requested)
    epoch_metrics = trainer.dev_debugger.logged_metrics
    assert len(epoch_metrics) == epochs

    # metric keys carry an `_e{epoch}` suffix, so the entry index is the epoch index
    for epoch_idx, logged_metrics in enumerate(epoch_metrics):
        assert logged_metrics[f'epoch_log_and_pbar_acc1_e{epoch_idx}'] == 14.0
        assert logged_metrics[f'epoch_log_acc2_e{epoch_idx}'] == 15.0
        assert f'epoch_pbar_acc3_e{epoch_idx}' not in logged_metrics
        assert len(logged_metrics) == 4

    # make sure we are using the correct metrics for callbacks
    assert trainer.logger_connector.callback_metrics['checkpoint_on'] == 171

    # make sure pbar metrics are correct and log metrics did not leak
    for epoch_idx in range(epochs):
        assert trainer.logger_connector.progress_bar_metrics[
            f'epoch_log_and_pbar_acc1_e{epoch_idx}'] == 14
        assert trainer.logger_connector.progress_bar_metrics[
            f'epoch_pbar_acc3_e{epoch_idx}'] == 16
        assert f'epoch_log_acc2_e{epoch_idx}' not in trainer.logger_connector.progress_bar_metrics

    # make sure training outputs what is expected (first batch only)
    batch_idx, batch = 0, next(iter(model.train_dataloader()))

    out = trainer.train_loop.run_training_batch(batch, batch_idx, 0)
    assert out.signal == 0
    assert len(out.batch_log_metrics) == 0

    # outer list has one entry; [0][0] picks the first element of that entry
    train_step_out = out.training_step_output_for_epoch_end
    assert len(train_step_out) == 1
    train_step_out = train_step_out[0][0]
    assert isinstance(train_step_out, TrainResult)

    assert 'minimize' in train_step_out
    assert f'epoch_log_and_pbar_acc1_e{trainer.current_epoch}' in train_step_out
    assert f'epoch_log_acc2_e{trainer.current_epoch}' in train_step_out

    # make sure the optimizer closure returns the correct things
    opt_closure_result = trainer.train_loop.training_step_and_backward(
        batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens)
    assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3)
def test_val_step_only_step_metrics(tmpdir):
    """
    Make sure the logged + pbar metrics are allocated accordingly at every step when requested
    """
    # enable internal debugging actions
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks
    model.training_step_end = None
    model.training_epoch_end = None
    model.validation_step = model.validation_step_result_only_step_metrics
    model.validation_step_end = None
    model.validation_epoch_end = None

    batches = 3
    epochs = 3
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        row_log_interval=1,
        limit_train_batches=batches,
        limit_val_batches=batches,
        weights_summary=None,
    )
    trainer.fit(model)

    # make sure correct steps were called
    assert model.validation_step_called
    assert not model.validation_step_end_called
    assert not model.validation_epoch_end_called

    # no early stopping
    assert len(trainer.dev_debugger.early_stopping_history) == 0

    # make sure we logged the exact number of metrics
    assert len(trainer.dev_debugger.logged_metrics) == epochs * batches + (epochs)
    assert len(trainer.dev_debugger.pbar_added_metrics) == epochs * batches + (epochs)

    # make sure we logged the correct step metrics
    total_empty_epoch_metrics = 0
    # bumped on every entry containing an 'epoch' key; used to build the
    # `/epoch_{n}` suffix of the expected metric names
    epoch = 0
    for metric in trainer.dev_debugger.logged_metrics:
        if 'epoch' in metric:
            epoch += 1

        if len(metric) > 2:
            assert 'no_val_no_pbar' not in metric
            assert 'val_step_pbar_acc' not in metric
            assert metric[f'val_step_log_acc/epoch_{epoch}']
            assert metric[f'val_step_log_pbar_acc/epoch_{epoch}']
        else:
            # entries with at most 2 keys carry no step metrics
            total_empty_epoch_metrics += 1

    assert total_empty_epoch_metrics == 3

    # make sure we logged the correct pbar metrics
    # (pbar keys have no epoch suffix, so no epoch bookkeeping is needed here)
    total_empty_epoch_metrics = 0
    for metric in trainer.dev_debugger.pbar_added_metrics:
        if len(metric) > 2:
            assert 'no_val_no_pbar' not in metric
            assert 'val_step_log_acc' not in metric
            assert metric['val_step_log_pbar_acc']
            assert metric['val_step_pbar_acc']
        else:
            total_empty_epoch_metrics += 1

    assert total_empty_epoch_metrics == 3

    # only 1 checkpoint expected since values didn't change after that
    assert len(trainer.dev_debugger.checkpoint_callback_history) == 1

    # make sure the last known metric is correct
    assert trainer.logger_connector.callback_metrics['val_checkpoint_on'] == 171
def test_training_step_epoch_end_result(tmpdir):
    """
    Makes sure training_step and epoch_end can be used with Results (without batch_end)
    """
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step
    model.training_epoch_end = model.training_epoch_end_return_for_log_epoch_and_step
    model.val_dataloader = None

    batches = 3
    epochs = 1
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        log_every_n_steps=1,
        limit_train_batches=batches,
        weights_summary=None,
    )
    trainer.fit(model)

    assert len(trainer.logger_connector.callback_metrics) == 17

    # make sure correct steps were called
    assert model.training_step_called
    assert not model.training_step_end_called
    assert model.training_epoch_end_called

    # make sure correct metrics were logged:
    # one entry per step plus one epoch-end entry per epoch
    logged_metrics = trainer.dev_debugger.logged_metrics
    assert len(logged_metrics) == (epochs * batches) + epochs
    last_logged = logged_metrics[-1]

    assert last_logged['step_epoch_log_and_pbar_acc1_epoch'] == 210.0
    assert last_logged['step_epoch_log_acc2_epoch'] == 336.0
    assert last_logged['epoch_end_log_acc_epoch'] == 1212.0
    assert last_logged['epoch_end_log_pbar_acc_epoch'] == 1214.0
    assert 'epoch_end_pbar_acc' not in last_logged

    # make sure pbar metrics are correct
    logged_pbar = trainer.dev_debugger.pbar_added_metrics
    assert len(logged_pbar) == (epochs * batches) + epochs

    pbar_metrics = trainer.logger_connector.progress_bar_metrics
    assert pbar_metrics['step_epoch_log_and_pbar_acc1_epoch'] == 210.0
    assert pbar_metrics['step_epoch_log_and_pbar_acc1_step'] == 7.0
    assert pbar_metrics['step_epoch_pbar_acc3_epoch'] == 504.0
    assert pbar_metrics['epoch_end_pbar_acc_epoch'] == 1213.0
    assert pbar_metrics['epoch_end_log_pbar_acc_epoch'] == 1214.0
    assert 'epoch_end_log_acc' not in pbar_metrics
    assert 'log_acc2' not in pbar_metrics

    # make sure callback metrics didn't change
    assert trainer.logger_connector.callback_metrics['checkpoint_on'] == 171

    # -----------------------------------------
    # make sure training outputs what is expected
    # -----------------------------------------
    # only the first batch is needed
    batch_idx, batch = 0, next(iter(model.train_dataloader()))

    out = trainer.train_loop.run_training_batch(batch, batch_idx, 0)
    assert out.signal == 0
    assert len(out.batch_log_metrics) == 4

    # outer list has one entry; [0][0] picks the first element of that entry
    train_step_out = out.training_step_output_for_epoch_end
    assert len(train_step_out) == 1
    train_step_out = train_step_out[0][0]
    assert isinstance(train_step_out, TrainResult)

    assert 'minimize' in train_step_out
    assert 'step_epoch_log_and_pbar_acc1_step' in train_step_out
    assert 'step_epoch_log_and_pbar_acc1_epoch' in train_step_out
    assert 'step_epoch_log_acc2_step' in train_step_out
    assert 'step_epoch_log_acc2_epoch' in train_step_out

    # make sure the optimizer closure returns the correct things
    opt_closure_result = trainer.train_loop.training_step_and_backward(
        batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens)
    assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3)
def test_training_step_result_log_step_and_epoch(tmpdir):
    """
    Tests that only training_step can be used with TrainResult
    Makes sure that things are routed to pbar, loggers and loss accordingly

    Makes sure pbar and logs happen on both step and epoch when requested
    """
    # enable internal debugging actions
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step
    model.training_step_end = None
    model.training_epoch_end = None
    model.val_dataloader = None

    epochs = 3
    batches = 2
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=batches,
        limit_val_batches=batches,
        log_every_n_steps=1,
        max_epochs=epochs,
        weights_summary=None,
    )
    trainer.fit(model)

    # make sure correct steps were called
    assert model.training_step_called
    assert not model.training_step_end_called
    assert not model.training_epoch_end_called

    assert len(trainer.logger_connector.callback_metrics) == 11

    # make sure correct metrics are logged (one per batch step as requested,
    # plus one epoch-end entry per epoch)
    assert len(trainer.dev_debugger.logged_metrics) == (epochs * batches) + epochs
    epoch_metrics = trainer.dev_debugger.logged_metrics

    # the history is laid out as `batches` step entries followed by one
    # epoch-end entry, repeated for every epoch
    for epoch_idx, i_start in enumerate(range(0, len(epoch_metrics), batches + 1)):
        epoch_outputs = epoch_metrics[i_start:i_start + batches + 1]
        acc1_vals = []
        acc2_vals = []

        # make sure each batch logged the expected value
        for batch_idx, logged_metrics in enumerate(epoch_outputs[:-1]):
            expected_val_1 = (5 + batch_idx) * (epoch_idx + 1)
            expected_val_2 = (6 + batch_idx) * (epoch_idx + 1)
            acc1_vals.append(torch.tensor(expected_val_1).float())
            acc2_vals.append(torch.tensor(expected_val_2).float())

            assert logged_metrics['step_epoch_log_and_pbar_acc1_step'] == expected_val_1
            assert logged_metrics['step_epoch_log_acc2_step'] == expected_val_2
            assert 'step_epoch_pbar_acc3' not in logged_metrics
            assert len(logged_metrics) == 6

        # make sure the metrics for the epoch end are actual means
        # (the default reduce fx) of all the batches
        epoch_end_metrics = epoch_outputs[-1]
        eval_1 = torch.stack(acc1_vals).mean()
        eval_2 = torch.stack(acc2_vals).mean()
        assert epoch_end_metrics['step_epoch_log_and_pbar_acc1_epoch'] == eval_1
        assert epoch_end_metrics['step_epoch_log_acc2_epoch'] == eval_2
        assert 'step_epoch_pbar_acc3' not in epoch_end_metrics
        # NOTE(review): this re-checks the last *batch* entry, not
        # `epoch_end_metrics` - possibly a stale variable; kept as-is
        assert len(logged_metrics) == 6

    # make sure we are using the correct metrics for callbacks
    assert trainer.logger_connector.callback_metrics['checkpoint_on'] == 171

    # -------------------------------
    # VERIFY PBAR METRICS
    # -------------------------------
    # make sure pbar metrics are correct and log metrics did not leak
    all_pbar_metrics = trainer.dev_debugger.pbar_added_metrics
    assert len(all_pbar_metrics) == (epochs * batches) + epochs

    for epoch_idx, i_start in enumerate(range(0, len(all_pbar_metrics), batches + 1)):
        epoch_outputs = all_pbar_metrics[i_start:i_start + batches + 1]
        acc1_vals = []
        acc3_vals = []

        # make sure each batch logged the expected value
        for batch_idx, logged_metrics in enumerate(epoch_outputs[:-1]):
            expected_val_1 = (5 + batch_idx) * (epoch_idx + 1)
            expected_val_2 = (7 + batch_idx) * (epoch_idx + 1)
            acc1_vals.append(torch.tensor(expected_val_1).float())
            acc3_vals.append(torch.tensor(expected_val_2).float())

            assert logged_metrics['step_epoch_log_and_pbar_acc1_step'] == expected_val_1
            assert logged_metrics['step_epoch_pbar_acc3_step'] == expected_val_2
            assert 'epoch_log_acc2_step' not in logged_metrics
            assert len(logged_metrics) == 5

        # make sure the metrics for the epoch end are actual means
        # (the default reduce fx) of all the batches
        epoch_end_metrics = epoch_outputs[-1]
        eval_1 = torch.stack(acc1_vals).mean()
        eval_2 = torch.stack(acc3_vals).mean()
        assert epoch_end_metrics['step_epoch_log_and_pbar_acc1_epoch'] == eval_1
        assert epoch_end_metrics['step_epoch_pbar_acc3_epoch'] == eval_2
        assert 'epoch_log_acc2_step' not in epoch_end_metrics
        # NOTE(review): same stale-variable pattern as in the logger loop above
        assert len(logged_metrics) == 5

    # -----------------------------------------
    # make sure training outputs what is expected
    # -----------------------------------------
    # only the first batch is needed
    batch_idx, batch = 0, next(iter(model.train_dataloader()))

    out = trainer.train_loop.run_training_batch(batch, batch_idx, 0)
    assert out.signal == 0
    assert len(out.batch_log_metrics) == 4

    # outer list has one entry; [0][0] picks the first element of that entry
    train_step_out = out.training_step_output_for_epoch_end
    assert len(train_step_out) == 1
    train_step_out = train_step_out[0][0]
    assert isinstance(train_step_out, TrainResult)

    assert 'minimize' in train_step_out
    assert 'step_epoch_log_and_pbar_acc1_step' in train_step_out
    assert 'step_epoch_log_acc2_step' in train_step_out
    assert 'step_epoch_log_and_pbar_acc1_epoch' in train_step_out
    assert 'step_epoch_log_acc2_epoch' in train_step_out

    # make sure the optimizer closure returns the correct things
    opt_closure_result = trainer.train_loop.training_step_and_backward(
        batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens)
    assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3)
def test_val_step_epoch_step_metrics(tmpdir):
    """
    Validation metrics requested for both step and epoch must show up in the
    logger history and the progress-bar history under their ``_step`` and
    ``_epoch`` names, and the callback metrics must hold the expected keys.
    """
    # switch on the internal debugging hooks used by the assertions below
    os.environ['PL_DEV_DEBUG'] = '1'

    num_batches = 3
    num_epochs = 3

    # only the *_step hooks are active for both the train and the val loop
    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks
    model.training_step_end = None
    model.training_epoch_end = None
    model.validation_step = model.validation_step_result_epoch_step_metrics
    model.validation_step_end = None
    model.validation_epoch_end = None

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=num_epochs,
        log_every_n_steps=1,
        limit_train_batches=num_batches,
        limit_val_batches=num_batches,
        weights_summary=None,
    )
    trainer.fit(model)

    callback_metrics = trainer.logger_connector.callback_metrics
    debugger = trainer.dev_debugger

    # -------------------------------
    # VERIFY CALLBACK METRICS
    # -------------------------------
    assert len(callback_metrics) == 11
    wanted_keys = {
        'early_stop_on',
        'checkpoint_on',
        'val_step_pbar_acc',
        'val_step_pbar_acc_epoch',
        'val_step_log_acc',
        'val_step_log_acc_epoch',
        'val_step_log_pbar_acc',
        'val_step_log_pbar_acc_epoch',
        'val_step_batch_idx',
        'val_step_batch_idx_epoch',
        'debug_epoch',
    }
    found_keys = set(callback_metrics)
    assert wanted_keys == found_keys

    # validation_step must have run; the two disabled hooks must not have
    assert model.validation_step_called
    assert not model.validation_step_end_called
    assert not model.validation_epoch_end_called

    # early stopping never kicked in
    assert len(debugger.early_stopping_history) == 0

    # one entry per batch plus one per epoch, in both histories
    logged_history = debugger.logged_metrics
    pbar_history = debugger.pbar_added_metrics
    assert len(logged_history) == num_epochs * num_batches + (num_epochs)
    assert len(pbar_history) == num_epochs * num_batches + (num_epochs)

    # each epoch occupies `num_batches` step entries followed by one epoch entry
    stride = num_batches + 1

    # -------------------------------
    # VERIFY LOGGED METRICS
    # -------------------------------
    for start in range(0, len(logged_history), stride):
        step_entries = logged_history[start:start + num_batches]
        epoch_entry = logged_history[start + num_batches]
        epoch = epoch_entry['epoch']

        # step-level keys carry the `_step/epoch_<n>` suffix
        for step_entry in step_entries:
            assert f'val_step_log_acc_step/epoch_{epoch}' in step_entry
            assert f'val_step_log_pbar_acc_step/epoch_{epoch}' in step_entry

        # epoch-level keys carry the `_epoch` suffix
        assert 'val_step_log_acc_epoch' in epoch_entry
        assert 'val_step_log_pbar_acc_epoch' in epoch_entry

    # -------------------------------
    # VERIFY PBAR METRICS
    # -------------------------------
    for start in range(0, len(pbar_history), stride):
        step_entries = pbar_history[start:start + num_batches]
        epoch_entry = pbar_history[start + num_batches]

        # step-level keys carry the `_step` suffix
        for step_entry in step_entries:
            assert 'val_step_pbar_acc_step' in step_entry
            assert 'val_step_log_pbar_acc_step' in step_entry

        # epoch-level keys carry the `_epoch` suffix
        assert 'val_step_pbar_acc_epoch' in epoch_entry
        assert 'val_step_log_pbar_acc_epoch' in epoch_entry

    # only 1 checkpoint expected since values didn't change after that
    assert len(debugger.checkpoint_callback_history) == 1

    # the last known monitored value is the one left in callback metrics
    assert callback_metrics['checkpoint_on'] == 189