def test_cpu_restore_training(tmpdir):
    """Verify that a training session can be continued on CPU.

    A model is first fitted with checkpoints written to ``tmpdir``. A second
    ``Trainer`` and a fresh model are then created against the same directory
    and logger version. An ``on_train_start`` hook on the new model checks that
    the second trainer starts from the epoch the first one reached, and runs
    predictions on the validation dataloaders before any further training.

    Args:
        tmpdir: Directory used for logger output and checkpoints
            (presumably the pytest ``tmpdir`` fixture).
    """
    tutils.reset_seed()

    hparams = tutils.get_hparams()
    model = LightningTestModel(hparams)

    # logger file to get meta; the same version is reused for the resumed run
    test_logger_version = 10
    logger = tutils.get_test_tube_logger(tmpdir, False, version=test_logger_version)

    trainer_options = dict(
        max_epochs=8,
        val_check_interval=0.50,
        val_percent_check=0.2,
        train_percent_check=0.2,
        logger=logger,
        checkpoint_callback=ModelCheckpoint(tmpdir, save_top_k=-1),
    )

    # fit model
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)

    # remember where the first run stopped so the resumed run can be compared to it
    real_global_epoch = trainer.current_epoch

    # training complete
    assert result == 1, 'cpu model failed to complete'

    # wipe-out trainer and model
    # retrain with not much data... this simulates picking training back up after slurm
    # we want to see if the weights come back correctly
    new_logger = tutils.get_test_tube_logger(tmpdir, False, version=test_logger_version)
    trainer_options = dict(
        max_epochs=2,
        val_check_interval=0.50,
        val_percent_check=0.2,
        train_percent_check=0.2,
        logger=new_logger,
        checkpoint_callback=ModelCheckpoint(tmpdir),
    )
    trainer = Trainer(**trainer_options)
    model = LightningTestModel(hparams)

    # set the epoch start hook so we can predict before the model does the full training
    def assert_good_acc():
        # `trainer` here is the second (resumed) trainer: the name was rebound above
        assert trainer.current_epoch == real_global_epoch
        assert trainer.current_epoch >= 0

        # if model and state loaded correctly, predictions will be good even though we
        # haven't trained with the new loaded model
        trainer.model.eval()
        for dataloader in trainer.get_val_dataloaders():
            tutils.run_prediction(dataloader, trainer.model)

    model.on_train_start = assert_good_acc

    # by calling fit again, we trigger training, loading weights from the cluster
    # and our hook to predict using current model before any more weight updates
    trainer.fit(model)
def test_dp_resume(tmpdir):
    """Make sure DP continues training correctly.

    A model is fitted with the ``dp`` backend on 2 GPUs and an HPC checkpoint
    is written with ``trainer.hpc_save``. A second ``Trainer`` is then created
    against the same directory and logger version. An ``on_train_start`` hook
    on a fresh model checks that the second trainer starts from the epoch the
    first one reached, and runs a prediction pass with the restored model.

    Args:
        tmpdir: Directory used for logger output and checkpoints
            (presumably the pytest ``tmpdir`` fixture).
    """
    # NOTE: returning here makes the test report as passed, not skipped,
    # on machines where the GPU test cannot run
    if not tutils.can_run_gpu_test():
        return

    tutils.reset_seed()

    hparams = tutils.get_hparams()
    model = LightningTestModel(hparams)

    trainer_options = dict(
        show_progress_bar=True,
        max_epochs=2,
        gpus=2,
        distributed_backend='dp',
    )

    # get logger
    logger = tutils.get_test_tube_logger(tmpdir, debug=False)

    # exp file to get weights
    # logger file to get weights
    checkpoint = tutils.init_checkpoint_callback(logger)

    # add these to the trainer options
    trainer_options['logger'] = logger
    trainer_options['checkpoint_callback'] = checkpoint

    # fit model
    trainer = Trainer(**trainer_options)
    # presumably makes the trainer behave as if launched under SLURM so the
    # HPC checkpoint path is exercised - TODO confirm
    trainer.is_slurm_managing_tasks = True
    result = trainer.fit(model)

    # track epoch before saving
    real_global_epoch = trainer.current_epoch

    # correct result and ok accuracy
    assert result == 1, 'dp model failed to complete'

    # ---------------------------
    # HPC LOAD/SAVE
    # ---------------------------
    # save
    trainer.hpc_save(tmpdir, logger)

    # init new trainer
    new_logger = tutils.get_test_tube_logger(tmpdir, version=logger.version)
    trainer_options['logger'] = new_logger
    trainer_options['checkpoint_callback'] = ModelCheckpoint(tmpdir)
    trainer_options['train_percent_check'] = 0.2
    trainer_options['val_percent_check'] = 0.2
    trainer_options['max_epochs'] = 1
    new_trainer = Trainer(**trainer_options)

    # set the epoch start hook so we can predict before the model does the full training
    def assert_good_acc():
        # separate asserts so a failure shows which condition broke
        assert new_trainer.current_epoch == real_global_epoch
        assert new_trainer.current_epoch > 0

        # if model and state loaded correctly, predictions will be good even though we
        # haven't trained with the new loaded model
        dp_model = new_trainer.model
        dp_model.eval()

        # take the dataloader from the resumed trainer (the one under test),
        # not from the first trainer
        dataloader = new_trainer.get_train_dataloader()
        tutils.run_prediction(dataloader, dp_model, dp=True)

    # new model
    model = LightningTestModel(hparams)
    model.on_train_start = assert_good_acc

    # fit new model which should load hpc weights
    new_trainer.fit(model)

    # test freeze on gpu
    model.freeze()
    model.unfreeze()