def test_distributed_backend_set_when_using_tpu(tmpdir, tpu_cores):
    """Test if distributed_backend is set to `tpu` when tpu_cores is not None."""
    assert Trainer(tpu_cores=tpu_cores).distributed_backend == "tpu"
        return logger

    def initialize_checkpoint_callback(self):
        checkpoint_callback = ModelCheckpoint(
            monitor="val_acc_epoch",
            dirpath=f"model_checkpoints/{self.logger_subdir}/{self.logger_run_name}",
            filename='{epoch:02d}-{val_acc_epoch:.4f}',
            save_top_k=self.configs["train_num_epochs"],
            mode='max',
        )
        return checkpoint_callback


if __name__ == '__main__':
    model = SupervisedModel()
    logger = model.initialize_logger()
    checkpoint_callback = model.initialize_checkpoint_callback()
    lr_monitor = LearningRateMonitor(logging_interval='epoch')

    trainer = Trainer(
        gpus=1,
        deterministic=True,
        max_epochs=model.configs['train_num_epochs'],
        callbacks=[checkpoint_callback, lr_monitor],
        logger=logger,
        fast_dev_run=False,
    )
    trainer.fit(model)
def test_val_step_epoch_step_metrics(tmpdir):
    """Make sure the logged + pbar metrics are allocated accordingly at every step when requested."""
    # enable internal debugging actions
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks
    model.training_step_end = None
    model.training_epoch_end = None
    model.validation_step = model.validation_step_result_epoch_step_metrics
    model.validation_step_end = None
    model.validation_epoch_end = None

    batches = 3
    epochs = 3
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        row_log_interval=1,
        limit_train_batches=batches,
        limit_val_batches=batches,
        weights_summary=None,
    )
    trainer.fit(model)

    assert len(trainer.logger_connector.callback_metrics) == 11
    expected_metrics = {
        'early_stop_on', 'checkpoint_on',
        'val_step_pbar_acc', 'epoch_val_step_pbar_acc',
        'val_step_log_acc', 'epoch_val_step_log_acc',
        'val_step_log_pbar_acc', 'epoch_val_step_log_pbar_acc',
        'val_step_batch_idx', 'epoch_val_step_batch_idx',
    }
    expected_metrics.add('debug_epoch')
    seen_metrics = set(trainer.logger_connector.callback_metrics)
    assert expected_metrics == seen_metrics

    # make sure correct steps were called
    assert model.validation_step_called
    assert not model.validation_step_end_called
    assert not model.validation_epoch_end_called

    # no early stopping
    assert len(trainer.dev_debugger.early_stopping_history) == 0

    # make sure we logged the exact number of metrics
    assert len(trainer.dev_debugger.logged_metrics) == epochs * batches + (epochs)
    assert len(trainer.dev_debugger.pbar_added_metrics) == epochs * batches + (epochs)

    # make sure we logged the correct epoch metrics
    for metric_idx in range(0, len(trainer.dev_debugger.logged_metrics), batches + 1):
        batch_metrics = trainer.dev_debugger.logged_metrics[metric_idx:metric_idx + batches]
        epoch_metric = trainer.dev_debugger.logged_metrics[metric_idx + batches]
        epoch = epoch_metric['epoch']

        # make sure the metric was split
        for batch_metric in batch_metrics:
            assert f'step_val_step_log_acc/epoch_{epoch}' in batch_metric
            assert f'step_val_step_log_pbar_acc/epoch_{epoch}' in batch_metric

        # make sure the epoch split was correct
        assert 'epoch_val_step_log_acc' in epoch_metric
        assert 'epoch_val_step_log_pbar_acc' in epoch_metric

    # make sure we logged the correct pbar metrics
    for metric_idx in range(0, len(trainer.dev_debugger.pbar_added_metrics), batches + 1):
        batch_metrics = trainer.dev_debugger.pbar_added_metrics[metric_idx:metric_idx + batches]
        epoch_metric = trainer.dev_debugger.pbar_added_metrics[metric_idx + batches]

        # make sure the metric was split
        for batch_metric in batch_metrics:
            assert 'step_val_step_pbar_acc' in batch_metric
            assert 'step_val_step_log_pbar_acc' in batch_metric

        # make sure the epoch split was correct
        assert 'epoch_val_step_pbar_acc' in epoch_metric
        assert 'epoch_val_step_log_pbar_acc' in epoch_metric

    # only 1 checkpoint expected since values didn't change after that
    assert len(trainer.dev_debugger.checkpoint_callback_history) == 1

    # make sure the last known metric is correct
    assert trainer.logger_connector.callback_metrics['checkpoint_on'] == 189
def test_lr_finder_fails_fast_on_bad_config(tmpdir):
    """Test that tune fails if the model does not have a lr BEFORE running lr find."""
    trainer = Trainer(default_root_dir=tmpdir, max_steps=2, auto_lr_find=True)
    with pytest.raises(MisconfigurationException, match='should have one of these fields'):
        trainer.tune(BoringModel())
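# A minimal counterpart sketch (not part of the test suite above): `auto_lr_find`
# expects the model to expose a `lr` or `learning_rate` field that the LR finder can
# overwrite, so a model shaped like the hypothetical one below would pass
# `trainer.tune` instead of raising MisconfigurationException.
class ModelWithTunableLR(BoringModel):  # hypothetical helper, not from the original tests
    def __init__(self, lr=1e-3):
        super().__init__()
        self.lr = lr

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=self.lr)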
        data_path=os.path.join(PARENT_DIR, 'datasets', 'pendulum-gym-image-dataset-train.pkl'))

    checkpoint_callback = ModelCheckpoint(monitor='loss',
                                          prefix=args.name + f'-T_p={args.T_pred}-',
                                          save_top_k=1,
                                          save_last=True)

    trainer = Trainer.from_argparse_args(
        args,
        deterministic=True,
        default_root_dir=os.path.join(PARENT_DIR, 'logs', args.name),
        checkpoint_callback=checkpoint_callback)
    trainer.fit(model)


if __name__ == '__main__':
    parser = ArgumentParser(add_help=False)
    parser.add_argument('--name', default='ablation-pend-lag-caAE', type=str)
    parser.add_argument('--T_pred', default=4, type=int)
    parser.add_argument('--solver', default='euler', type=str)
    parser.add_argument('--homo_u', dest='homo_u', action='store_true')

    # add args from trainer
    parser = Trainer.add_argparse_args(parser)

    # give the module a chance to add own params
    # good practice to define LightningModule specific params in the module
    parser = Model.add_model_specific_args(parser)

    # parse params
    args = parser.parse_args()

    main(args)
def test_training_step_with_dataloader_access(tmpdir) -> None:
    """A baseline functional test for `training_step` with dataloader access."""
    trainer = Trainer(max_epochs=1, default_root_dir=tmpdir)
    m = AsyncBoringModel()
    trainer.fit(m)
    assert m.num_batches_processed == DATASET_LEN, f"Expect all {DATASET_LEN} batches to be processed."
def test_logger_after_fit_predict_test_calls(tmpdir):
    """Make sure logger outputs are finalized after fit, prediction, and test calls."""

    class BufferLogger(LightningLoggerBase):
        def __init__(self):
            super().__init__()
            self.buffer = {}
            self.logs = {}

        def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None:
            self.buffer.update(metrics)

        def finalize(self, status: str) -> None:
            self.logs.update(self.buffer)
            self.buffer = {}

        @property
        def experiment(self) -> Any:
            return None

        @property
        def version(self) -> Union[int, str]:
            return 1

        @property
        def name(self) -> str:
            return "BufferLogger"

        def log_hyperparams(self, *args, **kwargs) -> None:
            return None

    class LoggerCallsObserver(Callback):
        def on_fit_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
            trainer.logger.log_metrics({"fit": 1})

        def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
            trainer.logger.log_metrics({"validate": 1})

        def on_predict_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
            trainer.logger.log_metrics({"predict": 1})

        def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
            trainer.logger.log_metrics({"test": 1})

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=1,
        limit_val_batches=1,
        max_epochs=1,
        logger=BufferLogger(),
        callbacks=[LoggerCallsObserver()],
    )

    assert not trainer.logger.logs
    trainer.fit(model)
    assert trainer.logger.logs == {"fit": 1, "validate": 1}
    trainer.test(model)
    assert trainer.logger.logs == {"fit": 1, "validate": 1, "test": 1}
    trainer.predict(model)
    assert trainer.logger.logs == {"fit": 1, "validate": 1, "test": 1, "predict": 1}
def test_gradient_accumulation_scheduling(tmpdir):
    """Test grad accumulation by the freq of optimizer updates."""
    # test incorrect configs
    with pytest.raises(IndexError):
        assert Trainer(accumulate_grad_batches={0: 3, 1: 4, 4: 6})
        assert Trainer(accumulate_grad_batches={-2: 3})

    with pytest.raises(TypeError):
        assert Trainer(accumulate_grad_batches={})
        assert Trainer(accumulate_grad_batches=[[2, 3], [4, 6]])
        assert Trainer(accumulate_grad_batches={1: 2, 3.: 4})
        assert Trainer(accumulate_grad_batches={1: 2.5, 3: 5})

    # test optimizer call freq matches scheduler
    def _optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
        # only test the first 12 batches in epoch
        if batch_idx < 12:
            if epoch == 0:
                # reset counter when starting epoch
                if batch_idx == 0:
                    self.prev_called_batch_idx = 0

                    # use this opportunity to test once
                    assert self.trainer.accumulate_grad_batches == 1

                assert batch_idx == self.prev_called_batch_idx
                self.prev_called_batch_idx += 1

            elif 1 <= epoch <= 2:
                # reset counter when starting epoch
                if batch_idx == 1:
                    self.prev_called_batch_idx = 1

                    # use this opportunity to test once
                    assert self.trainer.accumulate_grad_batches == 2

                assert batch_idx == self.prev_called_batch_idx
                self.prev_called_batch_idx += 2

            else:
                if batch_idx == 3:
                    self.prev_called_batch_idx = 3

                    # use this opportunity to test once
                    assert self.trainer.accumulate_grad_batches == 4

                assert batch_idx == self.prev_called_batch_idx
                self.prev_called_batch_idx += 3

        optimizer.step()

        # clear gradients
        optimizer.zero_grad()

    model = EvalModelTemplate()
    schedule = {1: 2, 3: 4}

    trainer = Trainer(accumulate_grad_batches=schedule,
                      train_percent_check=0.1,
                      val_percent_check=0.1,
                      max_epochs=2,
                      default_root_dir=tmpdir)

    # for the test
    trainer.optimizer_step = _optimizer_step
    model.prev_called_batch_idx = 0

    trainer.fit(model)
def test_trainer_flag(caplog):

    class TestModel(BoringModel):
        def on_fit_start(self):
            raise SystemExit()

    trainer = Trainer(max_time=dict(seconds=1337))
    with pytest.raises(SystemExit):
        trainer.fit(TestModel())
    timer = [c for c in trainer.callbacks if isinstance(c, Timer)][0]
    assert timer._duration == 1337

    trainer = Trainer(max_time=dict(seconds=1337), callbacks=[Timer()])
    with pytest.raises(SystemExit), caplog.at_level(level=logging.INFO):
        trainer.fit(TestModel())
    assert "callbacks list already contains a Timer" in caplog.text

    # Make sure max_time still honored even if max_epochs == -1
    trainer = Trainer(max_time=dict(seconds=1), max_epochs=-1)
    with pytest.raises(SystemExit):
        trainer.fit(TestModel())
    timer = [c for c in trainer.callbacks if isinstance(c, Timer)][0]
    assert timer._duration == 1
    assert trainer.max_epochs == -1
    assert trainer.max_steps == -1
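# Hedged usage sketch (assumption: the standard `Trainer(max_time=...)` argument, which
# also accepts a `datetime.timedelta` or a "DD:HH:MM:SS" string): the same 1337-second
# budget used in the test above could be expressed in either of these other forms.
from datetime import timedelta

Trainer(max_time=timedelta(seconds=1337))
Trainer(max_time="00:00:22:17")  # days:hours:minutes:seconds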
def test_dp_resume(tmpdir):
    """Make sure DP continues training correctly."""
    model = CustomClassificationModelDP(lr=0.1)
    dm = ClassifDataModule()

    trainer_options = dict(max_epochs=1, gpus=2, accelerator='dp', default_root_dir=tmpdir)

    # get logger
    logger = tutils.get_default_logger(tmpdir)

    # exp file to get weights
    # logger file to get weights
    checkpoint = tutils.init_checkpoint_callback(logger)

    # add these to the trainer options
    trainer_options['logger'] = logger
    trainer_options['callbacks'] = [checkpoint]

    # fit model
    trainer = Trainer(**trainer_options)
    trainer.is_slurm_managing_tasks = True
    trainer.fit(model, datamodule=dm)

    # track epoch before saving. Increment since we finished the current epoch, don't want to rerun
    real_global_epoch = trainer.current_epoch + 1

    # correct result and ok accuracy
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    # ---------------------------
    # HPC LOAD/SAVE
    # ---------------------------
    # save
    trainer.checkpoint_connector.hpc_save(tmpdir, logger)

    # init new trainer
    new_logger = tutils.get_default_logger(tmpdir, version=logger.version)
    trainer_options['logger'] = new_logger
    trainer_options['callbacks'] = [ModelCheckpoint(dirpath=tmpdir)]
    trainer_options['limit_train_batches'] = 0.5
    trainer_options['limit_val_batches'] = 0.2
    trainer_options['max_epochs'] = 1
    new_trainer = Trainer(**trainer_options)

    class CustomModel(CustomClassificationModelDP):
        def __init__(self):
            super().__init__()
            self.on_train_start_called = False

        # set the epoch start hook so we can predict before the model does the full training
        def on_train_start(self):
            assert self.trainer.current_epoch == real_global_epoch and self.trainer.current_epoch > 0

            # if model and state loaded correctly, predictions will be good even though we
            # haven't trained with the new loaded model
            new_trainer._running_stage = RunningStage.VALIDATING

            dataloader = self.train_dataloader()
            tpipes.run_prediction_eval_model_template(self.trainer.lightning_module, dataloader=dataloader)
            self.on_train_start_called = True

    # new model
    model = CustomModel()

    # fit new model which should load hpc weights
    new_trainer.fit(model, datamodule=dm)
    assert model.on_train_start_called

    # test freeze on gpu
    model.freeze()
    model.unfreeze()
def test_resume_from_checkpoint_epoch_restored(tmpdir):
    """Verify resuming from checkpoint runs the right number of epochs."""
    hparams = EvalModelTemplate.get_default_hparams()

    def _new_model():
        # Create a model that tracks epochs and batches seen
        model = EvalModelTemplate(**hparams)
        model.num_epochs_seen = 0
        model.num_batches_seen = 0
        model.num_on_load_checkpoint_called = 0

        def increment_epoch(self):
            self.num_epochs_seen += 1

        def increment_batch(self, _):
            self.num_batches_seen += 1

        def increment_on_load_checkpoint(self, _):
            self.num_on_load_checkpoint_called += 1

        # Bind methods to keep track of epoch numbers, batch numbers it has seen
        # as well as number of times it has called on_load_checkpoint()
        model.on_epoch_end = types.MethodType(increment_epoch, model)
        model.on_batch_start = types.MethodType(increment_batch, model)
        model.on_load_checkpoint = types.MethodType(increment_on_load_checkpoint, model)
        return model

    model = _new_model()

    trainer_options = dict(
        progress_bar_refresh_rate=0,
        max_epochs=2,
        train_percent_check=0.65,
        val_percent_check=1,
        checkpoint_callback=ModelCheckpoint(tmpdir, save_top_k=-1),
        default_root_dir=tmpdir,
        early_stop_callback=False,
        val_check_interval=1.,
    )

    trainer = Trainer(**trainer_options)

    # fit model
    trainer.fit(model)

    training_batches = trainer.num_training_batches

    assert model.num_epochs_seen == 2
    assert model.num_batches_seen == training_batches * 2
    assert model.num_on_load_checkpoint_called == 0

    # Other checkpoints can be uncommented if/when resuming mid-epoch is supported
    checkpoints = sorted(glob.glob(os.path.join(trainer.checkpoint_callback.dirpath, '*.ckpt')))

    for check in checkpoints:
        next_model = _new_model()
        state = torch.load(check)

        # Resume training
        trainer_options['max_epochs'] = 2
        new_trainer = Trainer(**trainer_options, resume_from_checkpoint=check)
        new_trainer.fit(next_model)
        assert state['global_step'] + next_model.num_batches_seen == training_batches * trainer_options['max_epochs']
        assert next_model.num_on_load_checkpoint_called == 1
def test_test_checkpoint_path(tmpdir, ckpt_path, save_top_k):
    hparams = EvalModelTemplate.get_default_hparams()

    loaded_checkpoint_path = ''

    class TestBestModel(EvalModelTemplate):
        @classmethod
        def load_from_checkpoint(cls, checkpoint_path, *args, **kwargs):
            nonlocal loaded_checkpoint_path
            loaded_checkpoint_path = checkpoint_path
            return super().load_from_checkpoint(checkpoint_path, *args, **kwargs)

    model = TestBestModel(**hparams)
    trainer = Trainer(
        max_epochs=2,
        progress_bar_refresh_rate=0,
        default_root_dir=tmpdir,
        checkpoint_callback=ModelCheckpoint(save_top_k=save_top_k),
    )
    trainer.fit(model)

    if ckpt_path == 'best':
        # ckpt_path is 'best', meaning we load the best weights
        if save_top_k <= 0:
            with pytest.raises(MisconfigurationException, match='.*is not configured to save the best.*'):
                trainer.test(ckpt_path=ckpt_path)
        else:
            trainer.test(ckpt_path=ckpt_path)
            assert loaded_checkpoint_path == trainer.checkpoint_callback.best_model_path
    elif ckpt_path is None:
        # ckpt_path is None, meaning we don't load any checkpoints and
        # use the weights from the end of training
        trainer.test(ckpt_path=ckpt_path)
        assert loaded_checkpoint_path == ''
    else:
        # specific checkpoint, pick one from saved ones
        if save_top_k == 0:
            with pytest.raises(FileNotFoundError):
                trainer.test(ckpt_path='random.ckpt')
        else:
            ckpt_path = str(list((Path(tmpdir) / 'lightning_logs/version_0/checkpoints').iterdir())[0])
            trainer.test(ckpt_path=ckpt_path)
            assert loaded_checkpoint_path == ckpt_path
def test_resume_from_checkpoint_epoch_restored(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
    """Verify resuming from checkpoint runs the right number of epochs."""
    # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir
    monkeypatch.setenv('TORCH_HOME', tmpdir)

    hparams = EvalModelTemplate.get_default_hparams()

    def _new_model():
        # Create a model that tracks epochs and batches seen
        model = EvalModelTemplate(**hparams)
        model.num_epochs_seen = 0
        model.num_batches_seen = 0
        model.num_on_load_checkpoint_called = 0

        def increment_epoch(self):
            self.num_epochs_seen += 1

        def increment_batch(self, _):
            self.num_batches_seen += 1

        def increment_on_load_checkpoint(self, _):
            self.num_on_load_checkpoint_called += 1

        # Bind methods to keep track of epoch numbers, batch numbers it has seen
        # as well as number of times it has called on_load_checkpoint()
        model.on_epoch_end = types.MethodType(increment_epoch, model)
        model.on_batch_start = types.MethodType(increment_batch, model)
        model.on_load_checkpoint = types.MethodType(increment_on_load_checkpoint, model)
        return model

    model = _new_model()

    trainer_options = dict(
        progress_bar_refresh_rate=0,
        max_epochs=2,
        limit_train_batches=0.65,
        limit_val_batches=1,
        checkpoint_callback=ModelCheckpoint(tmpdir, save_top_k=-1),
        default_root_dir=tmpdir,
        early_stop_callback=False,
        val_check_interval=1.,
    )

    trainer = Trainer(**trainer_options)

    # fit model
    trainer.fit(model)

    training_batches = trainer.num_training_batches

    assert model.num_epochs_seen == 2
    assert model.num_batches_seen == training_batches * 2
    assert model.num_on_load_checkpoint_called == 0

    # Other checkpoints can be uncommented if/when resuming mid-epoch is supported
    checkpoints = sorted(glob.glob(os.path.join(trainer.checkpoint_callback.dirpath, '*.ckpt')))
    if url_ckpt:
        # transform local paths into url checkpoints
        ip, port = tmpdir_server
        checkpoints = [f'http://{ip}:{port}/' + os.path.basename(check) for check in checkpoints]

    for check in checkpoints:
        next_model = _new_model()
        state = pl_load(check)

        # Resume training
        trainer_options['max_epochs'] = 2
        new_trainer = Trainer(**trainer_options, resume_from_checkpoint=check)
        new_trainer.fit(next_model)
        assert state['global_step'] + next_model.num_batches_seen == training_batches * trainer_options['max_epochs']
        assert next_model.num_on_load_checkpoint_called == 1
def test_trainer_pickle(tmpdir):
    trainer = Trainer(max_epochs=1, default_root_dir=tmpdir)
    pickle.dumps(trainer)
    cloudpickle.dumps(trainer)
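# A small round-trip sketch of why this test matters: spawn-based accelerators pickle
# the Trainer to ship it to worker processes, so a pickled Trainer should deserialize
# with its settings intact. Assumes only `pickle` and the `Trainer` import used above.
restored = pickle.loads(pickle.dumps(Trainer(max_epochs=1)))
assert restored.max_epochs == 1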
def test_step_with_optimizer_closure_with_different_frequencies(mock_sgd_step, mock_adam_step, tmpdir):
    """Tests that `step` works with optimizer_closure and different accumulated_gradient frequency."""

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.automatic_optimization = False

        def training_step(self, batch, batch_idx):
            # emulate gans training
            opt_gen, opt_dis = self.optimizers()

            # Note: Be careful, don't log on the same key in self.log in both closures
            # as they will be aggregated together on epoch_end

            def compute_loss():
                x = batch[0]
                x = F.dropout(x, 0.1)
                predictions = self(x)
                predictions = F.dropout(predictions, 0.1)
                loss = self.loss(None, predictions)
                return loss

            def gen_closure():
                loss_gen = compute_loss()
                self.log("loss_gen", loss_gen, on_step=True, on_epoch=True)
                self.manual_backward(loss_gen)

            def dis_closure():
                loss_dis = compute_loss()
                self.log("loss_dis", loss_dis, on_step=True, on_epoch=True)
                self.manual_backward(loss_dis)

            # this will accumulate gradients for 2 batches and then call opt_gen.step()
            gen_closure()
            if batch_idx % 2 == 0:
                opt_gen.step(closure=gen_closure, optim="sgd")
                opt_gen.zero_grad()

            # update discriminator every 4 batches
            # therefore, no gradient accumulation for discriminator
            if batch_idx % 4 == 0:
                opt_dis.step(closure=dis_closure)
                opt_dis.zero_grad()

        def configure_optimizers(self):
            optimizer_gen = torch.optim.SGD(self.layer.parameters(), lr=0.1)
            optimizer_dis = torch.optim.Adam(self.layer.parameters(), lr=0.001)
            return [optimizer_gen, optimizer_dis]

    model = TestModel()
    model.val_dataloader = None
    model.training_epoch_end = None

    limit_train_batches = 8
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=limit_train_batches,
        limit_val_batches=2,
        max_epochs=1,
        log_every_n_steps=1,
    )

    trainer.fit(model)
    expected_calls = [call(closure=ANY, optim="sgd") for s in range(4)]
    mock_sgd_step.assert_has_calls(expected_calls)
    expected_calls = [call(closure=ANY) for s in range(2)]
    mock_adam_step.assert_has_calls(expected_calls)
def main(hparams):
    if hparams.logging_location == "s3":
        logging_dir = os.path.join(S3_LIGHTNING_LOGS_DIR, hparams.name)
    else:
        logging_dir = os.path.join(LIGHTNING_LOGS_DIR, hparams.name)

    # main LightningModule
    if hparams.checkpoint_path is not None:
        pretrain_system = PreTrainSystem.load_from_checkpoint(hparams.checkpoint_path)
    else:
        pretrain_system = PreTrainSystem(**vars(hparams))

    pretrain_checkpoints = ModelCheckpoint(
        dirpath=os.path.join(MODEL_CHECKPOINTS_DIR, hparams.version),
        monitor="Val/loss",
        verbose=True,
        mode="min",
        save_top_k=hparams.save_top_k,
    )

    pretrain_early_stopping = EarlyStopping(
        monitor="Val/loss",
        min_delta=0.00,
        patience=hparams.patience,
        verbose=False,
        mode="min",
    )

    gpu_stats = GPUStatsMonitor(temperature=True)
    log_recolored_to_tensorboard = LogPairRecoloringToTensorboard()
    log_hyperparams_to_tensorboard = LogHyperparamsToTensorboard(hp_metric="Test/loss")
    notify = Notify(test_metric_name="Test/loss")

    logger = TensorBoardLogger(
        logging_dir,
        name=hparams.name,
        version=hparams.version,
        log_graph=True,
        default_hp_metric=False,
    )

    trainer = Trainer.from_argparse_args(
        hparams,
        resume_from_checkpoint=hparams.checkpoint_path,
        logger=logger,
        checkpoint_callback=pretrain_checkpoints,
        callbacks=[
            pretrain_early_stopping,
            log_recolored_to_tensorboard,
            log_hyperparams_to_tensorboard,
            gpu_stats,
            notify,
        ],
        profiler="simple",
        benchmark=True,
    )

    datamodule = PreTrainDataModule(**vars(hparams))
    trainer.fit(pretrain_system, datamodule=datamodule)

    # lightning automatically uses the best model checkpoint for testing
    trainer.test(pretrain_system, datamodule=datamodule)

    if hparams.upload_model_to_s3:
        # upload best model to S3
        best_model_path = pretrain_checkpoints.best_model_path
        S3_best_model_path = os.path.join(
            S3_MODEL_CHECKPOINTS_RELATIVE_DIR,
            hparams.name,
            ".".join([hparams.version, best_model_path.split(".")[-1]]),
        )
        upload_to_s3(best_model_path, S3_best_model_path)
def test_multiple_optimizers_manual_no_return(tmpdir, kwargs):
    apex_optimizer_patches = []
    apex_optimizer_steps = []

    class TestModel(ManualOptModel):
        def training_step(self, batch, batch_idx):
            # avoid returning a value
            super().training_step(batch, batch_idx)

        def training_epoch_end(self, outputs):
            # outputs is empty as training_step does not return
            # and it is not automatic optimization
            assert not outputs

        def on_train_start(self):
            if kwargs.get("amp_backend") != "apex":
                return
            # extremely ugly. APEX patches all the native torch optimizers on `_initialize` which we call on
            # `ApexMixedPrecisionPlugin.dispatch`. Additionally, their replacement `new_step` functions are locally
            # defined so can't even patch those, thus we need to create the mock after APEX has been initialized
            nonlocal apex_optimizer_patches, apex_optimizer_steps
            for opt in self.trainer.optimizers:
                # `amp.scale_loss` will also patch the step to avoid it when gradient overflow happens. avoid it
                opt._amp_stash.already_patched = True
                patch = mock.patch.object(opt, "step")
                apex_optimizer_patches.append(patch)
                apex_optimizer_steps.append(patch.start())

        def on_train_end(self):
            if kwargs.get("amp_backend") == "apex":
                for p in apex_optimizer_patches:
                    p.stop()

    model = TestModel()
    model.val_dataloader = None

    limit_train_batches = 2
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=limit_train_batches,
        limit_val_batches=2,
        max_epochs=1,
        log_every_n_steps=1,
        weights_summary=None,
        **kwargs,
    )

    if kwargs.get("amp_backend") == "native":
        # mock the scaler instead of the optimizer step because it can be skipped with NaNs
        scaler_step_patch = mock.patch.object(
            trainer.precision_plugin.scaler, "step", wraps=trainer.precision_plugin.scaler.step
        )
        scaler_step = scaler_step_patch.start()

    with mock.patch.object(Accelerator, "backward", wraps=trainer.accelerator.backward) as bwd_mock:
        trainer.fit(model)
    assert bwd_mock.call_count == limit_train_batches * 3

    if kwargs.get("amp_backend") == "native":
        scaler_step_patch.stop()
        assert scaler_step.call_count == len(model.optimizers()) * limit_train_batches

    if kwargs.get("amp_backend") == "apex":
        assert [s.call_count for s in apex_optimizer_steps] == [len(model.optimizers())] * limit_train_batches
def test_lightning_optimizer_automatic_optimization_optimizer_zero_grad_make_optimizer_step(tmpdir):
    """
    Test that the Lightning optimizer works with optimizer_zero_grad overrides and make_optimizer_step
    in automatic_optimization.
    """
    try:
        with patch("torch.optim.Adam.zero_grad") as adam_zero_grad, \
                patch("torch.optim.SGD.zero_grad") as sgd_zero_grad:

            class TestModel(BoringModel):
                def training_step(self, batch, batch_idx, optimizer_idx=None):
                    output = self.layer(batch)
                    loss = self.loss(batch, output)
                    return {"loss": loss}

                def training_epoch_end(self, outputs):
                    outputs = sum(outputs, [])
                    torch.stack([x["loss"] for x in outputs]).mean()

                def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int):
                    if optimizer_idx == 0:
                        if batch_idx % 2 == 0:
                            optimizer.zero_grad()

                    if optimizer_idx == 1:
                        if batch_idx % 5 == 0:
                            optimizer.zero_grad()

                def optimizer_step(
                    self,
                    epoch,
                    batch_idx,
                    optimizer,
                    optimizer_idx,
                    optimizer_closure,
                    on_tpu,
                    using_native_amp,
                    using_lbfgs,
                ):
                    assert optimizer_closure.__name__ == "train_step_and_backward_closure"
                    if optimizer_idx == 0:
                        optimizer.step(closure=optimizer_closure, make_optimizer_step=batch_idx % 3 == 0)
                        return
                    optimizer.step(closure=optimizer_closure)

                def configure_optimizers(self):
                    optimizer_1 = torch.optim.SGD(self.layer.parameters(), lr=0.1)
                    optimizer_2 = torch.optim.Adam(self.layer.parameters(), lr=0.1)
                    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1)
                    return [optimizer_1, optimizer_2], [lr_scheduler]

            model = TestModel()
            trainer = Trainer(
                default_root_dir=tmpdir,
                limit_train_batches=20,
                limit_val_batches=1,
                max_epochs=1,
                weights_summary=None,
            )
            trainer.fit(model)
            assert adam_zero_grad.call_count == 4
            assert sgd_zero_grad.call_count == 10

    except MisconfigurationException as e:
        assert "When overriding LightningModule `optimizer_zero_grad`, make_optimizer_step is not allowed" in str(e)
def _test_fast_forward_sampler_with_distributed_sampler_and_iterative_dataset(rank, worldsize):
    if worldsize > 1:
        _setup_ddp(rank, worldsize)

    def all_gather(tensor, world_size):
        tensor_list = [torch.zeros_like(tensor, dtype=torch.int64) for _ in range(world_size)]
        torch.distributed.all_gather(tensor_list, tensor)
        return tensor_list

    initial_seed = seed_everything(42)

    generator = torch.Generator()
    generator.manual_seed(initial_seed)

    num_workers = 2
    batch_size = 4
    dataset_length = 60
    num_classes = 10

    labels = np.random.randint(0, num_classes, dataset_length)

    dataset = ClassificationDataset(range(dataset_length), labels)
    dataset = MetaLearningDataset(
        dataset,
        batch_size=batch_size,
        drop_last=True,
        num_workers=num_workers,
        global_rank=rank,
        world_size=worldsize,
        initial_seed=initial_seed,
        debugging=True,
        shuffle=True,
    )
    dataset = CaptureIterableDataset(dataset, initial_seed=initial_seed)
    dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=1, generator=generator)
    Trainer._add_sampler_metadata_collate(dataloader)

    epoch_results = []
    for _ in range(2):
        iter_dataloader = iter(dataloader)
        batches = []
        while True:
            try:
                batches.append(next(iter_dataloader))
            except StopIteration:
                break
        epoch_results.append(batches)
        dataloader.dataset.dataset.current_task_iteration += 1

    assert len(epoch_results) == 2

    assert len(epoch_results[0]) == math.ceil((dataset_length / (num_workers * worldsize)) / batch_size) + 2

    if worldsize == 1:
        assert epoch_results[0][0]["data"]["task_length"] == epoch_results[0][1]["data"]["task_length"]
        assert torch.equal(
            epoch_results[0][0]["data"]["selected_indexes"], epoch_results[0][1]["data"]["selected_indexes"]
        )
        assert 0 in epoch_results[0][2][AutoRestartBatchKeys.PL_SAMPLERS]["iter_sampler"]  # worker id 0
        assert 1 in epoch_results[0][3][AutoRestartBatchKeys.PL_SAMPLERS]["iter_sampler"]  # worker id 1
        assert not torch.equal(epoch_results[0][2]["data"][0], epoch_results[0][3]["data"][0])
    else:
        first_task_metadata = all_gather(epoch_results[0][0]["data"]["task_length"], worldsize)
        second_task_metadata = all_gather(epoch_results[0][1]["data"]["task_length"], worldsize)
        assert torch.equal(first_task_metadata[0], first_task_metadata[1])
        assert torch.equal(second_task_metadata[0], second_task_metadata[1])
        assert torch.equal(first_task_metadata[0], second_task_metadata[1])

        first_batch_list = all_gather(epoch_results[0][2]["data"][0], worldsize)
        assert not torch.equal(first_batch_list[0], first_batch_list[1])
        second_batch_list = all_gather(epoch_results[0][3]["data"][0], worldsize)
        assert not torch.equal(second_batch_list[0], second_batch_list[1])

    # restarting on epoch 0 / real batch 2
    state_dict = {"iter_sampler": {}}
    for batch in epoch_results[0][2:4]:
        batch, _state_dict = CaptureIterableDataset.extract_samplers_state_dict_from_batch(batch)
        for k, v in _state_dict[0].items():
            state_dict[k].update(v)

    dataset = ClassificationDataset(range(dataset_length), labels)
    dataset = MetaLearningDataset(
        dataset,
        batch_size=batch_size,
        drop_last=True,
        num_workers=num_workers,
        global_rank=rank,
        world_size=worldsize,
        initial_seed=initial_seed,
        debugging=True,
        shuffle=True,
    )

    dataset = CaptureIterableDataset(dataset, initial_seed=initial_seed)
    dataset.load_state_dict(state_dict)
    dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=1, generator=generator)
    Trainer._add_sampler_metadata_collate(dataloader)

    epoch_results_restart = []
    for _ in range(2):
        iter_dataloader = iter(dataloader)
        batches = []
        while True:
            try:
                batches.append(next(iter_dataloader))
            except StopIteration:
                break
        epoch_results_restart.append(batches)
        dataloader.dataset.dataset.increment_iteration()
        dataloader.dataset.reset_on_epoch()

    assert len(epoch_results_restart[0]) + 2 == len(epoch_results[0])
    epoch_tensors = [e["data"][0] for e in epoch_results[0][4:]]
    epoch_tensors_restart = [e["data"][0] for e in epoch_results_restart[0][2:]]

    for t, tr in zip(epoch_tensors, epoch_tensors_restart):
        assert torch.equal(t, tr)

    epoch_tensors = [e["data"][0] for e in epoch_results[1][2:]]
    epoch_tensors_restart = [e["data"][0] for e in epoch_results_restart[1][2:]]

    for t, tr in zip(epoch_tensors, epoch_tensors_restart):
        assert torch.equal(t, tr)
def train(config: DictConfig, resume_from_checkpoint: str = None):
    filter_warnings()
    print_config(config)
    seed_everything(config.seed)

    known_models = {
        "token": get_token_based,
        "vuldeepecker": get_VDP,
        "vgdetector": get_VGD,
        "sysevr": get_SYS,
        "mulvuldeepecker": get_MULVDP,
        "code2seq": get_C2S,
        "code2vec": get_C2V,
    }
    vocab = {
        "token": Vocabulary_token,
        "vuldeepecker": Vocabulary_token,
        "vgdetector": Vocabulary_token,
        "sysevr": Vocabulary_token,
        "mulvuldeepecker": Vocabulary_token,
        "code2seq": Vocabulary_c2s,
        "code2vec": Vocabulary_c2s,
    }
    if config.name not in known_models:
        print(f"Unknown model: {config.name}, try one of {known_models.keys()}")
        return

    if os.path.exists(join(config.data_folder, config.name, config.dataset.name, "vocab.pkl")):
        vocabulary = vocab[config.name].load_vocabulary(
            join(config.data_folder, config.name, config.dataset.name, "vocab.pkl"))
    else:
        vocabulary = None
    model, data_module = known_models[config.name](config, vocabulary)

    # define logger
    # wandb logger
    # wandb_logger = WandbLogger(project=f"{config.name}-{config.dataset.name}",
    #                            log_model=True,
    #                            offline=config.log_offline)
    # wandb_logger.watch(model)
    # checkpoint_callback = ModelCheckpoint(
    #     dirpath=wandb_logger.experiment.dir,
    #     filename="{epoch:02d}-{val_loss:.4f}",
    #     period=config.save_every_epoch,
    #     save_top_k=-1,
    # )
    # upload_checkpoint_callback = UploadCheckpointCallback(wandb_logger.experiment.dir)

    # tensorboard logger
    tensorlogger = TensorBoardLogger(join("ts_logger", config.name), config.dataset.name)

    # define model checkpoint callback
    checkpoint_callback = ModelCheckpoint(
        dirpath=join(tensorlogger.log_dir, "checkpoints"),
        monitor="val_loss",
        filename="{epoch:02d}-{val_loss:.4f}",
        period=config.save_every_epoch,
        save_top_k=3,
    )
    upload_checkpoint_callback = UploadCheckpointCallback(join(tensorlogger.log_dir, "checkpoints"))

    # define early stopping callback
    early_stopping_callback = EarlyStopping(
        patience=config.hyper_parameters.patience,
        monitor="val_loss",
        verbose=True,
        mode="min",
    )

    # define callback for printing intermediate result
    print_epoch_result_callback = PrintEpochResultCallback("train", "val")
    collect_test_res_callback = CollectTestResCallback(config)

    # use gpu if it exists
    gpu = 1 if torch.cuda.is_available() else None

    # define learning rate logger
    lr_logger = LearningRateMonitor("step")

    trainer = Trainer(
        max_epochs=config.hyper_parameters.n_epochs,
        gradient_clip_val=config.hyper_parameters.clip_norm,
        deterministic=True,
        check_val_every_n_epoch=config.val_every_epoch,
        log_every_n_steps=config.log_every_epoch,
        logger=[tensorlogger],
        reload_dataloaders_every_epoch=config.hyper_parameters.reload_dataloader,
        gpus=gpu,
        progress_bar_refresh_rate=config.progress_bar_refresh_rate,
        callbacks=[
            lr_logger,
            early_stopping_callback,
            checkpoint_callback,
            print_epoch_result_callback,
            upload_checkpoint_callback,
            collect_test_res_callback,
        ],
        resume_from_checkpoint=resume_from_checkpoint,
    )

    trainer.fit(model=model, datamodule=data_module)
    trainer.test()
def test_can_prepare_data(tmpdir):
    dm = TrialMNISTDataModule()
    trainer = Trainer()
    trainer.datamodule = dm

    # 1 no DM
    # prepare_data_per_node = True
    # local rank = 0 (True)
    trainer.prepare_data_per_node = True
    trainer.local_rank = 0
    assert trainer.data_connector.can_prepare_data()

    # local rank = 1 (False)
    trainer.local_rank = 1
    assert not trainer.data_connector.can_prepare_data()

    # prepare_data_per_node = False (prepare across all nodes)
    # global rank = 0 (True)
    trainer.prepare_data_per_node = False
    trainer.node_rank = 0
    trainer.local_rank = 0
    assert trainer.data_connector.can_prepare_data()

    # global rank = 1 (False)
    trainer.node_rank = 1
    trainer.local_rank = 0
    assert not trainer.data_connector.can_prepare_data()
    trainer.node_rank = 0
    trainer.local_rank = 1
    assert not trainer.data_connector.can_prepare_data()

    # 2 dm
    # prepare_data_per_node = True
    # local rank = 0 (True)
    trainer.prepare_data_per_node = True
    trainer.local_rank = 0

    # is_overridden prepare data = True
    # has been called
    # False
    dm._has_prepared_data = True
    assert not trainer.data_connector.can_prepare_data()

    # has not been called
    # True
    dm._has_prepared_data = False
    assert trainer.data_connector.can_prepare_data()

    # is_overridden prepare data = False
    # True
    dm.prepare_data = None
    assert trainer.data_connector.can_prepare_data()
    file.close()

    if _hyperparams.k_fold_validation:
        # materialise the range so the leave-one-out slices can be concatenated
        all_subjects = list(range(len(keys)))
        for leave_one_out_idx in all_subjects:
            _train_subjects.append(all_subjects[:leave_one_out_idx] + all_subjects[leave_one_out_idx + 1:])
            _valid_subjects.append([leave_one_out_idx])
            # Note that this is a hack and should not be used to get results for papers
            _test_subjects.append([leave_one_out_idx])
    else:
        _train_subjects.append(keys[1:])
        _valid_subjects.append([keys[0]])
        _test_subjects.append([keys[0]])

    for fold, (train_s, valid_s, test_s) in enumerate(zip(_train_subjects, _valid_subjects, _test_subjects)):
        complete_path = os.path.abspath(os.path.join(_hyperparams.save_dir, "fold_{}/".format(fold)))

        _model = TrainRTGENE(hparams=_hyperparams,
                             train_subjects=train_s,
                             validate_subjects=valid_s,
                             test_subjects=test_s)

        # save all models
        checkpoint_callback = ModelCheckpoint(filepath=os.path.join(complete_path, "{epoch}-{val_loss:.3f}"),
                                              monitor='val_loss',
                                              mode='min',
                                              verbose=True,
                                              save_top_k=-1 if not _hyperparams.augment else 5)
        early_stop_callback = EarlyStopping(monitor='val_loss',
                                            min_delta=0.00,
                                            verbose=True,
                                            patience=20 if _hyperparams.augment else 2,
                                            mode='min')

        # start training
        trainer = Trainer(gpus=_hyperparams.gpu,
                          checkpoint_callback=checkpoint_callback,
                          early_stop_callback=early_stop_callback,
                          progress_bar_refresh_rate=1,
                          min_epochs=64 if _hyperparams.augment else 3,
                          max_epochs=128 if _hyperparams.augment else 5,
                          accumulate_grad_batches=_hyperparams.accumulate_grad_batches)
        trainer.fit(_model)
        trainer.test()
def test_trainer_callback_system(tmpdir):
    """Test the callback system."""

    hparams = EvalModelTemplate.get_default_hparams()
    model = EvalModelTemplate(**hparams)

    def _check_args(trainer, pl_module):
        assert isinstance(trainer, Trainer)
        assert isinstance(pl_module, LightningModule)

    class TestCallback(Callback):
        def __init__(self):
            super().__init__()
            self.setup_called = False
            self.teardown_called = False
            self.on_init_start_called = False
            self.on_init_end_called = False
            self.on_fit_start_called = False
            self.on_fit_end_called = False
            self.on_sanity_check_start_called = False
            self.on_sanity_check_end_called = False
            self.on_epoch_start_called = False
            self.on_epoch_end_called = False
            self.on_batch_start_called = False
            self.on_batch_end_called = False
            self.on_validation_batch_start_called = False
            self.on_validation_batch_end_called = False
            self.on_test_batch_start_called = False
            self.on_test_batch_end_called = False
            self.on_train_start_called = False
            self.on_train_end_called = False
            self.on_validation_start_called = False
            self.on_validation_end_called = False
            self.on_test_start_called = False
            self.on_test_end_called = False

        def setup(self, trainer, step: str):
            assert isinstance(trainer, Trainer)
            self.setup_called = True

        def teardown(self, trainer, step: str):
            assert isinstance(trainer, Trainer)
            self.teardown_called = True

        def on_init_start(self, trainer):
            assert isinstance(trainer, Trainer)
            self.on_init_start_called = True

        def on_init_end(self, trainer):
            assert isinstance(trainer, Trainer)
            self.on_init_end_called = True

        def on_fit_start(self, trainer):
            assert isinstance(trainer, Trainer)
            self.on_fit_start_called = True

        def on_fit_end(self, trainer):
            assert isinstance(trainer, Trainer)
            self.on_fit_end_called = True

        def on_sanity_check_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_sanity_check_start_called = True

        def on_sanity_check_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_sanity_check_end_called = True

        def on_epoch_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_epoch_start_called = True

        def on_epoch_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_epoch_end_called = True

        def on_batch_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_batch_start_called = True

        def on_batch_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_batch_end_called = True

        def on_validation_batch_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_validation_batch_start_called = True

        def on_validation_batch_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_validation_batch_end_called = True

        def on_test_batch_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_test_batch_start_called = True

        def on_test_batch_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_test_batch_end_called = True

        def on_train_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_train_start_called = True

        def on_train_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_train_end_called = True

        def on_validation_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_validation_start_called = True

        def on_validation_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_validation_end_called = True

        def on_test_start(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_test_start_called = True

        def on_test_end(self, trainer, pl_module):
            _check_args(trainer, pl_module)
            self.on_test_end_called = True

    test_callback = TestCallback()

    trainer_options = dict(
        callbacks=[test_callback],
        max_epochs=1,
        limit_val_batches=0.1,
        limit_train_batches=0.2,
        progress_bar_refresh_rate=0,
    )

    assert not test_callback.setup_called
    assert not test_callback.teardown_called
    assert not test_callback.on_init_start_called
    assert not test_callback.on_init_end_called
    assert not test_callback.on_fit_start_called
    assert not test_callback.on_fit_end_called
    assert not test_callback.on_sanity_check_start_called
    assert not test_callback.on_sanity_check_end_called
    assert not test_callback.on_epoch_start_called
    assert not test_callback.on_epoch_end_called
    assert not test_callback.on_batch_start_called
    assert not test_callback.on_batch_end_called
    assert not test_callback.on_validation_batch_start_called
    assert not test_callback.on_validation_batch_end_called
    assert not test_callback.on_test_batch_start_called
    assert not test_callback.on_test_batch_end_called
    assert not test_callback.on_train_start_called
    assert not test_callback.on_train_end_called
    assert not test_callback.on_validation_start_called
    assert not test_callback.on_validation_end_called
    assert not test_callback.on_test_start_called
    assert not test_callback.on_test_end_called

    # fit model
    trainer = Trainer(**trainer_options)

    assert trainer.callbacks[0] == test_callback
    assert test_callback.on_init_start_called
    assert test_callback.on_init_end_called
    assert not test_callback.setup_called
    assert not test_callback.teardown_called
    assert not test_callback.on_fit_start_called
    assert not test_callback.on_fit_end_called
    assert not test_callback.on_sanity_check_start_called
    assert not test_callback.on_sanity_check_end_called
    assert not test_callback.on_epoch_start_called
    assert not test_callback.on_epoch_end_called
    assert not test_callback.on_batch_start_called
    assert not test_callback.on_batch_end_called
    assert not test_callback.on_validation_batch_start_called
    assert not test_callback.on_validation_batch_end_called
    assert not test_callback.on_test_batch_start_called
    assert not test_callback.on_test_batch_end_called
    assert not test_callback.on_train_start_called
    assert not test_callback.on_train_end_called
    assert not test_callback.on_validation_start_called
    assert not test_callback.on_validation_end_called
    assert not test_callback.on_test_start_called
    assert not test_callback.on_test_end_called

    trainer.fit(model)

    assert test_callback.setup_called
    assert test_callback.teardown_called
    assert test_callback.on_init_start_called
    assert test_callback.on_init_end_called
    assert test_callback.on_fit_start_called
    assert test_callback.on_fit_end_called
    assert test_callback.on_sanity_check_start_called
    assert test_callback.on_sanity_check_end_called
    assert test_callback.on_epoch_start_called
    assert test_callback.on_epoch_end_called
    assert test_callback.on_batch_start_called
    assert test_callback.on_batch_end_called
    assert test_callback.on_validation_batch_start_called
    assert test_callback.on_validation_batch_end_called
    assert test_callback.on_train_start_called
    assert test_callback.on_train_end_called
    assert test_callback.on_validation_start_called
    assert test_callback.on_validation_end_called
    assert not test_callback.on_test_batch_start_called
    assert not test_callback.on_test_batch_end_called
    assert not test_callback.on_test_start_called
    assert not test_callback.on_test_end_called

    # reset setup teardown callback
    test_callback.teardown_called = False
    test_callback.setup_called = False

    test_callback = TestCallback()
    trainer_options.update(callbacks=[test_callback])
    trainer = Trainer(**trainer_options)
    trainer.test(model)

    assert test_callback.setup_called
    assert test_callback.teardown_called
    assert test_callback.on_test_batch_start_called
    assert test_callback.on_test_batch_end_called
    assert test_callback.on_test_start_called
    assert test_callback.on_test_end_called
    assert not test_callback.on_validation_start_called
    assert not test_callback.on_validation_end_called
    assert not test_callback.on_validation_batch_end_called
    assert not test_callback.on_validation_batch_start_called
def test_manual_optimization_and_accumulated_gradient(tmpdir):
    """Verify that in `automatic_optimization=False`, step is called only when we shouldn't accumulate."""
    seed_everything(234)

    class ExtendedModel(BoringModel):
        count = 1
        called = collections.defaultdict(int)
        detach = False

        def __init__(self):
            super().__init__()
            self.automatic_optimization = False

        @property
        def should_update(self):
            return self.count % 2 == 0

        @property
        def should_have_updated(self):
            return self.count % 4 == 0

        @property
        def has_gradient(self):
            return self.layer.weight.grad is not None

        def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
            self.called["on_train_batch_start"] += 1
            self.weight_before = self.layer.weight.clone()

        def training_step(self, batch, batch_idx):
            self.called["training_step"] += 1
            opt = self.optimizers()
            output = self.layer(batch)

            loss = self.loss(batch, output)
            loss /= loss.clone().detach()
            loss *= 0.1

            if self.should_update:
                self.manual_backward(loss)
                if self.should_have_updated:
                    opt.step()
                    opt.zero_grad()

            return loss.detach() if self.detach else loss

        def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx):
            self.called["on_train_batch_end"] += 1
            after_before = self.layer.weight.clone()
            if self.should_update and self.should_have_updated:
                assert not torch.equal(self.weight_before, after_before), self.count
                assert torch.all(self.layer.weight.grad == 0)
            else:
                assert torch.equal(self.weight_before, after_before)
                if self.count > 1:
                    if self.count % 4 == 1:
                        assert torch.all(self.layer.weight.grad == 0)
                    else:
                        assert torch.sum(self.layer.weight.grad) != 0
            self.count += 1

        def on_train_epoch_end(self, *_, **__):
            assert self.called["training_step"] == 20
            assert self.called["on_train_batch_start"] == 20
            assert self.called["on_train_batch_end"] == 20

    model = ExtendedModel()
    model.training_step_end = None
    model.training_epoch_end = None

    trainer = Trainer(
        max_epochs=1,
        default_root_dir=tmpdir,
        limit_train_batches=20,
        limit_test_batches=0,
        limit_val_batches=0,
        precision=16,
        amp_backend="native",
        gpus=1,
    )
    trainer.fit(model)
def create_lightning_trainer(container: LightningContainer,
                             resume_from_checkpoint: Optional[Path] = None,
                             num_nodes: int = 1,
                             **kwargs: Dict[str, Any]) -> Tuple[Trainer, Optional[StoringLogger]]:
    """
    Creates a Pytorch Lightning Trainer object for the given model configuration. It creates checkpoint handlers
    and loggers, including a diagnostic logger for use in unit tests, which is returned as the second return value.
    :param container: The container with model and data.
    :param resume_from_checkpoint: If provided, training resumes from this checkpoint.
    :param num_nodes: The number of nodes to use in distributed training.
    :param kwargs: Any additional keyword arguments will be passed to the constructor of Trainer.
    :return: A tuple [Trainer object, diagnostic logger]
    """
    # For now, stick with the legacy behaviour of always saving only the last epoch checkpoint. For large segmentation
    # models, this still appears to be the best way of choosing them because validation loss on the relatively small
    # training patches is not stable enough. Going by the validation loss somehow works for the Prostate model, but
    # not for the HeadAndNeck model.
    best_checkpoint_callback = ModelCheckpoint(
        dirpath=str(container.checkpoint_folder),
        # filename=BEST_CHECKPOINT_FILE_NAME,
        # monitor=f"{VALIDATION_PREFIX}{MetricType.LOSS.value}",
        # save_top_k=1,
        save_last=True)
    # Recovery checkpoints: {epoch} will turn into a string like "epoch=1"
    # Store 1 recovery checkpoint every recovery_checkpoint_save_interval epochs, keep the last
    # recovery_checkpoints_save_last_k.
    recovery_checkpoint_callback = InnerEyeRecoveryCheckpointCallback(container)
    num_gpus = container.num_gpus_per_node
    effective_num_gpus = num_gpus * num_nodes
    # Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of GPU
    # memory). For unit tests, only "ddp_spawn" works
    accelerator = "ddp" if effective_num_gpus > 1 else None
    if effective_num_gpus > 1:
        # Initialize the DDP plugin with find_unused_parameters=False by default. If True (default), it prints out
        # lengthy warnings about the performance impact of find_unused_parameters
        plugins = [InnerEyeDDPPlugin(num_nodes=num_nodes,
                                     sync_batchnorm=True,
                                     find_unused_parameters=container.pl_find_unused_parameters)]
    else:
        plugins = []
    logging.info(f"Using {num_gpus} GPUs per node with accelerator '{accelerator}'")
    tensorboard_logger = TensorBoardLogger(save_dir=str(container.logs_folder), name="Lightning", version="")
    loggers = [tensorboard_logger, AzureMLLogger()]
    storing_logger: Optional[StoringLogger]
    if isinstance(container, InnerEyeContainer):
        storing_logger = StoringLogger()
        loggers.append(storing_logger)
    else:
        storing_logger = None
    # Use 32bit precision when running on CPU. Otherwise, make it depend on use_mixed_precision flag.
    precision = 32 if num_gpus == 0 else 16 if container.use_mixed_precision else 32
    # The next two flags control the settings in torch.backends.cudnn.deterministic and torch.backends.cudnn.benchmark
    # https://pytorch.org/docs/stable/notes/randomness.html
    # For the classification models, we observed only a small performance deterioration (increase of 10sec on a total
    # training time of 22min) when switching to deterministic.
    if container.pl_deterministic:
        deterministic = True
        benchmark = False
    else:
        deterministic = False
        benchmark = True
    # If the user provides additional callbacks via get_trainer_arguments (for custom containers)
    callbacks = [best_checkpoint_callback, recovery_checkpoint_callback]
    if "callbacks" in kwargs:
        callbacks.append(kwargs.pop("callbacks"))  # type: ignore
    is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
    progress_bar_refresh_rate = container.pl_progress_bar_refresh_rate
    if progress_bar_refresh_rate is None and is_azureml_run:
        # When running in AzureML, the default progress bar clutters the output files with thousands of lines.
        progress_bar_refresh_rate = 50
        logging.info(f"The progress bar refresh rate is not set. Using a default of {progress_bar_refresh_rate}. "
                     f"To change, modify the pl_progress_bar_refresh_rate field of the container.")
    # Read out additional model-specific args here.
    # We probably want to keep essential ones like numgpu and logging.
    trainer = Trainer(default_root_dir=str(container.outputs_folder),
                      deterministic=deterministic,
                      benchmark=benchmark,
                      accelerator=accelerator,
                      max_epochs=container.num_epochs,
                      num_sanity_val_steps=container.pl_num_sanity_val_steps,
                      callbacks=callbacks,
                      logger=loggers,
                      progress_bar_refresh_rate=progress_bar_refresh_rate,
                      num_nodes=num_nodes,
                      gpus=num_gpus,
                      precision=precision,
                      sync_batchnorm=True,
                      terminate_on_nan=container.detect_anomaly,
                      resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None,
                      plugins=plugins,
                      **kwargs)
    return trainer, storing_logger
def test_multiple_optimizers_step(tmpdir):
    """Tests that `step` works with several optimizers."""

    class TestModel(ManualOptModel):

        called = False

        def on_before_optimizer_step(self, *args):
            self.called = True
            norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2)
            if not (torch.isinf(norm) or torch.isnan(norm)):
                assert norm.item() < 100, norm.item()

        def training_step(self, batch, batch_idx):
            # manual
            opt_a, opt_b = self.optimizers()
            x = batch[0]

            loss_1 = self(x)
            loss_1 = self.loss(loss_1, loss_1)

            # make sure there are no grads
            if self.layer.weight.grad is not None:
                assert torch.all(self.layer.weight.grad == 0)

            self.manual_backward(loss_1)
            opt_a.step()

            # fake discriminator
            loss_2 = self(x)
            loss_2 = self.loss(loss_2, loss_2)

            # ensure we forward the correct params to the optimizer
            # without retain_graph we can't do multiple backward passes
            self.manual_backward(loss_2, retain_graph=True)
            self.manual_backward(loss_2, retain_graph=True)

            assert self.layer.weight.grad is not None
            opt_b.step()
            opt_b.zero_grad()

            return {"loss1": loss_1.detach(), "loss2": loss_2.detach()}

        def training_epoch_end(self, outputs) -> None:
            # outputs should be an array with an entry per optimizer
            assert len(outputs) == 2

    model = TestModel()
    model.val_dataloader = None

    limit_train_batches = 2
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=limit_train_batches,
        limit_val_batches=2,
        max_epochs=1,
        log_every_n_steps=1,
        weights_summary=None,
        precision=16,
        amp_backend="native",
        gpus=1,
    )

    with mock.patch.object(Accelerator, "backward", wraps=trainer.accelerator.backward) as bwd_mock:
        trainer.fit(model)
    assert bwd_mock.call_count == limit_train_batches * 3
    assert model.called
def test_val_step_only_step_metrics(tmpdir):
    """Make sure the logged + pbar metrics are allocated accordingly at every step when requested."""
    # enable internal debugging actions
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DeterministicModel()
    model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks
    model.training_step_end = None
    model.training_epoch_end = None
    model.validation_step = model.validation_step_result_only_step_metrics
    model.validation_step_end = None
    model.validation_epoch_end = None

    batches = 3
    epochs = 3
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=epochs,
        row_log_interval=1,
        limit_train_batches=batches,
        limit_val_batches=batches,
        weights_summary=None,
    )
    trainer.fit(model)

    # make sure correct steps were called
    assert model.validation_step_called
    assert not model.validation_step_end_called
    assert not model.validation_epoch_end_called

    # no early stopping
    assert len(trainer.dev_debugger.early_stopping_history) == 0

    # make sure we logged the exact number of metrics
    assert len(trainer.dev_debugger.logged_metrics) == epochs * batches + (epochs)
    assert len(trainer.dev_debugger.pbar_added_metrics) == epochs * batches + (epochs)

    # make sure we logged the correct epoch metrics
    total_empty_epoch_metrics = 0
    epoch = 0
    for metric in trainer.dev_debugger.logged_metrics:
        if 'epoch' in metric:
            epoch += 1
        if len(metric) > 2:
            assert 'no_val_no_pbar' not in metric
            assert 'val_step_pbar_acc' not in metric
            assert metric[f'val_step_log_acc/epoch_{epoch}']
            assert metric[f'val_step_log_pbar_acc/epoch_{epoch}']
        else:
            total_empty_epoch_metrics += 1

    assert total_empty_epoch_metrics == 3

    # make sure we logged the correct epoch pbar metrics
    total_empty_epoch_metrics = 0
    for metric in trainer.dev_debugger.pbar_added_metrics:
        if 'epoch' in metric:
            epoch += 1
        if len(metric) > 2:
            assert 'no_val_no_pbar' not in metric
            assert 'val_step_log_acc' not in metric
            assert metric['val_step_log_pbar_acc']
            assert metric['val_step_pbar_acc']
        else:
            total_empty_epoch_metrics += 1

    assert total_empty_epoch_metrics == 3

    # only 1 checkpoint expected since values didn't change after that
    assert len(trainer.dev_debugger.checkpoint_callback_history) == 1

    # make sure the last known metric is correct
    assert trainer.logger_connector.callback_metrics['checkpoint_on'] == 189
def test_step_with_optimizer_closure(tmpdir):
    """Tests that `step` works with optimizer_closure."""

    class TestModel(BoringModel):

        _losses = []

        def __init__(self):
            super().__init__()
            self.automatic_optimization = False

        def training_step(self, batch, batch_idx):
            # make sure there are no grads
            if self.layer.weight.grad is not None:
                assert torch.all(self.layer.weight.grad == 0)

            opt = self.optimizers()

            def compute_loss():
                x = batch[0]
                x = F.dropout(x, 0.1)
                predictions = self(x)
                predictions = F.dropout(predictions, 0.1)
                loss = self.loss(None, predictions)
                return loss

            def optimizer_closure():
                # emulate bayesian optimization.
                num_backward = 2
                losses = []
                for backward_idx in range(num_backward):
                    loss = compute_loss()
                    losses.append(loss)
                    retain_graph = (num_backward - 1) != backward_idx
                    self.manual_backward(loss, retain_graph=retain_graph)
                # emulate MC dropout training
                loss = torch.stack(losses).mean()
                self._losses.append(loss)
                self.log("train_loss", loss, on_step=True, prog_bar=True, on_epoch=True)
                assert losses[0] != losses[1]

            weight_before = self.layer.weight.clone()

            opt.step(closure=optimizer_closure)
            opt.zero_grad()

            weight_after = self.layer.weight.clone()
            assert not torch.equal(weight_before, weight_after)

        def configure_optimizers(self):
            return torch.optim.SGD(self.layer.parameters(), lr=0.1)

    model = TestModel()
    model.val_dataloader = None
    model.training_epoch_end = None

    limit_train_batches = 2
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=limit_train_batches,
        limit_val_batches=2,
        max_epochs=1,
        log_every_n_steps=1,
    )

    with mock.patch.object(Accelerator, "backward", wraps=trainer.accelerator.backward) as bwd_mock:
        trainer.fit(model)
    assert bwd_mock.call_count == limit_train_batches * 2
    assert trainer.progress_bar_metrics["train_loss_step"] == model._losses[-1]
    assert trainer.progress_bar_metrics["train_loss_epoch"] == torch.stack(model._losses).mean()
def cli_main():
    parser = ArgumentParser()
    parser.add_argument("--DATA_PATH", type=str, help="path to folders with images")
    parser.add_argument("--MODEL_PATH", default=None, type=str, help="path to model checkpoint")
    parser.add_argument("--batch_size", default=128, type=int, help="batch size for SSL")
    parser.add_argument("--image_size", default=256, type=int, help="image size for SSL")
    parser.add_argument("--num_workers", default=1, type=int, help="number of CPU cores to use for data processing")
    parser.add_argument("--image_embedding_size", default=128, type=int, help="size of image representation of SIMCLR")
    parser.add_argument("--epochs", default=200, type=int, help="number of epochs to train model")
    parser.add_argument("--lr", default=1e-3, type=float, help="learning rate for training model")
    parser.add_argument(
        "--patience",
        default=-1,
        type=int,
        help="automatically cuts off training if validation does not drop for (patience) epochs. "
             "Leave blank to have no validation based early stopping.")
    parser.add_argument("--val_split", default=0.2, type=float, help="percent in validation data")
    parser.add_argument(
        "--pretrain_encoder",
        default=False,
        type=bool,
        help="initialize resnet encoder with pretrained imagenet weights. "
             "Cannot be true if passing previous SSL model checkpoint.")
    parser.add_argument(
        "--withold_train_percent",
        default=0,
        type=float,
        help="decimal from 0-1 representing how much of the training data to withold during SSL training")
    parser.add_argument("--version", default="0", type=str, help="version to name checkpoint for saving")
    parser.add_argument("--gpus", default=1, type=int, help="number of gpus to use for training")
    parser.add_argument("--num_workers", default=0, type=int, help="number of workers to use to fetch data")
    args = parser.parse_args()

    URL = args.DATA_PATH
    batch_size = args.batch_size
    image_size = args.image_size
    num_workers = args.num_workers
    embedding_size = args.image_embedding_size
    epochs = args.epochs
    lr = args.lr
    patience = args.patience
    val_split = args.val_split
    pretrain = args.pretrain_encoder
    withold_train_percent = args.withold_train_percent
    version = args.version
    model_checkpoint = args.MODEL_PATH
    gpus = args.gpus
    num_workers = args.num_workers

    train_transform = SimCLRTrainDataTransform(256)
    val_transform = SimCLREvalDataTransform(256)

    dm = ImageDataModule(URL,
                         train_transform=train_transform,
                         val_transform=val_transform,
                         val_split=val_split,
                         num_workers=num_workers)
    dm.setup()

    # init model with batch size, num_samples (len of data), epochs to train, and autofinds learning rate
    model = SimCLR(arch='resnet18',
                   batch_size=batch_size,
                   num_samples=dm.num_samples,
                   gpus=gpus,
                   dataset='None',
                   max_epochs=epochs,
                   learning_rate=lr)

    # model.encoder = resnet18(pretrained=pretrain, first_conv=model.first_conv, maxpool1=model.maxpool1,
    #                          return_all_feature_maps=False)
    model.projection = Projection(input_dim=512, hidden_dim=256, output_dim=embedding_size)

    # overrides
    if patience > 0:
        cb = EarlyStopping('val_loss', patience=patience)
        trainer = Trainer(gpus=gpus, max_epochs=epochs, callbacks=[cb], progress_bar_refresh_rate=5)
    else:
        trainer = Trainer(gpus=gpus, max_epochs=epochs, progress_bar_refresh_rate=5)

    if model_checkpoint is not None:
        model.load_state_dict(torch.load(model_checkpoint))
        print('Successfully loaded your checkpoint. Keep in mind that this does not preserve the '
              'previous trainer states, only the model weights')

    model.cuda()
    print('Model Initialized')

    trainer.fit(model, dm)

    Path(f"./models/SSL/SIMCLR_SSL_{version}").mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), f"./models/SSL/SIMCLR_SSL_{version}/SIMCLR_SSL_{version}.pt")
def test_exception_when_no_tpu_found(tmpdir):
    """Test if exception is thrown when xla devices are not available."""
    with pytest.raises(MisconfigurationException, match='No TPU devices were found.'):
        Trainer(tpu_cores=8)