def test_save_best_model_by_val_score(dirname, capsys):
    trainer = Engine(lambda e, b: None)
    evaluator = Engine(lambda e, b: None)
    model = DummyModel()

    acc_scores = [0.1, 0.2, 0.3, 0.4, 0.3, 0.5, 0.6, 0.61, 0.7, 0.5]

    @trainer.on(Events.EPOCH_COMPLETED)
    def validate(engine):
        evaluator.run([0, ])

    @evaluator.on(Events.EPOCH_COMPLETED)
    def set_eval_metric(engine):
        engine.state.metrics = {"acc": acc_scores[trainer.state.epoch - 1]}

    save_best_model_by_val_score(dirname, evaluator, model, metric_name="acc", n_saved=2, trainer=trainer)

    data = [0, ]
    trainer.run(data, max_epochs=len(acc_scores))

    assert set(os.listdir(dirname)) == set(["best_model_8_val_acc=0.6100.pt", "best_model_9_val_acc=0.7000.pt"])
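# The snippet below is a minimal, self-contained sketch (not taken from the sources in this collection) of how
# `ignite.contrib.engines.common.save_best_model_by_val_score` is typically wired up: after each evaluator run
# the handler reads `evaluator.state.metrics[metric_name]` and keeps the `n_saved` best checkpoints under names
# like "best_model_8_val_acc=0.6100.pt" (see the assertion in the test above). The model, the dummy data and the
# output directory here are placeholders.
import torch
import torch.nn as nn
from ignite.engine import Engine, Events, create_supervised_evaluator
from ignite.metrics import Accuracy
from ignite.contrib.engines import common

model = nn.Linear(10, 2)                      # placeholder model
trainer = Engine(lambda engine, batch: None)  # placeholder training step
evaluator = create_supervised_evaluator(model, metrics={"acc": Accuracy()})
val_loader = [(torch.randn(4, 10), torch.randint(0, 2, (4,)))]  # dummy validation data

@trainer.on(Events.EPOCH_COMPLETED)
def _run_validation(engine):
    evaluator.run(val_loader)

# Keep the 2 best checkpoints according to validation accuracy:
common.save_best_model_by_val_score("/tmp/checkpoints", evaluator, model,
                                    metric_name="acc", n_saved=2, trainer=trainer)

trainer.run([0], max_epochs=3)  # after each epoch the evaluator runs and the best checkpoints are kept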
def training(config, local_rank=None, with_mlflow_logging=False, with_plx_logging=False):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
        "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(model, optimizer,
                                      opt_level=getattr(config, "fp16_opt_level", "O2"),
                                      num_losses=1)
    model = DDP(model, delay_allreduce=True)
    criterion = config.criterion.to(device)

    prepare_batch = getattr(config, "prepare_batch", _prepare_batch)
    non_blocking = getattr(config, "non_blocking", True)

    # Setup trainer
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform", lambda x: x)

    def train_update_function(engine, batch):
        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y) / accumulation_steps

        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return {
            'supervised batch loss': loss.item(),
        }

    trainer = Engine(train_update_function)

    common.setup_common_distrib_training_handlers(
        trainer, train_sampler,
        to_save={'model': model, 'optimizer': optimizer},
        save_every_iters=1000,
        output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler,
        with_gpu_stats=True,
        output_names=['supervised batch loss', ],
        with_pbars=True,
        with_pbar_on_iters=with_mlflow_logging,
        log_every_iters=1)

    if getattr(config, "benchmark_dataflow", False):
        benchmark_dataflow_num_iters = getattr(config, "benchmark_dataflow_num_iters", 1000)
        DataflowBenchmark(benchmark_dataflow_num_iters,
                          prepare_batch=prepare_batch,
                          device=device).attach(trainer, train_loader)

    # Setup evaluators
    val_metrics = {
        "Accuracy": Accuracy(device=device),
        "Top-5 Accuracy": TopKCategoricalAccuracy(k=5, device=device),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    model_output_transform = getattr(config, "model_output_transform", lambda x: x)

    evaluator_args = dict(model=model,
                          metrics=val_metrics,
                          device=device,
                          non_blocking=non_blocking,
                          prepare_batch=prepare_batch,
                          output_transform=lambda x, y, y_pred: (model_output_transform(y_pred), y, ))
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_mlflow_logging:
        ProgressBar(persist=False, desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(_):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "Accuracy"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    if dist.get_rank() == 0:
        tb_logger = common.setup_tb_logging(config.output_path.as_posix(), trainer, optimizer,
                                            evaluators={"training": train_evaluator, "validation": evaluator})
        if with_mlflow_logging:
            common.setup_mlflow_logging(trainer, optimizer,
                                        evaluators={"training": train_evaluator, "validation": evaluator})

        if with_plx_logging:
            common.setup_plx_logging(trainer, optimizer,
                                     evaluators={"training": train_evaluator, "validation": evaluator})

        common.save_best_model_by_val_score(config.output_path.as_posix(), evaluator, model,
                                            metric_name=score_metric_name, trainer=trainer)

        # Log train/val predictions:
        tb_logger.attach(evaluator,
                         log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize,
                                                                   n_images=15,
                                                                   another_engine=trainer,
                                                                   prefix_tag="validation"),
                         event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2))

        tb_logger.attach(train_evaluator,
                         log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize,
                                                                   n_images=15,
                                                                   another_engine=trainer,
                                                                   prefix_tag="training"),
                         event_name=Events.ITERATION_COMPLETED(once=len(train_eval_loader) // 2))

    trainer.run(train_loader, max_epochs=config.num_epochs)
# TODO: make sure `valid_state.metrics` is ordered so that the default metric reported to NNI is always the same
nni.report_intermediate_result({'default': list(valid_state.metrics.values())[0], **train_metrics, **valid_metrics})

if backend_conf.rank == 0:
    event = Events.ITERATION_COMPLETED(every=hp['log_progress_every_iters'] if hp['log_progress_every_iters'] else None)
    ProgressBar(persist=False, desc='Train evaluation').attach(train_evaluator, event_name=event)
    ProgressBar(persist=False, desc='Test evaluation').attach(valid_evaluator)

    log_handler = OutputHandler(tag='train', metric_names=list(metrics.keys()),
                                global_step_transform=global_step_from_engine(trainer))
    tb_logger.attach(train_evaluator, log_handler=log_handler, event_name=Events.COMPLETED)

    log_handler = OutputHandler(tag='test', metric_names=list(metrics.keys()),
                                global_step_transform=global_step_from_engine(trainer))
    tb_logger.attach(valid_evaluator, log_handler=log_handler, event_name=Events.COMPLETED)

    # Store the 3 best models by validation accuracy:
    common.save_best_model_by_val_score(str(output_path), valid_evaluator, model=model, metric_name='accuracy',
                                        n_saved=3, trainer=trainer, tag='val')

    if hp['log_grads_every_iters'] is not None and hp['log_grads_every_iters'] > 0:
        tb_logger.attach(trainer,
                         log_handler=GradsHistHandler(model, tag=model.__class__.__name__),
                         event_name=Events.ITERATION_COMPLETED(every=hp['log_grads_every_iters']))

if hp['crash_iteration'] is not None and hp['crash_iteration'] >= 0:
    @trainer.on(Events.ITERATION_STARTED(once=hp['crash_iteration']))
    def _(engine):
        raise Exception('STOP at iteration: {}'.format(engine.state.iteration))

if nni_compression_pruner is not None:
    # Notify the NNI compressor (pruning or quantization) on each epoch and, if needed by the provided
    # Pruner/Quantizer, on each step/batch iteration (see the NNI Compression documentation for details:
    # https://nni.readthedocs.io/en/latest/Compressor/QuickStart.html#apis-for-updating-fine-tuning-status)
    @trainer.on(Events.EPOCH_STARTED)
    def _nni_compression_update_epoch(engine):
        nni_compression_pruner.update_epoch(engine.state.epoch)
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-QAT-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")
        folder_name = "{}_backend-{}-{}_{}".format(config["model"], idist.backend(), idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)

        # Store the best model by validation accuracy:
        common.save_best_model_by_val_score(
            output_path=config["output_path"],
            evaluator=evaluator,
            model=model,
            metric_name="Accuracy",
            n_saved=1,
            trainer=trainer,
            tag="test",
        )

    trainer.run(train_loader, max_epochs=config["num_epochs"])

    if rank == 0:
        tb_logger.close()
def training(local_rank, config, logger=None):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    torch.backends.cudnn.benchmark = True

    set_seed(config.seed + local_rank)

    train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader

    # Setup model, optimizer, criterion
    model, optimizer, criterion = initialize(config)

    if not hasattr(config, "prepare_batch"):
        config.prepare_batch = _prepare_batch

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger)

    if getattr(config, "benchmark_dataflow", False):
        benchmark_dataflow_num_iters = getattr(config, "benchmark_dataflow_num_iters", 1000)
        DataflowBenchmark(benchmark_dataflow_num_iters, prepare_batch=config.prepare_batch).attach(trainer, train_loader)

    # Setup evaluators
    val_metrics = {
        "Accuracy": Accuracy(),
        "Top-5 Accuracy": TopKCategoricalAccuracy(k=5),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator, train_evaluator = create_evaluators(model, val_metrics, config)

    @trainer.on(Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)) | Events.COMPLETED)
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(val_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)

    score_metric_name = "Accuracy"
    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    # Store 3 best models by validation accuracy:
    common.save_best_model_by_val_score(
        config.output_path.as_posix(),
        evaluator,
        model=model,
        metric_name=score_metric_name,
        n_saved=3,
        trainer=trainer,
        tag="val",
    )

    if idist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={"training": train_evaluator, "validation": evaluator},
        )

        exp_tracking_logger = exp_tracking.setup_logging(
            trainer, optimizer, evaluators={"training": train_evaluator, "validation": evaluator}
        )

        # Log train/val predictions:
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation"
            ),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2),
        )

        tb_logger.attach(
            train_evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize, n_images=15, another_engine=trainer, prefix_tag="training"
            ),
            event_name=Events.ITERATION_COMPLETED(once=len(train_eval_loader) // 2),
        )

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if idist.get_rank() == 0:
        tb_logger.close()
        exp_tracking_logger.close()
def run(output_path, config):

    distributed = dist.is_available() and dist.is_initialized()
    rank = dist.get_rank() if distributed else 0

    manual_seed(config["seed"] + rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = utils.get_dataflow(config, distributed)
    model, optimizer = utils.get_model_optimizer(config, distributed)
    criterion = nn.CrossEntropyLoss().to(utils.device)

    le = len(train_loader)
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)

    # Setup Ignite trainer:
    # - let's define training step
    # - add other common handlers:
    #    - TerminateOnNan,
    #    - handler to setup learning rate scheduling,
    #    - ModelCheckpoint
    #    - `RunningAverage` on `train_step` output
    #    - Two progress bars on epochs and optionally on iterations

    def train_step(engine, batch):
        x = convert_tensor(batch[0], device=utils.device, non_blocking=True)
        y = convert_tensor(batch[1], device=utils.device, non_blocking=True)

        model.train()
        # Supervised part
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return {
            "batch loss": loss.item(),
        }

    if config["deterministic"] and rank == 0:
        print("Setup deterministic trainer")

    trainer = Engine(train_step) if not config["deterministic"] else DeterministicEngine(train_step)
    train_sampler = train_loader.sampler if distributed else None
    to_save = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler}
    metric_names = [
        "batch loss",
    ]
    common.setup_common_training_handlers(
        trainer,
        train_sampler=train_sampler,
        to_save=to_save,
        save_every_iters=config["checkpoint_every"],
        output_path=output_path,
        lr_scheduler=lr_scheduler,
        output_names=metric_names,
        with_pbar_on_iters=config["display_iters"],
        log_every_iters=10,
    )

    if rank == 0:
        # Setup Tensorboard logger - wrapper on SummaryWriter
        tb_logger = TensorboardLogger(log_dir=output_path)
        # Attach logger to the trainer and log trainer's metrics (stored in trainer.state.metrics) every iteration
        tb_logger.attach(
            trainer,
            log_handler=OutputHandler(tag="train", metric_names=metric_names),
            event_name=Events.ITERATION_COMPLETED,
        )
        # log optimizer's parameters: "lr" every iteration
        tb_logger.attach(
            trainer,
            log_handler=OptimizerParamsHandler(optimizer, param_name="lr"),
            event_name=Events.ITERATION_STARTED,
        )

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(device=utils.device if distributed else None),
        "loss": Loss(criterion, device=utils.device if distributed else None),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=utils.device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=utils.device, non_blocking=True)

    def run_validation(engine):
        train_evaluator.run(train_loader)
        evaluator.run(test_loader)

    trainer.add_event_handler(Events.EPOCH_STARTED(every=config["validate_every"]), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup progress bar on evaluation engines
        if config["display_iters"]:
            ProgressBar(persist=False, desc="Train evaluation").attach(train_evaluator)
            ProgressBar(persist=False, desc="Test evaluation").attach(evaluator)

        # Let's log metrics of `train_evaluator` stored in `train_evaluator.state.metrics` when validation run is done
        tb_logger.attach(
            train_evaluator,
            log_handler=OutputHandler(
                tag="train", metric_names="all", global_step_transform=global_step_from_engine(trainer)
            ),
            event_name=Events.COMPLETED,
        )

        # Let's log metrics of `evaluator` stored in `evaluator.state.metrics` when validation run is done
        tb_logger.attach(
            evaluator,
            log_handler=OutputHandler(
                tag="test", metric_names="all", global_step_transform=global_step_from_engine(trainer)
            ),
            event_name=Events.COMPLETED,
        )

        # Store 3 best models by validation accuracy:
        common.save_best_model_by_val_score(
            output_path, evaluator, model=model, metric_name="accuracy", n_saved=3, trainer=trainer, tag="test"
        )

        # Optionally log model gradients
        if config["log_model_grads_every"] is not None:
            tb_logger.attach(
                trainer,
                log_handler=GradsHistHandler(model, tag=model.__class__.__name__),
                event_name=Events.ITERATION_COMPLETED(every=config["log_model_grads_every"]),
            )

    # In order to check training resuming we can emulate a crash
    if config["crash_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["crash_iteration"]))
        def _(engine):
            raise Exception("STOP at iteration: {}".format(engine.state.iteration))

    resume_from = config["resume_from"]
    if resume_from is not None:
        checkpoint_fp = Path(resume_from)
        assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(checkpoint_fp.as_posix())
        print("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix()))
        checkpoint = torch.load(checkpoint_fp.as_posix())
        Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
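# A small, self-contained sketch (not part of the snippet above; the optimizer and the numbers are placeholders)
# illustrating the `PiecewiseLinear` schedule built in `run` above: the learning rate ramps linearly from 0 to
# `learning_rate` over the warmup epochs and then decays linearly back to 0 by the last epoch. When the scheduler
# is not passed through `setup_common_training_handlers`, it can be attached to the trainer directly:
import torch
from ignite.engine import Engine, Events
from ignite.contrib.handlers import PiecewiseLinear

optimizer = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)  # placeholder optimizer
iters_per_epoch, num_warmup_epochs, num_epochs, learning_rate = 100, 2, 10, 0.1
milestones_values = [
    (0, 0.0),
    (iters_per_epoch * num_warmup_epochs, learning_rate),
    (iters_per_epoch * num_epochs, 0.0),
]
lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)

trainer = Engine(lambda engine, batch: None)  # placeholder training step
# The schedule is stepped once per iteration:
trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)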
def training(config, local_rank, with_pbar_on_iters=True):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
        "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(model, optimizer,
                                      opt_level=getattr(config, "fp16_opt_level", "O2"),
                                      num_losses=1)
    model = DDP(model, delay_allreduce=True)
    criterion = config.criterion.to(device)

    # Setup trainer
    prepare_batch = getattr(config, "prepare_batch")
    non_blocking = getattr(config, "non_blocking", True)
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform", lambda x: x)

    def train_update_function(engine, batch):
        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y)

        if isinstance(loss, Mapping):
            assert 'supervised batch loss' in loss
            loss_dict = loss
            output = {k: v.item() for k, v in loss_dict.items()}
            loss = loss_dict['supervised batch loss'] / accumulation_steps
        else:
            output = {'supervised batch loss': loss.item()}

        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return output

    output_names = getattr(config, "output_names", ['supervised batch loss', ])

    trainer = Engine(train_update_function)

    common.setup_common_distrib_training_handlers(
        trainer, train_sampler,
        to_save={'model': model, 'optimizer': optimizer},
        save_every_iters=1000,
        output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler,
        output_names=output_names,
        with_pbars=True,
        with_pbar_on_iters=with_pbar_on_iters,
        log_every_iters=1)

    def output_transform(output):
        return output['y_pred'], output['y']

    num_classes = config.num_classes

    cm_metric = ConfusionMatrix(num_classes=num_classes, output_transform=output_transform)
    pr = cmPrecision(cm_metric, average=False)
    re = cmRecall(cm_metric, average=False)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
        "Accuracy": cmAccuracy(cm_metric),
        "Precision": pr,
        "Recall": re,
        "F1": Fbeta(beta=1.0, output_transform=output_transform)
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator_args = dict(model=model,
                          metrics=val_metrics,
                          device=device,
                          non_blocking=non_blocking,
                          prepare_batch=prepare_batch,
                          output_transform=lambda x, y, y_pred: {'y_pred': model_output_transform(y_pred), 'y': y})
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_pbar_on_iters:
        ProgressBar(persist=False, desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(engine):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    if dist.get_rank() == 0:
        tb_logger = common.setup_tb_logging(config.output_path.as_posix(), trainer, optimizer,
                                            evaluators={"training": train_evaluator, "validation": evaluator})
        common.setup_mlflow_logging(trainer, optimizer,
                                    evaluators={"training": train_evaluator, "validation": evaluator})

        common.save_best_model_by_val_score(config.output_path.as_posix(), evaluator, model,
                                            metric_name=score_metric_name, trainer=trainer)

        # Log train/val predictions:
        tb_logger.attach(evaluator,
                         log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize,
                                                                   n_images=15,
                                                                   another_engine=trainer,
                                                                   prefix_tag="validation"),
                         event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2))

        log_train_predictions = getattr(config, "log_train_predictions", False)
        if log_train_predictions:
            tb_logger.attach(train_evaluator,
                             log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize,
                                                                       n_images=15,
                                                                       another_engine=trainer,
                                                                       prefix_tag="training"),
                             event_name=Events.ITERATION_COMPLETED(once=len(train_eval_loader) // 2))

    trainer.run(train_loader, max_epochs=config.num_epochs)
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = "stop-on-{}".format(config["stop_iteration"])

        folder_name = "{}_backend-{}-{}_{}".format(config["model"], idist.backend(), idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)

        if config["with_trains"]:
            trains_logger = common.setup_trains_logging(
                trainer,
                optimizer,
                evaluators=evaluators,
                project_name="cifar10-ignite",
                task_name=Path(output_path).stem,
            )

        # Store 3 best models by validation accuracy:
        common.save_best_model_by_val_score(
            output_path, evaluator, model=model, metric_name="accuracy", n_saved=3, trainer=trainer, tag="test"
        )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info("Stop training on {} iteration".format(trainer.state.iteration))
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
        if config["with_trains"]:
            trains_logger.close()
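# A hypothetical launcher sketch (not part of the snippet above) showing how a `training(local_rank, config)`
# function like the one defined above is typically started with `ignite.distributed.Parallel`. The backend,
# the number of processes and the config values are placeholders; a real config also needs whatever keys
# `get_dataflow` and `initialize` consume (batch size, number of workers, learning rate, etc.).
import ignite.distributed as idist

if __name__ == "__main__":
    config = {
        "seed": 543,
        "output_path": "/tmp/output-cifar10",
        "model": "resnet18",
        "stop_iteration": None,
        "validate_every": 3,
        "num_epochs": 24,
        "with_trains": False,
    }
    # Spawn 2 processes with the NCCL backend and run `training(local_rank, config)` in each of them:
    with idist.Parallel(backend="nccl", nproc_per_node=2) as parallel:
        parallel.run(training, config)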
def training(config, local_rank, with_pbar_on_iters=True):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
        "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    unsup_train_loader = config.unsup_train_loader
    unsup_train_sampler = getattr(unsup_train_loader, "sampler", None)
    assert unsup_train_sampler is not None, "Train loader of type '{}' " \
        "should have attribute 'sampler'".format(type(unsup_train_loader))
    assert hasattr(unsup_train_sampler, 'set_epoch') and callable(unsup_train_sampler.set_epoch), \
        "Unsupervised train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(model, optimizer,
                                      opt_level=getattr(config, "fp16_opt_level", "O2"),
                                      num_losses=2)
    model = DDP(model, delay_allreduce=True)

    criterion = config.criterion.to(device)
    unsup_criterion = config.unsup_criterion.to(device)
    unsup_batch_num_repetitions = getattr(config, "unsup_batch_num_repetitions", 1)

    # Setup trainer
    prepare_batch = getattr(config, "prepare_batch")
    non_blocking = getattr(config, "non_blocking", True)
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform", lambda x: x)

    def cycle(seq):
        while True:
            for i in seq:
                yield i

    unsup_train_loader_iter = cycle(unsup_train_loader)

    def supervised_loss(batch):
        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y)
        return loss

    def unsupervised_loss(x):

        with torch.no_grad():
            y_pred_orig = model(x)

            # Data augmentation: geom only
            k = random.randint(1, 3)
            x_aug = torch.rot90(x, k=k, dims=(2, 3))
            y_pred_orig_aug = torch.rot90(y_pred_orig, k=k, dims=(2, 3))

            if random.random() < 0.5:
                x_aug = torch.flip(x_aug, dims=(2, ))
                y_pred_orig_aug = torch.flip(y_pred_orig_aug, dims=(2, ))

            if random.random() < 0.5:
                x_aug = torch.flip(x_aug, dims=(3, ))
                y_pred_orig_aug = torch.flip(y_pred_orig_aug, dims=(3, ))

            y_pred_orig_aug = y_pred_orig_aug.argmax(dim=1).long()

        y_pred_aug = model(x_aug.detach())

        loss = unsup_criterion(y_pred_aug, y_pred_orig_aug.detach())

        return loss

    def train_update_function(engine, batch):
        model.train()

        loss = supervised_loss(batch)
        if isinstance(loss, Mapping):
            assert 'supervised batch loss' in loss
            loss_dict = loss
            output = {k: v.item() for k, v in loss_dict.items()}
            loss = loss_dict['supervised batch loss'] / accumulation_steps
        else:
            output = {'supervised batch loss': loss.item()}

        # Difference with original UDA
        # Apply separately grads from supervised/unsupervised parts
        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        unsup_batch = next(unsup_train_loader_iter)
        unsup_x = unsup_batch['image']
        unsup_x = convert_tensor(unsup_x, device=device, non_blocking=non_blocking)

        for _ in range(unsup_batch_num_repetitions):
            unsup_loss = engine.state.unsup_lambda * unsupervised_loss(unsup_x)

            assert isinstance(unsup_loss, torch.Tensor)
            output['unsupervised batch loss'] = unsup_loss.item()

            with amp.scale_loss(unsup_loss, optimizer, loss_id=1) as scaled_loss:
                scaled_loss.backward()

            if engine.state.iteration % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

        unsup_batch = None
        unsup_x = None

        total_loss = loss + unsup_loss
        output['total batch loss'] = total_loss.item()

        return output

    output_names = getattr(config, "output_names",
                           ['supervised batch loss', 'unsupervised batch loss', 'total batch loss'])

    trainer = Engine(train_update_function)

    @trainer.on(Events.STARTED)
    def init(engine):
        if hasattr(config, "unsup_lambda_min"):
            engine.state.unsup_lambda = config.unsup_lambda_min
        else:
            engine.state.unsup_lambda = getattr(config, "unsup_lambda", 0.001)

    @trainer.on(Events.ITERATION_COMPLETED)
    def update_unsup_params(engine):
        engine.state.unsup_lambda += getattr(config, "unsup_lambda_delta", 0.00001)
        if hasattr(config, "unsup_lambda_max"):
            m = config.unsup_lambda_max
            engine.state.unsup_lambda = engine.state.unsup_lambda if engine.state.unsup_lambda < m else m

    common.setup_common_distrib_training_handlers(
        trainer, train_sampler,
        to_save={'model': model, 'optimizer': optimizer},
        save_every_iters=1000,
        output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler,
        output_names=output_names,
        with_pbars=True,
        with_pbar_on_iters=with_pbar_on_iters,
        log_every_iters=1
    )

    def output_transform(output):
        return output['y_pred'], output['y']

    num_classes = config.num_classes

    cm_metric = ConfusionMatrix(num_classes=num_classes, output_transform=output_transform)
    pr = cmPrecision(cm_metric, average=False)
    re = cmRecall(cm_metric, average=False)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
        "Accuracy": cmAccuracy(cm_metric),
        "Precision": pr,
        "Recall": re,
        "F1": Fbeta(beta=1.0, output_transform=output_transform)
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator_args = dict(
        model=model,
        metrics=val_metrics,
        device=device,
        non_blocking=non_blocking,
        prepare_batch=prepare_batch,
        output_transform=lambda x, y, y_pred: {'y_pred': model_output_transform(y_pred), 'y': y}
    )
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_pbar_on_iters:
        ProgressBar(persist=False, desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(engine):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    if dist.get_rank() == 0:
        tb_logger = common.setup_tb_logging(config.output_path.as_posix(), trainer, optimizer,
                                            evaluators={"training": train_evaluator, "validation": evaluator})
        common.setup_mlflow_logging(trainer, optimizer,
                                    evaluators={"training": train_evaluator, "validation": evaluator})

        common.save_best_model_by_val_score(config.output_path.as_posix(), evaluator, model,
                                            metric_name=score_metric_name, trainer=trainer)

        # Log unsup_lambda
        @trainer.on(Events.ITERATION_COMPLETED(every=100))
        def tblog_unsupervised_lambda(engine):
            tb_logger.writer.add_scalar("training/unsupervised lambda", engine.state.unsup_lambda,
                                        engine.state.iteration)
            mlflow.log_metric("training unsupervised lambda", engine.state.unsup_lambda,
                              step=engine.state.iteration)

        # Log train/val predictions:
        tb_logger.attach(evaluator,
                         log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize,
                                                                   n_images=15,
                                                                   another_engine=trainer,
                                                                   prefix_tag="validation"),
                         event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2))

        log_train_predictions = getattr(config, "log_train_predictions", False)
        if log_train_predictions:
            tb_logger.attach(train_evaluator,
                             log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize,
                                                                       n_images=15,
                                                                       another_engine=trainer,
                                                                       prefix_tag="training"),
                             event_name=Events.ITERATION_COMPLETED(once=len(train_eval_loader) // 2))

    trainer.run(train_loader, max_epochs=config.num_epochs)
def finetune_model(model_class, project_path, batch_size, num_workers=0, pin_memory=True, non_blocking=True,
                   device=None, base_lr=1e-4, max_lr=1e-3, lr_gamma=0.9, lr_decay_iters=None, weight_decay=0.0,
                   loss_func=None, n_epochs=1, patience=-1, data_augmentation=True,
                   combination_module=simple_concatenation, combination_size=KinshipClassifier.FACENET_OUT_SIZE * 2,
                   simple_fc_layers=None, custom_fc_layers=None, final_fc_layers=None,
                   train_ds_name=None, dev_ds_name=None, logging_rate=-1, saving_rate=-1,
                   experiment_name=None, checkpoint_name=None, hof_size=1, checkpoint_exp=None):
    if device is None:
        device = torch.device('cpu')
    if loss_func is None:
        loss_func = torch.nn.CrossEntropyLoss()
    if simple_fc_layers is None:
        simple_fc_layers = [1024]
    if custom_fc_layers is None:
        custom_fc_layers = [1024]
    if final_fc_layers is None:
        final_fc_layers = []
    if train_ds_name is None:
        train_ds_name = 'train_dataset.pkl'
    if dev_ds_name is None:
        dev_ds_name = 'dev_dataset.pkl'
    if checkpoint_exp is None:
        checkpoint_exp = experiment_name

    model = model_class(combination_module, combination_size, simple_fc_layers, custom_fc_layers, final_fc_layers)

    data_path = os.path.join(project_path, 'data')
    processed_path = os.path.join(data_path, 'processed')

    dataset_names = {'train': train_ds_name, 'dev': dev_ds_name}
    dataset_paths = {partition: os.path.join(data_path, dataset_names[partition]) for partition in dataset_names}
    raw_paths = {partition: os.path.join(processed_path, partition) for partition in dataset_paths}
    relationships_path = os.path.join(data_path, 'raw', 'train_relationships.csv')

    datasets = {
        partition: KinshipDataset.get_dataset(dataset_paths[partition], raw_paths[partition], relationships_path,
                                              data_augmentation and (partition == 'train'))
        for partition in raw_paths
    }
    dataloaders = {
        partition: DataLoader(datasets[partition], batch_size=batch_size, shuffle=(partition == 'train'),
                              num_workers=num_workers, pin_memory=pin_memory)
        for partition in datasets
    }

    params_to_train = list(filter(lambda x: x.requires_grad, model.parameters()))
    optimizer = optim.AdamW(params_to_train, lr=base_lr, weight_decay=weight_decay)
    lr_decay_iters = len(dataloaders['train']) if lr_decay_iters is None else lr_decay_iters
    lr_scheduler = optim.lr_scheduler.CyclicLR(optimizer, base_lr=base_lr, max_lr=max_lr,
                                               step_size_up=lr_decay_iters // 2, mode='exp_range',
                                               gamma=lr_gamma, cycle_momentum=False)

    train_engine = create_supervised_trainer(model, optimizer, loss_fn=loss_func, device=device,
                                             non_blocking=non_blocking)

    if checkpoint_exp is not None and checkpoint_name is not None:
        experiment_dir = os.path.join(project_path, 'experiments', checkpoint_exp)
        model, optimizer, loss_func, lr_scheduler, train_engine = load_checkpoint(model_class, experiment_dir,
                                                                                  checkpoint_name, device)

    eval_engine = create_supervised_evaluator(model,
                                              metrics=dict(accuracy=Accuracy(), cross_entropy=Loss(loss_func)),
                                              device=device, non_blocking=non_blocking)

    metrics = {}
    if logging_rate > 0:
        metrics['ce_history'] = []
        metrics['smoothed_loss_history'] = []
        beta = 0.98
        avg_loss = 0.0

        @train_engine.on(Events.ITERATION_COMPLETED(every=logging_rate))
        def log_iteration_training_metrics(engine):
            nonlocal metrics
            metrics['ce_history'].append(engine.state.output)

        @train_engine.on(Events.ITERATION_COMPLETED(every=logging_rate))
        def log_smoothed_lr(engine: Engine):
            nonlocal avg_loss
            avg_loss = (avg_loss * beta) + (engine.state.output * (1 - beta))
            metrics['smoothed_loss_history'].append(
                avg_loss / (1 - (beta ** (len(metrics['smoothed_loss_history']) + 1))))

        @train_engine.on(Events.EPOCH_COMPLETED)
        def plot_metrics(engine):
            plot_metric(metrics['smoothed_loss_history'], f"Smoothed loss epoch #{engine.state.epoch}",
                        "Cross Entropy", index_scale=logging_rate)

    if patience >= 0:
        common.add_early_stopping_by_val_score(patience, eval_engine, train_engine, 'accuracy')

    # Replaced by setup_common_training_handlers
    # nan_terminate = TerminateOnNan()
    # train_engine.add_event_handler(Events.ITERATION_COMPLETED, nan_terminate)

    @train_engine.on(Events.EPOCH_COMPLETED)
    def print_training_metrics(engine):
        print(f"Finished epoch {engine.state.epoch}")
        if train_ds_name == dev_ds_name:
            print(f"Epoch {engine.state.epoch}: CE = {engine.state.output}")
            metrics['final_dev_loss'] = engine.state.output
            return
        eval_engine.run(dataloaders['dev'])
        metrics['final_dev_loss'] = eval_engine.state.metrics['cross_entropy']
        print(f"Epoch {engine.state.epoch}: CE = {eval_engine.state.metrics['cross_entropy']}, "
              f"Acc = {eval_engine.state.metrics['accuracy']}")

    # Replaced by setup_common_training_handlers
    # @train_engine.on(Events.ITERATION_COMPLETED)
    # def change_lr(engine):
    #     lr_scheduler.step()

    to_save = None
    output_path = None
    if saving_rate > 0:
        if experiment_name is None:
            print("Warning: saving rate specified but experiment name is None")
            exit()

        experiment_path = os.path.join(project_path, 'experiments', experiment_name)
        if not os.path.isdir(experiment_path):
            os.mkdir(experiment_path)

        with open(os.path.join(experiment_path, 'model.config'), 'wb+') as config_file:
            pickle.dump(model.get_configuration(), config_file)

        to_save = {'model': model, 'optimizer': optimizer, 'loss_func': loss_func,
                   'lr_scheduler': lr_scheduler, 'train_engine': train_engine}
        output_path = experiment_path

        best_models_dir = os.path.join(output_path, 'best_models')
        if not os.path.isdir(best_models_dir):
            os.mkdir(best_models_dir)
        common.save_best_model_by_val_score(best_models_dir, eval_engine, model, 'accuracy',
                                            n_saved=hof_size, trainer=train_engine, tag='acc')

        # Replaced by setup_common_training_handlers
        # checkpointer = ModelCheckpoint(experiment_path, 'iter', n_saved=50,
        #                                global_step_transform=lambda engine, _:
        #                                f"{engine.state.epoch}-{engine.state.iteration}", require_empty=False)
        # train_engine.add_event_handler(Events.ITERATION_COMPLETED(every=saving_rate), checkpointer, to_save)

    common.setup_common_training_handlers(train_engine, to_save=to_save, save_every_iters=saving_rate,
                                          output_path=output_path, lr_scheduler=lr_scheduler, with_pbars=True,
                                          with_pbar_on_iters=True, log_every_iters=1, device=device)

    # Replaced by setup_common_training_handlers
    # train_pbar = ProgressBar()
    # train_pbar.attach(train_engine)

    eval_pbar = ProgressBar(persist=False, desc="Evaluation")
    eval_pbar.attach(eval_engine)

    print(model)
    print("Running on:", device)
    train_engine.run(dataloaders['train'], max_epochs=n_epochs)

    return model, metrics
def run(output_path, config):
    device = "cuda"
    local_rank = config["local_rank"]
    distributed = backend is not None
    if distributed:
        torch.cuda.set_device(local_rank)
        device = "cuda"
    rank = dist.get_rank() if distributed else 0

    torch.manual_seed(config["seed"] + rank)

    # Rescale batch_size and num_workers
    ngpus_per_node = torch.cuda.device_count()
    ngpus = dist.get_world_size() if distributed else 1
    batch_size = config["batch_size"] // ngpus
    num_workers = int((config["num_workers"] + ngpus_per_node - 1) / ngpus_per_node)

    train_loader, test_loader = get_train_test_loaders(
        path=config["data_path"],
        batch_size=batch_size,
        distributed=distributed,
        num_workers=num_workers,
    )

    model = get_model(config["model"])
    model = model.to(device)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank, ],
            output_device=local_rank,
        )

    optimizer = optim.SGD(
        model.parameters(),
        lr=config["learning_rate"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        nesterov=True,
    )

    criterion = nn.CrossEntropyLoss().to(device)

    le = len(train_loader)
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)

    def _prepare_batch(batch, device, non_blocking):
        x, y = batch
        return (
            convert_tensor(x, device=device, non_blocking=non_blocking),
            convert_tensor(y, device=device, non_blocking=non_blocking),
        )

    def process_function(engine, batch):
        x, y = _prepare_batch(batch, device=device, non_blocking=True)

        model.train()
        # Supervised part
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return {
            "batch loss": loss.item(),
        }

    trainer = Engine(process_function)
    train_sampler = train_loader.sampler if distributed else None
    to_save = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": lr_scheduler,
    }
    metric_names = [
        "batch loss",
    ]
    common.setup_common_training_handlers(
        trainer,
        train_sampler=train_sampler,
        to_save=to_save,
        save_every_iters=config["checkpoint_every"],
        output_path=output_path,
        lr_scheduler=lr_scheduler,
        output_names=metric_names,
        with_pbar_on_iters=config["display_iters"],
        log_every_iters=10,
    )

    if rank == 0:
        tb_logger = TensorboardLogger(log_dir=output_path)
        tb_logger.attach(
            trainer,
            log_handler=OutputHandler(tag="train", metric_names=metric_names),
            event_name=Events.ITERATION_COMPLETED,
        )
        tb_logger.attach(
            trainer,
            log_handler=OptimizerParamsHandler(optimizer, param_name="lr"),
            event_name=Events.ITERATION_STARTED,
        )

    metrics = {
        "accuracy": Accuracy(device=device if distributed else None),
        "loss": Loss(criterion, device=device if distributed else None),
    }

    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def run_validation(engine):
        torch.cuda.synchronize()
        train_evaluator.run(train_loader)
        evaluator.run(test_loader)

    trainer.add_event_handler(Events.EPOCH_STARTED(every=config["validate_every"]), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    if rank == 0:
        if config["display_iters"]:
            ProgressBar(persist=False, desc="Train evaluation").attach(train_evaluator)
            ProgressBar(persist=False, desc="Test evaluation").attach(evaluator)

        tb_logger.attach(
            train_evaluator,
            log_handler=OutputHandler(
                tag="train",
                metric_names=list(metrics.keys()),
                global_step_transform=global_step_from_engine(trainer),
            ),
            event_name=Events.COMPLETED,
        )

        tb_logger.attach(
            evaluator,
            log_handler=OutputHandler(
                tag="test",
                metric_names=list(metrics.keys()),
                global_step_transform=global_step_from_engine(trainer),
            ),
            event_name=Events.COMPLETED,
        )

        # Store the 3 best models by validation accuracy:
        common.save_best_model_by_val_score(
            output_path,
            evaluator,
            model=model,
            metric_name="accuracy",
            n_saved=3,
            trainer=trainer,
            tag="test",
        )

        if config["log_model_grads_every"] is not None:
            tb_logger.attach(
                trainer,
                log_handler=GradsHistHandler(model, tag=model.__class__.__name__),
                event_name=Events.ITERATION_COMPLETED(every=config["log_model_grads_every"]),
            )

    if config["crash_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["crash_iteration"]))
        def _(engine):
            raise Exception("STOP at iteration: {}".format(engine.state.iteration))

    resume_from = config["resume_from"]
    if resume_from is not None:
        checkpoint_fp = Path(resume_from)
        assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(checkpoint_fp.as_posix())
        print("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix()))
        checkpoint = torch.load(checkpoint_fp.as_posix())
        Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
def get_handlers(
    config: Any,
    model: Module,
    trainer: Engine,
    evaluator: Engine,
    metric_name: str,
    es_metric_name: str,
    train_sampler: Optional[DistributedSampler] = None,
    to_save: Optional[Mapping] = None,
    lr_scheduler: Optional[LRScheduler] = None,
    output_names: Optional[Iterable[str]] = None,
    **kwargs: Any,
) -> Union[Tuple[Checkpoint, EarlyStopping, Timer], Tuple[None, None, None]]:
    """Get best model, early stopping, and timer handlers.

    Parameters
    ----------
    config
        Config object for setting up handlers.
        `config` has to contain:
        - `output_dir`: output path to indicate where to_save objects are stored
        - `save_every_iters`: saving iteration interval
        - `n_saved`: number of best models to store
        - `log_every_iters`: logging interval for iteration progress bar and `GpuInfo` if true
        - `with_pbars`: show two progress bars
        - `with_pbar_on_iters`: show iteration-wise progress bar
        - `stop_on_nan`: stop the training if engine output contains NaN/inf values
        - `clear_cuda_cache`: clear cuda cache at the end of every epoch
        - `with_gpu_stats`: show GPU information: used memory percentage, gpu utilization percentage values
        - `patience`: number of events to wait if no improvement and then stop the training
        - `limit_sec`: maximum time before training terminates, in seconds
    model
        best model to save
    trainer
        the engine used for training
    evaluator
        the engine used for evaluation
    metric_name
        evaluation metric to save the best model
    es_metric_name
        evaluation metric to early stop the model
    train_sampler
        distributed training sampler to call `set_epoch`
    to_save
        objects to save during training
    lr_scheduler
        learning rate scheduler as a native torch LRScheduler or ignite's parameter scheduler
    output_names
        list of names associated with `trainer`'s process_function output dictionary
    kwargs
        keyword arguments passed to the Checkpoint handler

    Returns
    -------
    best_model_handler, es_handler, timer_handler
    """
    best_model_handler, es_handler, timer_handler = None, None, None

    # https://pytorch.org/ignite/contrib/engines.html#ignite.contrib.engines.common.setup_common_training_handlers
    # kwargs can be passed to save the model based on training stats
    # like score_name, score_function
    common.setup_common_training_handlers(
        trainer=trainer,
        train_sampler=train_sampler,
        to_save=to_save,
        lr_scheduler=lr_scheduler,
        output_names=output_names,
        output_path=config.output_dir / 'checkpoints',
        save_every_iters=config.save_every_iters,
        n_saved=config.n_saved,
        log_every_iters=config.log_every_iters,
        with_pbars=config.with_pbars,
        with_pbar_on_iters=config.with_pbar_on_iters,
        stop_on_nan=config.stop_on_nan,
        clear_cuda_cache=config.clear_cuda_cache,
        with_gpu_stats=config.with_gpu_stats,
        **kwargs,
    )
{% if save_best_model_by_val_score %}
    # https://pytorch.org/ignite/contrib/engines.html#ignite.contrib.engines.common.save_best_model_by_val_score
    best_model_handler = common.save_best_model_by_val_score(
        output_path=config.output_dir / 'checkpoints',
        evaluator=evaluator,
        model=model,
        metric_name=metric_name,
        n_saved=config.n_saved,
        trainer=trainer,
        tag='eval',
    )
{% endif %}
{% if add_early_stopping_by_val_score %}
    # https://pytorch.org/ignite/contrib/engines.html#ignite.contrib.engines.common.add_early_stopping_by_val_score
    es_handler = common.add_early_stopping_by_val_score(
        patience=config.patience,
        evaluator=evaluator,
        trainer=trainer,
        metric_name=es_metric_name,
    )
{% endif %}
{% if setup_timer %}
    # https://pytorch.org/ignite/handlers.html#ignite.handlers.Timer
    # Measure the average time to process a single batch of samples.
    # The events used for that are ITERATION_STARTED and ITERATION_COMPLETED;
    # you can replace them with the events you want to measure.
    timer_handler = Timer(average=True)
    timer_handler.attach(
        engine=trainer,
        start=Events.EPOCH_STARTED,
        resume=Events.ITERATION_STARTED,
        pause=Events.ITERATION_COMPLETED,
        step=Events.ITERATION_COMPLETED,
    )
{% endif %}
{% if setup_timelimit %}
    # training will terminate if training time exceeds `limit_sec`
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, TimeLimit(limit_sec=config.limit_sec)
    )
{% endif %}
    return best_model_handler, es_handler, timer_handler
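# A hypothetical usage sketch for `get_handlers` above (not part of the template, and assuming the template has
# been rendered so the {% if %} markers are resolved): `config` is a plain namespace carrying the attributes the
# docstring lists; the model, engines, metric names and values below are placeholders.
import torch.nn as nn
from argparse import Namespace
from pathlib import Path
from ignite.engine import Engine

model = nn.Linear(10, 2)             # placeholder model
trainer = Engine(lambda e, b: None)   # placeholder training engine
evaluator = Engine(lambda e, b: None)  # placeholder evaluation engine

config = Namespace(
    output_dir=Path("/tmp/output"),
    save_every_iters=1000,
    n_saved=2,
    log_every_iters=100,
    with_pbars=True,
    with_pbar_on_iters=True,
    stop_on_nan=True,
    clear_cuda_cache=False,
    with_gpu_stats=False,
    patience=5,
    limit_sec=3600,
)

best_model_handler, es_handler, timer_handler = get_handlers(
    config=config,
    model=model,
    trainer=trainer,
    evaluator=evaluator,
    metric_name="eval_accuracy",     # assumed key in evaluator.state.metrics
    es_metric_name="eval_accuracy",
    train_sampler=None,
    to_save={"model": model, "trainer": trainer},
    lr_scheduler=None,
    output_names=None,
)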