def create_segmentation_evaluator(model, device, num_classes=19, loss_fn=None, non_blocking=True):
    """Create a supervised evaluator reporting confusion-matrix based metrics.

    Args:
        model: model forwarded to ``create_supervised_evaluator``.
        device: device forwarded to ``create_supervised_evaluator``.
        num_classes: number of classes of the confusion matrix.
        loss_fn: optional loss function; when given, a ``'loss'`` metric
            wrapping it is added.
        non_blocking: forwarded to ``create_supervised_evaluator``.

    Returns:
        The engine returned by ``create_supervised_evaluator`` with the
        metrics ``'iou'``, ``'miou'``, ``'accuracy'``, ``'dice'`` (and
        optionally ``'loss'``).
    """
    # A single confusion matrix feeds every derived metric, so it is
    # accumulated once per batch instead of once per metric.
    cm = ConfusionMatrix(num_classes)
    metrics = {
        'iou': IoU(cm),
        'miou': mIoU(cm),
        'accuracy': cmAccuracy(cm),
        'dice': DiceCoefficient(cm),
    }
    if loss_fn is not None:
        metrics['loss'] = Loss(loss_fn)

    evaluator = create_supervised_evaluator(
        model,
        metrics,
        device,
        non_blocking=non_blocking,
    )
    return evaluator
def test_cm_accuracy():
    """Compare ``cmAccuracy`` on one confusion-matrix update with ``accuracy_score``."""
    labels, predictions = get_y_true_y_pred()
    th_labels, th_logits = compute_th_y_true_y_logits(labels, predictions)

    confusion = ConfusionMatrix(num_classes=3)
    accuracy = cmAccuracy(confusion)

    # Feed the confusion matrix directly; the accuracy metric is derived from it.
    confusion.update((th_logits, th_labels))

    expected = accuracy_score(labels.reshape(-1), predictions.reshape(-1))
    computed = accuracy.compute().numpy()
    assert pytest.approx(computed) == expected
def inference(config, local_rank, with_pbar_on_iters=True):
    """Run distributed inference over ``config.data_loader``.

    The model is wrapped in ``DistributedDataParallel``, weights are loaded
    from the artifact located by ``get_artifact_path``, and an ignite
    ``Engine`` is run over the data loader. On rank 0, metrics are logged
    (when ``config.has_targets`` is true) and predictions are saved.

    Args:
        config: configuration object. Required attributes read here:
            ``seed``, ``run_uuid``, ``weights_filename``, ``model``,
            ``prepare_batch``, ``data_loader``; ``output_path`` and
            ``img_denormalize`` are read on rank 0 when predictions are
            saved, ``num_classes`` when ``has_targets`` is true. Optional:
            ``custom_weights_loading``, ``non_blocking``,
            ``model_output_transform``, ``tta_transforms``, ``has_targets``,
            ``metrics``, ``do_save_raw_predictions``,
            ``do_save_overlayed_predictions``.
        local_rank: CUDA device index of this process; also added to the seed.
        with_pbar_on_iters: attach an iteration progress bar on rank 0.

    Raises:
        AssertionError: if the weights file does not exist, or (rank 0) if
            there are no targets and both prediction-saving options are off.

    NOTE(review): block nesting was reconstructed from a source whose
    indentation was lost; confirm against the original file.
    """
    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    # Load model and weights
    model_weights_filepath = Path(
        get_artifact_path(config.run_uuid, config.weights_filename))
    assert model_weights_filepath.exists(), \
        "Model weights file '{}' is not found".format(model_weights_filepath.as_posix())

    model = config.model.to(device)
    model = torch.nn.parallel.DistributedDataParallel(model,
                                                      device_ids=[local_rank],
                                                      output_device=local_rank)

    if hasattr(config, "custom_weights_loading"):
        config.custom_weights_loading(model, model_weights_filepath)
    else:
        # Deserialize on CPU: without map_location every rank restores the
        # tensors onto the device they were saved from. load_state_dict then
        # copies the values into the parameters already placed on `device`.
        state_dict = torch.load(model_weights_filepath, map_location="cpu")
        # The model is DDP-wrapped, so its keys carry the "module." prefix.
        if not all(k.startswith("module.") for k in state_dict):
            state_dict = {f"module.{k}": v for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

    model.eval()

    prepare_batch = config.prepare_batch
    non_blocking = getattr(config, "non_blocking", True)
    model_output_transform = getattr(config, "model_output_transform", lambda x: x)

    tta_transforms = getattr(config, "tta_transforms", None)

    def eval_update_function(engine, batch):
        """Predict one batch, averaging over TTA transforms when configured."""
        with torch.no_grad():
            x, y, meta = prepare_batch(batch, device=device, non_blocking=non_blocking)

            if tta_transforms is not None:
                y_preds = []
                for t in tta_transforms:
                    t_x = t.augment_image(x)
                    t_y_pred = model(t_x)
                    t_y_pred = model_output_transform(t_y_pred)
                    # Map the prediction back to the original orientation
                    y_pred = t.deaugment_mask(t_y_pred)
                    y_preds.append(y_pred)

                y_preds = torch.stack(y_preds, dim=0)
                y_pred = torch.mean(y_preds, dim=0)
            else:
                y_pred = model(x)
                y_pred = model_output_transform(y_pred)
            return {"y_pred": y_pred, "y": y, "meta": meta}

    evaluator = Engine(eval_update_function)

    has_targets = getattr(config, "has_targets", False)

    if has_targets:
        def output_transform(output):
            return output['y_pred'], output['y']

        num_classes = config.num_classes
        cm_metric = ConfusionMatrix(num_classes=num_classes, output_transform=output_transform)
        pr = cmPrecision(cm_metric, average=False)
        re = cmRecall(cm_metric, average=False)

        val_metrics = {
            "IoU": IoU(cm_metric),
            "mIoU_bg": mIoU(cm_metric),
            "Accuracy": cmAccuracy(cm_metric),
            "Precision": pr,
            "Recall": re,
            "F1": Fbeta(beta=1.0, output_transform=output_transform)
        }

        if hasattr(config, "metrics") and isinstance(config.metrics, dict):
            val_metrics.update(config.metrics)

        for name, metric in val_metrics.items():
            metric.attach(evaluator, name)

        if dist.get_rank() == 0:
            # Log val metrics:
            mlflow_logger = MLflowLogger()
            mlflow_logger.attach(evaluator,
                                 log_handler=OutputHandler(
                                     tag="validation",
                                     metric_names=list(val_metrics.keys())),
                                 event_name=Events.EPOCH_COMPLETED)

    if dist.get_rank() == 0 and with_pbar_on_iters:
        ProgressBar(persist=True, desc="Inference").attach(evaluator)

    if dist.get_rank() == 0:
        do_save_raw_predictions = getattr(config, "do_save_raw_predictions", True)
        do_save_overlayed_predictions = getattr(
            config, "do_save_overlayed_predictions", True)

        if not has_targets:
            assert do_save_raw_predictions or do_save_overlayed_predictions, \
                "If no targets, either do_save_overlayed_predictions or do_save_raw_predictions should be " \
                "defined in the config and has value equal True"

        # Save predictions
        if do_save_raw_predictions:
            raw_preds_path = config.output_path / "raw"
            # mkdir without exist_ok: fails if the folder already exists
            raw_preds_path.mkdir(parents=True)

            evaluator.add_event_handler(Events.ITERATION_COMPLETED,
                                        save_raw_predictions_with_geoinfo,
                                        raw_preds_path)

        if do_save_overlayed_predictions:
            overlayed_preds_path = config.output_path / "overlay"
            overlayed_preds_path.mkdir(parents=True)

            evaluator.add_event_handler(
                Events.ITERATION_COMPLETED,
                save_overlayed_predictions,
                overlayed_preds_path,
                img_denormalize_fn=config.img_denormalize,
                palette=default_palette)

    evaluator.add_event_handler(Events.EXCEPTION_RAISED, report_exception)

    # Run evaluation
    evaluator.run(config.data_loader)
def run(train_config, logger, **kwargs):
    """Train a model with supervised and consistency losses, with periodic evaluation.

    Builds an ignite trainer around ``train_update_function``, two evaluators
    around ``inference_update_function`` (train and test), and wires
    checkpointing, Tensorboard, optional Polyaxon and mlflow logging.

    Args:
        train_config: configuration module/object. Required attributes read
            here: ``num_epochs``, ``num_classes``, ``train1_sup_loader``,
            ``train1_unsup_loader``, ``train2_unsup_loader``, ``test_loader``,
            ``model``, ``optimizer``, ``criterion``, ``consistency_criterion``,
            ``learning_rate``, ``consistency_lambda``, ``TSA_proba_min``,
            ``TSA_proba_max`` and ``__file__``. Many optional attributes are
            read with ``getattr`` defaults (see the body).
        logger: ignored; it is immediately replaced by the ``'UDA'`` logger.
        **kwargs: unused.

    Raises:
        ImportError: if ``use_fp_16`` is set and ``apex`` cannot be imported.
        AssertionError: if a non-CPU device is configured without CUDA, or
            the fp16 prerequisites are not met.

    NOTE(review): block nesting was reconstructed from a source whose
    indentation was lost; confirm against the original file.
    """
    logger = logging.getLogger('UDA')
    if getattr(train_config, 'debug', False):
        setup_logger(logger, logging.DEBUG)

    # Set Polyaxon environment if needed
    plx_logger = None
    save_dir = None
    output_experiment_path = None
    try:
        plx_logger = PolyaxonLogger()
        experiment = plx_logger.experiment
        save_dir = get_outputs_path()
        output_experiment_path = get_outputs_refs_paths()
        output_experiment_path = output_experiment_path['experiments'][
            0] if output_experiment_path else None
        logger.debug("Experiment info: {}".format(
            experiment.get_experiment_info()))
    except PolyaxonClientException as e:
        logger.warning('Logger Polyaxon : ' + str(e))

    # Path configuration
    saves_dict = getattr(train_config, 'saves', {})

    save_dir = saves_dict.get('save_dir', '') if save_dir is None else save_dir
    log_dir = os.path.join(save_dir, saves_dict.get('log_dir', ''))
    save_model_dir = os.path.join(save_dir, saves_dict.get('model_dir', ''))
    save_prediction_dir = os.path.join(save_dir,
                                       saves_dict.get('prediction_dir', ''))
    save_config_dir = os.path.join(save_dir, saves_dict.get('config_dir', ''))
    load_model_file = saves_dict.get('load_model_file', '')
    load_optimizer_file = saves_dict.get('load_optimizer_file', '')

    # Create folders
    create_save_folders(save_dir, saves_dict)

    if output_experiment_path is not None:
        # Checkpoints to resume from live under the referenced experiment
        model_dir = saves_dict.get('model_dir', '')
        load_model_file = os.path.join(
            output_experiment_path, model_dir,
            load_model_file) if load_model_file else None
        load_optimizer_file = os.path.join(
            output_experiment_path, model_dir,
            load_optimizer_file) if load_optimizer_file else None

    num_epochs = getattr(train_config, 'num_epochs')
    num_classes = getattr(train_config, 'num_classes')
    device = getattr(train_config, 'device', 'cpu')

    # Set magical acceleration
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
    else:
        assert device == 'cpu', 'CUDA device selected but none is available'

    # Set half precision if required
    use_fp_16 = getattr(train_config, 'use_fp_16', False)

    train1_sup_loader = getattr(train_config, 'train1_sup_loader')
    train1_unsup_loader = getattr(train_config, 'train1_unsup_loader')
    train2_unsup_loader = getattr(train_config, 'train2_unsup_loader')
    test_loader = getattr(train_config, 'test_loader')

    save_interval = saves_dict.get('save_interval', 0)
    n_saved = saves_dict.get('n_saved', 0)

    val_interval = getattr(train_config, 'val_interval', 1)
    pred_interval = getattr(train_config, 'pred_interval', 0)

    model = getattr(train_config, 'model').to(device)

    optimizer = getattr(train_config, 'optimizer')

    criterion = getattr(train_config, 'criterion').to(device)
    consistency_criterion = getattr(train_config,
                                    'consistency_criterion').to(device)

    cm_metric = getattr(
        train_config, 'cm_metric',
        ConfusionMatrix(num_classes=num_classes,
                        output_transform=lambda x: (x['y_pred'], x['y'])))

    # AMP initialization for half precision
    if use_fp_16:
        assert 'cuda' in device
        assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
        try:
            from apex import amp
        except ImportError as e:
            # Only a failed import means "apex is missing"; other errors
            # raised while importing apex propagate unchanged.
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to run this example."
            ) from e
        # Initialize amp
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

    # Load checkpoint
    load_params(model,
                optimizer=optimizer,
                model_file=load_model_file,
                optimizer_file=load_optimizer_file,
                device_name=device)

    # Add batch norm
    is_bn = getattr(train_config, 'is_bn', False)
    if is_bn:
        batch_norm = nn.BatchNorm2d(3).to(device)
        if use_fp_16:
            batch_norm = amp.initialize(batch_norm)
        batch_norm.reset_parameters()
        model = nn.Sequential(batch_norm, model)

    # Copy the config file
    shutil.copy2(os.path.abspath(train_config.__file__),
                 os.path.join(save_config_dir, 'checkpoint_module.py'))

    le = len(train1_sup_loader)
    num_train_steps = le * num_epochs
    mlflow.log_param("num train steps", num_train_steps)

    lr = getattr(train_config, 'learning_rate')
    num_warmup_steps = getattr(train_config, 'num_warmup_steps', 0)

    lr_scheduler = getattr(train_config, 'lr_scheduler', None)
    if lr_scheduler is not None:
        # The config provides a factory taking the optimizer
        lr_scheduler = lr_scheduler(optimizer)

        if num_warmup_steps > 0:
            lr_scheduler = create_lr_scheduler_with_warmup(
                lr_scheduler,
                warmup_start_value=0.0,
                warmup_end_value=lr * (1.0 + 1.0 / num_warmup_steps),
                warmup_duration=num_warmup_steps)

    train1_sup_loader_iter = cycle(train1_sup_loader)
    train1_unsup_loader_iter = cycle(train1_unsup_loader)
    train2_unsup_loader_iter = cycle(train2_unsup_loader)

    # Reduce on plateau
    reduce_on_plateau = getattr(train_config, 'reduce_on_plateau', None)

    # Output transform model
    output_transform_model = getattr(train_config, 'output_transform_model',
                                     lambda x: x)

    inference_fn = getattr(train_config, 'inference_fn', inference_standard)

    lam = getattr(train_config, 'consistency_lambda')
    beta = getattr(train_config, 'consistency_beta', lam)

    tsa = TrainingSignalAnnealing(
        num_steps=num_train_steps,
        min_threshold=getattr(train_config, 'TSA_proba_min'),
        max_threshold=getattr(train_config, 'TSA_proba_max'))

    with_tsa = getattr(train_config, 'with_TSA', False)

    cfg = {
        'tsa': tsa,
        'lambda': lam,
        'beta': beta,
        'with_tsa': with_tsa,
        'device': device,
        'consistency_criterion': consistency_criterion,
        'criterion': criterion
    }

    trainer = Engine(
        partial(train_update_function,
                model=model,
                optimizer=optimizer,
                cfg=cfg,
                train1_sup_loader_iter=train1_sup_loader_iter,
                train1_unsup_loader_iter=train1_unsup_loader_iter,
                train2_unsup_loader_iter=train2_unsup_loader_iter,
                output_transform_model=output_transform_model,
                use_fp_16=use_fp_16))

    # Register events
    for e in CustomEvents:
        State.event_to_attr[e] = 'iteration'

    trainer.register_events(*CustomEvents)

    if with_tsa:
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_tsa, tsa)

    if lr_scheduler is not None:
        if not hasattr(lr_scheduler, "step"):
            # No `step` method: the scheduler itself is used as the handler
            trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)
        else:
            trainer.add_event_handler(Events.ITERATION_STARTED,
                                      lambda engine: lr_scheduler.step())

    trainer.add_event_handler(Events.ITERATION_COMPLETED, log_learning_rate,
                              optimizer)

    metric_names = [
        'supervised batch loss', 'consistency batch loss', 'final batch loss'
    ]

    def output_transform(x, name):
        return x[name]

    for n in metric_names:
        RunningAverage(
            output_transform=partial(output_transform, name=n)).attach(
                trainer, n)

    ProgressBar(persist=True,
                bar_format="").attach(trainer,
                                      event_name=Events.EPOCH_STARTED,
                                      closing_event_name=Events.COMPLETED)

    # Handlers for Tensorboard logging
    tb_logger = TensorboardLogger(log_dir=log_dir)
    tb_logger.attach(trainer,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names=metric_names),
                     event_name=CustomEvents.ITERATION_K_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=tbOptimizerParamsHandler(optimizer,
                                                          param_name="lr"),
                     event_name=CustomEvents.ITERATION_K_STARTED)

    # Handlers for Polyaxon logging
    if plx_logger is not None:
        plx_logger.attach(trainer,
                          log_handler=plxOutputHandler(
                              tag="train", metric_names=metric_names),
                          event_name=CustomEvents.ITERATION_K_COMPLETED)

    metrics = {
        'loss': Loss(criterion,
                     output_transform=lambda x: (x['y_pred'], x['y'])),
        'mAcc': cmAccuracy(cm_metric).mean(),
        'mPr': cmPrecision(cm_metric).mean(),
        'mRe': cmRecall(cm_metric).mean(),
        'mIoU': mIoU(cm_metric),
        'mF1': cmFbeta(cm_metric, 1).mean()
    }

    # One metric per class, indexed out of the IoU metric
    iou = IoU(cm_metric)
    for i in range(num_classes):
        key_name = 'IoU_{}'.format(str(i))
        metrics[key_name] = iou[i]

    inference_update_fn = partial(
        inference_update_function,
        model=model,
        cfg=cfg,
        output_transform_model=output_transform_model,
        inference_fn=inference_fn)

    evaluator = Engine(inference_update_fn)
    train_evaluator = Engine(inference_update_fn)

    for name, metric in metrics.items():
        metric.attach(train_evaluator, name)
        metric.attach(evaluator, name)

    # Add checkpoint
    if save_model_dir:
        checkpoint = ModelCheckpoint(dirname=save_model_dir,
                                     filename_prefix='checkpoint',
                                     save_interval=save_interval,
                                     n_saved=n_saved,
                                     create_dir=True)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint, {
            'mymodel': model,
            'optimizer': optimizer
        })

    def trigger_k_iteration_started(engine, k):
        if engine.state.iteration % k == 0:
            engine.fire_event(CustomEvents.ITERATION_K_STARTED)

    def trigger_k_iteration_completed(engine, k):
        if engine.state.iteration % k == 0:
            engine.fire_event(CustomEvents.ITERATION_K_COMPLETED)

    def run_validation(engine, validation_interval):
        if (trainer.state.epoch - 1) % validation_interval == 0:
            train_evaluator.run(train1_sup_loader)
            evaluator.run(test_loader)

            if save_prediction_dir:
                train_output = train_evaluator.state.output
                test_output = evaluator.state.output

                iteration = str(trainer.state.iteration)
                epoch = str(trainer.state.epoch)

                save_prediction('train_{}_{}'.format(iteration, epoch),
                                save_prediction_dir,
                                train_output['x'],
                                torch.argmax(
                                    train_output['y_pred'][0, :, :, :], dim=0),
                                y=train_output['y'][0, :, :])

                save_prediction('test_{}_{}'.format(iteration, epoch),
                                save_prediction_dir,
                                test_output['x'],
                                torch.argmax(test_output['y_pred'][0, :, :, :],
                                             dim=0),
                                y=test_output['y'][0, :, :])

            # Drop references to the last batch outputs
            train_evaluator.state.output = None
            evaluator.state.output = None

            if reduce_on_plateau is not None:
                reduce_on_plateau.step(evaluator.state.metrics['mIoU'])

    trainer.add_event_handler(Events.ITERATION_STARTED,
                              trigger_k_iteration_started,
                              k=10)
    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              trigger_k_iteration_completed,
                              k=10)

    trainer.add_event_handler(Events.EPOCH_STARTED,
                              run_validation,
                              validation_interval=val_interval)
    trainer.add_event_handler(Events.COMPLETED,
                              run_validation,
                              validation_interval=1)

    def trainer_prediction_save(engine, prediction_interval):
        # `pred_interval` defaults to 0; treat non-positive values as
        # "disabled" instead of dividing by zero in the modulo below.
        if prediction_interval > 0 and \
                (engine.state.iteration - 1) % prediction_interval == 0:

            if save_prediction_dir:
                trainer_output = trainer.state.output['unsup pred']

                iteration = str(trainer.state.iteration)
                epoch = str(trainer.state.epoch)

                save_prediction('trainer_{}_{}'.format(iteration, epoch),
                                save_prediction_dir, trainer_output['x'],
                                trainer_output['y_pred'])

                logger.debug(
                    'Saved trainer prediction for iteration {}'.format(
                        str(engine.state.iteration)))

            trainer.state.output = None

    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              trainer_prediction_save,
                              prediction_interval=pred_interval)

    tb_logger.attach(train_evaluator,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names=list(
                                                     metrics.keys())),
                     event_name=Events.EPOCH_COMPLETED)

    tb_logger.attach(evaluator,
                     log_handler=tbOutputHandler(tag="test",
                                                 metric_names=list(
                                                     metrics.keys())),
                     event_name=Events.EPOCH_COMPLETED)

    # Handlers for Polyaxon logging
    if plx_logger is not None:
        plx_logger.attach(train_evaluator,
                          log_handler=plxOutputHandler(tag="train",
                                                       metric_names=list(
                                                           metrics.keys())),
                          event_name=Events.EPOCH_COMPLETED)

        plx_logger.attach(evaluator,
                          log_handler=plxOutputHandler(tag="test",
                                                       metric_names=list(
                                                           metrics.keys())),
                          event_name=Events.EPOCH_COMPLETED)

    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              mlflow_batch_metrics_logging, "train", trainer)
    train_evaluator.add_event_handler(Events.COMPLETED,
                                      mlflow_val_metrics_logging, "train",
                                      trainer)
    evaluator.add_event_handler(Events.COMPLETED, mlflow_val_metrics_logging,
                                "test", trainer)

    # The trainer iterates over step indices; actual batches are drawn from
    # the cycled loader iterators passed to the update function.
    data_steps = list(range(len(train1_sup_loader)))

    logger.debug('Start training')
    trainer.run(data_steps, max_epochs=num_epochs)
    logger.debug('Finished training')
def training(config, local_rank, with_pbar_on_iters=True):
    """Run distributed supervised training with apex AMP and periodic validation.

    Args:
        config: configuration object. Required attributes read here:
            ``seed``, ``train_loader``, ``train_eval_loader``, ``val_loader``,
            ``model``, ``optimizer``, ``criterion``, ``prepare_batch``,
            ``output_path``, ``lr_scheduler``, ``num_classes``,
            ``num_epochs``; ``img_denormalize`` is read on rank 0. Optional:
            ``use_fp16``, ``fp16_opt_level``, ``non_blocking``,
            ``accumulation_steps``, ``model_output_transform``,
            ``output_names``, ``val_metrics``, ``start_by_validation``,
            ``val_interval``, ``es_patience``, ``log_train_predictions``.
        local_rank: CUDA device index of this process; also added to the seed.
        with_pbar_on_iters: show per-iteration progress bars.

    Raises:
        RuntimeError: if ``config.use_fp16`` is set to a false value.
        AssertionError: if the train loader has no sampler or the sampler
            has no callable ``set_epoch``.
    """
    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
                                      "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=getattr(
                                          config, "fp16_opt_level", "O2"),
                                      num_losses=1)
    model = DDP(model, delay_allreduce=True)
    criterion = config.criterion.to(device)

    # Setup trainer
    prepare_batch = getattr(config, "prepare_batch")
    non_blocking = getattr(config, "non_blocking", True)
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    def train_update_function(engine, batch):
        """Run one training step and return the batch loss value(s) as floats."""
        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y)

        if isinstance(loss, Mapping):
            assert 'supervised batch loss' in loss
            loss_dict = loss
            output = {k: v.item() for k, v in loss_dict.items()}
            loss = loss_dict['supervised batch loss'] / accumulation_steps
        else:
            # NOTE(review): unlike the Mapping branch, the loss is not
            # divided by accumulation_steps here - confirm this is intended.
            output = {'supervised batch loss': loss.item()}

        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        # Step only every `accumulation_steps` iterations
        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return output

    output_names = getattr(config, "output_names", [
        'supervised batch loss',
    ])

    trainer = Engine(train_update_function)
    common.setup_common_distrib_training_handlers(
        trainer,
        train_sampler,
        to_save={
            'model': model,
            'optimizer': optimizer
        },
        save_every_iters=1000,
        output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler,
        output_names=output_names,
        with_pbars=True,
        with_pbar_on_iters=with_pbar_on_iters,
        log_every_iters=1)

    def output_transform(output):
        return output['y_pred'], output['y']

    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes,
                                output_transform=output_transform)
    pr = cmPrecision(cm_metric, average=False)
    re = cmRecall(cm_metric, average=False)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
        "Accuracy": cmAccuracy(cm_metric),
        "Precision": pr,
        "Recall": re,
        "F1": Fbeta(beta=1.0, output_transform=output_transform)
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator_args = dict(model=model,
                          metrics=val_metrics,
                          device=device,
                          non_blocking=non_blocking,
                          prepare_batch=prepare_batch,
                          output_transform=lambda x, y, y_pred: {
                              'y_pred': model_output_transform(y_pred),
                              'y': y
                          })
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_pbar_on_iters:
        ProgressBar(persist=False,
                    desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(engine):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)),
        run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    if dist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(config.output_path.as_posix(),
                                            trainer,
                                            optimizer,
                                            evaluators={
                                                "training": train_evaluator,
                                                "validation": evaluator
                                            })
        common.setup_mlflow_logging(trainer,
                                    optimizer,
                                    evaluators={
                                        "training": train_evaluator,
                                        "validation": evaluator
                                    })

        common.save_best_model_by_val_score(config.output_path.as_posix(),
                                            evaluator,
                                            model,
                                            metric_name=score_metric_name,
                                            trainer=trainer)

        # Log train/val predictions:
        # `once` must be a positive integer, so clamp it for loaders with a
        # single batch (len // 2 == 0).
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(
                once=max(1, len(val_loader) // 2)))

        log_train_predictions = getattr(config, "log_train_predictions", False)
        if log_train_predictions:
            # Tag train-set images as "training" so they are not logged
            # under the validation tag.
            tb_logger.attach(train_evaluator,
                             log_handler=predictions_gt_images_handler(
                                 img_denormalize_fn=config.img_denormalize,
                                 n_images=15,
                                 another_engine=trainer,
                                 prefix_tag="training"),
                             event_name=Events.ITERATION_COMPLETED(
                                 once=max(1, len(train_eval_loader) // 2)))

    trainer.run(train_loader, max_epochs=config.num_epochs)
def training(config, local_rank, with_pbar_on_iters=True):
    """Run distributed training with a supervised and an unsupervised consistency loss.

    Each iteration back-propagates the supervised loss on the trainer batch,
    then an unsupervised loss on a batch drawn from
    ``config.unsup_train_loader``: the model output on a rotated/flipped
    input is compared with the identically transformed ``argmax`` of the
    output on the original input. The unsupervised loss is weighted by
    ``engine.state.unsup_lambda``, which is increased after every iteration.

    Args:
        config: configuration object. Required attributes read here:
            ``seed``, ``train_loader``, ``unsup_train_loader``,
            ``train_eval_loader``, ``val_loader``, ``model``, ``optimizer``,
            ``criterion``, ``unsup_criterion``, ``prepare_batch``,
            ``output_path``, ``lr_scheduler``, ``num_classes``,
            ``num_epochs``; ``img_denormalize`` is read on rank 0. Optional:
            ``use_fp16``, ``fp16_opt_level``, ``unsup_batch_num_repetitions``,
            ``non_blocking``, ``accumulation_steps``,
            ``model_output_transform``, ``output_names``,
            ``unsup_lambda_min``, ``unsup_lambda``, ``unsup_lambda_delta``,
            ``unsup_lambda_max``, ``val_metrics``, ``start_by_validation``,
            ``val_interval``, ``es_patience``, ``log_train_predictions``.
        local_rank: CUDA device index of this process; also added to the seed.
        with_pbar_on_iters: show per-iteration progress bars.

    Raises:
        RuntimeError: if ``config.use_fp16`` is set to a false value.
        AssertionError: if a loader has no sampler or a sampler has no
            callable ``set_epoch``.

    NOTE(review): block nesting was reconstructed from a source whose
    indentation was lost; confirm against the original file.
    """
    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
                                      "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    unsup_train_loader = config.unsup_train_loader
    unsup_train_sampler = getattr(unsup_train_loader, "sampler", None)
    # NOTE(review): `set_epoch` of this sampler is required below but never
    # called in this function - confirm whether the cycling iterator should
    # call it on each pass.
    assert unsup_train_sampler is not None, "Unsupervised train loader of type '{}' " \
                                            "should have attribute 'sampler'".format(type(unsup_train_loader))
    assert hasattr(unsup_train_sampler, 'set_epoch') and callable(unsup_train_sampler.set_epoch), \
        "Unsupervised train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    # Two losses (supervised, unsupervised) are scaled independently by amp
    model, optimizer = amp.initialize(model, optimizer, opt_level=getattr(config, "fp16_opt_level", "O2"),
                                      num_losses=2)
    model = DDP(model, delay_allreduce=True)

    criterion = config.criterion.to(device)
    unsup_criterion = config.unsup_criterion.to(device)
    unsup_batch_num_repetitions = getattr(config, "unsup_batch_num_repetitions", 1)

    # Setup trainer
    prepare_batch = getattr(config, "prepare_batch")
    non_blocking = getattr(config, "non_blocking", True)
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform", lambda x: x)

    def cycle(seq):
        """Yield items of ``seq`` forever, re-iterating it on each pass."""
        while True:
            for i in seq:
                yield i

    unsup_train_loader_iter = cycle(unsup_train_loader)

    def supervised_loss(batch):
        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y)
        return loss

    def unsupervised_loss(x):
        # NOTE(review): `model_output_transform` is not applied to the model
        # outputs here, unlike in `supervised_loss` - confirm this is intended.
        with torch.no_grad():
            y_pred_orig = model(x)

            # Data augmentation: geom only
            # The same rotation/flips are applied to the input and to the
            # prediction on the original input.
            k = random.randint(1, 3)
            x_aug = torch.rot90(x, k=k, dims=(2, 3))
            y_pred_orig_aug = torch.rot90(y_pred_orig, k=k, dims=(2, 3))
            if random.random() < 0.5:
                x_aug = torch.flip(x_aug, dims=(2, ))
                y_pred_orig_aug = torch.flip(y_pred_orig_aug, dims=(2, ))
            if random.random() < 0.5:
                x_aug = torch.flip(x_aug, dims=(3, ))
                y_pred_orig_aug = torch.flip(y_pred_orig_aug, dims=(3, ))

            # Turn the transformed prediction into hard labels
            y_pred_orig_aug = y_pred_orig_aug.argmax(dim=1).long()

        y_pred_aug = model(x_aug.detach())

        loss = unsup_criterion(y_pred_aug, y_pred_orig_aug.detach())

        return loss

    def train_update_function(engine, batch):
        """Run one supervised step followed by the unsupervised step(s)."""
        model.train()

        loss = supervised_loss(batch)
        if isinstance(loss, Mapping):
            assert 'supervised batch loss' in loss
            loss_dict = loss
            output = {k: v.item() for k, v in loss_dict.items()}
            loss = loss_dict['supervised batch loss'] / accumulation_steps
        else:
            # NOTE(review): unlike the Mapping branch, the loss is not
            # divided by accumulation_steps here - confirm this is intended.
            output = {'supervised batch loss': loss.item()}

        # Difference with original UDA
        # Apply separately grads from supervised/unsupervised parts
        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        unsup_batch = next(unsup_train_loader_iter)
        unsup_x = unsup_batch['image']
        unsup_x = convert_tensor(unsup_x, device=device, non_blocking=non_blocking)

        for _ in range(unsup_batch_num_repetitions):
            unsup_loss = engine.state.unsup_lambda * unsupervised_loss(unsup_x)

            assert isinstance(unsup_loss, torch.Tensor)
            output['unsupervised batch loss'] = unsup_loss.item()

            with amp.scale_loss(unsup_loss, optimizer, loss_id=1) as scaled_loss:
                scaled_loss.backward()

            if engine.state.iteration % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

        # Drop references to the unsupervised batch
        unsup_batch = None
        unsup_x = None

        # Uses the loss of the last repetition only
        total_loss = loss + unsup_loss
        output['total batch loss'] = total_loss.item()

        return output

    output_names = getattr(config, "output_names",
                           ['supervised batch loss', 'unsupervised batch loss', 'total batch loss'])

    trainer = Engine(train_update_function)

    @trainer.on(Events.STARTED)
    def init(engine):
        if hasattr(config, "unsup_lambda_min"):
            engine.state.unsup_lambda = config.unsup_lambda_min
        else:
            engine.state.unsup_lambda = getattr(config, "unsup_lambda", 0.001)

    @trainer.on(Events.ITERATION_COMPLETED)
    def update_unsup_params(engine):
        # Increase the weight by a fixed step, capped at unsup_lambda_max
        engine.state.unsup_lambda += getattr(config, "unsup_lambda_delta", 0.00001)
        if hasattr(config, "unsup_lambda_max"):
            m = config.unsup_lambda_max
            engine.state.unsup_lambda = engine.state.unsup_lambda if engine.state.unsup_lambda < m else m

    common.setup_common_distrib_training_handlers(
        trainer, train_sampler,
        to_save={'model': model, 'optimizer': optimizer},
        save_every_iters=1000, output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler, output_names=output_names,
        with_pbars=True, with_pbar_on_iters=with_pbar_on_iters, log_every_iters=1
    )

    def output_transform(output):
        return output['y_pred'], output['y']

    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes, output_transform=output_transform)
    pr = cmPrecision(cm_metric, average=False)
    re = cmRecall(cm_metric, average=False)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
        "Accuracy": cmAccuracy(cm_metric),
        "Precision": pr,
        "Recall": re,
        "F1": Fbeta(beta=1.0, output_transform=output_transform)
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator_args = dict(
        model=model, metrics=val_metrics, device=device, non_blocking=non_blocking, prepare_batch=prepare_batch,
        output_transform=lambda x, y, y_pred: {'y_pred': model_output_transform(y_pred), 'y': y}
    )
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_pbar_on_iters:
        ProgressBar(persist=False, desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(engine):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    if dist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(config.output_path.as_posix(), trainer, optimizer,
                                            evaluators={"training": train_evaluator, "validation": evaluator})
        common.setup_mlflow_logging(trainer, optimizer,
                                    evaluators={"training": train_evaluator, "validation": evaluator})

        common.save_best_model_by_val_score(config.output_path.as_posix(), evaluator, model,
                                            metric_name=score_metric_name, trainer=trainer)

        # Log unsup_lambda
        @trainer.on(Events.ITERATION_COMPLETED(every=100))
        def tblog_unsupervised_lambda(engine):
            tb_logger.writer.add_scalar("training/unsupervised lambda", engine.state.unsup_lambda,
                                        engine.state.iteration)
            mlflow.log_metric("training unsupervised lambda", engine.state.unsup_lambda,
                              step=engine.state.iteration)

        # Log train/val predictions:
        # `once` must be a positive integer, so clamp it for loaders with a
        # single batch (len // 2 == 0).
        tb_logger.attach(evaluator,
                         log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize,
                                                                   n_images=15,
                                                                   another_engine=trainer,
                                                                   prefix_tag="validation"),
                         event_name=Events.ITERATION_COMPLETED(once=max(1, len(val_loader) // 2)))

        log_train_predictions = getattr(config, "log_train_predictions", False)
        if log_train_predictions:
            # Tag train-set images as "training" so they are not logged
            # under the validation tag.
            tb_logger.attach(train_evaluator,
                             log_handler=predictions_gt_images_handler(img_denormalize_fn=config.img_denormalize,
                                                                       n_images=15,
                                                                       another_engine=trainer,
                                                                       prefix_tag="training"),
                             event_name=Events.ITERATION_COMPLETED(once=max(1, len(train_eval_loader) // 2)))

    trainer.run(train_loader, max_epochs=config.num_epochs)