def run_training(model, optimizer, scheduler, output_path, train_loader, val_loader,
                 epochs, patience, epochs_pretrain, mixed_precision, classes_weights):

    # trainer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if classes_weights is not None:
        classes_weights = classes_weights.to(device)
    crit = nn.CrossEntropyLoss(weight=classes_weights)
    metrics = {"accuracy": Accuracy(), "loss": Loss(crit)}
    trainer = create_supervised_trainer_with_pretraining(
        model, optimizer, crit, device=device,
        epochs_pretrain=epochs_pretrain, mixed_precision=mixed_precision)
    train_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)
    val_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)

    # Out paths
    path_ckpt = os.path.join(output_path, "model_ckpt")
    log_dir = os.path.join(output_path, "log_dir")
    os.makedirs(log_dir, exist_ok=True)

    # tensorboard
    tb_logger = TensorboardLogger(log_dir=log_dir)
    tb_logger.attach(train_evaluator,
                     log_handler=OutputHandler(tag="training",
                                               metric_names=["accuracy", "loss"],
                                               another_engine=trainer),
                     event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(val_evaluator,
                     log_handler=OutputHandler(tag="validation",
                                               metric_names=["accuracy", "loss"],
                                               another_engine=trainer),
                     event_name=Events.EPOCH_COMPLETED)

    # training progress
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names="all")

    # @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        train_evaluator.run(train_loader)
        val_evaluator.run(val_loader)
        train_loss = train_evaluator.state.metrics["loss"]
        val_loss = val_evaluator.state.metrics["loss"]
        train_acc = train_evaluator.state.metrics["accuracy"]
        val_acc = val_evaluator.state.metrics["accuracy"]
        pbar.log_message(
            "Training Results - Epoch: {} Loss: {:.6f} Accuracy: {:.6f}".format(
                engine.state.epoch, train_loss, train_acc))
        pbar.log_message(
            "Validation Results - Epoch: {} Loss: {:.6f} Accuracy: {:.6f}".format(
                engine.state.epoch, val_loss, val_acc))
        pbar.n = pbar.last_print_n = 0

    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results)

    # def get_val_loss(engine):
    #     return -engine.state.metrics['loss']

    def get_val_acc(engine):
        return engine.state.metrics['accuracy']

    # checkpoint and early stopping
    checkpointer = ModelCheckpoint(
        path_ckpt, "model", score_function=get_val_acc, score_name="accuracy",
        require_empty=False)
    early_stopper = EarlyStopping(patience, get_val_acc, trainer)

    to_save = {'optimizer': optimizer, 'model': model}
    if scheduler is not None:
        to_save["scheduler"] = scheduler
    val_evaluator.add_event_handler(Events.COMPLETED, checkpointer, to_save)
    val_evaluator.add_event_handler(Events.COMPLETED, early_stopper)
    if scheduler is not None:
        trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # free resources
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, lambda _: _empty_cache())
    train_evaluator.add_event_handler(
        Events.ITERATION_COMPLETED, lambda _: _empty_cache())
    val_evaluator.add_event_handler(
        Events.ITERATION_COMPLETED, lambda _: _empty_cache())

    trainer.run(train_loader, max_epochs=epochs)
    tb_logger.close()

    # Evaluation with best model
    model.load_state_dict(torch.load(
        glob.glob(os.path.join(path_ckpt, "*.pth"))[0])["model"])
    train_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)
    val_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)
    train_evaluator.run(train_loader)
    val_evaluator.run(val_loader)

    _pretty_print("Evaluating best model")
    pbar.log_message(
        "Best model on training set - Loss: {:.6f} Accuracy: {:.6f}".format(
            train_evaluator.state.metrics["loss"],
            train_evaluator.state.metrics["accuracy"]))
    pbar.log_message(
        "Best model on validation set - Loss: {:.6f} Accuracy: {:.6f}".format(
            val_evaluator.state.metrics["loss"],
            val_evaluator.state.metrics["accuracy"]))

    return model, train_evaluator.state.metrics, val_evaluator.state.metrics
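
# NOTE: `_empty_cache` and `_pretty_print` are used by `run_training` above but are not part of
# this snippet. The helpers below are minimal sketches (assumptions about what such utilities
# typically do), not the original implementations.
def _empty_cache():
    # Release cached, unused GPU memory so the evaluators do not run out of memory.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def _pretty_print(message):
    # Print a message framed by a simple separator for readability.
    print("=" * 40)
    print(message)
    print("=" * 40)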
def run(local_rank: int, config: Any, *args: Any, **kwargs: Any): """function to be run by idist.Parallel context manager.""" # ---------------------- # make a certain seed # ---------------------- rank = idist.get_rank() manual_seed(config.seed + rank) # ----------------------- # create output folder # ----------------------- if rank == 0: now = datetime.now().strftime("%Y%m%d-%H%M%S") name = f"{config.model}-backend-{idist.backend()}-{now}" path = Path(config.output_dir, name) path.mkdir(parents=True, exist_ok=True) config.output_dir = path.as_posix() config.output_dir = Path(idist.broadcast(config.output_dir, src=0)) # ----------------------------- # datasets and dataloaders # ----------------------------- # TODO : PLEASE provide your custom datasets and dataloaders configurations # we can use `idist.auto_dataloader` to handle distributed configurations # TODO : PLEASE replace `kwargs` with your desirable DataLoader arguments # See : https://pytorch.org/ignite/distributed.html#ignite.distributed.auto.auto_dataloader train_dataset, eval_dataset = get_datasets(path=config.data_path) train_dataloader = idist.auto_dataloader( train_dataset, batch_size=config.train_batch_size, num_workers=config.num_workers, shuffle=True, {% if use_distributed_training and not use_distributed_launcher %} persistent_workers=True, {% endif %} ) eval_dataloader = idist.auto_dataloader( eval_dataset, batch_size=config.eval_batch_size, num_workers=config.num_workers, shuffle=False, {% if use_distributed_training and not use_distributed_launcher %} persistent_workers=True, {% endif %} ) # ------------------------------------------ # model, optimizer, loss function, device # ------------------------------------------ device = idist.device() config.num_iters_per_epoch = len(train_dataloader) model, optimizer, loss_fn, lr_scheduler = initialize(config=config) # ----------------------------- # trainer and evaluator # ----------------------------- trainer, evaluator = create_trainers( config=config, model=model, optimizer=optimizer, loss_fn=loss_fn, device=device, ) # --------------------------------- # attach metrics to evaluator # --------------------------------- accuracy = Accuracy(device=device) metrics = { "eval_accuracy": accuracy, "eval_loss": Loss(loss_fn, device=device), "eval_error": (1.0 - accuracy) * 100, } for name, metric in metrics.items(): metric.attach(evaluator, name) # ------------------------------------------- # setup engines logger with python logging # print training configurations # ------------------------------------------- logger = setup_logging(config) log_basic_info(logger, config) trainer.logger = logger evaluator.logger = logger # ------------------------------------- # ignite handlers and ignite loggers # ------------------------------------- to_save = {"model": model, "optimizer": optimizer, "trainer": trainer, "lr_scheduler": lr_scheduler} best_model_handler, es_handler, timer_handler = get_handlers( config=config, model=model, trainer=trainer, evaluator=evaluator, metric_name="eval_accuracy", es_metric_name="eval_accuracy", to_save=to_save, lr_scheduler=lr_scheduler, output_names=None, ) # setup ignite logger only on rank 0 if rank == 0: logger_handler = get_logger( config=config, trainer=trainer, evaluator=evaluator, optimizers=optimizer ) # ----------------------------------- # resume from the saved checkpoints # ----------------------------------- if config.resume_from: resume_from(to_load=to_save, checkpoint_fp=config.resume_from) # -------------------------------- # print 
metrics to the stderr # with `add_event_handler` API # for training stats # -------------------------------- trainer.add_event_handler(Events.ITERATION_COMPLETED(every=config.log_every_iters), log_metrics, tag="train") # --------------------------------------------- # run evaluation at every training epoch end # with shortcut `on` decorator API and # print metrics to the stderr # again with `add_event_handler` API # for evaluation stats # --------------------------------------------- @trainer.on(Events.EPOCH_COMPLETED(every=1)) def _(): evaluator.run(eval_dataloader, epoch_length=config.eval_epoch_length) log_metrics(evaluator, "eval") # -------------------------------------------------- # let's try run evaluation first as a sanity check # -------------------------------------------------- @trainer.on(Events.STARTED) def _(): evaluator.run(eval_dataloader, epoch_length=config.eval_epoch_length) # ------------------------------------------ # setup if done. let's run the training # ------------------------------------------ trainer.run(train_dataloader, max_epochs=config.max_epochs, epoch_length=config.train_epoch_length) # ------------------------------------------------------------ # close the logger after the training completed / terminated # ------------------------------------------------------------ if rank == 0: from ignite.contrib.handlers.wandb_logger import WandBLogger if isinstance(logger_handler, WandBLogger): # why handle differently for wandb ? # See : https://github.com/pytorch/ignite/issues/1894 logger_handler.finish() elif logger_handler: logger_handler.close() # ----------------------------------------- # where is my best and last checkpoint ? # ----------------------------------------- if best_model_handler is not None: logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
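
# NOTE: `resume_from` is called above but not defined in this snippet. A minimal sketch, assuming
# the checkpoint was written by ignite's Checkpoint handler; the helper name and signature mirror
# the call site and are otherwise hypothetical.
from pathlib import Path

import torch
from ignite.handlers import Checkpoint


def resume_from(to_load, checkpoint_fp):
    # Restore model / optimizer / trainer / lr_scheduler state from a checkpoint file.
    checkpoint_fp = Path(checkpoint_fp)
    assert checkpoint_fp.exists(), f"Checkpoint '{checkpoint_fp}' was not found"
    checkpoint = torch.load(checkpoint_fp.as_posix(), map_location="cpu")
    Checkpoint.load_objects(to_load=to_load, checkpoint=checkpoint)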
trainer = Engine(process_function)
train_evaluator = Engine(eval_function)
validator_evaluator = Engine(eval_function)

RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')


def thresholded_output_transform(output):
    y_pred, y = output
    y_pred = torch.round(y_pred)
    return y_pred, y


Accuracy(output_transform=thresholded_output_transform).attach(train_evaluator, 'accuracy')
Loss(loss_function).attach(train_evaluator, 'loss_train')  # binary cross entropy
Accuracy(output_transform=thresholded_output_transform).attach(validator_evaluator, 'accuracy')
Loss(loss_function).attach(validator_evaluator, 'loss_val')

pbar = ProgressBar(persist=True, bar_format="")
pbar.attach(trainer, ['loss'])


@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(engine):
    train_evaluator.run(train_iter)
    metrics = train_evaluator.state.metrics
    avg_accuracy = metrics['accuracy']
    avg_loss = metrics['loss_train']
    pbar.log_message(
def run(*options, cfg=None, debug=False): """Run training and validation of model Notes: Options can be passed in via the options argument and loaded from the cfg file Options loaded from default.py will be overridden by those loaded from cfg file Options passed in via options argument will override those loaded from cfg file Args: *options (str, int, optional): Options used to overide what is loaded from the config. To see what options are available consult default.py cfg (str, optional): Location of config file to load. Defaults to None. debug (bool): Places scripts in debug/test mode and only executes a few iterations """ update_config(config, options=options, config_file=cfg) # we will write the model under outputs / config_file_name / model_dir config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0] # Start logging load_log_configuration(config.LOG_CONFIG) logger = logging.getLogger(__name__) logger.debug(config.WORKERS) epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK torch.manual_seed(config.SEED) if torch.cuda.is_available(): torch.cuda.manual_seed_all(config.SEED) np.random.seed(seed=config.SEED) device = "cpu" if torch.cuda.is_available(): device = "cuda" # Setup Augmentations basic_aug = Compose( [ Normalize(mean=(config.TRAIN.MEAN,), std=(config.TRAIN.STD,), max_pixel_value=config.TRAIN.MAX,), PadIfNeeded( min_height=config.TRAIN.PATCH_SIZE, min_width=config.TRAIN.PATCH_SIZE, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=mask_value, value=0, ), Resize( config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True, ), PadIfNeeded( min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT, min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=mask_value, value=0, ), ] ) if config.TRAIN.AUGMENTATION: train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)]) val_aug = basic_aug else: train_aug = val_aug = basic_aug PenobscotDataset = get_patch_dataset(config) train_set = PenobscotDataset( config.DATASET.ROOT, config.TRAIN.PATCH_SIZE, config.TRAIN.STRIDE, split="train", transforms=train_aug, n_channels=config.MODEL.IN_CHANNELS, complete_patches_only=config.TRAIN.COMPLETE_PATCHES_ONLY, ) val_set = PenobscotDataset( config.DATASET.ROOT, config.TRAIN.PATCH_SIZE, config.TRAIN.STRIDE, split="val", transforms=val_aug, n_channels=config.MODEL.IN_CHANNELS, complete_patches_only=config.VALIDATION.COMPLETE_PATCHES_ONLY, ) logger.info(train_set) logger.info(val_set) n_classes = train_set.n_classes train_loader = data.DataLoader( train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=True, ) if debug: val_set = data.Subset(val_set, range(3)) val_loader = data.DataLoader(val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS) model = getattr(models, config.MODEL.NAME).get_seg_model(config) model = model.to(device) # Send to GPU optimizer = torch.optim.SGD( model.parameters(), lr=config.TRAIN.MAX_LR, momentum=config.TRAIN.MOMENTUM, weight_decay=config.TRAIN.WEIGHT_DECAY, ) try: output_dir = generate_path( config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(), ) except TypeError: output_dir = generate_path(config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),) summary_writer = create_summary_writer(log_dir=path.join(output_dir, 
config.LOG_DIR)) snapshot_duration = epochs_per_cycle * len(train_loader) if not debug else 2 * len(train_loader) scheduler = CosineAnnealingScheduler( optimizer, "lr", config.TRAIN.MAX_LR, config.TRAIN.MIN_LR, cycle_size=snapshot_duration ) # weights are inversely proportional to the frequency of the classes in # the training set class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False) criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=mask_value, reduction="mean") trainer = create_supervised_trainer(model, optimizer, criterion, _prepare_batch, device=device) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) trainer.add_event_handler( Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.TRAIN.BATCH_SIZE_PER_GPU), ) trainer.add_event_handler(Events.EPOCH_STARTED, logging_handlers.log_lr(optimizer)) trainer.add_event_handler( Events.EPOCH_STARTED, tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"), ) trainer.add_event_handler( Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer), ) def _select_pred_and_mask(model_out_dict): return (model_out_dict["y_pred"].squeeze(), model_out_dict["mask"].squeeze()) evaluator = create_supervised_evaluator( model, _prepare_batch, metrics={ "pixacc": pixelwise_accuracy(n_classes, output_transform=_select_pred_and_mask), "nll": Loss(criterion, output_transform=_select_pred_and_mask), "cacc": class_accuracy(n_classes, output_transform=_select_pred_and_mask), "mca": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask), "ciou": class_iou(n_classes, output_transform=_select_pred_and_mask), "mIoU": mean_iou(n_classes, output_transform=_select_pred_and_mask), }, device=device, ) # Set the validation run to start on the epoch completion of the training run trainer.add_event_handler(Events.EPOCH_COMPLETED, Evaluator(evaluator, val_loader)) evaluator.add_event_handler( Events.EPOCH_COMPLETED, logging_handlers.log_metrics( "Validation results", metrics_dict={ "nll": "Avg loss :", "pixacc": "Pixelwise Accuracy :", "mca": "Avg Class Accuracy :", "mIoU": "Avg Class IoU :", }, ), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, tensorboard_handlers.log_metrics( summary_writer, trainer, "epoch", metrics_dict={ "mIoU": "Validation/mIoU", "nll": "Validation/Loss", "mca": "Validation/MCA", "pixacc": "Validation/Pixel_Acc", }, ), ) def _select_max(pred_tensor): return pred_tensor.max(1)[1] def _tensor_to_numpy(pred_tensor): return pred_tensor.squeeze().cpu().numpy() transform_func = compose(np_to_tb, decode_segmap, _tensor_to_numpy,) transform_pred = compose(transform_func, _select_max) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Image", "image"), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Mask", "mask", transform_func=transform_func), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Pred", "y_pred", transform_func=transform_pred), ) def snapshot_function(): return (trainer.state.iteration % snapshot_duration) == 0 checkpoint_handler = SnapshotHandler(output_dir, config.MODEL.NAME, extract_metric_from("mIoU"), snapshot_function,) evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model}) logger.info("Starting training") if debug: trainer.run( train_loader, max_epochs=config.TRAIN.END_EPOCH, 
            epoch_length=config.TRAIN.BATCH_SIZE_PER_GPU,
            seed=config.SEED,
        )
    else:
        trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED)
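
# NOTE: `Evaluator` is attached to EPOCH_COMPLETED above but is not defined in this snippet.
# A minimal sketch (assumption): a callable handler that runs the evaluation engine over the
# validation loader whenever the trainer finishes an epoch.
class Evaluator:
    def __init__(self, engine, data_loader):
        self._engine = engine
        self._data_loader = data_loader

    def __call__(self, trainer):
        # Run one full evaluation pass; metrics land in self._engine.state.metrics.
        self._engine.run(self._data_loader)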
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)

    # define a trainer
    trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device)
    # define an evaluator
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy': Accuracy(),
                                                'nll': Loss(F.nll_loss)
                                            },
                                            device=device)

    # progress reporting
    desc = "ITERATION - loss: {:.2f}"  # the loss of each iteration while training
    pbar = tqdm(initial=0, leave=False, total=len(train_loader),
                desc=desc.format(0))  # progress of the current iteration within the epoch

    @trainer.on(Events.ITERATION_COMPLETED)  # called when an iteration is completed
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)  # update the training loss
            pbar.update(log_interval)  # update the progress bar

    @trainer.on(Events.EPOCH_COMPLETED)  # called when an epoch is completed
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

    @trainer.on(Events.EPOCH_COMPLETED)  # called when an epoch is completed
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        pbar.n = pbar.last_print_n = 0

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
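
# NOTE: `get_data_loaders` is not shown above. A minimal sketch (assumption), using torchvision's
# MNIST dataset, which matches the `Net` / `F.nll_loss` setup of this example.
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms import Compose, Normalize, ToTensor


def get_data_loaders(train_batch_size, val_batch_size):
    data_transform = Compose([ToTensor(), Normalize((0.1307,), (0.3081,))])
    train_loader = DataLoader(MNIST(root=".", train=True, download=True, transform=data_transform),
                              batch_size=train_batch_size, shuffle=True)
    val_loader = DataLoader(MNIST(root=".", train=False, download=True, transform=data_transform),
                            batch_size=val_batch_size, shuffle=False)
    return train_loader, val_loader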
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval): train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size) model = FushionNet() #model=torch.load(SAVE_PATH+"350-0.908.pth") if torch.cuda.is_available(): device = 'cuda' else: device = 'cpu' optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=2e-6, nesterov=False) #optimizer = optim.Adamax(model.parameters(),lr,(0.9,0.999),1e-8,1e-6) scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [50, 100, 150, 200, 250, 300], 0.1) trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device) evaluator = create_supervised_evaluator(model, metrics={ 'accuracy': Accuracy(), 'nll': Loss(F.nll_loss) }, device=device) desc = "ITERATION - loss: {:.2f}" pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0)) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): iter = (engine.state.iteration - 1) % len(train_loader) + 1 if iter % log_interval == 0: pbar.desc = desc.format(engine.state.output) pbar.update(log_interval) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): pbar.refresh() evaluator.run(train_loader) metrics = evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_nll = metrics['nll'] current_lr = optimizer.param_groups[0]['lr'] tqdm.write( "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f} Current lr: {:.6f}" .format(engine.state.epoch, avg_accuracy, avg_nll, current_lr)) scheduler.step() @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) metrics = evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_nll = metrics['nll'] tqdm.write( "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_accuracy, avg_nll)) pbar.n = pbar.last_print_n = 0 if (engine.state.epoch % 10 == 0): torch.save( model, SAVE_PATH + str(engine.state.epoch) + "-" + str(avg_accuracy) + ".pth") trainer.run(train_loader, max_epochs=epochs) pbar.close()
def run(train_batch_size, val_batch_size, epochs, learning_rate, weight_decay, log_interval, log_dir): train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size) test_loader = get_test_loader(val_batch_size) use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") print("Pytorch Version:", torch.__version__) print('device={}'.format(device)) model = CP_MixedNet() writer = create_summary_writer(model, train_loader, log_dir) optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device) evaluator = create_supervised_evaluator(model, metrics={ 'accuracy': Accuracy(), 'nll': Loss(F.nll_loss) }, device=device) evaluator_val = create_supervised_evaluator(model, metrics={ 'accuracy': Accuracy(), 'nll': Loss(F.nll_loss) }, device=device) desc = "ITERATION - loss: {:.2f}" pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0)) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): iter = (engine.state.iteration - 1) % len(train_loader) + 1 if iter % log_interval == 0: pbar.desc = desc.format(engine.state.output) pbar.update(log_interval) writer.add_scalar("training/loss", engine.state.output, engine.state.iteration) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): pbar.refresh() evaluator.run(train_loader) metrics = evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_nll = metrics['nll'] tqdm.write( "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_accuracy, avg_nll)) writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch) writer.add_scalar("training/avg_accuracy", avg_accuracy, engine.state.epoch) @trainer.on(Events.EPOCH_COMPLETED) def log_test_results(engine): evaluator.run(test_loader) metrics = evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_nll = metrics['nll'] tqdm.write( "Test Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}". format(engine.state.epoch, avg_accuracy, avg_nll)) pbar.n = pbar.last_print_n = 0 writer.add_scalar("test/avg_loss", avg_nll, engine.state.epoch) writer.add_scalar("test/avg_accuracy", avg_accuracy, engine.state.epoch) handler = EarlyStopping(patience=400, score_function=score_function, trainer=trainer) evaluator_val.add_event_handler(Events.COMPLETED, handler) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator_val.run(val_loader) metrics = evaluator_val.state.metrics avg_accuracy = metrics['accuracy'] avg_nll = metrics['nll'] tqdm.write( "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_accuracy, avg_nll)) pbar.n = pbar.last_print_n = 0 writer.add_scalar("val/avg_loss", avg_nll, engine.state.epoch) writer.add_scalar("val/avg_accuracy", avg_accuracy, engine.state.epoch) trainer.run(train_loader, max_epochs=epochs) pbar.close() writer.close() save_model = True if (save_model): torch.save(model.state_dict(), "weights_BCI.pt")
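
# NOTE: `score_function` passed to EarlyStopping above is not defined in this snippet. A minimal
# sketch (assumption): EarlyStopping treats larger scores as better, so validation accuracy can be
# returned directly (or, equivalently, the negated validation loss).
def score_function(engine):
    return engine.state.metrics['accuracy']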
def main(dataset_path, batch_size=256, max_epochs=10):
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    device = "cuda"
    train_loader, test_loader, eval_train_loader = get_train_eval_loaders(
        dataset_path, batch_size=batch_size)

    model = wide_resnet50_2(num_classes=100).to(device)
    optimizer = SGD(model.parameters(), lr=0.01)
    criterion = CrossEntropyLoss().to(device)
    scaler = GradScaler()

    def train_step(engine, batch):
        x = convert_tensor(batch[0], device, non_blocking=True)
        y = convert_tensor(batch[1], device, non_blocking=True)

        optimizer.zero_grad()

        # Runs the forward pass with autocasting.
        with autocast():
            y_pred = model(x)
            loss = criterion(y_pred, y)

        # Scales loss. Calls backward() on scaled loss to create scaled gradients.
        # Backward passes under autocast are not recommended.
        # Backward ops run in the same precision that autocast used for corresponding forward ops.
        scaler.scale(loss).backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()

        return loss.item()

    trainer = Engine(train_step)
    timer = Timer(average=True)
    timer.attach(trainer, step=Events.EPOCH_COMPLETED)
    ProgressBar(persist=True).attach(
        trainer, output_transform=lambda out: {"batch loss": out})

    metrics = {"Accuracy": Accuracy(), "Loss": Loss(criterion)}
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def log_metrics(engine, title):
        for name in metrics:
            print(f"\t{title} {name}: {engine.state.metrics[name]:.2f}")

    @trainer.on(Events.COMPLETED)
    def run_validation(_):
        print(f"- Mean elapsed time for 1 epoch: {timer.value()}")
        print("- Metrics:")
        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Train"):
            evaluator.run(eval_train_loader)
        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Test"):
            evaluator.run(test_loader)

    trainer.run(train_loader, max_epochs=max_epochs)
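
# NOTE: `get_train_eval_loaders` is not shown above. A minimal sketch (assumption) built on
# torchvision's CIFAR100 to match `num_classes=100`; the extra eval-train loader iterates the
# training images without shuffling so training metrics can be computed after the run.
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from torchvision.transforms import Compose, Normalize, ToTensor


def get_train_eval_loaders(path, batch_size=256):
    transform = Compose([ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    train_ds = CIFAR100(root=path, train=True, download=True, transform=transform)
    test_ds = CIFAR100(root=path, train=False, download=True, transform=transform)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
    eval_train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
    return train_loader, test_loader, eval_train_loader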
def run(tb, vb, lr, epochs, writer): device = os.environ['main-device'] logging.info('Training program start!') logging.info('Configuration:') logging.info('\n' + json.dumps(INFO, indent=2)) # ------------------------------------ # 1. Define dataloader train_loader, train4val_loader, val_loader, num_of_images, mapping, _ = get_dataloaders( tb, vb) # train_loader, train4val_loader, val_loader, num_of_images = get_dataloaders(tb, vb) # Adjust weights of unknown num_of_images[6] += int(sum(num_of_images) / len(num_of_images)) weights = (1 / num_of_images) / ((1 / num_of_images).sum().item()) # weights = (1/num_of_images)/(1/num_of_images + 1/(num_of_images.sum().item()-num_of_images)) weights = weights.to(device=device) # ------------------------------------ # 2. Define model model = EfficientNet.from_pretrained( 'efficientnet-b0', num_classes=INFO['dataset-info']['num-of-classes']) model = carrier(model) # ------------------------------------ # 3. Define optimizer optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9) scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200) ignite_scheduler = LRScheduler(scheduler) # ------------------------------------ # 4. Define metrics class SoftCrossEntropyLoss(nn.Module): def __init__(self, weight=None): super(SoftCrossEntropyLoss, self).__init__() self.class_weights = weight def forward(self, input, target): softmax = torch.exp(input) / torch.exp(input).sum(1)[:, None] onehot_labels = to_onehot(target, input.shape[1]) soft_labels = torch.zeros_like(onehot_labels) soft_labels = torch.where( onehot_labels.cpu() == 1, torch.tensor([0.9]), torch.tensor([0.1 / (input.shape[1] - 1)])).to(device=device) if self.class_weights is not None: # print(soft_labels.shape, softmax.shape) loss = -torch.sum( torch.log(softmax) * soft_labels * self.class_weights * input.shape[1]) else: loss = -torch.sum(torch.log(softmax) * soft_labels) return loss class EntropyPrediction(metric.Metric): def __init__(self, threshold=1.0): super(EntropyPrediction, self).__init__() self.threshold = threshold self.prediction = torch.tensor([], dtype=torch.int) self.y = torch.tensor([], dtype=torch.int) def reset(self): # self.threshold = 0.3 self.prediction = torch.tensor([]) self.y = torch.tensor([]) super(EntropyPrediction, self).reset() def update(self, output): y_pred, y = output softmax = torch.exp(y_pred) / torch.exp(y_pred).sum(1)[:, None] entropy_base = math.log(y_pred.shape[1]) entropy = (-softmax * torch.log(softmax)).sum(1) / entropy_base values, inds = softmax.max(1) prediction = torch.where(entropy > self.threshold, inds, torch.tensor([-1]).to(device=device)) self.prediction = torch.cat( (self.prediction.type(torch.LongTensor).to(device=device), torch.tensor([mapping[x.item()] for x in prediction]).to(device=device))) self.y = torch.cat( (self.y.type(torch.LongTensor).to(device=device), y.to(device=device))) # return self.prediction, self.y def compute(self): return self.prediction, self.y train_metrics = { 'accuracy': Accuracy(), 'loss': Loss(nn.CrossEntropyLoss(weight=weights)), 'precision_recall': MetricsLambda(PrecisionRecallTable, Precision(), Recall(), train_loader.dataset.classes), 'cmatrix': MetricsLambda(CMatrixTable, ConfusionMatrix(INFO['dataset-info']['num-of-classes']), train_loader.dataset.classes) } val_metrics = { 'accuracy': MetricsLambda(Labels2Acc, EntropyPrediction()), 'precision_recall': MetricsLambda(Labels2PrecisionRecall, EntropyPrediction(), val_loader.dataset.classes), 'cmatrix': MetricsLambda(Labels2CMatrix, EntropyPrediction(), 
val_loader.dataset.classes) } # ------------------------------------ # 5. Create trainer trainer = create_supervised_trainer(model, optimizer, nn.CrossEntropyLoss(weight=weights), device=device) # ------------------------------------ # 6. Create evaluator train_evaluator = create_supervised_evaluator(model, metrics=train_metrics, device=device) val_evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=device) desc = 'ITERATION - loss: {:.4f}' pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0)) # ------------------------------------ # 7. Create event hooks # Update process bar on each iteration completed. @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): log_interval = 1 iter = (engine.state.iteration - 1) % len(train_loader) + 1 if iter % log_interval == 0: pbar.desc = desc.format(engine.state.output) pbar.update(log_interval) @trainer.on(Events.EPOCH_STARTED) def refresh_pbar(engine): pbar.refresh() pbar.n = pbar.last_print_n = 0 # Compute metrics on train data on each epoch completed. @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): print('Checking on training set.') train_evaluator.run(train4val_loader) metrics = train_evaluator.state.metrics avg_accuracy = metrics['accuracy'] avg_loss = metrics['loss'] precision_recall = metrics['precision_recall'] cmatrix = metrics['cmatrix'] prompt = """ Training Results - Epoch: {} Avg accuracy: {:.4f} Avg loss: {:.4f} precision_recall: \n{} confusion matrix: \n{} """.format(engine.state.epoch, avg_accuracy, avg_loss, precision_recall['pretty'], cmatrix['pretty']) tqdm.write(prompt) logging.info('\n' + prompt) writer.add_text(os.environ['run-id'], prompt, engine.state.epoch) writer.add_scalars('Aggregate/Acc', {'Train Acc': avg_accuracy}, engine.state.epoch) writer.add_scalars('Aggregate/Loss', {'Train Loss': avg_loss}, engine.state.epoch) # Compute metrics on val data on each epoch completed. cpe = CustomPeriodicEvent(n_epochs=50) cpe.attach(trainer) @trainer.on(cpe.Events.EPOCHS_50_COMPLETED) def log_validation_results(engine): pbar.clear() print('* - * - * - * - * - * - * - * - * - * - * - * - *') print('Checking on validation set.') val_evaluator.run(val_loader) metrics = val_evaluator.state.metrics avg_accuracy = metrics['accuracy'] precision_recall = metrics['precision_recall'] cmatrix = metrics['cmatrix'] prompt = """ Validating Results - Epoch: {} Avg accuracy: {:.4f} precision_recall: \n{} confusion matrix: \n{} """.format(engine.state.epoch, avg_accuracy, precision_recall['pretty'], cmatrix['pretty']) tqdm.write(prompt) logging.info('\n' + prompt) writer.add_text(os.environ['run-id'], prompt, engine.state.epoch) writer.add_scalars('Aggregate/Acc', {'Val Acc': avg_accuracy}, engine.state.epoch) writer.add_scalars( 'Aggregate/Score', { 'Val avg precision': precision_recall['data'][0, -1], 'Val avg recall': precision_recall['data'][1, -1] }, engine.state.epoch) # Save model ever N epoch. save_model_handler = ModelCheckpoint(os.environ['savedir'], '', save_interval=10, n_saved=2) trainer.add_event_handler(Events.EPOCH_COMPLETED, save_model_handler, {'model': model}) # Update learning-rate due to scheduler. trainer.add_event_handler(Events.EPOCH_STARTED, ignite_scheduler) # ------------------------------------ # Run trainer.run(train_loader, max_epochs=epochs) pbar.close()
def main(batch_size, epochs):
    # 1. GPU setup (in PyTorch the device must be specified explicitly)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)

    train_loader, test_loader = get_data_loaders(batch_size)

    # 2. Build the model
    # model = net.CNN(num_classes=num_classes).to(device)
    model = net.Net(1000, 10).to(device)
    print(model)  # print the network details for inspection

    # 3. Define the loss function
    criterion = nn.CrossEntropyLoss()

    # 4. Define the optimizer (Adam is chosen here as an example)
    # optimizer = optim.Adam(model.parameters(), lr=0.001)
    optimizer = optim.Adam(model.parameters())

    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics={
                                                      'accuracy': Accuracy(),
                                                      'loss': Loss(criterion)
                                                  },
                                                  device=device)
    test_evaluator = create_supervised_evaluator(model,
                                                 metrics={
                                                     'accuracy': Accuracy(),
                                                     'loss': Loss(criterion)
                                                 },
                                                 device=device)

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0))
    log_interval = 10

    # 5. Logging
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        i = (engine.state.iteration - 1) % len(train_loader) + 1
        if i % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        train_evaluator.run(train_loader)
        metrics = train_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['loss']
        tqdm.write(
            "Training Results - Epoch: {} Avg accuracy: {:.3f} Avg loss: {:.4f}"
            .format(engine.state.epoch, avg_accuracy, avg_loss))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        test_evaluator.run(test_loader)
        metrics = test_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['loss']
        tqdm.write(
            "Validation Results - Epoch: {} Avg accuracy: {:.3f} Avg loss: {:.4f}"
            .format(engine.state.epoch, avg_accuracy, avg_loss))
        pbar.n = pbar.last_print_n = 0

    def score_function(engine):
        val_loss = engine.state.metrics['loss']
        return -val_loss

    # 6. Checkpoint setting
    best_handler = ModelCheckpoint(dirname='./checkpoints',
                                   filename_prefix='best',
                                   n_saved=3,
                                   score_name='loss',
                                   score_function=score_function,
                                   create_dir=True,
                                   require_empty=False)
    test_evaluator.add_event_handler(Events.EPOCH_COMPLETED, best_handler, {'mymodel': model})

    early_handler = EarlyStopping(patience=5, score_function=score_function, trainer=trainer)
    # Note: the handler is attached to an *Evaluator* (runs one epoch on the validation dataset)
    test_evaluator.add_event_handler(Events.COMPLETED, early_handler)

    # 7. Run training
    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
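
# NOTE (assumption): restoring the best checkpoint written by `best_handler` above. The exact file
# name depends on the ignite version, so this hypothetical helper simply picks the newest file
# matching the 'best' prefix; it is not part of the original code.
import glob
import os

import torch


def load_best_checkpoint(model, ckpt_dir='./checkpoints'):
    candidates = glob.glob(os.path.join(ckpt_dir, 'best_*'))
    assert candidates, "no checkpoint found in {}".format(ckpt_dir)
    best_ckpt = max(candidates, key=os.path.getmtime)
    model.load_state_dict(torch.load(best_ckpt, map_location='cpu'))
    return model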
def train_with_ignite(networks, dataset, data_dir, batch_size, img_size, epochs, lr, momentum, num_workers, optimizer, logger): from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator from ignite.metrics import Loss from utils.metrics import MultiThresholdMeasures, Accuracy, IoU, F1score # device device = 'cuda' if torch.cuda.is_available() else 'cpu' # build model model = get_network(networks) # log model summary input_size = (3, img_size, img_size) summarize_model(model.to(device), input_size, logger, batch_size, device) # build loss loss = torch.nn.BCEWithLogitsLoss() # build optimizer and scheduler model_optimizer = get_optimizer(optimizer, model, lr, momentum) lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(model_optimizer) # transforms on both image and mask train_joint_transforms = jnt_trnsf.Compose([ jnt_trnsf.RandomCrop(img_size), jnt_trnsf.RandomRotate(5), jnt_trnsf.RandomHorizontallyFlip() ]) # transforms only on images train_image_transforms = std_trnsf.Compose([ std_trnsf.ColorJitter(0.05, 0.05, 0.05, 0.05), std_trnsf.ToTensor(), std_trnsf.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ]) test_joint_transforms = jnt_trnsf.Compose([jnt_trnsf.Safe32Padding()]) test_image_transforms = std_trnsf.Compose([ std_trnsf.ToTensor(), std_trnsf.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ]) # transforms only on mask mask_transforms = std_trnsf.Compose([std_trnsf.ToTensor()]) # build train / test loader train_loader = get_loader(dataset=dataset, data_dir=data_dir, train=True, joint_transforms=train_joint_transforms, image_transforms=train_image_transforms, mask_transforms=mask_transforms, batch_size=batch_size, shuffle=False, num_workers=num_workers) test_loader = get_loader(dataset=dataset, data_dir=data_dir, train=False, joint_transforms=test_joint_transforms, image_transforms=test_image_transforms, mask_transforms=mask_transforms, batch_size=1, shuffle=False, num_workers=num_workers) # build trainer / evaluator with ignite trainer = create_supervised_trainer(model, model_optimizer, loss, device=device) measure = MultiThresholdMeasures() evaluator = create_supervised_evaluator(model, metrics={ '': measure, 'pix-acc': Accuracy(measure), 'iou': IoU(measure), 'loss': Loss(loss), 'f1': F1score(measure), }, device=device) # initialize state variable for checkpoint state = update_state(model.state_dict(), 0, 0, 0, 0, 0) # make ckpt path ckpt_root = './ckpt/' filename = '{network}_{optimizer}_lr_{lr}_epoch_{epoch}.pth' ckpt_path = os.path.join(ckpt_root, filename) # execution after every training iteration @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(trainer): num_iter = (trainer.state.iteration - 1) % len(train_loader) + 1 if num_iter % 20 == 0: logger.info("Epoch[{}] Iter[{:03d}] Loss: {:.2f}".format( trainer.state.epoch, num_iter, trainer.state.output)) # execution after every training epoch @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(trainer): # evaluate on training set evaluator.run(train_loader) metrics = evaluator.state.metrics logger.info( "Training Results - Epoch: {} Avg-loss: {:.3f}\n Pix-acc: {}\n IOU: {}\n F1: {}\n" .format(trainer.state.epoch, metrics['loss'], str(metrics['pix-acc']), str(metrics['iou']), str(metrics['f1']))) # update state update_state(weight=model.state_dict(), train_loss=metrics['loss'], val_loss=state['val_loss'], val_pix_acc=state['val_pix_acc'], val_iou=state['val_iou'], val_f1=state['val_f1']) # execution after every epoch @trainer.on(Events.EPOCH_COMPLETED) 
    def log_validation_results(trainer):
        # evaluate test (validation) set
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        logger.info(
            "Validation Results - Epoch: {} Avg-loss: {:.3f}\n Pix-acc: {}\n IOU: {}\n F1: {}\n"
            .format(trainer.state.epoch, metrics['loss'], str(metrics['pix-acc']),
                    str(metrics['iou']), str(metrics['f1'])))

        # update scheduler
        lr_scheduler.step(metrics['loss'])

        # update and save state
        update_state(weight=model.state_dict(),
                     train_loss=state['train_loss'],
                     val_loss=metrics['loss'],
                     val_pix_acc=metrics['pix-acc'],
                     val_iou=metrics['iou'],
                     val_f1=metrics['f1'])

        path = ckpt_path.format(network=networks,
                                optimizer=optimizer,
                                lr=lr,
                                epoch=trainer.state.epoch)
        save_ckpt_file(path, state)

    trainer.run(train_loader, max_epochs=epochs)
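
# NOTE: `update_state` and `save_ckpt_file` are used above but not included in this snippet.
# Minimal sketches (assumptions): a module-level `state` dict updated in place, and a thin
# torch.save wrapper, matching how the handlers above read and persist `state`.
import torch

state = {}


def update_state(weight, train_loss, val_loss, val_pix_acc, val_iou, val_f1):
    # Record the latest weights and metrics in the shared checkpoint state.
    state.update(weight=weight, train_loss=train_loss, val_loss=val_loss,
                 val_pix_acc=val_pix_acc, val_iou=val_iou, val_f1=val_f1)
    return state


def save_ckpt_file(path, state):
    # Persist the checkpoint dict to disk.
    torch.save(state, path)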
def log_training_acc(engine):
    metrics = engine.state.metrics
    sim_acc = metrics['sim_acc']
    clsf_acc = metrics['clsf_acc']
    print("Epoch[{}] sim_acc: {:.2f}; clsf_acc {:.2f}".format(
        engine.state.epoch, sim_acc, clsf_acc))


from ignite.engine import create_supervised_evaluator
from ignite.metrics import Loss
from utils import extract_embeddings
from trainer.metrics import SiameseNetSimilarityAccuracy as SimilarityAccuracy

siamese_evaluator = create_supervised_evaluator(
    siamese_net,
    device=device,
    non_blocking=pin_memory,
    metrics={
        # not a good approach
        'accuracy': SimilarityAccuracy(margin, l2_normalize=True),
        'loss': Loss(con_loss_fn)
    })
pbar = ProgressBar()
pbar.attach(siamese_evaluator)

clsf_evaluator = create_supervised_evaluator(
    clsf_net,
    device=device,
    non_blocking=pin_memory,
    metrics={
        'accuracy': Accuracy(),
        'loss': Loss(CrossEntropyLoss())
    })


@engine.on(Events.EPOCH_COMPLETED)
def run_validation(engine):
    # loader_kwargs = {
    #     'pin_memory': True,
    #     'num_workers': 4,
    #     'batch_size': 100,
def run( train_batch_size, val_batch_size, epochs, lr, momentum, log_interval, log_dir, checkpoint_every, resume_from, crash_iteration=1000, ): train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size) model = Net() writer = SummaryWriter(logdir=log_dir) device = "cpu" if torch.cuda.is_available(): device = "cuda" criterion = nn.NLLLoss() optimizer = SGD(model.parameters(), lr=lr, momentum=momentum) lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.5) trainer = create_supervised_trainer(model, optimizer, criterion, device=device) evaluator = create_supervised_evaluator(model, metrics={ "accuracy": Accuracy(), "nll": Loss(criterion) }, device=device) @trainer.on(Events.EPOCH_COMPLETED) def lr_step(engine): lr_scheduler.step() desc = "ITERATION - loss: {:.4f} - lr: {:.4f}" pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0, lr)) if log_interval is None: e = Events.ITERATION_COMPLETED log_interval = 1 else: e = Events.ITERATION_COMPLETED(every=log_interval) @trainer.on(e) def log_training_loss(engine): lr = optimizer.param_groups[0]["lr"] pbar.desc = desc.format(engine.state.output, lr) pbar.update(log_interval) writer.add_scalar("training/loss", engine.state.output, engine.state.iteration) writer.add_scalar("lr", lr, engine.state.iteration) if resume_from is None: @trainer.on(Events.ITERATION_COMPLETED(once=crash_iteration)) def _(engine): raise Exception("STOP at {}".format(engine.state.iteration)) else: @trainer.on(Events.STARTED) def _(engine): pbar.n = engine.state.iteration @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): pbar.refresh() evaluator.run(train_loader) metrics = evaluator.state.metrics avg_accuracy = metrics["accuracy"] avg_nll = metrics["nll"] tqdm.write( "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_accuracy, avg_nll)) writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch) writer.add_scalar("training/avg_accuracy", avg_accuracy, engine.state.epoch) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) metrics = evaluator.state.metrics avg_accuracy = metrics["accuracy"] avg_nll = metrics["nll"] tqdm.write( "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_accuracy, avg_nll)) pbar.n = pbar.last_print_n = 0 writer.add_scalar("valdation/avg_loss", avg_nll, engine.state.epoch) writer.add_scalar("valdation/avg_accuracy", avg_accuracy, engine.state.epoch) objects_to_checkpoint = { "trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler } training_checkpoint = Checkpoint(to_save=objects_to_checkpoint, save_handler=DiskSaver( log_dir, require_empty=False)) trainer.add_event_handler( Events.ITERATION_COMPLETED(every=checkpoint_every), training_checkpoint) if resume_from is not None: tqdm.write("Resume from a checkpoint: {}".format(resume_from)) checkpoint = torch.load(resume_from) Checkpoint.load_objects(to_load=objects_to_checkpoint, checkpoint=checkpoint) try: trainer.run(train_loader, max_epochs=epochs) except Exception as e: import traceback print(traceback.format_exc()) pbar.close() writer.close()
def _train( self, train_data, val_data, test_data, writer, experiment, dry_run: bool = False, ) -> None: use_cuda = torch.cuda.is_available() # Preprocess all datasets. logger.info("Preprocessing datasets...") train_loader = self._preprocess_for_training("train", train_data, use_cuda) val_loader = self._preprocess_for_training("val", val_data, use_cuda) test_loader = self._preprocess_for_training("test", test_data, use_cuda) logger.info("") # Set up model and move it to device. logger.info("Creating model...") self._create_model(self.num_classes) device = torch.device("cuda" if use_cuda else "cpu") logger.info(f" device: {device}") self.model = self.model.to(device) # Set up optimizer and loss. optimizer = self._create_optimizer() loss_func = nn.CrossEntropyLoss() logger.info(f" loss function: cross-entropy") logger.info("") # Dedicate a few images that will be plotted as samples to tensorboard. num_samples_to_plot = self.config.get("num_samples_to_plot", 5) def get_samples(loader): if loader is None: return None, None else: return next( iter(DataLoader(loader.dataset, batch_size=num_samples_to_plot)) ) train_sample_images, train_sample_labels = get_samples(train_loader) val_sample_images, val_sample_labels = get_samples(val_loader) test_sample_images, test_sample_labels = get_samples(test_loader) # Configure trainer and metrics. accumulate_train_metrics = self.config.get("accumulate_train_metrics", True) # We need to transform the output of the trainer and metrics here to accumulate # metrics during training (otherwise, we have to re-evaluate on the complete # train set which takes a long time). By default, the trainer outputs # `loss.item()` and the metrics expect `y_pred, y` (which is what the evaluator # outputs). We are now outputting `y_pred, y, loss` from the trainer and then # slicing off the `loss` before it goes into the metric. # See also the footnote here but note that it's a bit wrong: # https://pytorch.org/ignite/quickstart.html# def trainer_output_transform(x, y, y_pred, loss): return y_pred, y, loss.item() def metrics_output_transform(output): return output[:2] # use only y_pred, y trainer = create_supervised_trainer( self.model, optimizer, loss_func, device=device, output_transform=trainer_output_transform, ) if accumulate_train_metrics: # TODO: Maybe put train_metrics and val_metrics into one dict. train_metrics = { "accuracy": Accuracy(output_transform=metrics_output_transform), "loss": Loss(loss_func, output_transform=metrics_output_transform), # "confusion_matrix": ConfusionMatrix(num_classes), } for name, metric in train_metrics.items(): # Attach metrics to trainer to accumulate them during training. 
metric.attach(trainer, name) val_metrics = { "accuracy": Accuracy(), "loss": Loss(loss_func), # "confusion_matrix": ConfusionMatrix(num_classes), } evaluator = create_supervised_evaluator( self.model, metrics=val_metrics, device=device ) @trainer.on( Events.ITERATION_COMPLETED(every=self.config.get("print_every", 100)) ) def log_batch(trainer): batch = (trainer.state.iteration - 1) % trainer.state.epoch_length + 1 logger.info( f"Epoch {trainer.state.epoch} / {num_epochs}, " f"batch {batch} / {trainer.state.epoch_length}: " f"Loss: {trainer.state.output[2]:.3f}" # f"Loss: {trainer.state.output:.3f}" ) def log_results(name, metrics, epoch): """Log results of an epoch to stdout, tensorboard and comet.""" logger.info( f"{name}: Average loss: {metrics['loss']:.3f}, " f"Average accuracy: {metrics['accuracy']:.3f}" ) experiment.log_metric(f"{name}_loss", metrics["loss"]) experiment.log_metric(f"{name}_accuracy", metrics["accuracy"]) writer.add_scalar(f"{name}_loss", metrics["loss"], epoch) writer.add_scalar(f"{name}_accuracy", metrics["accuracy"], epoch) # TODO: This iterates over complete train set again, maybe accumulate as in the # example in the footnote here: https://pytorch.org/ignite/quickstart.html# @trainer.on(Events.EPOCH_COMPLETED) def log_epoch(trainer): logger.info("") logger.info(f"Epoch {trainer.state.epoch} / {num_epochs} results: ") # Train data. if accumulate_train_metrics: log_results("train", trainer.state.metrics, trainer.state.epoch) logger.info("(train metrics are accumulated during training; " "to re-evaluate on the complete train set after training, " "use config parameter 'accumulate_train_metrics': False)") else: evaluator.run(train_loader) log_results("train", evaluator.state.metrics, trainer.state.epoch) # Val data. if val_loader: evaluator.run(val_loader) log_results("val", evaluator.state.metrics, trainer.state.epoch) # Test data. if test_loader: evaluator.run(test_loader) log_results("test", evaluator.state.metrics, trainer.state.epoch) logger.info("") @trainer.on(Events.EPOCH_COMPLETED) def checkpoint_model(trainer): # TODO: Do not checkpoint at every step. checkpoint_dir = ( self.out_dir / "checkpoints" / f"epoch{trainer.state.epoch}" ) checkpoint_dir.mkdir(parents=True, exist_ok=True) torch.save(self.model, checkpoint_dir / "model.pt") @trainer.on(Events.EPOCH_COMPLETED) def plot_samples(trainer): """Plot a few sample images and probabilites to tensorboard.""" def write_samples_plot(name, sample_images, sample_labels): # TODO: This can be improved by just using the outputs already # calculated in evaluator.state.output in the functions above. # Problem: At least in the train evaluator, the batches are not equal, # so the plotted images will differ from run to run. if sample_images is None: return with torch.no_grad(): sample_output = self.model(sample_images.to(device)) sample_pred = torch.softmax(sample_output, dim=1) visualization.plot_samples( writer, f"{name}-samples", trainer.state.epoch, sample_images.to("cpu").numpy(), sample_labels.to("cpu").numpy(), sample_pred.to("cpu").numpy(), ) write_samples_plot("train", train_sample_images, train_sample_labels) write_samples_plot("val", val_sample_images, val_sample_labels) write_samples_plot("test", test_sample_images, test_sample_labels) # Start training. num_epochs = 1 if dry_run else self.config.get("num_epochs", 5) if dry_run: num_batches = 1 logger.info(f"Training model on device {device}... 
(DRY RUN, only 1 batch)") elif "num_samples" in self.config: # TODO: Make sure batch_size doesn't differ from the value extracted during # preprocessing. batch_size = self.config.get("batch_size", 128) # TODO: This always uses a few more samples than num_samples. Maybe get it # to the correct value. num_batches = int(self.config["num_samples"] / batch_size) + 1 logger.info( f"Training model on device {device}... (using " f"{self.config['num_samples']} of {len(train_loader.dataset)} samples)" ) else: num_batches = None # all batches logger.info(f"Training model on device {device}...") logger.info( "(if this takes too long, train on less data with the config " "parameter 'num_samples')" ) logger.info("(show more steps by setting the config parameter 'print_every')") logger.info("") trainer.run(train_loader, max_epochs=num_epochs, epoch_length=num_batches) logger.info("Training finished!") # Save the trained model. torch.save(self.model, self.out_dir / "model.pt")
def train(): config_file = "configs/train_daily_dialog_emotion_action_topic_config.json" config = Config.from_json_file(config_file) # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes logging.basicConfig(level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN) logger.warning("Running process %d", config.local_rank) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat(config)) # Initialize distributed training if needed config.distributed = (config.local_rank != -1) if config.distributed: torch.cuda.set_device(config.local_rank) config.device = torch.device("cuda", config.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning") tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) model_class = GPT2DoubleHeadsModel if "gpt2" in config.model_checkpoint else OpenAIGPTDoubleHeadsModel model = model_class.from_pretrained(config.model_checkpoint) tokenizer.set_special_tokens(SPECIAL_TOKENS) model.set_num_special_tokens(len(SPECIAL_TOKENS)) model.to(config.device) optimizer = OpenAIAdam(model.parameters(), lr=config.lr) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if config.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16) if config.distributed: model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer) # Training function and trainer def update(engine, batch): model.train() input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids = tuple(input_tensor.to(config.device) for input_tensor in batch) lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids) loss = (lm_loss * config.lm_coef + mc_loss * config.mc_coef) / config.gradient_accumulation_steps if config.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm) if engine.state.iteration % config.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): batch = tuple(input_tensor.to(config.device) for input_tensor in batch) input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids = batch #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids, token_emotion_ids=token_emotion_ids, token_action_ids=token_action_ids) lm_logits, mc_logits = model_outputs[0], model_outputs[1] # So we can also use GPT2 outputs lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, 
lm_logits.size(-1)) lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels) evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if config.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) if config.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if config.distributed: trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Linearly decrease the learning rate from lr to zero scheduler = PiecewiseLinear(optimizer, "lr", [(0, config.lr), (config.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])), "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))} metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], config), "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)}) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train if config.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss"]) evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics))) tb_logger = TensorboardLogger(log_dir=config.log_dir) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3) trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) # "getattr" take care of distributed encapsulation torch.save(config, tb_logger.writer.log_dir + '/model_training_args.bin') getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME)) tokenizer.save_vocabulary(tb_logger.writer.log_dir) # Run the training trainer.run(train_loader, max_epochs=config.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) if config.local_rank in [-1, 0] and config.n_epochs > 0: os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)) # TODO: PR in ignite to have better 
access to saved file paths (cleaner) tb_logger.close()
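# The metrics above wrap `average_distributed_scalar`, which is defined outside this snippet.
# A minimal sketch of such a helper, assuming the same `config.local_rank` / `config.device`
# convention used above: all-reduce the scalar across processes and divide by the world size.
import torch
import torch.distributed as dist


def average_distributed_scalar(scalar, config):
    """Average a scalar over all distributed processes; no-op in single-process runs (sketch)."""
    if config.local_rank == -1:  # not running distributed
        return scalar
    scalar_t = torch.tensor(scalar, dtype=torch.float, device=config.device) / dist.get_world_size()
    dist.all_reduce(scalar_t, op=dist.ReduceOp.SUM)
    return scalar_t.item()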
def run(config, plx_experiment): set_seed(config['seed']) device = "cuda" batch_size = config['batch_size'] train_transforms = [DynamicCrop(32, 32), FlipLR()] cutout_size = config['cutout_size'] if cutout_size is not None: train_transforms.append(DynamicCutout(cutout_size, cutout_size)) train_loader, test_loader = get_fast_train_test_loaders( path=config["data_path"], batch_size=batch_size, num_workers=config['num_workers'], device=device, train_transforms=train_transforms) bn_kwargs = config['bn_kwargs'] conv_kwargs = config['conv_kwargs'] model = config["model"](conv_kwargs=conv_kwargs, bn_kwargs=bn_kwargs, final_weight=config['final_weight']) model = model.to(device) model = model.half() model_name = model.__class__.__name__ criterion = nn.CrossEntropyLoss(reduction='sum').to(device) criterion = criterion.half() eval_criterion = criterion if config["enable_mixup"]: criterion = MixupCriterion(criterion) weight_decay = config['weight_decay'] if not config['use_adamw']: opt_kwargs = [("lr", 0.0), ("momentum", config['momentum']), ("weight_decay", weight_decay), ("nesterov", True)] optimizer_cls = optim.SGD else: opt_kwargs = [ ("lr", 0.0), ("betas", (0.9, 0.999)), ("eps", 1e-08), ] optimizer_cls = optim.Adam optimizer = optimizer_cls([ # conv + bn dict([("params", model.prep.parameters())] + opt_kwargs), # conv + bn dict([("params", model.layer1[0].parameters())] + opt_kwargs), # identity residual block dict([("params", model.layer1[-1].conv1.parameters())] + opt_kwargs), dict([("params", model.layer1[-1].conv2.parameters())] + opt_kwargs), # conv + bn dict([("params", model.layer2.parameters())] + opt_kwargs), # conv + bn dict([("params", model.layer3[0].parameters())] + opt_kwargs), # identity residual block dict([("params", model.layer3[-1].conv1.parameters())] + opt_kwargs), dict([("params", model.layer3[-1].conv2.parameters())] + opt_kwargs), # linear dict([("params", model.classifier.parameters())] + opt_kwargs), ]) num_iterations_per_epoch = len(train_loader) num_iterations = num_iterations_per_epoch * config['num_epochs'] layerwise_milestones_lr_values = [] for i in range(len(optimizer.param_groups)): key = "lr_param_group_{}".format(i) assert key in config, "{} not in config".format(key) milestones_values = config[key] layerwise_milestones_lr_values.append([ (int(m * num_iterations_per_epoch), v / batch_size) for m, v in milestones_values ]) lr_scheduler = get_layerwise_lr_scheduler(optimizer, layerwise_milestones_lr_values) momentum_scheduler = None if config["momentum_scheduling"] is not None: milestones_values = config["momentum_scheduling"] layerwise_milestones_mtm_values = [] for i in range(len(optimizer.param_groups)): layerwise_milestones_mtm_values.append([ (int(m * num_iterations_per_epoch), v) for m, v in milestones_values ]) momentum_scheduler = get_layerwise_scheduler( optimizer, param_name="momentum", milestones_values=layerwise_milestones_mtm_values) def _prepare_batch_fp16(batch, device, non_blocking): x, y = batch return (convert_tensor(x, device=device, non_blocking=non_blocking).half(), convert_tensor(y, device=device, non_blocking=non_blocking).long()) def process_function(engine, batch): x, y = _prepare_batch_fp16(batch, device=device, non_blocking=True) if config['enable_mixup']: x, y = mixup_data(x, y, config['mixup_alpha'], config['mixup_proba']) optimizer.zero_grad() y_pred = model(x) loss = criterion(y_pred, y) loss.backward() if config["clip_gradients"] is not None: clip_grad_norm_(model.parameters(), config["clip_gradients"]) if config['use_adamw']: for 
group in optimizer.param_groups: for param in group['params']: param.data.add_(-weight_decay / batch_size * group['lr']) optimizer.step() loss = loss.item() return loss trainer = Engine(process_function) metrics = { "accuracy": Accuracy(), "loss": Loss(eval_criterion) / len(test_loader) } evaluator = create_supervised_evaluator(model, metrics, prepare_batch=_prepare_batch_fp16, device=device, non_blocking=True) train_evaluator = create_supervised_evaluator( model, metrics, prepare_batch=_prepare_batch_fp16, device=device, non_blocking=True) total_timer = Timer(average=False) train_timer = Timer(average=False) test_timer = Timer(average=False) table_logger = TableLogger() if config["use_tb_logger"]: path = "experiments/tb_logs" if "TB_LOGGER_PATH" not in os.environ else os.environ[ "TB_LOGGER_PATH"] tb_logger = SummaryWriter(log_dir=path) test_timer.attach(evaluator, start=Events.EPOCH_STARTED) @trainer.on(Events.STARTED) def on_training_started(engine): print("Warming up cudnn on random inputs") for _ in range(5): for size in [batch_size, len(test_loader.dataset) % batch_size]: warmup_cudnn(model, criterion, size, config) total_timer.reset() @trainer.on(Events.EPOCH_STARTED) def on_epoch_started(engine): model.train() train_timer.reset() # Warm-up on small images if config['warmup_on_small_images']: if engine.state.epoch < config['warmup_duration']: train_loader.dataset.transforms[0].h = 20 train_loader.dataset.transforms[0].w = 20 elif engine.state.epoch == config['warmup_duration']: train_loader.dataset.transforms[0].h = 32 train_loader.dataset.transforms[0].w = 32 train_loader.dataset.set_random_choices() if config['reduce_cutout']: # after 15 epoch remove cutout augmentation if 14 <= engine.state.epoch < 16: train_loader.dataset.transforms[-1].h -= 1 train_loader.dataset.transforms[-1].w -= 1 elif engine.state.epoch == 16: train_loader.dataset.transforms.pop() if config['enable_mixup'] and config[ 'mixup_max_epochs'] == engine.state.epoch - 1: config['mixup_proba'] = 0.0 if config["use_tb_logger"]: @trainer.on(Events.ITERATION_COMPLETED) def on_iteration_completed(engine): # log learning rate param_name = "lr" if len(optimizer.param_groups) == 1: param = float(optimizer.param_groups[0][param_name]) tb_logger.add_scalar(param_name, param * batch_size, engine.state.iteration) else: for i, param_group in enumerate(optimizer.param_groups): param = float(param_group[param_name]) tb_logger.add_scalar( "{}/{}/group_{}".format(param_name, model_name, i), param * batch_size, engine.state.iteration) # log training loss tb_logger.add_scalar("training/loss_vs_iterations", engine.state.output / batch_size, engine.state.iteration) @trainer.on(Events.EPOCH_COMPLETED) def on_epoch_completed(engine): trainer.state.train_time = train_timer.value() if config["use_tb_logger"]: # Log |w|^2 and gradients for i, p in enumerate(model.parameters()): tb_logger.add_scalar( "w2/{}/{}_{}".format(model_name, i, list(p.data.shape)), torch.norm(p.data), engine.state.epoch) tb_logger.add_scalar( "mean_grad/{}/{}_{}".format(model_name, i, list(p.grad.shape)), torch.mean(p.grad), engine.state.epoch) for i, p in enumerate(model.parameters()): plx_experiment.log_metrics( step=engine.state.epoch, **{ "w2/{}/{}_{}".format(model_name, i, list(p.data.shape)): torch.norm(p.data).item() }) evaluator.run(test_loader) trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler) if momentum_scheduler is not None: trainer.add_event_handler(Events.ITERATION_STARTED, momentum_scheduler) @evaluator.on(Events.COMPLETED) def 
log_results(engine): evaluator.state.test_time = test_timer.value() metrics = evaluator.state.metrics output = [("epoch", trainer.state.epoch)] output += [(key, trainer.state.param_history[key][-1][0] * batch_size) for key in trainer.state.param_history if "lr" in key] output += [(key, trainer.state.param_history[key][-1][0]) for key in trainer.state.param_history if "lr" not in key] output += [("train time", trainer.state.train_time), ("train loss", trainer.state.output / batch_size), ("test time", evaluator.state.test_time), ("test loss", metrics['loss'] / batch_size), ("test acc", metrics['accuracy']), ("total time", total_timer.value())] output = OrderedDict(output) table_logger.append(output) plx_experiment.log_metrics(step=trainer.state.epoch, **output) if config["use_tb_logger"]: tb_logger.add_scalar("training/total_time", total_timer.value(), trainer.state.epoch) tb_logger.add_scalar("test/loss", metrics['loss'] / batch_size, trainer.state.epoch) tb_logger.add_scalar("test/accuracy", metrics['accuracy'], trainer.state.epoch) @trainer.on(Events.COMPLETED) def on_training_completed(engine): train_evaluator.run(train_loader) metrics = train_evaluator.state.metrics if config["use_tb_logger"]: tb_logger.add_scalar("training/loss", metrics['loss'] / batch_size, 0) tb_logger.add_scalar("training/loss", metrics['loss'] / batch_size, engine.state.epoch) tb_logger.add_scalar("training/accuracy", metrics['accuracy'], 0) tb_logger.add_scalar("training/accuracy", metrics['accuracy'], engine.state.epoch) output = { "train acc": metrics['accuracy'], "train loss": metrics['loss'] / batch_size } plx_experiment.log_metrics(step=engine.state.epoch, **output) trainer.run(train_loader, max_epochs=config['num_epochs']) if config["use_tb_logger"]: tb_logger.close()
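# Neither get_layerwise_lr_scheduler nor get_layerwise_scheduler is defined in this snippet.
# A plausible sketch of both, built from ignite's per-param-group schedulers (the real helpers
# may differ): one PiecewiseLinear schedule per optimizer param group, grouped together so the
# result can be attached as a single ITERATION_STARTED handler, as done above.
from ignite.contrib.handlers import ParamGroupScheduler, PiecewiseLinear


def get_layerwise_scheduler(optimizer, param_name, milestones_values):
    schedulers = [
        PiecewiseLinear(optimizer, param_name=param_name,
                        milestones_values=milestones_values[i],
                        param_group_index=i)
        for i in range(len(optimizer.param_groups))
    ]
    names = ["{}_group_{}".format(param_name, i) for i in range(len(schedulers))]
    return ParamGroupScheduler(schedulers, names=names)


def get_layerwise_lr_scheduler(optimizer, layerwise_milestones_lr_values):
    return get_layerwise_scheduler(optimizer, "lr", layerwise_milestones_lr_values)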
out = self.fc(out) return out model = EfficientNetTwoInputs() criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) metrics = { 'loss': Loss(criterion), 'accuracy': Accuracy(), } trainer = create_supervised_trainer(model, optimizer, criterion, device=device) val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device) # EarlyStopping # handler = EarlyStopping(patience=30, score_function=lambda engine: engine.state.metrics['accuracy'], trainer=trainer) # val_evaluator.add_event_handler(Events.COMPLETED, handler)
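# The notebook cell above builds the trainer and evaluator, but the epoch-end validation hook
# and the actual training run are not shown. A minimal sketch of the usual wiring, assuming
# train_loader, val_loader and epochs are defined in earlier (omitted) cells.
@trainer.on(Events.EPOCH_COMPLETED)
def run_validation(engine):
    val_evaluator.run(val_loader)
    metrics = val_evaluator.state.metrics
    print(f"Epoch {engine.state.epoch}: "
          f"val loss {metrics['loss']:.4f}, val accuracy {metrics['accuracy']:.4f}")


trainer.run(train_loader, max_epochs=epochs)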
def run(train_batch_size, test_batch_size, epochs, lr, log_interval, log_dir, no_cuda, sub_spectrogram_size, sub_spectrogram_mel_hop, n_mel_bins, seed, root_dir, train_dir, eval_dir): """ Model runner Parameters ---------- train_batch_size : int Size of the training batch. Default: 16 test_batch_size : int Size of the testing batch. Default: 16 epochs : int Number of training epochs. Default: 200 lr : float Learning rate for the ADAM optimizer. Default: 0.001 log_interval : int Interval for logging data. Default: 10 log_dir : str Directory to save the logs no_cuda : bool If True, do not use CUDA even when it is available. Default: False sub_spectrogram_size : int Size of the SubSpectrogram. Default 20 sub_spectrogram_mel_hop : int Mel-bin hop size of the SubSpectrogram. Default 10 n_mel_bins : int Number of mel-bins of the Spectrogram extracted. Default: 40. seed : int Torch random seed value, for reproducible results. Default: 1 root_dir : str Directory of the folder which contains the dataset (has 'audio' and 'evaluation_setup' folders inside) train_dir : str Set as default: 'evaluation_setup/train_fold1.txt' eval_dir : str Set as default: 'evaluation_setup/evaluate_fold1.txt' """ # check if possible to use CUDA use_cuda = not no_cuda and torch.cuda.is_available() # set seed torch.manual_seed(seed) # Map to GPU device = torch.device("cuda" if use_cuda else "cpu") # Load the data loaders train_loader, val_loader = get_data_loaders(train_batch_size, test_batch_size, sub_spectrogram_size, sub_spectrogram_mel_hop, n_mel_bins, use_cuda, root_dir, train_dir, eval_dir) # Get the model model = SubSpectralNet(sub_spectrogram_size, sub_spectrogram_mel_hop, n_mel_bins, use_cuda).to(device) # Init the TensorBoard summary writer writer = create_summary_writer(model, train_loader, log_dir) # Init the optimizer optimizer = optim.Adam(model.parameters(), lr=lr) # Use GPU if possible if device: model.to(device) def update_model(engine, batch): """Training update: compute one NLL loss per sub-spectrogram classifier and backprop their sum. """ model.train() optimizer.zero_grad() inputs, label = prepare_batch(batch, device=device) output = model(inputs) losses = [] for ite in range(output.shape[1]): losses.append(F.nll_loss(output[:, ite, :], label)) loss = sum(losses) loss.backward() optimizer.step() return losses, output # get the trainer module trainer = Engine(update_model) def evaluate(engine, batch): """Evaluation step: compute the per-classifier outputs without tracking gradients. """ model.eval() with torch.no_grad(): inputs, label = prepare_batch(batch, device=device) output = model(inputs) losses = [] correct = [] for ite in range(output.shape[1]): losses.append( F.nll_loss(output[:, ite, :], label, reduction='sum').item()) return losses, output, label # get the evaluator module evaluator = Engine(evaluate) # define output transforms for multiple outputs.
def output_transform1(output): # `output` variable is returned by above `process_function` losses, correct, label = output return correct[:, 0, :], label metric = Accuracy(output_transform=output_transform1) metric.attach(evaluator, "acc_highband") metric = Loss(F.nll_loss, output_transform=output_transform1) metric.attach(evaluator, "loss_highband") def output_transform2(output): # `output` variable is returned by above `process_function` losses, correct, label = output return correct[:, 1, :], label metric = Accuracy(output_transform=output_transform2) metric.attach(evaluator, "acc_midband") metric = Loss(F.nll_loss, output_transform=output_transform2) metric.attach(evaluator, "loss_midband") def output_transform3(output): # `output` variable is returned by above `process_function` losses, correct, label = output return correct[:, 2, :], label metric = Accuracy(output_transform=output_transform3) metric.attach(evaluator, "acc_lowband") metric = Loss(F.nll_loss, output_transform=output_transform3) metric.attach(evaluator, "loss_lowband") def output_transform(output): # `output` variable is returned by above `process_function` losses, correct, label = output return correct[:, 3, :], label metric = Accuracy(output_transform=output_transform) metric.attach(evaluator, "acc_globalclassifier") metric = Loss(F.nll_loss, output_transform=output_transform) metric.attach(evaluator, "loss_globalclassifier") # Log the events in Ignite: EVERY ITERATION @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): iter = (engine.state.iteration - 1) % len(train_loader) + 1 if iter % log_interval == 0: losses, output = engine.state.output epoch = engine.state.epoch print( 'Train Epoch: {} [{}/{}]\tLosses: {:.6f} (Top Band), {:.6f} (Mid Band), {:.6f} (Low Band), {:.6f} (Global Classifier)' .format(epoch, iter, len(train_loader), losses[0].item(), losses[1].item(), losses[2].item(), losses[3].item())) # TensorBoard Logs writer.add_scalar("training/loss_topband_itr", losses[0].item(), engine.state.iteration) writer.add_scalar("training/loss_midband_itr", losses[1].item(), engine.state.iteration) writer.add_scalar("training/loss_lowband_itr", losses[2].item(), engine.state.iteration) writer.add_scalar("training/loss_global_itr", losses[3].item(), engine.state.iteration) # Log the events in Ignite: Test the training data on EVERY EPOCH @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): evaluator.run(train_loader) print( "Training Results - Epoch: {} Global accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, evaluator.state.metrics['acc_globalclassifier'], evaluator.state.metrics['loss_globalclassifier'])) # TensorBoard Logs writer.add_scalar("training/global_loss", evaluator.state.metrics['loss_globalclassifier'], engine.state.epoch) writer.add_scalar("training/lowband_loss", evaluator.state.metrics['loss_lowband'], engine.state.epoch) writer.add_scalar("training/midband_loss", evaluator.state.metrics['loss_midband'], engine.state.epoch) writer.add_scalar("training/highband_loss", evaluator.state.metrics['loss_highband'], engine.state.epoch) writer.add_scalar("training/global_acc", evaluator.state.metrics['acc_globalclassifier'], engine.state.epoch) writer.add_scalar("training/lowband_acc", evaluator.state.metrics['acc_lowband'], engine.state.epoch) writer.add_scalar("training/midband_acc", evaluator.state.metrics['acc_midband'], engine.state.epoch) writer.add_scalar("training/highband_acc", evaluator.state.metrics['acc_highband'], engine.state.epoch) # Log the events 
in Ignite: Test the validation data on EVERY EPOCH @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) print( "Validation Results - Epoch: {} Global accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, evaluator.state.metrics['acc_globalclassifier'], evaluator.state.metrics['loss_globalclassifier'])) # TensorBoard Logs writer.add_scalar("validation/global_loss", evaluator.state.metrics['loss_globalclassifier'], engine.state.epoch) writer.add_scalar("validation/lowband_loss", evaluator.state.metrics['loss_lowband'], engine.state.epoch) writer.add_scalar("validation/midband_loss", evaluator.state.metrics['loss_midband'], engine.state.epoch) writer.add_scalar("validation/highband_loss", evaluator.state.metrics['loss_highband'], engine.state.epoch) writer.add_scalar("validation/global_acc", evaluator.state.metrics['acc_globalclassifier'], engine.state.epoch) writer.add_scalar("validation/lowband_acc", evaluator.state.metrics['acc_lowband'], engine.state.epoch) writer.add_scalar("validation/midband_acc", evaluator.state.metrics['acc_midband'], engine.state.epoch) writer.add_scalar("validation/highband_acc", evaluator.state.metrics['acc_highband'], engine.state.epoch) # kick everything off trainer.run(train_loader, max_epochs=epochs) # close the writer writer.close() # return the model return model
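# prepare_batch is called in the update and evaluation functions above but is not defined in
# this snippet. A minimal sketch that mirrors ignite's default batch preparation (move the
# (inputs, label) pair to the target device); the real helper may differ.
from ignite.utils import convert_tensor


def prepare_batch(batch, device=None, non_blocking=False):
    x, y = batch
    return (convert_tensor(x, device=device, non_blocking=non_blocking),
            convert_tensor(y, device=device, non_blocking=non_blocking))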
def train(): parser = ArgumentParser() parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument("--model_checkpoint", type=str, default="gpt2", help="Path, url or short name of the model") parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training") parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history") parser.add_argument("--train_batch_size", type=int, default=16, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation") # parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps") parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Accumulate gradients on several steps") # parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate") parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate") parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient") parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient") parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm") parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs") parser.add_argument("--eval_before_start", action='store_true', help="If true start with a first evaluation before training") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--fp16", type=str, default="", help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)") parser.add_argument("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") parser.add_argument( "--init_model", default="model/pytorch_kogpt2_676e9bcfa7.params", type=str, help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.", ) args = parser.parse_args() # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log main process only, logger.warning => log all processes logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning("Running process %d", args.local_rank) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat(args)) # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info("Prepare tokenizer, pretrained model and optimizer.") config = GPT2Config(vocab_size=50000) model = GPT2DoubleHeadsModel(config) if args.init_model: print("Load model from ", args.init_model) model.load_state_dict(torch.load(args.init_model), strict=False) model.to(args.device) add_special_tokens_(model, tokenizer) optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if args.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer) # Training function and trainer def update(engine, batch): model.train() batch = tuple(input_tensor.to(args.device) for input_tensor in batch) input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch (lm_loss), (mc_loss), *_ = model( input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, mc_labels=mc_labels, lm_labels=lm_labels ) loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): batch = tuple(input_tensor.to(args.device) for input_tensor in batch) input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) # if we dont send labels to model, it doesnt return losses lm_logits, mc_logits, *_ = model( input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, ) lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1)) lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels) evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # 
Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Linearly decrease the learning rate from lr to zero scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])), "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))} metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args), "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)}) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss"]) evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics))) log_dir = make_logdir(args.init_model) tb_logger = TensorboardLogger(log_dir) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3) trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) # "getattr" takes care of distributed encapsulation torch.save(args, log_dir + '/model_training_args.bin') getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME)) # tokenizer.save_pretrained(log_dir) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) if args.local_rank in [-1, 0] and args.n_epochs > 0: os.rename(os.path.join(log_dir, checkpoint_handler._saved[-1][1]), os.path.join(log_dir, WEIGHTS_NAME)) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close()
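# Note: in this variant, `tokenizer` is passed to add_special_tokens_ and get_data_loaders but
# is never constructed inside the function; it is assumed to be created elsewhere for the
# KoGPT2 checkpoint. For reference, a minimal sketch of what add_special_tokens_ typically does
# with the transformers API (the concrete special-token strings here are illustrative only):
# register the extra tokens and resize the model embeddings accordingly.
ATTR_TO_SPECIAL_TOKEN = {"bos_token": "<bos>", "eos_token": "<eos>", "pad_token": "<pad>",
                         "additional_special_tokens": ["<speaker1>", "<speaker2>"]}


def add_special_tokens_(model, tokenizer):
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
    if num_added_tokens > 0:
        model.resize_token_embeddings(new_num_tokens=len(tokenizer))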
def train(epochs: int, model: nn.Module, train_loader: DataLoader, valid_loader: DataLoader, criterion: Callable, device: str, lr: float, patience: int, lr_decay: float, lr_scheduler: str, lr_scheduler_kwargs: Dict[str, Any]): model.to(torch.device(device)) optimizer = optim.Adam( [param for param in model.parameters() if param.requires_grad], lr=lr) trainer = create_supervised_trainer(model, optimizer, criterion, device=device) scheduler = LRScheduler( getattr(optim.lr_scheduler, lr_scheduler)(optimizer, **lr_scheduler_kwargs)) trainer.add_event_handler(Events.ITERATION_COMPLETED, scheduler) pbar = ProgressBar(False) pbar.attach(trainer) train_evaluator = create_supervised_evaluator( model, metrics={ 'ACC': Accuracy(discreted_output_transform), 'BCE': Loss(criterion), 'AP': AveragePrecision(probability_output_transform) }, device=device) valid_evaluator = create_supervised_evaluator( model, metrics={ 'ACC': Accuracy(discreted_output_transform), 'BCE': Loss(criterion), 'AP': AveragePrecision(probability_output_transform) }, device=device) history = { col: list() for col in [ 'epoch', 'elapsed time', 'iterations', 'lr', 'train BCE', 'valid BCE', 'train ACC', 'valid ACC', 'train AP', 'valid AP' ] } @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): train_evaluator.run(train_loader) history['train BCE'] += [train_evaluator.state.metrics['BCE']] history['train ACC'] += [train_evaluator.state.metrics['ACC']] history['train AP'] += [train_evaluator.state.metrics['AP']] @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): valid_evaluator.run(valid_loader) history['epoch'] += [valid_evaluator.state.epoch] history['iterations'] += [valid_evaluator.state.epoch_length] history['elapsed time'] += [ 0 if len(history['elapsed time']) == 0 else history['elapsed time'][-1] + valid_evaluator.state.times['COMPLETED'] ] history['lr'] += [scheduler.get_param()] history['valid BCE'] += [valid_evaluator.state.metrics['BCE']] history['valid ACC'] += [valid_evaluator.state.metrics['ACC']] history['valid AP'] += [valid_evaluator.state.metrics['AP']] @trainer.on(Events.EPOCH_COMPLETED) def log_progress_bar(engine): pbar.log_message( f"train BCE: {history['train BCE'][-1]:.2f} " \ + f"train ACC: {history['train ACC'][-1]:.2f} " \ + f"train AP: {history['train AP'][-1]:.2f} " \ + f"valid BCE: {history['valid BCE'][-1]:.2f} " \ + f"valid ACC: {history['valid ACC'][-1]:.2f} " \ + f"valid AP: {history['valid AP'][-1]:.2f}" ) # Early stopping handler = EarlyStopping(patience=patience, score_function=score_function, trainer=trainer) valid_evaluator.add_event_handler(Events.EPOCH_COMPLETED, handler) trainer.run(train_loader, max_epochs=epochs) return pd.DataFrame(history)
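# The output transforms and the early-stopping score function referenced above are defined
# outside this snippet. Minimal sketches, assuming the model emits one logit per example (a
# binary classifier trained with BCE) and that early stopping should maximize validation
# average precision; the real implementations may differ.
import torch


def probability_output_transform(output):
    y_pred, y = output
    return torch.sigmoid(y_pred), y


def discreted_output_transform(output):
    y_pred, y = output
    return (torch.sigmoid(y_pred) > 0.5).long(), y


def score_function(engine):
    # EarlyStopping stops when this value no longer improves
    return engine.state.metrics["AP"]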
def run(args): train_loader, val_loader = get_data_loaders(args.dataset_dir, args.batch_size, args.val_batch_size, args.num_workers) if args.seed is not None: torch.manual_seed(args.seed) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') num_classes = KITTI.num_classes() model = LiLaNet(num_classes) device_count = torch.cuda.device_count() if device_count > 1: print("Using %d GPU(s)" % device_count) model = nn.DataParallel(model) args.batch_size = device_count * args.batch_size args.val_batch_size = device_count * args.val_batch_size model = model.to(device) criterion = nn.CrossEntropyLoss(weight=KITTI.class_weights()).to(device) optimizer = optim.Adam(model.parameters(), lr=args.lr) if args.resume: if os.path.isfile(args.resume): print("Loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) print("Loaded checkpoint '{}' (Epoch {})".format( args.resume, checkpoint['epoch'])) else: print("No checkpoint found at '{}'".format(args.resume)) def _prepare_batch(batch, non_blocking=True): distance, reflectivity, target = batch return (convert_tensor(distance, device=device, non_blocking=non_blocking), convert_tensor(reflectivity, device=device, non_blocking=non_blocking), convert_tensor(target, device=device, non_blocking=non_blocking)) def _update(engine, batch): model.train() optimizer.zero_grad() distance, reflectivity, target = _prepare_batch(batch) pred = model(distance, reflectivity) loss = criterion(pred, target) loss.backward() optimizer.step() return loss.item() trainer = Engine(_update) # attach running average metrics RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') # attach progress bar pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=['loss']) def _inference(engine, batch): model.eval() with torch.no_grad(): distance, reflectivity, target = _prepare_batch(batch) pred = model(distance, reflectivity) return pred, target evaluator = Engine(_inference) cm = ConfusionMatrix(num_classes) IoU(cm, ignore_index=0).attach(evaluator, 'IoU') Loss(criterion).attach(evaluator, 'loss') pbar2 = ProgressBar(persist=True, desc='Eval Epoch') pbar2.attach(evaluator) def _global_step_transform(engine, event_name): if trainer.state is not None: return trainer.state.iteration else: return 1 tb_logger = TensorboardLogger(args.log_dir) tb_logger.attach(trainer, log_handler=OutputHandler(tag='training', metric_names=['loss']), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(evaluator, log_handler=OutputHandler( tag='validation', metric_names=['loss', 'IoU'], global_step_transform=_global_step_transform), event_name=Events.EPOCH_COMPLETED) @trainer.on(Events.STARTED) def initialize(engine): engine.state.exception_raised = False if args.resume: engine.state.epoch = args.start_epoch @evaluator.on(Events.EPOCH_COMPLETED) def save_checkpoint(engine): epoch = trainer.state.epoch if trainer.state is not None else 1 iou = engine.state.metrics['IoU'] * 100.0 mean_iou = iou.mean() name = 'epoch{}_mIoU={:.1f}.pth'.format(epoch, mean_iou) file = { 'model': model.state_dict(), 'epoch': epoch, 'optimizer': optimizer.state_dict(), 'args': args } save(file, args.output_dir, 'checkpoint_{}'.format(name)) save(model.state_dict(), args.output_dir, 'model_{}'.format(name)) @trainer.on(Events.EPOCH_COMPLETED) def run_validation(engine): pbar.log_message("Start Validation - Epoch: [{}/{}]".format( 
engine.state.epoch, engine.state.max_epochs)) evaluator.run(val_loader) metrics = evaluator.state.metrics loss = metrics['loss'] iou = metrics['IoU'] * 100.0 mean_iou = iou.mean() iou_text = ', '.join([ '{}: {:.1f}'.format(KITTI.classes[i + 1].name, v) for i, v in enumerate(iou.tolist()) ]) pbar.log_message( "Validation results - Epoch: [{}/{}]: Loss: {:.2e}\n IoU: {}\n mIoU: {:.1f}" .format(engine.state.epoch, engine.state.max_epochs, loss, iou_text, mean_iou)) @trainer.on(Events.EXCEPTION_RAISED) def handle_exception(engine, e): engine.state.exception_raised = True if isinstance(e, KeyboardInterrupt) and (engine.state.iteration > 1): engine.terminate() warnings.warn("KeyboardInterrupt caught. Exiting gracefully.") name = 'epoch{}_exception.pth'.format(trainer.state.epoch) file = { 'model': model.state_dict(), 'epoch': trainer.state.epoch, 'optimizer': optimizer.state_dict() } save(file, args.output_dir, 'checkpoint_{}'.format(name)) save(model.state_dict(), args.output_dir, 'model_{}'.format(name)) else: raise e if args.eval_on_start: print("Start validation") evaluator.run(val_loader, max_epochs=1) print("Start training") trainer.run(train_loader, max_epochs=args.epochs) tb_logger.close()
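# The save() helper used in the checkpoint handlers above is not part of this snippet. A
# minimal sketch, assuming it simply serializes the given object under output_dir with
# torch.save; the real helper may do more (e.g. symlinking a "latest" checkpoint).
import os
import torch


def save(obj, output_dir, name):
    os.makedirs(output_dir, exist_ok=True)
    torch.save(obj, os.path.join(output_dir, name))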
def run(train_batch_size, val_batch_size, epochs, lr, momentum): train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size) model = Net() device = "cpu" if torch.cuda.is_available(): device = "cuda" model.to(device) # Move model before creating optimizer optimizer = SGD(model.parameters(), lr=lr, momentum=momentum) criterion = nn.CrossEntropyLoss() trainer = create_supervised_trainer(model, optimizer, criterion, device=device) trainer.logger = setup_logger("Trainer") metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)} train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device) train_evaluator.logger = setup_logger("Train Evaluator") validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device) validation_evaluator.logger = setup_logger("Val Evaluator") @trainer.on(Events.EPOCH_COMPLETED) def compute_metrics(engine): train_evaluator.run(train_loader) validation_evaluator.run(val_loader) clearml_logger = ClearMLLogger(project_name="examples", task_name="ignite") clearml_logger.attach_output_handler( trainer, event_name=Events.ITERATION_COMPLETED(every=100), tag="training", output_transform=lambda loss: {"batchloss": loss}, ) for tag, evaluator in [("training metrics", train_evaluator), ("validation metrics", validation_evaluator)]: clearml_logger.attach_output_handler( evaluator, event_name=Events.EPOCH_COMPLETED, tag=tag, metric_names=["loss", "accuracy"], global_step_transform=global_step_from_engine(trainer), ) clearml_logger.attach_opt_params_handler( trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer) clearml_logger.attach( trainer, log_handler=WeightsScalarHandler(model, whitelist=["fc1"]), event_name=Events.ITERATION_COMPLETED(every=100), ) def is_conv(n, _): return "conv" in n clearml_logger.attach( trainer, log_handler=WeightsHistHandler(model, whitelist=is_conv), event_name=Events.ITERATION_COMPLETED(every=100), ) clearml_logger.attach(trainer, log_handler=GradsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100)) clearml_logger.attach( trainer, log_handler=GradsHistHandler(model, whitelist=["fc2.weight"]), event_name=Events.ITERATION_COMPLETED(every=100), ) handler = Checkpoint( {"model": model}, ClearMLSaver(), n_saved=1, score_function=lambda e: e.state.metrics["accuracy"], score_name="val_acc", filename_prefix="best", global_step_transform=global_step_from_engine(trainer), ) validation_evaluator.add_event_handler(Events.EPOCH_COMPLETED, handler) # kick everything off trainer.run(train_loader, max_epochs=epochs) clearml_logger.close()
#test_loader = DataLoader(dataset=test_set, batch_size=batch_size, shuffle=True) # Initialize the GPU device object gpu = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Use a pretrained ResNet model for transfer learning model = torchvision.models.resnet50(pretrained=True) model.to(gpu) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=learning_rate) trainer = create_supervised_trainer(model, optimizer, criterion, device=gpu) metrics = { "accuracy": Accuracy(), "loss": Loss(criterion) } train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=gpu) val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=gpu) training_history = {"accuracy":[], "loss":[]} validation_history = {"accuracy":[], "loss":[]} last_epoch = [] # RunningAverage metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") # EarlyStopping callback handler = EarlyStopping(patience=10, score_function=score_function, trainer=trainer) val_evaluator.add_event_handler(Events.COMPLETED, handler) # Custom handler functions (sketched below)
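# The history dicts above (training_history, validation_history, last_epoch) are filled by
# custom handlers that are not shown here, and score_function is assumed to be defined earlier
# in the script (it is referenced by the EarlyStopping handler). Minimal sketches of both,
# assuming accuracy is the quantity to maximize and that train_loader/val_loader come from
# omitted cells.
def score_function(engine):
    return engine.state.metrics["accuracy"]


@trainer.on(Events.EPOCH_COMPLETED)
def log_epoch_results(engine):
    train_evaluator.run(train_loader)
    val_evaluator.run(val_loader)
    for history, evaluator in [(training_history, train_evaluator),
                               (validation_history, val_evaluator)]:
        history["accuracy"].append(evaluator.state.metrics["accuracy"])
        history["loss"].append(evaluator.state.metrics["loss"])
    last_epoch.append(engine.state.epoch)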
def main(dataset_path, batch_size=256, max_epochs=10, opt="O1"): assert torch.cuda.is_available() assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled." torch.backends.cudnn.benchmark = True device = "cuda" train_loader, test_loader, eval_train_loader = get_train_eval_loaders( dataset_path, batch_size=batch_size) model = wide_resnet50_2(num_classes=100).to(device) optimizer = SGD(model.parameters(), lr=0.01) criterion = CrossEntropyLoss().to(device) model, optimizer = amp.initialize(model, optimizer, opt_level=opt) def train_step(engine, batch): x = convert_tensor(batch[0], device, non_blocking=True) y = convert_tensor(batch[1], device, non_blocking=True) optimizer.zero_grad() y_pred = model(x) loss = criterion(y_pred, y) with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() optimizer.step() return loss.item() trainer = Engine(train_step) timer = Timer(average=True) timer.attach(trainer, step=Events.EPOCH_COMPLETED) ProgressBar(persist=True).attach( trainer, output_transform=lambda out: {"batch loss": out}) metrics = {"Accuracy": Accuracy(), "Loss": Loss(criterion)} evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) def log_metrics(engine, title): for name in metrics: print(f"\t{title} {name}: {engine.state.metrics[name]:.2f}") @trainer.on(Events.COMPLETED) def run_validation(_): print(f"- Mean elapsed time for 1 epoch: {timer.value()}") print("- Metrics:") with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Train"): evaluator.run(eval_train_loader) with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Test"): evaluator.run(test_loader) trainer.run(train_loader, max_epochs=max_epochs)
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_dir): train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size) model = Net() device = "cpu" if torch.cuda.is_available(): device = "cuda" model.to(device) # Move model before creating optimizer optimizer = SGD(model.parameters(), lr=lr, momentum=momentum) criterion = nn.CrossEntropyLoss() trainer = create_supervised_trainer(model, optimizer, criterion, device=device) trainer.logger = setup_logger("Trainer") metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)} train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device) train_evaluator.logger = setup_logger("Train Evaluator") validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device) validation_evaluator.logger = setup_logger("Val Evaluator") @trainer.on(Events.EPOCH_COMPLETED) def compute_metrics(engine): train_evaluator.run(train_loader) validation_evaluator.run(val_loader) vd_logger = VisdomLogger(env="mnist_training") vd_logger.attach_output_handler( trainer, event_name=Events.ITERATION_COMPLETED(every=100), tag="training", output_transform=lambda loss: {"batchloss": loss}, ) for tag, evaluator in [("training", train_evaluator), ("validation", validation_evaluator)]: vd_logger.attach_output_handler( evaluator, event_name=Events.EPOCH_COMPLETED, tag=tag, metric_names=["loss", "accuracy"], global_step_transform=global_step_from_engine(trainer), ) vd_logger.attach_opt_params_handler( trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer) vd_logger.attach(trainer, log_handler=WeightsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100)) vd_logger.attach(trainer, log_handler=GradsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100)) def score_function(engine): return engine.state.metrics["accuracy"] model_checkpoint = ModelCheckpoint( log_dir, n_saved=2, filename_prefix="best", score_function=score_function, score_name="validation_accuracy", global_step_transform=global_step_from_engine(trainer), ) validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint, {"model": model}) # kick everything off trainer.run(train_loader, max_epochs=epochs) vd_logger.close()
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval, log_dir): train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size) model = Net() writer = SummaryWriter(log_dir=log_dir) # Use TPU device device = xm.xla_device() model.to(device) # Move model before creating optimizer optimizer = SGD(model.parameters(), lr=lr, momentum=momentum) criterion = nn.NLLLoss() # Create trainer and evaluator trainer = create_supervised_trainer( model, optimizer, criterion, device=device, output_transform=lambda x, y, y_pred, loss: [loss.item(),] ) val_metrics = {"accuracy": Accuracy(), "nll": Loss(criterion)} evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=device) tracker = xm.RateTracker() # Add RateTracker as an output of the training step @trainer.on(Events.ITERATION_COMPLETED) def add_rate_tracker(engine): tracker.add(len(engine.state.batch)) engine.state.output.append(tracker.global_rate()) # Setup output values of the training step as EMA metrics RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "batch_loss") RunningAverage(output_transform=lambda x: x[1]).attach(trainer, "global_rate") # Let's log the EMA metrics every `log_interval` iterations @trainer.on(Events.ITERATION_COMPLETED(every=log_interval)) def log_training_loss(engine): writer.add_scalar("training/batch_loss", engine.state.metrics["batch_loss"], engine.state.iteration) writer.add_scalar("training/global_rate", engine.state.metrics["global_rate"], engine.state.iteration) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): evaluator.run(train_loader) metrics = evaluator.state.metrics avg_accuracy = metrics["accuracy"] avg_nll = metrics["nll"] print( f"Training Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}" ) writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch) writer.add_scalar("training/avg_accuracy", avg_accuracy, engine.state.epoch) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) metrics = evaluator.state.metrics avg_accuracy = metrics["accuracy"] avg_nll = metrics["nll"] print( f"Validation Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}" ) writer.add_scalar("validation/avg_loss", avg_nll, engine.state.epoch) writer.add_scalar("validation/avg_accuracy", avg_accuracy, engine.state.epoch) # kick everything off trainer.run(train_loader, max_epochs=epochs) writer.close()
def main( architecture, batch_size, length_scale, centroid_size, learning_rate, l_gradient_penalty, gamma, weight_decay, final_model, output_dir, ): writer = SummaryWriter(log_dir=f"runs/{output_dir}") ds = all_datasets["CIFAR10"]() input_size, num_classes, dataset, test_dataset = ds # Split up training set idx = list(range(len(dataset))) random.shuffle(idx) if final_model: train_dataset = dataset val_dataset = test_dataset else: val_size = int(len(dataset) * 0.8) train_dataset = torch.utils.data.Subset(dataset, idx[:val_size]) val_dataset = torch.utils.data.Subset(dataset, idx[val_size:]) val_dataset.transform = (test_dataset.transform ) # Test time preprocessing for validation if architecture == "WRN": model_output_size = 640 epochs = 200 milestones = [60, 120, 160] feature_extractor = WideResNet() elif architecture == "ResNet18": model_output_size = 512 epochs = 200 milestones = [60, 120, 160] feature_extractor = resnet18() elif architecture == "ResNet50": model_output_size = 2048 epochs = 200 milestones = [60, 120, 160] feature_extractor = resnet50() elif architecture == "ResNet110": model_output_size = 2048 epochs = 200 milestones = [60, 120, 160] feature_extractor = resnet110() elif architecture == "DenseNet121": model_output_size = 1024 epochs = 200 milestones = [60, 120, 160] feature_extractor = densenet121() # Adapted resnet from: # https://github.com/kuangliu/pytorch-cifar/blob/master/models/resnet.py feature_extractor.conv1 = torch.nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) feature_extractor.maxpool = torch.nn.Identity() feature_extractor.fc = torch.nn.Identity() if centroid_size is None: centroid_size = model_output_size model = ResNet_DUQ( feature_extractor, num_classes, centroid_size, model_output_size, length_scale, gamma, ) model = model.cuda() optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=weight_decay) scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.2) def calc_gradients_input(x, y_pred): gradients = torch.autograd.grad( outputs=y_pred, inputs=x, grad_outputs=torch.ones_like(y_pred), create_graph=True, )[0] gradients = gradients.flatten(start_dim=1) return gradients def calc_gradient_penalty(x, y_pred): gradients = calc_gradients_input(x, y_pred) # L2 norm grad_norm = gradients.norm(2, dim=1) # Two sided penalty gradient_penalty = ((grad_norm - 1)**2).mean() return gradient_penalty def step(engine, batch): model.train() optimizer.zero_grad() x, y = batch x, y = x.cuda(), y.cuda() x.requires_grad_(True) y_pred = model(x) y = F.one_hot(y, num_classes).float() loss = F.binary_cross_entropy(y_pred, y, reduction="mean") if l_gradient_penalty > 0: gp = calc_gradient_penalty(x, y_pred) loss += l_gradient_penalty * gp loss.backward() optimizer.step() x.requires_grad_(False) with torch.no_grad(): model.eval() model.update_embeddings(x, y) return loss.item() def eval_step(engine, batch): model.eval() x, y = batch x, y = x.cuda(), y.cuda() x.requires_grad_(True) y_pred = model(x) return {"x": x, "y": y, "y_pred": y_pred} trainer = Engine(step) evaluator = Engine(eval_step) metric = Average() metric.attach(trainer, "loss") metric = Accuracy(output_transform=lambda out: (out["y_pred"], out["y"])) metric.attach(evaluator, "accuracy") def bce_output_transform(out): return (out["y_pred"], F.one_hot(out["y"], num_classes).float()) metric = Loss(F.binary_cross_entropy, output_transform=bce_output_transform) metric.attach(evaluator, "bce") metric = Loss(calc_gradient_penalty, 
output_transform=lambda out: (out["x"], out["y_pred"])) metric.attach(evaluator, "gradient_penalty") pbar = ProgressBar(dynamic_ncols=True) pbar.attach(trainer) kwargs = {"num_workers": 4, "pin_memory": True} train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, **kwargs) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, **kwargs) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, **kwargs) @trainer.on(Events.EPOCH_COMPLETED) def log_results(trainer): metrics = trainer.state.metrics loss = metrics["loss"] print(f"Train - Epoch: {trainer.state.epoch} Loss: {loss:.2f}") writer.add_scalar("Loss/train", loss, trainer.state.epoch) if trainer.state.epoch > (epochs - 5): accuracy, auroc = get_cifar_svhn_ood(model) print(f"Test Accuracy: {accuracy}, AUROC: {auroc}") writer.add_scalar("OoD/test_accuracy", accuracy, trainer.state.epoch) writer.add_scalar("OoD/roc_auc", auroc, trainer.state.epoch) accuracy, auroc = get_auroc_classification(val_dataset, model) print(f"AUROC - uncertainty: {auroc}") writer.add_scalar("OoD/val_accuracy", accuracy, trainer.state.epoch) writer.add_scalar("OoD/roc_auc_classification", auroc, trainer.state.epoch) evaluator.run(val_loader) metrics = evaluator.state.metrics acc = metrics["accuracy"] bce = metrics["bce"] GP = metrics["gradient_penalty"] loss = bce + l_gradient_penalty * GP print((f"Valid - Epoch: {trainer.state.epoch} " f"Acc: {acc:.4f} " f"Loss: {loss:.2f} " f"BCE: {bce:.2f} " f"GP: {GP:.2f} ")) writer.add_scalar("Loss/valid", loss, trainer.state.epoch) writer.add_scalar("BCE/valid", bce, trainer.state.epoch) writer.add_scalar("GP/valid", GP, trainer.state.epoch) writer.add_scalar("Accuracy/valid", acc, trainer.state.epoch) scheduler.step() trainer.run(train_loader, max_epochs=epochs) evaluator.run(test_loader) acc = evaluator.state.metrics["accuracy"] print(f"Test - Accuracy {acc:.4f}") torch.save(model.state_dict(), f"runs/{output_dir}/model.pt") writer.close()
def training(local_rank, config): rank = idist.get_rank() manual_seed(config["seed"] + rank) device = idist.device() logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank) log_basic_info(logger, config) output_path = config["output_path"] if rank == 0: if config["stop_iteration"] is None: now = datetime.now().strftime("%Y%m%d-%H%M%S") else: now = f"stop-on-{config['stop_iteration']}" folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}" output_path = Path(output_path) / folder_name if not output_path.exists(): output_path.mkdir(parents=True) config["output_path"] = output_path.as_posix() logger.info(f"Output path: {config['output_path']}") if "cuda" in device.type: config["cuda device name"] = torch.cuda.get_device_name(local_rank) if config["with_clearml"]: try: from clearml import Task except ImportError: # Backwards-compatibility for legacy Trains SDK from trains import Task task = Task.init("CIFAR10-Training", task_name=output_path.stem) task.connect_configuration(config) # Log hyper parameters hyper_params = [ "model", "batch_size", "momentum", "weight_decay", "num_epochs", "learning_rate", "num_warmup_epochs", ] task.connect({k: config[k] for k in hyper_params}) # Setup dataflow, model, optimizer, criterion train_loader, test_loader = get_dataflow(config) config["num_iters_per_epoch"] = len(train_loader) model, optimizer, criterion, lr_scheduler = initialize(config) # Create trainer for current task trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger) # Let's now setup evaluator engine to perform model's validation and compute metrics metrics = { "Accuracy": Accuracy(), "Loss": Loss(criterion), } # We define two evaluators as they wont have exactly similar roles: # - `evaluator` will save the best model based on validation score evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) def run_validation(engine): epoch = trainer.state.epoch state = train_evaluator.run(train_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics) state = evaluator.run(test_loader) log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics) trainer.add_event_handler( Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation) if rank == 0: # Setup TensorBoard logging on trainer and evaluators. Logged values are: # - Training metrics, e.g. 
running average loss values # - Learning rate # - Evaluation train/test metrics evaluators = {"training": train_evaluator, "test": evaluator} tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators) # Store 2 best models by validation accuracy starting from num_epochs / 2: best_model_handler = Checkpoint( {"model": model}, get_save_handler(config), filename_prefix="best", n_saved=2, global_step_transform=global_step_from_engine(trainer), score_name="test_accuracy", score_function=Checkpoint.get_default_score_fn("Accuracy"), ) evaluator.add_event_handler( Events.COMPLETED( lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler) # In order to check training resuming we can stop training on a given iteration if config["stop_iteration"] is not None: @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"])) def _(): logger.info( f"Stop training on {trainer.state.iteration} iteration") trainer.terminate() try: trainer.run(train_loader, max_epochs=config["num_epochs"]) except Exception as e: logger.exception("") raise e if rank == 0: tb_logger.close()
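# get_save_handler is defined outside this snippet. A plausible sketch matching the Checkpoint
# usage above: save to disk normally, or route through ClearML when with_clearml is enabled
# (the real helper may differ).
from ignite.handlers import DiskSaver


def get_save_handler(config):
    if config["with_clearml"]:
        from ignite.contrib.handlers.clearml_logger import ClearMLSaver
        return ClearMLSaver(dirname=config["output_path"])
    return DiskSaver(config["output_path"], require_empty=False)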
def setup_training(self, base_model, classifier, setops_model): # # Create the train and test dataset. # train_loader, train_subset_loader, val_loader = self.setup_datasets() logging.info("Setup logging and controls.") # # Setup metrics plotters. # mlflow_logger = MlflowLogger() # # Setup the optimizer. # logging.info("Setup optimizers and losses.") parameters = list(base_model.parameters()) parameters += list(setops_model.parameters()) if self.train_classifier: parameters += list(classifier.parameters()) if self.optimizer_cls == "SGD": optimizer = torch.optim.SGD(parameters, lr=self.lr1, momentum=0.9, weight_decay=self.weight_decay) else: optimizer = torch.optim.Adam(parameters, lr=self.lr1, weight_decay=self.weight_decay) if self.focal_loss: attr_loss = FocalLoss().cuda() else: attr_loss = torch.nn.MultiLabelSoftMarginLoss().cuda() recon_loss = torch.nn.MSELoss( ) if self.recon_loss == "mse" else torch.nn.L1Loss() # # Setup the trainer object and its logging. # logging.info("Setup trainer") trainer = create_setops_trainer(base_model, classifier, setops_model, optimizer, criterion1=attr_loss, criterion2=recon_loss.cuda(), params_object=self, device=self.device) ProgressBar(bar_format=None).attach(trainer) mlflow_logger.attach(engine=trainer, prefix="Train ", plot_event=Events.ITERATION_COMPLETED, update_period=LOG_INTERVAL, output_transform=lambda x: x) # # Define the evaluation metrics. # logging.info("Setup evaluator") evaluation_losses = { 'real class loss': Loss(torch.nn.MultiLabelSoftMarginLoss().cuda(), lambda o: (o["outputs"]["real class a"], o["targets"]["class a"])) + \ Loss(torch.nn.MultiLabelSoftMarginLoss().cuda(), lambda o: (o["outputs"]["real class b"], o["targets"]["class b"])), 'fake class loss': Loss(torch.nn.MultiLabelSoftMarginLoss().cuda(), lambda o: (o["outputs"]["fake class a"], o["targets"]["class a"])) + \ Loss(torch.nn.MultiLabelSoftMarginLoss().cuda(), lambda o: (o["outputs"]["fake class b"], o["targets"]["class b"])), '{} fake loss'.format(self.recon_loss): (Loss(recon_loss.cuda(), lambda o: (o["outputs"]["fake embed a"], o["targets"]["embed a"])) + Loss(recon_loss.cuda(), lambda o: (o["outputs"]["fake embed b"], o["targets"]["embed b"]))) / 2, } labels_list = train_loader.dataset.labels_list mask = labels_list_to_1hot(labels_list, labels_list).astype(np.bool) evaluation_accuracies = { 'real class acc': (MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][ "real class a"], o["targets"]["class a"])) + MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][ "real class b"], o["targets"]["class b"]))) / 2, 'fake class acc': (MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][ "fake class a"], o["targets"]["class a"])) + MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][ "fake class b"], o["targets"]["class b"]))) / 2, 'S class acc': (MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][ "a_S_b class"], o["targets"]["a_S_b class"])) + MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][ "b_S_a class"], o["targets"]["b_S_a class"]))) / 2, 'I class acc': (MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][ "a_I_b class"], o["targets"]["a_I_b class"])) + MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][ "b_I_a class"], o["targets"]["a_I_b class"]))) / 2, 'U class acc': (MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][ "a_U_b class"], o["targets"]["a_U_b class"])) + MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][ "b_U_a class"], o["targets"]["a_U_b class"]))) / 2, 'MSE fake acc': (EWMeanSquaredError(lambda o: 
(o["outputs"]["fake embed a"], o[ "targets"]["embed a"])) + EWMeanSquaredError(lambda o: (o[ "outputs"]["fake embed b"], o["targets"]["embed b"]))) / 2, 'real mAP': mAP(mask=mask, output_transform=lambda o: (o["outputs"]["real class a"], o["targets"]["class a"])), 'fake mAP': mAP(mask=mask, output_transform=lambda o: (o["outputs"]["fake class a"], o["targets"]["class a"])), 'S mAP': mAP(mask=mask, output_transform=lambda o: (o["outputs"]["a_S_b class"], o["targets"]["a_S_b class"])), 'I mAP': mAP(mask=mask, output_transform=lambda o: (o["outputs"]["a_I_b class"], o["targets"]["a_I_b class"])), 'U mAP': mAP(mask=mask, output_transform=lambda o: (o["outputs"]["a_U_b class"], o["targets"]["a_U_b class"])), } # # Setup the training evaluator object and its logging. # train_evaluator = create_setops_evaluator( base_model, classifier, setops_model, metrics=evaluation_accuracies.copy(), device=self.device) mlflow_logger.attach(engine=train_evaluator, prefix="Train Eval ", plot_event=Events.EPOCH_COMPLETED, metric_names=list(evaluation_accuracies.keys())) ProgressBar(bar_format=None).attach(train_evaluator) # # Setup the evaluator object and its logging. # evaluator = create_setops_evaluator(base_model, classifier, setops_model, metrics={ **evaluation_losses, **evaluation_accuracies }, device=self.device) mlflow_logger.attach(engine=evaluator, prefix="Eval ", plot_event=Events.EPOCH_COMPLETED, metric_names=list({ **evaluation_losses, **evaluation_accuracies }.keys())) ProgressBar(bar_format=None).attach(evaluator) # # Checkpoint of the model # self.setup_checkpoint(base_model, classifier, setops_model, evaluator) logging.info("Setup schedulers.") # # Update learning rate manually using the Visdom interface. # one_cycle_size = len(train_loader) * self.warmup_epochs * 2 scheduler_1 = LinearCyclicalScheduler(optimizer, "lr", start_value=self.lr1, end_value=self.lr2, cycle_size=one_cycle_size) scheduler_2 = ReduceLROnPlateau(optimizer, factor=0.5, patience=4 * len(train_loader), cooldown=len(train_loader), output_transform=lambda x: x["main"]) lr_scheduler = ConcatScheduler(schedulers=[scheduler_1, scheduler_2], durations=[one_cycle_size // 2], save_history=True) trainer.add_event_handler(Events.ITERATION_COMPLETED, lr_scheduler) # # Evaluation # @trainer.on(Events.EPOCH_COMPLETED) def epoch_completed(engine): # # Re-randomize the indices of the training dataset. # train_loader.dataset.calc_indices() # # Run the evaluator on a subset of the training dataset. # logging.info("Evaluation on a subset of the training data.") train_evaluator.run(train_subset_loader) # # Run the evaluator on the validation set. # logging.info("Evaluation on the eval data.") evaluator.run(val_loader) return trainer, train_loader
def test_zero_div(): loss = Loss(nll_loss) with pytest.raises(NotComputableError): loss.compute()
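# A complementary sketch of the normal path for the Loss metric, using the same nll_loss/Loss
# imports this test module already relies on: after at least one update, compute() returns the
# batch-size-weighted average of the criterion.
import math

import torch


def test_loss_computes_average():
    loss = Loss(nll_loss)
    y_pred = torch.tensor([[0.1, 0.4, 0.5], [0.1, 0.7, 0.2]]).log()
    y = torch.tensor([2, 1])
    loss.update((y_pred, y))
    assert math.isclose(loss.compute(), nll_loss(y_pred, y).item(), rel_tol=1e-5)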