def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    """Train a FushionNet with SGD + MultiStepLR, reporting progress via tqdm.

    Args:
        train_batch_size: batch size for the training loader.
        val_batch_size: batch size for the validation loader.
        epochs: number of epochs to train.
        lr: initial learning rate.
        momentum: SGD momentum.
        log_interval: refresh the progress bar every N iterations.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = FushionNet()
    # model = torch.load(SAVE_PATH + "350-0.908.pth")  # optionally resume a saved model
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Move the model to the target device before creating the optimizer so the
    # optimizer is built against the device-resident parameters (same pattern
    # as the other `run` in this file).
    model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum,
                          weight_decay=2e-6, nesterov=False)
    # optimizer = optim.Adamax(model.parameters(), lr, (0.9, 0.999), 1e-8, 1e-6)
    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, [50, 100, 150, 200, 250, 300], 0.1)
    trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device)
    evaluator = create_supervised_evaluator(
        model,
        metrics={
            'accuracy': Accuracy(),
            'nll': Loss(F.nll_loss)
        },
        device=device)
    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # 1-based batch index within the current epoch; named `batch_index`
        # instead of `iter` to avoid shadowing the builtin.
        batch_index = (engine.state.iteration - 1) % len(train_loader) + 1
        if batch_index % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        current_lr = optimizer.param_groups[0]['lr']
        tqdm.write(
            "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f} Current lr: {:.6f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll, current_lr))
        # Step the LR schedule once per epoch.
        scheduler.step()

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        # Reset the progress bar for the next epoch.
        pbar.n = pbar.last_print_n = 0
        if engine.state.epoch % 10 == 0:
            # NOTE(review): saves the full model object (not just state_dict),
            # matching the commented-out resume line above.
            torch.save(
                model,
                SAVE_PATH + str(engine.state.epoch) + "-" +
                str(avg_accuracy) + ".pth")

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
def run(train_batch_size, val_batch_size, epochs, lr, momentum):
    """Train a Net with SGD + CrossEntropyLoss, logging to Trains/ClearML.

    Args:
        train_batch_size: batch size for the training loader.
        val_batch_size: batch size for the validation loader.
        epochs: number of epochs to train.
        lr: learning rate for SGD.
        momentum: SGD momentum.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    trainer.logger = setup_logger("Trainer")
    # Shared metric set used by both the train and validation evaluators.
    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        # Re-evaluate both splits at the end of every epoch.
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    trains_logger = TrainsLogger(project_name="examples", task_name="ignite")
    # Batch loss every 100 iterations.
    trains_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )
    # Epoch-level metrics for both evaluators, using the trainer's global step.
    for tag, evaluator in [("training metrics", train_evaluator),
                           ("validation metrics", validation_evaluator)]:
        trains_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )
    # Optimizer parameters (e.g. lr) every 100 iterations.
    trains_logger.attach_opt_params_handler(
        trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer
    )
    # Weight/gradient scalars every 100 iterations; histograms every 100 epochs.
    trains_logger.attach(
        trainer, log_handler=WeightsScalarHandler(model),
        event_name=Events.ITERATION_COMPLETED(every=100)
    )
    trains_logger.attach(trainer, log_handler=WeightsHistHandler(model),
                         event_name=Events.EPOCH_COMPLETED(every=100))
    trains_logger.attach(
        trainer, log_handler=GradsScalarHandler(model),
        event_name=Events.ITERATION_COMPLETED(every=100)
    )
    trains_logger.attach(trainer, log_handler=GradsHistHandler(model),
                         event_name=Events.EPOCH_COMPLETED(every=100))
    # Keep only the single best checkpoint, ranked by validation accuracy.
    handler = Checkpoint(
        {"model": model},
        TrainsSaver(),
        n_saved=1,
        score_function=lambda e: e.state.metrics["accuracy"],
        score_name="val_acc",
        filename_prefix="best",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.EPOCH_COMPLETED, handler)
    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)
    trains_logger.close()
def training(local_rank, config):
    """Per-process entry point for (distributed) CIFAR10 QAT training.

    Args:
        local_rank: local process index (used for device naming / logger rank).
        config: dict of run configuration (paths, hyper-parameters, flags).
    """
    rank = idist.get_rank()
    # Seed differs per rank so data sampling is decorrelated across workers.
    manual_seed(config["seed"] + rank)
    device = idist.device()
    logger = setup_logger(name="CIFAR10-QAT-Training", distributed_rank=local_rank)
    log_basic_info(logger, config)
    output_path = config["output_path"]
    if rank == 0:
        # Only rank 0 creates the run folder and talks to ClearML.
        now = datetime.now().strftime("%Y%m%d-%H%M%S")
        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")
        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)
        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task
            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})
    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)
    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)
    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)
    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }
    # We define two evaluators as they won't have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics,
                                            device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics,
                                                  device=device, non_blocking=True)

    def run_validation(engine):
        # Evaluate both splits and log timing + metrics.
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED,
        run_validation)
    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        # - Training metrics, e.g. running average loss values
        # - Learning rate
        # - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer,
                                            evaluators=evaluators)
        # Store 2 best models by validation accuracy starting from num_epochs / 2:
        best_model_handler = Checkpoint(
            {"model": model},
            get_save_handler(config),
            filename_prefix="best",
            n_saved=2,
            global_step_transform=global_step_from_engine(trainer),
            score_name="test_accuracy",
            score_function=Checkpoint.get_default_score_fn("Accuracy"),
        )
        # Only start checkpointing once training is past its halfway point.
        evaluator.add_event_handler(
            Events.COMPLETED(
                lambda *_: trainer.state.epoch > config["num_epochs"] // 2),
            best_model_handler)
    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        logger.exception("")
        raise e
    if rank == 0:
        tb_logger.close()
def fit_model(model, train_loader, test_loader, lr, max_epochs=5):
    """Train a binary classifier with SGD + BCEWithLogitsLoss via ignite.

    Args:
        model: torch.nn.Module producing raw logits.
        train_loader: DataLoader for training batches of (x, y).
        test_loader: DataLoader for validation batches of (x, y).
        lr: SGD learning rate.
        max_epochs: number of epochs to train (default 5).

    Returns:
        The trained model (same object, mutated in place).
    """
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    criterion = torch.nn.BCEWithLogitsLoss()

    def threshold_output_transform(output):
        # Binarize raw logits at 0 so Accuracy sees hard {0, 1} predictions.
        y_pred, y = output
        y_pred = torch.heaviside(y_pred, values=torch.zeros(1))
        return y_pred, y

    def prepare_batch(batch, device, non_blocking):
        # Cast to float and add a trailing dim to y so it matches the
        # model's (N, 1) logit shape expected by BCEWithLogitsLoss.
        x, y = batch
        x = x.float()
        y = y.float()
        y = torch.unsqueeze(y, 1)
        return (x, y)

    def squeeze_y_dims(output):
        # Identity pass-through kept as the Loss output_transform hook.
        prediction, target = output
        return prediction, target

    trainer = create_supervised_trainer(model, optimizer, criterion,
                                        prepare_batch=prepare_batch)
    val_metrics = {
        "accuracy": Accuracy(threshold_output_transform),
        "bce": Loss(criterion, output_transform=squeeze_y_dims),
    }
    evaluator = create_supervised_evaluator(model, metrics=val_metrics,
                                            prepare_batch=prepare_batch)

    @trainer.on(Events.ITERATION_COMPLETED(every=10))
    def log_training_loss(trainer):
        # NOTE(review): this runs a full evaluation pass over train_loader
        # every 10 iterations, which is expensive for large datasets.
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        print(
            f"Training Results - Epoch: {trainer.state.epoch} Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f}"
        )

    @trainer.on(Events.EPOCH_COMPLETED(every=10))
    def log_validation_results(trainer):
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        print(
            f"Validation Results - Epoch: {trainer.state.epoch} Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f}"
        )

    trainer.run(train_loader, max_epochs=max_epochs)
    return model
def __call__(self, model, train_dataset, val_dataset=None, **_):
    """Train a PyTorch model.

    Args:
        model (torch.nn.Module): PyTorch model to train.
        train_dataset (torch.utils.data.Dataset): Dataset used to train.
        val_dataset (torch.utils.data.Dataset, optional): Dataset used to validate.

    Returns:
        trained_model (torch.nn.Module): Trained PyTorch model.
    """
    assert train_dataset is not None
    train_params = self.train_params
    mlflow_logging = self.mlflow_logging
    if mlflow_logging:
        try:
            import mlflow  # NOQA
        except ImportError:
            log.warning(
                "Failed to import mlflow. MLflow logging is disabled.")
            mlflow_logging = False
    # Pull everything out of the (dict-like) train_params up front.
    loss_fn = train_params.get("loss_fn")
    assert loss_fn
    epochs = train_params.get("epochs")
    seed = train_params.get("seed")
    optimizer = train_params.get("optimizer")
    assert optimizer
    optimizer_params = train_params.get("optimizer_params", dict())
    # Optionally cap dataset sizes (useful for smoke tests / debugging).
    train_dataset_size_limit = train_params.get("train_dataset_size_limit")
    if train_dataset_size_limit:
        train_dataset = PartialDataset(train_dataset, train_dataset_size_limit)
        log.info("train dataset size is set to {}".format(
            len(train_dataset)))
    val_dataset_size_limit = train_params.get("val_dataset_size_limit")
    if val_dataset_size_limit and (val_dataset is not None):
        val_dataset = PartialDataset(val_dataset, val_dataset_size_limit)
        log.info("val dataset size is set to {}".format(len(val_dataset)))
    train_data_loader_params = train_params.get("train_data_loader_params", dict())
    val_data_loader_params = train_params.get("val_data_loader_params", dict())
    evaluation_metrics = train_params.get("evaluation_metrics")
    evaluate_train_data = train_params.get("evaluate_train_data")
    evaluate_val_data = train_params.get("evaluate_val_data")
    progress_update = train_params.get("progress_update")
    scheduler = train_params.get("scheduler")
    scheduler_params = train_params.get("scheduler_params", dict())
    model_checkpoint = train_params.get("model_checkpoint")
    model_checkpoint_params = train_params.get("model_checkpoint_params")
    early_stopping_params = train_params.get("early_stopping_params")
    time_limit = train_params.get("time_limit")
    cudnn_deterministic = train_params.get("cudnn_deterministic")
    cudnn_benchmark = train_params.get("cudnn_benchmark")
    # Reproducibility / cuDNN knobs.
    if seed:
        torch.manual_seed(seed)
        np.random.seed(seed)
    if cudnn_deterministic:
        torch.backends.cudnn.deterministic = cudnn_deterministic
    if cudnn_benchmark:
        torch.backends.cudnn.benchmark = cudnn_benchmark
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # `optimizer` is a class/factory from params; instantiate it here.
    optimizer_ = optimizer(model.parameters(), **optimizer_params)
    trainer = create_supervised_trainer(model, optimizer_, loss_fn=loss_fn,
                                        device=device)
    train_data_loader_params.setdefault("shuffle", True)
    train_data_loader_params.setdefault("drop_last", True)
    train_data_loader_params["batch_size"] = _clip_batch_size(
        train_data_loader_params.get("batch_size", 1), train_dataset, "train")
    train_loader = DataLoader(train_dataset, **train_data_loader_params)
    # ema_loss: exponential moving average of loss; batch_loss: alpha so tiny
    # the "average" effectively tracks only the most recent batch.
    RunningAverage(output_transform=lambda x: x,
                   alpha=0.98).attach(trainer, "ema_loss")
    RunningAverage(output_transform=lambda x: x,
                   alpha=2**(-1022)).attach(trainer, "batch_loss")
    if scheduler:
        # Mix in metric-saving behaviour onto the configured scheduler class.
        class ParamSchedulerSavingAsMetric(
                ParamSchedulerSavingAsMetricMixIn, scheduler):
            pass
        cycle_epochs = scheduler_params.pop("cycle_epochs", 1)
        scheduler_params.setdefault("cycle_size",
                                    int(cycle_epochs * len(train_loader)))
        scheduler_params.setdefault("param_name", "lr")
        scheduler_ = ParamSchedulerSavingAsMetric(optimizer_, **scheduler_params)
        trainer.add_event_handler(Events.ITERATION_STARTED, scheduler_)
    if evaluate_train_data:
        evaluator_train = create_supervised_evaluator(
            model, metrics=evaluation_metrics, device=device)
    if evaluate_val_data:
        val_data_loader_params["batch_size"] = _clip_batch_size(
            val_data_loader_params.get("batch_size", 1), val_dataset, "val")
        val_loader = DataLoader(val_dataset, **val_data_loader_params)
        evaluator_val = create_supervised_evaluator(
            model, metrics=evaluation_metrics, device=device)
    if model_checkpoint_params:
        assert isinstance(model_checkpoint_params, dict)
        minimize = model_checkpoint_params.pop("minimize", True)
        save_interval = model_checkpoint_params.get("save_interval", None)
        if not save_interval:
            # No fixed interval: checkpoint by best ema_loss score instead.
            model_checkpoint_params.setdefault(
                "score_function",
                get_score_function("ema_loss", minimize=minimize))
            model_checkpoint_params.setdefault("score_name", "ema_loss")
        mc = model_checkpoint(**model_checkpoint_params)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, mc, {"model": model})
    if early_stopping_params:
        assert isinstance(early_stopping_params, dict)
        metric = early_stopping_params.pop("metric", None)
        assert (metric is None) or (metric in evaluation_metrics)
        minimize = early_stopping_params.pop("minimize", False)
        if metric:
            assert (
                "score_function" not in early_stopping_params
            ), "Remove either 'metric' or 'score_function' from early_stopping_params: {}".format(
                early_stopping_params)
            early_stopping_params["score_function"] = get_score_function(
                metric, minimize=minimize)
        es = EarlyStopping(trainer=trainer, **early_stopping_params)
        # Attach early stopping to whichever evaluator actually runs.
        if evaluate_val_data:
            evaluator_val.add_event_handler(Events.COMPLETED, es)
        elif evaluate_train_data:
            evaluator_train.add_event_handler(Events.COMPLETED, es)
        elif early_stopping_params:
            log.warning(
                "Early Stopping is disabled because neither "
                "evaluate_val_data nor evaluate_train_data is set True.")
    if time_limit:
        assert isinstance(time_limit, (int, float))
        tl = TimeLimit(limit_sec=time_limit)
        trainer.add_event_handler(Events.ITERATION_COMPLETED, tl)
    pbar = None
    if progress_update:
        if not isinstance(progress_update, dict):
            progress_update = dict()
        progress_update.setdefault("persist", True)
        progress_update.setdefault("desc", "")
        pbar = ProgressBar(**progress_update)
        pbar.attach(trainer, ["ema_loss"])
    else:
        # No progress bar: fall back to logging epoch metrics.
        def log_train_metrics(engine):
            log.info("[Epoch: {} | {}]".format(engine.state.epoch,
                                               engine.state.metrics))
        trainer.add_event_handler(Events.EPOCH_COMPLETED, log_train_metrics)
    if evaluate_train_data:
        def log_evaluation_train_data(engine):
            evaluator_train.run(train_loader)
            train_report = _get_report_str(engine, evaluator_train,
                                           "Train Data")
            if pbar:
                pbar.log_message(train_report)
            else:
                log.info(train_report)
        # A string value selects a custom ignite event by name.
        eval_train_event = (Events[evaluate_train_data] if isinstance(
            evaluate_train_data, str) else Events.EPOCH_COMPLETED)
        trainer.add_event_handler(eval_train_event, log_evaluation_train_data)
    if evaluate_val_data:
        def log_evaluation_val_data(engine):
            evaluator_val.run(val_loader)
            val_report = _get_report_str(engine, evaluator_val, "Val Data")
            if pbar:
                pbar.log_message(val_report)
            else:
                log.info(val_report)
        eval_val_event = (Events[evaluate_val_data] if isinstance(
            evaluate_val_data, str) else Events.EPOCH_COMPLETED)
        trainer.add_event_handler(eval_val_event, log_evaluation_val_data)
    if mlflow_logging:
        mlflow_logger = MLflowLogger()
        # Static run parameters logged once.
        logging_params = {
            "train_n_samples": len(train_dataset),
            "train_n_batches": len(train_loader),
            "optimizer": _name(optimizer),
            "loss_fn": _name(loss_fn),
            "pytorch_version": torch.__version__,
            "ignite_version": ignite.__version__,
        }
        logging_params.update(_loggable_dict(optimizer_params, "optimizer"))
        logging_params.update(
            _loggable_dict(train_data_loader_params, "train"))
        if scheduler:
            logging_params.update({"scheduler": _name(scheduler)})
            logging_params.update(
                _loggable_dict(scheduler_params, "scheduler"))
        if evaluate_val_data:
            logging_params.update({
                "val_n_samples": len(val_dataset),
                "val_n_batches": len(val_loader),
            })
            logging_params.update(
                _loggable_dict(val_data_loader_params, "val"))
        mlflow_logger.log_params(logging_params)
        # Per-iteration batch metrics (+ scheduled param, if any).
        batch_metric_names = ["batch_loss", "ema_loss"]
        if scheduler:
            batch_metric_names.append(scheduler_params.get("param_name"))
        mlflow_logger.attach(
            trainer,
            log_handler=OutputHandler(
                tag="step",
                metric_names=batch_metric_names,
                global_step_transform=global_step_from_engine(trainer),
            ),
            event_name=Events.ITERATION_COMPLETED,
        )
        if evaluate_train_data:
            mlflow_logger.attach(
                evaluator_train,
                log_handler=OutputHandler(
                    tag="train",
                    metric_names=list(evaluation_metrics.keys()),
                    global_step_transform=global_step_from_engine(trainer),
                ),
                event_name=Events.COMPLETED,
            )
        if evaluate_val_data:
            mlflow_logger.attach(
                evaluator_val,
                log_handler=OutputHandler(
                    tag="val",
                    metric_names=list(evaluation_metrics.keys()),
                    global_step_transform=global_step_from_engine(trainer),
                ),
                event_name=Events.COMPLETED,
            )
    trainer.run(train_loader, max_epochs=epochs)
    try:
        if pbar and pbar.pbar:
            pbar.pbar.close()
    except Exception as e:
        log.error(e, exc_info=True)
    # Reload the best/latest checkpointed weights before returning.
    model = load_latest_model(model_checkpoint_params)(model)
    return model
def main():
    """Run a MONAI 3D segmentation demo: synthesize data, train a UNet with ignite."""
    monai.config.print_config()
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    # create a temporary directory and 40 random image, mask pairs
    tempdir = tempfile.mkdtemp()
    print(f"generating synthetic data to {tempdir} (this may take a while)")
    for i in range(40):
        im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1,
                                       channel_dim=-1)
        n = nib.Nifti1Image(im, np.eye(4))
        nib.save(n, os.path.join(tempdir, f"img{i:d}.nii.gz"))
        n = nib.Nifti1Image(seg, np.eye(4))
        nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz"))
    images = sorted(glob(os.path.join(tempdir, "img*.nii.gz")))
    segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz")))
    # First 20 pairs for training, last 20 for validation.
    train_files = [{
        "img": img,
        "seg": seg
    } for img, seg in zip(images[:20], segs[:20])]
    val_files = [{
        "img": img,
        "seg": seg
    } for img, seg in zip(images[-20:], segs[-20:])]
    # define transforms for image and segmentation
    train_transforms = Compose([
        LoadNiftid(keys=["img", "seg"]),
        AsChannelFirstd(keys=["img", "seg"], channel_dim=-1),
        ScaleIntensityd(keys=["img", "seg"]),
        RandCropByPosNegLabeld(keys=["img", "seg"], label_key="seg",
                               size=[96, 96, 96], pos=1, neg=1, num_samples=4),
        RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 2]),
        ToTensord(keys=["img", "seg"]),
    ])
    val_transforms = Compose([
        LoadNiftid(keys=["img", "seg"]),
        AsChannelFirstd(keys=["img", "seg"], channel_dim=-1),
        ScaleIntensityd(keys=["img", "seg"]),
        ToTensord(keys=["img", "seg"]),
    ])
    # define dataset, data loader
    check_ds = monai.data.Dataset(data=train_files, transform=train_transforms)
    # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training
    check_loader = DataLoader(check_ds, batch_size=2, num_workers=4,
                              collate_fn=list_data_collate,
                              pin_memory=torch.cuda.is_available())
    # Sanity-check a single batch's shapes before training.
    check_data = monai.utils.misc.first(check_loader)
    print(check_data["img"].shape, check_data["seg"].shape)
    # create a training data loader
    train_ds = monai.data.Dataset(data=train_files, transform=train_transforms)
    # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training
    train_loader = DataLoader(
        train_ds,
        batch_size=2,
        shuffle=True,
        num_workers=4,
        collate_fn=list_data_collate,
        pin_memory=torch.cuda.is_available(),
    )
    # create a validation data loader
    val_ds = monai.data.Dataset(data=val_files, transform=val_transforms)
    val_loader = DataLoader(val_ds, batch_size=5, num_workers=8,
                            collate_fn=list_data_collate,
                            pin_memory=torch.cuda.is_available())
    # create UNet, DiceLoss and Adam optimizer
    net = monai.networks.nets.UNet(
        dimensions=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    )
    loss = monai.losses.DiceLoss(do_sigmoid=True)
    lr = 1e-3
    opt = torch.optim.Adam(net.parameters(), lr)
    device = torch.device("cuda:0")

    # ignite trainer expects batch=(img, seg) and returns output=loss at every iteration,
    # user can add output_transform to return other values, like: y_pred, y, etc.
    def prepare_batch(batch, device=None, non_blocking=False):
        # Adapt the dict-style MONAI batch to ignite's (x, y) convention.
        return _prepare_batch((batch["img"], batch["seg"]), device, non_blocking)

    trainer = create_supervised_trainer(net, opt, loss, device, False,
                                        prepare_batch=prepare_batch)
    # adding checkpoint handler to save models (network params and optimizer stats) during training
    checkpoint_handler = ModelCheckpoint("./runs/", "net", n_saved=10,
                                         require_empty=False)
    trainer.add_event_handler(event_name=Events.EPOCH_COMPLETED,
                              handler=checkpoint_handler,
                              to_save={
                                  "net": net,
                                  "opt": opt
                              })
    # StatsHandler prints loss at every iteration and print metrics at every epoch,
    # we don't set metrics for trainer here, so just print loss, user can also customize print functions
    # and can use output_transform to convert engine.state.output if it's not loss value
    train_stats_handler = StatsHandler(name="trainer")
    train_stats_handler.attach(trainer)
    # TensorBoardStatsHandler plots loss at every iteration and plots metrics at every epoch, same as StatsHandler
    train_tensorboard_stats_handler = TensorBoardStatsHandler()
    train_tensorboard_stats_handler.attach(trainer)
    validation_every_n_iters = 5
    # set parameters for validation
    metric_name = "Mean_Dice"
    # add evaluation metric to the evaluator engine
    val_metrics = {metric_name: MeanDice(add_sigmoid=True, to_onehot_y=False)}
    # ignite evaluator expects batch=(img, seg) and returns output=(y_pred, y) at every iteration,
    # user can add output_transform to return other values
    evaluator = create_supervised_evaluator(net, val_metrics, device, True,
                                            prepare_batch=prepare_batch)

    @trainer.on(Events.ITERATION_COMPLETED(every=validation_every_n_iters))
    def run_validation(engine):
        evaluator.run(val_loader)

    # add early stopping handler to evaluator
    early_stopper = EarlyStopping(
        patience=4,
        score_function=stopping_fn_from_metric(metric_name),
        trainer=trainer)
    evaluator.add_event_handler(event_name=Events.EPOCH_COMPLETED,
                                handler=early_stopper)
    # add stats event handler to print validation stats via evaluator
    val_stats_handler = StatsHandler(
        name="evaluator",
        output_transform=lambda x: None,  # no need to print loss value, so disable per iteration output
        global_epoch_transform=lambda x: trainer.state.epoch,
    )  # fetch global epoch number from trainer
    val_stats_handler.attach(evaluator)
    # add handler to record metrics to TensorBoard at every validation epoch
    val_tensorboard_stats_handler = TensorBoardStatsHandler(
        output_transform=lambda x: None,  # no need to plot loss value, so disable per iteration output
        global_epoch_transform=lambda x: trainer.state.iteration,
    )  # fetch global iteration number from trainer
    val_tensorboard_stats_handler.attach(evaluator)
    # add handler to draw the first image and the corresponding label and model output in the last batch
    # here we draw the 3D output as GIF format along the depth axis, every 2 validation iterations.
    val_tensorboard_image_handler = TensorBoardImageHandler(
        batch_transform=lambda batch: (batch["img"], batch["seg"]),
        output_transform=lambda output: predict_segmentation(output[0]),
        global_iter_transform=lambda x: trainer.state.epoch,
    )
    evaluator.add_event_handler(event_name=Events.ITERATION_COMPLETED(every=2),
                                handler=val_tensorboard_image_handler)
    train_epochs = 5
    state = trainer.run(train_loader, train_epochs)
    # Clean up the synthetic-data directory.
    shutil.rmtree(tempdir)
def _create_amplitude_evaluator(model):
    """Build a supervised evaluator reporting the 'overlap' metric for `model`."""
    metric_set = {"overlap": OverlapMetric()}
    return create_supervised_evaluator(model, metrics=metric_set)
# Build the training and evaluation engines on GPU; non_blocking=True lets
# host-to-device copies overlap with compute.
trainer = engine.create_supervised_trainer(
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    device='cuda',
    non_blocking=True,
)
evaluator = engine.create_supervised_evaluator(
    model,
    metrics={
        'loss': metrics.Loss(nn.BCELoss()),
        'precision': metrics.Precision(thresholded_transform(threshold=0.5)),
        'recall': metrics.Recall(thresholded_transform(threshold=0.5)),
        # NOTE(review): the original text contained the same mangled key
        # ('[email protected]') twice — a duplicate dict key that silently dropped the
        # 0.3-threshold metric. Reconstructed as two distinct IoU keys matching
        # their thresholds; confirm against the metric-dashboard consumers.
        'iou@0.3': IoUMetric(thresholded_transform(threshold=0.3)),
        'iou@0.5': IoUMetric(thresholded_transform(threshold=0.5)),
    },
    device='cuda',
    non_blocking=True,
    # Model returns a dict; take the 'out' head and squash logits to [0, 1].
    output_transform=lambda x, y, y_pred: (torch.sigmoid(y_pred['out']), y),
)
logging.info(f'creating summary writer with tag {args.model_tag}')
writer = tensorboard.SummaryWriter(log_dir=f'logs/{args.model_tag}')
logging.info('attaching lr scheduler')
# Decay lr by 10% per epoch.
lr_scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
attach_lr_scheduler(trainer, lr_scheduler, writer)
def task_diagnostics(tasks, train_data, val_data, vocabulary, model, args):
    """Train a diagnostic classifier on frozen per-task activations.

    Extracts activations from `model` for each task (1000 train / 500 test
    examples per task), trains a small classifier to predict the task id,
    and returns per-task plus overall test accuracy.

    Args:
        tasks: sequence of task names.
        train_data, val_data: per-task datasets keyed by task name.
        vocabulary: per-task vocabularies keyed by task name.
        model: the trained model whose activations are probed.
        args: namespace with opt_alg, lr, wdecay.

    Returns:
        dict mapping each task name (and "overall") to test accuracy.
    """
    devicea = -1
    if torch.cuda.is_available():
        devicea = 0
    train_activations = []
    test_activations = []
    train_labels = []
    test_labels = []
    for tid, task in enumerate(tasks):
        train_act, train_lab = evaluate_get_dataset(model, task,
                                                    vocabulary[task],
                                                    train_data[task], 1000, tid)
        test_act, test_lab = evaluate_get_dataset(model, task,
                                                  vocabulary[task],
                                                  val_data[task], 500, tid)
        train_activations.append(train_act)
        test_activations.append(test_act)
        train_labels.append(train_lab)
        test_labels.append(test_lab)
    train_activations = torch.cat(train_activations, dim=0)
    test_activations = torch.cat(test_activations, dim=0)
    train_labels = torch.cat(train_labels, dim=0)
    test_labels = torch.cat(test_labels, dim=0)
    print("Activations ", train_activations.shape, test_activations.shape,
          train_labels.shape, test_labels.shape)
    # Datasets
    train_ds = torch.utils.data.TensorDataset(train_activations, train_labels)
    test_ds = torch.utils.data.TensorDataset(test_activations, test_labels)
    train_dl = torch.utils.data.DataLoader(train_ds, batch_size=32)
    # batch_size 2100 keeps the entire test set in one batch so that
    # `val_evaluator.state.output` below holds logits for every test example.
    test_dl = torch.utils.data.DataLoader(test_ds, batch_size=2100)
    # Models and Optimizer
    diag_model = DiagnositicClassifier(train_activations.size(1), 128, len(tasks))
    if devicea != -1:
        diag_model.cuda(devicea)
    optimizer = utils.get_optimizer(args.opt_alg, diag_model.parameters(),
                                    args.lr, args.wdecay)
    criterion = nn.CrossEntropyLoss()
    # ignite training loops
    if devicea == -1:
        trainer = create_supervised_trainer(diag_model, optimizer, criterion)
        evaluator = create_supervised_evaluator(diag_model, {
            "accuracy": Accuracy(),
            "loss": Loss(criterion)
        })
        val_evaluator = create_supervised_evaluator(diag_model, {
            "accuracy": Accuracy(),
            "loss": Loss(criterion)
        })
    else:
        # NOTE(review): the GPU path trains with diag_model.loss_function
        # rather than `criterion` — confirm this asymmetry is intentional.
        trainer = create_supervised_trainer(diag_model, optimizer,
                                            diag_model.loss_function,
                                            device=devicea)
        evaluator = create_supervised_evaluator(diag_model,
                                                metrics={'accuracy': Accuracy()},
                                                device=devicea)
        val_evaluator = create_supervised_evaluator(
            diag_model, metrics={'accuracy': Accuracy()}, device=devicea)

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        evaluator.run(train_dl)
        val_evaluator.run(test_dl)

    def score_function(engine):
        return engine.state.metrics['accuracy']

    early_stop_metric = EarlyStopping(patience=20,
                                      score_function=score_function,
                                      trainer=trainer)
    val_evaluator.add_event_handler(Events.COMPLETED, early_stop_metric)
    trainer.run(train_dl, max_epochs=1000)
    # Last (and only) batch output: logits and labels for the full test set.
    logits, test_labels = val_evaluator.state.output
    _, predicted = torch.max(logits, 1)
    metrics = {}
    for i, task in enumerate(tasks):
        # Each task contributed exactly 500 contiguous test examples.
        start = i * 500
        end = (i + 1) * 500
        correct_this = (predicted[start:end] == test_labels[start:end]).sum()
        metrics[task] = correct_this.item() / 500
        # print("Task based accuracy", start, end, task, correct_this)
    metrics["overall"] = val_evaluator.state.metrics["accuracy"]
    print("Diagnostics metric", metrics)
    return metrics
def training(local_rank, config):
    """Per-process entry point for (distributed) CIFAR10 training.

    Args:
        local_rank: local process index (used for device naming / logger rank).
        config: dict of run configuration (paths, hyper-parameters, flags,
            optional "stop_iteration" to halt early for resume testing).
    """
    rank = idist.get_rank()
    # Seed differs per rank so data sampling is decorrelated across workers.
    manual_seed(config["seed"] + rank)
    device = idist.device()
    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)
    log_basic_info(logger, config)
    output_path = config["output_path"]
    if rank == 0:
        # Only rank 0 creates the run folder and talks to Trains.
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"
        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")
        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)
        if config["with_trains"]:
            from trains import Task
            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})
    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)
    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)
    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)
    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }
    # We define two evaluators as they won't have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics,
                                            device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics,
                                                  device=device, non_blocking=True)

    def run_validation(engine):
        # Evaluate both splits and log timing + metrics.
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED,
        run_validation)
    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        # - Training metrics, e.g. running average loss values
        # - Learning rate
        # - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer,
                                            evaluators=evaluators)
        # Store 3 best models by validation accuracy:
        common.gen_save_best_models_by_val_score(
            save_handler=get_save_handler(config),
            evaluator=evaluator,
            models={"model": model},
            metric_name="accuracy",
            n_saved=3,
            trainer=trainer,
            tag="test",
        )
    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(
                f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        # Previously the traceback was printed and the exception swallowed,
        # letting a failed run exit as success; log and re-raise instead
        # (consistent with the QAT `training` function in this file).
        logger.exception("")
        raise e
    if rank == 0:
        tb_logger.close()
def run(args):
    """Train/evaluate a WaDIQaM/DIQaM image-quality-assessment model.

    Builds an FR (full-reference) or NR (no-reference) network from
    ``args.model``, optionally warm-starts from ``args.resume``, trains with
    ignite, tracks the best validation SROCC, and saves final test metrics.

    Args:
        args: parsed CLI namespace (model, resume, lr, epochs, log_dir,
            weighted_average, multi_gpu, weight_decay, decay_interval,
            decay_ratio, test_during_training, trained_model_file,
            save_result_file, disable_gpu, ...).
    """
    train_loader, val_loader, test_loader, scale = get_data_loaders(args)
    device = torch.device(
        "cuda" if not args.disable_gpu and torch.cuda.is_available() else "cpu")

    lr_ratio = 1  # feature lr / regression lr
    if args.model == 'WaDIQaM-FR' or args.model == 'DIQaM-FR':
        model = FRnet(weighted_average=args.weighted_average)
        if args.resume is not None:
            model.load_state_dict(torch.load(args.resume))
    elif args.model == 'WaDIQaM-NR' or args.model == 'DIQaM-NR':
        model = NRnet(weighted_average=args.weighted_average)
        if args.resume is not None:
            model_dict = model.state_dict()
            if 'FR' in args.resume:
                lr_ratio = 0.1  # set feature lr / regression lr to 1/10
                # Initialize the feature extractor by pretrained FRNet
                pretrained_model = FRnet(
                    weighted_average=args.weighted_average)
                pretrained_model.load_state_dict(torch.load(args.resume))
                pretrained_dict = pretrained_model.state_dict()
                # 1. filter out unnecessary keys
                pretrained_dict = {
                    k: v
                    for k, v in pretrained_dict.items() if k in model_dict
                }
                # 2. overwrite entries in the existing state dict
                model_dict.update(pretrained_dict)
                # 3. load the new state dict
                model.load_state_dict(model_dict)
    else:
        print('Wrong model name!')

    writer = SummaryWriter(log_dir=args.log_dir)
    model = model.to(device)
    print(model)

    def _build_optimizer(net):
        """Build an Adam optimizer with a separate (smaller) lr for feature
        layers: 'fc*' parameters use args.lr, the rest use args.lr * lr_ratio."""
        all_params = net.parameters()
        regression_params = [
            p for pname, p in net.named_parameters() if pname.find('fc') >= 0
        ]
        regression_params_id = list(map(id, regression_params))
        features_params = list(
            filter(lambda p: id(p) not in regression_params_id, all_params))
        return Adam([{
            'params': regression_params
        }, {
            'params': features_params,
            'lr': args.lr * lr_ratio
        }],
                    lr=args.lr,
                    weight_decay=args.weight_decay)

    if args.multi_gpu and torch.cuda.device_count() > 1:
        print("Using multiple GPU")
        model = nn.DataParallel(model)
        # batch_size becomes batch_size * torch.cuda.device_count()
        optimizer = _build_optimizer(model.module)
    else:
        optimizer = _build_optimizer(model)

    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=args.decay_interval,
                                    gamma=args.decay_ratio)

    global best_criterion
    global best_epoch
    best_criterion = -1  # SROCC >= -1
    # Initialize so final_testing_results cannot hit a NameError when no
    # epoch ever improves on best_criterion.
    best_epoch = -1

    trainer = create_supervised_trainer(model, optimizer, IQALoss(),
                                        device=device)
    evaluator = create_supervised_evaluator(
        model, metrics={'IQA_performance': IQAPerformance()}, device=device)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # Losses were computed on rescaled targets; undo the scaling for logging.
        writer.add_scalar("training/loss", scale * engine.state.output,
                          engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        SROCC, KROCC, PLCC, RMSE, MAE, OR = metrics['IQA_performance']
        print(
            "Validation Results - Epoch: {} SROCC: {:.4f} KROCC: {:.4f} PLCC: {:.4f} RMSE: {:.4f} MAE: {:.4f} OR: {:.2f}%"
            .format(engine.state.epoch, SROCC, KROCC, PLCC, scale * RMSE,
                    scale * MAE, 100 * OR))
        writer.add_scalar("SROCC/validation", SROCC, engine.state.epoch)
        writer.add_scalar("KROCC/validation", KROCC, engine.state.epoch)
        writer.add_scalar("PLCC/validation", PLCC, engine.state.epoch)
        writer.add_scalar("RMSE/validation", scale * RMSE, engine.state.epoch)
        writer.add_scalar("MAE/validation", scale * MAE, engine.state.epoch)
        writer.add_scalar("OR/validation", OR, engine.state.epoch)
        scheduler.step(engine.state.epoch)
        global best_criterion
        global best_epoch
        # Only start tracking the best model after 1/6 of training has elapsed.
        if SROCC > best_criterion and engine.state.epoch / args.epochs > 1 / 6:  #
            # if engine.state.epoch/args.epochs > 1/6 and engine.state.epoch % int(args.epochs/10) == 0:
            best_criterion = SROCC
            best_epoch = engine.state.epoch
            try:
                # DataParallel wraps the net in .module; save the bare weights.
                torch.save(model.module.state_dict(),
                           args.trained_model_file)
            except AttributeError:
                torch.save(model.state_dict(), args.trained_model_file)
            # torch.save(model.state_dict(), args.trained_model_file + str(engine.state.epoch))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_testing_results(engine):
        if args.test_during_training:
            evaluator.run(test_loader)
            metrics = evaluator.state.metrics
            SROCC, KROCC, PLCC, RMSE, MAE, OR = metrics['IQA_performance']
            print(
                "Testing Results - Epoch: {} SROCC: {:.4f} KROCC: {:.4f} PLCC: {:.4f} RMSE: {:.4f} MAE: {:.4f} OR: {:.2f}%"
                .format(engine.state.epoch, SROCC, KROCC, PLCC, scale * RMSE,
                        scale * MAE, 100 * OR))
            writer.add_scalar("SROCC/testing", SROCC, engine.state.epoch)
            writer.add_scalar("KROCC/testing", KROCC, engine.state.epoch)
            writer.add_scalar("PLCC/testing", PLCC, engine.state.epoch)
            writer.add_scalar("RMSE/testing", scale * RMSE, engine.state.epoch)
            writer.add_scalar("MAE/testing", scale * MAE, engine.state.epoch)
            writer.add_scalar("OR/testing", OR, engine.state.epoch)

    @trainer.on(Events.COMPLETED)
    def final_testing_results(engine):
        # Reload the best checkpoint and report final test performance.
        global best_epoch
        model.load_state_dict(torch.load(args.trained_model_file))
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        SROCC, KROCC, PLCC, RMSE, MAE, OR = metrics['IQA_performance']
        print(
            "Final Test Results - Epoch: {} SROCC: {:.4f} KROCC: {:.4f} PLCC: {:.4f} RMSE: {:.4f} MAE: {:.4f} OR: {:.2f}%"
            .format(best_epoch, SROCC, KROCC, PLCC, scale * RMSE, scale * MAE,
                    100 * OR))
        np.save(args.save_result_file,
                (SROCC, KROCC, PLCC, scale * RMSE, scale * MAE, OR))

    # kick everything off
    trainer.run(train_loader, max_epochs=args.epochs)
    writer.close()
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    """Train `Net` with SGD + NLL loss, reporting metrics through tqdm.

    Args:
        train_batch_size: batch size for the training loader.
        val_batch_size: batch size for the validation loader.
        epochs: number of epochs to run.
        lr: SGD learning rate.
        momentum: SGD momentum.
        log_interval: update the progress bar every N iterations.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)

    # define a trainer
    trainer = create_supervised_trainer(model, optimizer, F.nll_loss,
                                        device=device)
    # define a evaluator
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy': Accuracy(),
                                                'nll': Loss(F.nll_loss)
                                            },
                                            device=device)

    # Print
    desc = "ITERATION - loss: {:.2f}"  # the loss of each iteration while training
    pbar = tqdm(
        initial=0, leave=False, total=len(train_loader), desc=desc.format(
            0))  # Progress of the current iteration in the entire epoch

    @trainer.on(Events.ITERATION_COMPLETED
                )  # call this function when iteration is completed
    def log_training_loss(engine):
        # 1-based iteration index within the current epoch.
        # (renamed from `iter`, which shadowed the builtin)
        epoch_iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if epoch_iter % log_interval == 0:
            pbar.desc = desc.format(
                engine.state.output)  # update the training loss
            pbar.update(log_interval)  # update the progress bar

    @trainer.on(Events.EPOCH_COMPLETED
                )  # call this function when epoch is completed
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

    @trainer.on(Events.EPOCH_COMPLETED
                )  # call this function when epoch is completed
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        # rewind the progress bar for the next epoch
        pbar.n = pbar.last_print_n = 0

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
# Inisialisasi objek GPU gpu = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Pakai model RESNET untuk transfer learning model = torchvision.models.resnet50(pretrained=True) model.to(gpu) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=learning_rate) trainer = create_supervised_trainer(model, optimizer, criterion, device=gpu) metrics = { "accuracy": Accuracy(), "loss": Loss(criterion) } train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=gpu) val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=gpu) training_history = {"accuracy":[], "loss":[]} validation_history = {"accuracy":[], "loss":[]} last_epoch = [] # RunningAverage metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") # EarlyStopping Callbacks handler = EarlyStopping(patience=10, score_function=score_function, trainer=trainer) val_evaluator.add_event_handler(Events.COMPLETED, handler) # Buat Custom Function # Custom function dibuat untuk menghubungkan dengan dua event yaitu, event saat training dan event saat evaluation. @trainer.on(Events.EPOCH_COMPLETED)
def run(train_batch_size, val_batch_size, epochs, learning_rate, weight_decay,
        log_interval, log_dir):
    """Train CP_MixedNet (BCI classifier) with Adam + NLL loss.

    Logs per-iteration loss and per-epoch train/test/validation metrics to
    both tqdm and TensorBoard, applies early stopping on validation score,
    and finally saves the trained weights to "weights_BCI.pt".
    """
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    test_loader = get_test_loader(val_batch_size)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("Pytorch Version:", torch.__version__)
    print('device={}'.format(device))
    model = CP_MixedNet()
    writer = create_summary_writer(model, train_loader, log_dir)
    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=weight_decay)
    trainer = create_supervised_trainer(model, optimizer, F.nll_loss,
                                        device=device)
    # `evaluator` is reused for both the train and test splits; `evaluator_val`
    # is kept separate because the early-stopping handler hangs off it.
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy': Accuracy(),
                                                'nll': Loss(F.nll_loss)
                                            },
                                            device=device)
    evaluator_val = create_supervised_evaluator(model,
                                                metrics={
                                                    'accuracy': Accuracy(),
                                                    'nll': Loss(F.nll_loss)
                                                },
                                                device=device)
    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # 1-based iteration index within the current epoch.
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)
            writer.add_scalar("training/loss", engine.state.output,
                              engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_test_results(engine):
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Test Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}".
            format(engine.state.epoch, avg_accuracy, avg_nll))
        # rewind the progress bar for the next epoch
        pbar.n = pbar.last_print_n = 0
        writer.add_scalar("test/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("test/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    # Early stopping driven by the validation evaluator (`score_function` is
    # defined elsewhere in this file).
    handler = EarlyStopping(patience=400,
                            score_function=score_function,
                            trainer=trainer)
    evaluator_val.add_event_handler(Events.COMPLETED, handler)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator_val.run(val_loader)
        metrics = evaluator_val.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        # NOTE(review): the progress bar is also reset in log_test_results;
        # this second reset is redundant but harmless.
        pbar.n = pbar.last_print_n = 0
        writer.add_scalar("val/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("val/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
    writer.close()
    save_model = True
    if (save_model):
        torch.save(model.state_dict(), "weights_BCI.pt")
) device = torch.device("cuda:0") def prepare_batch(batch, device=None, non_blocking=False): return _prepare_batch((batch['img'], batch['label']), device, non_blocking) metric_name = 'Accuracy' # add evaluation metric to the evaluator engine val_metrics = {metric_name: Accuracy()} # ignite evaluator expects batch=(img, label) and returns output=(y_pred, y) at every iteration, # user can add output_transform to return other values evaluator = create_supervised_evaluator(net, val_metrics, device, True, prepare_batch=prepare_batch) # Add stats event handler to print validation stats via evaluator val_stats_handler = StatsHandler( name='evaluator', output_transform=lambda x: None # no need to print loss value, so disable per iteration output ) val_stats_handler.attach(evaluator) # for the arrary data format, assume the 3rd item of batch data is the meta_data prediction_saver = ClassificationSaver( output_dir='tempdir', name='evaluator',
def main(cfg, resume_state=None):
    """Config-driven training loop for a Leddartech point-cloud model.

    Creates a timestamped results directory, builds dataset/model/loss/
    optimizer from `cfg` via getattr-dispatch, trains with ignite, saves a
    state checkpoint and validation metrics every epoch, and supports
    resuming from a saved state file.

    Args:
        cfg: nested config dict (typically loaded from YAML).
        resume_state: optional path to a "..._NNN.pt" state file to resume from.

    Returns:
        The path of the created results directory.
    """
    # Fail fast if insert-augmentation is configured but its data is missing.
    if 'AUGMENTATION' in cfg:
        if 'INSERT' in cfg['AUGMENTATION']:
            if not os.path.exists(f"{FILEPATH}/inserts_data/"):
                raise Exception(
                    'Insert data not found (data augmentation). Please run python3 prepare_inserts_data.py --cfg=CONFIG_FILE.'
                )

    # Prepare output data directory
    time_str = datetime.datetime.today().strftime("%Y_%m_%d_%H_%M_%S")
    results_directory = f"{FILEPATH}/results/{time_str}"
    os.makedirs(f"{results_directory}/states/")
    cfg['STATES_DIRECTORY'] = f"{results_directory}/states"
    # Persist the effective config alongside the results for reproducibility.
    with open(f"{results_directory}/config.yml", "w") as f:
        yaml.dump(cfg, f)

    # Random seed
    if 'SEED' in cfg['TRAINING']:
        torch.manual_seed(cfg['TRAINING']['SEED'])

    # Dataloaders
    dataset = LeddartechDataset(cfg)
    train_indices, valid_indices = train_valid_indices(len(dataset),
                                                      cfg['TRAINING'])
    train_subset = Subset(dataset, train_indices)
    valid_subset = Subset(dataset, valid_indices)
    train_loader = DataLoader(train_subset,
                              batch_size=cfg['TRAINING']['BATCH_SIZE'],
                              num_workers=cfg['TRAINING']['NUM_WORKERS'],
                              shuffle=True,
                              drop_last=True)
    valid_loader = DataLoader(valid_subset,
                              batch_size=cfg['TRAINING']['BATCH_SIZE'],
                              num_workers=cfg['TRAINING']['NUM_WORKERS'],
                              drop_last=True)
    print(
        f"Dataset size: {len(dataset)} | training set: {len(train_subset)} | validation set: {len(valid_subset)}"
    )

    # Model: class is looked up by name in the project's `models` module.
    in_channels = dataset.check_number_channels()
    model = getattr(models, cfg['NEURAL_NET']['NAME'])(cfg, in_channels)
    print(f"Model size: {model.size_of_net}")
    if cfg['TRAINING']['DEVICE'] == 'cuda' and torch.cuda.device_count(
    ) > 1:  #Multi GPUs
        model = torch.nn.DataParallel(model)
    model.to(cfg['TRAINING']['DEVICE'])
    print(f"Device set to: {cfg['TRAINING']['DEVICE']}")

    # Loss: first key of cfg['TRAINING']['LOSS'] names the class, its value
    # holds the kwargs.
    loss_function = list(cfg['TRAINING']['LOSS'].keys())[0]
    loss = getattr(losses,
                   loss_function)(cfg, **cfg['TRAINING']['LOSS'][loss_function])

    # Optimizer: same name/kwargs convention, resolved on torch.optim.
    optimizer_function = list(cfg['TRAINING']['OPTIMIZER'].keys())[0]
    optimizer = getattr(torch.optim, optimizer_function)(
        model.parameters(), **cfg['TRAINING']['OPTIMIZER'][optimizer_function])

    # Trainer engine
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        loss,
                                        device=cfg['TRAINING']['DEVICE'])
    pbar = tqdm_logger.ProgressBar(persist=True)
    pbar.attach(trainer, output_transform=lambda x: {'loss': x})

    # Evaluator engine
    eval_metrics = {
        'loss': ignite_loss(loss, device=cfg['TRAINING']['DEVICE'])
    }
    if 'METRICS' in cfg['TRAINING']:
        for metric in cfg['TRAINING']['METRICS']:
            eval_metrics[metric] = getattr(metrics, metric)(
                cfg, **cfg['TRAINING']['METRICS'][metric])
    evaluator = create_supervised_evaluator(model,
                                            metrics=eval_metrics,
                                            device=cfg['TRAINING']['DEVICE'])
    pbar2 = tqdm_logger.ProgressBar(persist=True, desc='Validation')
    pbar2.attach(evaluator)

    # Check for gradient explosion: abort as soon as the loss goes NaN/inf.
    def check_grad(_):
        if not np.isfinite(trainer.state.output):
            print(loss.log)
            raise ValueError("Loss is not finite.")

    trainer.add_event_handler(Events.ITERATION_COMPLETED, check_grad)

    # Learning rate decay: smooth exponential interpolation from 1 down to
    # `factor` over roughly `n_epochs` epochs.
    optimizer.lr_decay_factor = 1

    def lr_decay(_):
        for param_group in optimizer.param_groups:
            ep = trainer.state.epoch
            N = cfg['TRAINING']['SCHEDULER']['DECAY']['n_epochs']
            f = cfg['TRAINING']['SCHEDULER']['DECAY']['factor']
            optimizer.lr_decay_factor = np.exp(
                -ep / N) + f * (1 - np.exp(-ep / N))
            param_group['lr'] = optimizer.lr_decay_factor * cfg['TRAINING'][
                'OPTIMIZER'][optimizer_function]['lr']
            print(f"learning rate set to: {param_group['lr']}")

    if 'SCHEDULER' in cfg['TRAINING']:
        if 'DECAY' in cfg['TRAINING']['SCHEDULER']:
            trainer.add_event_handler(Events.EPOCH_STARTED, lr_decay)

    def handle_epoch_completed(_):
        # Save a checkpoint, then validate with augmentation disabled so the
        # validation pass sees clean data.
        torch.save(
            model.state_dict(),
            f"{cfg['STATES_DIRECTORY']}/{cfg['NEURAL_NET']['STATE_ID']}_{trainer.state.epoch:03d}.pt"
        )
        dataset.data_augmentation = False
        evaluator.run(valid_loader)
        dataset.data_augmentation = True
        print('Validation results: ', evaluator.state.metrics)
        # Append this epoch's metrics to the per-run YAML log.
        with open(f"{results_directory}/{cfg['NEURAL_NET']['STATE_ID']}.yml",
                  "a") as f:
            yaml.dump(
                {f'Epoch {trainer.state.epoch:03d}': evaluator.state.metrics},
                f)

    trainer.add_event_handler(Events.EPOCH_COMPLETED, handle_epoch_completed)

    # Resume training
    def resume_training(trainer):
        if resume_state is not None:
            # Epoch number is encoded in the state filename ("..._NNN.pt").
            resume_epoch = int(resume_state.split('_')[-1].split('.')[0])
            model.load_state_dict(torch.load(resume_state))
            trainer.state.iteration = resume_epoch * len(
                trainer.state.dataloader)
            trainer.state.epoch = resume_epoch
        else:
            # Fresh run: truncate/create the metrics log file.
            with open(
                    f"{results_directory}/{cfg['NEURAL_NET']['STATE_ID']}.yml",
                    "w") as f:
                pass

    trainer.add_event_handler(Events.STARTED, resume_training)

    # Start training
    dataset.data_augmentation = True
    trainer.run(train_loader, max_epochs=cfg['TRAINING']['EPOCHS'])
    return results_directory
def training(local_rank, config):
    """Per-process entry point for (distributed) ImageNet training.

    Mirrors the CIFAR10 variant: sets up output dirs on rank 0, builds
    dataflow/model/optimizer/criterion, wires trainer + evaluators, and runs
    for ``config["num_epochs"]`` epochs.

    Args:
        local_rank: local process index passed by the idist launcher.
        config: dict-like configuration (seed, output_path, model, ...).
    """
    rank = idist.get_rank()
    # Seed per-rank so augmentation differs across workers but stays reproducible.
    manual_seed(config["seed"] + rank)
    device = idist.device()
    logger = setup_logger(name="ImageNet-Training",
                          distributed_rank=local_rank)
    log_basic_info(logger, config)
    output_path = config["output_path"]
    # Only rank 0 creates the run directory.
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = "stop-on-{}".format(config["stop_iteration"])
        folder_name = "{}_backend-{}-{}_{}".format(config["model"],
                                                   idist.backend(),
                                                   idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))
        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_imagenet_dataloader(config)
    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_supervised_trainer(model, optimizer, criterion,
                                        lr_scheduler, train_loader.sampler,
                                        config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }

    # We define two evaluators as they wont have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics,
                                            device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        # Evaluate on both splits, logging timing + metrics for each.
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    # Validate every `validate_every` epochs and once more at the very end.
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"]) |
        Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        # - Training metrics, e.g. running average loss values
        # - Learning rate
        # - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer,
                                            evaluators=evaluators)
        # Store 3 best models by validation accuracy:
        common.gen_save_best_models_by_val_score(
            save_handler=get_save_handler(config),
            evaluator=evaluator,
            models={"model": model},
            metric_name="accuracy",
            n_saved=3,
            trainer=trainer,
            tag="test",
        )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info("Stop training on {} iteration".format(
                trainer.state.iteration))
            trainer.terminate()

    @trainer.on(Events.ITERATION_COMPLETED(every=20))
    def print_acc(engine):
        # NOTE(review): `saved_batch_loss` is not a standard ignite state
        # attribute — presumably set by the project's create_supervised_trainer;
        # verify against that implementation.
        if rank == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}"\
                  .format(engine.state.epoch,
                          engine.state.iteration,
                          len(train_loader),
                          engine.state.saved_batch_loss
                  ))

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        # NOTE(review): broad catch — the traceback is printed and execution
        # continues so the TensorBoard logger below is still closed.
        import traceback
        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
def run(*options, cfg=None):
    """Run training and validation of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options from default.py will be overridden by options loaded from cfg file
        Options passed in via options argument will override option loaded from cfg file

    Args:
        *options (str,int ,optional): Options used to overide what is loaded from the config.
                                      To see what options are available consult
                                      default.py
        cfg (str, optional): Location of config file to load. Defaults to None.
    """
    update_config(config, options=options, config_file=cfg)

    # Start logging
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    # Seed everything for reproducibility.
    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)

    # load the data
    TrainVoxelLoader = get_voxel_loader(config)

    train_set = TrainVoxelLoader(
        config.DATASET.ROOT,
        config.DATASET.FILENAME,
        split="train",
        window_size=config.WINDOW_SIZE,
        len=config.TRAIN.BATCH_SIZE_PER_GPU * config.TRAIN.BATCH_PER_EPOCH,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
    )
    val_set = TrainVoxelLoader(
        config.DATASET.ROOT,
        config.DATASET.FILENAME,
        split="val",
        window_size=config.WINDOW_SIZE,
        len=config.TRAIN.BATCH_SIZE_PER_GPU * config.TRAIN.BATCH_PER_EPOCH,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
    )

    n_classes = train_set.n_classes

    # set dataset length to batch size to be consistent with 5000 iterations
    # each of size 32 in the original Waldeland implementation
    train_loader = data.DataLoader(
        train_set,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
        num_workers=config.WORKERS,
        shuffle=False,
    )
    val_loader = data.DataLoader(
        val_set,
        batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU,
        num_workers=config.WORKERS,
        shuffle=False,
    )

    # this is how we import model for CV - here we're importing a seismic
    # segmentation model
    model = TextureNet(n_classes=config.DATASET.NUM_CLASSES)

    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=config.TRAIN.LR,
        # momentum=config.TRAIN.MOMENTUM,
        weight_decay=config.TRAIN.WEIGHT_DECAY,
    )

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
        model = model.cuda()

    loss = torch.nn.CrossEntropyLoss()

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        loss,
                                        prepare_batch=_prepare_batch,
                                        device=device)

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    # add model checkpointing
    output_dir = path.join(config.OUTPUT_DIR, config.TRAIN.MODEL_DIR)
    checkpoint_handler = ModelCheckpoint(
        output_dir,
        "model",
        save_interval=1,
        n_saved=3,
        create_dir=True,
        require_empty=False,
    )

    criterion = torch.nn.CrossEntropyLoss(reduction="mean")

    # save model at each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                              {config.MODEL.NAME: model})

    def _select_pred_and_mask(model_out):
        """Adapt evaluator output ((y_pred, y)) to the (prediction, mask)
        pair the segmentation metrics expect."""
        # receive a tuple of (x, y_pred), y
        # so actually in line 51 of
        # cv_lib/cv_lib/segmentation/dutch_f3/metrics/__init__.py
        # we do the following line, so here we just select the model
        # _, y_pred = torch.max(model_out[0].squeeze(), 1, keepdim=True)
        y_pred = model_out[0].squeeze()
        y = model_out[1].squeeze()
        return (y_pred.squeeze(), y)

    evaluator = create_supervised_evaluator(
        model,
        metrics={
            "nll": Loss(criterion, device=device),
            "pixa": pixelwise_accuracy(n_classes,
                                       output_transform=_select_pred_and_mask,
                                       device=device),
            "cacc": class_accuracy(n_classes,
                                   output_transform=_select_pred_and_mask,
                                   device=device),
            "mca": mean_class_accuracy(n_classes,
                                       output_transform=_select_pred_and_mask,
                                       device=device),
            "ciou": class_iou(n_classes,
                              output_transform=_select_pred_and_mask,
                              device=device),
            "mIoU": mean_iou(n_classes,
                             output_transform=_select_pred_and_mask,
                             device=device),
        },
        device=device,
        prepare_batch=_prepare_batch,
    )

    # Set the validation run to start on the epoch completion of the training run
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              Evaluator(evaluator, val_loader))

    # Single TensorBoard writer, shared by the handlers below.
    # (Fix: the original created a second, unused writer further down,
    # leaking the duplicate.)
    summary_writer = create_summary_writer(log_dir=path.join(output_dir,
                                                             config.LOG_DIR))

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        logging_handlers.log_metrics(
            "Validation results",
            metrics_dict={
                "mIoU": "Avg IoU :",
                "nll": "Avg loss :",
                "pixa": "Pixelwise Accuracy :",
                "mca": "Mean Class Accuracy :",
            },
        ),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        tensorboard_handlers.log_metrics(
            summary_writer,
            trainer,
            "epoch",
            metrics_dict={
                "mIoU": "Validation/IoU",
                "nll": "Validation/Loss",
                "mca": "Validation/MCA",
            },
        ),
    )

    # Snapshot the best-mIoU model every other iteration-aligned check.
    snapshot_duration = 2

    def snapshot_function():
        return (trainer.state.iteration % snapshot_duration) == 0

    checkpoint_handler = SnapshotHandler(
        path.join(output_dir, config.TRAIN.MODEL_DIR),
        config.MODEL.NAME,
        extract_metric_from("mIoU"),
        snapshot_function,
    )
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                {"model": model})

    logger.info("Starting training")
    trainer.run(train_loader,
                max_epochs=config.TRAIN.END_EPOCH // config.TRAIN.BATCH_PER_EPOCH)
    pbar.close()
def main():
    """CLI entry point: train / evaluate / make a submission for a
    character-classification model driven by segmentation predictions.

    Modes (mutually exclusive): default = train; --test-only = evaluate a
    resumed checkpoint; --submission = produce a submission CSV.
    """
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('clf_gt', help='segmentation predictions')
    # Dataset params
    arg('--test-height', type=int, default=2528)
    arg('--crop-height', type=int, default=768)
    arg('--crop-width', type=int, default=512)
    arg('--scale-aug', type=float, default=0.3)
    arg('--color-hue-aug', type=int, default=7)
    arg('--color-sat-aug', type=int, default=30)
    arg('--color-val-aug', type=int, default=30)
    arg('--n-tta', type=int, default=1)
    arg('--pseudolabels', nargs='+',
        help='path to pseudolabels to be added to train')
    arg('--pseudolabels-oversample', type=int, default=1)
    arg('--test-book', help='use only this book for testing and pseudolabels')
    arg('--fold', type=int, default=0)
    arg('--n-folds', type=int, default=5)
    arg('--train-limit', type=int)
    arg('--test-limit', type=int)
    # Model params
    arg('--base', default='resnet50')
    arg('--use-sequences', type=int, default=0)
    arg('--head-dropout', type=float, default=0.5)
    arg('--frozen-start', type=int)
    arg('--head', type=str, default='Head')
    # Training params
    arg('--device', default='cuda', help='device')
    arg('--opt-level', help='pass 01 to use fp16 training with apex')
    arg('--benchmark', type=int)
    arg('--batch-size', default=10, type=int)
    arg('--max-targets', type=int)
    arg('--workers', default=8, type=int,
        help='number of data loading workers')
    arg('--lr', default=14e-3, type=float, help='initial learning rate')
    arg('--wd', default=1e-4, type=float, help='weight decay')
    arg('--optimizer', default='sgd')
    arg('--accumulation-steps', type=int, default=1)
    arg('--epochs', default=50, type=int, help='number of total epochs to run')
    arg('--repeat-train', type=int, default=6)
    arg('--drop-lr-epoch', default=0, type=int,
        help='epoch at which to drop lr')
    arg('--cosine', type=int, default=1, help='cosine lr schedule')
    # Misc. params
    arg('--output-dir', help='path where to save')
    arg('--resume', help='resume from checkpoint')
    arg('--test-only', help='Only test the model', action='store_true')
    arg('--submission', help='Create submission', action='store_true')
    arg('--detailed-postfix', default='',
        help='postfix of detailed file name')
    arg('--print-model', default=1, type=int)
    arg('--dump-features', default=0, type=int)  # for knn, unused
    args = parser.parse_args()
    if args.test_only and args.submission:
        parser.error('pass one of --test-only and --submission')
    print(args)
    output_dir = Path(args.output_dir) if args.output_dir else None
    if output_dir:
        output_dir.mkdir(parents=True, exist_ok=True)
        if not args.resume:
            # Record the run's hyperparameters next to its outputs.
            (output_dir / 'params.json').write_text(
                json.dumps(vars(args), indent=4))

    print('Loading data')
    df_train_gt, df_valid_gt = load_train_valid_df(args.fold, args.n_folds)
    df_clf_gt = load_train_df(args.clf_gt)[['labels', 'image_id']]
    if args.submission:
        # Submission mode: "valid" is the whole test set; pages with no
        # labels are held out and emitted as empty predictions later.
        df_valid = df_train = df_clf_gt
        empty_index = df_valid['labels'] == ''
        empty_pages = df_valid[empty_index]['image_id'].values
        df_valid = df_valid[~empty_index]
    else:
        # Restrict classifier data to the fold's train/valid image ids.
        df_train, df_valid = [
            df_clf_gt[df_clf_gt['image_id'].isin(set(df['image_id']))]
            for df in [df_train_gt, df_valid_gt]
        ]
        df_valid = df_valid[df_valid['labels'] != '']
    if args.pseudolabels:
        # Mix (optionally oversampled) pseudo-labelled pages into train.
        df_ps = pd.concat(
            [pd.read_csv(p)[df_train.columns] for p in args.pseudolabels])
        if args.test_book:
            df_ps = df_ps[df_ps['image_id'].apply(
                lambda x: get_book_id(x) == args.test_book)]
        df_train = (
            pd.concat([df_train] +
                      [df_ps] * args.pseudolabels_oversample).reset_index(
                          drop=True))
    if args.test_book:
        df_valid = df_valid[df_valid['image_id'].apply(
            lambda x: get_book_id(x) == args.test_book)]
    if args.train_limit:
        df_train = df_train.sample(n=args.train_limit, random_state=42)
    if args.test_limit:
        df_valid = df_valid.sample(n=args.test_limit, random_state=42)
    # Ground-truth lookup used by evaluate() to score box predictions.
    gt_by_image_id = {item.image_id: item for item in df_valid_gt.itertuples()}
    print(f'{len(df_train):,} in train, {len(df_valid):,} in valid')
    classes = get_encoded_classes()

    def _get_transforms(*, train: bool):
        # For TTA, spread test heights over [h, h*(1+scale_aug)].
        if not train and args.n_tta > 1:
            test_heights = [
                args.test_height * (1 + s)
                for s in np.linspace(0, args.scale_aug, args.n_tta)
            ]
            print('TTA test heights:', list(map(int, test_heights)))
        else:
            test_heights = [args.test_height]
        return [
            get_transform(
                train=train,
                test_height=test_height,
                crop_width=args.crop_width,
                crop_height=args.crop_height,
                scale_aug=args.scale_aug,
                color_hue_aug=args.color_hue_aug,
                color_sat_aug=args.color_sat_aug,
                color_val_aug=args.color_val_aug,
            ) for test_height in test_heights
        ]

    def make_test_data_loader(df):
        # Evaluation loader: batch_size 1, deterministic, no resampling.
        return DataLoader(
            Dataset(
                df=df,
                transforms=_get_transforms(train=False),
                resample_empty=False,
                classes=classes,
            ),
            batch_size=1,
            collate_fn=collate_fn,
            num_workers=args.workers,
        )

    data_loader_test = make_test_data_loader(df_valid)
    if args.dump_features:
        # unused
        df_train = df_train[df_train['labels'] != '']
        data_loader_train = make_test_data_loader(df_train)
    else:
        # Training loader: epoch is df_train repeated `repeat_train` times.
        data_loader_train = DataLoader(
            Dataset(
                df=pd.concat([df_train] * args.repeat_train),
                transforms=_get_transforms(train=True),
                resample_empty=True,
                classes=classes,
            ),
            num_workers=args.workers,
            shuffle=True,
            collate_fn=partial(collate_fn, max_targets=args.max_targets),
            batch_size=args.batch_size,
        )
    print('Creating model')
    fp16 = bool(args.opt_level)
    model: nn.Module = build_model(
        base=args.base,
        head=args.head,
        frozen_start=args.frozen_start,
        fp16=fp16,
        n_classes=len(classes),
        head_dropout=args.head_dropout,
        use_sequences=bool(args.use_sequences),
    )
    if args.print_model:
        print(model)
    device = torch.device(args.device)
    model.to(device)
    if args.benchmark:
        torch.backends.cudnn.benchmark = True
    parameters = model.parameters()
    if args.optimizer == 'adam':
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=args.wd)
    elif args.optimizer == 'sgd':
        optimizer = optim.SGD(parameters,
                              lr=args.lr,
                              weight_decay=args.wd,
                              momentum=0.9)
    else:
        parser.error(f'Unexpected optimzier {args.optimizer}')
    if fp16:
        from apex import amp
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.opt_level)
    loss = nn.CrossEntropyLoss()
    step = epoch = 0
    best_f1 = 0
    if args.resume:
        state = torch.load(args.resume, map_location='cpu')
        if 'optimizer' in state:
            # Full training checkpoint: restore optimizer + counters too.
            optimizer.load_state_dict(state['optimizer'])
            model.load_state_dict(state['model'])
            step = state['step']
            epoch = state['epoch']
            best_f1 = state['best_f1']
        else:
            # Bare state_dict checkpoint.
            model.load_state_dict(state)
        del state

    @contextmanager
    def no_benchmark():
        # Temporarily disable cudnn benchmarking (variable input sizes at eval).
        torch.backends.cudnn.benchmark = False
        yield
        if args.benchmark:
            torch.backends.cudnn.benchmark = True

    if args.dump_features and not args.submission:
        # unused
        if not output_dir:
            parser.error('set --output-dir with --dump-features')
        # We also dump test features below
        feature_evaluator = create_supervised_evaluator(
            model,
            device=device,
            prepare_batch=_prepare_batch,
            metrics={'features': GetFeatures(n_tta=args.n_tta)},
        )
        with no_benchmark():
            run_with_pbar(feature_evaluator, data_loader_train,
                          desc='train features')
        torch.save(feature_evaluator.state.metrics['features'],
                   output_dir / 'train_features.pth')

    def get_y_pred_y(output):
        # Adapt raw engine output to (logits, labels) for Accuracy/Loss.
        y_pred, y = output
        return get_output(y_pred), get_labels(y)

    metrics = {
        'accuracy': Accuracy(output_transform=get_y_pred_y),
        'loss': Loss(loss, output_transform=get_y_pred_y),
        'predictions': GetPredictions(n_tta=args.n_tta, classes=classes),
        'detailed': GetDetailedPrediction(n_tta=args.n_tta, classes=classes),
    }
    if args.dump_features:
        metrics['features'] = GetFeatures(n_tta=args.n_tta)
    evaluator = create_supervised_evaluator(model,
                                            device=device,
                                            prepare_batch=_prepare_batch,
                                            metrics=metrics)

    def evaluate():
        # Run the evaluator, score predicted centers/labels against ground
        # truth boxes, and return a metrics dict (incl. f1 via get_metrics).
        with no_benchmark():
            run_with_pbar(evaluator, data_loader_test, desc='evaluate')
        metrics = {
            'valid_loss': evaluator.state.metrics['loss'],
            'accuracy': evaluator.state.metrics['accuracy'],
        }
        scores = []
        for prediction, meta in evaluator.state.metrics['predictions']:
            item = gt_by_image_id[meta['image_id']]
            target_boxes, target_labels = get_target_boxes_labels(item)
            target_boxes = torch.from_numpy(target_boxes)
            pred_centers = np.array([p['center'] for p in prediction])
            pred_labels = [p['cls'] for p in prediction]
            scores.append(
                dict(score_boxes(
                    truth_boxes=from_coco(target_boxes).numpy(),
                    truth_label=target_labels,
                    preds_center=pred_centers,
                    preds_label=np.array(pred_labels),
                ),
                     image_id=item.image_id))
        metrics.update(get_metrics(scores))
        if output_dir:
            pd.DataFrame(evaluator.state.metrics['detailed']).to_csv(
                output_dir / f'detailed{args.detailed_postfix}.csv.gz',
                index=None)
        if args.dump_features:
            f_name = 'test' if args.submission else 'valid'
            torch.save(evaluator.state.metrics['features'],
                       output_dir / f'{f_name}_features.pth')
        return metrics

    def make_submission():
        # Evaluate on the test set and write submission + detailed CSVs.
        with no_benchmark():
            run_with_pbar(evaluator, data_loader_test, desc='evaluate')
        submission = []
        for prediction, meta in tqdm.tqdm(
                evaluator.state.metrics['predictions']):
            submission.append(submission_item(meta['image_id'], prediction))
        # Pages that had no labels get explicit empty predictions.
        submission.extend(
            submission_item(image_id, []) for image_id in empty_pages)
        pd.DataFrame(submission).to_csv(output_dir /
                                        f'submission_{output_dir.name}.csv.gz',
                                        index=None)
        pd.DataFrame(evaluator.state.metrics['detailed']).to_csv(
            output_dir / f'test_detailed{args.detailed_postfix}.csv.gz',
            index=None)
        if args.dump_features:
            torch.save(evaluator.state.metrics['features'],
                       output_dir / 'test_features.pth')

    if args.test_only or args.submission:
        if not args.resume:
            parser.error('please pass --resume when running with --test-only '
                         'or --submission')
        if args.test_only:
            print_metrics(evaluate())
        elif args.submission:
            if not output_dir:
                parser.error('--output-dir required with --submission')
            make_submission()
        return

    trainer = create_supervised_trainer(
        model,
        optimizer,
        loss_fn=lambda y_pred, y: loss(get_output(y_pred), get_labels(y)),
        device=device,
        prepare_batch=_prepare_batch,
        accumulation_steps=args.accumulation_steps,
        fp16=fp16,
    )
    # `epoch` may be non-zero after resume; only run the remaining epochs.
    epochs_left = args.epochs - epoch
    epochs_pbar = tqdm.trange(epochs_left)
    epoch_pbar = tqdm.trange(len(data_loader_train))
    # Smooth the displayed loss over the last 20 iterations.
    train_losses = deque(maxlen=20)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(_):
        nonlocal step
        train_losses.append(trainer.state.output)
        smoothed_loss = np.mean(train_losses)
        epoch_pbar.set_postfix(loss=f'{smoothed_loss:.4f}')
        epoch_pbar.update(1)
        step += 1
        if step % 20 == 0 and output_dir:
            json_log_plots.write_event(output_dir,
                                       step=step * args.batch_size,
                                       loss=smoothed_loss)

    @trainer.on(Events.EPOCH_COMPLETED)
    def checkpoint(_):
        # Full resumable checkpoint (model + optimizer + counters).
        if output_dir:
            torch.save(
                {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'step': step,
                    'epoch': epoch,
                    'best_f1': best_f1,
                }, output_dir / 'checkpoint.pth')

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(_):
        nonlocal best_f1
        metrics = evaluate()
        if output_dir:
            json_log_plots.write_event(output_dir,
                                       step=step * args.batch_size,
                                       **metrics)
        # Track the best model by validation f1.
        if metrics['f1'] > best_f1:
            best_f1 = metrics['f1']
            if output_dir:
                torch.save(model.state_dict(), output_dir / 'model_best.pth')
        epochs_pbar.set_postfix(
            {k: format_value(v) for k, v in metrics.items()})

    @trainer.on(Events.EPOCH_COMPLETED)
    def update_pbars_on_epoch_completion(_):
        nonlocal epoch
        epochs_pbar.update(1)
        epoch_pbar.reset()
        epoch += 1

    scheduler = None
    if args.drop_lr_epoch and args.cosine:
        parser.error('Choose only one schedule')
    if args.drop_lr_epoch:
        scheduler = StepLR(optimizer, step_size=args.drop_lr_epoch, gamma=0.1)
    if args.cosine:
        scheduler = CosineAnnealingLR(optimizer, epochs_left)
    if scheduler is not None:
        trainer.on(Events.EPOCH_COMPLETED)(lambda _: scheduler.step())
    trainer.run(data_loader_train, max_epochs=epochs_left)
def train(model, model_name, train_dataloader, eval_dataloader, labels_name, trainer_name='ocr', backbone_url=None):
    """Train an OCR-style model with mixed precision and ignite engines.

    Runs a single pass over ``train_dataloader`` (``max_epochs=1``) with:
    AMP (``GradScaler`` + ``autocast``), ``ReduceLROnPlateau`` stepped on the
    raw batch loss, periodic checkpointing, evaluation by edit distance,
    early stopping, and TensorBoard logging.

    Weight initialization priority (first match wins):
      1. ``{trainer_name}_{model_name}_checkpoint.pt`` — full resume
         (model, optimizer, scaler, lr_scheduler).
      2. ``{model_name}_backbone.pt`` — transfer weights, skipping ``neck.``
         and ``fc.`` parameters.
      3. ``backbone_url`` — download backbone-only weights via torch.hub.

    NOTE(review): ``labels_name`` is accepted but never read in this
    function — confirm whether callers rely on it elsewhere.
    """
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scaler = torch.cuda.amp.GradScaler()

    def _prepare_batch(batch, device=None, non_blocking=False):
        """Prepare batch for training: pass to a device with options. """
        # Labels are a list of per-sample tensors (variable length), so each
        # one is moved individually.
        images, labels = batch
        images = images.to(device)
        labels = [label.to(device) for label in labels]
        return (images, labels)

    writer = SummaryWriter(log_dir=f'logs/{trainer_name}/{model_name}')
    # Plateau detection is on *iterations* (stepped every 10 below), hence the
    # large patience/cooldown values.
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                              factor=0.5,
                                                              patience=250,
                                                              cooldown=100,
                                                              min_lr=1e-6)

    def _update(engine, batch):
        # One AMP training step; the model itself computes the loss.
        model.train()
        optimizer.zero_grad()
        x, y = _prepare_batch(batch, device=device)
        # loss = model(x, y)
        # loss.backward()
        # optimizer.step()
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            loss = model(x, y)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        return loss.item()

    trainer = Engine(_update)
    evaluator = create_supervised_evaluator(
        model,
        prepare_batch=_prepare_batch,
        metrics={'edit_distance': EditDistanceMetric()},
        device=device)

    if path.exists(f'{trainer_name}_{model_name}_checkpoint.pt'):
        # Full resume: restore every piece of training state.
        checkpoint = torch.load(f'{trainer_name}_{model_name}_checkpoint.pt')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scaler.load_state_dict(checkpoint['scaler'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        logging.info(
            f'load checkpoint {trainer_name}_{model_name}_checkpoint.pt')
    elif path.exists(f'{model_name}_backbone.pt'):
        # Transfer learning: copy only matching keys, excluding neck/fc heads.
        pretrained_dict = torch.load(f'{model_name}_backbone.pt')['model']
        model_dict = model.state_dict()
        pretrained_dict = {
            k: v
            for k, v in pretrained_dict.items()
            if k in model_dict and 'neck.' not in k and 'fc.' not in k
        }
        model_dict.update(pretrained_dict)
        model.load_state_dict(model_dict)
        logging.info(f'load transfer parameters from {model_name}_backbone.pt')
    elif backbone_url is not None:
        # Backbone-only initialization from a downloadable state dict.
        pretrained_dict = torch.hub.load_state_dict_from_url(backbone_url,
                                                             progress=False)
        model_dict = model.backbone.state_dict()
        pretrained_dict = {
            k: v
            for k, v in pretrained_dict.items() if k in model_dict
        }
        model_dict.update(pretrained_dict)
        model.backbone.load_state_dict(model_dict)
        logging.info(f'load backbone from {backbone_url}')

    # Mutable cell so the score function can keep state across calls.
    early_stop_arr = [0.0]

    def early_stop_score_function(engine):
        val_acc = engine.state.metrics['edit_distance']
        if val_acc < 0.8:
            # Below 0.8 accuracy, return a strictly increasing tiny score so
            # EarlyStopping always sees an "improvement" and never fires.
            early_stop_arr[0] += 0.000001
            return early_stop_arr[0]
        return val_acc

    early_stop_handler = EarlyStopping(
        patience=20, score_function=early_stop_score_function, trainer=trainer)
    evaluator.add_event_handler(Events.COMPLETED, early_stop_handler)

    checkpoint_handler = ModelCheckpoint(f'models/{trainer_name}/{model_name}',
                                         model_name,
                                         n_saved=10,
                                         create_dir=True)
    # trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), checkpoint_handler,
    #                           {'model': model, 'optimizer': optimizer, 'lr_scheduler': lr_scheduler})
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED(every=1000), checkpoint_handler, {
            'model': model,
            'optimizer': optimizer,
            'lr_scheduler': lr_scheduler,
            'scaler': scaler
        })

    @trainer.on(Events.ITERATION_COMPLETED(every=10))
    def log_training_loss(trainer):
        # Console + TensorBoard logging of batch loss and current LR.
        lr = optimizer.param_groups[0]['lr']
        logging.info("Epoch[{}]: {} - Loss: {:.4f}, Lr: {}".format(
            trainer.state.epoch, trainer.state.iteration, trainer.state.output,
            lr))
        writer.add_scalar("training/loss", trainer.state.output,
                          trainer.state.iteration)
        writer.add_scalar("training/learning_rate", lr,
                          trainer.state.iteration)

    @trainer.on(Events.ITERATION_COMPLETED(every=10))
    def step_lr(trainer):
        # Feed the raw batch loss to ReduceLROnPlateau every 10 iterations.
        lr_scheduler.step(trainer.state.output)

    @trainer.on(Events.ITERATION_COMPLETED(every=1000))
    def log_training_results(trainer):
        evaluator.run(eval_dataloader)
        metrics = evaluator.state.metrics
        logging.info(
            "Eval Results - Epoch[{}]: {} - Avg edit distance: {:.4f}".format(
                trainer.state.epoch, trainer.state.iteration,
                metrics['edit_distance']))
        writer.add_scalar("evaluation/avg_edit_distance",
                          metrics['edit_distance'], trainer.state.iteration)

    @trainer.on(Events.ITERATION_COMPLETED(every=100))
    def read_lr_from_file(trainer):
        # Manual LR override: drop a file named 'lr.txt' containing a float
        # next to the process to force a new learning rate at runtime.
        if path.exists('lr.txt'):
            with open('lr.txt', 'r', encoding='utf-8') as f:
                lr = float(f.read())
            for group in optimizer.param_groups:
                group['lr'] = lr

    trainer.run(train_dataloader, max_epochs=1)
def main(dataset_path, batch_size=256, max_epochs=10):
    """Train a 100-class wide_resnet50_2 with native torch.cuda.amp.

    Requires CUDA + cudnn. After training completes, evaluates accuracy and
    loss on both a training subset and the test set.
    """
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    device = "cuda"
    train_loader, test_loader, eval_train_loader = get_train_eval_loaders(
        dataset_path, batch_size=batch_size)

    model = wide_resnet50_2(num_classes=100).to(device)
    optimizer = SGD(model.parameters(), lr=0.01)
    criterion = CrossEntropyLoss().to(device)
    scaler = GradScaler()

    def train_step(engine, batch):
        # Move the batch to the GPU without blocking the host.
        inputs = convert_tensor(batch[0], device, non_blocking=True)
        targets = convert_tensor(batch[1], device, non_blocking=True)

        optimizer.zero_grad()

        # Forward pass runs under autocast; backward runs outside it on the
        # scaled loss, as recommended for AMP.
        with autocast():
            predictions = model(inputs)
            loss = criterion(predictions, targets)

        scaler.scale(loss).backward()
        # scaler.step() unscales gradients first and skips optimizer.step()
        # whenever they contain infs/NaNs; update() then adjusts the scale.
        scaler.step(optimizer)
        scaler.update()

        return loss.item()

    trainer = Engine(train_step)

    timer = Timer(average=True)
    timer.attach(trainer, step=Events.EPOCH_COMPLETED)
    ProgressBar(persist=True).attach(
        trainer, output_transform=lambda out: {"batch loss": out})

    metrics = {"Accuracy": Accuracy(), "Loss": Loss(criterion)}
    evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device, non_blocking=True)

    def log_metrics(engine, title):
        # Print every configured metric under the given heading.
        for name in metrics:
            print(f"\t{title} {name}: {engine.state.metrics[name]:.2f}")

    @trainer.on(Events.COMPLETED)
    def run_validation(_):
        print(f"- Mean elapsed time for 1 epoch: {timer.value()}")
        print("- Metrics:")
        # Attach the metric printer only for the duration of each run.
        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Train"):
            evaluator.run(eval_train_loader)
        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Test"):
            evaluator.run(test_loader)

    trainer.run(train_loader, max_epochs=max_epochs)
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_dir):
    """Train a classifier and log training state to TensorBoard.

    Wires up: per-epoch train/validation evaluation, batch-loss and metric
    logging, optimizer-parameter logging, weight/gradient scalar and
    histogram logging, and best-model checkpointing by validation accuracy.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    # NOTE(review): always true under Python 3 — likely a leftover py2 guard.
    if sys.version_info > (3, ):
        from ignite.contrib.metrics.gpu_info import GpuInfo
        try:
            # Attaches GPU memory/utilization metrics; needs pynvml.
            GpuInfo().attach(trainer)
        except RuntimeError:
            print(
                "INFO: By default, in this example it is possible to log GPU information (used memory, utilization). "
                "As there is no pynvml python package installed, GPU information won't be logged. Otherwise, please "
                "install it : `pip install pynvml`")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        # Evaluate on both splits each epoch; the tb handlers below read the
        # resulting engine metrics.
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    tb_logger = TensorboardLogger(log_dir=log_dir)

    # Batch loss (and any attached trainer metrics) every 100 iterations.
    tb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
        metric_names="all",
    )

    # Epoch-level metrics for both evaluators, indexed by trainer iteration.
    for tag, evaluator in [("training", train_evaluator),
                           ("validation", validation_evaluator)]:
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    tb_logger.attach_opt_params_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        optimizer=optimizer)

    tb_logger.attach(trainer,
                     log_handler=WeightsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED(every=100))
    # NOTE(review): histogram handlers fire every 100 *epochs* — with short
    # runs they never fire; confirm this is intended.
    tb_logger.attach(trainer,
                     log_handler=WeightsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED(every=100))
    tb_logger.attach(trainer,
                     log_handler=GradsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED(every=100))
    tb_logger.attach(trainer,
                     log_handler=GradsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED(every=100))

    def score_function(engine):
        # Higher validation accuracy == better checkpoint.
        return engine.state.metrics["accuracy"]

    model_checkpoint = ModelCheckpoint(
        log_dir,
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint,
                                           {"model": model})

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)
    tb_logger.close()
def fit_model_multiclass(model, train_loader, test_loader, lr, max_epochs=5, number_of_classes=2):
    """Train ``model`` for multi-class classification with ignite engines.

    Uses SGD + CrossEntropyLoss, evaluates accuracy/loss on the training set
    every epoch and on the test set every 10 epochs, and returns the trained
    model.

    Fixes over the previous revision: removed leftover debug ``print`` calls
    inside the metric output-transforms (they fired on every evaluation
    batch), and removed two dead helpers (``threshold_output_transform``,
    referenced only from commented-out code, and the no-op
    ``trainer_output_shape`` stub) along with the commented-out handler.

    Args:
        model: torch.nn.Module producing (N, C) logits.
        train_loader / test_loader: iterables of (inputs, targets) batches.
        lr: SGD learning rate.
        max_epochs: number of training epochs.
        number_of_classes: C, used to one-hot targets for the accuracy metric.
    """
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    def prepare_batch(batch, device, non_blocking):
        # Cast to long and drop the trailing target dim; assumes integer
        # class-index targets of shape (N, 1) or (N,) — TODO confirm callers.
        x, y = batch
        x = x.to(dtype=torch.long)
        y = y.to(dtype=torch.long)
        y = y.squeeze()
        return (x, y)

    def squeeze_y_dims(output):
        # Align prediction/target dtypes and shapes for the Loss metric.
        prediction, target = output
        return prediction.long(), target.squeeze().long()

    def correct_shape(output):
        # Turn (N, C) logits + (N,) targets into 0/1 (N, C) tensors so
        # Accuracy(is_multilabel=True) can compare them.
        y_pred, y = output
        one_hot_y = torch.nn.functional.one_hot(y,
                                                num_classes=number_of_classes)
        one_hot_y = one_hot_y.squeeze(1)
        argmax = y_pred.argmax(1)
        m = torch.zeros(y_pred.shape).scatter(1, argmax.unsqueeze(1), 1.0)
        return m, one_hot_y

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        prepare_batch=prepare_batch)
    val_metrics = {
        "accuracy": Accuracy(output_transform=correct_shape,
                             is_multilabel=True),
        "loss": Loss(criterion, output_transform=squeeze_y_dims),
    }
    evaluator = create_supervised_evaluator(model,
                                            metrics=val_metrics,
                                            prepare_batch=prepare_batch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        print(
            f"Training Results - Epoch: {trainer.state.epoch} Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}"
        )

    @trainer.on(Events.EPOCH_COMPLETED(every=10))
    def log_validation_results(trainer):
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        print(
            f"Validation Results - Epoch: {trainer.state.epoch} Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}"
        )

    trainer.run(train_loader, max_epochs=max_epochs)
    return model
def run(train_batch_size, val_batch_size, epochs, lr, momentum):
    """Train a classifier and log training state to Weights & Biases.

    Wires up: per-epoch train/validation evaluation, batch-loss and metric
    logging, optimizer-parameter logging, ``wandb.watch`` on the model, and
    best-model checkpointing (by validation accuracy) into the wandb run dir.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        # Evaluate both splits each epoch; wandb handlers below read results.
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    # Hyperparameters are recorded in the wandb run config.
    wandb_logger = WandBLogger(
        project="pytorch-ignite-integration",
        name="ignite-mnist-example",
        config={
            "train_batch_size": train_batch_size,
            "val_batch_size": val_batch_size,
            "epochs": epochs,
            "lr": lr,
            "momentum": momentum,
        },
    )

    # Batch loss every 100 iterations.
    wandb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )

    # Epoch-level metrics for both evaluators, indexed by trainer iteration so
    # all wandb series share the same x-axis.
    for tag, evaluator in [("training", train_evaluator),
                           ("validation", validation_evaluator)]:
        wandb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=lambda *_: trainer.state.iteration,
        )

    wandb_logger.attach_opt_params_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        optimizer=optimizer)

    # Log weights and gradients of the model itself.
    wandb_logger.watch(model, log="all")

    def score_function(engine):
        # Higher validation accuracy == better checkpoint.
        return engine.state.metrics["accuracy"]

    model_checkpoint = ModelCheckpoint(
        wandb_logger.run.dir,  # store checkpoints inside the wandb run dir
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint,
                                           {"model": model})

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)
    wandb_logger.close()
with autocast(): outputs = model(inputs.cuda()) loss = criterion(outputs, targets.cuda()) scaler.scale(loss).backward() scaler.step(optimizer) scaler.update() return loss.item() trainer = Engine(update) #trainer = create_supervised_trainer(model, optimizer, criterion, device='cuda') val_metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)} evaluator = create_supervised_evaluator(model, metrics=val_metrics, device='cuda') data = Data(x=X, y=y) loader = DataLoader(dataset=data, shuffle=True, batch_size=args.batch_size) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(trainer): evaluator.run(data=loader) metrics = evaluator.state.metrics print("Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(trainer.state.epoch, metrics["accuracy"], metrics["loss"]))
# model model = AttnCanAdcrowdNet() model = model.to(device) # loss function loss_fn = nn.MSELoss(size_average=False).to(device) optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.decay) trainer = create_supervised_trainer(model, optimizer, loss_fn, device=device) evaluator = create_supervised_evaluator(model, metrics={ 'mae': CrowdCountingMeanAbsoluteError(), 'mse': CrowdCountingMeanSquaredError(), 'nll': Loss(loss_fn) }, device=device) print(model) print(args) @trainer.on(Events.ITERATION_COMPLETED(every=50)) def log_training_loss(trainer): timestamp = get_readable_time() print(timestamp + " Epoch[{}] Loss: {:.2f}".format(trainer.state.epoch, trainer.state.output)) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(trainer):
writer = SummaryWriter(log_dir=log_dir) # Instantiate the model class object model = MF(n_user, n_item, k=k, c_kld=c_kld, c_bias=c_bias, writer=writer) # Use Adam optimizer optimizer = torch.optim.Adam(model.parameters(), lr=lr) # Create a supervised trainer trainer = create_supervised_trainer(model, optimizer, model.loss) # Use Mean Squared Error as evaluation metric metrics = {'evaluation': MeanSquaredError()} # Create a supervised evaluator evaluator = create_supervised_evaluator(model, metrics=metrics) # Load the train and test data train_loader = Loader(train_x, train_y, batchsize=1024) test_loader = Loader(test_x, test_y, batchsize=1024) def log_training_loss(engine, log_interval=500): """ Function to log the training loss """ model.itr = engine.state.iteration # Keep track of iterations if model.itr % log_interval == 0: fmt = "Epoch[{}] Iteration[{}/{}] Loss: {:.2f}" # Keep track of epochs and outputs msg = fmt.format(engine.state.epoch, engine.state.iteration,
trainer = create_supervised_trainer(model, optimizer, criterion, device=device) metrics = { "accuracy": Accuracy(), "MAE": MeanAbsoluteError( output_transform=lambda out: (torch.max(out[0], dim=1)[1], out[1])), "MSE": MeanSquaredError( output_transform=lambda out: (torch.max(out[0], dim=1)[1], out[1])), "loss": Loss(loss_fn=criterion) } evaluator = create_supervised_evaluator(model, metrics=metrics, device=device) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(trainer): print( f"Training (Epoch {trainer.state.epoch}): {trainer.state.output:.3f}") best_epoch = 0 best_val_metrics = {"MAE": np.inf} best_test_metrics = {"MAE": np.inf} @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(trainer):
def run(epochs, lr, momentum, log_interval):
    """Train ``Net`` on CIFAR-style loaders with tqdm progress and TB logging.

    Relies on module-level state not visible here: ``tb_logger``, ``params``,
    ``trainloader``, ``testloader``, ``predictions_gt_images_handler`` and
    ``task`` — presumably created by the surrounding script; verify at call
    site. Saves the trained weights to ``./cifar_net.pth`` on completion.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    net = Net().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum)

    trainer = create_supervised_trainer(net, optimizer, criterion,
                                        device=device)
    trainer.logger = setup_logger("trainer")

    val_metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
        "recall": Recall()
    }
    evaluator = create_supervised_evaluator(net,
                                            metrics=val_metrics,
                                            device=device)
    evaluator.logger = setup_logger("evaluator")

    # Attach handler to plot trainer's loss every 100 iterations
    tb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=params.get("loss_report")),
        tag="training",
        output_transform=lambda loss: {"loss": loss},
    )

    # Attach handler to dump evaluator's metrics every epoch completed
    # NOTE(review): the loop variable rebinds `evaluator`; after the loop it
    # still names the validation evaluator, which the image handler below
    # then uses — confirm this shadowing is intentional.
    for tag, evaluator in [("training", trainer), ("validation", evaluator)]:
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names="all",
            global_step_transform=global_step_from_engine(trainer),
        )

    # Attach function to build debug images and report every epoch end
    tb_logger.attach(
        evaluator,
        log_handler=predictions_gt_images_handler,
        event_name=Events.EPOCH_COMPLETED(once=1),
    )

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(trainloader),
                desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        pbar.desc = desc.format(engine.state.output)
        pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(trainloader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["loss"]
        tqdm.write(
            "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(testloader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["loss"]
        tqdm.write(
            "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        # Rewind the shared progress bar for the next epoch.
        pbar.n = pbar.last_print_n = 0

    @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
    def log_time():
        # Zero-arg handler: reads timing info straight off the trainer.
        tqdm.write("{} took {} seconds".format(
            trainer.last_event_name.name,
            trainer.state.times[trainer.last_event_name.name],
        ))

    trainer.run(trainloader, max_epochs=epochs)
    pbar.close()

    PATH = "./cifar_net.pth"
    torch.save(net.state_dict(), PATH)

    print("Finished Training")
    print("Task ID number is: {}".format(task.id))
def run(train_batch_size, epochs, lr, weight_decay, model_name, config, exp_id,
        log_dir, trained_model_file, save_result_file, disable_gpu=False):
    """Train a CNNIQA-family image-quality model and log results.

    Tracks the best checkpoint by validation SROCC (saved to
    ``trained_model_file``), optionally evaluates the test split during and
    after training, and writes final test metrics to ``save_result_file``.
    Uses module-level globals ``best_criterion`` / ``best_epoch`` to share
    state between the epoch handlers.
    """
    if config['test_ratio']:
        train_loader, val_loader, test_loader = get_data_loaders(
            config, train_batch_size, exp_id)
    else:
        train_loader, val_loader = get_data_loaders(config, train_batch_size,
                                                    exp_id)

    device = torch.device(
        "cuda" if not disable_gpu and torch.cuda.is_available() else "cpu")

    # NOTE(review): both 'CNNIQAplus' and 'CNNIQA' map to CNNIQAplusnet here —
    # confirm plain 'CNNIQA' is really meant to use the "plus" network.
    if model_name == 'CNNIQAplus' or model_name == 'CNNIQA':
        model = CNNIQAplusnet(n_distortions=config['n_distortions'],
                              ker_size=config['kernel_size'],
                              n_kers=config['n_kernels'],
                              n1_nodes=config['n1_nodes'],
                              n2_nodes=config['n2_nodes'])
    else:
        model = CNNIQAplusplusnet(n_distortions=config['n_distortions'],
                                  ker_size=config['kernel_size'],
                                  n1_kers=config['n1_kernels'],
                                  pool_size=config['pool_size'],
                                  n2_kers=config['n2_kernels'],
                                  n1_nodes=config['n1_nodes'],
                                  n2_nodes=config['n2_nodes'])

    writer = SummaryWriter(log_dir=log_dir)
    model = model.to(device)
    print(model)
    # if multi_gpu and torch.cuda.device_count() > 1:
    #     model = nn.DataParallel(model)

    optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    global best_criterion
    best_criterion = -1  # SROCC>=-1
    trainer = create_supervised_trainer(model, optimizer, loss_fn,
                                        device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'IQA_performance':
                                                IQAPerformance(),
                                                'IDC_performance':
                                                IDCPerformance()
                                            },
                                            device=device)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        writer.add_scalar("training/loss", engine.state.output,
                          engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        # Validate, log all IQA metrics, and checkpoint on best SROCC so far.
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        SROCC, KROCC, PLCC, RMSE, MAE, OR = metrics['IQA_performance']
        Acc = metrics['IDC_performance']
        print(
            "Validation Results - Epoch: {} Acc: {:.2f}% SROCC: {:.4f} KROCC: {:.4f} PLCC: {:.4f} RMSE: {:.4f} MAE: {:.4f} OR: {:.2f}%"
            .format(engine.state.epoch, 100 * Acc, SROCC, KROCC, PLCC, RMSE,
                    MAE, 100 * OR))
        writer.add_scalar("validation/SROCC", SROCC, engine.state.epoch)
        writer.add_scalar("validation/KROCC", KROCC, engine.state.epoch)
        writer.add_scalar("validation/PLCC", PLCC, engine.state.epoch)
        writer.add_scalar("validation/RMSE", RMSE, engine.state.epoch)
        writer.add_scalar("validation/MAE", MAE, engine.state.epoch)
        writer.add_scalar("validation/OR", OR, engine.state.epoch)
        writer.add_scalar("validation/Acc", Acc, engine.state.epoch)
        global best_criterion
        global best_epoch
        if SROCC > best_criterion:
            best_criterion = SROCC
            best_epoch = engine.state.epoch
            torch.save(model.state_dict(), trained_model_file)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_testing_results(engine):
        # Optional per-epoch test-set evaluation, gated by config.
        if config["test_ratio"] > 0 and config['test_during_training']:
            evaluator.run(test_loader)
            metrics = evaluator.state.metrics
            SROCC, KROCC, PLCC, RMSE, MAE, OR = metrics['IQA_performance']
            Acc = metrics['IDC_performance']
            print(
                "Testing Results - Epoch: {} Acc: {:.2f}% SROCC: {:.4f} KROCC: {:.4f} PLCC: {:.4f} RMSE: {:.4f} MAE: {:.4f} OR: {:.2f}%"
                .format(engine.state.epoch, 100 * Acc, SROCC, KROCC, PLCC,
                        RMSE, MAE, 100 * OR))
            writer.add_scalar("testing/SROCC", SROCC, engine.state.epoch)
            writer.add_scalar("testing/KROCC", KROCC, engine.state.epoch)
            writer.add_scalar("testing/PLCC", PLCC, engine.state.epoch)
            writer.add_scalar("testing/RMSE", RMSE, engine.state.epoch)
            writer.add_scalar("testing/MAE", MAE, engine.state.epoch)
            writer.add_scalar("testing/OR", OR, engine.state.epoch)
            writer.add_scalar("testing/Acc", Acc, engine.state.epoch)

    @trainer.on(Events.COMPLETED)
    def final_testing_results(engine):
        # Reload the best checkpoint and report/persist final test metrics.
        if config["test_ratio"] > 0:
            model.load_state_dict(torch.load(trained_model_file))
            evaluator.run(test_loader)
            metrics = evaluator.state.metrics
            SROCC, KROCC, PLCC, RMSE, MAE, OR = metrics['IQA_performance']
            Acc = metrics['IDC_performance']
            global best_epoch
            print(
                "Final Test Results - Epoch: {} Acc: {:.2f}% SROCC: {:.4f} KROCC: {:.4f} PLCC: {:.4f} RMSE: {:.4f} MAE: {:.4f} OR: {:.2f}%"
                .format(best_epoch, 100 * Acc, SROCC, KROCC, PLCC, RMSE, MAE,
                        100 * OR))
            np.save(save_result_file, (Acc, SROCC, KROCC, PLCC, RMSE, MAE, OR))

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)
    writer.close()