def test_integration():
    """Integration test: RunningAverage metrics attached to an Engine must
    match a manually maintained exponential moving average (EMA) of both
    the per-batch accuracy and the raw loss output, across two back-to-back
    runs (which also exercises metric reset between runs).
    """
    n_iters = 100
    batch_size = 10
    n_classes = 10
    # Data iterators consumed by update_fn below; they are rebound to fresh
    # iterators before each trainer.run() call.
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    loss_values = iter(range(n_iters))

    def update_fn(engine, batch):
        # Engine output is the tuple (loss, y_pred, y_true); the metric
        # output_transforms below pick the pieces they need out of it.
        loss_value = next(loss_values)
        y_true_batch = next(y_true_batch_values)
        y_pred_batch = next(y_pred_batch_values)
        return (
            loss_value,
            torch.from_numpy(y_pred_batch),
            torch.from_numpy(y_true_batch),
        )

    trainer = Engine(update_fn)
    alpha = 0.98  # EMA smoothing factor, shared by the metric and the manual baseline
    acc_metric = RunningAverage(
        Accuracy(output_transform=lambda x: [x[1], x[2]]), alpha=alpha)
    acc_metric.attach(trainer, "running_avg_accuracy")
    avg_output = RunningAverage(output_transform=lambda x: x[0], alpha=alpha)
    avg_output.attach(trainer, "running_avg_output")
    # One-element list so the nested handler can mutate the accumulator
    # through the closure.
    running_avg_acc = [
        None,
    ]

    @trainer.on(Events.ITERATION_COMPLETED)
    def manual_running_avg_acc(engine):
        # Recompute the batch accuracy by hand and fold it into the EMA.
        _, y_pred, y = engine.state.output
        indices = torch.max(y_pred, 1)[1]
        correct = torch.eq(indices, y).view(-1)
        num_correct = torch.sum(correct).item()
        num_examples = correct.shape[0]
        batch_acc = num_correct * 1.0 / num_examples
        if running_avg_acc[0] is None:
            running_avg_acc[0] = batch_acc
        else:
            running_avg_acc[0] = running_avg_acc[0] * alpha + (
                1.0 - alpha) * batch_acc
        engine.state.running_avg_acc = running_avg_acc[0]

    @trainer.on(Events.EPOCH_STARTED)
    def running_avg_output_init(engine):
        # The manual EMA of the raw loss output resets at every epoch start.
        engine.state.running_avg_output = None

    @trainer.on(Events.ITERATION_COMPLETED)
    def running_avg_output_update(engine):
        if engine.state.running_avg_output is None:
            engine.state.running_avg_output = engine.state.output[0]
        else:
            engine.state.running_avg_output = (
                engine.state.running_avg_output * alpha + (
                    1.0 - alpha) * engine.state.output[0])

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_acc_values(engine):
        # Exact float equality is expected here because both sides apply the
        # same arithmetic in the same order every iteration.
        assert (engine.state.running_avg_acc == engine.state.
                metrics["running_avg_accuracy"]), "{} vs {}".format(
                    engine.state.running_avg_acc,
                    engine.state.metrics["running_avg_accuracy"])

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_output_values(engine):
        assert (engine.state.running_avg_output ==
                engine.state.metrics["running_avg_output"]), "{} vs {}".format(
                    engine.state.running_avg_output,
                    engine.state.metrics["running_avg_output"])

    np.random.seed(10)
    # First run: rebind accumulator and data iterators, then run one epoch.
    running_avg_acc = [
        None,
    ]
    n_iters = 10
    batch_size = 10
    n_classes = 10
    data = list(range(n_iters))
    loss_values = iter(range(n_iters))
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    trainer.run(data, max_epochs=1)

    # Second run: identical setup again, to verify the attached metrics
    # reset correctly between runs.
    running_avg_acc = [
        None,
    ]
    n_iters = 10
    batch_size = 10
    n_classes = 10
    data = list(range(n_iters))
    loss_values = iter(range(n_iters))
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    trainer.run(data, max_epochs=1)
def main(dataset_path, batch_size=256, max_epochs=10):
    """Train wide_resnet50_2 (100 classes) on the dataset at *dataset_path*
    using native torch.cuda.amp mixed precision, then print train/test
    metrics and the mean per-epoch time.

    Requires an available CUDA device with the cudnn backend enabled.
    """
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    device = "cuda"
    train_loader, test_loader, eval_train_loader = get_train_eval_loaders(dataset_path, batch_size=batch_size)

    model = wide_resnet50_2(num_classes=100).to(device)
    optimizer = SGD(model.parameters(), lr=0.01)
    criterion = CrossEntropyLoss().to(device)
    # Loss scaler for mixed-precision training (prevents fp16 gradient underflow).
    scaler = GradScaler()

    def train_step(engine, batch):
        x = convert_tensor(batch[0], device, non_blocking=True)
        y = convert_tensor(batch[1], device, non_blocking=True)
        optimizer.zero_grad()
        # Runs the forward pass with autocasting.
        with autocast():
            y_pred = model(x)
            loss = criterion(y_pred, y)
        # Scales loss. Calls backward() on scaled loss to create scaled gradients.
        # Backward passes under autocast are not recommended.
        # Backward ops run in the same precision that autocast used for corresponding forward ops.
        scaler.scale(loss).backward()
        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)
        # Updates the scale for next iteration.
        scaler.update()
        return loss.item()

    trainer = Engine(train_step)
    timer = Timer(average=True)
    timer.attach(trainer, step=Events.EPOCH_COMPLETED)
    ProgressBar(persist=True).attach(trainer, output_transform=lambda out: {"batch loss": out})

    metrics = {"Accuracy": Accuracy(), "Loss": Loss(criterion)}
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def log_metrics(engine, title):
        # Print each computed metric prefixed with the dataset split title.
        for name in metrics:
            print(f"\t{title} {name}: {engine.state.metrics[name]:.2f}")

    @trainer.on(Events.COMPLETED)
    def run_validation(_):
        print(f"- Mean elapsed time for 1 epoch: {timer.value()}")
        print("- Metrics:")
        # Temporarily attach the metric logger for each evaluation run
        # (add_event_handler is used as a context manager here).
        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Train"):
            evaluator.run(eval_train_loader)
        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Test"):
            evaluator.run(test_loader)

    trainer.run(train_loader, max_epochs=max_epochs)
def fit_model_multiclass(model, train_loader, test_loader, lr, max_epochs=5, number_of_classes=2):
    """Train *model* for multi-class classification with SGD + cross-entropy.

    Training metrics are printed after every epoch; validation metrics after
    every 10th epoch.

    Cleanup vs. the previous revision: removed leftover debug ``print``
    statements, two unused helpers (``threshold_output_transform`` and the
    incomplete ``trainer_output_shape``), and dead commented-out code. No
    functional training logic changed.

    Parameters
    ----------
    model : torch.nn.Module
    train_loader, test_loader : iterables yielding (x, y) batches
    lr : float
        SGD learning rate.
    max_epochs : int
    number_of_classes : int
        Number of classes used for one-hot encoding in the accuracy transform.

    Returns
    -------
    torch.nn.Module
        The trained model (same object, trained in place).
    """
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    def prepare_batch(batch, device, non_blocking):
        # Cast inputs and targets to int64 and drop the trailing target
        # dimension so CrossEntropyLoss receives plain class indices.
        # NOTE(review): y.squeeze() on a batch of size 1 would also drop the
        # batch dimension -- confirm loaders never yield single-item batches.
        x, y = batch
        x = x.to(dtype=torch.long)
        y = y.to(dtype=torch.long)
        y = y.squeeze()
        return (x, y)

    def squeeze_y_dims(output):
        # Align prediction/target dtypes and shapes for the Loss metric.
        # NOTE(review): casting predictions with .long() truncates logits to
        # integers -- confirm this is intended before reusing elsewhere.
        prediction, target = output
        return prediction.long(), target.squeeze().long()

    def correct_shape(output):
        # Convert logits to a one-hot argmax matrix and targets to one-hot
        # vectors, the shapes Accuracy(is_multilabel=True) expects.
        y_pred, y = output
        one_hot_y = torch.nn.functional.one_hot(y, num_classes=number_of_classes)
        one_hot_y = one_hot_y.squeeze(1)
        argmax = y_pred.argmax(1)
        m = torch.zeros(y_pred.shape).scatter(1, argmax.unsqueeze(1), 1.0)
        return m, one_hot_y

    trainer = create_supervised_trainer(model, optimizer, criterion, prepare_batch=prepare_batch)

    val_metrics = {
        "accuracy": Accuracy(output_transform=correct_shape, is_multilabel=True),
        "loss": Loss(criterion, output_transform=squeeze_y_dims),
    }
    evaluator = create_supervised_evaluator(model, metrics=val_metrics, prepare_batch=prepare_batch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        # Evaluate on the training set after each epoch.
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        print(
            f"Training Results - Epoch: {trainer.state.epoch} Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}"
        )

    @trainer.on(Events.EPOCH_COMPLETED(every=10))
    def log_validation_results(trainer):
        # Evaluate on the held-out set every 10 epochs.
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        print(
            f"Validation Results - Epoch: {trainer.state.epoch} Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}"
        )

    trainer.run(train_loader, max_epochs=max_epochs)
    return model
def run_inference_test(root_dir, model_file, device=torch.device("cuda:0")):
    """Run sliding-window inference with a saved 3D UNet over the NIfTI
    image/segmentation pairs found in *root_dir*, saving predicted
    segmentations to disk and returning the best mean-Dice metric value.
    """
    # Pair up image and segmentation files by sorted order.
    image_paths = sorted(glob(os.path.join(root_dir, "im*.nii.gz")))
    seg_paths = sorted(glob(os.path.join(root_dir, "seg*.nii.gz")))
    data_dicts = [{
        "image": image_path,
        "label": seg_path
    } for image_path, seg_path in zip(image_paths, seg_paths)]

    # Pre-processing pipeline applied to both image and label volumes.
    pre_transforms = Compose([
        LoadNiftid(keys=["image", "label"]),
        AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
        ScaleIntensityd(keys=["image", "label"]),
        ToTensord(keys=["image", "label"]),
    ])

    dataset = monai.data.Dataset(data=data_dicts, transform=pre_transforms)
    loader = monai.data.DataLoader(dataset, batch_size=1, num_workers=4)

    # Single-channel 3D UNet matching the checkpoint being loaded.
    unet = monai.networks.nets.UNet(
        dimensions=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)

    # Post-processing: sigmoid -> threshold -> keep largest component.
    post_transforms = Compose([
        Activationsd(keys="pred", sigmoid=True),
        AsDiscreted(keys="pred", threshold_values=True),
        KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
    ])

    handlers = [
        StatsHandler(output_transform=lambda x: None),
        CheckpointLoader(load_path=f"{model_file}", load_dict={"net": unet}),
        SegmentationSaver(
            output_dir=root_dir,
            batch_transform=lambda batch: batch["image_meta_dict"],
            output_transform=lambda output: output["pred"],
        ),
    ]

    evaluator = SupervisedEvaluator(
        device=device,
        val_data_loader=loader,
        network=unet,
        inferer=SlidingWindowInferer(roi_size=(96, 96, 96), sw_batch_size=4, overlap=0.5),
        post_transform=post_transforms,
        key_val_metric={
            "val_mean_dice": MeanDice(include_background=True, output_transform=lambda x: (x["pred"], x["label"]))
        },
        additional_metrics={
            "val_acc": Accuracy(output_transform=lambda x: (x["pred"], x["label"]))
        },
        val_handlers=handlers,
    )
    evaluator.run()
    return evaluator.state.best_metric
def train():
    """End-to-end fine-tuning of a BERT double-heads model (language modeling
    + multiple choice) driven by a JSON config file, with optional Apex fp16
    and distributed (NCCL) training.
    """
    # NOTE: an earlier argparse-based CLI (dataset path/cache, model
    # checkpoint, batch sizes, lr/warmup, fp16 opt level, local_rank, etc.)
    # was replaced by the JSON config file below; the commented-out parser
    # definitions were removed from this block.
    config_file = "configs/train_using_bert_config.json"
    config = Config.from_json_file(config_file)

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", config.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(config))

    # Initialize distributed training if needed
    config.distributed = (config.local_rank != -1)
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        config.device = torch.device("cuda", config.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )
    tokenizer = BertTokenizer.from_pretrained(
        config.model_checkpoint, do_lower_case=config.do_lower_case)
    model = BertDoubleHeadsModel.from_pretrained(config.model_checkpoint)
    model.to(config.device)
    optimizer = BertAdam(model.parameters(),
                         lr=config.lr,
                         warmup=config.warmup_proportion)

    # Prepare model for FP16 and distributed training if needed
    # (order is important, distributed should be the last)
    if config.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16)
    if config.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[config.local_rank],
                                        output_device=config.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        config, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(config.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, input_mask = batch
        lm_loss, mc_loss = model(input_ids, mc_token_ids, input_mask, lm_labels,
                                 mc_labels, token_type_ids)
        # Weighted combination of both heads' losses, scaled down so that
        # accumulated gradients average over accumulation steps.
        loss = (lm_loss * config.lm_coef +
                mc_loss * config.mc_coef) / config.gradient_accumulation_steps
        if config.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           config.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm)
        # Only step/zero the optimizer every gradient_accumulation_steps iterations.
        if engine.state.iteration % config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(config.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, input_mask = batch
            model_outputs = model(input_ids,
                                  mc_token_ids,
                                  input_mask,
                                  token_type_ids=token_type_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[1]
            # Shift logits/labels by one position for next-token prediction.
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted,
                    mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if config.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if config.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if config.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, config.lr),
                                 (config.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
             output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy":
        Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], config),
        "average_accuracy":
        MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save
    # model, configuration and tokenizer before we start to train
    if config.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))
        tb_logger = TensorboardLogger(log_dir=config.log_dir)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)
        checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" take care of distributed encapsulation
        torch.save(config, tb_logger.writer.log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=config.n_epochs)

    # On the main process: close tensorboard logger and rename the last
    # checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if config.local_rank in [-1, 0] and config.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
def training(local_rank, config):
    """Per-process CIFAR10 training entry point (run under ignite.distributed).

    Sets up seeded dataflow/model/optimizer, a trainer plus two evaluators
    (train and test), periodic validation, TensorBoard logging and
    best-checkpoint saving on rank 0, optional ClearML tracking, and an
    optional early stop at ``config["stop_iteration"]``.

    Fixes vs. the previous revision:
    - ``output_path.mkdir(parents=True, exist_ok=True)`` replaces the
      ``exists()`` check, removing the check-then-create race.
    - bare ``raise`` instead of ``raise e`` to re-raise without appending
      an extra frame to the traceback.
    """
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)  # distinct but reproducible seed per rank
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)
    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        # Build a run-specific output folder name.
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        # exist_ok avoids a race between the exists() check and mkdir().
        output_path.mkdir(parents=True, exist_ok=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)
    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they wont have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_evaluator(model, metrics=metrics, config=config)
    train_evaluator = create_evaluator(model, metrics=metrics, config=config)

    def run_validation(engine):
        # Evaluate on both splits and log elapsed time + metric values.
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED,
        run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        # - Training metrics, e.g. running average loss values
        # - Learning rate
        # - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer,
                                            evaluators=evaluators)

        # Store 2 best models by validation accuracy starting from num_epochs / 2:
        best_model_handler = Checkpoint(
            {"model": model},
            get_save_handler(config),
            filename_prefix="best",
            n_saved=2,
            global_step_transform=global_step_from_engine(trainer),
            score_name="test_accuracy",
            score_function=Checkpoint.get_default_score_fn("Accuracy"),
        )
        evaluator.add_event_handler(
            Events.COMPLETED(
                lambda *_: trainer.state.epoch > config["num_epochs"] // 2),
            best_model_handler)

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(
                f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        # Log the full traceback, then propagate the original exception.
        logger.exception("")
        raise

    if rank == 0:
        tb_logger.close()
def training(local_rank, config):
    """Per-process ImageNet training entry point (run under ignite.distributed).

    Sets up seeded dataflow/model/optimizer, a trainer and two evaluators,
    periodic validation, rank-0 TensorBoard logging and best-model
    checkpointing, optional early stop, and a periodic console loss print.
    """
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)  # distinct but reproducible seed per rank
    device = idist.device()

    logger = setup_logger(name="ImageNet-Training", distributed_rank=local_rank)
    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        # Build a run-specific output folder name.
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = "stop-on-{}".format(config["stop_iteration"])

        folder_name = "{}_backend-{}-{}_{}".format(config["model"], idist.backend(),
                                                   idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_imagenet_dataloader(config)
    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    # NOTE(review): this argument list (lr_scheduler, sampler, config, logger)
    # does not match ignite's stock create_supervised_trainer signature --
    # presumably a project-local helper shadows it; verify.
    trainer = create_supervised_trainer(model, optimizer, criterion, lr_scheduler,
                                        train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }

    # We define two evaluators as they wont have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        # Evaluate on both splits and log elapsed time + metric values.
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED,
        run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        # - Training metrics, e.g. running average loss values
        # - Learning rate
        # - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer,
                                            evaluators=evaluators)

        # Store 3 best models by validation accuracy:
        common.gen_save_best_models_by_val_score(
            save_handler=get_save_handler(config),
            evaluator=evaluator,
            models={"model": model},
            metric_name="accuracy",
            n_saved=3,
            trainer=trainer,
            tag="test",
        )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info("Stop training on {} iteration".format(
                trainer.state.iteration))
            trainer.terminate()

    @trainer.on(Events.ITERATION_COMPLETED(every=20))
    def print_acc(engine):
        if rank == 0:
            # NOTE(review): nothing in this function sets
            # engine.state.saved_batch_loss -- unless the (project-local)
            # trainer factory populates it, this raises AttributeError on
            # iteration 20; confirm against create_supervised_trainer above.
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}"\
                  .format(engine.state.epoch, engine.state.iteration,
                          len(train_loader), engine.state.saved_batch_loss))

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        # NOTE(review): the exception is swallowed after printing the
        # traceback, so a failed run exits "successfully" -- confirm this
        # best-effort behaviour is intended.
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_dir):
    """Train ``Net`` with SGD + cross-entropy, logging batch loss, metrics,
    optimizer params, weights and gradients to TensorBoard, and keeping the
    two best checkpoints by validation accuracy.

    Fix vs. the previous revision: removed the obsolete
    ``sys.version_info > (3,)`` guard around the GpuInfo import -- it is
    always true on any Python 3 interpreter, so behavior is unchanged.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    trainer.logger = setup_logger("Trainer")

    # GPU utilisation logging needs pynvml; degrade gracefully without it.
    from ignite.contrib.metrics.gpu_info import GpuInfo

    try:
        GpuInfo().attach(trainer)
    except RuntimeError:
        print(
            "INFO: By default, in this example it is possible to log GPU information (used memory, utilization). "
            "As there is no pynvml python package installed, GPU information won't be logged. Otherwise, please "
            "install it : `pip install pynvml`")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        # Refresh metrics on both splits after every training epoch.
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    tb_logger = TensorboardLogger(log_dir=log_dir)

    # Batch loss (and all attached metrics) from the trainer every 100 iterations.
    tb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
        metric_names="all",
    )

    # Per-epoch metrics from both evaluators, indexed by trainer step.
    for tag, evaluator in [("training", train_evaluator), ("validation", validation_evaluator)]:
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    tb_logger.attach_opt_params_handler(
        trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer)

    tb_logger.attach(trainer,
                     log_handler=WeightsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED(every=100))

    # NOTE(review): the histogram handlers fire every 100 *epochs* while the
    # scalar handlers fire every 100 iterations -- confirm this asymmetry is
    # intentional.
    tb_logger.attach(trainer,
                     log_handler=WeightsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED(every=100))

    tb_logger.attach(trainer,
                     log_handler=GradsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED(every=100))

    tb_logger.attach(trainer,
                     log_handler=GradsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED(every=100))

    def score_function(engine):
        # Checkpoint ranking key: validation accuracy.
        return engine.state.metrics["accuracy"]

    model_checkpoint = ModelCheckpoint(
        log_dir,
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint, {"model": model})

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    tb_logger.close()
def metrics_estimating(tester, criterion):
    """Attach the evaluation metrics (accuracy, recall, precision, loss)
    to the *tester* engine under their usual names.
    """
    transform = thresholded_output_transform
    # Metric instances keyed by the name they are attached under; dict
    # insertion order preserves the original attachment order.
    metric_table = {
        'accuracy': Accuracy(output_transform=transform),
        'recall': Recall(output_transform=transform, average=True),
        'precision': Precision(output_transform=transform, average=True),
        'loss': Loss(criterion),
    }
    for alias, metric in metric_table.items():
        metric.attach(tester, alias)
def run(self, train_loader, val_loader, test_loader):
    """Perform model training and evaluation on holdout dataset.

    Trains ``self.trainer`` on *train_loader*, validating after every epoch
    when *val_loader* is non-empty (with early stopping / checkpointing), and
    runs a final test evaluation from the best checkpoint on completion.
    """
    ## attach certain metrics to trainer ##
    CpuInfo().attach(self.trainer, "cpu_util")
    Loss(self.loss).attach(self.trainer, "loss")

    ###### configure evaluator settings ######
    def get_output_transform(target: str, collapse_y: bool = False):
        # Late-bound transform so each metric can pick node/graph outputs.
        return lambda out: metric_output_transform(
            out, self.loss, target, collapse_y=collapse_y)

    graph_num_classes = len(self.graph_classes)
    node_num_classes = len(self.node_classes)
    # Binary node task is treated as 2 classes (e.g. for the confusion matrix).
    node_num_classes = 2 if node_num_classes == 1 else node_num_classes

    node_output_transform = get_output_transform("node")
    node_output_transform_collapsed = get_output_transform("node", collapse_y=True)
    graph_output_transform = get_output_transform("graph")
    graph_output_transform_collapsed = get_output_transform(
        "graph", collapse_y=True)

    # metrics we are interested in
    base_metrics: dict = {
        'loss': Loss(self.loss),
        "cpu_util": CpuInfo(),
        'node_accuracy_avg': Accuracy(output_transform=node_output_transform, is_multilabel=False),
        'node_accuracy': LabelwiseAccuracy(output_transform=node_output_transform, is_multilabel=False),
        "node_recall": Recall(output_transform=node_output_transform_collapsed, is_multilabel=False, average=False),
        "node_precision": Precision(output_transform=node_output_transform_collapsed, is_multilabel=False, average=False),
        "node_f1_score": Fbeta(1, output_transform=node_output_transform_collapsed, average=False),
        "node_c_matrix": ConfusionMatrix(node_num_classes, output_transform=node_output_transform_collapsed, average=None)
    }
    metrics = dict(**base_metrics)

    # settings for the evaluator (metrics sorted by name for stable ordering)
    evaluator_settings = {
        "device": self.device,
        "loss_fn": self.loss,
        "node_classes": self.node_classes,
        "graph_classes": self.graph_classes,
        "non_blocking": True,
        "metrics": OrderedDict(sorted(metrics.items(), key=lambda m: m[0])),
        "pred_collector_function": self._pred_collector_function
    }

    ## configure evaluators ##
    val_evaluator = None
    if len(val_loader):
        val_evaluator = create_supervised_evaluator(
            self.model, **evaluator_settings)
        # configure behavior for early stopping
        if self.stopper:
            val_evaluator.add_event_handler(Events.COMPLETED, self.stopper)
        # configure behavior for checkpoint saving
        val_evaluator.add_event_handler(Events.COMPLETED, self.best_checkpoint_handler)
        val_evaluator.add_event_handler(Events.COMPLETED, self.latest_checkpoint_handler)
    else:
        # No validation data: only keep the latest checkpoint after training.
        self.trainer.add_event_handler(Events.COMPLETED, self.latest_checkpoint_handler)

    test_evaluator = None
    if len(test_loader):
        test_evaluator = create_supervised_evaluator(
            self.model, **evaluator_settings)

    #############################

    @self.trainer.on(Events.STARTED)
    def log_training_start(trainer):
        self.custom_print("Start training...")

    @self.trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(trainer):
        """Compute evaluation metric values after each epoch."""
        epoch = trainer.state.epoch
        self.custom_print(f"Finished epoch {epoch:03d}!")
        if len(val_loader):
            self.persist_collection = True
            val_evaluator.run(val_loader)
            self._save_collected_predictions(
                prefix=f"validation_epoch{epoch:03}")
            # write metrics to file
            self.write_metrics(trainer, val_evaluator, suffix="validation")

    @self.trainer.on(Events.COMPLETED)
    def log_training_complete(trainer):
        """Trigger evaluation on test set if training is completed."""
        epoch = trainer.state.epoch
        suffix = "(Early Stopping)" if epoch < self.epochs else ""
        self.custom_print("Finished after {:03d} epochs! {}".format(
            epoch, suffix))

        # load best model for evaluation; fall back to the latest checkpoint
        # when no "best" checkpoint was ever produced
        self.custom_print("Load best model for final evaluation...")
        last_checkpoint: str = self.best_checkpoint_handler.last_checkpoint or self.latest_checkpoint_handler.last_checkpoint
        best_checkpoint_path = os.path.join(self.save_path, last_checkpoint)
        checkpoint_path_dict: dict = {
            "latest_checkpoint_path": best_checkpoint_path  # we want to load states from the best checkpoint as "latest" configuration for testing
        }
        self.model, self.optimizer, self.trainer, _, _, _ = self._load_checkpoint(
            self.model,
            self.optimizer,
            self.trainer,
            None,
            None,
            None,
            checkpoint_path_dict=checkpoint_path_dict)

        if len(test_loader):
            self.persist_collection = True
            test_evaluator.run(test_loader)
            self._save_collected_predictions(prefix="test_final")
            # write metrics to file
            self.write_metrics(trainer, test_evaluator, suffix="test")

    # terminate training if Nan values are produced
    self.trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan())

    # start the actual training
    self.custom_print(f"Train for a maximum of {self.epochs} epochs...")
    self.trainer.run(train_loader, max_epochs=self.epochs)
def training(local_rank, config, logger=None):
    """Per-process training entry point (distributed via ignite's idist).

    Builds model/optimizer/criterion from *config*, trains with periodic
    validation, early stopping and best-model checkpointing; rank 0 also
    sets up TensorBoard and experiment-tracking logging.

    Args:
        local_rank: local process rank (used to diversify the RNG seed).
        config: namespace-like object carrying loaders, hyper-parameters and
            optional flags (``use_fp16``, ``benchmark_dataflow``, ...).
        logger: optional logger passed through to metric logging helpers.
    """
    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    torch.backends.cudnn.benchmark = True

    # Different seed per process so data augmentation streams differ.
    set_seed(config.seed + local_rank)

    train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader

    # Setup model, optimizer, criterion
    model, optimizer, criterion = initialize(config)

    if not hasattr(config, "prepare_batch"):
        config.prepare_batch = _prepare_batch

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger)

    if getattr(config, "benchmark_dataflow", False):
        benchmark_dataflow_num_iters = getattr(config, "benchmark_dataflow_num_iters", 1000)
        DataflowBenchmark(benchmark_dataflow_num_iters, prepare_batch=config.prepare_batch).attach(
            trainer, train_loader)

    # Setup evaluators
    val_metrics = {
        "Accuracy": Accuracy(),
        "Top-5 Accuracy": TopKCategoricalAccuracy(k=5),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator, train_evaluator = create_evaluators(model, val_metrics, config)

    @trainer.on(
        Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)) | Events.COMPLETED)
    def run_validation():
        # Evaluate on a training subset and on the validation set.
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(val_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)

    score_metric_name = "Accuracy"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    # Store 3 best models by validation accuracy:
    common.save_best_model_by_val_score(
        config.output_path.as_posix(),
        evaluator,
        model=model,
        metric_name=score_metric_name,
        n_saved=3,
        trainer=trainer,
        tag="val",
    )

    # Logging (TensorBoard + experiment tracker) only on the main process.
    if idist.get_rank() == 0:
        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={
                "training": train_evaluator,
                "validation": evaluator
            },
        )
        exp_tracking_logger = exp_tracking.setup_logging(trainer, optimizer,
                                                         evaluators={
                                                             "training": train_evaluator,
                                                             "validation": evaluator
                                                         })

        # Log train/val predictions: one image grid per evaluation run,
        # taken halfway through each loader.
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2),
        )
        tb_logger.attach(
            train_evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="training"),
            event_name=Events.ITERATION_COMPLETED(
                once=len(train_eval_loader) // 2),
        )

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if idist.get_rank() == 0:
        tb_logger.close()
        exp_tracking_logger.close()
def train(model, train_loader, eval_loaders, optimizer, loss_fn,
          n_it_max, patience, split_names, select_metric='Val accuracy_0',
          select_mode='max', viz=None, device='cpu', lr_scheduler=None, name=None,
          log_steps=None, log_epoch=False, _run=None, prepare_batch=_prepare_batch,
          single_pass=False, n_ep_max=None):
    """Train *model* with ignite, tracking many metrics and the best snapshot.

    The trainer's update output is assumed to be ``(x, y, y_pred, loss,
    optionals)``.  Periodically (at ``log_steps``) every evaluation loader in
    *eval_loaders* is run; the model state achieving the best
    *select_metric* (per *select_mode*) is deep-copied into ``best``.
    Early stopping triggers once *patience* iterations pass without
    improvement.

    Returns:
        (last_iteration, all_metrics, best) where ``all_metrics`` maps
        metric name -> {iteration: value} and ``best`` holds
        value/iter/state_dict of the best snapshot.
    """
    # print(model)

    if not log_steps and not log_epoch:
        logger.warning('/!\\ No logging during training /!\\')

    if log_steps is None:
        log_steps = []

    epoch_steps = len(train_loader)
    if log_epoch:
        # Log at least once per epoch (last entry doubles as the modulus below).
        log_steps.append(epoch_steps)

    # Exactly one of n_it_max / n_ep_max bounds training (unless single_pass).
    if single_pass:
        max_epoch = 1
    elif n_ep_max is None:
        assert n_it_max is not None
        max_epoch = int(n_it_max / epoch_steps) + 1
    else:
        assert n_it_max is None
        max_epoch = n_ep_max

    all_metrics = defaultdict(dict)
    trainer = create_supervised_trainer(model, optimizer, loss_fn,
                                        device=device,
                                        prepare_batch=prepare_batch)

    # Optional per-epoch / per-iteration hooks the model may define.
    if hasattr(model, 'new_epoch_hook'):
        trainer.add_event_handler(Events.EPOCH_STARTED, model.new_epoch_hook)
    if hasattr(model, 'new_iter_hook'):
        trainer.add_event_handler(Events.ITERATION_STARTED,
                                  model.new_iter_hook)

    trainer.logger.setLevel(logging.WARNING)

    # trainer output is in the format (x, y, y_pred, loss, optionals)
    train_loss = RunningAverage(output_transform=lambda out: out[3].item(),
                                epoch_bound=True)
    train_loss.attach(trainer, 'Trainer loss')

    if hasattr(model, 's'):
        met = Average(output_transform=lambda _: float('nan') if model.s is None else model.s)
        met.attach(trainer, 'cur_s')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, met.completed, 'cur_s')

    # Architecture-sampling models get extra entropy/split metrics.
    if hasattr(model, 'arch_sampler') and model.arch_sampler.distrib_dim > 0:
        met = Average(output_transform=lambda _: float('nan') if model.cur_split is None else model.cur_split)
        met.attach(trainer, 'Trainer split')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, met.completed, 'Trainer split')
        # trainer.add_event_handler(Events.EPOCH_STARTED, met.started)
        all_ent = Average(
            output_transform=lambda out: out[-1]['arch_entropy_avg'].item())
        all_ent.attach(trainer, 'Trainer all entropy')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, all_ent.completed, 'Trainer all entropy')
        train_ent = Average(
            output_transform=lambda out: out[-1]['arch_entropy_sample'].item())
        train_ent.attach(trainer, 'Trainer sampling entropy')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, train_ent.completed, 'Trainer sampling entropy')
        # Allow the model to freeze its architecture once entropy settles.
        trainer.add_event_handler(Events.EPOCH_COMPLETED,
                                  lambda engine: model.check_arch_freezing(
                                      ent=train_ent.compute(),
                                      epoch=engine.state.iteration/(epoch_steps*max_epoch))
                                  )

        def log_always(engine, name):
            # Record a tensor diagnostic keyed by fractional epoch.
            val = engine.state.output[-1][name]
            all_metrics[name][engine.state.iteration/epoch_steps] = val.mean().item()

        def log_always_dict(engine, name):
            # Same as log_always but for per-node dictionaries.
            for node, val in engine.state.output[-1][name].items():
                all_metrics['node {} {}'.format(node, name)][engine.state.iteration/epoch_steps] = val.mean().item()

        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always_dict, name='arch_grads')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always_dict, name='arch_probas')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always_dict, name='node_grads')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always, name='task all_loss')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always, name='arch all_loss')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always, name='entropy all_loss')

    if n_it_max is not None:
        StopAfterIterations([n_it_max]).attach(trainer)

    # epoch_pbar = ProgressBar(bar_format='{l_bar}{bar}{r_bar}', desc=name,
    #                          persist=True, disable=not (_run or viz))
    # epoch_pbar.attach(trainer, metric_names=['Train loss'])
    #
    # training_pbar = ProgressBar(bar_format='{l_bar}{bar}{r_bar}', desc=name,
    #                             persist=True, disable=not (_run or viz))
    # training_pbar.attach(trainer, event_name=Events.EPOCH_COMPLETED,
    #                      closing_event_name=Events.COMPLETED)

    # Wall-clock accounting: total / eval / data-loading / forward / epoch.
    total_time = Timer(average=False)
    eval_time = Timer(average=False)
    eval_time.pause()
    data_time = Timer(average=False)
    forward_time = Timer(average=False)
    forward_time.attach(trainer,
                        start=Events.EPOCH_STARTED,
                        pause=Events.ITERATION_COMPLETED,
                        resume=Events.ITERATION_STARTED,
                        step=Events.ITERATION_COMPLETED)
    epoch_time = Timer(average=False)
    epoch_time.attach(trainer,
                      start=Events.EPOCH_STARTED,
                      pause=Events.EPOCH_COMPLETED,
                      resume=Events.EPOCH_STARTED,
                      step=Events.EPOCH_COMPLETED)

    def get_loss(y_pred, y):
        # loss_fn may return (loss, *details); keep only the mean scalar loss.
        l = loss_fn(y_pred, y)
        if not torch.is_tensor(l):
            l, *l_details = l
        return l.mean()

    def get_member(x, n=0):
        if isinstance(x, (list, tuple)):
            return x[n]
        return x

    eval_metrics = {'loss': Loss(get_loss)}

    for i in range(model.n_out):
        out_trans = get_attr_transform(i)

        def extract_ys(out):
            x, y, y_pred, loss, _ = out
            return out_trans((y_pred, y))

        train_acc = Accuracy(extract_ys)
        train_acc.attach(trainer, 'Trainer accuracy_{}'.format(i))
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  train_acc.completed, 'Trainer accuracy_{}'.format(i))
        eval_metrics['accuracy_{}'.format(i)] = \
            Accuracy(output_transform=out_trans)
        # if isinstance(model, SSNWrapper):
        #     model.arch_sampler.entropy().mean()

    evaluator = create_supervised_evaluator(model, metrics=eval_metrics,
                                            device=device,
                                            prepare_batch=prepare_batch)

    last_iteration = 0
    patience_counter = 0

    # Best snapshot so far; initial "value" is the worst possible score.
    best = {'value': float('inf') * 1 if select_mode == 'min' else -1,
            'iter': -1,
            'state_dict': None
            }

    def is_better(new, old):
        if select_mode == 'min':
            return new < old
        else:
            return new > old

    def log_results(evaluator, data_loader, iteration, split_name):
        # Run the evaluator and fan results out to visdom/sacred/all_metrics.
        evaluator.run(data_loader)
        metrics = evaluator.state.metrics

        log_metrics = {}

        for metric_name, metric_val in metrics.items():
            log_name = '{} {}'.format(split_name, metric_name)
            if viz:
                first = iteration == 0 and split_name == split_names[0]
                viz.line([metric_val], X=[iteration], win=metric_name,
                         name=log_name,
                         update=None if first else 'append',
                         opts={'title': metric_name,
                               'showlegend': True,
                               'width': 500, 'xlabel': 'iterations'})
                viz.line([metric_val], X=[iteration/epoch_steps],
                         win='{}epoch'.format(metric_name),
                         name=log_name,
                         update=None if first else 'append',
                         opts={'title': metric_name,
                               'showlegend': True,
                               'width': 500, 'xlabel': 'epoch'})
            if _run:
                _run.log_scalar(log_name, metric_val, iteration)
            log_metrics[log_name] = metric_val
            all_metrics[log_name][iteration] = metric_val
        return log_metrics

    if lr_scheduler is not None:
        @trainer.on(Events.EPOCH_COMPLETED)
        def step(_):
            lr_scheduler.step()
            # logger.warning('current lr {:.5e}'.format(
            #     optimizer.param_groups[0]['lr']))

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_event(trainer):
        iteration = trainer.state.iteration if trainer.state else 0
        nonlocal last_iteration, patience_counter, best

        # Only log at the configured steps (or their multiples of the last one).
        if not log_steps or not \
                (iteration in log_steps or iteration % log_steps[-1] == 0):
            return

        epoch_time.pause()
        eval_time.resume()
        all_metrics['training_epoch'][iteration] = iteration / epoch_steps
        all_metrics['training_iteration'][iteration] = iteration
        if hasattr(model, 'arch_sampler'):
            all_metrics['training_archs'][iteration] = \
                model.arch_sampler().squeeze().detach()
        # if hasattr(model, 'distrib_gen'):
        #     entropy = model.distrib_gen.entropy()
        #     all_metrics['entropy'][iteration] = entropy.mean().item()
        # if trainer.state and len(trainer.state.metrics) > 1:
        #     raise ValueError(trainer.state.metrics)
        # *_ps variants are per-step averages; max() guards division by zero.
        all_metrics['data time'][iteration] = data_time.value()
        all_metrics['data time_ps'][iteration] = data_time.value() / max(data_time.step_count, 1.)
        all_metrics['forward time'][iteration] = forward_time.value()
        all_metrics['forward time_ps'][iteration] = forward_time.value() / max(forward_time.step_count, 1.)
        all_metrics['epoch time'][iteration] = epoch_time.value()
        all_metrics['epoch time_ps'][iteration] = epoch_time.value() / max(epoch_time.step_count, 1.)

        if trainer.state:
            # logger.warning(trainer.state.metrics)
            for metric, value in trainer.state.metrics.items():
                all_metrics[metric][iteration] = value
                if viz:
                    viz.line([value], X=[iteration],
                             win=metric.split()[-1],
                             name=metric,
                             update=None if iteration == 0 else 'append',
                             opts={'title': metric,
                                   'showlegend': True,
                                   'width': 500, 'xlabel': 'iterations'})

        iter_this_step = iteration - last_iteration
        for d_loader, name in zip(eval_loaders, split_names):
            if name == 'Train':
                # Train metrics come from the running trainer metrics; only
                # seed NaN placeholders at iteration 0.
                if iteration == 0:
                    all_metrics['Trainer loss'][iteration] = float('nan')
                    all_metrics['Trainer accuracy_0'][iteration] = float('nan')
                    if hasattr(model, 'arch_sampler'):
                        all_metrics['Trainer all entropy'][iteration] = float('nan')
                        all_metrics['Trainer sampling entropy'][iteration] = float('nan')
                        # if hasattr(model, 'cur_split'):
                        all_metrics['Trainer split'][iteration] = float('nan')
                continue
            split_metrics = log_results(evaluator, d_loader, iteration, name)
            if select_metric not in split_metrics:
                continue
            if is_better(split_metrics[select_metric], best['value']):
                best['value'] = split_metrics[select_metric]
                best['iter'] = iteration
                best['state_dict'] = copy.deepcopy(model.state_dict())
                if patience > 0:
                    patience_counter = 0
            elif patience > 0:
                patience_counter += iter_this_step
                if patience_counter >= patience:
                    logger.info('#####')
                    logger.info('# Early stopping Run')
                    logger.info('#####')
                    trainer.terminate()
        last_iteration = iteration
        eval_time.pause()
        eval_time.step()
        all_metrics['eval time'][iteration] = eval_time.value()
        all_metrics['eval time_ps'][iteration] = eval_time.value() / eval_time.step_count
        all_metrics['total time'][iteration] = total_time.value()
        epoch_time.resume()

    # Log a baseline before training starts (iteration 0).
    log_event(trainer)

    # #
    # @trainer.on(Events.EPOCH_COMPLETED)
    # def log_epoch(trainer):
    #     iteration = trainer.state.iteration if trainer.state else 0
    #     epoch = iteration/epoch_steps
    #     fw_t = forward_time.value()
    #     fw_t_ps = fw_t / forward_time.step_count
    #     d_t = data_time.value()
    #     d_t_ps = d_t / data_time.step_count
    #     e_t = epoch_time.value()
    #     e_t_ps = e_t / epoch_time.step_count
    #     ev_t = eval_time.value()
    #     ev_t_ps = ev_t / eval_time.step_count
    #     logger.warning('<{}> Epoch {}/{} finished (Forward: {:.3f}s({:.3f}), '
    #                    'data: {:.3f}s({:.3f}), epoch: {:.3f}s({:.3f}),'
    #                    ' Eval: {:.3f}s({:.3f}), Total: '
    #                    '{:.3f}s)'.format(type(model).__name__, epoch,
    #                                      max_epoch, fw_t, fw_t_ps, d_t, d_t_ps,
    #                                      e_t, e_t_ps, ev_t, ev_t_ps,
    #                                      total_time.value()))

    data_time.attach(trainer,
                     start=Events.STARTED,
                     pause=Events.ITERATION_STARTED,
                     resume=Events.ITERATION_COMPLETED,
                     step=Events.ITERATION_STARTED)

    if hasattr(model, 'iter_per_epoch'):
        model.iter_per_epoch = len(train_loader)
    trainer.run(train_loader, max_epochs=max_epoch)
    return trainer.state.iteration, all_metrics, best
def task_diagnostics(tasks, train_data, val_data, vocabulary, model, args):
    """Train a diagnostic classifier on frozen activations of *model*.

    Collects 1000 train / 500 val activation-label pairs per task via
    ``evaluate_get_dataset``, fits a small classifier to predict the task,
    and reports per-task plus overall accuracy.

    Returns:
        dict mapping each task name to its accuracy plus an "overall" entry.

    NOTE(review): the test DataLoader batch size (2100) and the per-task
    slicing below assume 500 validation examples per task and that the whole
    test set fits in one batch — verify against callers.
    """
    devicea = -1
    if torch.cuda.is_available():
        devicea = 0

    # Gather activations/labels for every task, then stack them.
    train_activations = []
    test_activations = []
    train_labels = []
    test_labels = []
    for tid, task in enumerate(tasks):
        train_act, train_lab = evaluate_get_dataset(model, task, vocabulary[task], train_data[task], 1000, tid)
        test_act, test_lab = evaluate_get_dataset(model, task, vocabulary[task], val_data[task], 500, tid)
        train_activations.append(train_act)
        test_activations.append(test_act)
        train_labels.append(train_lab)
        test_labels.append(test_lab)
    train_activations = torch.cat(train_activations, dim=0)
    test_activations = torch.cat(test_activations, dim=0)
    train_labels = torch.cat(train_labels, dim=0)
    test_labels = torch.cat(test_labels, dim=0)
    print("Activations ", train_activations.shape, test_activations.shape,
          train_labels.shape, test_labels.shape)

    # Datasets
    train_ds = torch.utils.data.TensorDataset(train_activations, train_labels)
    test_ds = torch.utils.data.TensorDataset(test_activations, test_labels)
    train_dl = torch.utils.data.DataLoader(train_ds, batch_size=32)
    test_dl = torch.utils.data.DataLoader(test_ds, batch_size=2100)

    # Models and Optimizer
    diag_model = DiagnositicClassifier(train_activations.size(1), 128, len(tasks))
    if devicea != -1:
        diag_model.cuda(devicea)
    optimizer = utils.get_optimizer(args.opt_alg, diag_model.parameters(), args.lr, args.wdecay)
    criterion = nn.CrossEntropyLoss()

    # ignite training loops
    # NOTE(review): the CPU branch trains with `criterion` while the GPU
    # branch trains with `diag_model.loss_function` and omits the Loss
    # metric — confirm this asymmetry is intentional.
    if devicea == -1:
        trainer = create_supervised_trainer(diag_model, optimizer, criterion)
        evaluator = create_supervised_evaluator(diag_model, {"accuracy": Accuracy(), "loss": Loss(criterion)})
        val_evaluator = create_supervised_evaluator(diag_model, {"accuracy": Accuracy(), "loss": Loss(criterion)})
    else:
        trainer = create_supervised_trainer(diag_model, optimizer, diag_model.loss_function, device=devicea)
        evaluator = create_supervised_evaluator(diag_model, metrics={'accuracy': Accuracy()}, device=devicea)
        val_evaluator = create_supervised_evaluator(diag_model, metrics={'accuracy': Accuracy()}, device=devicea)

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        evaluator.run(train_dl)
        val_evaluator.run(test_dl)

    def score_function(engine):
        return engine.state.metrics['accuracy']

    early_stop_metric = EarlyStopping(patience=20, score_function=score_function, trainer=trainer)
    val_evaluator.add_event_handler(Events.COMPLETED, early_stop_metric)

    trainer.run(train_dl, max_epochs=1000)

    # The last evaluator output holds (logits, labels) for the final batch;
    # with batch_size=2100 this is the whole test set.
    logits, test_labels = val_evaluator.state.output
    _, predicted = torch.max(logits, 1)
    correct_ones = (predicted == test_labels).sum()  # NOTE(review): unused
    metrics = {}
    for i, task in enumerate(tasks):
        # Each task contributed exactly 500 consecutive test examples.
        start = i*500
        end = (i+1)*500
        correct_this = (predicted[start:end] == test_labels[start:end]).sum()
        metrics[task] = correct_this.item()/500
        #print("Task based accuracy", start, end , task, correct_this)
    metrics["overall"] = val_evaluator.state.metrics["accuracy"]
    print("Diagnostics metric", metrics)
    return metrics
# Wire up the trainer and the two evaluator engines with their metrics.
trainer = Engine(process_function)
train_evaluator = Engine(eval_function)
validator_evaluator = Engine(eval_function)

# Running average of the raw trainer output (the per-batch loss).
RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')


def thresholded_output_transform(output):
    """Round sigmoid outputs to 0/1 before computing accuracy."""
    y_pred, y = output
    y_pred = torch.round(y_pred)
    return y_pred, y


Accuracy(output_transform=thresholded_output_transform).attach(train_evaluator, 'accuracy')
Loss(loss_function).attach(train_evaluator, 'loss_train')  # binary cross entropy

Accuracy(output_transform=thresholded_output_transform).attach(validator_evaluator, 'accuracy')
Loss(loss_function).attach(validator_evaluator, 'loss_val')

pbar = ProgressBar(persist=True, bar_format="")
pbar.attach(trainer, ['loss'])


@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(engine):
    # Evaluate on the training split after each epoch.
    # NOTE(review): this handler appears truncated at the chunk boundary —
    # avg_accuracy/avg_loss are computed but their use is not visible here.
    train_evaluator.run(train_iter)
    metrics = train_evaluator.state.metrics
    avg_accuracy = metrics['accuracy']
    avg_loss = metrics['loss_train']
def test_binary_input():
    """Accuracy must match sklearn's accuracy_score on binary inputs of
    shapes (N,), (N, 1), (N, L) and (N, H, W, ...), fed whole or in batches."""
    acc = Accuracy()

    def _test(y_pred, y, batch_size):
        acc.reset()
        if batch_size > 1:
            # Feed the data as consecutive slices of `batch_size` elements.
            n_iters = y.shape[0] // batch_size + 1
            for start in range(0, n_iters * batch_size, batch_size):
                stop = start + batch_size
                acc.update((y_pred[start:stop], y[start:stop]))
        else:
            acc.update((y_pred, y))

        np_y, np_y_pred = y.numpy().ravel(), y_pred.numpy().ravel()

        assert acc._type == "binary"
        assert isinstance(acc.compute(), float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(acc.compute())

    def get_test_cases():
        def rand01(*shape):
            # Random 0/1 long tensor of the given shape.
            return torch.randint(0, 2, size=shape).long()

        cases = []
        # Binary accuracy on input of shape (N, 1) or (N, )
        cases += [(rand01(10), rand01(10), 1),
                  (rand01(10, 1), rand01(10, 1), 1)]
        # updated batches
        cases += [(rand01(50), rand01(50), 16),
                  (rand01(50, 1), rand01(50, 1), 16)]
        # Binary accuracy on input of shape (N, L)
        cases += [(rand01(10, 5), rand01(10, 5), 1),
                  (rand01(10, 8), rand01(10, 8), 1)]
        # updated batches
        cases += [(rand01(50, 5), rand01(50, 5), 16),
                  (rand01(50, 8), rand01(50, 8), 16)]
        # Binary accuracy on input of shape (N, H, W, ...)
        cases += [(rand01(4, 1, 12, 10), rand01(4, 1, 12, 10), 1),
                  (rand01(15, 1, 20, 10), rand01(15, 1, 20, 10), 1)]
        # updated batches
        cases += [(rand01(50, 1, 12, 10), rand01(50, 1, 12, 10), 16),
                  (rand01(50, 1, 20, 10), rand01(50, 1, 20, 10), 16)]
        return cases

    # check multiple random inputs as random exact occurencies are rare
    for _ in range(5):
        for y_pred, y, n_iters in get_test_cases():
            _test(y_pred, y, n_iters)
def run(train_batch_size, val_batch_size, epochs, lr, momentum):
    """Train `Net()` with SGD/CrossEntropy, logging to Weights & Biases.

    Mirrors the TensorBoard example but routes batch loss, evaluator
    metrics, optimizer params and model gradients to a WandB run, and saves
    the two best checkpoints (by validation accuracy) into the run dir.

    Args:
        train_batch_size: batch size for the training DataLoader.
        val_batch_size: batch size for the validation DataLoader.
        epochs: number of epochs to run.
        lr: SGD learning rate.
        momentum: SGD momentum.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    trainer.logger = setup_logger("Trainer")

    # Same metric instances shared by both evaluators.
    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        # Re-evaluate on both splits after every epoch.
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    # Hyper-parameters are recorded in the WandB run config.
    wandb_logger = WandBLogger(
        project="pytorch-ignite-integration",
        name="ignite-mnist-example",
        config={
            "train_batch_size": train_batch_size,
            "val_batch_size": val_batch_size,
            "epochs": epochs,
            "lr": lr,
            "momentum": momentum,
        },
    )

    wandb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )

    # Evaluator metrics are logged against the trainer's global iteration.
    for tag, evaluator in [("training", train_evaluator), ("validation", validation_evaluator)]:
        wandb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=lambda *_: trainer.state.iteration,
        )

    wandb_logger.attach_opt_params_handler(trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer)

    # Let WandB track parameters and gradients.
    wandb_logger.watch(model, log="all")

    def score_function(engine):
        # Higher validation accuracy == better checkpoint.
        return engine.state.metrics["accuracy"]

    model_checkpoint = ModelCheckpoint(
        wandb_logger.run.dir,
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint, {"model": model})

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    wandb_logger.close()
num_workers=num_workers) test_dataset = TrainValTestDataset(image_dataset, mode="test") test_loader = DataLoader(dataset=test_dataset, batch_size=args.batchsize, num_workers=num_workers) model = Model(number_of_classes=number_of_classes) optimizer = optim.Adam(model.parameters(), lr=args.learningrate) trainer = create_supervised_trainer(model, optimizer, criterion, device=device) metrics = { "accuracy": Accuracy(), "MAE": MeanAbsoluteError( output_transform=lambda out: (torch.max(out[0], dim=1)[1], out[1])), "MSE": MeanSquaredError( output_transform=lambda out: (torch.max(out[0], dim=1)[1], out[1])), "loss": Loss(loss_fn=criterion) } evaluator = create_supervised_evaluator(model, metrics=metrics, device=device) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(trainer):
# Top-level training setup for the Recursion classification experiment.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
path = '/mnt/ssd1/datasets/Recursion_class'
path_data = path
device = 'cuda'
batch_size = 64
warnings.filterwarnings('ignore')

criterion = nn.CrossEntropyLoss()
# NOTE(review): FocalLoss overrides the CrossEntropyLoss assigned on the
# previous line — presumably an intentional experiment toggle; confirm.
criterion = FocalLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)
# optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0003, momentum=0.9)

metrics = {
    'loss': Loss(criterion),
    'accuracy': Accuracy(),
}

trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)


@trainer.on(Events.EPOCH_COMPLETED)
def compute_and_display_val_metrics(engine):
    # Run validation after each epoch and print the aggregate metrics.
    epoch = engine.state.epoch
    metrics = val_evaluator.run(val_loader).metrics
    print("Validation Results - Epoch: {} Average Loss: {:.4f} | Accuracy: {:.4f} "
          .format(engine.state.epoch, metrics['loss'], metrics['accuracy']))
# Create DenseNet121 net = monai.networks.nets.densenet3d.densenet121( in_channels=1, out_channels=2, ) device = torch.device("cuda:0") def prepare_batch(batch, device=None, non_blocking=False): return _prepare_batch((batch['img'], batch['label']), device, non_blocking) metric_name = 'Accuracy' # add evaluation metric to the evaluator engine val_metrics = {metric_name: Accuracy()} # ignite evaluator expects batch=(img, label) and returns output=(y_pred, y) at every iteration, # user can add output_transform to return other values evaluator = create_supervised_evaluator(net, val_metrics, device, True, prepare_batch=prepare_batch) # Add stats event handler to print validation stats via evaluator val_stats_handler = StatsHandler( name='evaluator', output_transform=lambda x: None # no need to print loss value, so disable per iteration output ) val_stats_handler.attach(evaluator)
def adv_prune_train_loop(model, params, ds, dset, min_y, base_data, model_id, prune_type, device, batch_size, tpa, max_epochs=5):
    """Fine-tune a (to-be-)pruned ResNet-style model and checkpoint by val accuracy.

    Args:
        model: ResNet-like model exposing conv1 and layer1..layer4 bottlenecks.
        params: dict with 'lr', 'momentum', 'l2_wd' for the SGD optimizer.
        ds: (train_dataloader, valid_dataloader) pair.
        dset: (train_set, valid_set) pair.
        min_y: (min_label_train, min_label_val) offsets subtracted from labels.
        base_data: base directory for summary writer / checkpoints.
        model_id: experiment identifier (suffixed with prune settings).
        prune_type: pruning strategy name, e.g. 'global_unstructured'.
        device: torch device for training.
        batch_size: batch size (drives the validation frequency).
        tpa: total prune amount.
        max_epochs: training epochs.
    """
    #assert prune_type in ['global_unstructured', 'structured']
    total_prune_amount = tpa
    remove_amount = tpa
    ds_train, ds_valid = ds
    train_set, valid_set = dset
    min_y_train, min_y_val = min_y
    # NOTE(review): the next two assignments duplicate earlier ones — dead code.
    train_set, valid_set = dset
    total_prune_amount = tpa
    original_model = copy.deepcopy(model)
    original_model.eval()
    model_id = f'{model_id}_{prune_type}_pruning_{tpa}_l1'
    valid_freq = 200 * 500 // batch_size // 3

    # First 22 conv layers: stem conv plus each bottleneck's conv1-3.
    conv_layers = [model.conv1]
    for sequential in [model.layer1, model.layer2, model.layer3, model.layer4]:
        for bottleneck in sequential:
            conv_layers.extend([bottleneck.conv1, bottleneck.conv2, bottleneck.conv3])
    conv_layers = conv_layers[:22]

    def prune_model(model):
        # Global L1 unstructured pruning over all selected conv weights.
        print(f'pruned model by {total_prune_amount}')
        if prune_type == 'global_unstructured':
            parameters_to_prune = [(layer, 'weight') for layer in conv_layers]
            prune.global_unstructured(
                parameters_to_prune,
                pruning_method=prune.L1Unstructured,
                amount=total_prune_amount,
            )
        else:
            # NOTE(review): this branch recurses on itself for every layer and
            # never terminates — it looks like unfinished structured-pruning
            # code; also no call site for prune_model is visible in this chunk.
            for layer in conv_layers:
                prune_model(model)

    def valid_eval(model, dataset, dataloader, device, label):
        # Plain accuracy over the validation loader; labels offset by `label`.
        right = 0
        total = 0
        model.eval()
        with torch.no_grad():
            for i, data in tqdm(enumerate(dataloader), total=len(dataset) / dataloader.batch_size):
                data, y = data
                data = data.to(device)
                y = y.to(device) - label
                ans = model.forward(data)
                right += torch.sum(torch.eq(torch.argmax(ans, dim=1), y))
                total += y.shape[0]
        return right/total

    valid_acc = valid_eval(model, valid_set, ds_valid, device, min_y_val)
    print('initial accuracy:', valid_acc.item())

    with create_summary_writer(model, ds_train, base_data, model_id, device=device) as writer:
        lr = params['lr']
        mom = params['momentum']
        wd = params['l2_wd']
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=mom, weight_decay=wd)
        sched = ReduceLROnPlateau(optimizer, factor=0.5, patience=5)
        funcs = {'accuracy': Accuracy(), 'loss': Loss(F.cross_entropy)}
        loss = funcs['loss']._loss_fn
        acc_metric = Accuracy(device=device)
        loss_metric = Loss(F.cross_entropy, device=device)

        acc_val_metric = Accuracy(device=device)
        loss_val_metric = Loss(F.cross_entropy, device=device)
        # attack = GradientSignAttack(original_model, loss_fn=loss, eps=0.2)

        def train_step(engine, batch):
            # One SGD step; afterwards re-apply the pruning masks so that
            # pruned weights stay zero despite the optimizer update.
            model.train()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_train
            # with ctx_noparamgrad_and_eval(model):
            #     x_adv = attack.perturb(x, y)
            # optimizer.zero_grad()
            # x = torch.cat((x, x_adv))
            # y = torch.cat((y, y))
            ans = model.forward(x)
            l = loss(ans, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            with torch.no_grad():
                for layer in conv_layers:
                    layer.weight *= layer.weight_mask
            return l.item()

        trainer = Engine(train_step)

        def train_eval_step(engine, batch):
            # Forward pass only, for train-split metric computation.
            model.eval()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_train
            # x_adv = attack.perturb(x, y)
            # x = torch.cat((x, x_adv))
            # y = torch.cat((y, y))
            with torch.no_grad():
                ans = model.forward(x)
            return ans, y

        train_evaluator = Engine(train_eval_step)
        acc_metric.attach(train_evaluator, "accuracy")
        loss_metric.attach(train_evaluator, 'loss')

        def validation_step(engine, batch):
            # Forward pass only, for validation metric computation.
            model.eval()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_val
            # x_adv = attack.perturb(x, y)
            # x = torch.cat((x, x_adv))
            # y = torch.cat((y, y))
            with torch.no_grad():
                ans = model.forward(x)
            return ans, y

        valid_evaluator = Engine(validation_step)
        acc_val_metric.attach(valid_evaluator, "accuracy")
        loss_val_metric.attach(valid_evaluator, 'loss')

        @trainer.on(Events.ITERATION_COMPLETED(every=valid_freq))
        def log_validation_results(engine):
            valid_evaluator.run(ds_valid)
            metrics = valid_evaluator.state.metrics
            valid_avg_accuracy = metrics['accuracy']
            avg_nll = metrics['loss']
            print("Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
                  .format(engine.state.epoch, valid_avg_accuracy, avg_nll))
            writer.add_scalar("validation/avg_loss", avg_nll, engine.state.epoch)
            writer.add_scalar("validation/avg_accuracy", valid_avg_accuracy, engine.state.epoch)
            writer.add_scalar("validation/avg_error", 1. - valid_avg_accuracy, engine.state.epoch)

        @trainer.on(Events.EPOCH_COMPLETED)
        def lr_scheduler(engine):
            # NOTE(review): despite the name `avg_nll`, this steps the
            # ReduceLROnPlateau scheduler (default mode='min') on *accuracy*
            # — likely metrics['loss'] was intended; confirm before changing.
            metrics = valid_evaluator.state.metrics
            avg_nll = metrics['accuracy']
            sched.step(avg_nll)

        @trainer.on(Events.ITERATION_COMPLETED(every=100))
        def log_training_loss(engine):
            # Re-evaluate metrics on the current batch for logging.
            batch = engine.state.batch
            ds = DataLoader(TensorDataset(*batch), batch_size=batch_size)
            train_evaluator.run(ds)
            metrics = train_evaluator.state.metrics
            accuracy = metrics['accuracy']
            nll = metrics['loss']
            iter = (engine.state.iteration - 1) % len(ds_train) + 1
            if (iter % 50) == 0:
                print("Epoch[{}] Iter[{}/{}] Accuracy: {:.2f} Loss: {:.2f}"
                      .format(engine.state.epoch, iter, len(ds_train), accuracy, nll))
            writer.add_scalar("batchtraining/detloss", nll, engine.state.epoch)
            writer.add_scalar("batchtraining/accuracy", accuracy, engine.state.iteration)
            writer.add_scalar("batchtraining/error", 1. - accuracy, engine.state.iteration)
            writer.add_scalar("batchtraining/loss", engine.state.output, engine.state.iteration)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_lr(engine):
            writer.add_scalar("lr", optimizer.param_groups[0]['lr'], engine.state.epoch)

        @trainer.on(Events.ITERATION_COMPLETED(every=valid_freq))
        def validation_value(engine):
            # Also used below as the Checkpoint score function.
            metrics = valid_evaluator.state.metrics
            valid_avg_accuracy = metrics['accuracy']
            return valid_avg_accuracy

        to_save = {'model': model}
        handler = Checkpoint(to_save, DiskSaver(os.path.join(base_data, model_id), create_dir=True),
                             score_function=validation_value, score_name="val_acc",
                             global_step_transform=global_step_from_engine(trainer), n_saved=None)

        # kick everything off
        trainer.add_event_handler(Events.ITERATION_COMPLETED(every=valid_freq), handler)
        trainer.run(ds_train, max_epochs=max_epochs)
def main():
    """CLI entry point: train, evaluate, or build a submission for the
    page-classification model.

    Modes (mutually exclusive where noted):
      * default        -- train with periodic evaluation and checkpointing
      * --test-only    -- evaluate a --resume checkpoint and print metrics
      * --submission   -- run inference on the full table and write a
                          submission CSV (requires --resume and --output-dir)

    NOTE(review): relies on many project-local helpers (load_train_valid_df,
    build_model, run_with_pbar, score_boxes, ...) whose contracts are not
    visible here; comments on their results are best-effort.
    """
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('clf_gt', help='segmentation predictions')
    # Dataset params
    arg('--test-height', type=int, default=2528)
    arg('--crop-height', type=int, default=768)
    arg('--crop-width', type=int, default=512)
    arg('--scale-aug', type=float, default=0.3)
    arg('--color-hue-aug', type=int, default=7)
    arg('--color-sat-aug', type=int, default=30)
    arg('--color-val-aug', type=int, default=30)
    arg('--n-tta', type=int, default=1)
    arg('--pseudolabels', nargs='+',
        help='path to pseudolabels to be added to train')
    arg('--pseudolabels-oversample', type=int, default=1)
    arg('--test-book', help='use only this book for testing and pseudolabels')
    arg('--fold', type=int, default=0)
    arg('--n-folds', type=int, default=5)
    arg('--train-limit', type=int)
    arg('--test-limit', type=int)
    # Model params
    arg('--base', default='resnet50')
    arg('--use-sequences', type=int, default=0)
    arg('--head-dropout', type=float, default=0.5)
    arg('--frozen-start', type=int)
    arg('--head', type=str, default='Head')
    # Training params
    arg('--device', default='cuda', help='device')
    arg('--opt-level', help='pass 01 to use fp16 training with apex')
    arg('--benchmark', type=int)
    arg('--batch-size', default=10, type=int)
    arg('--max-targets', type=int)
    arg('--workers', default=8, type=int,
        help='number of data loading workers')
    arg('--lr', default=14e-3, type=float, help='initial learning rate')
    arg('--wd', default=1e-4, type=float, help='weight decay')
    arg('--optimizer', default='sgd')
    arg('--accumulation-steps', type=int, default=1)
    arg('--epochs', default=50, type=int, help='number of total epochs to run')
    arg('--repeat-train', type=int, default=6)
    arg('--drop-lr-epoch', default=0, type=int,
        help='epoch at which to drop lr')
    arg('--cosine', type=int, default=1, help='cosine lr schedule')
    # Misc. params
    arg('--output-dir', help='path where to save')
    arg('--resume', help='resume from checkpoint')
    arg('--test-only', help='Only test the model', action='store_true')
    arg('--submission', help='Create submission', action='store_true')
    arg('--detailed-postfix', default='', help='postfix of detailed file name')
    arg('--print-model', default=1, type=int)
    arg('--dump-features', default=0, type=int)  # for knn, unused
    args = parser.parse_args()
    if args.test_only and args.submission:
        parser.error('pass one of --test-only and --submission')
    print(args)
    output_dir = Path(args.output_dir) if args.output_dir else None
    if output_dir:
        output_dir.mkdir(parents=True, exist_ok=True)
        if not args.resume:
            # Record the full CLI configuration for reproducibility.
            (output_dir / 'params.json').write_text(
                json.dumps(vars(args), indent=4))
    print('Loading data')
    df_train_gt, df_valid_gt = load_train_valid_df(args.fold, args.n_folds)
    df_clf_gt = load_train_df(args.clf_gt)[['labels', 'image_id']]
    if args.submission:
        # Submission mode: infer on everything; pages with empty labels are
        # remembered so they can be emitted as empty submission rows later.
        df_valid = df_train = df_clf_gt
        empty_index = df_valid['labels'] == ''
        empty_pages = df_valid[empty_index]['image_id'].values
        df_valid = df_valid[~empty_index]
    else:
        # Restrict the classifier table to the fold's train/valid image ids.
        df_train, df_valid = [
            df_clf_gt[df_clf_gt['image_id'].isin(set(df['image_id']))]
            for df in [df_train_gt, df_valid_gt]
        ]
        df_valid = df_valid[df_valid['labels'] != '']
    if args.pseudolabels:
        # Append (optionally oversampled) pseudolabel rows to the train set.
        df_ps = pd.concat(
            [pd.read_csv(p)[df_train.columns] for p in args.pseudolabels])
        if args.test_book:
            df_ps = df_ps[df_ps['image_id'].apply(
                lambda x: get_book_id(x) == args.test_book)]
        df_train = (
            pd.concat([df_train] +
                      [df_ps] * args.pseudolabels_oversample).reset_index(
                          drop=True))
    if args.test_book:
        df_valid = df_valid[df_valid['image_id'].apply(
            lambda x: get_book_id(x) == args.test_book)]
    if args.train_limit:
        df_train = df_train.sample(n=args.train_limit, random_state=42)
    if args.test_limit:
        df_valid = df_valid.sample(n=args.test_limit, random_state=42)
    # Ground-truth rows keyed by image id, used when scoring predictions.
    gt_by_image_id = {item.image_id: item for item in df_valid_gt.itertuples()}
    print(f'{len(df_train):,} in train, {len(df_valid):,} in valid')
    classes = get_encoded_classes()

    def _get_transforms(*, train: bool):
        # For eval with TTA, build one transform per test height spread over
        # [test_height, test_height * (1 + scale_aug)].
        if not train and args.n_tta > 1:
            test_heights = [
                args.test_height * (1 + s)
                for s in np.linspace(0, args.scale_aug, args.n_tta)
            ]
            print('TTA test heights:', list(map(int, test_heights)))
        else:
            test_heights = [args.test_height]
        return [
            get_transform(
                train=train,
                test_height=test_height,
                crop_width=args.crop_width,
                crop_height=args.crop_height,
                scale_aug=args.scale_aug,
                color_hue_aug=args.color_hue_aug,
                color_sat_aug=args.color_sat_aug,
                color_val_aug=args.color_val_aug,
            ) for test_height in test_heights
        ]

    def make_test_data_loader(df):
        # Evaluation loader: batch_size=1, no resampling of empty pages.
        return DataLoader(
            Dataset(
                df=df,
                transforms=_get_transforms(train=False),
                resample_empty=False,
                classes=classes,
            ),
            batch_size=1,
            collate_fn=collate_fn,
            num_workers=args.workers,
        )

    data_loader_test = make_test_data_loader(df_valid)
    if args.dump_features:  # unused
        df_train = df_train[df_train['labels'] != '']
        data_loader_train = make_test_data_loader(df_train)
    else:
        # Train loader repeats the dataframe so one "epoch" covers the data
        # args.repeat_train times.
        data_loader_train = DataLoader(
            Dataset(
                df=pd.concat([df_train] * args.repeat_train),
                transforms=_get_transforms(train=True),
                resample_empty=True,
                classes=classes,
            ),
            num_workers=args.workers,
            shuffle=True,
            collate_fn=partial(collate_fn, max_targets=args.max_targets),
            batch_size=args.batch_size,
        )
    print('Creating model')
    fp16 = bool(args.opt_level)
    model: nn.Module = build_model(
        base=args.base,
        head=args.head,
        frozen_start=args.frozen_start,
        fp16=fp16,
        n_classes=len(classes),
        head_dropout=args.head_dropout,
        use_sequences=bool(args.use_sequences),
    )
    if args.print_model:
        print(model)
    device = torch.device(args.device)
    model.to(device)
    if args.benchmark:
        torch.backends.cudnn.benchmark = True
    parameters = model.parameters()
    if args.optimizer == 'adam':
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=args.wd)
    elif args.optimizer == 'sgd':
        optimizer = optim.SGD(parameters, lr=args.lr, weight_decay=args.wd,
                              momentum=0.9)
    else:
        parser.error(f'Unexpected optimzier {args.optimizer}')
    if fp16:
        # Apex is only imported when fp16 training was requested.
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.opt_level)
    loss = nn.CrossEntropyLoss()
    step = epoch = 0
    best_f1 = 0
    if args.resume:
        state = torch.load(args.resume, map_location='cpu')
        if 'optimizer' in state:
            # Full training checkpoint: restore optimizer and progress too.
            optimizer.load_state_dict(state['optimizer'])
            model.load_state_dict(state['model'])
            step = state['step']
            epoch = state['epoch']
            best_f1 = state['best_f1']
        else:
            # Bare state_dict checkpoint.
            model.load_state_dict(state)
        del state

    @contextmanager
    def no_benchmark():
        # Temporarily disable cudnn benchmark (eval uses variable shapes),
        # restoring it afterwards only if it was requested.
        torch.backends.cudnn.benchmark = False
        yield
        if args.benchmark:
            torch.backends.cudnn.benchmark = True

    if args.dump_features and not args.submission:  # unused
        if not output_dir:
            parser.error('set --output-dir with --dump-features')
        # We also dump test features below
        feature_evaluator = create_supervised_evaluator(
            model,
            device=device,
            prepare_batch=_prepare_batch,
            metrics={'features': GetFeatures(n_tta=args.n_tta)},
        )
        with no_benchmark():
            run_with_pbar(feature_evaluator, data_loader_train,
                          desc='train features')
        torch.save(feature_evaluator.state.metrics['features'],
                   output_dir / 'train_features.pth')

    def get_y_pred_y(output):
        # Adapt the model output/target pair to what Accuracy/Loss expect.
        y_pred, y = output
        return get_output(y_pred), get_labels(y)

    metrics = {
        'accuracy': Accuracy(output_transform=get_y_pred_y),
        'loss': Loss(loss, output_transform=get_y_pred_y),
        'predictions': GetPredictions(n_tta=args.n_tta, classes=classes),
        'detailed': GetDetailedPrediction(n_tta=args.n_tta, classes=classes),
    }
    if args.dump_features:
        metrics['features'] = GetFeatures(n_tta=args.n_tta)
    evaluator = create_supervised_evaluator(model,
                                            device=device,
                                            prepare_batch=_prepare_batch,
                                            metrics=metrics)

    def evaluate():
        """Run the evaluator and return a dict of validation metrics,
        including box-level scores computed against the ground truth."""
        with no_benchmark():
            run_with_pbar(evaluator, data_loader_test, desc='evaluate')
        metrics = {
            'valid_loss': evaluator.state.metrics['loss'],
            'accuracy': evaluator.state.metrics['accuracy'],
        }
        scores = []
        for prediction, meta in evaluator.state.metrics['predictions']:
            item = gt_by_image_id[meta['image_id']]
            target_boxes, target_labels = get_target_boxes_labels(item)
            target_boxes = torch.from_numpy(target_boxes)
            pred_centers = np.array([p['center'] for p in prediction])
            pred_labels = [p['cls'] for p in prediction]
            scores.append(
                dict(score_boxes(
                    truth_boxes=from_coco(target_boxes).numpy(),
                    truth_label=target_labels,
                    preds_center=pred_centers,
                    preds_label=np.array(pred_labels),
                ), image_id=item.image_id))
        metrics.update(get_metrics(scores))
        if output_dir:
            pd.DataFrame(evaluator.state.metrics['detailed']).to_csv(
                output_dir / f'detailed{args.detailed_postfix}.csv.gz',
                index=None)
        if args.dump_features:
            # NOTE(review): assumes output_dir is set in this mode (checked
            # earlier for the non-submission path) -- confirm.
            f_name = 'test' if args.submission else 'valid'
            torch.save(evaluator.state.metrics['features'],
                       output_dir / f'{f_name}_features.pth')
        return metrics

    def make_submission():
        """Run inference on the test loader and write submission CSVs."""
        with no_benchmark():
            run_with_pbar(evaluator, data_loader_test, desc='evaluate')
        submission = []
        for prediction, meta in tqdm.tqdm(
                evaluator.state.metrics['predictions']):
            submission.append(submission_item(meta['image_id'], prediction))
        # Pages that had no labels at all still need an (empty) row.
        submission.extend(
            submission_item(image_id, []) for image_id in empty_pages)
        pd.DataFrame(submission).to_csv(
            output_dir / f'submission_{output_dir.name}.csv.gz', index=None)
        pd.DataFrame(evaluator.state.metrics['detailed']).to_csv(
            output_dir / f'test_detailed{args.detailed_postfix}.csv.gz',
            index=None)
        if args.dump_features:
            torch.save(evaluator.state.metrics['features'],
                       output_dir / 'test_features.pth')

    if args.test_only or args.submission:
        if not args.resume:
            parser.error('please pass --resume when running with --test-only '
                         'or --submission')
        if args.test_only:
            print_metrics(evaluate())
        elif args.submission:
            if not output_dir:
                parser.error('--output-dir required with --submission')
            make_submission()
        return

    trainer = create_supervised_trainer(
        model,
        optimizer,
        loss_fn=lambda y_pred, y: loss(get_output(y_pred), get_labels(y)),
        device=device,
        prepare_batch=_prepare_batch,
        accumulation_steps=args.accumulation_steps,
        fp16=fp16,
    )
    epochs_left = args.epochs - epoch
    epochs_pbar = tqdm.trange(epochs_left)
    epoch_pbar = tqdm.trange(len(data_loader_train))
    # Sliding window of recent losses for the smoothed progress-bar display.
    train_losses = deque(maxlen=20)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(_):
        nonlocal step
        train_losses.append(trainer.state.output)
        smoothed_loss = np.mean(train_losses)
        epoch_pbar.set_postfix(loss=f'{smoothed_loss:.4f}')
        epoch_pbar.update(1)
        step += 1
        if step % 20 == 0 and output_dir:
            # Log against number of examples seen, not raw iterations.
            json_log_plots.write_event(output_dir,
                                       step=step * args.batch_size,
                                       loss=smoothed_loss)

    @trainer.on(Events.EPOCH_COMPLETED)
    def checkpoint(_):
        if output_dir:
            torch.save(
                {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'step': step,
                    'epoch': epoch,
                    'best_f1': best_f1,
                }, output_dir / 'checkpoint.pth')

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(_):
        nonlocal best_f1
        metrics = evaluate()
        if output_dir:
            json_log_plots.write_event(output_dir,
                                       step=step * args.batch_size,
                                       **metrics)
        if metrics['f1'] > best_f1:
            best_f1 = metrics['f1']
            if output_dir:
                torch.save(model.state_dict(), output_dir / 'model_best.pth')
        epochs_pbar.set_postfix(
            {k: format_value(v) for k, v in metrics.items()})

    @trainer.on(Events.EPOCH_COMPLETED)
    def update_pbars_on_epoch_completion(_):
        nonlocal epoch
        epochs_pbar.update(1)
        epoch_pbar.reset()
        epoch += 1

    scheduler = None
    if args.drop_lr_epoch and args.cosine:
        parser.error('Choose only one schedule')
    if args.drop_lr_epoch:
        scheduler = StepLR(optimizer, step_size=args.drop_lr_epoch, gamma=0.1)
    if args.cosine:
        scheduler = CosineAnnealingLR(optimizer, epochs_left)
    if scheduler is not None:
        trainer.on(Events.EPOCH_COMPLETED)(lambda _: scheduler.step())
    trainer.run(data_loader_train, max_epochs=epochs_left)
def test_multilabel_input():
    """Accuracy in multilabel mode must agree with sklearn's accuracy_score
    across (N, C), (N, H, W) and (N, C, H, W) inputs, fed whole or in
    batched updates."""
    acc = Accuracy(is_multilabel=True)

    def _check(y_pred, y, batch_size):
        # Feed the pair into the metric -- chunked when batch_size > 1 --
        # then compare the computed value against scikit-learn.
        acc.reset()
        if batch_size > 1:
            n_chunks = y.shape[0] // batch_size + 1
            for lo in range(0, n_chunks * batch_size, batch_size):
                window = slice(lo, lo + batch_size)
                acc.update((y_pred[window], y[window]))
        else:
            acc.update((y_pred, y))

        flat_pred = to_numpy_multilabel(y_pred)
        flat_true = to_numpy_multilabel(y)

        assert acc._type == "multilabel"
        assert isinstance(acc.compute(), float)
        assert accuracy_score(flat_true,
                              flat_pred) == pytest.approx(acc.compute())

    def _make_cases():
        # (shape, batch_size) pairs: (N, C) inputs, (N, H, W) inputs and
        # (N, C, H, W, ...) inputs, each both single-shot and batched.
        shape_specs = [
            ((10, 4), 1),
            ((10, 7), 1),
            ((50, 4), 16),
            ((50, 7), 16),
            ((10, 5, 10), 1),
            ((10, 4, 10), 1),
            ((50, 5, 10), 16),
            ((50, 4, 10), 16),
            ((4, 5, 12, 10), 1),
            ((4, 10, 12, 8), 1),
            ((50, 5, 12, 10), 16),
            ((50, 10, 12, 8), 16),
        ]
        return [(torch.randint(0, 2, size=shape).long(),
                 torch.randint(0, 2, size=shape).long(), bs)
                for shape, bs in shape_specs]

    # Check multiple random inputs, since exact random matches are rare.
    for _ in range(5):
        for y_pred, y, batch_size in _make_cases():
            _check(y_pred, y, batch_size)
def run_training_test(root_dir, device=torch.device("cuda:0")):
    """Train a 3D UNet on Nifti image/seg pairs found under ``root_dir``
    using MONAI's SupervisedTrainer/SupervisedEvaluator workflow.

    Returns the evaluator's best mean-Dice value after 5 epochs.

    NOTE(review): first 20 and last 20 files are used for train/val; with
    fewer than 40 files those sets overlap -- presumably the caller
    generates enough synthetic data.
    """
    images = sorted(glob(os.path.join(root_dir, "img*.nii.gz")))
    segs = sorted(glob(os.path.join(root_dir, "seg*.nii.gz")))
    train_files = [{
        "image": img,
        "label": seg
    } for img, seg in zip(images[:20], segs[:20])]
    val_files = [{
        "image": img,
        "label": seg
    } for img, seg in zip(images[-20:], segs[-20:])]

    # define transforms for image and segmentation
    train_transforms = Compose([
        LoadNiftid(keys=["image", "label"]),
        AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
        ScaleIntensityd(keys=["image", "label"]),
        # Sample 4 positive/negative-balanced 96^3 crops per image.
        RandCropByPosNegLabeld(keys=["image", "label"],
                               label_key="label",
                               spatial_size=[96, 96, 96],
                               pos=1,
                               neg=1,
                               num_samples=4),
        RandRotate90d(keys=["image", "label"], prob=0.5, spatial_axes=[0, 2]),
        ToTensord(keys=["image", "label"]),
    ])
    val_transforms = Compose([
        LoadNiftid(keys=["image", "label"]),
        AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
        ScaleIntensityd(keys=["image", "label"]),
        ToTensord(keys=["image", "label"]),
    ])

    # create a training data loader
    train_ds = monai.data.CacheDataset(data=train_files,
                                       transform=train_transforms,
                                       cache_rate=0.5)
    # use batch_size=2 to load images and use RandCropByPosNegLabeld to
    # generate 2 x 4 images for network training
    train_loader = monai.data.DataLoader(train_ds,
                                         batch_size=2,
                                         shuffle=True,
                                         num_workers=4)
    # create a validation data loader
    val_ds = monai.data.CacheDataset(data=val_files,
                                     transform=val_transforms,
                                     cache_rate=1.0)
    val_loader = monai.data.DataLoader(val_ds, batch_size=1, num_workers=4)

    # create UNet, DiceLoss and Adam optimizer
    net = monai.networks.nets.UNet(
        dimensions=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)
    loss = monai.losses.DiceLoss(sigmoid=True)
    opt = torch.optim.Adam(net.parameters(), 1e-3)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.1)

    # Binarize sigmoid outputs and keep only the largest component.
    val_post_transforms = Compose([
        Activationsd(keys="pred", sigmoid=True),
        AsDiscreted(keys="pred", threshold_values=True),
        KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
    ])
    val_handlers = [
        StatsHandler(output_transform=lambda x: None),
        TensorBoardStatsHandler(log_dir=root_dir,
                                output_transform=lambda x: None),
        TensorBoardImageHandler(log_dir=root_dir,
                                batch_transform=lambda x:
                                (x["image"], x["label"]),
                                output_transform=lambda x: x["pred"]),
        # Save the network whenever the key metric improves.
        CheckpointSaver(save_dir=root_dir,
                        save_dict={"net": net},
                        save_key_metric=True),
    ]
    evaluator = SupervisedEvaluator(
        device=device,
        val_data_loader=val_loader,
        network=net,
        # Sliding-window inference so full volumes fit in memory.
        inferer=SlidingWindowInferer(roi_size=(96, 96, 96),
                                     sw_batch_size=4,
                                     overlap=0.5),
        post_transform=val_post_transforms,
        key_val_metric={
            "val_mean_dice":
            MeanDice(include_background=True,
                     output_transform=lambda x: (x["pred"], x["label"]))
        },
        additional_metrics={
            "val_acc":
            Accuracy(output_transform=lambda x: (x["pred"], x["label"]))
        },
        val_handlers=val_handlers,
    )

    train_post_transforms = Compose([
        Activationsd(keys="pred", sigmoid=True),
        AsDiscreted(keys="pred", threshold_values=True),
        KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
    ])
    train_handlers = [
        LrScheduleHandler(lr_scheduler=lr_scheduler, print_lr=True),
        # Run validation every 2 epochs.
        ValidationHandler(validator=evaluator, interval=2, epoch_level=True),
        StatsHandler(tag_name="train_loss",
                     output_transform=lambda x: x["loss"]),
        TensorBoardStatsHandler(log_dir=root_dir,
                                tag_name="train_loss",
                                output_transform=lambda x: x["loss"]),
        CheckpointSaver(save_dir=root_dir,
                        save_dict={
                            "net": net,
                            "opt": opt
                        },
                        save_interval=2,
                        epoch_level=True),
    ]
    trainer = SupervisedTrainer(
        device=device,
        max_epochs=5,
        train_data_loader=train_loader,
        network=net,
        optimizer=opt,
        loss_function=loss,
        inferer=SimpleInferer(),
        amp=False,
        post_transform=train_post_transforms,
        key_train_metric={
            "train_acc":
            Accuracy(output_transform=lambda x: (x["pred"], x["label"]))
        },
        train_handlers=train_handlers,
    )
    trainer.run()
    return evaluator.state.best_metric
def _test(metric_device):
    """Distributed multilabel Accuracy check for one metric device.

    Closes over ``rank``, ``device`` and ``idist`` from the enclosing
    scope (not visible here): each process seeds with 10 + rank, updates
    the metric with local tensors, then all-gathers and compares the
    computed value against sklearn's accuracy_score on the full data.
    """
    metric_device = torch.device(metric_device)
    acc = Accuracy(is_multilabel=True, device=metric_device)

    # --- single update -------------------------------------------------
    torch.manual_seed(10 + rank)
    y_pred = torch.randint(0, 2, size=(4, 5, 8, 10), device=device).long()
    y = torch.randint(0, 2, size=(4, 5, 8, 10), device=device).long()
    acc.update((y_pred, y))
    # Internal accumulator must live on the requested metric device.
    assert (
        acc._num_correct.device == metric_device
    ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

    # gather y_pred, y
    y_pred = idist.all_gather(y_pred)
    y = idist.all_gather(y)

    np_y_pred = to_numpy_multilabel(
        y_pred.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
    np_y = to_numpy_multilabel(
        y.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)

    assert acc._type == "multilabel"
    n = acc._num_examples
    res = acc.compute()
    # compute() itself all-reduces the example count across workers.
    assert n * idist.get_world_size() == acc._num_examples
    assert isinstance(res, float)
    assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)

    # --- reset + second update with a different shape ------------------
    acc.reset()
    torch.manual_seed(10 + rank)
    y_pred = torch.randint(0, 2, size=(4, 7, 10, 8), device=device).long()
    y = torch.randint(0, 2, size=(4, 7, 10, 8), device=device).long()
    acc.update((y_pred, y))
    assert (
        acc._num_correct.device == metric_device
    ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

    # gather y_pred, y
    y_pred = idist.all_gather(y_pred)
    y = idist.all_gather(y)

    np_y_pred = to_numpy_multilabel(
        y_pred.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
    np_y = to_numpy_multilabel(
        y.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)

    assert acc._type == "multilabel"
    n = acc._num_examples
    res = acc.compute()
    assert n * idist.get_world_size() == acc._num_examples
    assert isinstance(res, float)
    assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)

    # check that result is not changed
    res = acc.compute()
    assert n * idist.get_world_size() == acc._num_examples
    assert isinstance(res, float)
    assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)

    # Batched Updates
    acc.reset()
    torch.manual_seed(10 + rank)
    y_pred = torch.randint(0, 2, size=(80, 5, 8, 10), device=device).long()
    y = torch.randint(0, 2, size=(80, 5, 8, 10), device=device).long()

    batch_size = 16
    n_iters = y.shape[0] // batch_size + 1
    for i in range(n_iters):
        idx = i * batch_size
        acc.update((y_pred[idx:idx + batch_size], y[idx:idx + batch_size]))
    assert (
        acc._num_correct.device == metric_device
    ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

    # gather y_pred, y
    y_pred = idist.all_gather(y_pred)
    y = idist.all_gather(y)

    np_y_pred = to_numpy_multilabel(
        y_pred.cpu())  # (N, C, L, ...) -> (N * L * ..., C)
    np_y = to_numpy_multilabel(y.cpu())  # (N, C, L, ...) -> (N * L ..., C)

    assert acc._type == "multilabel"
    n = acc._num_examples
    res = acc.compute()
    assert n * idist.get_world_size() == acc._num_examples
    assert isinstance(res, float)
    assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)
def train():
    """Fine-tune a (GPT / GPT-2) double-heads dialogue model from a JSON
    config, with optional apex fp16 and NCCL distributed training.

    Wiring overview: an ignite ``trainer`` runs ``update`` (LM + MC loss,
    gradient accumulation), an ``evaluator`` runs ``inference``; metrics,
    progress bar, TensorBoard logging and checkpointing are attached on
    the main process only (local_rank in [-1, 0]).
    """
    config_file = "configs/train_full_config.json"
    config = Config.from_json_file(config_file)

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", config.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(config))

    # Initialize distributed training if needed
    config.distributed = (config.local_rank != -1)
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        config.device = torch.device("cuda", config.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )
    tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint)
    model_class = GPT2DoubleHeadsModel if "gpt2" in config.model_checkpoint else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(config.model_checkpoint)
    tokenizer.set_special_tokens(SPECIAL_TOKENS)
    model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.to(config.device)
    optimizer = OpenAIAdam(model.parameters(), lr=config.lr)

    # Prepare model for FP16 and distributed training if needed
    # (order is important, distributed should be the last)
    if config.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=config.fp16)
    if config.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[config.local_rank],
                                        output_device=config.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        config, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = tuple(
            input_tensor.to(config.device) for input_tensor in batch)
        lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels,
                                 token_type_ids, token_emotion_ids)
        # Weighted LM + multiple-choice loss, scaled for accumulation.
        loss = (lm_loss * config.lm_coef +
                mc_loss * config.mc_coef) / config.gradient_accumulation_steps
        if config.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           config.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           config.max_norm)
        # Only step the optimizer every gradient_accumulation_steps iters.
        if engine.state.iteration % config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator
    # (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(config.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = batch
            #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            model_outputs = model(input_ids,
                                  mc_token_ids,
                                  token_type_ids=token_type_ids,
                                  token_emotion_ids=token_emotion_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[
                1]  # So we can also use GPT2 outputs
            # Shift logits/labels by one position for next-token prediction.
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted,
                    mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training
    # and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if config.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if config.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between
    # the distributed processes
    if config.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, config.lr),
                                 (config.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
             output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy":
        Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], config),
        "average_accuracy":
        MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and
    # save model, configuration and tokenizer before we start to train
    if config.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=config.log_dir)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(
                             tag="validation",
                             metric_names=list(metrics.keys()),
                             another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" take care of distributed encapsulation

        torch.save(config,
                   tb_logger.writer.log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=config.n_epochs)

    # On the main process: close tensorboard logger and rename the last
    # checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained
    # method)
    if config.local_rank in [-1, 0] and config.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
def test__check_type():
    """A 3-D prediction paired with a 1-D target has no valid
    interpretation, so _check_type must raise."""
    metric = Accuracy()
    bad_pair = (torch.rand([1, 1, 1]), torch.rand([1]))
    with pytest.raises(RuntimeError, match=r"Invalid shapes of y"):
        metric._check_type(bad_pair)
def fit_model(model, train_loader, test_loader, lr, max_epochs=5):
    """Train ``model`` with SGD + BCEWithLogitsLoss via ignite, logging
    train metrics every 10 iterations and validation metrics every 10
    epochs. Returns the (trained in-place) model.
    """
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    criterion = torch.nn.BCEWithLogitsLoss()

    def threshold_output_transform(output):
        # Binarize logits at 0 (heaviside) so Accuracy sees 0/1 predictions.
        y_pred, y = output
        y_pred = torch.heaviside(y_pred, values=torch.zeros(1))
        # print(f'y_pred size : {y_pred.size()}')
        # print(f'y size : {y.size()}')
        return y_pred, y

    def prepare_batch(batch, device, non_blocking):
        # Cast to float and give targets a trailing channel dim to match
        # the model output shape expected by BCEWithLogitsLoss.
        x, y = batch
        x = x.float()
        y = y.float()
        y = torch.unsqueeze(y, 1)
        return (x, y)

    def squeeze_y_dims(output):
        # Identity pass-through kept as a named hook for the Loss metric.
        prediction, target = output
        # print(f'prediction size: {prediction.size()}')
        # print(f'target size: {target.size()}')
        return prediction, target

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        prepare_batch=prepare_batch)
    val_metrics = {
        "accuracy": Accuracy(threshold_output_transform),
        "bce": Loss(criterion, output_transform=squeeze_y_dims)
        # "precision" : Precision(threshold_output_transform, average=False),
        # "recall": Recall(threshold_output_transform, average=False)
    }
    evaluator = create_supervised_evaluator(model,
                                            metrics=val_metrics,
                                            prepare_batch=prepare_batch)

    @trainer.on(Events.ITERATION_COMPLETED(every=10))
    def log_training_loss(trainer):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        print(
            f"Training Results - Epoch: {trainer.state.epoch} Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f}"
        )

    # NOTE(review): decorator below is commented out, so this handler is
    # defined but never attached -- dead code; confirm whether it should
    # be re-enabled or removed.
    # @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        # print(f"Training Results - Epoch: {trainer.state.epoch} Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f} Avg precision : {metrics['precision']:.2f} Avg recall: {metrics['recall']:.2f}")
        print(
            f"Training Results - Epoch: {trainer.state.epoch} Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f}"
        )

    @trainer.on(Events.EPOCH_COMPLETED(every=10))
    def log_validation_results(trainer):
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        # print(f"Validation Results - Epoch: {trainer.state.epoch} Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f} Avg precision : {metrics['precision']:.2f} Avg recall: {metrics['recall']:.2f}")
        print(
            f"Validation Results - Epoch: {trainer.state.epoch} Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f}"
        )

    trainer.run(train_loader, max_epochs=max_epochs)
    return model
def test_binary_wrong_inputs():
    """Accuracy.update must reject non-binary values and mismatched
    y/y_pred shapes in binary mode."""
    acc = Accuracy()

    def _binary(*shape):
        # Random long tensor of 0s and 1s with the given shape.
        return torch.randint(0, 2, size=shape).long()

    # y holds class indices 0..9, not only 0/1 values
    with pytest.raises(
            ValueError,
            match=r"For binary cases, y must be comprised of 0's and 1's"):
        acc.update((_binary(10), torch.arange(0, 10).long()))

    # y_pred holds raw probabilities, not thresholded 0/1 values
    with pytest.raises(
            ValueError,
            match=r"For binary cases, y_pred must be comprised of 0's and 1's"
    ):
        acc.update((torch.rand(10, ), _binary(10)))

    # three incompatible-shape combinations, all rejected the same way
    mismatched_pairs = (
        (_binary(10), _binary(10, 5)),
        (_binary(10, 5, 6), _binary(10)),
        (_binary(10), _binary(10, 5, 6)),
    )
    for wrong_pred, wrong_true in mismatched_pairs:
        with pytest.raises(ValueError, match=r"y must have shape of "):
            acc.update((wrong_pred, wrong_true))
create_dataloader, imdb_dataset(directory='../data/', train=True, test=True)) model = Classifier( WordRNN(256, embeddings, bidirectional=True, merge_bi='cat', packed_sequence=True, attention=True, device=DEVICE), 512, 3) optimizer = Adam([p for p in model.parameters() if p.requires_grad], lr=1e-3) criterion = nn.CrossEntropyLoss() metrics = {'accuracy': Accuracy(), 'loss': Loss(criterion)} trainer = SequentialTrainer( model, optimizer, checkpoint_dir='../checkpoints' if not DEBUG else None, metrics=metrics, non_blocking=True, retain_graph=True, patience=5, loss_fn=criterion, device=DEVICE) if DEBUG: log.info('Starting end to end test') print('--------------------------------------------------------------') trainer.fit_debug(train_loader, dev_loader)
def _test_distrib_on_metric(device):
    """Check RunningAverage(Accuracy) in a distributed run against a
    manually maintained exponential moving average.

    Each rank feeds its own slice of pre-generated data; the manual
    reference recomputes batch accuracy over ALL ranks' data for the
    current iteration, so it matches the all-reduced metric value.
    Handler order matters: ``manual_running_avg_acc`` must be registered
    before ``assert_equal_running_avg_acc_values``.
    """
    import torch.distributed as dist

    rank = dist.get_rank()
    n_iters = 10
    n_epochs = 3
    batch_size = 10
    n_classes = 10
    data = list(range(n_iters))
    # Same seed on every rank: all ranks hold the full per-rank data cube
    # and slice out their own portion below.
    np.random.seed(12)
    all_y_true_batch_values = np.random.randint(0, n_classes,
                                                size=(dist.get_world_size(),
                                                      n_epochs * n_iters,
                                                      batch_size))
    all_y_pred_batch_values = np.random.rand(dist.get_world_size(),
                                             n_epochs * n_iters, batch_size,
                                             n_classes)

    y_true_batch_values = iter(all_y_true_batch_values[rank, ...])
    y_pred_batch_values = iter(all_y_pred_batch_values[rank, ...])

    def update_fn(engine, batch):
        y_true_batch = next(y_true_batch_values)
        y_pred_batch = next(y_pred_batch_values)
        return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)

    trainer = Engine(update_fn)
    alpha = 0.98
    # epoch_bound=False: the running average carries over epoch boundaries.
    acc_metric = RunningAverage(
        Accuracy(output_transform=lambda x: [x[0], x[1]], device=device),
        alpha=alpha,
        epoch_bound=False,
    )
    acc_metric.attach(trainer, "running_avg_accuracy")

    # One-element list so the nested handler can mutate it.
    running_avg_acc = [
        None,
    ]
    true_acc_metric = Accuracy(device=device)

    @trainer.on(Events.ITERATION_COMPLETED)
    def manual_running_avg_acc(engine):
        i = engine.state.iteration - 1
        true_acc_metric.reset()
        # Accumulate this iteration's batches from every rank so the
        # reference accuracy matches the distributed metric's reduction.
        for j in range(dist.get_world_size()):
            output = (
                torch.from_numpy(all_y_pred_batch_values[j, i, :, :]),
                torch.from_numpy(all_y_true_batch_values[j, i, :]),
            )
            true_acc_metric.update(output)
        batch_acc = true_acc_metric._num_correct * 1.0 / true_acc_metric._num_examples
        # Exponential moving average: v = v * alpha + (1 - alpha) * x.
        if running_avg_acc[0] is None:
            running_avg_acc[0] = batch_acc
        else:
            running_avg_acc[0] = running_avg_acc[0] * alpha + (
                1.0 - alpha) * batch_acc
        engine.state.running_avg_acc = running_avg_acc[0]

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_acc_values(engine):
        assert (engine.state.running_avg_acc == engine.state.
                metrics["running_avg_accuracy"]), "{} vs {}".format(
                    engine.state.running_avg_acc,
                    engine.state.metrics["running_avg_accuracy"])

    trainer.run(data, max_epochs=3)