"emb_vecs": [emb_vec1, emb_vec2], "cls_pred": cls_pred, "cls_true": cls_true, "targets": targets } return ret # ---------------------------------------- if __name__ == "__main__": engine = Engine(_update) metrics = { "sim_acc": SiameseNetSimilarityAccuracy(margin=margin), "clsf_acc": Accuracy(output_transform=lambda x: (x['cls_pred'], x['cls_true'])) } for name, metric in metrics.items(): metric.attach(engine, name) from ignite.contrib.handlers import ProgressBar pbar = ProgressBar() pbar.attach(engine, output_transform=lambda x: { 'con_loss': x['con_loss'], 'clsf_loss': x['clsf_loss'] }) from ignite.engine import Events # @engine.on(Events.ITERATION_COMPLETED)
def train():
    """Fine-tune the KoGPT2 double-head model with an Ignite trainer/evaluator pair.

    Command-line options are parsed with ``ArgumentParser``.  The function
    builds the tokenizer and model, registers ``SPECIAL_TOKENS``, creates the
    optimizer, optionally enables apex fp16 and ``DistributedDataParallel``,
    wires up evaluation, LR scheduling, metrics, TensorBoard logging and
    checkpointing, and finally runs training for ``--n_epochs`` epochs.

    Returns:
        None.  Checkpoints, the model config, the vocabulary and the training
        arguments are written into the TensorBoard log directory on the main
        process (``local_rank`` of -1 or 0).
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="data/korean/",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="gpt2",
                        help="Path, url or short name of the model")
    parser.add_argument("--model_version",
                        type=str,
                        default='v4',
                        help="version of model")
    parser.add_argument("--num_candidates",
                        type=int,
                        default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history",
                        type=int,
                        default=30,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=1,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=1,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--lm_coef",
                        type=float,
                        default=1.0,
                        help="LM loss coefficient")
    parser.add_argument("--mc_coef",
                        type=float,
                        default=1.0,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=5,
                        help="Number of training epochs")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    # This is a logger.warning: it will be printed by all distributed processes
    logger.warning("Running process %d", args.local_rank)
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )
    torch.manual_seed(42)

    def get_kogpt2_tokenizer(model_path=None):
        """Load the GPT-2 tokenizer, defaulting to the 'taeminlee/kogpt2' checkpoint."""
        if not model_path:
            model_path = 'taeminlee/kogpt2'
        tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        return tokenizer

    tokenizer = get_kogpt2_tokenizer()
    optimizer_class = AdamW
    model = get_kogpt2_model()
    model.to(args.device)

    # Register the special tokens and grow the embedding matrix accordingly.
    SPECIAL_TOKENS_DICT = {'additional_special_tokens': SPECIAL_TOKENS}
    print("SPECIAL TOKENS")
    print(SPECIAL_TOKENS)
    tokenizer.add_special_tokens(SPECIAL_TOKENS_DICT)
    for value in SPECIAL_TOKENS:
        logger.info("Assigning %s to the %s key of the tokenizer", value,
                    value)
        setattr(tokenizer, value, value)
    model.resize_token_embeddings(len(tokenizer))

    # The optimizer must be created AFTER the embeddings are resized: resizing
    # may replace the embedding parameter with a new tensor, and an optimizer
    # built earlier would keep (and update) only the stale one.
    optimizer = optimizer_class(model.parameters(), lr=args.lr)

    # Sanity output: round-trip the act/slot names through the tokenizer.
    s = ' '.join(act_name) + ' '.join(slot_name)
    print(tokenizer.decode(tokenizer.encode(s)))
    print(len(act_name) + len(slot_name), len(tokenizer.encode(s)))

    # Prepare model for FP16 and distributed training if needed
    # (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        """Run one training iteration and return the (accumulation-scaled) loss."""
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        lm_loss, mc_loss, *_ = model(*batch)
        loss = (lm_loss * args.lm_coef +
                mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        # Only step every `gradient_accumulation_steps` iterations.
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)
    trainer.logger.setLevel(logging.INFO)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        """Run one evaluation iteration; returns ((lm_logits, mc_logits), (lm_labels, mc_labels))."""
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            model_outputs = model(input_ids,
                                  mc_token_ids,
                                  token_type_ids=token_type_ids)
            # So we can also use GPT2 outputs
            lm_logits, mc_logits = model_outputs[0], model_outputs[1]
            # Shift so that position t's logits are scored against token t+1.
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted,
                    mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)
    evaluator.logger.setLevel(logging.INFO)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
             output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy":
        Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args),
        "average_accuracy":
        MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save
    # model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=None)
        tb_logger.writer.log_dir = tb_logger.writer.file_writer.get_logdir()
        log_dir = tb_logger.writer.log_dir
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        # Validation metrics are logged against the trainer's global step.
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag="validation",
            metric_names=list(metrics.keys()),
            global_step_transform=global_step_from_engine(trainer))

        checkpoint_handler = ModelCheckpoint(log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        # "getattr" takes care of distributed encapsulation
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)})

        torch.save(args, os.path.join(log_dir, 'model_training_args.bin'))
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: rename the last checkpoint (for easy re-loading with
    # the from_pretrained method) and close the tensorboard logger.
    if args.local_rank in [-1, 0]:
        if args.n_epochs > 0:
            # TODO: PR in ignite to have better access to saved file paths (cleaner)
            os.rename(checkpoint_handler._saved[-1][1][-1],
                      os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME))
        # Close the logger even when no epoch ran, so the writer is always released.
        tb_logger.close()
def _test_distrib_on_metric(device):
    """Check ``RunningAverage(Accuracy)`` against a hand-computed running average.

    Every process generates the same seeded random predictions/targets for all
    ranks, feeds its own slice through the engine, and after each iteration
    compares the metric value with an accuracy accumulated manually over the
    data of all ranks.
    """
    import torch.distributed as dist

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    n_iters = 10
    n_epochs = 3
    batch_size = 10
    n_classes = 10
    total_steps = n_epochs * n_iters

    data = list(range(n_iters))

    # Same seed on every rank, so each process holds the data of all ranks.
    np.random.seed(12)
    all_targets = np.random.randint(0,
                                    n_classes,
                                    size=(world_size, total_steps, batch_size))
    all_preds = np.random.rand(world_size, total_steps, batch_size, n_classes)

    # Stream of (prediction, target) batches belonging to this rank only.
    local_batches = zip(all_preds[rank], all_targets[rank])

    def update_fn(engine, batch):
        pred_batch, target_batch = next(local_batches)
        return torch.from_numpy(pred_batch), torch.from_numpy(target_batch)

    trainer = Engine(update_fn)
    alpha = 0.98

    wrapped_accuracy = Accuracy(output_transform=lambda x: [x[0], x[1]],
                                device=device)
    RunningAverage(wrapped_accuracy, alpha=alpha,
                   epoch_bound=False).attach(trainer, 'running_avg_accuracy')

    reference_acc = Accuracy(device=device)
    expected_avg = None

    def manual_running_avg_acc(engine):
        # Recompute the accuracy over every rank's batch for this iteration
        # and fold it into the exponentially weighted reference value.
        nonlocal expected_avg
        step = engine.state.iteration - 1
        reference_acc.reset()
        for proc in range(world_size):
            reference_acc.update(
                (torch.from_numpy(all_preds[proc, step, :, :]),
                 torch.from_numpy(all_targets[proc, step, :])))
        batch_acc = reference_acc._num_correct * 1.0 / reference_acc._num_examples
        if expected_avg is None:
            expected_avg = batch_acc
        else:
            expected_avg = expected_avg * alpha + (1.0 - alpha) * batch_acc
        engine.state.running_avg_acc = expected_avg

    def assert_equal_running_avg_acc_values(engine):
        expected = engine.state.running_avg_acc
        actual = engine.state.metrics['running_avg_accuracy']
        assert expected == actual, "{} vs {}".format(expected, actual)

    # Registration order matters: the reference value must be refreshed
    # before it is compared with the metric.
    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              manual_running_avg_acc)
    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              assert_equal_running_avg_acc_values)

    trainer.run(data, max_epochs=n_epochs)
def main():
    """Evaluate a 3D DenseNet121 classifier on ten IXI T1 volumes with Ignite.

    Builds the image dataset, restores the network weights from
    ``./runs_array/net_checkpoint_20.pt``, runs a supervised evaluator that
    computes accuracy and saves the predicted classes to ``tempdir``, then
    prints the final engine state.
    """
    monai.config.print_config()
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # IXI dataset as a demo, downloadable from https://brain-development.org/ixi-dataset/
    data_dir = ("workspace", "data", "medical", "ixi", "IXI-T1")
    file_names = (
        "IXI607-Guys-1097-T1.nii.gz",
        "IXI175-HH-1570-T1.nii.gz",
        "IXI385-HH-2078-T1.nii.gz",
        "IXI344-Guys-0905-T1.nii.gz",
        "IXI409-Guys-0960-T1.nii.gz",
        "IXI584-Guys-1129-T1.nii.gz",
        "IXI253-HH-1694-T1.nii.gz",
        "IXI092-HH-1436-T1.nii.gz",
        "IXI574-IOP-1156-T1.nii.gz",
        "IXI585-Guys-1130-T1.nii.gz",
    )
    images = [os.sep.join([*data_dir, file_name]) for file_name in file_names]

    # One binary label per image (gender classification: man / woman).
    labels = np.array([0, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=np.int64)

    # Image transforms and the dataset built on top of them.
    val_transforms = Compose(
        [ScaleIntensity(),
         AddChannel(),
         Resize((96, 96, 96)),
         ToTensor()])
    val_ds = ImageDataset(image_files=images,
                          labels=labels,
                          transform=val_transforms,
                          image_only=False)

    # DenseNet121 on GPU when available, otherwise on CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = monai.networks.nets.densenet.densenet121(spatial_dims=3,
                                                   in_channels=1,
                                                   out_channels=2)
    net = net.to(device)

    # Evaluation metric attached to the evaluator engine.
    val_metrics = {"Accuracy": Accuracy()}

    def image_and_label(batch, device=None, non_blocking=False):
        # Only the first two batch items go to the model side.
        return _prepare_batch((batch[0], batch[1]), device, non_blocking)

    # Ignite evaluator expects batch=(img, label) and returns output=(y_pred, y)
    # at every iteration; an output_transform can be added to return other values.
    evaluator = create_supervised_evaluator(net,
                                            val_metrics,
                                            device,
                                            True,
                                            prepare_batch=image_and_label)

    def no_iteration_output(_output):
        # No need to print a loss value, so per-iteration output is disabled.
        return None

    StatsHandler(name="evaluator",
                 output_transform=no_iteration_output).attach(evaluator)

    def batch_meta_data(batch):
        # For the array data format, the 3rd item of the batch is taken as the meta data.
        return batch[2]

    def predicted_class(output):
        return output[0].argmax(1)

    ClassificationSaver(
        output_dir="tempdir",
        batch_transform=batch_meta_data,
        output_transform=predicted_class,
    ).attach(evaluator)

    # The model was trained by the "densenet_training_array" example.
    checkpoint_loader = CheckpointLoader(
        load_path="./runs_array/net_checkpoint_20.pt", load_dict={"net": net})
    checkpoint_loader.attach(evaluator)

    # Validation data loader and the evaluation run itself.
    val_loader = DataLoader(val_ds,
                            batch_size=2,
                            num_workers=4,
                            pin_memory=torch.cuda.is_available())
    print(evaluator.run(val_loader))
def training(local_rank, config):
    """Train and periodically validate a model on one ``idist`` process.

    Args:
        local_rank: Local rank of this process; used for the logger and for
            the CUDA device-name lookup.
        config: Mutable configuration mapping.  The function reads the
            training options from it and writes back ``output_path`` (rank 0),
            ``cuda device name`` (rank 0, CUDA only) and
            ``num_iters_per_epoch``.
    """
    rank = idist.get_rank()
    is_main_process = rank == 0
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)
    log_basic_info(logger, config)

    output_path = config["output_path"]
    if is_main_process:
        # Per-run output folder: timestamped, or tagged with the stop iteration.
        run_tag = (f"stop-on-{config['stop_iteration']}"
                   if config["stop_iteration"] is not None else
                   datetime.now().strftime("%Y%m%d-%H%M%S"))
        run_folder = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{run_tag}"
        output_path = Path(output_path) / run_folder
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            logged_keys = (
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            )
            task.connect({key: config[key] for key in logged_keys})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)
    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Evaluator engines perform the model's validation and compute metrics.
    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    # Two evaluators, as they won't have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        train_state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, train_state.times["COMPLETED"], "Train",
                    train_state.metrics)
        test_state = evaluator.run(test_loader)
        log_metrics(logger, epoch, test_state.times["COMPLETED"], "Test",
                    test_state.metrics)

    validation_events = (
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED)
    trainer.add_event_handler(validation_events, run_validation)

    if is_main_process:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        tb_logger = common.setup_tb_logging(
            output_path,
            trainer,
            optimizer,
            evaluators={"training": train_evaluator, "test": evaluator},
        )

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="test",
    )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        def _stop_training():
            logger.info(
                f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

        trainer.add_event_handler(
            Events.ITERATION_STARTED(once=config["stop_iteration"]),
            _stop_training)

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        # Best effort: report the failure on stdout and fall through to cleanup.
        import traceback

        print(traceback.format_exc())

    if is_main_process:
        tb_logger.close()
def _str2bool(value):
    """Convert a command-line token such as ``true``/``false`` or ``1``/``0`` to a bool.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy.  An unrecognised token raises
    ``ValueError``, which argparse reports as a normal usage error.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise ValueError("expected a boolean value, got %r" % (value,))


def train():
    """Fine-tune a GPT / GPT-2 double-heads model with an Ignite trainer/evaluator pair.

    Command-line options are parsed with ``ArgumentParser``.  The function
    loads the tokenizer and model from ``--model_checkpoint``, adds the special
    tokens, creates the optimizer, optionally enables apex fp16, wires up
    evaluation, LR scheduling, metrics, TensorBoard logging and checkpointing,
    and finally runs training for ``--n_epochs`` epochs.

    Returns:
        None.  On the main process (``local_rank`` of -1 or 0) checkpoints, the
        model config, the tokenizer and the training arguments are written to
        the directory returned by ``make_logdir``.
    """
    parser = ArgumentParser()
    # Boolean flags go through _str2bool so that "False"/"0" really disable them.
    parser.add_argument("--run_data", type=_str2bool, default=False)
    parser.add_argument("--eval_freq", type=int, default=200000)
    parser.add_argument("--save_freq", type=int, default=2000)
    parser.add_argument("--data_nuggets", type=str, default=my_dataset)
    parser.add_argument("--dataset_path", type=str, default=my_dataset + 'json.txt',
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default=my_dataset + 'cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="gpt2",
                        help="Path, url or short name of the model")
    parser.add_argument("--model", type=str, default="gpt2")
    parser.add_argument("--eval_before_start", type=_str2bool, default=False,
                        help="If true start with a first evaluation before training")
    parser.add_argument("--num_candidates", type=int, default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=1,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=2,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=2,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=2.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=50, help="Number of training epochs")
    parser.add_argument("--personality_permutations", type=int, default=1,
                        help="Number of permutations of personality sentences")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="",
                        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    # This is a logger.warning: it will be printed by all distributed processes
    logger.warning("Running process %d", args.local_rank)
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    # cant use Autotokenizer because checkpoint could be a Path
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed
    # (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    # NOTE(review): wrapping the model in DistributedDataParallel / DataParallel
    # is disabled in the original code, so the model is used unwrapped even
    # when --local_rank is set.

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        """Run one training iteration and return the (accumulation-scaled) loss."""
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        (lm_loss), (mc_loss), *_ = model(
            input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
            mc_labels=mc_labels, lm_labels=lm_labels
        )
        loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        # Only step every `gradient_accumulation_steps` iterations.
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        """Run one evaluation iteration; returns ((lm_logits, mc_logits), (lm_labels, mc_labels))."""
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we dont send labels to model, it doesnt return losses
            lm_logits, mc_logits, *_ = model(
                input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
            )
            # Shift so that position t's logits are scored against token t+1.
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED,
                                    lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
                           output_transform=lambda x: (x[0][0], x[1][0])),
               "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
                    "average_accuracy": MetricsLambda(average_distributed_scalar,
                                                      metrics["accuracy"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save
    # model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED,
            lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=100)
        # "getattr" takes care of distributed encapsulation
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})

        torch.save(args, os.path.join(log_dir, 'model_training_args.bin'))
        getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: rename the last checkpoint (for easy re-loading with
    # the from_pretrained method) and close the tensorboard logger.
    if args.local_rank in [-1, 0]:
        if args.n_epochs > 0:
            # TODO: PR in ignite to have better access to saved file paths (cleaner)
            os.rename(os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
                      os.path.join(log_dir, WEIGHTS_NAME))
        # Close the logger even when no epoch ran, so the writer is always released.
        tb_logger.close()
def main(dataset_path, batch_size=256, max_epochs=10):
    """Train a wide ResNet-50-2 with native mixed precision and report final metrics.

    Args:
        dataset_path: Location handed to ``get_train_eval_loaders``.
        batch_size: Batch size forwarded to ``get_train_eval_loaders``.
        max_epochs: Number of epochs the trainer runs.

    Once training completes, the mean epoch time is printed and accuracy/loss
    are evaluated on the train-evaluation and test loaders.
    """
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    device = "cuda"

    loaders = get_train_eval_loaders(dataset_path, batch_size=batch_size)
    train_loader, test_loader, eval_train_loader = loaders

    model = wide_resnet50_2(num_classes=100).to(device)
    optimizer = SGD(model.parameters(), lr=0.01)
    criterion = CrossEntropyLoss().to(device)

    scaler = GradScaler()

    def train_step(engine, batch):
        inputs = convert_tensor(batch[0], device, non_blocking=True)
        targets = convert_tensor(batch[1], device, non_blocking=True)

        optimizer.zero_grad()

        # Forward pass runs under autocast.
        with autocast():
            outputs = model(inputs)
            batch_loss = criterion(outputs, targets)

        # Scale the loss and back-propagate outside the autocast region:
        # backward passes under autocast are not recommended, and backward ops
        # run in the same precision autocast used for the matching forward ops.
        scaler.scale(batch_loss).backward()

        # scaler.step() first unscales the gradients of the optimizer's params;
        # optimizer.step() is then called unless they contain infs or NaNs.
        scaler.step(optimizer)

        # Update the scale for the next iteration.
        scaler.update()

        return batch_loss.item()

    trainer = Engine(train_step)

    timer = Timer(average=True)
    timer.attach(trainer, step=Events.EPOCH_COMPLETED)

    progress_bar = ProgressBar(persist=True)
    progress_bar.attach(trainer, output_transform=lambda out: {"batch loss": out})

    metrics = {"Accuracy": Accuracy(), "Loss": Loss(criterion)}

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)

    def log_metrics(engine, title):
        for metric_name in metrics:
            print(f"\t{title} {metric_name}: {engine.state.metrics[metric_name]:.2f}")

    def run_validation(_):
        print(f"- Mean elapsed time for 1 epoch: {timer.value()}")
        print("- Metrics:")
        # The logging handler is attached only for the duration of each run.
        for title, loader in (("Train", eval_train_loader), ("Test", test_loader)):
            with evaluator.add_event_handler(Events.COMPLETED, log_metrics, title):
                evaluator.run(loader)

    trainer.add_event_handler(Events.COMPLETED, run_validation)

    trainer.run(train_loader, max_epochs=max_epochs)
def _is_prunable(layer):
    """Return True for the layer types that carry a weight mask (Linear, Conv2d)."""
    return isinstance(layer, (nn.Linear, nn.Conv2d))


def _patch_masked_forward(layer):
    """Rebind ``layer.forward`` to the mask-aware forward for its layer type."""
    if isinstance(layer, nn.Linear):
        layer.forward = types.MethodType(mask_forward_linear, layer)
    if isinstance(layer, nn.Conv2d):
        layer.forward = types.MethodType(mask_forward_conv2d, layer)


def _count_active_mask(net, threshold):
    """Return (mask entries > ``threshold``, total weights) over prunable layers."""
    active, total = 0, 0
    for layer in net.modules():
        if _is_prunable(layer):
            active += (layer.weight_mask.data > threshold).sum()
            total += layer.weight.numel()
    return active, total


def main():
    """Learn a pruning mask for a pretrained classifier, prune it and fine-tune.

    Reads all settings from the module-level ``args``. For each of
    ``args.round`` rounds it:

    1. loads the base classifier from ``args.savedir`` and logs its accuracy;
    2. trains a per-weight mask on a deep copy until the number of mask
       entries above ``args.threshold`` drops to ``keep_ratio`` of the
       weights (at most 300 epochs);
    3. keeps the top ``keep_ratio`` fraction of mask magnitudes, applies the
       resulting binary masks, and fine-tunes the base classifier with Ignite,
       saving the best checkpoint to ``args.outdir``.

    The best fine-tuned accuracy of every round is logged at the end.
    """
    # exist_ok avoids the check-then-create race and handles nested paths.
    os.makedirs(args.outdir, exist_ok=True)

    device = torch.device("cuda")
    torch.cuda.set_device(args.gpu)

    logfilename = os.path.join(args.outdir, args.logname)

    log(logfilename, "Hyperparameter List")
    log(logfilename, "Epochs: {:}".format(args.epochs))
    log(logfilename, "Learning Rate: {:}".format(args.lr))
    log(logfilename, "Alpha: {:}".format(args.alpha))
    log(logfilename, "Keep ratio: {:}".format(args.keep_ratio))

    test_acc_list = []
    for _ in range(args.round):
        train_dataset = get_dataset(args.dataset, 'train')
        test_dataset = get_dataset(args.dataset, 'test')
        pin_memory = (args.dataset == "imagenet")
        train_loader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=args.batch,
                                  num_workers=args.workers,
                                  pin_memory=pin_memory)
        test_loader = DataLoader(test_dataset,
                                 shuffle=False,
                                 batch_size=args.batch,
                                 num_workers=args.workers,
                                 pin_memory=pin_memory)

        # Loading the base_classifier
        base_classifier = get_architecture(args.arch, args.dataset, device)
        checkpoint = torch.load(args.savedir)
        base_classifier.load_state_dict(checkpoint['state_dict'])
        base_classifier.eval()
        print("Loaded the base_classifier")

        original_acc = model_inference(base_classifier,
                                       test_loader,
                                       device,
                                       display=True)
        log(logfilename,
            "Original Model Test Accuracy: {:.5}".format(original_acc))
        print("Original Model Test Accuracy, ", original_acc)

        # Work on a fresh copy so the original network is not affected.
        net = copy.deepcopy(base_classifier)
        net = net.to(device)

        # Attach a trainable mask 'm' (initialised to ones) to every prunable
        # layer and monkey-patch forward so the layer uses both 'w' and 'm'.
        for layer in net.modules():
            if _is_prunable(layer):
                layer.weight_mask = nn.Parameter(torch.ones_like(layer.weight))
                layer.weight.requires_grad = True
                layer.weight_mask.requires_grad = True
            _patch_masked_forward(layer)

        # NLLLoss is used because the architectures end with a LogSoftmax.
        criterion = nn.NLLLoss().to(device)
        # weight_decay = 0 while training the mask.
        optimizer = SGD(net.parameters(),
                        lr=args.lr,
                        momentum=args.momentum,
                        weight_decay=0)

        sparsity, total = _count_active_mask(net, args.threshold)

        # Train the mask on the training set. The loop is capped at 300 epochs
        # in case the mask never reaches the target sparsity.
        for epoch in range(300):
            if epoch % 5 == 0:
                print("Current epochs: ", epoch)
                print("Sparsity: {:}".format(sparsity))
            mask_train(train_loader,
                       net,
                       criterion,
                       optimizer,
                       epoch,
                       device,
                       alpha=args.alpha,
                       display=False)
            acc = model_inference(net, test_loader, device, display=False)
            log(logfilename,
                "Epoch {:}, Mask Update Test Acc: {:.5}".format(epoch, acc))

            sparsity, total = _count_active_mask(net, args.threshold)
            if sparsity <= total * args.keep_ratio:
                print("Current epochs breaking loop at {:}".format(epoch))
                break

        mask_update_acc = model_inference(net,
                                          test_loader,
                                          device,
                                          display=True)
        log(logfilename,
            "Mask Update Test Accuracy: {:.5}".format(mask_update_acc))

        # Compute the magnitude threshold that keeps exactly keep_ratio of the
        # mask entries (the k-th largest absolute mask value).
        c_abs = [
            torch.abs(layer.weight_mask) for layer in net.modules()
            if _is_prunable(layer)
        ]

        all_scores = torch.cat([torch.flatten(x) for x in c_abs])
        num_params_to_keep = int(len(all_scores) * args.keep_ratio)
        threshold, _ = torch.topk(all_scores, num_params_to_keep, sorted=True)
        threshold = threshold[-1]

        keep_masks = [(c >= threshold).float() for c in c_abs]
        print(
            "Number of ones.",
            torch.sum(torch.cat([torch.flatten(x == 1) for x in keep_masks])))

        # Fold the learned mask into the weights, then binarise the mask.
        for layer in net.modules():
            if _is_prunable(layer):
                # Elementwise product of weight 'w' and mask 'm'.
                layer.weight.data = layer.weight.data * layer.weight_mask.data

                layer.zeros = nn.Parameter(torch.zeros_like(
                    layer.weight))  # Dummy parameter.
                layer.ones = nn.Parameter(torch.ones_like(
                    layer.weight))  # Dummy parameter.
                # Strict '<' so entries equal to the threshold are kept here
                # exactly as they are in keep_masks (which uses '>=').
                layer.weight_mask.data = torch.where(
                    torch.abs(layer.weight_mask) < threshold, layer.zeros,
                    layer.ones)

                # Temporarily disable backprop for both 'w' and 'm'.
                layer.weight.requires_grad = False
                layer.weight_mask.requires_grad = False
            _patch_masked_forward(layer)

        weight_update_acc = model_inference(net,
                                            test_loader,
                                            device,
                                            display=True)
        log(logfilename,
            "Weight Update Test Accuracy: {:.5}".format(weight_update_acc))

        # Fraction of weights kept by the binary mask (logged as "Sparsity").
        remain = 0
        total = 0
        for layer in net.modules():
            if _is_prunable(layer):
                # L1 norm of an all-ones tensor == number of parameters.
                total += torch.norm(torch.ones_like(layer.weight), p=1)
                # L1 norm of the 0/1 mask == number of ones in the mask.
                remain += torch.norm(layer.weight_mask.data, p=1)

                # Only the weight 'w' stays trainable for fine-tuning.
                layer.zeros.requires_grad = False
                layer.ones.requires_grad = False
                layer.weight_mask.requires_grad = False
                layer.weight.requires_grad = True
            _patch_masked_forward(layer)

        log(logfilename, "Sparsity: {:.3}".format(remain / total))
        print("Sparsity: ", remain / total)

        # --------------------------------
        # Transfer the weights learned in "net" back to "base_classifier".
        for (layer1, layer2) in zip(base_classifier.modules(), net.modules()):
            if _is_prunable(layer1) or _is_prunable(layer2):
                layer1.weight.data = layer2.weight.data
                if layer1.bias is not None:
                    layer1.bias.data = layer2.bias.data
                    layer1.bias.requires_grad = True

                layer1.weight.requires_grad = True

        # Applying the mask to the base_classifier.
        apply_prune_mask(base_classifier, keep_masks)
        # --------------------------------

        optimizer = SGD(base_classifier.parameters(),
                        lr=1e-3,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay)
        loss = nn.NLLLoss()
        scheduler = StepLR(optimizer,
                           step_size=args.lr_step_size,
                           gamma=args.gamma)
        test_acc = []

        # Fine-tuning via ignite
        trainer = create_supervised_trainer(base_classifier, optimizer,
                                            nn.NLLLoss(), device)
        evaluator = create_supervised_evaluator(base_classifier, {
            'accuracy': Accuracy(),
            'nll': Loss(loss)
        }, device)

        pbar = ProgressBar()
        pbar.attach(trainer)

        @trainer.on(Events.ITERATION_COMPLETED)
        def log_training_loss(engine):
            """Log the batch loss every ``args.print_freq`` iterations."""
            iter_in_epoch = (engine.state.iteration -
                             1) % len(train_loader) + 1
            if engine.state.iteration % args.print_freq == 0:
                pbar.log_message("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
                                 "".format(engine.state.epoch, iter_in_epoch,
                                           len(train_loader),
                                           engine.state.output))

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_epoch(engine):
            """Step the LR schedule, validate, and checkpoint on a new best."""
            scheduler.step()
            evaluator.run(test_loader)

            metrics = evaluator.state.metrics
            avg_accuracy = metrics['accuracy']
            avg_nll = metrics['nll']

            pbar.log_message(
                "Validation Results - Epoch: {} Avg accuracy: {:.5f} Avg loss: {:.3f}"
                .format(engine.state.epoch, avg_accuracy, avg_nll))

            log(
                logfilename,
                "Validation - Epoch: {} Avg accuracy: {:.5f} Avg loss: {:.3f}"
                .format(engine.state.epoch, avg_accuracy, avg_nll))

            test_acc.append(avg_accuracy)

            # The current value is already in test_acc, so this is true
            # whenever it ties or beats every earlier epoch.
            if avg_accuracy >= max(test_acc):
                print("Saving the model at Epoch {:}".format(
                    engine.state.epoch))
                torch.save(
                    {
                        'arch': args.arch,
                        'state_dict': base_classifier.state_dict(),
                        'optimizer': optimizer.state_dict(),
                    }, os.path.join(args.outdir, 'checkpoint.pth.tar'))

            if engine.state.epoch == args.epochs:
                test_acc_list.append(max(test_acc))
                log(logfilename,
                    "Finetuned Test Accuracy: {:.5f}".format(max(test_acc)))
                print("Finetuned Test Accuracy: ", max(test_acc))

        trainer.run(train_loader, args.epochs)

    log(logfilename, "This is the test accuracy list for args.round.")
    log(logfilename, str(test_acc_list))
def main():
    """Train a 3D DenseNet121 classifier on a small IXI subset with Ignite.

    Builds the transforms and datasets, an Ignite trainer with checkpointing,
    console and TensorBoard stats handlers, plus an evaluator that runs every
    ``validation_every_n_epochs`` epochs and drives early stopping on accuracy.
    """
    monai.config.print_config()
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # IXI dataset as a demo, downloadable from https://brain-development.org/ixi-dataset/
    images = [
        "/workspace/data/medical/ixi/IXI-T1/IXI314-IOP-0889-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI249-Guys-1072-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI609-HH-2600-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI173-HH-1590-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI020-Guys-0700-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI342-Guys-0909-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI134-Guys-0780-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI577-HH-2661-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI066-Guys-0731-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI130-HH-1528-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI607-Guys-1097-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI175-HH-1570-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI385-HH-2078-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI344-Guys-0905-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI409-Guys-0960-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI584-Guys-1129-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI253-HH-1694-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI092-HH-1436-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI574-IOP-1156-T1.nii.gz",
        "/workspace/data/medical/ixi/IXI-T1/IXI585-Guys-1130-T1.nii.gz",
    ]
    # 2 binary labels for gender classification: man and woman
    labels = np.array(
        [0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0])

    # define transforms
    train_transforms = Compose([
        ScaleIntensity(),
        AddChannel(),
        Resize((96, 96, 96)),
        RandRotate90(),
        ToTensor()
    ])
    val_transforms = Compose(
        [ScaleIntensity(),
         AddChannel(),
         Resize((96, 96, 96)),
         ToTensor()])

    # define nifti dataset, data loader; load one batch as a sanity check
    check_ds = NiftiDataset(image_files=images,
                            labels=labels,
                            transform=train_transforms)
    check_loader = DataLoader(check_ds,
                              batch_size=2,
                              num_workers=2,
                              pin_memory=torch.cuda.is_available())
    im, label = monai.utils.misc.first(check_loader)
    print(type(im), im.shape, label)

    # Fall back to CPU when CUDA is unavailable instead of failing on the
    # hard-coded "cuda:0"; the data loaders already branch on availability.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # create DenseNet121, CrossEntropyLoss and Adam optimizer
    net = monai.networks.nets.densenet.densenet121(
        spatial_dims=3,
        in_channels=1,
        out_channels=2,
    )
    # Move the model before building the optimizer: recent Ignite versions use
    # `device` only for batches and do not move the model themselves.
    net = net.to(device)
    loss = torch.nn.CrossEntropyLoss()
    lr = 1e-5
    opt = torch.optim.Adam(net.parameters(), lr)

    # Ignite trainer expects batch=(img, label) and returns output=loss at every iteration,
    # user can add output_transform to return other values, like: y_pred, y, etc.
    trainer = create_supervised_trainer(net, opt, loss, device, False)

    # adding checkpoint handler to save models (network params and optimizer stats) during training
    checkpoint_handler = ModelCheckpoint("./runs/",
                                         "net",
                                         n_saved=10,
                                         require_empty=False)
    trainer.add_event_handler(event_name=Events.EPOCH_COMPLETED,
                              handler=checkpoint_handler,
                              to_save={
                                  "net": net,
                                  "opt": opt
                              })

    # StatsHandler prints loss at every iteration and print metrics at every epoch,
    # we don't set metrics for trainer here, so just print loss, user can also customize print functions
    # and can use output_transform to convert engine.state.output if it's not loss value
    train_stats_handler = StatsHandler(name="trainer")
    train_stats_handler.attach(trainer)

    # TensorBoardStatsHandler plots loss at every iteration and plots metrics at every epoch, same as StatsHandler
    train_tensorboard_stats_handler = TensorBoardStatsHandler()
    train_tensorboard_stats_handler.attach(trainer)

    # set parameters for validation
    validation_every_n_epochs = 1

    metric_name = "Accuracy"
    # add evaluation metric to the evaluator engine
    val_metrics = {metric_name: Accuracy()}
    # Ignite evaluator expects batch=(img, label) and returns output=(y_pred, y) at every iteration,
    # user can add output_transform to return other values
    evaluator = create_supervised_evaluator(net, val_metrics, device, True)

    # add stats event handler to print validation stats via evaluator
    val_stats_handler = StatsHandler(
        name="evaluator",
        # no need to print loss value, so disable per iteration output
        output_transform=lambda x: None,
        # fetch global epoch number from trainer
        global_epoch_transform=lambda x: trainer.state.epoch,
    )
    val_stats_handler.attach(evaluator)

    # add handler to record metrics to TensorBoard at every epoch
    val_tensorboard_stats_handler = TensorBoardStatsHandler(
        # no need to plot loss value, so disable per iteration output
        output_transform=lambda x: None,
        # fetch global epoch number from trainer
        global_epoch_transform=lambda x: trainer.state.epoch,
    )
    val_tensorboard_stats_handler.attach(evaluator)

    # add early stopping handler to evaluator
    early_stopper = EarlyStopping(
        patience=4,
        score_function=stopping_fn_from_metric(metric_name),
        trainer=trainer)
    evaluator.add_event_handler(event_name=Events.EPOCH_COMPLETED,
                                handler=early_stopper)

    # create a validation data loader (last 10 images)
    val_ds = NiftiDataset(image_files=images[-10:],
                          labels=labels[-10:],
                          transform=val_transforms)
    val_loader = DataLoader(val_ds,
                            batch_size=2,
                            num_workers=2,
                            pin_memory=torch.cuda.is_available())

    @trainer.on(Events.EPOCH_COMPLETED(every=validation_every_n_epochs))
    def run_validation(engine):
        """Run the evaluator on the validation loader."""
        evaluator.run(val_loader)

    # create a training data loader (first 10 images)
    train_ds = NiftiDataset(image_files=images[:10],
                            labels=labels[:10],
                            transform=train_transforms)
    train_loader = DataLoader(train_ds,
                              batch_size=2,
                              shuffle=True,
                              num_workers=2,
                              pin_memory=torch.cuda.is_available())

    train_epochs = 30
    trainer.run(train_loader, train_epochs)
def _test():
    """Check multiclass ``Accuracy`` against ``accuracy_score`` on random data.

    Runs five single-update cases with different prediction/label shapes and
    one case where the data is fed in batches of 16.
    """
    acc = Accuracy()

    def _draw(pred_shape, label_high, label_shape):
        # Predictions are drawn before labels so the RNG sequence is stable.
        scores = torch.rand(*pred_shape)
        labels = torch.randint(0, label_high, size=label_shape).long()
        return scores, labels

    def _verify(scores, labels):
        reference_pred = scores.numpy().argmax(axis=1).ravel()
        reference_true = labels.numpy().ravel()
        assert acc._type == "multiclass"
        assert isinstance(acc.compute(), float)
        assert accuracy_score(reference_true, reference_pred) == pytest.approx(acc.compute())

    # (prediction shape, exclusive upper bound of labels, label shape);
    # the last entry is the 2-classes case.
    single_update_cases = [
        ((10, 4), 4, (10, )),
        ((10, 10, 1), 18, (10, 1)),
        ((10, 18), 18, (10, )),
        ((4, 10), 10, (4, )),
        ((4, 2), 2, (4, )),
    ]
    for position, case in enumerate(single_update_cases):
        # The metric starts out fresh, so only later cases need a reset.
        if position > 0:
            acc.reset()
        scores, labels = _draw(*case)
        acc.update((scores, labels))
        _verify(scores, labels)

    # Batched Updates
    acc.reset()
    scores, labels = _draw((100, 5), 5, (100, ))

    batch_size = 16
    for start in range(0, labels.shape[0], batch_size):
        stop = start + batch_size
        acc.update((scores[start:stop], labels[start:stop]))

    _verify(scores, labels)
def _test():
    """Check distributed multilabel ``Accuracy`` against ``accuracy_score``.

    Uses ``rank`` and ``device`` from the enclosing scope (not visible in this
    block). Each case seeds the RNG per rank, updates the metric locally, then
    gathers predictions and targets from all processes to build the reference.
    """
    acc = Accuracy(is_multilabel=True)

    def _draw(shape):
        """Draw per-rank random binary predictions and targets of ``shape``."""
        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=shape, device=device).long()
        y = torch.randint(0, 2, size=shape, device=device).long()
        return y_pred, y

    def _gather_and_verify(y_pred, y, n_computes=1):
        """Compare the metric with the reference computed on gathered data."""
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        # (N, C, H, W, ...) -> (N * H * W ..., C)
        np_y_pred = to_numpy_multilabel(y_pred.cpu())
        np_y = to_numpy_multilabel(y.cpu())

        assert acc._type == "multilabel"
        n = acc._num_examples
        # Repeated compute() calls must keep returning the same result.
        for _ in range(n_computes):
            res = acc.compute()
            assert n * idist.get_world_size() == acc._num_examples
            assert isinstance(res, float)
            assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)

    y_pred, y = _draw((4, 5, 8, 10))
    acc.update((y_pred, y))
    _gather_and_verify(y_pred, y)

    acc.reset()
    y_pred, y = _draw((4, 7, 10, 8))
    acc.update((y_pred, y))
    # check that result is not changed by a second compute()
    _gather_and_verify(y_pred, y, n_computes=2)

    # Batched Updates
    acc.reset()
    y_pred, y = _draw((80, 5, 8, 10))

    batch_size = 16
    # Step over the data instead of using `N // batch_size + 1` iterations:
    # with N a multiple of batch_size the latter feeds an empty last batch.
    for idx in range(0, y.shape[0], batch_size):
        acc.update((y_pred[idx:idx + batch_size], y[idx:idx + batch_size]))

    _gather_and_verify(y_pred, y)
def test_no_update():
    """``compute()`` on a metric that was never updated must raise."""
    fresh_metric = Accuracy()
    with pytest.raises(NotComputableError):
        fresh_metric.compute()
def training(local_rank, config):
    """Run distributed training and periodic validation for the IMDB task.

    Args:
        local_rank: Local process index; used for the logger and to look up
            the CUDA device name.
        config: Mutable configuration dict. ``output_dir``,
            ``cuda device name`` and ``num_iters_per_epoch`` are written back
            into it.

    Raises:
        Exception: Any error raised by ``trainer.run`` is logged and re-raised.
    """
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="IMDB-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_dir"]
    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")
        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        # exist_ok replaces the racy exists()-then-mkdir sequence.
        output_path.mkdir(parents=True, exist_ok=True)
        config["output_dir"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_dir']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("IMDB-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "dropout",
                "n_fc",
                "batch_size",
                "max_length",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Set up the evaluator engines that validate the model and compute metrics
    metrics = {
        "Accuracy":
        Accuracy(output_transform=utils.thresholded_output_transform),
        "Loss": Loss(criterion),
    }

    # Two evaluators are defined because their roles differ:
    # `evaluator` also saves the best models based on the validation score,
    # `train_evaluator` only reports metrics on the training data.
    evaluator = create_evaluator(model, metrics, config, tag="val")
    train_evaluator = create_evaluator(model, metrics, config, tag="train")

    def run_validation(engine):
        """Evaluate on the train and test loaders and log both metric sets."""
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED | Events.STARTED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(
            output_path,
            trainer,
            optimizer,
            evaluators=evaluators,
            log_every_iters=config["log_every_iters"])

    # Store 2 best models by validation accuracy starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        utils.get_save_handler(config),
        filename_prefix="best",
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    evaluator.add_event_handler(
        Events.COMPLETED(
            lambda *_: trainer.state.epoch > config["num_epochs"] // 2),
        best_model_handler)

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        logger.exception("")
        # Bare raise keeps the original traceback intact.
        raise
    finally:
        # Close the TensorBoard logger even when training fails.
        if rank == 0:
            tb_logger.close()
def run(output_path, config):
    """Train a model with (Layca-)SGD and log to TensorBoard and MLflow.

    Args:
        output_path: Directory passed to ``TensorboardLogger`` as ``log_dir``.
        config: Dict providing ``batch_size``, ``dataset``, ``data_path``,
            ``num_workers``, ``model``, ``with_layca``, ``momentum``,
            ``weight_decay``, ``lr_milestones_values`` and ``num_epochs``.
    """
    device = "cuda"

    batch_size = config['batch_size']
    train_loader, test_loader = get_train_test_loaders(
        dataset_name=config['dataset'],
        path=config['data_path'],
        batch_size=batch_size,
        num_workers=config['num_workers'])

    model = get_model(config['model'])
    model = model.to(device)

    optim_fn = optim.SGD
    if config['with_layca']:
        optim_fn = LaycaSGD

    # lr starts at 0.0; the PiecewiseLinear scheduler below sets the "lr"
    # parameter at the start of every iteration.
    optimizer = optim_fn(model.parameters(),
                         lr=0.0,
                         momentum=config['momentum'],
                         weight_decay=config['weight_decay'],
                         nesterov=True)
    criterion = nn.CrossEntropyLoss()

    # Milestones are given in epochs in the config; convert to iterations.
    le = len(train_loader)
    milestones_values = [(le * m, v)
                         for m, v in config['lr_milestones_values']]
    scheduler = PiecewiseLinear(optimizer,
                                "lr",
                                milestones_values=milestones_values)

    def _prepare_batch(batch, device, non_blocking):
        """Move the (x, y) pair of a batch to ``device``."""
        x, y = batch
        return (convert_tensor(x, device=device, non_blocking=non_blocking),
                convert_tensor(y, device=device, non_blocking=non_blocking))

    def process_function(engine, batch):
        """Run one optimisation step and return the scalar loss."""
        x, y = _prepare_batch(batch, device=device, non_blocking=True)

        model.train()
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return loss.item()

    trainer = Engine(process_function)

    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    RunningAverage(output_transform=lambda x: x,
                   epoch_bound=False).attach(trainer, 'batchloss')

    ProgressBar(persist=True,
                bar_format="").attach(trainer,
                                      event_name=Events.EPOCH_STARTED,
                                      closing_event_name=Events.COMPLETED)

    tb_logger = TensorboardLogger(log_dir=output_path)
    tb_logger.attach(trainer,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names='all'),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=tbOptimizerParamsHandler(optimizer,
                                                          param_name="lr"),
                     event_name=Events.ITERATION_STARTED)

    tb_logger.attach(trainer,
                     log_handler=LayerRotationStatsHandler(model),
                     event_name=Events.EPOCH_STARTED)

    metrics = {
        "accuracy": Accuracy(),
    }

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine, val_interval):
        """Evaluate on train and test data when the epoch matches the interval."""
        if (engine.state.epoch - 1) % val_interval == 0:
            train_evaluator.run(train_loader)
            evaluator.run(test_loader)

    # Validate every second epoch, and always once more when training ends.
    # NOTE(review): when the final epoch is odd both handlers fire, so the
    # last validation runs twice.
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              run_validation,
                              val_interval=2)
    trainer.add_event_handler(Events.COMPLETED, run_validation, val_interval=1)

    tb_logger.attach(train_evaluator,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names='all',
                                                 another_engine=trainer),
                     event_name=Events.COMPLETED)

    tb_logger.attach(evaluator,
                     log_handler=tbOutputHandler(tag="test",
                                                 metric_names='all',
                                                 another_engine=trainer),
                     event_name=Events.COMPLETED)

    def mlflow_batch_metrics_logging(engine, tag):
        """Log all metrics of ``engine`` to MLflow at the trainer iteration."""
        step = trainer.state.iteration
        for name, value in engine.state.metrics.items():
            mlflow.log_metric("{} {}".format(tag, name), value, step=step)

    def mlflow_val_metrics_logging(engine, tag):
        """Log the evaluation metrics of ``engine`` to MLflow at the trainer epoch."""
        step = trainer.state.epoch
        for name in metrics:
            value = engine.state.metrics[name]
            mlflow.log_metric("{} {}".format(tag, name), value, step=step)

    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              mlflow_batch_metrics_logging, "train")
    train_evaluator.add_event_handler(Events.COMPLETED,
                                      mlflow_val_metrics_logging, "train")
    evaluator.add_event_handler(Events.COMPLETED, mlflow_val_metrics_logging,
                                "test")

    try:
        trainer.run(train_loader, max_epochs=config['num_epochs'])
    finally:
        # Close the TensorBoard logger even if training raises.
        tb_logger.close()
def run(
    train_batch_size,
    val_batch_size,
    epochs,
    lr,
    momentum,
    log_interval,
    log_dir,
    checkpoint_every,
    resume_from,
    crash_iteration=1000,
):
    """Train with periodic checkpoints; optionally crash on purpose or resume.

    Args:
        train_batch_size: Batch size of the training loader.
        val_batch_size: Batch size of the validation loader.
        epochs: Maximum number of epochs.
        lr: Initial SGD learning rate (halved after every epoch by StepLR).
        momentum: SGD momentum.
        log_interval: Log every this many iterations; ``None`` logs every one.
        log_dir: Directory for TensorBoard events and checkpoints.
        checkpoint_every: Save a checkpoint every this many iterations.
        resume_from: Checkpoint file to resume from, or ``None`` for a fresh
            run. A fresh run raises on purpose at ``crash_iteration``.
        crash_iteration: Iteration at which a fresh run is interrupted.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    writer = SummaryWriter(logdir=log_dir)
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    # Move the model before creating the optimizer: recent Ignite versions
    # use `device` only for batches and leave the model where it is.
    model.to(device)

    criterion = nn.NLLLoss()
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.5)
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                "accuracy": Accuracy(),
                                                "nll": Loss(criterion)
                                            },
                                            device=device)

    @trainer.on(Events.EPOCH_COMPLETED)
    def lr_step(engine):
        lr_scheduler.step()

    desc = "ITERATION - loss: {:.4f} - lr: {:.4f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0, lr))

    if log_interval is None:
        log_event = Events.ITERATION_COMPLETED
        log_interval = 1
    else:
        log_event = Events.ITERATION_COMPLETED(every=log_interval)

    @trainer.on(log_event)
    def log_training_loss(engine):
        current_lr = optimizer.param_groups[0]["lr"]
        pbar.desc = desc.format(engine.state.output, current_lr)
        pbar.update(log_interval)
        writer.add_scalar("training/loss", engine.state.output,
                          engine.state.iteration)
        writer.add_scalar("lr", current_lr, engine.state.iteration)

    if resume_from is None:

        # Fresh run: simulate a crash so that resuming can be demonstrated.
        @trainer.on(Events.ITERATION_COMPLETED(once=crash_iteration))
        def simulate_crash(engine):
            raise Exception("STOP at {}".format(engine.state.iteration))

    else:

        # Resumed run: start the progress bar at the restored iteration.
        @trainer.on(Events.STARTED)
        def sync_progress_bar(engine):
            pbar.n = engine.state.iteration

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        # Reset the bar for the next epoch.
        pbar.n = pbar.last_print_n = 0
        # Tag spelling fixed ("valdation" -> "validation").
        writer.add_scalar("validation/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("validation/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    objects_to_checkpoint = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": lr_scheduler
    }
    training_checkpoint = Checkpoint(to_save=objects_to_checkpoint,
                                     save_handler=DiskSaver(
                                         log_dir, require_empty=False))

    trainer.add_event_handler(
        Events.ITERATION_COMPLETED(every=checkpoint_every),
        training_checkpoint)

    if resume_from is not None:
        tqdm.write("Resume from a checkpoint: {}".format(resume_from))
        # Load onto CPU so a checkpoint written on a GPU machine can be
        # resumed anywhere; load_objects restores it into the live objects.
        checkpoint = torch.load(resume_from, map_location="cpu")
        Checkpoint.load_objects(to_load=objects_to_checkpoint,
                                checkpoint=checkpoint)

    try:
        trainer.run(train_loader, max_epochs=epochs)
    except Exception:
        # Deliberately swallowed: the simulated crash must not abort the
        # script, so only the traceback is printed.
        import traceback

        print(traceback.format_exc())
    finally:
        # Release the progress bar and the writer on every exit path.
        pbar.close()
        writer.close()
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval,
        log_dir):
    """Train ``Net`` with SGD and log losses and metrics to a summary writer.

    Args:
        train_batch_size: Batch size of the training loader.
        val_batch_size: Batch size of the validation loader.
        epochs: Maximum number of epochs.
        lr: SGD learning rate.
        momentum: SGD momentum.
        log_interval: Print/log the batch loss every this many iterations
            within an epoch.
        log_dir: Directory passed to ``create_summary_writer``.
    """
    # Build the data loaders.
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()  # Network model
    writer = create_summary_writer(model, train_loader, log_dir)
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    # Move the model before creating the optimizer: recent Ignite versions
    # use `device` only for batches and leave the model where it is. This is
    # done after the summary writer has been created from the CPU model.
    model.to(device)

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)  # Optimizer
    # Build the trainer from the model, optimizer, loss function and device.
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        F.nll_loss,
                                        device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                "accuracy": Accuracy(),
                                                "nll": Loss(F.nll_loss)
                                            },
                                            device=device)

    # Register the event handlers.
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_train_loss(engine):
        # 1-based position inside the current epoch; engine.state.iteration
        # is the total number of iterations since the start of training.
        iter_in_epoch = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter_in_epoch % log_interval == 0:
            # engine.state.output holds whatever the trainer's step returned
            # (formatted here as the batch loss).
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}".format(
                engine.state.epoch, iter_in_epoch, len(train_loader),
                engine.state.output))
            writer.add_scalar("training/loss", engine.state.output,
                              engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        print(
            "Training Results -Epoch:{} Avg accuracy: {:.2f} Avg loss: {:.2f}".
            format(engine.state.epoch, avg_accuracy, avg_nll))
        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print(
            "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        # Tag spelling fixed ("valdation" -> "validation").
        writer.add_scalar("validation/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("validation/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    try:
        trainer.run(train_loader, max_epochs=epochs)
    finally:
        # Close the writer even if training raises.
        writer.close()
"con_loss": contras_loss.item(), "clsf_loss": clsf_loss.item(), "emb_vecs": [emb_vec1, emb_vec2], "cls_pred": cls_pred, "cls_true": cls_true, "targets": targets } return ret # ---------------------------------------- if __name__ == "__main__": engine = Engine(_update) metrics = { "sim_acc": SiameseNetSimilarityAccuracy(margin=margin, l2_normalize=True), "clsf_acc": Accuracy( output_transform=lambda x: (x['cls_pred'], x['cls_true'])) } for name, metric in metrics.items(): metric.attach(engine, name) from ignite.contrib.handlers import ProgressBar pbar = ProgressBar() pbar.attach(engine, output_transform=lambda x: { 'con_loss': x['con_loss'], 'clsf_loss': x['clsf_loss'] }) from ignite.engine import Events # @engine.on(Events.ITERATION_COMPLETED) # def log_training_loss(engine):
model = UNet(in_channels=params['in_channels'], n_classes=params['n_classes'], depth=params['depth']) model.load_state_dict(torch.load(model_path)) model.to(device) # Create Trainer or Evaluators criterion = nn.NLLLoss() optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate']) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min') # Determine metrics for evaluation. metrics = { "accuracy": Accuracy(), "loss": Loss(criterion), "mean_iou": mIoU(ConfusionMatrix(num_classes=params['n_classes'])), } def backprop_step(engine, batch): model.eval() model.zero_grad() batch_x, batch_y = batch batch_x = batch_x.to(device) batch_y = batch_y.to(device) outputs = model(batch_x) loss = criterion(outputs[:, :, 127:128, 127:128], batch_y[:, 127:128, 127:128]) loss.backward()
def main(batch_size, epochs):
    """Train ``net.Net`` with Ignite, keeping the best checkpoints and stopping early.

    Args:
        batch_size: Batch size forwarded to ``get_data_loaders``.
        epochs: Maximum number of training epochs.

    After every epoch the model is evaluated on the training loader and on the
    test loader. The negated test loss is the score used both to keep the three
    best checkpoints under ``./checkpoints`` and to stop training after five
    evaluations without improvement.
    """
    # 1. Select the device (PyTorch requires it to be stated explicitly).
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)

    train_loader, test_loader = get_data_loaders(batch_size)

    # 2. Build the model; print it so the architecture can be checked.
    model = net.Net(1000, 10).to(device)
    print(model)

    # 3. Loss function.
    criterion = nn.CrossEntropyLoss()

    # 4. Optimizer (Adam with its default learning rate).
    optimizer = optim.Adam(model.parameters())

    trainer = create_supervised_trainer(model, optimizer, criterion,
                                        device=device)

    def build_evaluator():
        # Each evaluator gets its own, freshly created metric objects.
        return create_supervised_evaluator(
            model,
            metrics={'accuracy': Accuracy(), 'loss': Loss(criterion)},
            device=device)

    train_evaluator = build_evaluator()
    test_evaluator = build_evaluator()

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))
    log_interval = 10

    def evaluate_and_report(evaluator, loader, template, epoch):
        # Run a full evaluation pass and print accuracy/loss for this epoch.
        evaluator.run(loader)
        scores = evaluator.state.metrics
        tqdm.write(template.format(epoch, scores['accuracy'], scores['loss']))

    # 5. Logging handlers.
    def log_training_loss(engine):
        # Position of the current batch inside the epoch, counted from 1.
        position = (engine.state.iteration - 1) % len(train_loader) + 1
        if position % log_interval != 0:
            return
        pbar.desc = desc.format(engine.state.output)
        pbar.update(log_interval)

    def log_training_results(engine):
        pbar.refresh()
        evaluate_and_report(
            train_evaluator, train_loader,
            "Training Results - Epoch: {} Avg accuracy: {:.3f} Avg loss: {:.4f}",
            engine.state.epoch)

    def log_validation_results(engine):
        evaluate_and_report(
            test_evaluator, test_loader,
            "Validation Results - Epoch: {} Avg accuracy: {:.3f} Avg loss: {:.4f}",
            engine.state.epoch)
        # Rewind the bar so the next epoch starts counting from zero.
        pbar.n = pbar.last_print_n = 0

    # Registration order is the firing order: train report, then validation.
    trainer.add_event_handler(Events.ITERATION_COMPLETED, log_training_loss)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results)

    def score_function(engine):
        # Higher score is better, so the loss is negated.
        return -engine.state.metrics['loss']

    # 6. Checkpointing of the best models.
    best_handler = ModelCheckpoint(dirname='./checkpoints',
                                   filename_prefix='best',
                                   n_saved=3,
                                   score_name='loss',
                                   score_function=score_function,
                                   create_dir=True,
                                   require_empty=False)
    test_evaluator.add_event_handler(Events.EPOCH_COMPLETED, best_handler,
                                     {'mymodel': model})

    # Early stopping is attached to the *evaluator* (which runs one epoch over
    # the validation data) but terminates the trainer.
    early_handler = EarlyStopping(patience=5,
                                  score_function=score_function,
                                  trainer=trainer)
    test_evaluator.add_event_handler(Events.COMPLETED, early_handler)

    # 7. Run.
    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
# Create a 3D DenseNet121 classifier: one input channel, two output classes.
net = monai.networks.nets.densenet.densenet121(
    spatial_dims=3,
    in_channels=1,
    out_channels=2,
)
# Prefer the first GPU but fall back to the CPU so the script also runs on
# hosts without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


def prepare_batch(batch, device=None, non_blocking=False):
    """Turn a dict batch into the ``(img, label)`` pair the ignite engine expects.

    Args:
        batch: Mapping with at least the keys ``'img'`` and ``'label'``.
        device: Target device, forwarded unchanged to ``_prepare_batch``.
        non_blocking: Forwarded unchanged to ``_prepare_batch``.

    Returns:
        Whatever ``_prepare_batch`` returns for ``(batch['img'], batch['label'])``.
    """
    return _prepare_batch((batch['img'], batch['label']), device, non_blocking)


metric_name = 'Accuracy'
# add evaluation metric to the evaluator engine
val_metrics = {metric_name: Accuracy()}
# ignite evaluator expects batch=(img, label) and returns output=(y_pred, y) at every iteration,
# user can add output_transform to return other values
evaluator = create_supervised_evaluator(net, val_metrics, device, True,
                                        prepare_batch=prepare_batch)

# add stats event handler to print validation stats via evaluator
val_stats_handler = StatsHandler(
    name='evaluator',
    output_transform=lambda x: None  # no need to print loss value, so disable per iteration output
)
val_stats_handler.attach(evaluator)

# The batches are dicts: the file name of each image is read from the
# 'img.filename_or_obj' entry, and the predicted class is the argmax over
# axis 1 of the first element of the engine output.
prediction_saver = ClassificationSaver(
    output_dir='tempdir',
    name='evaluator',
    batch_transform=lambda batch: {'filename_or_obj': batch['img.filename_or_obj']},
    output_transform=lambda output: output[0].argmax(1))
def main(tempdir):
    """Evaluate a saved 3D UNet on synthetic volumes generated into ``tempdir``.

    Args:
        tempdir: Directory that receives the generated ``im*.nii.gz`` /
            ``seg*.nii.gz`` pairs. Every file in it matching those patterns is
            used for evaluation.

    Raises:
        FileNotFoundError: If no checkpoint matching ``./runs/net_key_metric*``
            exists. This is checked first so the failure happens before the
            (slow) data generation instead of as a bare ``IndexError`` after it.

    Predictions are post-processed (sigmoid, 0.5 threshold, largest connected
    component) and written to ``./runs/``.
    """
    monai.config.print_config()
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # model file path -- fail fast with a clear message when it is missing
    checkpoints = glob("./runs/net_key_metric*")
    if not checkpoints:
        raise FileNotFoundError(
            "no checkpoint matching './runs/net_key_metric*' was found")
    model_file = checkpoints[0]

    # create 5 random image, mask pairs in the target directory
    print(f"generating synthetic data to {tempdir} (this may take a while)")
    for i in range(5):
        im, seg = create_test_image_3d(128, 128, 128,
                                       num_seg_classes=1,
                                       channel_dim=-1)
        n = nib.Nifti1Image(im, np.eye(4))
        nib.save(n, os.path.join(tempdir, f"im{i:d}.nii.gz"))
        n = nib.Nifti1Image(seg, np.eye(4))
        nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz"))

    # Sorting both lists keeps image i paired with segmentation i.
    images = sorted(glob(os.path.join(tempdir, "im*.nii.gz")))
    segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz")))
    val_files = [{"image": img, "label": seg} for img, seg in zip(images, segs)]

    # define transforms for image and segmentation
    val_transforms = Compose([
        LoadImaged(keys=["image", "label"]),
        AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
        ScaleIntensityd(keys="image"),
        EnsureTyped(keys=["image", "label"]),
    ])

    # create a validation data loader
    val_ds = monai.data.Dataset(data=val_files, transform=val_transforms)
    val_loader = monai.data.DataLoader(val_ds, batch_size=1, num_workers=4)

    # create the UNet; its weights are restored by the CheckpointLoader below
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = monai.networks.nets.UNet(
        spatial_dims=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)

    val_post_transforms = Compose([
        EnsureTyped(keys="pred"),
        Activationsd(keys="pred", sigmoid=True),
        AsDiscreted(keys="pred", threshold=0.5),
        KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
        SaveImaged(keys="pred",
                   meta_keys="image_meta_dict",
                   output_dir="./runs/"),
    ])
    val_handlers = [
        StatsHandler(output_transform=lambda x: None),
        CheckpointLoader(load_path=model_file, load_dict={"net": net}),
    ]

    evaluator = SupervisedEvaluator(
        device=device,
        val_data_loader=val_loader,
        network=net,
        inferer=SlidingWindowInferer(roi_size=(96, 96, 96),
                                     sw_batch_size=4,
                                     overlap=0.5),
        postprocessing=val_post_transforms,
        key_val_metric={
            "val_mean_dice":
            MeanDice(include_background=True,
                     output_transform=from_engine(["pred", "label"]))
        },
        additional_metrics={
            "val_acc": Accuracy(output_transform=from_engine(["pred", "label"]))
        },
        val_handlers=val_handlers,
        # if no FP16 support in GPU or PyTorch version < 1.6, will not enable AMP evaluation
        amp=monai.utils.get_torch_version_tuple() >= (1, 6),
    )
    evaluator.run()
def run(model, optimizer, scheduler, loss_fn, device, train_loader, val_loader,
        training_history, param_history, model_info, start_epoch, path):
    """Train ``model`` with an ignite ``Engine`` and the project's event handlers.

    Args:
        model: Network to train.
        optimizer: Optimizer stepping ``model``'s parameters.
        scheduler: Learning-rate scheduler handed to the ``do_scheduler`` handler.
        loss_fn: Callable ``loss_fn(y_pred, y)`` returning a scalar tensor.
        device: Device every batch is moved to.
        train_loader: Loader of training batches; each batch must offer
            ``.to(device)``, ``.y`` and ``.num_graphs``.
        val_loader: Loader handed to the ``log_training_results`` handler.
        training_history: Mutable history object filled by the handlers.
        param_history: Mutable parameter/gradient history filled by the handlers.
        model_info: Configuration mapping; reads ``["data"]["batch_size"]``,
            ``["training"]["save_every"]`` and ``["training"]["max_epochs"]``.
        start_epoch: Epoch offset forwarded to the save/plot handlers.
        path: Output location forwarded to the save/plot handlers.
    """
    # Function-scope import, matching the handler imports further down.
    from torch.nn.utils import clip_grad_value_

    expected_batch_size = model_info["data"]["batch_size"]

    def prep_batch(batch, device=device, non_blocking=False):
        # The batch object itself is the model input; its `.y` is the target.
        return batch.to(device), batch.y.to(device)

    def update(trainer, batch):
        model.train()
        optimizer.zero_grad()
        x, y = prep_batch(batch, device=device, non_blocking=False)
        if expected_batch_size != x.num_graphs:
            # Diagnostic output for batches of unexpected size.
            print(expected_batch_size)
            print(type(x))
            print(x.num_graphs)
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        loss.backward()
        # Clamp every gradient element to [-1, 1]; parameters without a
        # gradient are skipped by the utility.
        clip_grad_value_(model.parameters(), 1)
        optimizer.step()
        return loss.item()

    trainer = Engine(update)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy': Accuracy(),
                                                'nll': Loss(loss_fn)
                                            },
                                            device=device,
                                            prepare_batch=prep_batch)

    optimizer_history = []
    save_every = model_info["training"]["save_every"]
    max_epochs = model_info["training"]["max_epochs"]

    # Handlers fire in registration order, so the order below is significant.
    from event_handlers.log_lr import log_lr
    trainer.add_event_handler(Events.EPOCH_STARTED, log_lr, optimizer,
                              optimizer_history)

    from event_handlers.scheduler import do_scheduler
    trainer.add_event_handler(Events.EPOCH_STARTED, do_scheduler, optimizer,
                              scheduler)

    from event_handlers.log_gradient import log_gradient
    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_gradient, model,
                              param_history)

    from event_handlers.log_training import log_training_results
    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results,
                              evaluator, val_loader, training_history)

    from event_handlers.save_model import handler_save_model
    trainer.add_event_handler(Events.EPOCH_COMPLETED, handler_save_model,
                              save_every, model, optimizer, training_history,
                              param_history, path, start_epoch)

    from event_handlers.save_img import log_img
    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_img, save_every,
                              training_history, param_history, path,
                              start_epoch, optimizer_history)

    pbar = tqdm(total=max_epochs)

    @trainer.on(Events.EPOCH_COMPLETED)
    def show_bar(engine):
        pbar.update(1)

    try:
        trainer.run(train_loader, max_epochs=max_epochs)
    finally:
        # Release the progress bar even when training raises.
        pbar.close()
classes = 1108 model = getattr(models, model_name)(pretrained=True) num_ftrs = model.fc.in_features model.fc = torch.nn.Linear(num_ftrs, classes) # In[6]: criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # In[7]: metrics = { 'loss': Loss(criterion), 'accuracy': Accuracy(), } trainer = create_supervised_trainer(model, optimizer, criterion, device=device) val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device) # In[8]: @trainer.on(Events.EPOCH_COMPLETED) def compute_and_display_val_metrics(engine): epoch = engine.state.epoch metrics = val_evaluator.run(val_loader).metrics print(
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    """Train ``Net`` with SGD and report metrics and timings through tqdm.

    Args:
        train_batch_size: Batch size of the training loader.
        val_batch_size: Batch size of the validation loader.
        epochs: Number of epochs to train for.
        lr: SGD learning rate.
        momentum: SGD momentum.
        log_interval: The progress bar is refreshed every this many iterations.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Net()
    model.to(device)  # the model is moved before the optimizer is created

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.NLLLoss()

    trainer = create_supervised_trainer(model, optimizer, criterion,
                                        device=device)
    trainer.logger = setup_logger("trainer")

    # A single evaluator is reused for the training and the validation pass.
    evaluator = create_supervised_evaluator(
        model,
        metrics={"accuracy": Accuracy(), "nll": Loss(criterion)},
        device=device)
    evaluator.logger = setup_logger("evaluator")

    pbar = tqdm(
        initial=0,
        leave=False,
        total=len(train_loader),
        desc=f"ITERATION - loss: {0:.2f}",
    )

    def evaluate(loader):
        # Run the evaluator over `loader`; return (accuracy, nll).
        evaluator.run(loader)
        scores = evaluator.state.metrics
        return scores["accuracy"], scores["nll"]

    def log_training_loss(engine):
        pbar.desc = f"ITERATION - loss: {engine.state.output:.2f}"
        pbar.update(log_interval)

    def log_training_results(engine):
        pbar.refresh()
        avg_accuracy, avg_nll = evaluate(train_loader)
        tqdm.write(
            f"Training Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )

    def log_validation_results(engine):
        avg_accuracy, avg_nll = evaluate(val_loader)
        tqdm.write(
            f"Validation Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )
        # Rewind the bar for the next epoch.
        pbar.n = pbar.last_print_n = 0

    def log_time(engine):
        event_name = trainer.last_event_name.name
        elapsed = trainer.state.times[event_name]
        tqdm.write(f"{event_name} took {elapsed} seconds")

    # Same registration order as the firing order that is wanted.
    trainer.add_event_handler(Events.ITERATION_COMPLETED(every=log_interval),
                              log_training_loss)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results)
    trainer.add_event_handler(Events.EPOCH_COMPLETED | Events.COMPLETED,
                              log_time)

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
def run(tb, vb, lr, epochs, writer):
    """Fine-tune a pretrained EfficientNet-b0 and log results to tqdm, logging and ``writer``.

    Args:
        tb: First argument forwarded to ``get_dataloaders`` (presumably the
            training batch size -- confirm against that helper).
        vb: Second argument forwarded to ``get_dataloaders`` (presumably the
            validation batch size -- confirm against that helper).
        lr: Learning rate of the SGD optimizer.
        epochs: Maximum number of epochs passed to ``trainer.run``.
        writer: Summary writer; ``add_text`` and ``add_scalars`` are called on it.

    Reads the environment variables ``main-device``, ``run-id`` and ``savedir``
    and the module-level ``INFO`` configuration.
    """
    device = os.environ['main-device']
    logging.info('Training program start!')
    logging.info('Configuration:')
    logging.info('\n' + json.dumps(INFO, indent=2))

    # ------------------------------------
    # 1. Define dataloader
    train_loader, train4val_loader, val_loader, num_of_images, mapping, _ = get_dataloaders(
        tb, vb)
    # train_loader, train4val_loader, val_loader, num_of_images = get_dataloaders(tb, vb)

    # Adjust weights of unknown
    # NOTE(review): index 6 is presumably the "unknown" class; its count is
    # raised by the mean count so its weight below gets smaller -- confirm.
    num_of_images[6] += int(sum(num_of_images) / len(num_of_images))
    # Inverse-count class weights, normalised so they sum to one.
    weights = (1 / num_of_images) / ((1 / num_of_images).sum().item())
    # weights = (1/num_of_images)/(1/num_of_images + 1/(num_of_images.sum().item()-num_of_images))
    weights = weights.to(device=device)

    # ------------------------------------
    # 2. Define model
    model = EfficientNet.from_pretrained(
        'efficientnet-b0', num_classes=INFO['dataset-info']['num-of-classes'])
    model = carrier(model)

    # ------------------------------------
    # 3. Define optimizer
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
    ignite_scheduler = LRScheduler(scheduler)

    # ------------------------------------
    # 4. Define metrics
    class SoftCrossEntropyLoss(nn.Module):
        """Cross entropy against label-smoothed targets (0.9 on the true class).

        NOTE(review): this class is not referenced anywhere else in this
        function; the trainer below uses ``nn.CrossEntropyLoss``.
        """

        def __init__(self, weight=None):
            super(SoftCrossEntropyLoss, self).__init__()
            self.class_weights = weight

        def forward(self, input, target):
            # Softmax over axis 1, written out by hand.
            # NOTE(review): exp() without subtracting the row maximum can
            # overflow for large logits; log_softmax would be safer.
            softmax = torch.exp(input) / torch.exp(input).sum(1)[:, None]
            onehot_labels = to_onehot(target, input.shape[1])
            # This zero tensor is overwritten by the next statement.
            soft_labels = torch.zeros_like(onehot_labels)
            # 0.9 on the labelled class, the remaining 0.1 spread evenly
            # over the other classes.
            soft_labels = torch.where(
                onehot_labels.cpu() == 1, torch.tensor([0.9]),
                torch.tensor([0.1 / (input.shape[1] - 1)])).to(device=device)
            if self.class_weights is not None:
                # print(soft_labels.shape, softmax.shape)
                loss = -torch.sum(
                    torch.log(softmax) * soft_labels * self.class_weights *
                    input.shape[1])
            else:
                loss = -torch.sum(torch.log(softmax) * soft_labels)
            # The loss is a sum over all elements, not a mean.
            return loss

    class EntropyPrediction(metric.Metric):
        """Accumulate entropy-gated predictions and targets over an evaluation run."""

        def __init__(self, threshold=1.0):
            super(EntropyPrediction, self).__init__()
            self.threshold = threshold
            self.prediction = torch.tensor([], dtype=torch.int)
            self.y = torch.tensor([], dtype=torch.int)

        def reset(self):
            # self.threshold = 0.3
            self.prediction = torch.tensor([])
            self.y = torch.tensor([])
            super(EntropyPrediction, self).reset()

        def update(self, output):
            y_pred, y = output
            softmax = torch.exp(y_pred) / torch.exp(y_pred).sum(1)[:, None]
            # Dividing by log(number of classes) scales the entropy by its
            # maximum possible value.
            entropy_base = math.log(y_pred.shape[1])
            entropy = (-softmax * torch.log(softmax)).sum(1) / entropy_base
            values, inds = softmax.max(1)
            # Keeps the argmax class where entropy > threshold, otherwise -1.
            # NOTE(review): this keeps the prediction for *high*-entropy
            # (uncertain) samples and rejects confident ones, and with the
            # default threshold of 1.0 the condition can hardly ever be true.
            # The comparison looks inverted -- confirm the intended direction.
            prediction = torch.where(entropy > self.threshold, inds,
                                     torch.tensor([-1]).to(device=device))
            # Each gated prediction is translated through `mapping` before
            # being appended to the running tensor.
            self.prediction = torch.cat(
                (self.prediction.type(torch.LongTensor).to(device=device),
                 torch.tensor([mapping[x.item()]
                               for x in prediction]).to(device=device)))
            self.y = torch.cat(
                (self.y.type(torch.LongTensor).to(device=device),
                 y.to(device=device)))
            # return self.prediction, self.y

        def compute(self):
            return self.prediction, self.y

    train_metrics = {
        'accuracy':
        Accuracy(),
        'loss':
        Loss(nn.CrossEntropyLoss(weight=weights)),
        'precision_recall':
        MetricsLambda(PrecisionRecallTable, Precision(), Recall(),
                      train_loader.dataset.classes),
        'cmatrix':
        MetricsLambda(CMatrixTable,
                      ConfusionMatrix(INFO['dataset-info']['num-of-classes']),
                      train_loader.dataset.classes)
    }

    # Every validation metric wraps its own EntropyPrediction instance.
    val_metrics = {
        'accuracy':
        MetricsLambda(Labels2Acc, EntropyPrediction()),
        'precision_recall':
        MetricsLambda(Labels2PrecisionRecall, EntropyPrediction(),
                      val_loader.dataset.classes),
        'cmatrix':
        MetricsLambda(Labels2CMatrix, EntropyPrediction(),
                      val_loader.dataset.classes)
    }

    # ------------------------------------
    # 5. Create trainer
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        nn.CrossEntropyLoss(weight=weights),
                                        device=device)

    # ------------------------------------
    # 6. Create evaluator
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=train_metrics,
                                                  device=device)
    val_evaluator = create_supervised_evaluator(model,
                                                metrics=val_metrics,
                                                device=device)

    desc = 'ITERATION - loss: {:.4f}'
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    # ------------------------------------
    # 7. Create event hooks

    # Update process bar on each iteration completed.
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        log_interval = 1
        # 1-based position of the current batch inside the epoch.
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    # Rewind the progress bar at the start of every epoch.
    @trainer.on(Events.EPOCH_STARTED)
    def refresh_pbar(engine):
        pbar.refresh()
        pbar.n = pbar.last_print_n = 0

    # Compute metrics on train data on each epoch completed.
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        print('Checking on training set.')
        train_evaluator.run(train4val_loader)
        metrics = train_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['loss']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """ Training Results - Epoch: {} Avg accuracy: {:.4f} Avg loss: {:.4f} precision_recall: \n{} confusion matrix: \n{} """.format(engine.state.epoch, avg_accuracy, avg_loss,
                   precision_recall['pretty'], cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Train Acc': avg_accuracy},
                           engine.state.epoch)
        writer.add_scalars('Aggregate/Loss', {'Train Loss': avg_loss},
                           engine.state.epoch)

    # Compute metrics on val data once every 50 completed epochs.
    cpe = CustomPeriodicEvent(n_epochs=50)
    cpe.attach(trainer)

    @trainer.on(cpe.Events.EPOCHS_50_COMPLETED)
    def log_validation_results(engine):
        pbar.clear()
        print('* - * - * - * - * - * - * - * - * - * - * - * - *')
        print('Checking on validation set.')
        val_evaluator.run(val_loader)
        metrics = val_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """ Validating Results - Epoch: {} Avg accuracy: {:.4f} precision_recall: \n{} confusion matrix: \n{} """.format(engine.state.epoch, avg_accuracy,
                   precision_recall['pretty'], cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Val Acc': avg_accuracy},
                           engine.state.epoch)
        # The last column of the precision/recall table is logged as the
        # average precision (row 0) and average recall (row 1).
        writer.add_scalars(
            'Aggregate/Score', {
                'Val avg precision': precision_recall['data'][0, -1],
                'Val avg recall': precision_recall['data'][1, -1]
            }, engine.state.epoch)

    # Save the model every 10 epochs, keeping the 2 most recent files.
    save_model_handler = ModelCheckpoint(os.environ['savedir'],
                                         '',
                                         save_interval=10,
                                         n_saved=2)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, save_model_handler,
                              {'model': model})

    # Update learning-rate due to scheduler.
    trainer.add_event_handler(Events.EPOCH_STARTED, ignite_scheduler)

    # ------------------------------------
    # Run
    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
def run_training(model, optimizer, scheduler, output_path, train_loader,
                 val_loader, epochs, patience, epochs_pretrain,
                 mixed_precision, classes_weights):
    """Train ``model``, then reload the saved checkpoint and evaluate it.

    Args:
        model: Network to train.
        optimizer: Optimizer used by the trainer; stored in the checkpoint.
        scheduler: Optional handler called on every ``ITERATION_STARTED``; also
            stored in the checkpoint when given.
        output_path: Root directory; ``model_ckpt`` and ``log_dir`` are placed
            below it.
        train_loader: Training data loader.
        val_loader: Validation data loader.
        epochs: Maximum number of epochs.
        patience: Early-stopping patience, measured on validation accuracy.
        epochs_pretrain: Forwarded to the trainer factory.
        mixed_precision: Forwarded to the trainer factory.
        classes_weights: Optional class-weight tensor for the cross entropy.

    Returns:
        Tuple ``(model, train_metrics, val_metrics)`` where the metrics come
        from re-evaluating the reloaded checkpoint.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    weights_on_device = (None if classes_weights is None
                         else classes_weights.to(device))
    crit = nn.CrossEntropyLoss(weight=weights_on_device)

    # One dict, hence the same metric objects, is handed to every evaluator.
    metrics = {"accuracy": Accuracy(), "loss": Loss(crit)}

    def new_evaluator():
        return create_supervised_evaluator(model, metrics=metrics,
                                           device=device)

    trainer = create_supervised_trainer_with_pretraining(
        model,
        optimizer,
        crit,
        device=device,
        epochs_pretrain=epochs_pretrain,
        mixed_precision=mixed_precision)
    train_evaluator = new_evaluator()
    val_evaluator = new_evaluator()

    # Output locations.
    path_ckpt = os.path.join(output_path, "model_ckpt")
    log_dir = os.path.join(output_path, "log_dir")
    os.makedirs(log_dir, exist_ok=True)

    # Tensorboard: one tag per evaluator, steps taken from the trainer.
    tb_logger = TensorboardLogger(log_dir=log_dir)
    for tag, engine_to_log in (("training", train_evaluator),
                               ("validation", val_evaluator)):
        tb_logger.attach(engine_to_log,
                         log_handler=OutputHandler(
                             tag=tag,
                             metric_names=["accuracy", "loss"],
                             another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

    # Training progress bar.
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names="all")

    def log_training_results(engine):
        # Evaluate on both loaders, then print one line per split.
        train_evaluator.run(train_loader)
        val_evaluator.run(val_loader)
        train_scores = train_evaluator.state.metrics
        val_scores = val_evaluator.state.metrics
        train_loss, val_loss = train_scores["loss"], val_scores["loss"]
        train_acc, val_acc = train_scores["accuracy"], val_scores["accuracy"]
        pbar.log_message(
            "Training Results - Epoch: {} Loss: {:.6f} Accuracy: {:.6f}".format(
                engine.state.epoch, train_loss, train_acc))
        pbar.log_message(
            "Validation Results - Epoch: {} Loss: {:.6f} Accuracy: {:.6f}".format(
                engine.state.epoch, val_loss, val_acc))
        pbar.n = pbar.last_print_n = 0

    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results)

    def get_val_acc(engine):
        # Score for checkpointing and early stopping: higher is better.
        return engine.state.metrics['accuracy']

    # Checkpointing and early stopping, both driven by validation accuracy.
    checkpointer = ModelCheckpoint(path_ckpt,
                                   "model",
                                   score_function=get_val_acc,
                                   score_name="accuracy",
                                   require_empty=False)
    early_stopper = EarlyStopping(patience, get_val_acc, trainer)

    to_save = {'optimizer': optimizer, 'model': model}
    has_scheduler = scheduler is not None
    if has_scheduler:
        to_save["scheduler"] = scheduler

    val_evaluator.add_event_handler(Events.COMPLETED, checkpointer, to_save)
    val_evaluator.add_event_handler(Events.COMPLETED, early_stopper)

    if has_scheduler:
        trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Free cached memory after every iteration of every engine.
    for any_engine in (trainer, train_evaluator, val_evaluator):
        any_engine.add_event_handler(Events.ITERATION_COMPLETED,
                                     lambda _: _empty_cache())

    trainer.run(train_loader, max_epochs=epochs)
    tb_logger.close()

    # Evaluation with the saved model: the first "*.pth" file found is loaded.
    checkpoint_file = glob.glob(os.path.join(path_ckpt, "*.pth"))[0]
    model.load_state_dict(torch.load(checkpoint_file)["model"])

    best_train_evaluator = new_evaluator()
    best_val_evaluator = new_evaluator()
    best_train_evaluator.run(train_loader)
    best_val_evaluator.run(val_loader)

    _pretty_print("Evaluating best model")
    pbar.log_message(
        "Best model on training set - Loss: {:.6f} Accuracy: {:.6f}".format(
            best_train_evaluator.state.metrics["loss"],
            best_train_evaluator.state.metrics["accuracy"]))
    pbar.log_message(
        "Best model on validation set - Loss: {:.6f} Accuracy: {:.6f}".format(
            best_val_evaluator.state.metrics["loss"],
            best_val_evaluator.state.metrics["accuracy"]))

    return (model, best_train_evaluator.state.metrics,
            best_val_evaluator.state.metrics)
# NOTE(review): the four methods below belong to a metric class whose header
# is outside this excerpt -- presumably the `F1` class used in `metric` at the
# bottom; confirm against the full file.
def __init__(self, output_transform=binary_transform):
    # Buffers are created before the base initialiser runs; they are replaced
    # by empty lists in reset().
    self._y = None
    self._pred = None
    super().__init__(output_transform=output_transform)

def reset(self):
    """Drop everything accumulated so far."""
    self._y = list()
    self._pred = list()
    super().reset()

def update(self, output):
    """Store one batch; ``output`` unpacks as ``(y_pred, y)``."""
    y_pred, y = output
    self._y.append(y)
    self._pred.append(y_pred)

def compute(self):
    """Return the weighted F1 score over all batches stored since reset()."""
    # Concatenate along axis 0 and move to the CPU before calling f1_score.
    y_pred = torch.cat(self._pred, 0).cpu()
    y = torch.cat(self._y, 0).cpu()
    score = f1_score(y, y_pred, average='weighted')
    return score

# Metrics by name; the three accuracies differ only in their output transform.
metric = {
    'acc2': Accuracy(output_transform=binary_transform),
    'acc5': Accuracy(output_transform=five_transform),
    'acc7': Accuracy(output_transform=seven_transform),
    'f1': F1(),
    'corr': Pearson(),
    'mae': MeanAbsoluteError()
}
def _train(
    self,
    train_data,
    val_data,
    test_data,
    writer,
    experiment,
    dry_run: bool = False,
) -> None:
    """Train ``self.model`` and log metrics, checkpoints and sample plots.

    Args:
        train_data: Training data, handed to ``_preprocess_for_training``.
        val_data: Validation data, handed to ``_preprocess_for_training``.
        test_data: Test data, handed to ``_preprocess_for_training``.
        writer: Tensorboard-style writer (``add_scalar`` is called on it and it
            is passed to ``visualization.plot_samples``).
        experiment: Experiment tracker; ``log_metric`` is called on it.
        dry_run: If true, train for one epoch of a single batch.

    Config keys read: ``num_samples_to_plot``, ``accumulate_train_metrics``,
    ``print_every``, ``num_epochs``, ``num_samples`` and ``batch_size``.

    After every epoch the whole model is saved under
    ``out_dir/checkpoints/epoch<N>/model.pt``; after training it is saved as
    ``out_dir/model.pt``.
    """
    use_cuda = torch.cuda.is_available()

    # Preprocess all datasets.
    logger.info("Preprocessing datasets...")
    train_loader = self._preprocess_for_training("train", train_data, use_cuda)
    val_loader = self._preprocess_for_training("val", val_data, use_cuda)
    test_loader = self._preprocess_for_training("test", test_data, use_cuda)
    logger.info("")

    # Set up model and move it to device.
    logger.info("Creating model...")
    self._create_model(self.num_classes)
    device = torch.device("cuda" if use_cuda else "cpu")
    logger.info(f" device: {device}")
    self.model = self.model.to(device)

    # Set up optimizer and loss.
    optimizer = self._create_optimizer()
    loss_func = nn.CrossEntropyLoss()
    logger.info(" loss function: cross-entropy")
    logger.info("")

    # Defined before the handlers that report it, so they do not rely on a
    # name that is only bound further down.
    num_epochs = 1 if dry_run else self.config.get("num_epochs", 5)

    # Dedicate a few images that will be plotted as samples to tensorboard.
    num_samples_to_plot = self.config.get("num_samples_to_plot", 5)

    def get_samples(loader):
        # First `num_samples_to_plot` items of the loader's dataset, or
        # (None, None) when there is no loader.
        if loader is None:
            return None, None
        return next(
            iter(DataLoader(loader.dataset, batch_size=num_samples_to_plot))
        )

    train_sample_images, train_sample_labels = get_samples(train_loader)
    val_sample_images, val_sample_labels = get_samples(val_loader)
    test_sample_images, test_sample_labels = get_samples(test_loader)

    # Configure trainer and metrics.
    accumulate_train_metrics = self.config.get("accumulate_train_metrics", True)

    # We need to transform the output of the trainer and metrics here to accumulate
    # metrics during training (otherwise, we have to re-evaluate on the complete
    # train set which takes a long time). By default, the trainer outputs
    # `loss.item()` and the metrics expect `y_pred, y` (which is what the evaluator
    # outputs). We are now outputting `y_pred, y, loss` from the trainer and then
    # slicing off the `loss` before it goes into the metric.
    # See also the footnote here but note that it's a bit wrong:
    # https://pytorch.org/ignite/quickstart.html#
    def trainer_output_transform(x, y, y_pred, loss):
        return y_pred, y, loss.item()

    def metrics_output_transform(output):
        return output[:2]  # use only y_pred, y

    trainer = create_supervised_trainer(
        self.model,
        optimizer,
        loss_func,
        device=device,
        output_transform=trainer_output_transform,
    )

    if accumulate_train_metrics:
        # TODO: Maybe put train_metrics and val_metrics into one dict.
        train_metrics = {
            "accuracy": Accuracy(output_transform=metrics_output_transform),
            "loss": Loss(loss_func, output_transform=metrics_output_transform),
            # "confusion_matrix": ConfusionMatrix(num_classes),
        }
        for name, metric in train_metrics.items():
            # Attach metrics to trainer to accumulate them during training.
            metric.attach(trainer, name)

    val_metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(loss_func),
        # "confusion_matrix": ConfusionMatrix(num_classes),
    }
    evaluator = create_supervised_evaluator(
        self.model, metrics=val_metrics, device=device
    )

    @trainer.on(
        Events.ITERATION_COMPLETED(every=self.config.get("print_every", 100))
    )
    def log_batch(trainer):
        batch = (trainer.state.iteration - 1) % trainer.state.epoch_length + 1
        logger.info(
            f"Epoch {trainer.state.epoch} / {num_epochs}, "
            f"batch {batch} / {trainer.state.epoch_length}: "
            # Index 2 is the loss appended by trainer_output_transform.
            f"Loss: {trainer.state.output[2]:.3f}"
        )

    def log_results(name, metrics, epoch):
        """Log results of an epoch to stdout, tensorboard and comet."""
        logger.info(
            f"{name}: Average loss: {metrics['loss']:.3f}, "
            f"Average accuracy: {metrics['accuracy']:.3f}"
        )
        experiment.log_metric(f"{name}_loss", metrics["loss"])
        experiment.log_metric(f"{name}_accuracy", metrics["accuracy"])
        writer.add_scalar(f"{name}_loss", metrics["loss"], epoch)
        writer.add_scalar(f"{name}_accuracy", metrics["accuracy"], epoch)

    # With 'accumulate_train_metrics' the train metrics come from the trainer
    # itself; otherwise the complete train set is evaluated again each epoch.
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_epoch(trainer):
        logger.info("")
        logger.info(f"Epoch {trainer.state.epoch} / {num_epochs} results: ")

        # Train data.
        if accumulate_train_metrics:
            log_results("train", trainer.state.metrics, trainer.state.epoch)
            logger.info("(train metrics are accumulated during training; "
                        "to re-evaluate on the complete train set after training, "
                        "use config parameter 'accumulate_train_metrics': False)")
        else:
            evaluator.run(train_loader)
            log_results("train", evaluator.state.metrics, trainer.state.epoch)

        # Val data.
        if val_loader:
            evaluator.run(val_loader)
            log_results("val", evaluator.state.metrics, trainer.state.epoch)

        # Test data.
        if test_loader:
            evaluator.run(test_loader)
            log_results("test", evaluator.state.metrics, trainer.state.epoch)

        logger.info("")

    @trainer.on(Events.EPOCH_COMPLETED)
    def checkpoint_model(trainer):
        # TODO: Do not checkpoint at every step.
        checkpoint_dir = (
            self.out_dir / "checkpoints" / f"epoch{trainer.state.epoch}"
        )
        checkpoint_dir.mkdir(parents=True, exist_ok=True)
        torch.save(self.model, checkpoint_dir / "model.pt")

    @trainer.on(Events.EPOCH_COMPLETED)
    def plot_samples(trainer):
        """Plot a few sample images and probabilites to tensorboard."""

        def write_samples_plot(name, sample_images, sample_labels):
            # TODO: This can be improved by just using the outputs already
            # calculated in evaluator.state.output in the functions above.
            # Problem: At least in the train evaluator, the batches are not equal,
            # so the plotted images will differ from run to run.
            if sample_images is None:
                return

            # Predict in eval mode: if no evaluator ran this epoch the model
            # is still in train mode, where dropout would randomise the plot
            # and batch-norm statistics would be updated by the sample batch.
            # The previous mode is restored afterwards.
            was_training = self.model.training
            self.model.eval()
            try:
                with torch.no_grad():
                    sample_output = self.model(sample_images.to(device))
                    sample_pred = torch.softmax(sample_output, dim=1)
            finally:
                self.model.train(was_training)

            visualization.plot_samples(
                writer,
                f"{name}-samples",
                trainer.state.epoch,
                sample_images.to("cpu").numpy(),
                sample_labels.to("cpu").numpy(),
                sample_pred.to("cpu").numpy(),
            )

        write_samples_plot("train", train_sample_images, train_sample_labels)
        write_samples_plot("val", val_sample_images, val_sample_labels)
        write_samples_plot("test", test_sample_images, test_sample_labels)

    # Start training.
    if dry_run:
        num_batches = 1
        logger.info(f"Training model on device {device}... (DRY RUN, only 1 batch)")
    elif "num_samples" in self.config:
        # TODO: Make sure batch_size doesn't differ from the value extracted during
        # preprocessing.
        batch_size = self.config.get("batch_size", 128)
        # Ceiling division: the smallest number of batches covering
        # num_samples (no extra batch when it is an exact multiple), but
        # never fewer than one batch.
        num_batches = max(1, int(-(-self.config["num_samples"] // batch_size)))
        logger.info(
            f"Training model on device {device}... (using "
            f"{self.config['num_samples']} of {len(train_loader.dataset)} samples)"
        )
    else:
        num_batches = None  # all batches
        logger.info(f"Training model on device {device}...")
        logger.info(
            "(if this takes too long, train on less data with the config "
            "parameter 'num_samples')"
        )
    logger.info("(show more steps by setting the config parameter 'print_every')")
    logger.info("")
    trainer.run(train_loader, max_epochs=num_epochs, epoch_length=num_batches)
    logger.info("Training finished!")

    # Save the trained model.
    torch.save(self.model, self.out_dir / "model.pt")
def test_integration():
    """Check ``RunningAverage`` against hand-computed running averages.

    A dummy engine emits ``(loss, y_pred, y)`` per iteration. Handlers keep a
    manual exponential running average of the batch accuracy and of the loss
    and assert, after every iteration, that both equal the values published by
    the attached ``RunningAverage`` metrics. The engine is run twice with
    fresh inputs.
    """

    def make_streams(n_iters, batch_size, n_classes):
        # Labels are drawn before predictions, then both are wrapped in
        # iterators; the "loss" is simply the iteration index.
        y_true = iter(
            np.random.randint(0, n_classes, size=(n_iters, batch_size)))
        y_pred = iter(np.random.rand(n_iters, batch_size, n_classes))
        return iter(range(n_iters)), y_true, y_pred

    # `update_fn` closes over these three names; they are rebound before
    # each run below.
    loss_values, y_true_batch_values, y_pred_batch_values = make_streams(
        100, 10, 10)

    def update_fn(engine, batch):
        loss_value = next(loss_values)
        y_true_batch = next(y_true_batch_values)
        y_pred_batch = next(y_pred_batch_values)
        return (loss_value, torch.from_numpy(y_pred_batch),
                torch.from_numpy(y_true_batch))

    trainer = Engine(update_fn)
    alpha = 0.98

    acc_metric = RunningAverage(
        Accuracy(output_transform=lambda x: [x[1], x[2]]), alpha=alpha)
    acc_metric.attach(trainer, 'running_avg_accuracy')

    avg_output = RunningAverage(output_transform=lambda x: x[0], alpha=alpha)
    avg_output.attach(trainer, 'running_avg_output')

    # One-element list so the handler can update the value in place.
    running_avg_acc = [None]

    def manual_running_avg_acc(engine):
        _, y_pred, y = engine.state.output
        _, indices = torch.max(y_pred, 1)
        correct = torch.eq(indices, y).view(-1)
        batch_acc = torch.sum(correct).item() * 1.0 / correct.shape[0]
        previous = running_avg_acc[0]
        if previous is None:
            running_avg_acc[0] = batch_acc
        else:
            running_avg_acc[0] = previous * alpha + (1.0 - alpha) * batch_acc
        engine.state.running_avg_acc = running_avg_acc[0]

    def running_avg_output_init(engine):
        engine.state.running_avg_output = None

    def running_avg_output_update(engine):
        current = engine.state.output[0]
        if engine.state.running_avg_output is None:
            engine.state.running_avg_output = current
        else:
            engine.state.running_avg_output = engine.state.running_avg_output * alpha + \
                (1.0 - alpha) * current

    def assert_equal_running_avg_acc_values(engine):
        expected = engine.state.running_avg_acc
        actual = engine.state.metrics['running_avg_accuracy']
        assert expected == actual, "{} vs {}".format(expected, actual)

    def assert_equal_running_avg_output_values(engine):
        expected = engine.state.running_avg_output
        actual = engine.state.metrics['running_avg_output']
        assert expected == actual, "{} vs {}".format(expected, actual)

    # Registered in the order in which they must fire.
    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              manual_running_avg_acc)
    trainer.add_event_handler(Events.EPOCH_STARTED, running_avg_output_init)
    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              running_avg_output_update)
    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              assert_equal_running_avg_acc_values)
    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              assert_equal_running_avg_output_values)

    # First run is seeded; the second continues from the same random state.
    for seed in (10, None):
        if seed is not None:
            np.random.seed(seed)
        running_avg_acc[0] = None
        n_iters = 10
        data = list(range(n_iters))
        loss_values, y_true_batch_values, y_pred_batch_values = make_streams(
            n_iters, 10, 10)
        trainer.run(data, max_epochs=1)
def train_epoch(self):
    """Train the model for ``self.num_epochs`` epochs and save it.

    Builds the network from ``GQSEResNeXt.se_resnext_18()``, moves it to the
    GPU when CUDA is available, and trains it with Adam and cross-entropy
    loss through ignite's supervised trainer. After every epoch, accuracy
    and loss are evaluated on the training and validation loaders and
    written with ``tqdm.write``. Finally the whole model object is saved
    with ``torch.save`` to ``str(self.version_name)``.

    Reads ``learning_rate``, ``log_interval``, ``num_epochs``,
    ``version_name``, ``data_loader_train`` and ``data_loader_valid``
    from ``self``.
    """
    device = 'cpu'

    ########### model ###########
    # Alternatives that can be swapped in here: model.GQNet(),
    # GQResnet.resnet18(), GQResNeXt.resnext18(),
    # GQSEResNet.se_resnet_18(), GQSKResnet.SKNet18(),
    # GQSKResNeXt.SKNet18()
    Model = GQSEResNeXt.se_resnext_18()
    total_model_parameters = sum(x.numel() for x in Model.parameters())
    print("Model have %.2fM paramerters in total" %
          (total_model_parameters / 1e6))
    print("SE_ResNeXt")
    if torch.cuda.is_available():
        print("pytorch gpu")
        device = 'cuda'
        Model = Model.cuda()

    ########### optimizer, loss ###########
    optimizer = torch.optim.Adam(Model.parameters(),
                                 lr=self.learning_rate,
                                 weight_decay=0.0001)
    loss = torch.nn.CrossEntropyLoss()

    ########### create trainer and evaluator ###########
    trainer = create_supervised_trainer(Model, optimizer, loss, device=device)
    evaluator = create_supervised_evaluator(Model,
                                            metrics={
                                                "accuracy": Accuracy(),
                                                "loss": Loss(loss)
                                            },
                                            device=device)

    def evaluate(loader):
        # Run the evaluator over ``loader`` and return (accuracy, loss).
        evaluator.run(loader)
        metrics = evaluator.state.metrics
        return metrics['accuracy'], metrics['loss']

    ########### log ###########
    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(self.data_loader_train),
                desc=desc.format(0))

    # Guarantee the progress bar is closed even if training, evaluation or
    # saving raises.
    try:
        ########### train loss ###########
        @trainer.on(Events.ITERATION_COMPLETED)
        def log_training_loss(engine):
            # 1-based position of this iteration within the current epoch.
            iteration_in_epoch = (engine.state.iteration - 1) % len(
                self.data_loader_train) + 1
            if iteration_in_epoch % self.log_interval == 0:
                pbar.desc = desc.format(engine.state.output)
                pbar.update(self.log_interval)

        ########### train results ###########
        @trainer.on(Events.EPOCH_COMPLETED)
        def log_training_results(engine):
            pbar.refresh()
            avg_accuracy, avg_cross_loss = evaluate(self.data_loader_train)
            tqdm.write(
                "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
                .format(engine.state.epoch, avg_accuracy, avg_cross_loss))

        ########### test results ###########
        @trainer.on(Events.EPOCH_COMPLETED)
        def log_validation_results(engine):
            avg_accuracy, avg_cross_loss = evaluate(self.data_loader_valid)
            tqdm.write(
                "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}"
                .format(engine.state.epoch, avg_accuracy, avg_cross_loss))
            # Rewind the bar so the next epoch starts counting from zero.
            pbar.n = pbar.last_print_n = 0

        ########### trainer start ###########
        trainer.run(self.data_loader_train, max_epochs=self.num_epochs)

        ############## save model ################
        torch.save(Model, str(self.version_name))
    finally:
        pbar.close()