def _init_scheduler(self):
    if self.hparams.scheduler_name == "none":
        self.scheduler = None
    elif self.hparams.scheduler_name == "warmup_with_cosine":
        from ignite.contrib.handlers import LinearCyclicalScheduler, CosineAnnealingScheduler, ConcatScheduler
        lr = self.hparams.lr
        if self.hparams.run_params["epoch_length"]:
            epoch_length = self.hparams.run_params["epoch_length"]
        else:
            epoch_length = len(self.train_loader)
        num_epochs = self.hparams.run_params["max_epochs"]
        scheduler_1 = LinearCyclicalScheduler(self.optimizer, "lr", start_value=lr*0.01, end_value=lr,
                                              cycle_size=epoch_length*2)
        scheduler_2 = CosineAnnealingScheduler(self.optimizer, "lr", start_value=lr, end_value=lr*0.001,
                                               cycle_size=num_epochs*epoch_length)
        durations = [epoch_length, ]
        self.scheduler = ConcatScheduler(schedulers=[scheduler_1, scheduler_2], durations=durations)
    elif self.hparams.scheduler_name == "warmup_with_cosine_100":
        from ignite.contrib.handlers import LinearCyclicalScheduler, CosineAnnealingScheduler, ConcatScheduler
        lr = self.hparams.lr
        if self.hparams.run_params["epoch_length"]:
            epoch_length = self.hparams.run_params["epoch_length"]
        else:
            epoch_length = len(self.train_loader)
        num_epochs = self.hparams.run_params["max_epochs"]
        scheduler_1 = LinearCyclicalScheduler(self.optimizer, "lr", start_value=lr*0.01, end_value=lr,
                                              cycle_size=epoch_length*2)
        scheduler_2 = CosineAnnealingScheduler(self.optimizer, "lr", start_value=lr, end_value=lr*0.01,
                                               cycle_size=num_epochs*epoch_length)
        durations = [epoch_length, ]
        self.scheduler = ConcatScheduler(schedulers=[scheduler_1, scheduler_2], durations=durations)
    elif self.hparams.scheduler_name == "warmup_with_cosine_10":
        from ignite.contrib.handlers import LinearCyclicalScheduler, CosineAnnealingScheduler, ConcatScheduler
        lr = self.hparams.lr
        if self.hparams.run_params["epoch_length"]:
            epoch_length = self.hparams.run_params["epoch_length"]
        else:
            epoch_length = len(self.train_loader)
        num_epochs = self.hparams.run_params["max_epochs"]
        scheduler_1 = LinearCyclicalScheduler(self.optimizer, "lr", start_value=lr*0.1, end_value=lr,
                                              cycle_size=epoch_length*2)
        scheduler_2 = CosineAnnealingScheduler(self.optimizer, "lr", start_value=lr, end_value=lr*0.1,
                                               cycle_size=num_epochs*epoch_length)
        durations = [epoch_length, ]
        self.scheduler = ConcatScheduler(schedulers=[scheduler_1, scheduler_2], durations=durations)
    elif self.hparams.scheduler_name == "one_cycle_cosine_10":
        from ignite.contrib.handlers import CosineAnnealingScheduler
        lr = self.hparams.lr
        if self.hparams.run_params["epoch_length"]:
            epoch_length = self.hparams.run_params["epoch_length"]
        else:
            epoch_length = len(self.train_loader)
        num_epochs = self.hparams.run_params["max_epochs"]
        self.scheduler = CosineAnnealingScheduler(self.optimizer, "lr", start_value=lr, end_value=lr*0.1,
                                                  cycle_size=num_epochs*epoch_length)
    elif self.hparams.scheduler_name == "one_cycle_cosine_100":
        from ignite.contrib.handlers import CosineAnnealingScheduler
        lr = self.hparams.lr
        if self.hparams.run_params["epoch_length"]:
            epoch_length = self.hparams.run_params["epoch_length"]
        else:
            epoch_length = len(self.train_loader)
        num_epochs = self.hparams.run_params["max_epochs"]
        self.scheduler = CosineAnnealingScheduler(self.optimizer, "lr", start_value=lr, end_value=lr*0.01,
                                                  cycle_size=num_epochs*epoch_length)
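# A minimal, self-contained sketch of the warmup-then-cosine pattern used above,
# assuming ignite's contrib handlers API. The dummy model, the no-op update
# function and the concrete sizes (epoch_length, num_epochs, lr) are illustrative
# assumptions, not part of the original snippet.
import torch
from ignite.engine import Engine, Events
from ignite.contrib.handlers import LinearCyclicalScheduler, CosineAnnealingScheduler, ConcatScheduler

lr = 1e-3
epoch_length = 100   # iterations per epoch (assumed)
num_epochs = 10

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

# Half of a LinearCyclicalScheduler cycle ramps lr*0.01 -> lr over the first epoch.
warmup = LinearCyclicalScheduler(optimizer, "lr", start_value=lr * 0.01, end_value=lr,
                                 cycle_size=epoch_length * 2)
# Cosine decay from lr towards lr*0.001 over the full run.
cosine = CosineAnnealingScheduler(optimizer, "lr", start_value=lr, end_value=lr * 0.001,
                                  cycle_size=num_epochs * epoch_length)
# Use the warmup for the first epoch_length events, then hand over to the cosine schedule.
scheduler = ConcatScheduler(schedulers=[warmup, cosine], durations=[epoch_length])

trainer = Engine(lambda engine, batch: None)  # no-op update, only drives the schedule
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
trainer.run(range(epoch_length), max_epochs=num_epochs)
print(optimizer.param_groups[0]["lr"])  # final lr, near (slightly above) lr * 0.001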
def get_scheduler(optimizer, epochs, learning_rate, train_loader_size):
    scheduler = CosineAnnealingScheduler(optimizer, 'lr', learning_rate, learning_rate / 1000,
                                         epochs * train_loader_size)
    scheduler = create_lr_scheduler_with_warmup(scheduler, 0, 1000, learning_rate)
    return scheduler
def get_lr_scheduler(
    config: ConfigSchema, optimizer: Optimizer, trainer: Engine, evaluator: Engine
):
    if config.num_warmup_epochs:
        length = config.num_epochs - config.num_warmup_epochs
    else:
        length = config.num_epochs

    if config.lr_scheduler == "cosine":
        lr_scheduler = CosineAnnealingScheduler(
            optimizer,
            "lr",
            config.learning_rate,
            0.001 * config.learning_rate,
            cycle_size=length + 1,
        )
        if config.num_warmup_epochs:
            lr_scheduler = create_lr_scheduler_with_warmup(
                lr_scheduler, 0.0, config.num_warmup_epochs
            )
    elif config.lr_scheduler == "reduce_at_plateau":
        lr_scheduler = LRReductionEarlyStopping(
            optimizer,
            trainer=trainer,
            reduction_rate=0.1,
            num_reduction=2,
            patience=config.patience,
            score_function=lambda _: evaluator.state.metrics["accuracy"],
            num_warmup_epochs=config.num_warmup_epochs,
            warmup_start_value=0.001 * config.learning_rate,
        )
    else:
        raise ValueError(f"unknown lr scheduler {config.lr_scheduler}")
    return lr_scheduler
val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)

# #### EarlyStopping

# In[10]:

# handler = EarlyStopping(patience=30, score_function=lambda engine: engine.state.metrics['accuracy'], trainer=trainer)
# val_evaluator.add_event_handler(Events.COMPLETED, handler)

# #### LR Scheduler

# In[11]:

scheduler = CosineAnnealingScheduler(optimizer, 'lr', init_lr, end_lr, len(loader))
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)


@trainer.on(Events.ITERATION_COMPLETED)
def print_lr(engine):
    epoch = engine.state.epoch
    iteration = engine.state.iteration
    if epoch < 2 and iteration % 100 == 0:
        print(f'Iteration {iteration} | LR {optimizer.param_groups[0]["lr"]}')


# #### Compute and display metrics

# In[12]:
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--model", type=str, default='ffn', help="model's name") parser.add_argument("--mode", type=int, choices=[0, 1, 2], default=None) parser.add_argument("--SNRdb", type=float, default=None) parser.add_argument("--pilot_version", type=int, choices=[1, 2], default=1) parser.add_argument("--loss_type", type=str, default="BCELoss") parser.add_argument("--train_batch_size", type=int, default=128) parser.add_argument("--valid_batch_size", type=int, default=128) parser.add_argument("--gradient_accumulation_steps", type=int, default=1) parser.add_argument("--max_norm", type=float, default=-1) parser.add_argument("--lr", type=float, default=1e-3) parser.add_argument("--noise_lambda", type=float, default=1.0) parser.add_argument("--lr_scheduler", type=str, choices=["linear", "cycle", "cosine"], default="linear") parser.add_argument("--reset_lr_scheduler", type=str, choices=["linear", "cycle", "cosine"], default=None) parser.add_argument("--reset_trainer", action='store_true') parser.add_argument("--modify_model", action='store_true') parser.add_argument("--wd", type=float, default=1e-4, help="weight decay") parser.add_argument("--eval_iter", type=int, default=10) parser.add_argument("--save_iter", type=int, default=10) parser.add_argument("--n_epochs", type=int, default=10) parser.add_argument("--flush_dataset", type=int, default=0) parser.add_argument("--no_cache", action='store_true') parser.add_argument("--with_pure_y", action='store_true') parser.add_argument("--with_h", action='store_true') parser.add_argument("--only_l1", action='store_true', help="Only loss 1") parser.add_argument("--interpolation", action='store_true', help="if interpolate between pure and reconstruction.") parser.add_argument("--data_dir", type=str, default="data") parser.add_argument("--cache_dir", type=str, default="train_cache") parser.add_argument("--output_path", type=str, default="runs", help="model save") parser.add_argument("--resume_from", type=str, default=None, help="resume training.") parser.add_argument("--first_cache_index", type=int, default=0) parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") parser.add_argument("--seed", type=int, default=43) parser.add_argument("--debug", action='store_true') args = parser.parse_args() args.output_path = os.path.join(args.output_path, f'pilot_{args.pilot_version}') args.cache_dir = os.path.join(args.data_dir, args.cache_dir) # Setup CUDA, GPU & distributed training args.distributed = (args.local_rank != -1) if not args.distributed: device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method='env://') args.n_gpu = torch.cuda.device_count() if not args.distributed else 1 args.device = device # Set seed set_seed(args) logger = setup_logger("trainer", distributed_rank=args.local_rank) # Model construction model = getattr(models, args.model)(args) model = model.to(device) optimizer = AdamW(model.parameters(), lr = args.lr, weight_decay=args.wd) if args.loss_type == "MSELoss": criterion = nn.MSELoss(reduction='sum').to(device) else: criterion = 
getattr(nn, args.loss_type, getattr(auxiliary, args.loss_type, None))().to(device) criterion2 = nn.MSELoss(reduction='sum').to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) train_dataset = SIGDataset(args, data_type="train") valid_dataset = SIGDataset(args, data_type="valid") train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, pin_memory=True, shuffle=(not args.distributed)) valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, pin_memory=True, shuffle=False) lr_scheduler = None if args.lr_scheduler == "linear": lr_scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) elif args.lr_scheduler == "cycle": lr_scheduler = LinearCyclicalScheduler(optimizer, 'lr', 0.0, args.lr, args.eval_iter * len(train_loader)) elif args.lr_scheduler == "cosine": lr_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, args.eval_iter * len(train_loader)) # Training function and trainer def update(engine, batch): model.train() y, x_label, y_pure, H = train_dataset.prepare_batch(batch, device=args.device) if args.with_pure_y and args.with_h: x_pred, y_pure_pred, H_pred = model(y, pure=y_pure, H=H, opp=True) loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps if args.loss_type == "MSELoss": loss_1 = loss_1 / x_pred.size(0) loss_noise = criterion2(y_pure_pred, y_pure) / y.size(0) / args.gradient_accumulation_steps loss_noise_h = criterion2(H_pred, H) / H.size(0) / args.gradient_accumulation_steps if args.only_l1: loss = loss_1 else: loss = loss_1 + loss_noise * args.noise_lambda + loss_noise_h output = (loss.item(), loss_1.item(), loss_noise.item(), loss_noise_h.item()) elif args.with_pure_y: x_pred, y_pure_pred = model(y, pure=y_pure if args.interpolation else None, opp=True) loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps loss_noise = criterion2(y_pure_pred, y_pure) / y.size(0) / args.gradient_accumulation_steps loss = loss_1 + loss_noise * args.noise_lambda output = (loss.item(), loss_1.item(), loss_noise.item()) elif args.with_h: x_pred, H_pred = model(y, opp=True) loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps loss_noise = criterion2(H_pred, H) / H.size(0) / args.gradient_accumulation_steps loss = loss_1 + loss_noise * args.noise_lambda output = (loss.item(), loss_1.item(), loss_noise.item()) else: x_pred = model(y) loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps loss = loss_1 output = (loss.item(), loss_1.item(), torch.zeros_like(loss_1).item()) loss.backward() if args.max_norm > 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return output trainer = Engine(update) to_save = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler} metric_names = ["loss", "l1", "ln"] if args.with_pure_y and args.with_h: metric_names.append("lnH") common.setup_common_training_handlers( trainer=trainer, train_sampler=train_loader.sampler, to_save=to_save, 
save_every_iters=len(train_loader) * args.save_iter, lr_scheduler=lr_scheduler, output_names=metric_names, with_pbars=False, clear_cuda_cache=False, output_path=args.output_path, n_saved=2, ) resume_from = args.resume_from if resume_from is not None: checkpoint_fp = Path(resume_from) assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(checkpoint_fp.as_posix()) logger.info("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix())) checkpoint = torch.load(checkpoint_fp.as_posix(), map_location="cpu") if args.reset_trainer: to_save.pop("trainer") checkpoint_to_load = to_save if 'validation' not in resume_from else {"model": model} Checkpoint.load_objects(to_load=checkpoint_to_load, checkpoint=checkpoint) if args.reset_lr_scheduler is not None: if args.reset_lr_scheduler == "linear": lr_scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) elif args.reset_lr_scheduler == "cycle": lr_scheduler = LinearCyclicalScheduler(optimizer, 'lr', 0.0, args.lr, args.eval_iter * len(train_loader)) elif args.reset_lr_scheduler == "cosine": lr_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, args.eval_iter * len(train_loader)) metrics = { "accuracy": Accuracy(lambda output: (torch.round(output[0][0]), output[1][0])), "loss_1": Loss(criterion, output_transform=lambda output: (output[0][0], output[1][0])), "loss_noise": Loss(criterion2, output_transform=lambda output: (output[0][1], output[1][1])) } if args.with_pure_y and args.with_h: metrics["loss_noise_h"] = Loss(criterion2, output_transform=lambda output: (output[0][2], output[1][2])) def _inference(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[torch.Tensor]]: model.eval() with torch.no_grad(): x, y, x_pure, H = valid_dataset.prepare_batch(batch, device=args.device, non_blocking=True) if args.with_pure_y and args.with_h: y_pred, x_pure_pred, h_pred = model(x, opp=True) outputs = (y_pred, x_pure_pred, h_pred), (y, x_pure, H) elif args.with_pure_y: y_pred, x_pure_pred = model(x, opp=True) outputs = (y_pred, x_pure_pred), (y, x_pure) elif args.with_h: y_pred, h_pred = model(x, opp=True) outputs = (y_pred, h_pred), (y, H) else: y_pred = model(x) x_pure_pred = x_pure outputs = (y_pred, x_pure_pred), (y, x_pure) return outputs evaluator = Engine(_inference) for name, metric in metrics.items(): metric.attach(evaluator, name) trainer.add_event_handler(Events.EPOCH_COMPLETED(every=args.eval_iter), lambda _: evaluator.run(valid_loader)) if args.flush_dataset > 0: trainer.add_event_handler(Events.EPOCH_COMPLETED(every=args.n_epochs//args.flush_dataset), lambda _: train_loader.dataset.reset() if args.no_cache else train_loader.dataset.reload()) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=metric_names, output_transform=lambda _: {"lr": f"{optimizer.param_groups[0]['lr']:.2e}"}) evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics))) tb_logger = common.setup_tb_logging(args.output_path, trainer, optimizer, evaluators={'validation': evaluator}, log_every_iters=1) # Store 3 best models by validation accuracy: common.gen_save_best_models_by_val_score( save_handler=DiskSaver(args.output_path, require_empty=False), evaluator=evaluator, models={"model": model}, metric_name="accuracy", n_saved=3, trainer=trainer, 
tag="validation" ) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) if args.local_rank in [-1, 0]: tb_logger.close()
        'params': parameters_of(model, (nn.BatchNorm2d, nn.PReLU)),
    }],
    lr=args.learning_rate, momentum=0.9)

class_freq = torch.from_numpy(Cityscapes.CLASS_FREQ).float()
weight = 1 / torch.log(1.02 + class_freq)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=255, weight=weight)
loss_fn = loss_fn.cuda()

warmup_iterations = 1000
scheduler = CosineAnnealingScheduler(
    optimizer, 'lr',
    args.learning_rate,
    args.learning_rate * 1e-4,
    cycle_size=args.epochs * len(train_loader) - warmup_iterations,
)
scheduler = create_lr_scheduler_with_warmup(scheduler, 0, args.learning_rate, warmup_iterations)

model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
if args.distributed:
    model = convert_syncbn_model(model)
    model = DistributedDataParallel(model)

trainer = create_segmentation_trainer(
    model, optimizer, loss_fn,
        'weight_decay': args.weight_decay,
    }, {
        'params': (p for p in model.parameters() if len(p.shape) == 1),
    },
    ],
    lr=args.learning_rate,
)

loss_fn = OHEMLoss(ignore_index=255)
loss_fn = loss_fn.cuda()

scheduler = CosineAnnealingScheduler(
    optimizer, 'lr',
    args.learning_rate,
    args.learning_rate / 1000,
    args.epochs * len(train_loader) - 1000,
)
scheduler = create_lr_scheduler_with_warmup(scheduler, 0, args.learning_rate, 1000)

model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
if args.distributed:
    model = convert_syncbn_model(model)
    model = DistributedDataParallel(model)

trainer = create_segmentation_trainer(
    model, optimizer, loss_fn,
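# In the two snippets above the cosine cycle is shortened by the 1000 warmup
# iterations so that warmup plus decay together span exactly
# args.epochs * len(train_loader) steps. A small sanity check of that arithmetic,
# with purely illustrative numbers:
epochs, iters_per_epoch, warmup_iters = 90, 500, 1000
total_iters = epochs * iters_per_epoch
cosine_cycle = total_iters - warmup_iters          # 44000 decay steps
assert warmup_iters + cosine_cycle == total_iters  # the schedule covers the whole run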
def train(model_name: str, dir_dataset: Path, dir_model: Path, img_type: str, architecture: str, epoch: int, batch_size: int, multi_sample_aug: str, seed: int, debug: bool): logger = getLogger('root') tic = time.time() random.seed(seed) np.random.seed(seed) train_csv = pd.read_csv(dir_dataset / 'train.csv') dir_images = Path('../../input/train_raw') train_images_all = sorted(os.listdir(str(dir_images))) random.shuffle(train_images_all) num_train = int(len(train_images_all) * 0.9) train_images = train_images_all[:num_train] valid_images = train_images_all[num_train:] if debug: train_images = train_images[:4000] valid_images = valid_images[:2000] logger.info('train images: {}'.format(train_images[:5])) logger.info('valid images: {}'.format(valid_images[:5])) logger.info('==> prepare dataset') logger.info(f'==> use {multi_sample_aug}') if multi_sample_aug == 'mixup': train_mixup_dataset = BengaliMixUpDataset( 1.0, dir_images, train_csv=train_csv, list_images=train_images, is_aug=True, get_augmenter_func=get_augmenter) elif multi_sample_aug == 'cutmix': train_mixup_dataset = BengaliCutMixDataset( 1.0, dir_images, train_csv=train_csv, list_images=train_images, is_aug=True, get_augmenter_func=get_augmenter) elif multi_sample_aug == 'cutmixup': train_mixup_dataset = BengaliCutMixUpDataset( 0.5, 0.5, dir_images, train_csv=train_csv, list_images=train_images, is_aug=True, get_augmenter_func=get_augmenter) else: raise ValueError('Unknown Augmentation') train_dataset = BengaliDataset(dir_images, train_csv=train_csv, list_images=train_images, is_aug=False) valid_dataset = BengaliDataset(dir_images, train_csv=train_csv, list_images=valid_images, is_aug=False) logger.info('==> create data loader') train_loader = DataLoader(train_mixup_dataset, num_workers=7, batch_sampler=BalancedSampler( train_csv, train_images, len(train_images) // batch_size, batch_size)) valid_loader = DataLoader(valid_dataset, batch_size=batch_size, num_workers=7, shuffle=False) eval_train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=7, shuffle=False, sampler=SequentialSampler( train_dataset, num_samples=len(valid_images))) logger.info('==> build model') model = build_model(architecture, img_type, pretrained=True) model.cuda() optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3) loss_f = BengaliLoss() device = "cuda" if torch.cuda.is_available() else "cpu" trainer = create_supervised_trainer(model, optimizer, loss_f, device=device) ProgressBar(persist=True, desc='Train').attach(trainer) def extract_grapheme_root(output): y_pred, (y, _) = output return y_pred[:, :NUM_GRAPHEME_ROOT], y[:, 0] def extract_vowel_diacritic(output): y_pred, (y, _) = output return y_pred[:, NUM_GRAPHEME_ROOT:NUM_GRAPHEME_ROOT + NUM_VOWEL], y[:, 1] def extract_consonant_diacritic(output): y_pred, (y, _) = output return y_pred[:, NUM_GRAPHEME_ROOT + NUM_VOWEL:], y[:, 2] metrics = { 'accuracy_gr': Accuracy(extract_grapheme_root), 'accuracy_vd': Accuracy(extract_vowel_diacritic), 'accuracy_cd': Accuracy(extract_consonant_diacritic), 'recall_gr': Recall(extract_grapheme_root, average=True), 'recall_vd': Recall(extract_vowel_diacritic, average=True), 'recall_cd': Recall(extract_consonant_diacritic, average=True), 'ave_recall': 0.5 * Recall(extract_grapheme_root, average=True) + 0.25 * Recall(extract_vowel_diacritic, average=True) + 0.25 * Recall(extract_consonant_diacritic, average=True) } evaluator_train = create_supervised_evaluator(model, metrics=metrics, device=device) evaluator = 
create_supervised_evaluator(model, metrics=metrics, device=device) ProgressBar(persist=True, desc='Train Evaluation').attach(evaluator_train) ProgressBar(persist=True, desc='Valid Evaluation').attach(evaluator) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(trainer): evaluator_train.run(eval_train_loader) eval_print('Train', trainer.state.epoch, evaluator_train.state.metrics) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(trainer): evaluator.run(valid_loader) eval_print('Valid', trainer.state.epoch, evaluator.state.metrics) handler = ModelCheckpoint(dirname=dir_model, filename_prefix=f'temp', create_dir=True, require_empty=False, n_saved=1, score_function=score_function) evaluator.add_event_handler(Events.COMPLETED, handler, {'model': model}) lr_handler = CosineAnnealingScheduler(optimizer, 'lr', 1e-3, 1e-7, len(train_loader) * epoch, save_history=True) trainer.add_event_handler(Events.ITERATION_COMPLETED, lr_handler) trainer.run(train_loader, max_epochs=epoch) saved_model = handler._saved[0][1][0] model_new_name = str(dir_model / f'{model_name}.pth') logger.info(f'rename model {saved_model} to {model_new_name}') os.rename(saved_model, model_new_name) plt.plot(trainer.state.param_history['lr']) plt.xlabel('batch') plt.ylabel('learning rate') plt.savefig('_log/learning_rate.png') plt.close() model_conf = { 'model_name': model_name, 'architecture': architecture, 'seed': seed, 'img_type': img_type, 'multi_sample_aug': multi_sample_aug } with open(str(params.dir_model / f'{model_name}.json'), 'w') as f: json.dump(model_conf, f, indent=2) elapsed_time = time.time() - tic logger.info(f'elapsed time: {elapsed_time / 60.0:.1f} [min]')
def train(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default='wikitext-2', help="One of ('wikitext-103', 'wikitext-2') or a dict of splits paths." ) parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument("--embed_dim", type=int, default=410, help="Embeddings dim") parser.add_argument("--hidden_dim", type=int, default=2100, help="Hidden dimension") parser.add_argument("--num_max_positions", type=int, default=256, help="Max input length") parser.add_argument("--num_heads", type=int, default=10, help="Number of heads") parser.add_argument("--num_layers", type=int, default=16, help="NUmber of layers") parser.add_argument("--dropout", type=float, default=0.1, help="Dropout") parser.add_argument("--initializer_range", type=float, default=0.02, help="Dropout") parser.add_argument("--train_batch_size", type=int, default=8, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=8, help="Batch size for validation") parser.add_argument("--lr", type=float, default=2.5e-4, help="Learning rate") parser.add_argument("--max_norm", type=float, default=0.25, help="Clipping gradient norm") parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay") parser.add_argument("--n_epochs", type=int, default=200, help="Number of training epochs") parser.add_argument("--n_warmup", type=int, default=1000, help="Number of warmup iterations") parser.add_argument("--eval_every", type=int, default=-1, help="Evaluate every X steps (-1 => end of epoch)") parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Accumulate gradient") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument( "--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") args = parser.parse_args() # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log on main process only, logger.warning => log on all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Running process %d", args.local_rank ) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat( args)) # This is a logger.info: only printed on the first process # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info("Prepare tokenizer, model and optimizer") tokenizer = BertTokenizer.from_pretrained( 'bert-base-cased', do_lower_case=False) # Let's use a pre-defined tokenizer args.num_embeddings = len( tokenizer.vocab ) # We need this to create the model at next line (number of embeddings to use) model = TransformerWithLMHead(args) model.to(args.device) optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) logger.info("Model has %s parameters", sum(p.numel() for p in model.parameters() if p.requires_grad)) # Prepare model for distributed training if needed if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler, train_num_words, valid_num_words = get_data_loaders( args, tokenizer) # Training function and trainer def update(engine, batch): model.train() batch = batch.transpose(0, 1).contiguous().to( args.device) # to shape [seq length, batch] logits, loss = model(batch, labels=batch) loss = loss / args.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): batch = batch.transpose(0, 1).contiguous().to( args.device) # to shape [seq length, batch] logits = model(batch) shift_logits = logits[:-1] shift_labels = batch[1:] return shift_logits.view(-1, logits.size(-1)), shift_labels.view(-1) evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate at the end of each epoch and every 'eval_every' iterations if needed trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_every > 0: trainer.add_event_handler( Events.ITERATION_COMPLETED, lambda engine: evaluator.run(val_loader) if engine.state.iteration % args.eval_every == 0 else None) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Learning rate schedule: linearly warm-up to lr and then decrease the learning rate to zero with cosine schedule cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, len(train_loader) * args.n_epochs) scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, 
args.lr, args.n_warmup) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we average distributed metrics using average_distributed_scalar metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))} metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) metrics["average_word_ppl"] = MetricsLambda( lambda x: math.exp(x * val_loader.dataset.numel() / valid_num_words), metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model and configuration before we start to train if args.local_rank in [-1, 0]: checkpoint_handler, tb_logger = add_logging_and_checkpoint_saving( trainer, evaluator, metrics, model, optimizer, args) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint for easy re-loading if args.local_rank in [-1, 0] and args.n_epochs > 0: os.rename( checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME) ) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close()
def run(*options, cfg=None, debug=False): """Run training and validation of model Notes: Options can be passed in via the options argument and loaded from the cfg file Options from default.py will be overridden by options loaded from cfg file Options from default.py will be overridden by options loaded from cfg file Options passed in via options argument will override option loaded from cfg file Args: *options (str,int ,optional): Options used to overide what is loaded from the config. To see what options are available consult default.py cfg (str, optional): Location of config file to load. Defaults to None. debug (bool): Places scripts in debug/test mode and only executes a few iterations """ # Configuration: update_config(config, options=options, config_file=cfg) # The model will be saved under: outputs/<config_file_name>/<model_dir> config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0] try: output_dir = generate_path( config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(), ) except: output_dir = generate_path(config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),) # Logging: load_log_configuration(config.LOG_CONFIG) logger = logging.getLogger(__name__) logger.debug(config.WORKERS) # Set CUDNN benchmark mode: torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK # We will write the model under outputs / config_file_name / model_dir config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0] # Fix random seeds: torch.manual_seed(config.SEED) if torch.cuda.is_available(): torch.cuda.manual_seed_all(config.SEED) np.random.seed(seed=config.SEED) # Augmentation: basic_aug = Compose( [ Normalize(mean=(config.TRAIN.MEAN,), std=(config.TRAIN.STD,), max_pixel_value=1), PadIfNeeded( min_height=config.TRAIN.PATCH_SIZE, min_width=config.TRAIN.PATCH_SIZE, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=255, value=0, ), Resize( config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True, ), PadIfNeeded( min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT, min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=255, ), ] ) if config.TRAIN.AUGMENTATION: train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)]) val_aug = basic_aug else: train_aug = val_aug = basic_aug # Training and Validation Loaders: TrainPatchLoader = get_patch_loader(config) logging.info(f"Using {TrainPatchLoader}") train_set = TrainPatchLoader( config.DATASET.ROOT, config.DATASET.NUM_CLASSES, split="train", is_transform=True, stride=config.TRAIN.STRIDE, patch_size=config.TRAIN.PATCH_SIZE, augmentations=train_aug, debug=debug, ) logger.info(train_set) n_classes = train_set.n_classes val_set = TrainPatchLoader( config.DATASET.ROOT, config.DATASET.NUM_CLASSES, split="val", is_transform=True, stride=config.TRAIN.STRIDE, patch_size=config.TRAIN.PATCH_SIZE, augmentations=val_aug, debug=debug, ) logger.info(val_set) if debug: logger.info("Running in debug mode..") train_set = data.Subset(train_set, range(config.TRAIN.BATCH_SIZE_PER_GPU * config.NUM_DEBUG_BATCHES)) val_set = data.Subset(val_set, range(config.VALIDATION.BATCH_SIZE_PER_GPU)) train_loader = data.DataLoader( train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=True ) val_loader = data.DataLoader( val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=1 ) # 
config.WORKERS) # Model: model = getattr(models, config.MODEL.NAME).get_seg_model(config) device = "cuda" if torch.cuda.is_available() else "cpu" model = model.to(device) # Optimizer and LR Scheduler: optimizer = torch.optim.SGD( model.parameters(), lr=config.TRAIN.MAX_LR, momentum=config.TRAIN.MOMENTUM, weight_decay=config.TRAIN.WEIGHT_DECAY, ) epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS snapshot_duration = epochs_per_cycle * len(train_loader) if not debug else 2 * len(train_loader) scheduler = CosineAnnealingScheduler( optimizer, "lr", config.TRAIN.MAX_LR, config.TRAIN.MIN_LR, cycle_size=snapshot_duration ) # Tensorboard writer: summary_writer = create_summary_writer(log_dir=path.join(output_dir, "logs")) # class weights are inversely proportional to the frequency of the classes in the training set class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False) # Loss: criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean") # Ignite trainer and evaluator: trainer = create_supervised_trainer(model, optimizer, criterion, prepare_batch, device=device) transform_fn = lambda output_dict: (output_dict["y_pred"].squeeze(), output_dict["mask"].squeeze()) evaluator = create_supervised_evaluator( model, prepare_batch, metrics={ "nll": Loss(criterion, output_transform=transform_fn), "pixacc": pixelwise_accuracy(n_classes, output_transform=transform_fn, device=device), "cacc": class_accuracy(n_classes, output_transform=transform_fn), "mca": mean_class_accuracy(n_classes, output_transform=transform_fn), "ciou": class_iou(n_classes, output_transform=transform_fn), "mIoU": mean_iou(n_classes, output_transform=transform_fn), }, device=device, ) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Logging: trainer.add_event_handler( Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.PRINT_FREQ), ) trainer.add_event_handler(Events.EPOCH_COMPLETED, logging_handlers.log_lr(optimizer)) # Tensorboard and Logging: trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer)) trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_validation_output(summary_writer)) # add specific logger which also triggers printed metrics on training set @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): evaluator.run(train_loader) tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Training") logging_handlers.log_metrics(engine, evaluator, stage="Training") # add specific logger which also triggers printed metrics on validation set @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Validation") logging_handlers.log_metrics(engine, evaluator, stage="Validation") # dump validation set metrics at the very end for debugging purposes if engine.state.epoch == config.TRAIN.END_EPOCH and debug: fname = f"metrics_{config_file_name}_{config.TRAIN.MODEL_DIR}.json" metrics = evaluator.state.metrics out_dict = {x: metrics[x] for x in ["nll", "pixacc", "mca", "mIoU"]} with open(fname, "w") as fid: json.dump(out_dict, fid) log_msg = " ".join(f"{k}: {out_dict[k]}" for k in out_dict.keys()) logging.info(log_msg) # Checkpointing: snapshotting trained models to disk checkpoint_handler = SnapshotHandler( output_dir, config.MODEL.NAME, 
extract_metric_from("mIoU"), lambda: (trainer.state.iteration % snapshot_duration) == 0, ) evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model}) logger.info("Starting training") trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED) summary_writer.close()
# #### EarlyStopping

# In[8]:

handler = EarlyStopping(patience=10, score_function=lambda engine: engine.state.metrics['accuracy'], trainer=trainer)
val_evaluator.add_event_handler(Events.COMPLETED, handler)

# #### LR Scheduler

# In[9]:

scheduler = CosineAnnealingScheduler(optimizer, 'lr', initial_lr, 1e-7, len(loader))
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)


@trainer.on(Events.ITERATION_STARTED)
def warmup_lr(engine):
    epoch = engine.state.epoch
    if epoch < 6:
        for param_group in optimizer.param_groups:
            # at the 6th epoch the lr will be the initial lr
            param_group['lr'] = 1e-7 + (epoch - 1)*(1./5.)*initial_lr


printed_warmup_epochs = []


@trainer.on(Events.ITERATION_STARTED)
def print_warmup_lr(engine):
    global printed_warmup_epochs
    epoch = engine.state.epoch
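# A comparable warmup could also be expressed with ignite's
# create_lr_scheduler_with_warmup instead of the manual per-epoch handler above.
# This is only a sketch under the assumption that `optimizer`, `initial_lr`,
# `loader` and `trainer` are defined as in that snippet; when using the wrapper,
# only the wrapped scheduler is attached.
from ignite.contrib.handlers import CosineAnnealingScheduler, create_lr_scheduler_with_warmup

cosine = CosineAnnealingScheduler(optimizer, 'lr', initial_lr, 1e-7, len(loader))
warmup_cosine = create_lr_scheduler_with_warmup(
    cosine,
    warmup_start_value=1e-7,
    warmup_end_value=initial_lr,
    warmup_duration=5 * len(loader),  # roughly the 5 warmup epochs of the manual handler
)
trainer.add_event_handler(Events.ITERATION_STARTED, warmup_cosine)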
                           mode='bilinear', align_corners=True)
    return loss_fn(y_pred, y)


def supervised_loss_fn(y_pred, y):
    y_pred, aux_y_pred = y_pred
    return \
        loss_fn(y_pred, y) \
        + 0.4 * sum((aux_loss(y_pred, y) for y_pred in aux_y_pred))


scheduler1 = CosineAnnealingScheduler(
    optimizer,
    param_name='lr',
    start_value=args.learning_rate / 10,
    end_value=args.learning_rate / 10 * 1e-4,
    cycle_size=args.epochs * len(train_loader) - 1000,
    param_group_index=0,
)
scheduler1 = create_lr_scheduler_with_warmup(scheduler1, 0, args.learning_rate / 10, 1000)

scheduler2 = CosineAnnealingScheduler(
    optimizer,
    param_name='lr',
    start_value=args.learning_rate / 10,
    end_value=args.learning_rate / 10 * 1e-4,
    cycle_size=args.epochs * len(train_loader) - 1000,
    param_group_index=1,
)
scheduler2 = create_lr_scheduler_with_warmup(scheduler2, 0, args.learning_rate / 10, 1000)
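# A minimal sketch of per-parameter-group scheduling with param_group_index, as
# used above: one scheduler per param group, each attached as its own handler.
# The two-group split (backbone vs. head) and all concrete values are
# illustrative assumptions.
import torch
from ignite.engine import Engine, Events
from ignite.contrib.handlers import CosineAnnealingScheduler

backbone = torch.nn.Linear(8, 8)
head = torch.nn.Linear(8, 2)
base_lr = 1e-2
optimizer = torch.optim.SGD(
    [{"params": backbone.parameters(), "lr": base_lr / 10},  # group 0: smaller lr
     {"params": head.parameters()}],                         # group 1: default lr
    lr=base_lr,
)

total_iters = 1000
sched_backbone = CosineAnnealingScheduler(optimizer, "lr", base_lr / 10, base_lr / 10 * 1e-4,
                                          cycle_size=total_iters, param_group_index=0)
sched_head = CosineAnnealingScheduler(optimizer, "lr", base_lr, base_lr * 1e-4,
                                      cycle_size=total_iters, param_group_index=1)

trainer = Engine(lambda engine, batch: None)
trainer.add_event_handler(Events.ITERATION_STARTED, sched_backbone)
trainer.add_event_handler(Events.ITERATION_STARTED, sched_head)
trainer.run(range(100), max_epochs=10)
print([g["lr"] for g in optimizer.param_groups])  # each group decays on its own schedule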
def add_events(engines, dataloaders, model, optimizer, device, save_dir, args): trainer, valid_evaluator, test_evaluator = engines train_dl, valid_dl, test_dl = dataloaders if args.valid_on == 'Loss': score_fn = lambda engine: -engine.state.metrics[args.valid_on] elif args.valid_on == 'Product': score_fn = lambda engine: engine.state.metrics[ 'MRR'] * engine.state.metrics['HR@10'] elif args.valid_on == 'RMS': score_fn = lambda engine: engine.state.metrics[ 'MRR']**2 + engine.state.metrics['HR@10']**2 else: score_fn = lambda engine: engine.state.metrics[args.valid_on] # LR Scheduler if args.lr_scheduler == 'restart': scheduler = CosineAnnealingScheduler(optimizer, 'lr', start_value=args.lr, end_value=args.lr * 0.01, cycle_size=len(train_dl), cycle_mult=args.cycle_mult) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler, 'lr_scheduler') elif args.lr_scheduler == 'triangle': scheduler = make_slanted_triangular_lr_scheduler( optimizer, n_events=args.n_epochs * len(train_dl), lr_max=args.lr) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler, 'lr_scheduler') elif args.lr_scheduler == 'none': pass else: raise NotImplementedError # EarlyStopping trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) valid_evaluator.add_event_handler( Events.COMPLETED, EarlyStopping(args.patience, score_function=score_fn, trainer=trainer)) # Training Loss RunningAverage(output_transform=lambda x: x, alpha=args.avg_alpha).attach(trainer, 'loss') # Checkpoint ckpt_handler = ModelCheckpoint(save_dir, 'best', score_function=score_fn, score_name=args.valid_on, n_saved=1) valid_evaluator.add_event_handler(Events.COMPLETED, ckpt_handler, {'model': model}) # Timer timer = Timer(average=True) timer.attach(trainer, resume=Events.EPOCH_STARTED, step=Events.EPOCH_COMPLETED) # Progress Bar if args.pbar: pbar = ProgressBar() pbar.attach(trainer, ['loss']) log_msg = pbar.log_message else: log_msg = print cpe_valid = CustomPeriodicEvent(n_epochs=args.valid_every) cpe_valid.attach(trainer) valid_metrics_history = [] @trainer.on( getattr(cpe_valid.Events, f'EPOCHS_{args.valid_every}_COMPLETED')) def evaluate_on_valid(engine): state = valid_evaluator.run(valid_dl) metrics = state.metrics valid_metrics_history.append(metrics) msg = f'Epoch: {engine.state.epoch:3d} AvgTime: {timer.value():3.1f}s TrainLoss: {engine.state.metrics["loss"]:.4f} ' msg += ' '.join([ f'{k}: {v:.4f}' for k, v in metrics.items() if k in ['Loss', 'MRR', 'HR@10'] ]) log_msg(msg) @trainer.on(Events.COMPLETED) def evaluate_on_test(engine): pth_file = [ f for f in pathlib.Path(save_dir).iterdir() if f.name.endswith('pth') ][0] log_msg(f'Load Best Model: {str(pth_file)}') model.load_state_dict(torch.load(pth_file, map_location=device)) # Rerun on Valid for log. 
valid_state = valid_evaluator.run(valid_dl) engine.state.valid_metrics = valid_state.metrics # Test test_state = test_evaluator.run(test_dl) engine.state.test_metrics = test_state.metrics engine.state.valid_metrics_history = valid_metrics_history msg = f'[Test] ' msg += ' '.join([ f'{k}: {v:.4f}' for k, v in test_state.metrics.items() if k in ['Loss', 'MRR', 'HR@10'] ]) log_msg(msg) # Tensorboard if args.tensorboard: tb_logger = TensorboardLogger(log_dir=str(save_dir / 'tb_log')) # Loss tb_logger.attach(trainer, log_handler=OutputHandler( tag='training', output_transform=lambda x: x), event_name=Events.ITERATION_COMPLETED) # Metrics tb_logger.attach(valid_evaluator, log_handler=OutputHandler( tag='validation', metric_names=['Loss', 'MRR', 'HR@10'], another_engine=trainer), event_name=Events.EPOCH_COMPLETED) # Optimizer tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) # Parameters # tb_logger.attach(trainer, # log_handler=WeightsScalarHandler(model), # event_name=Events.ITERATION_COMPLETED) # tb_logger.attach(trainer, # log_handler=GradsScalarHandler(model), # event_name=Events.ITERATION_COMPLETED) @trainer.on(Events.COMPLETED) def close_tb(engine): tb_logger.close()
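# Sketch of the "restart" option used above: with cycle_mult > 1 each cosine
# cycle is cycle_mult times longer than the previous one (SGDR-style warm
# restarts). The dummy optimizer, lr values and cycle sizes are illustrative.
import torch
from ignite.engine import Engine, Events
from ignite.contrib.handlers import CosineAnnealingScheduler

optimizer = torch.optim.SGD(torch.nn.Linear(4, 2).parameters(), lr=1e-3)
scheduler = CosineAnnealingScheduler(optimizer, "lr",
                                     start_value=1e-3, end_value=1e-5,
                                     cycle_size=100, cycle_mult=2)  # cycles of 100, 200, 400, ... iterations

trainer = Engine(lambda engine, batch: None)
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
trainer.run(range(100), max_epochs=7)  # 700 iterations, enough to see several restarts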
    {'params': parameters_of(model, (nn.BatchNorm2d, nn.PReLU)), },
    ],
    lr=args.learning_rate,
)

class_freq = torch.from_numpy(Cityscapes.CLASS_FREQ).float()
weight = 1 / torch.log(1.02 + class_freq)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=255, weight=weight)
# loss_fn = OHEMLoss(ignore_index=255)
loss_fn = loss_fn.cuda()

scheduler1 = CosineAnnealingScheduler(
    optimizer, 'lr',
    args.learning_rate,
    args.learning_rate * 1e-4,
    cycle_size=args.epochs * len(train_loader),
)
scheduler1 = create_lr_scheduler_with_warmup(
    scheduler1, 0, args.learning_rate, 1000)

model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
if args.distributed:
    model = convert_syncbn_model(model)
    model = DistributedDataParallel(model)

trainer = create_segmentation_trainer(
    model, optimizer, loss_fn,
    device=device,
def train(args): tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=False) args.num_embeddings = len( tokenizer.vocab ) # We need this to create the model at next line (number of embeddings to use) model = TransformerWithLMHead(args) model.to(args.device) optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) logger.info("Model has %s parameters", sum(p.numel() for p in model.parameters() if p.requires_grad)) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler, train_num_words, valid_num_words = get_data_loaders( args, tokenizer) # Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original def mask_tokens(inputs): labels = inputs.clone() masked_indices = torch.bernoulli( torch.full(labels.shape, args.mlm_probability)).byte() labels[~masked_indices] = -1 # We only compute loss on masked tokens indices_replaced = torch.bernoulli(torch.full( labels.shape, 0.8)).byte() & masked_indices inputs[indices_replaced] = tokenizer.vocab[ "[MASK]"] # 80% of the time, replace masked input tokens with [MASK] indices_random = torch.bernoulli(torch.full( labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced random_words = torch.randint(args.num_embeddings, labels.shape, dtype=torch.long, device=args.device) inputs[indices_random] = random_words[ indices_random] # 10% of the time, replace masked input tokens with random word return inputs, labels def update(engine, batch): model.train() inputs = batch.transpose(0, 1).contiguous().to(args.device) inputs, labels = mask_tokens(inputs) if args.mlm else (inputs, inputs) logits, loss = model(inputs, labels=labels) loss = loss / args.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) def inference(engine, batch): model.eval() with torch.no_grad(): inputs = batch.transpose(0, 1).contiguous().to(args.device) inputs, labels = mask_tokens(inputs) if args.mlm else ( inputs, inputs) # Prepare masked input/labels if we use masked LM logits = model(inputs) shift_logits = logits[:-1] if not args.mlm else logits shift_labels = labels[1:] if not args.mlm else labels return shift_logits.view(-1, logits.size(-1)), shift_labels.view(-1) evaluator = Engine(inference) trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_every > 0: trainer.add_event_handler( Events.ITERATION_COMPLETED, lambda engine: evaluator.run(val_loader) if engine.state.iteration % args.eval_every == 0 else None) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) # Learning rate schedule: linearly warm-up to lr and then decrease the learning rate to zero with cosine schedule cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, len(train_loader) * args.n_epochs) scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr, args.n_warmup) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we average distributed metrics using average_distributed_scalar metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))} metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) # Let's convert 
sub-word perplexities in word perplexities. If you need details: http://sjmielke.com/comparing-perplexities.htm metrics["average_word_ppl"] = MetricsLambda( lambda x: math.exp(x * val_loader.dataset.numel() / valid_num_words), metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model and configuration before we start to train if args.local_rank in [-1, 0]: checkpoint_handler, tb_logger = add_logging_and_checkpoint_saving( trainer, evaluator, metrics, model, optimizer, args) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs)
print(model)
wandb.watch(model)

loss = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)

trainer = create_supervised_trainer(model, optimizer, loss, device)
RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')
trainer.add_event_handler(
    Events.ITERATION_COMPLETED,
    create_lr_scheduler_with_warmup(CosineAnnealingScheduler(
        optimizer,
        param_name='lr',
        start_value=cfg.lr,
        end_value=0,
        cycle_size=len(train_loader) * cfg.n_epochs,
        start_value_mult=0,
        end_value_mult=0),
        warmup_start_value=0.0,
        warmup_end_value=cfg.lr,
        warmup_duration=len(train_loader)))

evaluator = create_supervised_evaluator(
    model,
    metrics={
        'loss': Loss(loss),
        'acc_smpl': Accuracy(threshold_output, is_multilabel=True),
        'p': Precision(threshold_output, average=True),
        'r': Recall(threshold_output, average=True),
        'f1': Fbeta(1.0, output_transform=threshold_output),
def run(*options, cfg=None, local_rank=0, debug=False): """Run training and validation of model Notes: Options can be passed in via the options argument and loaded from the cfg file Options from default.py will be overridden by options loaded from cfg file Options passed in via options argument will override option loaded from cfg file Args: *options (str,int ,optional): Options used to overide what is loaded from the config. To see what options are available consult default.py cfg (str, optional): Location of config file to load. Defaults to None. """ update_config(config, options=options, config_file=cfg) # we will write the model under outputs / config_file_name / model_dir config_file_name = "default_config" if not cfg else cfg.split( "/")[-1].split(".")[0] # Start logging load_log_configuration(config.LOG_CONFIG) logger = logging.getLogger(__name__) logger.debug(config.WORKERS) silence_other_ranks = True world_size = int(os.environ.get("WORLD_SIZE", 1)) distributed = world_size > 1 if distributed: # FOR DISTRIBUTED: Set the device according to local_rank. torch.cuda.set_device(local_rank) # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will # provide environment variables, and requires that you use init_method=`env://`. torch.distributed.init_process_group(backend="nccl", init_method="env://") epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK torch.manual_seed(config.SEED) if torch.cuda.is_available(): torch.cuda.manual_seed_all(config.SEED) np.random.seed(seed=config.SEED) # Setup Augmentations basic_aug = Compose([ Normalize(mean=(config.TRAIN.MEAN, ), std=(config.TRAIN.STD, ), max_pixel_value=1), PadIfNeeded( min_height=config.TRAIN.PATCH_SIZE, min_width=config.TRAIN.PATCH_SIZE, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=255, ), Resize( config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True, ), PadIfNeeded( min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT, min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=255, ), ]) if config.TRAIN.AUGMENTATION: train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)]) val_aug = basic_aug else: train_aug = val_aug = basic_aug TrainPatchLoader = get_patch_loader(config) train_set = TrainPatchLoader( config.DATASET.ROOT, split="train", is_transform=True, stride=config.TRAIN.STRIDE, patch_size=config.TRAIN.PATCH_SIZE, augmentations=train_aug, ) val_set = TrainPatchLoader( config.DATASET.ROOT, split="val", is_transform=True, stride=config.TRAIN.STRIDE, patch_size=config.TRAIN.PATCH_SIZE, augmentations=val_aug, ) logger.info(f"Validation examples {len(val_set)}") n_classes = train_set.n_classes if debug: val_set = data.Subset(val_set, range(config.VALIDATION.BATCH_SIZE_PER_GPU)) train_set = data.Subset(train_set, range(config.TRAIN.BATCH_SIZE_PER_GPU * 2)) logger.info(f"Training examples {len(train_set)}") logger.info(f"Validation examples {len(val_set)}") train_sampler = torch.utils.data.distributed.DistributedSampler( train_set, num_replicas=world_size, rank=local_rank) train_loader = data.DataLoader( train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, sampler=train_sampler, ) val_sampler = torch.utils.data.distributed.DistributedSampler( val_set, num_replicas=world_size, rank=local_rank) val_loader = data.DataLoader( val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, 
num_workers=config.WORKERS, sampler=val_sampler, ) model = getattr(models, config.MODEL.NAME).get_seg_model(config) device = "cpu" if torch.cuda.is_available(): device = "cuda" model = model.to(device) # Send to GPU optimizer = torch.optim.SGD( model.parameters(), lr=config.TRAIN.MAX_LR, momentum=config.TRAIN.MOMENTUM, weight_decay=config.TRAIN.WEIGHT_DECAY, ) # weights are inversely proportional to the frequency of the classes in # the training set class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False) criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean") model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[device], find_unused_parameters=True) snapshot_duration = epochs_per_cycle * len( train_loader) if not debug else 2 * len(train_loader) warmup_duration = 5 * len(train_loader) warmup_scheduler = LinearCyclicalScheduler( optimizer, "lr", start_value=config.TRAIN.MAX_LR, end_value=config.TRAIN.MAX_LR * world_size, cycle_size=10 * len(train_loader), ) cosine_scheduler = CosineAnnealingScheduler( optimizer, "lr", config.TRAIN.MAX_LR * world_size, config.TRAIN.MIN_LR * world_size, cycle_size=snapshot_duration, ) scheduler = ConcatScheduler( schedulers=[warmup_scheduler, cosine_scheduler], durations=[warmup_duration]) trainer = create_supervised_trainer(model, optimizer, criterion, prepare_batch, device=device) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Set to update the epoch parameter of our distributed data sampler so that we get # different shuffles trainer.add_event_handler(Events.EPOCH_STARTED, update_sampler_epoch(train_loader)) if silence_other_ranks & local_rank != 0: logging.getLogger("ignite.engine.engine.Engine").setLevel( logging.WARNING) def _select_pred_and_mask(model_out_dict): return (model_out_dict["y_pred"].squeeze(), model_out_dict["mask"].squeeze()) evaluator = create_supervised_evaluator( model, prepare_batch, metrics={ "nll": Loss(criterion, output_transform=_select_pred_and_mask, device=device), "pixa": pixelwise_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device), "cacc": class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device), "mca": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device), "ciou": class_iou(n_classes, output_transform=_select_pred_and_mask, device=device), "mIoU": mean_iou(n_classes, output_transform=_select_pred_and_mask, device=device), }, device=device, ) # Set the validation run to start on the epoch completion of the training run trainer.add_event_handler(Events.EPOCH_COMPLETED, Evaluator(evaluator, val_loader)) if local_rank == 0: # Run only on master process trainer.add_event_handler( Events.ITERATION_COMPLETED, logging_handlers.log_training_output( log_interval=config.TRAIN.BATCH_SIZE_PER_GPU), ) trainer.add_event_handler(Events.EPOCH_STARTED, logging_handlers.log_lr(optimizer)) try: output_dir = generate_path( config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(), ) except TypeError: output_dir = generate_path( config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(), ) summary_writer = create_summary_writer( log_dir=path.join(output_dir, config.LOG_DIR)) logger.info( f"Logging Tensorboard to {path.join(output_dir, config.LOG_DIR)}") trainer.add_event_handler( Events.EPOCH_STARTED, tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"), ) 
trainer.add_event_handler( Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, logging_handlers.log_metrics( "Validation results", metrics_dict={ "nll": "Avg loss :", "mIoU": " Avg IoU :", "pixa": "Pixelwise Accuracy :", "mca": "Mean Class Accuracy :", }, ), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, tensorboard_handlers.log_metrics( summary_writer, trainer, "epoch", metrics_dict={ "mIoU": "Validation/IoU", "nll": "Validation/Loss", "mca": "Validation/MCA", }, ), ) def _select_max(pred_tensor): return pred_tensor.max(1)[1] def _tensor_to_numpy(pred_tensor): return pred_tensor.squeeze().cpu().numpy() transform_func = compose(np_to_tb, decode_segmap(n_classes=n_classes), _tensor_to_numpy) transform_pred = compose(transform_func, _select_max) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Image", "image"), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Mask", "mask", transform_func=transform_func), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer( summary_writer, "Validation/Pred", "y_pred", transform_func=transform_pred, ), ) def snapshot_function(): return (trainer.state.iteration % snapshot_duration) == 0 checkpoint_handler = SnapshotHandler( output_dir, config.MODEL.NAME, extract_metric_from("mIoU"), snapshot_function, ) evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model}) logger.info("Starting training") if debug: trainer.run( train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=config.TRAIN.BATCH_SIZE_PER_GPU * 2, seed=config.SEED, ) else: trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED)
val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)

# #### EarlyStopping

# In[8]:

# handler = EarlyStopping(patience=30, score_function=lambda engine: engine.state.metrics['accuracy'], trainer=trainer)
# val_evaluator.add_event_handler(Events.COMPLETED, handler)

# #### LR Scheduler

# In[9]:

scheduler = CosineAnnealingScheduler(optimizer, 'lr', 10e-4, 1e-7, len(loader))
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)


@trainer.on(Events.ITERATION_COMPLETED)
def print_lr(engine):
    epoch = engine.state.epoch
    iteration = engine.state.iteration
    if epoch < 2 and iteration % 100 == 0:
        print(f'Iteration {iteration} | LR {optimizer.param_groups[0]["lr"]}')


# #### Compute and display metrics

# In[10]:
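# The "Compute and display metrics" cell above is truncated; the sketch below is a plausible
# continuation, not the notebook's original code. It assumes `metrics` contains "accuracy"
# and "nll" entries and that a validation loader named `val_loader` exists alongside `loader`.
@trainer.on(Events.EPOCH_COMPLETED)
def compute_and_display_metrics(engine):
    state = val_evaluator.run(val_loader)
    print(f'Epoch {engine.state.epoch} | '
          f'Val loss {state.metrics["nll"]:.4f} | '
          f'Val accuracy {state.metrics["accuracy"]:.4f}')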
def run(*options, cfg=None, debug=False): """Run training and validation of model Notes: Options can be passed in via the options argument and loaded from the cfg file Options loaded from default.py will be overridden by those loaded from cfg file Options passed in via options argument will override those loaded from cfg file Args: *options (str, int, optional): Options used to overide what is loaded from the config. To see what options are available consult default.py cfg (str, optional): Location of config file to load. Defaults to None. debug (bool): Places scripts in debug/test mode and only executes a few iterations """ update_config(config, options=options, config_file=cfg) # we will write the model under outputs / config_file_name / model_dir config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0] # Start logging load_log_configuration(config.LOG_CONFIG) logger = logging.getLogger(__name__) logger.debug(config.WORKERS) epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK torch.manual_seed(config.SEED) if torch.cuda.is_available(): torch.cuda.manual_seed_all(config.SEED) np.random.seed(seed=config.SEED) device = "cpu" if torch.cuda.is_available(): device = "cuda" # Setup Augmentations basic_aug = Compose( [ Normalize(mean=(config.TRAIN.MEAN,), std=(config.TRAIN.STD,), max_pixel_value=config.TRAIN.MAX,), PadIfNeeded( min_height=config.TRAIN.PATCH_SIZE, min_width=config.TRAIN.PATCH_SIZE, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=mask_value, value=0, ), Resize( config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True, ), PadIfNeeded( min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT, min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=mask_value, value=0, ), ] ) if config.TRAIN.AUGMENTATION: train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)]) val_aug = basic_aug else: train_aug = val_aug = basic_aug PenobscotDataset = get_patch_dataset(config) train_set = PenobscotDataset( config.DATASET.ROOT, config.TRAIN.PATCH_SIZE, config.TRAIN.STRIDE, split="train", transforms=train_aug, n_channels=config.MODEL.IN_CHANNELS, complete_patches_only=config.TRAIN.COMPLETE_PATCHES_ONLY, ) val_set = PenobscotDataset( config.DATASET.ROOT, config.TRAIN.PATCH_SIZE, config.TRAIN.STRIDE, split="val", transforms=val_aug, n_channels=config.MODEL.IN_CHANNELS, complete_patches_only=config.VALIDATION.COMPLETE_PATCHES_ONLY, ) logger.info(train_set) logger.info(val_set) n_classes = train_set.n_classes train_loader = data.DataLoader( train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=True, ) if debug: val_set = data.Subset(val_set, range(3)) val_loader = data.DataLoader(val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS) model = getattr(models, config.MODEL.NAME).get_seg_model(config) model = model.to(device) # Send to GPU optimizer = torch.optim.SGD( model.parameters(), lr=config.TRAIN.MAX_LR, momentum=config.TRAIN.MOMENTUM, weight_decay=config.TRAIN.WEIGHT_DECAY, ) try: output_dir = generate_path( config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(), ) except TypeError: output_dir = generate_path(config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),) summary_writer = create_summary_writer(log_dir=path.join(output_dir, 
config.LOG_DIR)) snapshot_duration = epochs_per_cycle * len(train_loader) if not debug else 2 * len(train_loader) scheduler = CosineAnnealingScheduler( optimizer, "lr", config.TRAIN.MAX_LR, config.TRAIN.MIN_LR, cycle_size=snapshot_duration ) # weights are inversely proportional to the frequency of the classes in # the training set class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False) criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=mask_value, reduction="mean") trainer = create_supervised_trainer(model, optimizer, criterion, _prepare_batch, device=device) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) trainer.add_event_handler( Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.TRAIN.BATCH_SIZE_PER_GPU), ) trainer.add_event_handler(Events.EPOCH_STARTED, logging_handlers.log_lr(optimizer)) trainer.add_event_handler( Events.EPOCH_STARTED, tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"), ) trainer.add_event_handler( Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer), ) def _select_pred_and_mask(model_out_dict): return (model_out_dict["y_pred"].squeeze(), model_out_dict["mask"].squeeze()) evaluator = create_supervised_evaluator( model, _prepare_batch, metrics={ "pixacc": pixelwise_accuracy(n_classes, output_transform=_select_pred_and_mask), "nll": Loss(criterion, output_transform=_select_pred_and_mask), "cacc": class_accuracy(n_classes, output_transform=_select_pred_and_mask), "mca": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask), "ciou": class_iou(n_classes, output_transform=_select_pred_and_mask), "mIoU": mean_iou(n_classes, output_transform=_select_pred_and_mask), }, device=device, ) # Set the validation run to start on the epoch completion of the training run trainer.add_event_handler(Events.EPOCH_COMPLETED, Evaluator(evaluator, val_loader)) evaluator.add_event_handler( Events.EPOCH_COMPLETED, logging_handlers.log_metrics( "Validation results", metrics_dict={ "nll": "Avg loss :", "pixacc": "Pixelwise Accuracy :", "mca": "Avg Class Accuracy :", "mIoU": "Avg Class IoU :", }, ), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, tensorboard_handlers.log_metrics( summary_writer, trainer, "epoch", metrics_dict={ "mIoU": "Validation/mIoU", "nll": "Validation/Loss", "mca": "Validation/MCA", "pixacc": "Validation/Pixel_Acc", }, ), ) def _select_max(pred_tensor): return pred_tensor.max(1)[1] def _tensor_to_numpy(pred_tensor): return pred_tensor.squeeze().cpu().numpy() transform_func = compose(np_to_tb, decode_segmap, _tensor_to_numpy,) transform_pred = compose(transform_func, _select_max) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Image", "image"), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Mask", "mask", transform_func=transform_func), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Pred", "y_pred", transform_func=transform_pred), ) def snapshot_function(): return (trainer.state.iteration % snapshot_duration) == 0 checkpoint_handler = SnapshotHandler(output_dir, config.MODEL.NAME, extract_metric_from("mIoU"), snapshot_function,) evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model}) logger.info("Starting training") if debug: trainer.run( train_loader, max_epochs=config.TRAIN.END_EPOCH, 
epoch_length=config.TRAIN.BATCH_SIZE_PER_GPU, seed=config.SEED, ) else: trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED)
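# Quick numeric sketch (assumed config values, not from any checked-in config) of the
# snapshot-ensembling arithmetic used above: the cosine cycle length equals
# epochs_per_cycle * len(train_loader), so snapshot_function fires once per cycle, roughly at
# each learning-rate minimum, and SnapshotHandler keeps one model per cycle.
end_epoch, snapshots, iters_per_epoch = 300, 5, 40        # assumed END_EPOCH, SNAPSHOTS, len(train_loader)
epochs_per_cycle = end_epoch // snapshots                 # 60 epochs per cosine cycle
snapshot_duration = epochs_per_cycle * iters_per_epoch    # 2400 iterations per cycle
snapshot_iterations = [c * snapshot_duration for c in range(1, snapshots + 1)]
print(snapshot_iterations)                                # [2400, 4800, 7200, 9600, 12000]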
def run(*options, cfg=None, local_rank=0, debug=False, input=None, distributed=False): """Run training and validation of model Notes: Options can be passed in via the options argument and loaded from the cfg file Options from default.py will be overridden by options loaded from cfg file Options from default.py will be overridden by options loaded from cfg file Options passed in via options argument will override option loaded from cfg file Args: *options (str,int ,optional): Options used to overide what is loaded from the config. To see what options are available consult default.py cfg (str, optional): Location of config file to load. Defaults to None. debug (bool): Places scripts in debug/test mode and only executes a few iterations input (str, optional): Location of data if Azure ML run, for local runs input is config.DATASET.ROOT distributed (bool): This flag tells the training script to run in distributed mode if more than one GPU exists. """ # if AML training pipeline supplies us with input if input is not None: data_dir = input output_dir = data_dir + config.OUTPUT_DIR # Start logging load_log_configuration(config.LOG_CONFIG) logger = logging.getLogger(__name__) logger.debug(config.WORKERS) # Configuration: update_config(config, options=options, config_file=cfg) silence_other_ranks = True world_size = int(os.environ.get("WORLD_SIZE", 1)) distributed = world_size > 1 if distributed: # FOR DISTRIBUTED: Set the device according to local_rank. torch.cuda.set_device(local_rank) # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will # provide environment variables, and requires that you use init_method=`env://`. torch.distributed.init_process_group(backend="nccl", init_method="env://") logging.info(f"Started train.py using distributed mode.") else: logging.info(f"Started train.py using local mode.") # Set CUDNN benchmark mode: torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK # Fix random seeds: torch.manual_seed(config.SEED) if torch.cuda.is_available(): torch.cuda.manual_seed_all(config.SEED) np.random.seed(seed=config.SEED) # Augmentation: basic_aug = Compose([ Normalize(mean=(config.TRAIN.MEAN, ), std=(config.TRAIN.STD, ), max_pixel_value=1), PadIfNeeded( min_height=config.TRAIN.PATCH_SIZE, min_width=config.TRAIN.PATCH_SIZE, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=255, value=0, ), Resize( config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True, ), PadIfNeeded( min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT, min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=255, ), ]) if config.TRAIN.AUGMENTATION: train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)]) val_aug = basic_aug else: train_aug = val_aug = basic_aug # Training and Validation Loaders: TrainPatchLoader = get_patch_loader(config) logging.info(f"Using {TrainPatchLoader}") train_set = TrainPatchLoader( config, split="train", is_transform=True, augmentations=train_aug, debug=debug, ) logger.info(train_set) n_classes = train_set.n_classes val_set = TrainPatchLoader( config, split="val", is_transform=True, augmentations=val_aug, debug=debug, ) logger.info(val_set) if debug: data_flow_dict = dict() data_flow_dict["train_patch_loader_length"] = len(train_set) data_flow_dict["validation_patch_loader_length"] = len(val_set) data_flow_dict["train_input_shape"] = train_set.seismic.shape data_flow_dict["train_label_shape"] = train_set.labels.shape 
data_flow_dict["n_classes"] = n_classes logger.info("Running in debug mode..") train_range = min( config.TRAIN.BATCH_SIZE_PER_GPU * config.NUM_DEBUG_BATCHES, len(train_set)) logging.info(f"train range in debug mode {train_range}") train_set = data.Subset(train_set, range(train_range)) valid_range = min(config.VALIDATION.BATCH_SIZE_PER_GPU, len(val_set)) val_set = data.Subset(val_set, range(valid_range)) data_flow_dict["train_length_subset"] = len(train_set) data_flow_dict["validation_length_subset"] = len(val_set) train_sampler = torch.utils.data.distributed.DistributedSampler( train_set, num_replicas=world_size, rank=local_rank) val_sampler = torch.utils.data.distributed.DistributedSampler( val_set, num_replicas=world_size, rank=local_rank) train_loader = data.DataLoader( train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, sampler=train_sampler, ) val_loader = data.DataLoader( val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, sampler=val_sampler) if debug: data_flow_dict["train_loader_length"] = len(train_loader) data_flow_dict["validation_loader_length"] = len(val_loader) config_file_name = "default_config" if not cfg else cfg.split( "/")[-1].split(".")[0] fname = f"data_flow_train_{config_file_name}_{config.TRAIN.MODEL_DIR}.json" with open(fname, "w") as f: json.dump(data_flow_dict, f, indent=2) # Model: model = getattr(models, config.MODEL.NAME).get_seg_model(config) device = "cuda" if torch.cuda.is_available() else "cpu" model = model.to(device) # Optimizer and LR Scheduler: optimizer = torch.optim.SGD( model.parameters(), lr=config.TRAIN.MAX_LR, momentum=config.TRAIN.MOMENTUM, weight_decay=config.TRAIN.WEIGHT_DECAY, ) epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS snapshot_duration = epochs_per_cycle * len( train_loader) if not debug else 2 * len(train_loader) cosine_scheduler = CosineAnnealingScheduler( optimizer, "lr", config.TRAIN.MAX_LR * world_size, config.TRAIN.MIN_LR * world_size, cycle_size=snapshot_duration, ) if distributed: warmup_duration = 5 * len(train_loader) warmup_scheduler = LinearCyclicalScheduler( optimizer, "lr", start_value=config.TRAIN.MAX_LR, end_value=config.TRAIN.MAX_LR * world_size, cycle_size=10 * len(train_loader), ) scheduler = ConcatScheduler( schedulers=[warmup_scheduler, cosine_scheduler], durations=[warmup_duration]) else: scheduler = cosine_scheduler # class weights are inversely proportional to the frequency of the classes in the training set class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False) # Loss: criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean") # Model: if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[device], find_unused_parameters=True) if silence_other_ranks & local_rank != 0: logging.getLogger("ignite.engine.engine.Engine").setLevel( logging.WARNING) # Ignite trainer and evaluator: trainer = create_supervised_trainer(model, optimizer, criterion, prepare_batch, device=device) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Set to update the epoch parameter of our distributed data sampler so that we get # different shuffles trainer.add_event_handler(Events.EPOCH_STARTED, update_sampler_epoch(train_loader)) transform_fn = lambda output_dict: (output_dict["y_pred"].squeeze(), output_dict["mask"].squeeze()) evaluator = create_supervised_evaluator( model, prepare_batch, metrics={ "nll": Loss(criterion, 
output_transform=transform_fn, device=device), "pixacc": pixelwise_accuracy(n_classes, output_transform=transform_fn, device=device), "cacc": class_accuracy(n_classes, output_transform=transform_fn, device=device), "mca": mean_class_accuracy(n_classes, output_transform=transform_fn, device=device), "ciou": class_iou(n_classes, output_transform=transform_fn, device=device), "mIoU": mean_iou(n_classes, output_transform=transform_fn, device=device), }, device=device, ) # The model will be saved under: outputs/<config_file_name>/<model_dir> config_file_name = "default_config" if not cfg else cfg.split( "/")[-1].split(".")[0] try: output_dir = generate_path( config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(), ) except: output_dir = generate_path( config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(), ) if local_rank == 0: # Run only on master process # Logging: trainer.add_event_handler( Events.ITERATION_COMPLETED, logging_handlers.log_training_output( log_interval=config.PRINT_FREQ), ) trainer.add_event_handler(Events.EPOCH_STARTED, logging_handlers.log_lr(optimizer)) # Checkpointing: snapshotting trained models to disk checkpoint_handler = SnapshotHandler( output_dir, config.MODEL.NAME, extract_metric_from("mIoU"), lambda: (trainer.state.iteration % snapshot_duration) == 0, ) evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model}) # Tensorboard and Logging: summary_writer = create_summary_writer( log_dir=path.join(output_dir, "logs")) trainer.add_event_handler( Events.EPOCH_STARTED, tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch")) trainer.add_event_handler( Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer)) trainer.add_event_handler( Events.ITERATION_COMPLETED, tensorboard_handlers.log_validation_output(summary_writer)) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): evaluator.run(train_loader) if local_rank == 0: # Run only on master process tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Training") logging_handlers.log_metrics(engine, evaluator, stage="Training") logger.info("Logging training results..") @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) if local_rank == 0: # Run only on master process tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Validation") logging_handlers.log_metrics(engine, evaluator, stage="Validation") logger.info("Logging validation results..") # dump validation set metrics at the very end for debugging purposes if engine.state.epoch == config.TRAIN.END_EPOCH and debug: fname = f"metrics_{config_file_name}_{config.TRAIN.MODEL_DIR}.json" metrics = evaluator.state.metrics out_dict = { x: metrics[x] for x in ["nll", "pixacc", "mca", "mIoU"] } with open(fname, "w") as fid: json.dump(out_dict, fid) log_msg = " ".join(f"{k}: {out_dict[k]}" for k in out_dict.keys()) logging.info(log_msg) logger.info("Starting training") trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED) if local_rank == 0: summary_writer.close()
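# `update_sampler_epoch` is imported from this repository's utilities; a minimal sketch of
# what such a handler plausibly does is given below (assumed implementation, not the
# repository's actual code). Advancing the DistributedSampler epoch at every EPOCH_STARTED
# event is what gives each rank a fresh shuffle per epoch.
def update_sampler_epoch(data_loader):
    def _set_epoch(engine):
        data_loader.sampler.set_epoch(engine.state.epoch)
    return _set_epoch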
# Define training function
def update(engine, batch):
    model.train()
    batch = batch.transpose(0, 1).contiguous().to(args.device)  # to shape [seq length, batch]
    logits, loss = model(batch, labels=batch)
    loss = loss / args.gradient_accumulation_steps
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
    if engine.state.iteration % args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
    return loss.item()


trainer = Engine(update)

# Add progressbar with loss
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
ProgressBar(persist=True).attach(trainer, metric_names=['loss'])

# Learning rate schedule: linearly warm-up to lr and then decrease the learning rate to zero with cosine
cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, len(dataloader) * args.n_epochs)
scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr, args.n_warmup)
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

# Save checkpoints and training config
checkpoint_handler = ModelCheckpoint(args.log_dir, 'checkpoint', save_interval=1, n_saved=5)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': model})
torch.save(args, os.path.join(args.log_dir, 'training_args.bin'))

trainer.run(dataloader, max_epochs=args.n_epochs)
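# Illustrative arithmetic (assumed argument values) for the gradient-accumulation pattern in
# update() above: gradients from several forward/backward passes are summed before a single
# optimizer.step(), so one parameter update sees batch_size * accumulation_steps sequences.
# Note that the loss returned (and shown by the ProgressBar) is already divided by
# gradient_accumulation_steps.
batch_size = 4                      # assumed per-step batch size
gradient_accumulation_steps = 8     # assumed args.gradient_accumulation_steps
effective_batch_size = batch_size * gradient_accumulation_steps
print(effective_batch_size)         # 32 sequences contribute to each optimizer step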
        'lr': args.learning_rate,
        'weight_decay': args.weight_decay,
    },
    {
        'params': parameters_of(model.decoder, nn.BatchNorm2d),
        'lr': args.learning_rate,
    },
],
    momentum=0.9,
)

loss_fn = OHEMLoss(ignore_index=255).cuda()

lr = args.learning_rate
lrs = [lr / 10, lr / 10, lr, lr]
schedulers = [
    CosineAnnealingScheduler(
        optimizer, 'lr', lr, lr * 1e-4, args.epochs * len(train_loader),
        param_group_index=index)  # index, so each schedule drives its own param group
    for index, lr in enumerate(lrs)
]
schedulers = [
    create_lr_scheduler_with_warmup(scheduler, 0, lr, 1000)
    for scheduler, lr in zip(schedulers, lrs)
]

model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
if args.distributed:
    model = convert_syncbn_model(model)
    model = DistributedDataParallel(model)
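# Self-contained sketch (toy model and step counts are assumptions) of driving two optimizer
# param groups with independent cosine schedules via param_group_index, mirroring the
# per-group schedulers built above.
import torch
from ignite.contrib.handlers import CosineAnnealingScheduler
from ignite.engine import Engine, Events

toy = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 2))
optimizer = torch.optim.SGD(
    [{'params': toy[0].parameters(), 'lr': 0.01},   # e.g. an encoder trained at lr / 10
     {'params': toy[1].parameters(), 'lr': 0.1}],   # e.g. a decoder trained at the full lr
    momentum=0.9,
)

total_iters = 1000
schedulers = [
    CosineAnnealingScheduler(optimizer, 'lr', group['lr'], group['lr'] * 1e-4,
                             total_iters, param_group_index=i)
    for i, group in enumerate(optimizer.param_groups)
]

trainer = Engine(lambda engine, batch: None)     # dummy update function, illustration only
for s in schedulers:
    trainer.add_event_handler(Events.ITERATION_STARTED, s)

trainer.run(range(10), max_epochs=1)
print([group['lr'] for group in optimizer.param_groups])   # each group follows its own cosine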
def run(*options, cfg=None, debug=False): """Run training and validation of model Notes: Options can be passed in via the options argument and loaded from the cfg file Options from default.py will be overridden by options loaded from cfg file Options passed in via options argument will override option loaded from cfg file Args: *options (str,int ,optional): Options used to overide what is loaded from the config. To see what options are available consult default.py cfg (str, optional): Location of config file to load. Defaults to None. """ update_config(config, options=options, config_file=cfg) # Start logging load_log_configuration(config.LOG_CONFIG) logger = logging.getLogger(__name__) logger.debug(config.WORKERS) scheduler_step = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK torch.manual_seed(config.SEED) if torch.cuda.is_available(): torch.cuda.manual_seed_all(config.SEED) np.random.seed(seed=config.SEED) # Setup Augmentations basic_aug = Compose([ Normalize(mean=(config.TRAIN.MEAN, ), std=(config.TRAIN.STD, ), max_pixel_value=1) ]) if config.TRAIN.AUGMENTATION: train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)]) val_aug = basic_aug else: train_aug = val_aug = basic_aug TrainLoader = get_section_loader(config) train_set = TrainLoader( data_dir=config.DATASET.ROOT, split="train", is_transform=True, augmentations=train_aug, ) val_set = TrainLoader( data_dir=config.DATASET.ROOT, split="val", is_transform=True, augmentations=val_aug, ) class CustomSampler(torch.utils.data.Sampler): def __init__(self, data_source): self.data_source = data_source def __iter__(self): char = ["i" if np.random.randint(2) == 1 else "x"] self.indices = [ idx for (idx, name) in enumerate(self.data_source) if char[0] in name ] return (self.indices[i] for i in torch.randperm(len(self.indices))) def __len__(self): return len(self.data_source) n_classes = train_set.n_classes val_list = val_set.sections train_list = val_set.sections train_loader = data.DataLoader( train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, sampler=CustomSampler(train_list), num_workers=config.WORKERS, shuffle=False, ) val_loader = data.DataLoader( val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, sampler=CustomSampler(val_list), num_workers=config.WORKERS, ) model = getattr(models, config.MODEL.NAME).get_seg_model(config) device = "cpu" if torch.cuda.is_available(): device = "cuda" model = model.to(device) # Send to GPU optimizer = torch.optim.SGD( model.parameters(), lr=config.TRAIN.MAX_LR, momentum=config.TRAIN.MOMENTUM, weight_decay=config.TRAIN.WEIGHT_DECAY, ) try: output_dir = generate_path( config.OUTPUT_DIR, git_branch(), git_hash(), config.MODEL.NAME, current_datetime(), ) except TypeError: output_dir = generate_path( config.OUTPUT_DIR, config.MODEL.NAME, current_datetime(), ) summary_writer = create_summary_writer( log_dir=path.join(output_dir, config.LOG_DIR)) snapshot_duration = scheduler_step * len(train_loader) scheduler = CosineAnnealingScheduler(optimizer, "lr", config.TRAIN.MAX_LR, config.TRAIN.MIN_LR, snapshot_duration) # weights are inversely proportional to the frequency of the classes in # the training set class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False) criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean") trainer = create_supervised_trainer(model, optimizer, criterion, prepare_batch, device=device) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) 
trainer.add_event_handler( Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.PRINT_FREQ), ) trainer.add_event_handler(Events.EPOCH_STARTED, logging_handlers.log_lr(optimizer)) trainer.add_event_handler( Events.EPOCH_STARTED, tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"), ) trainer.add_event_handler( Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer), ) def _select_pred_and_mask(model_out_dict): return (model_out_dict["y_pred"].squeeze(), model_out_dict["mask"].squeeze()) evaluator = create_supervised_evaluator( model, prepare_batch, metrics={ "nll": Loss(criterion, output_transform=_select_pred_and_mask, device=device), "pixacc": pixelwise_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device), "cacc": class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device), "mca": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device), "ciou": class_iou(n_classes, output_transform=_select_pred_and_mask, device=device), "mIoU": mean_iou(n_classes, output_transform=_select_pred_and_mask, device=device), }, device=device, ) if debug: logger.info("Running Validation in Debug/Test mode") val_loader = take(3, val_loader) trainer.add_event_handler(Events.EPOCH_COMPLETED, Evaluator(evaluator, val_loader)) evaluator.add_event_handler( Events.EPOCH_COMPLETED, logging_handlers.log_metrics( "Validation results", metrics_dict={ "nll": "Avg loss :", "pixacc": "Pixelwise Accuracy :", "mca": "Avg Class Accuracy :", "mIoU": "Avg Class IoU :", }, ), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, logging_handlers.log_class_metrics( "Per class validation results", metrics_dict={ "ciou": "Class IoU :", "cacc": "Class Accuracy :" }, ), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, tensorboard_handlers.log_metrics( summary_writer, trainer, "epoch", metrics_dict={ "mIoU": "Validation/mIoU", "nll": "Validation/Loss", "mca": "Validation/MCA", "pixacc": "Validation/Pixel_Acc", }, ), ) def _select_max(pred_tensor): return pred_tensor.max(1)[1] def _tensor_to_numpy(pred_tensor): return pred_tensor.squeeze().cpu().numpy() transform_func = compose(np_to_tb, decode_segmap(n_classes=n_classes), _tensor_to_numpy) transform_pred = compose(transform_func, _select_max) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Image", "image"), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Mask", "mask", transform_func=transform_func), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Pred", "y_pred", transform_func=transform_pred), ) def snapshot_function(): return (trainer.state.iteration % snapshot_duration) == 0 checkpoint_handler = SnapshotHandler( path.join(output_dir, config.TRAIN.MODEL_DIR), config.MODEL.NAME, extract_metric_from("mIoU"), snapshot_function, ) evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model}) logger.info("Starting training") if debug: logger.info("Running Validation in Debug/Test mode") train_loader = take(3, train_loader) trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH)
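# `take` above shortens the loaders in debug mode; a sketch of such a helper is below,
# assuming behaviour like toolz.itertoolz.take (the repository may simply import it from
# toolz). Materialising the first n batches in a list lets the shortened loader be iterated
# again on every epoch by the Evaluator handler.
from itertools import islice

def take(n, iterable):
    """Return only the first n batches of an iterable data loader."""
    return list(islice(iterable, n))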
def train(): parser = ArgumentParser() parser.add_argument("--basedir", type=str) parser.add_argument("--dataset_key", type=str, default='wikitext-2', help="key from DATASETS global") parser.add_argument("--train_file", type=str, help='Optional file path to use for train file') parser.add_argument("--valid_file", type=str, help='Optional file path to use for valid file') parser.add_argument("--dataset_cache", type=str, default=os.path.expanduser('~/.bl-data'), help="Path or url of the dataset cache") parser.add_argument("--cache_features", type=str2bool, default=True) parser.add_argument("--d_model", type=int, default=410, help="Model dimension (and embedding dsz)") parser.add_argument("--d_ff", type=int, default=2100, help="FFN dimension") parser.add_argument("--num_heads", type=int, default=10, help="Number of heads") parser.add_argument("--num_layers", type=int, default=8, help="Number of layers") parser.add_argument("--nctx", type=int, default=256, help="Max input length") parser.add_argument("--batch_size", type=int, default=8, help="Batch Size") parser.add_argument("--tokens", choices=["words", "chars", "subwords"], default="subwords", help="What tokens to use") parser.add_argument("--dropout", type=float, default=0.1, help="Dropout") parser.add_argument("--lr", type=float, default=4.0e-4, help="Learning rate") parser.add_argument("--clip", type=float, default=0.25, help="Clipping gradient norm") parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay") parser.add_argument("--epochs", type=int, default=20, help="Num training epochs") parser.add_argument("--warmup_steps", type=int, default=1000, help="Num warmup steps") parser.add_argument("--eval_every", type=int, default=-1, help="Evaluate every X steps (-1 => end of epoch)") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--distributed", type=str2bool, default=False, help="Are we doing distributed training?") parser.add_argument( "--local_rank", type=int, default=-1, help= "Local rank for distributed training (-1 means use the environment variables to find)" ) parser.add_argument("--chars_per_word", type=int, default=40, help="How many max characters per word") parser.add_argument( "--accum_grad_steps", type=int, default=1, help="Create effective batch size by accumulating grads without updates" ) args = parser.parse_args() if args.train_file and not args.valid_file: logger.error( "If you provide a train_file, you must provide a valid_file") return if not args.train_file and args.valid_file: logger.error( "If you provide a valid_file, you must also provide a train_file") return if args.basedir is None: args.basedir = 'transformer-{}-{}-{}'.format(args.dataset_key, args.tokens, os.getpid()) logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("Cache directory [%s]", args.dataset_cache) args.distributed = args.distributed or int(os.environ.get("WORLD_SIZE", 1)) > 1 if args.distributed: if args.local_rank == -1: # https://github.com/kubeflow/pytorch-operator/issues/128 # https://github.com/pytorch/examples/blob/master/imagenet/main.py logger.info("Setting local rank to RANK env variable") args.local_rank = int(os.environ['RANK']) logger.warning("Local rank (%d)", args.local_rank) torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') if args.train_file: 
dataset = { 'train_file': args.train_file, 'valid_file': args.valid_file } else: dataset = DataDownloader(DATASETS[args.dataset_key], args.dataset_cache).download() reader = create_reader(args.tokens, args.nctx, args.chars_per_word) preproc_data = load_embed_and_vocab(args.tokens, reader, dataset, args.dataset_key, args.d_model, args.cache_features) vocabs = preproc_data['vocabs'] os.makedirs(args.basedir, exist_ok=True) # We want to make sure to save our input vocab into the basedir for reuse later write_json(vocabs['x'], os.path.join(args.basedir, 'vocabs.json')) embeddings = preproc_data['embeddings'] valid_num_words = preproc_data['valid_num_words'] tgt_key = preproc_data['tgt_key'] logger.info("Loaded embeddings") train_set = load_data(args.tokens, reader, dataset, 'train_file', vocabs, args.cache_features) valid_set = load_data(args.tokens, reader, dataset, 'valid_file', vocabs, args.cache_features) logger.info("valid. tokens [%s], valid. words [%s]", valid_set.tensors[-1].numel(), valid_num_words) train_sampler = torch.utils.data.distributed.DistributedSampler( train_set) if args.distributed else None train_loader = DataLoader(train_set, sampler=train_sampler, batch_size=args.batch_size, shuffle=(not args.distributed)) valid_sampler = torch.utils.data.distributed.DistributedSampler( valid_set) if args.distributed else None valid_loader = DataLoader(valid_set, sampler=valid_sampler, batch_size=args.batch_size, shuffle=False) logger.info("Loaded datasets") model = TransformerLanguageModel.create( embeddings, hsz=args.d_model, d_ff=args.d_ff, tie_weights=(args.tokens != 'chars'), dropout=args.dropout, gpu=False, num_heads=args.num_heads, layers=args.num_layers, src_keys=['x'], tgt_key=tgt_key) model.to(args.device) train_loss = model.create_loss() train_loss.to(args.device) logger.info("Loaded model and loss") optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) logger.info("Model has %s parameters", sum(p.numel() for p in model.parameters() if p.requires_grad)) # Prepare model for distributed training if needed if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) logger.info("Model located on %d", args.local_rank) def update(engine, batch): model.train() x, y = batch inputs = {'x': x.to(args.device)} labels = y.to(args.device).transpose(0, 1).contiguous() logits = model(inputs, None)[0].transpose(0, 1).contiguous() shift_logits = logits[:-1] shift_labels = labels[1:] loss = train_loss(shift_logits, shift_labels) loss = loss / args.accum_grad_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) if engine.state.iteration % args.accum_grad_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) def inference(_, batch): model.eval() with torch.no_grad(): x, y = batch inputs = {'x': x.to(args.device)} labels = y.to(args.device).transpose(0, 1).contiguous() logits = model(inputs, None)[0].transpose(0, 1).contiguous() shift_logits = logits[:-1] shift_labels = labels[1:] return shift_logits.view(-1, logits.size(-1)), shift_labels.view(-1) evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate at the end of each epoch and every 'eval_every' iterations if needed trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(valid_loader)) if args.eval_every > 0: trainer.add_event_handler( Events.ITERATION_COMPLETED, lambda engine: evaluator.run(valid_loader) if engine.state.iteration % args.eval_every 
== 0 else None) if args.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, len(train_loader) * args.epochs) scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr, args.warmup_steps) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))} metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args) }) if args.tokens == 'subwords': # If we compute subwords, need to renormalize for num words metrics["average_subword_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) metrics["average_word_ppl"] = MetricsLambda( lambda x: math.exp(x * valid_set.tensors[-1].numel() / valid_num_words), metrics["average_nll"]) else: metrics["average_word_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) if args.local_rank < 1: RunningAverage(output_transform=lambda x: x).attach( trainer, "valid_loss") trainer.add_event_handler( Events.EPOCH_COMPLETED, lambda _: print( "Epoch[{}] Training Loss: {:.2f}, Perplexity {:.2f}".format( trainer.state.epoch, trainer.state.output, np.exp(trainer.state.output)))) evaluator.add_event_handler( Events.COMPLETED, lambda _: print("Validation: %s" % pformat( evaluator.state.metrics))) checkpoint_handler = ModelCheckpoint(args.basedir, 'checkpoint', save_interval=1, n_saved=3, create_dir=False) trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) trainer.run(train_loader, max_epochs=args.epochs)
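# `average_distributed_scalar` is imported from elsewhere in this script's package; a
# representative sketch (assumed, not necessarily the original helper) is shown below. It
# averages a Python scalar across all workers so the perplexity metrics above reflect the
# whole validation set rather than a single rank's shard.
import torch

def average_distributed_scalar(scalar, args):
    if not args.distributed:
        return scalar
    scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device)
    scalar_t /= torch.distributed.get_world_size()
    torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
    return scalar_t.item()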