def setup_timer(engine):
    """Attach an averaging ``Timer`` to *engine* and return it.

    The timer restarts at every epoch and accumulates only time spent
    inside iterations: it resumes on ITERATION_STARTED and pauses on
    ITERATION_COMPLETED, so dataloading gaps between iterations are
    excluded from the measurement.
    """
    iteration_timer = Timer(average=True)
    iteration_timer.attach(
        engine,
        start=Events.EPOCH_STARTED,
        resume=Events.ITERATION_STARTED,
        pause=Events.ITERATION_COMPLETED,
    )
    return iteration_timer
def add_progress_bar_eval(evaluator, validation_loader):
    """ "I can't believe it's not Keras"

    Attach running-average loss/accuracy metrics, a TQDM progress bar and a
    per-epoch timer to *evaluator*, and log a one-line summary after every
    evaluation epoch.

    Args:
        evaluator: ignite Engine whose step output is indexed below.
            NOTE(review): the 'loss' transform reads x[0] while the
            'accuracy' transform reads (x[0], x[1]) as (y_pred, y) — the
            two transforms disagree on what x[0] is; confirm the step
            function's output layout before trusting both metrics.
        validation_loader: unused here; kept for interface compatibility.
    """
    validation_history = {'accuracy': [], 'loss': []}
    RunningAverage(output_transform=lambda x: x[0]).attach(evaluator, 'loss')
    RunningAverage(Accuracy(output_transform=lambda x: (x[0], x[1]))).attach(
        evaluator, 'accuracy')

    prog_bar = ProgressBar()
    prog_bar.attach(evaluator, ['accuracy'])
    # prog_bar.pbar_cls=tqdm.tqdm

    from ignite.handlers import Timer
    # Measures wall-clock time of each full evaluation epoch.
    timer = Timer(average=True)
    timer.attach(evaluator,
                 start=Events.EPOCH_STARTED,
                 resume=Events.EPOCH_STARTED,
                 pause=Events.EPOCH_COMPLETED,
                 step=Events.EPOCH_COMPLETED)

    @evaluator.on(Events.EPOCH_COMPLETED)
    def log_validation_results(evaluator):
        metrics = evaluator.state.metrics
        accuracy = metrics['accuracy'] * 100
        # BUG FIX: the loss metric is attached under the key 'loss' above;
        # reading metrics['nll'] raised KeyError on every epoch.
        loss = metrics['loss']
        validation_history['accuracy'].append(accuracy)
        validation_history['loss'].append(loss)
        val_msg = "Valid Epoch {}: acc: {:.2f}% loss: {:.2f}, eval time: {:.2f}s".format(
            evaluator.state.epoch, accuracy, loss, timer.value())
        prog_bar.log_message(val_msg)
class DataflowBenchmark:
    """Measure raw dataflow throughput by pushing *num_iters* batches
    through a no-op Engine that only moves data to the current device."""

    def __init__(self, num_iters=100, prepare_batch=None):
        """
        Args:
            num_iters: number of iterations to time.
            prepare_batch: optional callable(batch, device=..., non_blocking=...)
                used to transfer the batch to the current device; if None,
                the step is a pure no-op (only the loader is timed).
        """
        from ignite.handlers import Timer

        device = idist.device()

        def upload_to_gpu(engine, batch):
            # Only the transfer itself is timed; the result is discarded.
            if prepare_batch is not None:
                x, y = prepare_batch(batch, device=device, non_blocking=False)

        self.num_iters = num_iters
        self.benchmark_dataflow = Engine(upload_to_gpu)

        @self.benchmark_dataflow.on(Events.ITERATION_COMPLETED(once=num_iters))
        def stop_benchmark_dataflow(engine):
            engine.terminate()

        if idist.get_rank() == 0:
            # BUG FIX: num_iters // 100 is 0 when num_iters < 100, and an
            # event filter with every=0 raises ValueError in ignite.
            # Clamp to at least 1 so small benchmarks still work.
            progress_every = max(num_iters // 100, 1)

            @self.benchmark_dataflow.on(
                Events.ITERATION_COMPLETED(every=progress_every))
            def show_progress_benchmark_dataflow(engine):
                print(".", end=" ")

        # Total (non-averaged) time across all timed iterations.
        self.timer = Timer(average=False)
        self.timer.attach(
            self.benchmark_dataflow,
            start=Events.EPOCH_STARTED,
            resume=Events.ITERATION_STARTED,
            pause=Events.ITERATION_COMPLETED,
            step=Events.ITERATION_COMPLETED,
        )

    def attach(self, trainer, train_loader):
        """Run the benchmark once when *trainer* starts; report on rank 0."""
        from torch.utils.data import DataLoader

        @trainer.on(Events.STARTED)
        def run_benchmark(_):
            if idist.get_rank() == 0:
                print("-" * 50)
                print(" - Dataflow benchmark")

            self.benchmark_dataflow.run(train_loader)
            t = self.timer.value()

            if idist.get_rank() == 0:
                print(" ")
                print(" Total time ({} iterations) : {:.5f} seconds".format(
                    self.num_iters, t))
                print(" time per iteration : {} seconds".format(
                    t / self.num_iters))

                # Images/s only makes sense when the loader exposes batch_size.
                if isinstance(train_loader, DataLoader):
                    num_images = train_loader.batch_size * self.num_iters
                    print(" number of images / s : {}".format(
                        num_images / t))

                print("-" * 50)
def main(dataset_path, batch_size=256, max_epochs=10):
    """Train wide_resnet50_2 (100 classes) on *dataset_path* and report
    Accuracy/Loss on the train-eval and test splits when training finishes.

    Args:
        dataset_path: root directory handed to get_train_eval_loaders.
        batch_size: per-loader batch size.
        max_epochs: number of training epochs.
    """
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    device = "cuda"
    loaders = get_train_eval_loaders(dataset_path, batch_size=batch_size)
    train_loader, test_loader, eval_train_loader = loaders

    model = wide_resnet50_2(num_classes=100).to(device)
    optimizer = SGD(model.parameters(), lr=0.01)
    criterion = CrossEntropyLoss().to(device)

    def run_train_iteration(engine, batch):
        # One SGD step; the scalar loss feeds the progress bar below.
        inputs = convert_tensor(batch[0], device, non_blocking=True)
        targets = convert_tensor(batch[1], device, non_blocking=True)
        optimizer.zero_grad()
        predictions = model(inputs)
        batch_loss = criterion(predictions, targets)
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    trainer = Engine(run_train_iteration)

    # Average wall-clock time per epoch (stepped once per epoch).
    epoch_timer = Timer(average=True)
    epoch_timer.attach(trainer, step=Events.EPOCH_COMPLETED)
    ProgressBar(persist=True).attach(
        trainer, output_transform=lambda out: {"batch loss": out})

    metrics = {"Accuracy": Accuracy(), "Loss": Loss(criterion)}
    evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device, non_blocking=True)

    def log_metrics(engine, title):
        for name in metrics:
            print("\t{} {}: {:.2f}".format(title, name,
                                           engine.state.metrics[name]))

    @trainer.on(Events.COMPLETED)
    def run_validation(_):
        print("- Mean elapsed time for 1 epoch: {}".format(epoch_timer.value()))
        print("- Metrics:")
        # add_event_handler returns a removable handle usable as a context
        # manager, so each logging title is attached only for its own run.
        for title, loader in (("Train", eval_train_loader),
                              ("Test", test_loader)):
            with evaluator.add_event_handler(Events.COMPLETED, log_metrics,
                                             title):
                evaluator.run(loader)

    trainer.run(train_loader, max_epochs=max_epochs)
def run(cfg, train_loader, tr_comp, saver, trainer, valid_dict):
    """Wire checkpointing, timing, metrics and logging onto *trainer*, then run it.

    Args:
        cfg: config object; TRAIN.LOG_ITER_PERIOD, EVAL.EPOCH_PERIOD and
            TRAIN.MAX_EPOCHS are read here.
        train_loader: training loader; len() and batch_size are used for logging.
        tr_comp: training-components bundle; scheduler, loss_function_map,
            xent and state_dict() are read here.
        saver: provides model_dir for checkpoint files.
        trainer: ignite Engine executing the training step.
        valid_dict: validation datasets forwarded to eval_multi_dataset.
    """
    # TODO resume
    # trainer = Engine(...)
    # trainer.load_state_dict(state_dict)
    # trainer.run(data)

    # checkpoint: save everything in tr_comp's state dict each epoch, keep last 3.
    handler = ModelCheckpoint(saver.model_dir, 'train', n_saved=3,
                              create_dir=True)
    checkpoint_params = tr_comp.state_dict()
    trainer.add_event_handler(Events.EPOCH_COMPLETED, handler,
                              checkpoint_params)

    # Average per-iteration timer; counts only time inside iterations and is
    # reset manually in print_times() at the end of each epoch.
    timer = Timer(average=True)
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED,
                 step=Events.ITERATION_COMPLETED)

    # average metric to attach on trainer: Acc, Loss, plus one running
    # average per configured loss function.
    names = ["Acc", "Loss"]
    names.extend(tr_comp.loss_function_map.keys())
    for n in names:
        # NOTE(review): Run(n) is a project-defined output transform keyed by
        # the metric name — confirm it selects field `n` from the step output.
        RunningAverage(output_transform=Run(n)).attach(trainer, n)

    @trainer.on(Events.EPOCH_COMPLETED)
    def adjust_learning_rate(engine):
        # Step the LR scheduler once per epoch.
        tr_comp.scheduler.step()

    @trainer.on(Events.ITERATION_COMPLETED(every=cfg.TRAIN.LOG_ITER_PERIOD))
    def log_training_loss(engine):
        # Periodic progress line: epoch/iteration, current LR, every running
        # metric, and (optionally) the learned cross-entropy weight.
        message = f"Epoch[{engine.state.epoch}], " + \
                  f"Iteration[{engine.state.iteration}/{len(train_loader)}], " + \
                  f"Base Lr: {tr_comp.scheduler.get_last_lr()[0]:.2e}, "

        for loss_name in engine.state.metrics.keys():
            message += f"{loss_name}: {engine.state.metrics[loss_name]:.4f}, "

        if tr_comp.xent and tr_comp.xent.learning_weight:
            message += f"xentWeight: {tr_comp.xent.uncertainty.mean().item():.4f}, "

        logger.info(message)

    # adding handlers using `trainer.on` decorator API
    @trainer.on(Events.EPOCH_COMPLETED)
    def print_times(engine):
        # value() is the average per-iteration time, so value() * step_count
        # reconstructs the total time spent in this epoch's iterations.
        logger.info('Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]'
                    .format(engine.state.epoch,
                            timer.value() * timer.step_count,
                            train_loader.batch_size / timer.value()))
        logger.info('-' * 80)
        timer.reset()

    @trainer.on(Events.EPOCH_COMPLETED(every=cfg.EVAL.EPOCH_PERIOD))
    def log_validation_results(engine):
        logger.info(f"Valid - Epoch: {engine.state.epoch}")
        eval_multi_dataset(cfg, valid_dict, tr_comp)

    trainer.run(train_loader, max_epochs=cfg.TRAIN.MAX_EPOCHS)
def engine_eval_geomreg(cfg, mode):
    """Run one pass of geometric-registration evaluation for *mode*.

    Loads both models from the configured checkpoint, evaluates the whole
    dataloader once with per-iteration progress logging, and returns the
    log root path.
    """
    prepare_config_eval(cfg)

    ckpt_path = cfg.eval.general.ckpt_path
    root_path = cfg.log.root_path

    # Logging and reproducibility setup.
    eu.redirect_stdout(root_path, 'eval_geomreg-{}'.format(mode))
    eu.print_config(cfg)
    eu.seed_random(cfg.general.seed)
    device = eu.get_device(cfg.general.gpu)

    dataloader = get_dataloader_eval_geomreg(cfg, mode)
    num_batches = len(dataloader)

    # Put both models on the device in eval mode and report their sizes.
    render_model, desc_model = get_models(cfg)
    for net, label in ((render_model, 'render_model'),
                       (desc_model, 'desc_model')):
        net.to(device)
        net.eval_mode()
        net.print_params(label)

    assert eu.is_not_empty(ckpt_path)
    render_model.load(ckpt_path)
    desc_model.load(ckpt_path)

    step_fn = functools.partial(step_eval_geomreg,
                                render_model=render_model,
                                desc_model=desc_model,
                                device=device,
                                cfg=cfg)
    engine = Engine(step_fn)

    # Average per-iteration timer used by the progress logger.
    eval_timer = Timer(average=True)
    eval_timer.attach(engine,
                      start=Events.EPOCH_STARTED,
                      pause=Events.EPOCH_COMPLETED,
                      resume=Events.ITERATION_STARTED,
                      step=Events.ITERATION_COMPLETED)
    engine.add_event_handler(Events.ITERATION_COMPLETED, eu.print_eval_log,
                             timer=eval_timer, num_batches=num_batches)
    engine.add_event_handler(Events.EXCEPTION_RAISED, eu.handle_exception)

    # Single epoch over the evaluation set.
    engine.run(dataloader, 1)
    return root_path
def visdom_loss_handler(modules_dict, model_name):
    """
    Attaches plots and metrics to trainer.

    This handler creates or connects to an environment on a running Visdom
    dashboard and creates a line plot that tracks the loss function of a
    training loop as a function of the number of iterations. This can be
    attached to an Ignite Engine, and the training closure must have 'loss'
    as one of the keys in its return dict for this plot to be made.

    See documentation for Ignite (https://github.com/pytorch/ignite) and
    Visdom (https://github.com/facebookresearch/visdom) for more information.
    """
    # NOTE(review): `trainer`, `environment` and `description` are free
    # variables here — presumably module-level globals defined elsewhere in
    # this file. Confirm they exist, or consider passing them as parameters.
    tim = Timer()
    # Wall-clock since training start; never paused, so value() is total
    # elapsed time, stepped each iteration.
    tim.attach(
        trainer,
        start=Events.STARTED,
        step=Events.ITERATION_COMPLETED,
    )
    vis = visdom.Visdom(env=environment)

    def create_plot_window(vis, xlabel, ylabel, title):
        # Seed the plot with a single NaN point so later calls can use
        # update='append' against a valid window handle.
        return vis.line(X=np.array([1]),
                        Y=np.array([np.nan]),
                        opts=dict(xlabel=xlabel, ylabel=ylabel, title=title))

    train_loss_window = create_plot_window(vis, '#Iterations', 'Loss',
                                           description)
    log_interval = 10

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # NOTE(review): `iter` shadows the builtin; harmless here but worth
        # renaming in a follow-up.
        iter = (engine.state.iteration - 1)
        if iter % log_interval == 0:
            print("Epoch[{}] Iteration: {} Time: {} Loss: {:.2f}".format(
                engine.state.epoch, iter,
                str(datetime.timedelta(seconds=int(tim.value()))),
                engine.state.output))
            vis.line(X=np.array([engine.state.iteration]),
                     Y=np.array([engine.state.output]),
                     update='append',
                     win=train_loss_window)

    save_interval = 50
    # NOTE(review): the `save_interval` kwarg was removed from
    # ModelCheckpoint in newer ignite releases (periodic saving is done via
    # Events.ITERATION_COMPLETED(every=...) instead) — verify the pinned
    # ignite version still supports it.
    handler = ModelCheckpoint('/tmp/models',
                              model_name,
                              save_interval=save_interval,
                              n_saved=5,
                              create_dir=True,
                              require_empty=False)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, handler,
                              modules_dict)