def setup_tb_logging(output_path, trainer, optimizers=None, evaluators=None, log_every_iters=100):
    """Method to setup TensorBoard logging on trainer and a list of evaluators.

    Logged metrics are:
        - Training metrics, e.g. running average loss values
        - Learning rate(s)
        - Evaluation metrics

    Args:
        output_path (str): logging directory path
        trainer (Engine): trainer engine
        optimizers (torch.optim.Optimizer or dict of torch.optim.Optimizer, optional): single or dictionary
            of torch optimizers. If a dictionary, keys are used as tags arguments for logging.
        evaluators (Engine or dict of Engine, optional): single or dictionary of evaluators. If a dictionary,
            keys are used as tags arguments for logging.
        log_every_iters (int, optional): interval for loggers attached to iteration events. To log every iteration,
            value can be set to 1 or None.

    Returns:
        TensorboardLogger
    """
    tb_logger = TensorboardLogger(log_dir=output_path)
    setup_any_logging(
        tb_logger, tb_logger_module, trainer, optimizers, evaluators, log_every_iters=log_every_iters
    )
    return tb_logger
def setup_tb_logging(
    output_path: str,
    trainer: Engine,
    optimizers: Optional[Union[Optimizer, Dict[str, Optimizer]]] = None,
    evaluators: Optional[Union[Engine, Dict[str, Engine]]] = None,
    log_every_iters: int = 100,
    **kwargs: Any,
) -> TensorboardLogger:
    """Method to setup TensorBoard logging on trainer and a list of evaluators.

    Logged metrics are:
        - Training metrics, e.g. running average loss values
        - Learning rate(s)
        - Evaluation metrics

    Args:
        output_path (str): logging directory path
        trainer (Engine): trainer engine
        optimizers (torch.optim.Optimizer or dict of torch.optim.Optimizer, optional): single or dictionary
            of torch optimizers. If a dictionary, keys are used as tags arguments for logging.
        evaluators (Engine or dict of Engine, optional): single or dictionary of evaluators. If a dictionary,
            keys are used as tags arguments for logging.
        log_every_iters (int, optional): interval for loggers attached to iteration events. To log every iteration,
            value can be set to 1 or None.
        **kwargs: optional keyword args to be passed to construct the logger.

    Returns:
        :class:`~ignite.contrib.handlers.tensorboard_logger.TensorboardLogger`
    """
    logger = TensorboardLogger(log_dir=output_path, **kwargs)
    _setup_logging(logger, trainer, optimizers, evaluators, log_every_iters)
    return logger
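# Usage sketch for the helper above, assuming a `trainer`, `val_evaluator` and `optimizer`
# already built elsewhere in the script; the output path and tag names are placeholders.
tb_logger = setup_tb_logging(
    output_path="/tmp/tb-logs",
    trainer=trainer,
    optimizers=optimizer,                      # or {"generator": opt_g, "discriminator": opt_d}
    evaluators={"validation": val_evaluator},  # dict keys are used as TensorBoard tags
    log_every_iters=100,
)
# ... trainer.run(train_loader, max_epochs=...) ...
tb_logger.close()  # flush and close the underlying SummaryWriter once training is done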
def set_tensorboard(self, metrics):
    """Extension method for logging on TensorBoard.

    Uses ``self.trainer`` and ``self.evaluator`` attached to this object.

    Args:
        metrics (list of str): names of the validation metrics to log.
    """
    logger = TensorboardLogger(log_dir=self.res_dir / 'tensorboard' / 'train')
    _log_tensorboard(logger, self.trainer, f"{self.prefix}/train", self.step_func, ["loss"])
    _log_tensorboard(logger, self.evaluator, f"{self.prefix}/val", self.step_func, metrics)
def setup_all_loggers(conf: OmegaConf) -> Tuple[TensorboardLogger, TensorboardLogger, Path]:
    folder = Path(conf.checkpoint.folder).expanduser().resolve() / conf.fullname
    folder.mkdir(parents=True, exist_ok=True)

    # Loguru logger: stderr and logs.txt
    add_logfile(folder / "logs.txt")
    logger.info(f"PID: {os.getpid()}")
    logger.info(f"Host: {socket.gethostname()}")
    logger.info(f'SLURM_JOB_ID: {os.getenv("SLURM_JOB_ID")}')

    # Tensorboard: two loggers, the second one specifically for images, so the first one stays slim
    tb_logger = TensorboardLogger(logdir=folder)
    tb_img_logger = TensorboardLogger(logdir=folder, filename_suffix=".images")
    add_custom_scalars(tb_logger.writer)
    add_hparam_summary(tb_logger.writer, conf.hparams)

    # Json: only validation metrics
    json_logger = folder / "metrics.json"

    return tb_logger, tb_img_logger, json_logger
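# Usage sketch for the two-logger split above, assuming `conf`, a step counter and an
# `image_tensor` exist elsewhere; scalars go to the slim logger, images to the dedicated one.
tb_logger, tb_img_logger, json_path = setup_all_loggers(conf)
tb_logger.writer.add_scalar("train/loss", 0.123, global_step=1)               # stays in the slim event file
tb_img_logger.writer.add_image("val/sample_0", image_tensor, global_step=1)   # heavy payloads kept separate
tb_logger.close()
tb_img_logger.close()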
def setup_ignite(engine: Engine, parameters: SimpleNamespace, experience_source: ExperienceSourceFirstLast, run_name: str, extra_metrics: Iterable[str] = ()): warnings.simplefilter("ignore", category=UserWarning) handler: EndOfEpisodeHandler = EndOfEpisodeHandler(experience_source, bound_avg_reward=parameters.stop_reward) handler.attach(engine) EpisodeFPSHandler().attach(engine) @engine.on(EpisodeEvents.EPISODE_COMPLETED) def episode_completed(trainer: Engine): time_passed = trainer.state.metrics.get('time_passed', 0) print("Episode %d: reward=%.0f, steps=%s, " "speed=%.1f f/s, elapsed=%s" % ( trainer.state.episode, trainer.state.episode_reward, trainer.state.episode_steps, trainer.state.metrics.get('avg_fps', 0), timedelta(seconds=int(time_passed)))) @engine.on(EpisodeEvents.BOUND_REWARD_REACHED) def game_solved(trainer: Engine): time_passed = trainer.state.metrics['time_passed'] print("Game solved in %s, after %d episodes " "and %d iterations!" % ( timedelta(seconds=int(time_passed)), trainer.state.episode, trainer.state.iteration)) trainer.should_terminate = True now = datetime.now().isoformat(timespec='minutes') log_directory: str = f"runs/{now}-{parameters.run_name}-{run_name}" tensorboard_logger: TensorboardLogger = TensorboardLogger(log_dir=log_directory) running_average: RunningAverage = RunningAverage(output_transform=lambda v: v['loss']) running_average.attach(engine, "avg_loss") metrics: List[str] = ['reward', 'steps', 'avg_reward'] output_handler: OutputHandler = OutputHandler(tag="episodes", metric_names=metrics) episode_completed_event = EpisodeEvents.EPISODE_COMPLETED tensorboard_logger.attach(engine, log_handler=output_handler, event_name=episode_completed_event) PeriodicEvents().attach(engine) metrics = ['avg_loss', 'avg_fps'] metrics.extend(extra_metrics) output_handler = OutputHandler(tag="train", metric_names=metrics, output_transform=lambda a: a) hundred_iterations_event = PeriodEvents.ITERS_100_COMPLETED tensorboard_logger.attach(engine, log_handler=output_handler, event_name=hundred_iterations_event)
def run(epochs, lr, momentum, log_interval, params, trainloader, testloader, model): device = "cuda" if torch.cuda.is_available() else "cpu" net = Net(params).to(device) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum) trainer = create_supervised_trainer(net, optimizer, criterion, device=device) trainer.logger = setup_logger("trainer") val_metrics = { "accuracy": Accuracy(), "loss": Loss(criterion), "recall": Recall() } evaluator = create_supervised_evaluator(net, metrics=val_metrics, device=device) evaluator.logger = setup_logger("evaluator") # Attach handler to plot trainer's loss every 100 iterations tb_logger = TensorboardLogger(log_dir="cifar-output") tb_logger.attach_output_handler( trainer, event_name=Events.ITERATION_COMPLETED(every=params.get("loss_report")), tag="training", output_transform=lambda loss: {"loss": loss}, ) # Attach handler to dump evaluator's metrics every epoch completed for tag, evaluator in [("training", trainer), ("validation", evaluator)]: tb_logger.attach_output_handler( evaluator, event_name=Events.EPOCH_COMPLETED, tag=tag, metric_names="all", global_step_transform=global_step_from_engine(trainer), ) # Attach function to build debug images and report every epoch end tb_logger.attach( evaluator, log_handler=predictions_gt_images_handler, event_name=Events.EPOCH_COMPLETED(once=1), ) desc = "ITERATION - loss: {:.2f}" pbar = tqdm(initial=0, leave=False, total=len(trainloader), desc=desc.format(0)) @trainer.on(Events.ITERATION_COMPLETED(every=log_interval)) def log_training_loss(engine): pbar.desc = desc.format(engine.state.output) pbar.update(log_interval) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): pbar.refresh() evaluator.run(trainloader) metrics = evaluator.state.metrics avg_accuracy = metrics["accuracy"] avg_nll = metrics["loss"] tqdm.write( "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_accuracy, avg_nll)) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(testloader) metrics = evaluator.state.metrics avg_accuracy = metrics["accuracy"] avg_nll = metrics["loss"] tqdm.write( "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}" .format(engine.state.epoch, avg_accuracy, avg_nll)) pbar.n = pbar.last_print_n = 0 @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED) def log_time(): tqdm.write("{} took {} seconds".format( trainer.last_event_name.name, trainer.state.times[trainer.last_event_name.name], )) trainer.run(trainloader, max_epochs=epochs) pbar.close() PATH = "./cifar_net.pth" # CONDITION depicts a custom condition for when to save the model. The model is saved and then updated in ClearML CONDITION = True if CONDITION: torch.save(net.state_dict(), PATH) model.update_weights(weights_filename=PATH) print("Finished Training") print("Task ID number is: {}".format(Task.current_task().id))
def run(output_dir, config): device = torch.device("cuda" if args.use_cuda else "cpu") torch.manual_seed(config['seed']) np.random.seed(config['seed']) # Rescale batch_size and num_workers ngpus_per_node = 1 batch_size = config['batch_size'] num_workers = int( (config['num_workers'] + ngpus_per_node - 1) / ngpus_per_node) (train_loader, test_loader, mislabeled_train_loader) = get_train_test_loaders( path=config['data_path'], batch_size=batch_size, num_workers=num_workers, random_seed=config['seed'], random_labels_fraction=config['random_labels_fraction'], ) model = get_mnist_model(args, device) optimizer = AdamFlexibleWeightDecay( model.parameters(), lr=config['init_lr'], weight_decay_order=config['weight_decay_order'], weight_decay=config['weight_decay']) criterion = nn.CrossEntropyLoss().to(device) le = len(train_loader) lr_scheduler = MultiStepLR(optimizer, milestones=[le * config['epochs'] * 3 // 4], gamma=0.1) def _prepare_batch(batch, device, non_blocking): x, y = batch return (convert_tensor(x, device=device, non_blocking=non_blocking), convert_tensor(y, device=device, non_blocking=non_blocking)) def process_function(unused_engine, batch): x, y = _prepare_batch(batch, device=device, non_blocking=True) model.train() optimizer.zero_grad() y_pred = model(x) if config['agreement_threshold'] > 0.0: # The "batch_size" in this function refers to the batch size per env # Since we treat every example as one env, we should set the parameter # n_agreement_envs equal to batch size mean_loss, masks = and_mask_utils.get_grads( agreement_threshold=config['agreement_threshold'], batch_size=1, loss_fn=criterion, n_agreement_envs=config['batch_size'], params=optimizer.param_groups[0]['params'], output=y_pred, target=y, method=args.method, scale_grad_inverse_sparsity=config[ 'scale_grad_inverse_sparsity'], ) else: mean_loss = criterion(y_pred, y) mean_loss.backward() optimizer.step() return {} trainer = Engine(process_function) metric_names = [] common.setup_common_training_handlers(trainer, output_path=output_dir, lr_scheduler=lr_scheduler, output_names=metric_names, with_pbar_on_iters=True, log_every_iters=10) tb_logger = TensorboardLogger(log_dir=output_dir) tb_logger.attach(trainer, log_handler=OutputHandler(tag="train", metric_names=metric_names), event_name=Events.ITERATION_COMPLETED) metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)} test_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) mislabeled_train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) def run_validation(engine): if args.use_cuda: torch.cuda.synchronize() train_evaluator.run(train_loader) if config['random_labels_fraction'] > 0.0: mislabeled_train_evaluator.run(mislabeled_train_loader) test_evaluator.run(test_loader) def flush_metrics(engine): tb_logger.writer.flush() trainer.add_event_handler(Events.EPOCH_STARTED(every=1), run_validation) trainer.add_event_handler(Events.COMPLETED, run_validation) trainer.add_event_handler(Events.EPOCH_COMPLETED, flush_metrics) ProgressBar(persist=False, desc="Train evaluation").attach(train_evaluator) ProgressBar(persist=False, desc="Test evaluation").attach(test_evaluator) ProgressBar(persist=False, desc="Train (mislabeled portion) evaluation").attach( mislabeled_train_evaluator) tb_logger.attach( train_evaluator, log_handler=OutputHandler( tag="train", 
metric_names=list(metrics.keys()), global_step_transform=global_step_from_engine(trainer)), event_name=Events.COMPLETED) tb_logger.attach( test_evaluator, log_handler=OutputHandler( tag="test", metric_names=list(metrics.keys()), global_step_transform=global_step_from_engine(trainer)), event_name=Events.COMPLETED) tb_logger.attach( mislabeled_train_evaluator, log_handler=OutputHandler( tag="train_wrong", metric_names=list(metrics.keys()), global_step_transform=global_step_from_engine(trainer)), event_name=Events.COMPLETED) trainer_rng = np.random.RandomState() trainer.run(train_loader, max_epochs=config['epochs'], seed=trainer_rng.randint(2**32)) tb_logger.close()
def main(args):
    # check cuda available
    assert torch.cuda.is_available()

    # when the input dimension does not change, add this flag to speed up
    cudnn.benchmark = True

    num_folds = len(args.folds)

    # model ckpt name prefix
    model_save_dir = '_'.join([args.model, args.problem_type, str(args.lr)])
    # we can add more params for comparison in future experiments
    model_save_dir = '_'.join([model_save_dir, str(args.jaccard_weight),
                               str(args.batch_size), str(args.input_height), str(args.input_width)])
    if args.semi:
        model_save_dir = '_'.join([model_save_dir, args.semi_method, str(args.semi_percentage)])

    # model save directory
    model_save_dir = Path(args.model_save_dir) / model_save_dir
    model_save_dir.mkdir(exist_ok=True, parents=True)
    args.model_save_dir = str(model_save_dir)  # e.g. $ROOT_DIR/model/UNet_binary_1e-5/

    # logger
    logging_logger = logging.getLogger('train')
    logging_logger.propagate = False  # this logger should not propagate to parent logger
    # logger log_level
    logging_logger.setLevel(args.log_level)
    # logging format
    formatter = logging.Formatter("[%(asctime)s %(levelname)s %(filename)s:%(lineno)d] %(message)s")
    # console log handler: write log to console
    rf_handler = logging.StreamHandler()
    rf_handler.setFormatter(formatter)
    logging_logger.addHandler(rf_handler)
    # file log handler: write log to file for each fold
    f_handler = logging.FileHandler(str(model_save_dir / (args.log_filename + '.log')))
    f_handler.setFormatter(formatter)
    logging_logger.addHandler(f_handler)
    # add as args
    args.logging_logger = logging_logger

    # TODO: add tensorboardX and tf logger in ignite
    # visualization of internal values (attention maps, gated outputs, etc)
    if args.tb_log:
        # tensorboard logger
        tf_log_dir = model_save_dir / 'tb_logs'
        tf_log_dir.mkdir(exist_ok=True, parents=True)
        tb_logger = TensorboardLogger(log_dir=str(tf_log_dir))
        # add as arguments
        args.tb_logger = tb_logger

    # input params
    input_msg = 'Input arguments:\n'
    for arg_name, arg_val in vars(args).items():
        input_msg += '{}: {}\n'.format(arg_name, arg_val)
    logging_logger.info(input_msg)

    # metrics mean and std
    mean_metrics = {'miou': 0, 'std_miou': 0, 'mdice': 0, 'std_mdice': 0}
    args.mean_metrics = mean_metrics

    # when the input dimension does not change, add this flag to speed up
    cudnn.enabled = True
    cudnn.benchmark = True

    for fold in args.folds:
        # train/validate this fold and collect its per-epoch metric records
        # metrics_records_fold = train_fold(fold, args)  # input params
        metrics_records_fold = process_fold(fold, args)

        # find best validation results on mean_iou
        best_epoch, best_record = sorted(metrics_records_fold.items(),
                                         key=lambda item: item[1]['miou'],
                                         reverse=True)[0]

        # accumulate metrics and calculate mean for all folds
        for metric_name, val in best_record.items():
            mean_metrics[metric_name] += val / num_folds

        # print fold results
        logging_logger.info('fold: %d, (on epoch %d) metrics: %s' % (fold, best_epoch, best_record))

    # print mean results
    avg_results_log = 'average on validation for %d folds %s:\n' % (num_folds, args.folds)
    for metric_name, val in mean_metrics.items():
        avg_results_log += '%s: %.3f\n' % (metric_name, val)
    logging_logger.info(avg_results_log)
num_workers=10) classes = ( "plane", "car", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck", ) tb_logger = TensorboardLogger(log_dir="cifar-output") # Helper function to store predictions and scores using matplotlib def predictions_gt_images_handler(engine, logger, *args, **kwargs): x, _ = engine.state.batch y_pred, y = engine.state.output num_x = num_y = 4 le = num_x * num_y fig = plt.figure(figsize=(20, 20)) trans = transforms.ToPILImage() for idx in range(le): preds = torch.argmax(F.softmax(y_pred[idx], dim=0)) probs = torch.max(F.softmax(y_pred[idx], dim=0)) ax = fig.add_subplot(num_x, num_y, idx + 1, xticks=[], yticks=[])
metric_names = ['_batch_train_loss', *losses.keys()] common.setup_common_training_handlers(trainer, train_sampler=train_sampler, to_save=to_save, save_every_iters=hp['save_every_iters'], output_path=str(output_path), lr_scheduler=scheduler, with_gpu_stats=True, output_names=metric_names, with_pbars=True, with_pbar_on_iters=True, log_every_iters=hp['log_progress_every_iters'], device=backend_conf.device) if backend_conf.rank == 0: tb_logger = TensorboardLogger(log_dir=str(output_path)) tb_logger.attach(trainer, log_handler=OutputHandler(tag='train', metric_names=metric_names), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer, param_name='lr'), event_name=Events.ITERATION_STARTED) # TODO: make sure hp params logging works here + use test eval metrics instead of training's tb_logger.attach(trainer, log_handler=HyperparamsOutoutHandler(hp, metric_names=metric_names), event_name=Events.COMPLETED) def _metrics(prefix): return {**{f'{prefix}_{n}': m for n, m in metrics.items()}, **{f'{prefix}_{n}': loss for n, loss in losses.items()}} valid_evaluator = create_supervised_evaluator(model, metrics=_metrics('valid'), device=device, non_blocking=True) train_evaluator = create_supervised_evaluator(model, metrics=_metrics('train'), device=device, non_blocking=True) @trainer.on(Events.EPOCH_STARTED(every=hp['validate_every_epochs'])) @trainer.on(Events.COMPLETED) def _run_validation(engine: Engine): if torch.cuda.is_available() and not backend_conf.is_cpu:
def train(epochs=500, batch_size=32, bptt_len=70, lr=0.00025, log_steps=200, clip_grad=0.25, log_dir="experiments"): ################################################################### # Dataset ################################################################### wt = wikitext103(batch_size=batch_size, bptt_len=bptt_len) # wt = wikitext2(batch_size=batch_size, bptt_len=bptt_len) ################################################################### # Configs ################################################################### embedding_config = DropEmbedding.Hyperparams(len(wt.text_field.vocab) + 3, ninp=512) encoder_config = TransformerEncoder.Hyperparams( att_num_units=[512, 512, 512, 512, 512, 512], max_ext=384) ################################################################### # Models ################################################################### base_embedding = DropEmbedding(embedding_config) embedding = TransformerEmbedding(embedding=base_embedding, max_length=bptt_len, embedding_size=embedding_config.ninp, use_positional_embedding=False) encoder = TransformerEncoder(encoder_config) model = TransformerLanguageModel(embedding, encoder) model.init_weight() ################################################################### # Loss ################################################################### criterion = lm_criterion(in_features=encoder_config.att_num_units[-1], vocab_size=len(wt.text_field.vocab)) ################################################################### # Parameters + Train ops ################################################################### parameters = (list(model.parameters()) + list(criterion.parameters())) tot_params = 0 for p in parameters: tot_params += reduce(lambda x, y: x * y, p.size()) print("Total Parameters: ", tot_params) opt = optim.Adam(parameters, lr=lr) model.to(DEVICE) criterion.to(DEVICE) ################################################################### # Train + Evaluation ################################################################### def train_step(engine, batch): model.train() opt.zero_grad() text = batch.text.to(DEVICE).t().contiguous() target = batch.target.to(DEVICE).t().contiguous() out, out_past = model(text, engine.state.train_past) engine.state.train_past = out_past raw_loss = criterion(out.view(-1, out.size(2)), target.view(-1)) loss = raw_loss[1] loss.backward() nn.utils.clip_grad_norm_(parameters, clip_grad) opt.step() return {"train_loss": loss.item(), "train_ppl": loss.exp().item()} def eval_step(engine, batch): model.eval() if not hasattr(engine.state, "eval_past"): engine.state.eval_past = None with torch.no_grad(): text = batch.text.to(DEVICE).t().contiguous() target = batch.target.to(DEVICE).t().contiguous() out, out_past = model(text, engine.state.eval_past) engine.state.eval_past = out_past raw_loss = criterion(out.view(-1, out.size(2)), target.view(-1)) loss = raw_loss[1] return {"val_loss": loss.item()} train_engine = Engine(train_step) eval_engine = Engine(eval_step) def reset_state(engine): engine.state.train_past = None def run_eval(_): print("start running eval") eval_engine.run(wt.valid_iter) metrics = eval_engine.state.metrics print("Validation loss: ", metrics["val_loss"], ", ppl: ", np.exp(metrics["val_loss"])) train_engine.add_event_handler(Events.EPOCH_STARTED, reset_state) train_engine.add_event_handler(Events.EPOCH_COMPLETED, run_eval) ################################################################### # LR Scheduler ################################################################### cosine_scheduler = 
CosineAnnealingScheduler(opt.param_groups[0], "lr", 0.0, 2.5e-4, cycle_size=len(wt.train_iter)) warmup_scheduler = create_lr_scheduler_with_warmup(cosine_scheduler, 0.0, 2.5e-4, 200) train_engine.add_event_handler(Events.ITERATION_STARTED, warmup_scheduler) ################################################################### # Metrics ################################################################### RunningAverage(output_transform=lambda x: x["train_ppl"]).attach( train_engine, "train_ppl") RunningAverage(output_transform=lambda x: x["train_loss"]).attach( train_engine, "train_loss") RunningAverage(output_transform=lambda x: x["val_loss"]).attach( eval_engine, "val_loss") progress_bar = ProgressBar(persist=True) progress_bar.attach(train_engine, ["train_ppl", "train_loss"]) progress_bar_val = ProgressBar(persist=True) progress_bar_val.attach(eval_engine, ["val_loss"]) ################################################################### # Tensorboard ################################################################### tb_logger = TensorboardLogger(log_dir=log_dir) def stepn_logger(num_steps, handler): def logger_runner(engine, log_handler, event_name): if engine.state.iteration % num_steps == 0: handler(engine, log_handler, event_name) return logger_runner tb_logger.attach(train_engine, log_handler=stepn_logger( log_steps, OutputHandler(tag="training", output_transform=lambda loss: loss)), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(eval_engine, log_handler=OutputHandler( tag="validation", output_transform=lambda loss: loss, another_engine=train_engine), event_name=Events.EPOCH_COMPLETED) tb_logger.attach(train_engine, log_handler=stepn_logger(log_steps, OptimizerParamsHandler(opt)), event_name=Events.ITERATION_STARTED) tb_logger.attach(train_engine, log_handler=stepn_logger(log_steps, WeightsScalarHandler(model)), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(train_engine, log_handler=stepn_logger(log_steps, GradsScalarHandler(model)), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(train_engine, log_handler=stepn_logger(500, WeightsHistHandler(model)), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(train_engine, log_handler=stepn_logger(500, GradsHistHandler(model)), event_name=Events.ITERATION_COMPLETED) try: train_engine.run(wt.train_iter, max_epochs=epochs) except Exception: pass finally: tb_logger.close()
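# Alternative sketch: the stepn_logger wrapper above throttles handlers by hand; ignite's
# filtered events (used in other snippets here) can express the same throttling directly at
# attach time. Assumes the same tb_logger, train_engine, model and log_steps as above.
tb_logger.attach(train_engine,
                 log_handler=OutputHandler(tag="training",
                                           output_transform=lambda loss: loss),
                 event_name=Events.ITERATION_COMPLETED(every=log_steps))
tb_logger.attach(train_engine,
                 log_handler=WeightsHistHandler(model),
                 event_name=Events.ITERATION_COMPLETED(every=500))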
def run(warmup_iterations=5000, batch_size=4, test_size=2000, epochs=10, log_interval=100, debug_images_interval=50, train_dataset_ann_file='~/bigdata/coco/annotations/instances_train2017.json', val_dataset_ann_file='~/bigdata/coco/annotations/instances_val2017.json', input_checkpoint='', load_optimizer=False, load_params=False, output_dir="/tmp/checkpoints", log_dir="/tmp/tensorboard_logs", lr=0.005, momentum=0.9, weight_decay=0.0005, use_mask=True, use_toy_testing_data=False, backbone_name='resnet101', num_workers=6, trainable_layers=3, train_set_size=None, early_stopping=False, patience=3, step_size=3, gamma=0.1, record_histograms=True): # Set the training device to GPU if available - if not set it to CPU device = torch.cuda.current_device() if torch.cuda.is_available() else torch.device('cpu') torch.backends.cudnn.benchmark = True if torch.cuda.is_available() else False # optimization for fixed input size # Write hyperparams hparam_dict = { 'warmup_iterations': warmup_iterations, 'training_batch_size': batch_size, 'test_size': test_size, 'epochs': epochs, 'trainable_layers': trainable_layers, 'lr': lr, 'momentum': momentum, 'weight_decay': weight_decay, 'train_set_size': train_set_size, 'step_size': step_size, 'gamma': gamma, 'early_stopping': early_stopping, 'patience': patience, 'total_iterations': 0, 'total_epochs': 0, 'timeout': True, } # Load checkpoint if available if input_checkpoint: hparam_path = Path(input_checkpoint).parent / 'hparams.pickle' logger.info('Loading model checkpoint from '.format(input_checkpoint)) input_checkpoint = torch.load(input_checkpoint, map_location=torch.device(device)) # FIXME Bad overload with open(hparam_path, 'rb') as f: hparam_dict = pickle.load(f) # Load the training parameters from the saved hparam dictionary if load_params: warmup_iterations, batch_size, test_size, epochs, trainable_layers, lr, momentum,\ weight_decay, train_set_size, step_size, gamma, early_stopping, patience = itemgetter( 'warmup_iterations', 'training_batch_size', 'test_size', 'epochs', 'trainable_layers', 'lr', 'momentum', 'weight_decay', 'train_set_size', 'step_size', 'gamma', 'early_stopping', 'patience')(hparam_dict) try: train_set_size -= 1 except TypeError as e: pass print('Hparams: ', hparam_dict) # Define train and test datasets train_loader, val_loader, labels_enum = get_data_loaders(train_dataset_ann_file, val_dataset_ann_file, batch_size, test_size, configuration_data.get('image_size'), use_mask=use_mask, _use_toy_testing_set=use_toy_testing_data, num_workers=num_workers, train_set_size=train_set_size) # Hparams hparam_dict['training_set_size'] = len(train_loader) * batch_size hparam_dict['validation_set_size'] = len(val_loader) * batch_size with open(os.path.join(output_dir, 'hparams.pickle'), 'wb') as f: pickle.dump(hparam_dict, f) val_dataset = list(chain.from_iterable( zip(*copy.deepcopy(batch)) for batch in iter(val_loader))) # TODO Figure out what this does and use deepcopy. 
coco_api_val_dataset = convert_to_coco_api(val_dataset) num_classes = max(labels_enum.keys()) + 1 # number of classes plus one for background class configuration_data['num_classes'] = num_classes logger.info('Training with {} classes...'.format(num_classes)) if use_mask: logger.debug('Loading MaskRCNN Model...') model = get_model_instance_segmentation(num_classes, configuration_data.get('mask_predictor_hidden_layer')) else: logger.debug('Loading FasterRCNN Model...') model = get_model_instance_detection(num_classes, backbone_name=backbone_name, trainable_layers=trainable_layers) iou_types = get_iou_types(model) # if there is more than one GPU, parallelize the model if torch.cuda.device_count() > 1: logger.debug("{} GPUs were detected - we will use all of them".format(torch.cuda.device_count())) model = torch.nn.DataParallel(model) # copy the model to each device model.to(device) if input_checkpoint: model.load_state_dict(input_checkpoint['model']) logger.debug('Initializing SummaryWriter...') if use_mask: comment = 'mask' else: comment = 'box-{}'.format(backbone_name) logger.debug('Creating Trainer...') # define Ignite's train and evaluation engine trainer = create_trainer(model, device) logger.debug('Creating Evaluator...') evaluator = create_evaluator(model, device) logger.debug('Initializing Tensorboard Logger...') tb_logger = TensorboardLogger(log_dir=log_dir, comment=comment) if record_histograms: tb_logger.attach( trainer, event_name=Events.ITERATION_COMPLETED(every=500), log_handler=WeightsHistHandler(model) ) writer = tb_logger.writer logger.debug('Setting up profiler...') profiler = BasicTimeProfiler() profiler.attach(trainer) coco_ap = CocoAP(coco_api_val_dataset, iou_types) coco_ap_05 = CocoAP5(coco_api_val_dataset, iou_types) coco_ap_075 = CocoAP75(coco_api_val_dataset, iou_types) coco_ap.attach(evaluator, "AP") coco_ap_05.attach(evaluator, "AP0.5") coco_ap_075.attach(evaluator, "AP0.75") tb_logger.attach( evaluator, log_handler=OutputHandler( tag='evaluation', metric_names=['AP', 'AP0.5', 'AP0.75'], global_step_transform=global_step_from_engine(trainer) ), event_name=Events.EPOCH_COMPLETED ) ## Early stopping def score_function(engine): ap_score = engine.state.metrics['AP'] return ap_score if early_stopping: handler = EarlyStopping(patience=patience, score_function=score_function, trainer=trainer) # Note: the handler is attached to an *Evaluator* (runs one epoch on validation dataset). 
evaluator.add_event_handler(Events.COMPLETED, handler) @trainer.on(Events.EPOCH_COMPLETED) def log_intermediate_results(): logger.debug('Epoch Complete...') profiler.print_results(profiler.get_results()) @trainer.on(Events.STARTED) def on_training_started(engine): # construct an optimizer logger.info('Started Training...') params = [p for p in model.parameters() if p.requires_grad] engine.state.optimizer = torch.optim.SGD(params, lr=lr, momentum=momentum, weight_decay=weight_decay) tb_logger.attach( trainer, log_handler=OptimizerParamsHandler(engine.state.optimizer), event_name=Events.ITERATION_STARTED ) engine.state.scheduler = torch.optim.lr_scheduler.StepLR(engine.state.optimizer, step_size=step_size, gamma=gamma) if input_checkpoint: # Load traininer states trainer.state.epoch = input_checkpoint['epoch'] if 'iteration' in input_checkpoint: trainer.state.iteration = input_checkpoint['iteration'] else: trainer.state.iteration = int(hparam_dict['training_set_size'] / batch_size * input_checkpoint['epoch']) if load_optimizer: print('loading optimizer') logger.info('Loading optimizer and scheduler...') engine.state.optimizer.load_state_dict(input_checkpoint['optimizer']) engine.state.scheduler.load_state_dict(input_checkpoint['lr_scheduler']) engine.state.scheduler.last_epoch = trainer.state.epoch else: print('not loading optimizer') @trainer.on(Events.EPOCH_STARTED) def on_epoch_started(engine): logger.debug('Started Epoch...') model.train() engine.state.warmup_scheduler = None #TODO Print optimizer values if engine.state.epoch == 1: warmup_iters = min(warmup_iterations, len(train_loader) - 1) print('Warm up period was set to {} iterations'.format(warmup_iters)) warmup_factor = 1. / warmup_iters engine.state.warmup_scheduler = utils.warmup_lr_scheduler(engine.state.optimizer, warmup_iters, warmup_factor) @trainer.on(Events.ITERATION_COMPLETED) def on_iteration_completed(engine): images, targets, loss_dict_reduced = engine.state.output if engine.state.iteration % log_interval == 0: loss = sum(loss for loss in loss_dict_reduced.values()).item() print("Epoch: {}, Iteration: {}, Loss: {}".format(engine.state.epoch, engine.state.iteration, loss)) for k, v in loss_dict_reduced.items(): writer.add_scalar("loss/{}".format(k), v.item(), engine.state.iteration) writer.add_scalar("loss/total_loss", sum(loss for loss in loss_dict_reduced.values()).item(), engine.state.iteration) # writer.add_scalar("learning_rate/lr", engine.state.optimizer.param_groups[0]['lr'], engine.state.iteration) if engine.state.iteration % debug_images_interval == 0: for n, debug_image in enumerate(draw_debug_images(images, targets)): writer.add_image("training/image_{}".format(n), debug_image, engine.state.iteration, dataformats='HWC') if 'masks' in targets[n]: writer.add_image("training/image_{}_mask".format(n), draw_mask(targets[n]), engine.state.iteration, dataformats='HW') images = targets = loss_dict_reduced = engine.state.output = None @trainer.on(Events.EPOCH_COMPLETED) def on_epoch_completed(engine): logger.debug('Finished Epoch...') update_hparams(engine) engine.state.scheduler.step() evaluator.run(val_loader) # for res_type in evaluator.state.coco_evaluator.iou_types: # average_precision_05 = evaluator.state.coco_evaluator.coco_eval[res_type].stats[1] # writer.add_scalar("validation-{}/average precision 0_5".format(res_type), average_precision_05, # engine.state.iteration) checkpoint_path = os.path.join(output_dir, 'model_epoch_{}.pth'.format(engine.state.epoch)) print('Saving model checkpoint') checkpoint = { 
'model': model.state_dict(), 'optimizer': engine.state.optimizer.state_dict(), 'lr_scheduler': engine.state.scheduler.state_dict(), 'epoch': engine.state.epoch, 'iteration': engine.state.iteration, 'configuration': configuration_data, 'labels_enumeration': labels_enum} utils.save_on_master(checkpoint, checkpoint_path) print('Model checkpoint from epoch {} was saved at {}'.format(engine.state.epoch, checkpoint_path)) checkpoint = None evaluator.state = State() @trainer.on(Events.COMPLETED) def on_training_completed(engine): logger.debug('Finished Training...') update_hparams(engine, finished=True) writer.add_hparams(hparam_dict=hparam_dict, metric_dict={ 'hparams/AP': coco_ap.ap, 'hparams/AP.5': coco_ap_05.ap5, 'hparams/AP.75': coco_ap_075.ap75 }) logger.debug('Wrote hparams...') def update_hparams(engine, finished=False): hparam_dict['total_iterations'] = global_step_from_engine(engine)(engine, Events.ITERATION_COMPLETED) hparam_dict['total_epochs'] = global_step_from_engine(engine)(engine, Events.EPOCH_COMPLETED) hparam_dict['timeout'] = not finished if hparam_dict['train_set_size'] is None: hparam_dict['train_set_size'] = hparam_dict['training_set_size'] try: shutil.copyfile(os.path.join(output_dir, 'hparams.pickle'), os.path.join(output_dir, 'hparams.pickle.backup')) with open(os.path.join(output_dir, 'hparams.pickle'), 'wb') as f: pickle.dump(hparam_dict, f) except AttributeError as e: print('Could not pickle one of the total vars.', e) os.replace(os.path.join(output_dir, 'hparams.pickle.backup'), os.path.join(output_dir, 'hparams.pickle')) @evaluator.on(Events.STARTED) def on_evaluation_started(engine): logger.debug('Started Evaluation...') model.eval() # engine.state.coco_evaluator = CocoEvaluator(coco_api_val_dataset, iou_types) @evaluator.on(Events.ITERATION_COMPLETED) def on_eval_iteration_completed(engine): images, targets, results = engine.state.output if engine.state.iteration % log_interval == 0: print("Evaluation: Iteration: {}".format(engine.state.iteration)) if engine.state.iteration % debug_images_interval == 0: for n, debug_image in enumerate(draw_debug_images(images, targets, results)): print('Drawing debug image "validation/image_{}_{}"'.format(engine.state.iteration, n)) writer.add_image("evaluation/image_{}_{}".format(engine.state.iteration, n), debug_image, trainer.state.iteration, dataformats='HWC') if 'masks' in targets[n]: writer.add_image("validation/image_{}_{}_mask".format(engine.state.iteration, n), draw_mask(targets[n]), trainer.state.iteration, dataformats='HW') curr_image_id = int(targets[n]['image_id']) writer.add_image("validation/image_{}_{}_predicted_mask".format(engine.state.iteration, n), draw_mask(results[curr_image_id]).squeeze(), trainer.state.iteration, dataformats='HW') images = targets = results = engine.state.output = None @evaluator.on(Events.COMPLETED) def on_evaluation_completed(engine): logger.debug('Finished Evaluation...') # gather the stats from all processes # engine.state.coco_evaluator.synchronize_between_processes() # # # accumulate predictions from all images # engine.state.coco_evaluator.accumulate() # engine.state.coco_evaluator.summarize() # # pr_50, pr_75 = get_pr_levels(engine.state.coco_evaluator.coco_eval['bbox']) # TODO Bring this back # writer.add_hparams(hparam_dict, { # 'hparams/AP.5': np.mean(pr_50), # 'hparams/AP.75': np.mean(pr_75) # }) logger.debug('Running Trainer...') trainer.run(train_loader, max_epochs=epochs) writer.close() profiler.write_results('{}/time_profiling.csv'.format(output_dir))
def run(train_loader, val_loader, epochs, lr, momentum, weight_decay, lr_step, k1, k2, es_patience, log_dir): model = Vgg16() device = 'cpu' if torch.cuda.is_available(): device = 'cuda' model.to(device) optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay) lr_scheduler = ExponentialLR(optimizer, gamma=0.975) # criterion = VAELoss(k1=k1, k2=k2).to(device) def update_fn(engine, batch): x, y = _prepare_batch(batch, device=device, non_blocking=True) model.train() optimizer.zero_grad() output = model(x) # Compute loss loss = F.nll_loss(output, y) loss.backward() optimizer.step() return { "batchloss": loss.item(), } trainer = Engine(update_fn) try: GpuInfo().attach(trainer) except RuntimeError: print( "INFO: By default, in this example it is possible to log GPU information (used memory, utilization). " "As there is no pynvml python package installed, GPU information won't be logged. Otherwise, please " "install it : `pip install pynvml`") trainer.add_event_handler(Events.ITERATION_COMPLETED(every=lr_step), lambda engine: lr_scheduler.step()) metric_names = [ 'batchloss', ] def output_transform(x, name): return x[name] for n in metric_names: # We compute running average values on the output (batch loss) across all devices RunningAverage(output_transform=partial(output_transform, name=n), epoch_bound=False, device=device).attach(trainer, n) exp_name = datetime.now().strftime("%Y%m%d-%H%M%S") log_path = log_dir + "/vgg_vae/{}".format(exp_name) tb_logger = TensorboardLogger(log_dir=log_path) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=metric_names), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer, "lr"), event_name=Events.ITERATION_STARTED) ProgressBar(persist=True, bar_format="").attach(trainer, event_name=Events.EPOCH_STARTED, closing_event_name=Events.COMPLETED) ProgressBar(persist=False, bar_format="").attach(trainer, metric_names=metric_names) # val process definition def loss_output_transform(output): return output def acc_output_transform(output): return output customed_loss = Loss(loss_fn=F.nll_loss, output_transform=loss_output_transform, device=device) customed_accuracy = Accuracy(output_transform=acc_output_transform, device=device) metrics = {'Loss': customed_loss, 'Accuracy': customed_accuracy} def val_update_fn(engine, batch): model.eval() with torch.no_grad(): x, y = _prepare_batch(batch, device=device, non_blocking=True) output = model(x) return output, y val_evaluator = Engine(val_update_fn) for name, metric in metrics.items(): metric.attach(val_evaluator, name) def run_evaluation(engine): val_evaluator.run(val_loader) trainer.add_event_handler(Events.EPOCH_COMPLETED, run_evaluation) trainer.add_event_handler(Events.COMPLETED, run_evaluation) ProgressBar(persist=False, desc="Train evaluation").attach(val_evaluator) # Log val metrics: tb_logger.attach(val_evaluator, log_handler=OutputHandler(tag="val", metric_names=list( metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) # trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) # Store the best model def default_score_fn(engine): score = engine.state.metrics['Accuracy'] return score best_model_handler = ModelCheckpoint(dirname=log_path, filename_prefix="best", n_saved=3, score_name="val_acc", score_function=default_score_fn) val_evaluator.add_event_handler(Events.COMPLETED, best_model_handler, { 'model': model, }) # Add early stopping es_patience = 
es_patience es_handler = EarlyStopping(patience=es_patience, score_function=default_score_fn, trainer=trainer) val_evaluator.add_event_handler(Events.COMPLETED, es_handler) setup_logger(es_handler._logger) setup_logger(logging.getLogger("ignite.engine.engine.Engine")) def empty_cuda_cache(engine): torch.cuda.empty_cache() import gc gc.collect() trainer.add_event_handler(Events.EPOCH_COMPLETED, empty_cuda_cache) val_evaluator.add_event_handler(Events.COMPLETED, empty_cuda_cache) trainer.run(train_loader, max_epochs=epochs)
def run(output_path, config): device = "cuda" local_rank = config["local_rank"] distributed = backend is not None if distributed: torch.cuda.set_device(local_rank) device = "cuda" rank = dist.get_rank() if distributed else 0 torch.manual_seed(config["seed"] + rank) # Rescale batch_size and num_workers ngpus_per_node = torch.cuda.device_count() ngpus = dist.get_world_size() if distributed else 1 batch_size = config["batch_size"] // ngpus num_workers = int( (config["num_workers"] + ngpus_per_node - 1) / ngpus_per_node) train_loader, test_loader = get_train_test_loaders( path=config["data_path"], batch_size=batch_size, distributed=distributed, num_workers=num_workers, ) model = get_model(config["model"]) model = model.to(device) if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[ local_rank, ], output_device=local_rank, ) optimizer = optim.SGD( model.parameters(), lr=config["learning_rate"], momentum=config["momentum"], weight_decay=config["weight_decay"], nesterov=True, ) criterion = nn.CrossEntropyLoss().to(device) le = len(train_loader) milestones_values = [ (0, 0.0), (le * config["num_warmup_epochs"], config["learning_rate"]), (le * config["num_epochs"], 0.0), ] lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values) def _prepare_batch(batch, device, non_blocking): x, y = batch return ( convert_tensor(x, device=device, non_blocking=non_blocking), convert_tensor(y, device=device, non_blocking=non_blocking), ) def process_function(engine, batch): x, y = _prepare_batch(batch, device=device, non_blocking=True) model.train() # Supervised part y_pred = model(x) loss = criterion(y_pred, y) optimizer.zero_grad() loss.backward() optimizer.step() return { "batch loss": loss.item(), } trainer = Engine(process_function) train_sampler = train_loader.sampler if distributed else None to_save = { "trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler, } metric_names = [ "batch loss", ] common.setup_common_training_handlers( trainer, train_sampler=train_sampler, to_save=to_save, save_every_iters=config["checkpoint_every"], output_path=output_path, lr_scheduler=lr_scheduler, output_names=metric_names, with_pbar_on_iters=config["display_iters"], log_every_iters=10, ) if rank == 0: tb_logger = TensorboardLogger(log_dir=output_path) tb_logger.attach( trainer, log_handler=OutputHandler(tag="train", metric_names=metric_names), event_name=Events.ITERATION_COMPLETED, ) tb_logger.attach( trainer, log_handler=OptimizerParamsHandler(optimizer, param_name="lr"), event_name=Events.ITERATION_STARTED, ) metrics = { "accuracy": Accuracy(device=device if distributed else None), "loss": Loss(criterion, device=device if distributed else None), } evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) def run_validation(engine): torch.cuda.synchronize() train_evaluator.run(train_loader) evaluator.run(test_loader) trainer.add_event_handler( Events.EPOCH_STARTED(every=config["validate_every"]), run_validation) trainer.add_event_handler(Events.COMPLETED, run_validation) if rank == 0: if config["display_iters"]: ProgressBar(persist=False, desc="Train evaluation").attach(train_evaluator) ProgressBar(persist=False, desc="Test evaluation").attach(evaluator) tb_logger.attach( train_evaluator, log_handler=OutputHandler( tag="train", metric_names=list(metrics.keys()), 
global_step_transform=global_step_from_engine(trainer), ), event_name=Events.COMPLETED, ) tb_logger.attach( evaluator, log_handler=OutputHandler( tag="test", metric_names=list(metrics.keys()), global_step_transform=global_step_from_engine(trainer), ), event_name=Events.COMPLETED, ) # Store the best model by validation accuracy: common.save_best_model_by_val_score( output_path, evaluator, model=model, metric_name="accuracy", n_saved=3, trainer=trainer, tag="test", ) if config["log_model_grads_every"] is not None: tb_logger.attach( trainer, log_handler=GradsHistHandler(model, tag=model.__class__.__name__), event_name=Events.ITERATION_COMPLETED( every=config["log_model_grads_every"]), ) if config["crash_iteration"] is not None: @trainer.on(Events.ITERATION_STARTED(once=config["crash_iteration"])) def _(engine): raise Exception("STOP at iteration: {}".format( engine.state.iteration)) resume_from = config["resume_from"] if resume_from is not None: checkpoint_fp = Path(resume_from) assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format( checkpoint_fp.as_posix()) print("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix())) checkpoint = torch.load(checkpoint_fp.as_posix()) Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint) try: trainer.run(train_loader, max_epochs=config["num_epochs"]) except Exception as e: import traceback print(traceback.format_exc()) if rank == 0: tb_logger.close()
def run(output_path, config): device = "cuda" batch_size = config['batch_size'] train_labelled_loader, train_unlabelled_loader, test_loader = \ get_train_test_loaders(dataset_name=config['dataset'], num_labelled_samples=config['num_labelled_samples'], path=config['data_path'], batch_size=batch_size, unlabelled_batch_size=config.get('unlabelled_batch_size', None), num_workers=config['num_workers']) model = get_model(config['model']) model = model.to(device) optimizer = optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=config['momentum'], weight_decay=config['weight_decay'], nesterov=True) with_SWA = config['with_SWA'] if with_SWA: optimizer = torchcontrib.optim.SWA(optimizer) criterion = nn.CrossEntropyLoss().to(device) if config['consistency_criterion'] == "MSE": consistency_criterion = nn.MSELoss() elif config['consistency_criterion'] == "KL": consistency_criterion = nn.KLDivLoss(reduction='batchmean') else: raise RuntimeError("Unknown consistency criterion {}".format( config['consistency_criterion'])) consistency_criterion = consistency_criterion.to(device) le = len(train_labelled_loader) num_train_steps = le * config['num_epochs'] mlflow.log_param("num train steps", num_train_steps) lr = config['learning_rate'] eta_min = lr * config['min_lr_ratio'] num_warmup_steps = config['num_warmup_steps'] lr_scheduler = CosineAnnealingLR(optimizer, eta_min=eta_min, T_max=num_train_steps - num_warmup_steps) if num_warmup_steps > 0: lr_scheduler = create_lr_scheduler_with_warmup( lr_scheduler, warmup_start_value=0.0, warmup_end_value=lr * (1.0 + 1.0 / num_warmup_steps), warmup_duration=num_warmup_steps) def _prepare_batch(batch, device, non_blocking): x, y = batch return (convert_tensor(x, device=device, non_blocking=non_blocking), convert_tensor(y, device=device, non_blocking=non_blocking)) def cycle(iterable): while True: for i in iterable: yield i train_unlabelled_loader_iter = cycle(train_unlabelled_loader) lam = config['consistency_lambda'] tsa = TrainingSignalAnnealing(num_steps=num_train_steps, min_threshold=config['TSA_proba_min'], max_threshold=config['TSA_proba_max']) with_tsa = config['with_TSA'] with_UDA = not config['no_UDA'] def uda_process_function(engine, labelled_batch): x, y = _prepare_batch(labelled_batch, device=device, non_blocking=True) if with_UDA: unsup_x, unsup_aug_x = next(train_unlabelled_loader_iter) unsup_x = convert_tensor(unsup_x, device=device, non_blocking=True) unsup_aug_x = convert_tensor(unsup_aug_x, device=device, non_blocking=True) model.train() # Supervised part y_pred = model(x) loss = criterion(y_pred, y) supervised_loss = loss step = engine.state.iteration - 1 if with_tsa and with_UDA: new_y_pred, new_y = tsa(y_pred, y, step=step) new_loss = criterion(new_y_pred, new_y) engine.state.tsa_log = { "new_y_pred": new_y_pred, "loss": loss.item(), "tsa_loss": new_loss.item() } supervised_loss = new_loss # Unsupervised part if with_UDA: unsup_orig_y_pred = model(unsup_x).detach() unsup_orig_y_probas = torch.softmax(unsup_orig_y_pred, dim=-1) unsup_aug_y_pred = model(unsup_aug_x) unsup_aug_y_probas = torch.log_softmax(unsup_aug_y_pred, dim=-1) consistency_loss = consistency_criterion(unsup_aug_y_probas, unsup_orig_y_probas) final_loss = supervised_loss if with_UDA: final_loss += lam * consistency_loss optimizer.zero_grad() final_loss.backward() optimizer.step() return { 'supervised batch loss': supervised_loss.item(), 'consistency batch loss': consistency_loss.item() if with_UDA else 0.0, 'final batch loss': final_loss.item(), } trainer = 
Engine(uda_process_function) if with_UDA and with_tsa: @trainer.on(Events.ITERATION_COMPLETED) def log_tsa(engine): step = engine.state.iteration - 1 if step % 50 == 0: mlflow.log_metric("TSA threshold", tsa.thresholds[step].item(), step=step) mlflow.log_metric("TSA selection", engine.state.tsa_log['new_y_pred'].shape[0], step=step) mlflow.log_metric("Original X Loss", engine.state.tsa_log['loss'], step=step) mlflow.log_metric("TSA X Loss", engine.state.tsa_log['tsa_loss'], step=step) if not hasattr(lr_scheduler, "step"): trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler) else: trainer.add_event_handler(Events.ITERATION_STARTED, lambda engine: lr_scheduler.step()) @trainer.on(Events.ITERATION_STARTED) def log_learning_rate(engine): step = engine.state.iteration - 1 if step % 50 == 0: lr = optimizer.param_groups[0]['lr'] mlflow.log_metric("learning rate", lr, step=step) if with_SWA: @trainer.on(Events.COMPLETED) def swap_swa_sgd(engine): optimizer.swap_swa_sgd() optimizer.bn_update(train_labelled_loader, model) @trainer.on(Events.EPOCH_COMPLETED) def update_swa(engine): if engine.state.epoch - 1 > int(num_epochs * 0.75): optimizer.update_swa() metric_names = [ 'supervised batch loss', 'consistency batch loss', 'final batch loss' ] def output_transform(x, name): return x[name] for n in metric_names: RunningAverage(output_transform=partial(output_transform, name=n), epoch_bound=False).attach(trainer, n) ProgressBar(persist=True, bar_format="").attach(trainer, event_name=Events.EPOCH_STARTED, closing_event_name=Events.COMPLETED) tb_logger = TensorboardLogger(log_dir=output_path) tb_logger.attach(trainer, log_handler=tbOutputHandler(tag="train", metric_names=[ 'final batch loss', 'consistency batch loss', 'supervised batch loss' ]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=tbOptimizerParamsHandler(optimizer, param_name="lr"), event_name=Events.ITERATION_STARTED) metrics = { "accuracy": Accuracy(), } evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True) def run_validation(engine, val_interval): if (engine.state.epoch - 1) % val_interval == 0: train_evaluator.run(train_labelled_loader) evaluator.run(test_loader) trainer.add_event_handler(Events.EPOCH_STARTED, run_validation, val_interval=2) trainer.add_event_handler(Events.COMPLETED, run_validation, val_interval=1) tb_logger.attach(train_evaluator, log_handler=tbOutputHandler(tag="train", metric_names=list( metrics.keys()), another_engine=trainer), event_name=Events.COMPLETED) tb_logger.attach(evaluator, log_handler=tbOutputHandler(tag="test", metric_names=list( metrics.keys()), another_engine=trainer), event_name=Events.COMPLETED) def mlflow_batch_metrics_logging(engine, tag): step = trainer.state.iteration for name, value in engine.state.metrics.items(): mlflow.log_metric("{} {}".format(tag, name), value, step=step) def mlflow_val_metrics_logging(engine, tag): step = trainer.state.epoch for name in metrics.keys(): value = engine.state.metrics[name] mlflow.log_metric("{} {}".format(tag, name), value, step=step) trainer.add_event_handler(Events.ITERATION_COMPLETED, mlflow_batch_metrics_logging, "train") train_evaluator.add_event_handler(Events.COMPLETED, mlflow_val_metrics_logging, "train") evaluator.add_event_handler(Events.COMPLETED, mlflow_val_metrics_logging, "test") trainer.run(train_labelled_loader, max_epochs=config['num_epochs'])
def run(output_path, config): distributed = dist.is_available() and dist.is_initialized() rank = dist.get_rank() if distributed else 0 manual_seed(config["seed"] + rank) # Setup dataflow, model, optimizer, criterion train_loader, test_loader = utils.get_dataflow(config, distributed) model, optimizer = utils.get_model_optimizer(config, distributed) criterion = nn.CrossEntropyLoss().to(utils.device) le = len(train_loader) milestones_values = [ (0, 0.0), (le * config["num_warmup_epochs"], config["learning_rate"]), (le * config["num_epochs"], 0.0), ] lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values) # Setup Ignite trainer: # - let's define training step # - add other common handlers: # - TerminateOnNan, # - handler to setup learning rate scheduling, # - ModelCheckpoint # - RunningAverage` on `train_step` output # - Two progress bars on epochs and optionally on iterations def train_step(engine, batch): x = convert_tensor(batch[0], device=utils.device, non_blocking=True) y = convert_tensor(batch[1], device=utils.device, non_blocking=True) model.train() # Supervised part y_pred = model(x) loss = criterion(y_pred, y) optimizer.zero_grad() loss.backward() optimizer.step() return { "batch loss": loss.item(), } if config["deterministic"] and rank == 0: print("Setup deterministic trainer") trainer = Engine(train_step) if not config["deterministic"] else DeterministicEngine(train_step) train_sampler = train_loader.sampler if distributed else None to_save = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler} metric_names = [ "batch loss", ] common.setup_common_training_handlers( trainer, train_sampler=train_sampler, to_save=to_save, save_every_iters=config["checkpoint_every"], output_path=output_path, lr_scheduler=lr_scheduler, output_names=metric_names, with_pbar_on_iters=config["display_iters"], log_every_iters=10, ) if rank == 0: # Setup Tensorboard logger - wrapper on SummaryWriter tb_logger = TensorboardLogger(log_dir=output_path) # Attach logger to the trainer and log trainer's metrics (stored in trainer.state.metrics) every iteration tb_logger.attach( trainer, log_handler=OutputHandler(tag="train", metric_names=metric_names), event_name=Events.ITERATION_COMPLETED, ) # log optimizer's parameters: "lr" every iteration tb_logger.attach( trainer, log_handler=OptimizerParamsHandler(optimizer, param_name="lr"), event_name=Events.ITERATION_STARTED ) # Let's now setup evaluator engine to perform model's validation and compute metrics metrics = { "accuracy": Accuracy(device=utils.device if distributed else None), "loss": Loss(criterion, device=utils.device if distributed else None), } # We define two evaluators as they wont have exactly similar roles: # - `evaluator` will save the best model based on validation score evaluator = create_supervised_evaluator(model, metrics=metrics, device=utils.device, non_blocking=True) train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=utils.device, non_blocking=True) def run_validation(engine): train_evaluator.run(train_loader) evaluator.run(test_loader) trainer.add_event_handler(Events.EPOCH_STARTED(every=config["validate_every"]), run_validation) trainer.add_event_handler(Events.COMPLETED, run_validation) if rank == 0: # Setup progress bar on evaluation engines if config["display_iters"]: ProgressBar(persist=False, desc="Train evaluation").attach(train_evaluator) ProgressBar(persist=False, desc="Test evaluation").attach(evaluator) # Let's log metrics of 
`train_evaluator` stored in `train_evaluator.state.metrics` when validation run is done tb_logger.attach( train_evaluator, log_handler=OutputHandler( tag="train", metric_names="all", global_step_transform=global_step_from_engine(trainer) ), event_name=Events.COMPLETED, ) # Let's log metrics of `evaluator` stored in `evaluator.state.metrics` when validation run is done tb_logger.attach( evaluator, log_handler=OutputHandler( tag="test", metric_names="all", global_step_transform=global_step_from_engine(trainer) ), event_name=Events.COMPLETED, ) # Store 3 best models by validation accuracy: common.save_best_model_by_val_score( output_path, evaluator, model=model, metric_name="accuracy", n_saved=3, trainer=trainer, tag="test" ) # Optionally log model gradients if config["log_model_grads_every"] is not None: tb_logger.attach( trainer, log_handler=GradsHistHandler(model, tag=model.__class__.__name__), event_name=Events.ITERATION_COMPLETED(every=config["log_model_grads_every"]), ) # In order to check training resuming we can emulate a crash if config["crash_iteration"] is not None: @trainer.on(Events.ITERATION_STARTED(once=config["crash_iteration"])) def _(engine): raise Exception("STOP at iteration: {}".format(engine.state.iteration)) resume_from = config["resume_from"] if resume_from is not None: checkpoint_fp = Path(resume_from) assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(checkpoint_fp.as_posix()) print("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix())) checkpoint = torch.load(checkpoint_fp.as_posix()) Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint) try: trainer.run(train_loader, max_epochs=config["num_epochs"]) except Exception as e: import traceback print(traceback.format_exc()) if rank == 0: tb_logger.close()
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED, partial(reduce_on_plateau, scheduler=scheduler)
    )

    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    pbar = ProgressBar(persist=False, bar_format=None)
    pbar.attach(trainer, ["loss"])

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED, lambda e: evaluator.run(valid_data_iter)
    )

    log_dir = config.pop("results_path")
    if Path(log_dir).exists():
        rmtree(log_dir)
    tb_logger = TensorboardLogger(log_dir=log_dir)
    tb_logger.attach(
        trainer,
        log_handler=OutputHandler(tag="training", metric_names=["loss"]),
        event_name=Events.EPOCH_COMPLETED,
    )
    tb_logger.attach(
        evaluator,
        log_handler=OutputHandler(
            tag="validation",
            metric_names=["loss", "ppl", "bleu", "lr"],
            another_engine=trainer,
        ),
        event_name=Events.EPOCH_COMPLETED,
    )
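Note that the `another_engine` argument of OutputHandler has been deprecated in Ignite in favour of `global_step_transform`. Assuming the `tb_logger`, `trainer`, `evaluator` and `Events` from the snippet above, an equivalent attachment on a recent Ignite version would look roughly like this sketch:

# Sketch: same validation logging as above, using global_step_transform
# (the replacement for the deprecated another_engine argument).
from ignite.contrib.handlers import global_step_from_engine
from ignite.contrib.handlers.tensorboard_logger import OutputHandler

tb_logger.attach(
    evaluator,
    log_handler=OutputHandler(
        tag="validation",
        metric_names=["loss", "ppl", "bleu", "lr"],
        global_step_transform=global_step_from_engine(trainer),
    ),
    event_name=Events.EPOCH_COMPLETED,
)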
def run(args):
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)

    num_classes = 21
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model = GoogLeNetFCN(num_classes)
    model.init_from_googlenet()

    device_count = torch.cuda.device_count()
    if device_count > 1:
        print("Using %d GPU(s)" % device_count)
        model = nn.DataParallel(model)
        args.batch_size = device_count * args.batch_size
        args.val_batch_size = device_count * args.val_batch_size

    model = model.to(device)

    train_loader, val_loader = get_data_loaders(args.dataset_dir, args.batch_size, args.val_batch_size,
                                                args.num_workers, args.download, args.augmentations)

    criterion = nn.CrossEntropyLoss(ignore_index=255, reduction='sum')

    # Biases get double the learning rate and no weight decay
    optimizer = optim.SGD([{'params': [param for name, param in model.named_parameters()
                                       if name.endswith('weight')]},
                           {'params': [param for name, param in model.named_parameters()
                                       if name.endswith('bias')],
                            'lr': args.lr * 2, 'weight_decay': 0}],
                          lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    if args.resume:
        if os.path.isfile(args.resume):
            print("Loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_iou = checkpoint['bestIoU']
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("Loaded checkpoint '{}' (Epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("No checkpoint found at '{}'".format(args.resume))
            sys.exit()

    if args.freeze_bn:
        print("Freezing batch norm")
        model = freeze_batchnorm(model)

    trainer = create_supervised_trainer(model, optimizer, criterion, device, non_blocking=True)

    RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')

    # attach progress bar
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=['loss'])

    cm = ConfusionMatrix(num_classes)
    evaluator = create_supervised_evaluator(model, metrics={'loss': Loss(criterion), 'IoU': IoU(cm)},
                                            device=device, non_blocking=True)

    pbar2 = ProgressBar(persist=True, desc='Eval Epoch')
    pbar2.attach(evaluator)

    def _global_step_transform(engine, event_name):
        return trainer.state.iteration

    tb_logger = TensorboardLogger(args.log_dir)
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag='training', metric_names=['loss']),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.ITERATION_STARTED)
    tb_logger.attach(trainer,
                     log_handler=WeightsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(evaluator,
                     log_handler=OutputHandler(tag='validation',
                                               metric_names=['loss', 'IoU'],
                                               global_step_transform=_global_step_transform),
                     event_name=Events.EPOCH_COMPLETED)

    @evaluator.on(Events.EPOCH_COMPLETED)
    def save_checkpoint(engine):
        iou = engine.state.metrics['IoU'] * 100.0
        mean_iou = iou.mean()
        is_best = mean_iou.item() > trainer.state.best_iou
        trainer.state.best_iou = max(mean_iou.item(), trainer.state.best_iou)

        name = 'epoch{}_mIoU={:.1f}.pth'.format(trainer.state.epoch, mean_iou)
        file = {'model': model.state_dict(), 'epoch': trainer.state.epoch,
                'iteration': engine.state.iteration, 'optimizer': optimizer.state_dict(),
                'args': args, 'bestIoU': trainer.state.best_iou}

        save(file, args.output_dir, 'checkpoint_{}'.format(name))
        if is_best:
            save(model.state_dict(), args.output_dir, 'model_{}'.format(name))

    @trainer.on(Events.STARTED)
    def initialize(engine):
        if args.resume:
            engine.state.epoch = args.start_epoch
            engine.state.iteration = args.start_epoch * len(engine.state.dataloader)
            engine.state.best_iou = best_iou
        else:
            engine.state.best_iou = 0.0

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        pbar.log_message("Start Validation - Epoch: [{}/{}]".format(engine.state.epoch, engine.state.max_epochs))
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        loss = metrics['loss']
        iou = metrics['IoU']
        mean_iou = iou.mean()
        pbar.log_message("Validation results - Epoch: [{}/{}]: Loss: {:.2e}, mIoU: {:.1f}"
                         .format(engine.state.epoch, engine.state.max_epochs, loss, mean_iou * 100.0))

    print("Start training")
    trainer.run(train_loader, max_epochs=args.epochs)
    tb_logger.close()
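Instead of calling `iou.mean()` by hand in the checkpoint and logging handlers, Ignite also provides an `mIoU` metric derived from the same confusion matrix. A minimal sketch, reusing `num_classes`, `model`, `criterion` and `device` from the snippet above:

# Sketch: let Ignite compute mean IoU from the confusion matrix,
# so evaluator.state.metrics['mIoU'] is already a scalar.
from ignite.metrics import ConfusionMatrix, IoU, Loss, mIoU

cm = ConfusionMatrix(num_classes)
evaluator = create_supervised_evaluator(
    model,
    metrics={'loss': Loss(criterion), 'IoU': IoU(cm), 'mIoU': mIoU(cm)},
    device=device, non_blocking=True)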
filename_prefix="best", n_saved=1, score_name="accuracy", score_function=Checkpoint.get_default_score_fn("accuracy"), ) evaluator.add_event_handler(Events.COMPLETED, best_model_handler) # ### Setting up TensorBoard as an experiment tracking system tb_logger = TensorboardLogger(log_dir=output_path) # Attach handler to plot trainer's loss every 100 iterations tb_logger.attach_output_handler( trainer, event_name=Events.ITERATION_COMPLETED(every=100), tag="training", output_transform=lambda loss: {"batch_loss": loss}, ) # Attach handler for plotting both evaluators' metrics after every epoch completes tb_logger.attach_output_handler( evaluator, event_name=Events.EPOCH_COMPLETED, tag="validation", metric_names="all",
def run(output_path, config):
    device = "cuda"
    local_rank = config['local_rank']
    distributed = backend is not None
    if distributed:
        torch.cuda.set_device(local_rank)
        device = "cuda"
    rank = dist.get_rank() if distributed else 0

    # Rescale batch_size and num_workers
    ngpus_per_node = torch.cuda.device_count()
    ngpus = dist.get_world_size() if distributed else 1
    batch_size = config['batch_size'] // ngpus
    num_workers = int((config['num_workers'] + ngpus_per_node - 1) / ngpus_per_node)

    train_labelled_loader, test_loader = \
        get_train_test_loaders(path=config['data_path'],
                               batch_size=batch_size,
                               distributed=distributed,
                               num_workers=num_workers)

    model = get_model(config['model'])
    model = model.to(device)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[local_rank, ],
                                                          output_device=local_rank)

    optimizer = optim.SGD(model.parameters(),
                          lr=config['learning_rate'],
                          momentum=config['momentum'],
                          weight_decay=config['weight_decay'],
                          nesterov=True)

    criterion = nn.CrossEntropyLoss().to(device)

    le = len(train_labelled_loader)
    milestones_values = [(0, 0.0),
                         (le * config['num_warmup_epochs'], config['learning_rate']),
                         (le * config['num_epochs'], 0.0)]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr",
                                   milestones_values=milestones_values)

    def _prepare_batch(batch, device, non_blocking):
        x, y = batch
        return (convert_tensor(x, device=device, non_blocking=non_blocking),
                convert_tensor(y, device=device, non_blocking=non_blocking))

    def process_function(engine, labelled_batch):
        x, y = _prepare_batch(labelled_batch, device=device, non_blocking=True)

        model.train()
        # Supervised part
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return {
            'batch loss': loss.item(),
        }

    trainer = Engine(process_function)

    if not hasattr(lr_scheduler, "step"):
        trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)
    else:
        trainer.add_event_handler(Events.ITERATION_COMPLETED, lambda engine: lr_scheduler.step())

    metric_names = [
        'batch loss',
    ]

    def output_transform(x, name):
        return x[name]

    for n in metric_names:
        # We compute running average values on the output (batch loss) across all devices
        RunningAverage(output_transform=partial(output_transform, name=n),
                       epoch_bound=False, device=device).attach(trainer, n)

    if rank == 0:
        checkpoint_handler = ModelCheckpoint(dirname=output_path,
                                             filename_prefix="checkpoint")
        trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000),
                                  checkpoint_handler,
                                  {'model': model, 'optimizer': optimizer})

        ProgressBar(persist=True, bar_format="").attach(trainer,
                                                        event_name=Events.EPOCH_STARTED,
                                                        closing_event_name=Events.COMPLETED)
        if config['display_iters']:
            ProgressBar(persist=False, bar_format="").attach(trainer,
                                                             metric_names=metric_names)

        tb_logger = TensorboardLogger(log_dir=output_path)
        tb_logger.attach(trainer,
                         log_handler=tbOutputHandler(tag="train",
                                                     metric_names=metric_names),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=tbOptimizerParamsHandler(optimizer, param_name="lr"),
                         event_name=Events.ITERATION_STARTED)

    metrics = {
        "accuracy": Accuracy(device=device if distributed else None),
        "loss": Loss(criterion, device=device if distributed else None)
    }

    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def run_validation(engine):
        torch.cuda.synchronize()
        train_evaluator.run(train_labelled_loader)
        evaluator.run(test_loader)

    trainer.add_event_handler(Events.EPOCH_STARTED(every=3), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    if rank == 0:
        if config['display_iters']:
            ProgressBar(persist=False, desc="Train evaluation").attach(train_evaluator)
            ProgressBar(persist=False, desc="Test evaluation").attach(evaluator)

        tb_logger.attach(train_evaluator,
                         log_handler=tbOutputHandler(tag="train",
                                                     metric_names=list(metrics.keys()),
                                                     another_engine=trainer),
                         event_name=Events.COMPLETED)
        tb_logger.attach(evaluator,
                         log_handler=tbOutputHandler(tag="test",
                                                     metric_names=list(metrics.keys()),
                                                     another_engine=trainer),
                         event_name=Events.COMPLETED)

        # Store the best model
        def default_score_fn(engine):
            score = engine.state.metrics['accuracy']
            return score

        score_function = default_score_fn if not hasattr(config, "score_function") else config.score_function

        best_model_handler = ModelCheckpoint(dirname=output_path,
                                             filename_prefix="best",
                                             n_saved=3,
                                             global_step_transform=global_step_from_engine(trainer),
                                             score_name="val_accuracy",
                                             score_function=score_function)
        evaluator.add_event_handler(Events.COMPLETED, best_model_handler, {'model': model})

    trainer.run(train_labelled_loader, max_epochs=config['num_epochs'])

    if rank == 0:
        tb_logger.close()
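On a reasonably recent Ignite version, the hand-written `default_score_fn` above can be replaced by the built-in helper `Checkpoint.get_default_score_fn`. A sketch, reusing `output_path`, `trainer`, `evaluator`, `model` and `global_step_from_engine` from the snippet above:

# Sketch: use Ignite's helper instead of a hand-written accuracy score function.
from ignite.handlers import Checkpoint, ModelCheckpoint

best_model_handler = ModelCheckpoint(
    dirname=output_path,
    filename_prefix="best",
    n_saved=3,
    global_step_transform=global_step_from_engine(trainer),
    score_name="val_accuracy",
    score_function=Checkpoint.get_default_score_fn("accuracy"),
)
evaluator.add_event_handler(Events.COMPLETED, best_model_handler, {"model": model})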
def run(args):
    train_loader, val_loader = get_data_loaders(args.dataset_dir, args.batch_size, args.val_batch_size,
                                                args.num_workers)

    if args.seed is not None:
        torch.manual_seed(args.seed)

    num_classes = CityscapesDataset.num_classes()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model = GoogLeNetFCN(num_classes)
    model.init_from_googlenet()

    device_count = torch.cuda.device_count()
    if device_count > 1:
        print("Using %d GPU(s)" % device_count)
        model = nn.DataParallel(model)
        args.batch_size = device_count * args.batch_size
        args.val_batch_size = device_count * args.val_batch_size

    model = model.to(device)

    criterion = nn.CrossEntropyLoss(ignore_index=255)

    # Split parameters into weights and biases: biases get double the learning rate and no weight decay.
    # Note: named_parameters() yields (name, parameter) pairs.
    optimizer = optim.SGD([{'params': [param for name, param in model.named_parameters()
                                       if not name.endswith('bias')],
                            'lr': args.lr, 'weight_decay': 5e-4},
                           {'params': [param for name, param in model.named_parameters()
                                       if name.endswith('bias')],
                            'lr': args.lr * 2}],
                          momentum=args.momentum, lr=args.lr)

    if args.resume:
        if os.path.isfile(args.resume):
            print("Loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("Loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("No checkpoint found at '{}'".format(args.resume))

    trainer = create_supervised_trainer(model, optimizer, criterion, device, non_blocking=True)

    RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')

    # attach progress bar
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=['loss'])

    cm = ConfusionMatrix(num_classes)
    evaluator = create_supervised_evaluator(model,
                                            metrics={'loss': Loss(criterion), 'IoU': IoU(cm, ignore_index=0)},
                                            device=device, non_blocking=True)

    pbar2 = ProgressBar(persist=True, desc='Eval Epoch')
    pbar2.attach(evaluator)

    def _global_step_transform(engine, event_name):
        return trainer.state.iteration

    tb_logger = TensorboardLogger(args.log_dir)
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag='training', metric_names=['loss']),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(evaluator,
                     log_handler=OutputHandler(tag='validation',
                                               metric_names=['loss', 'IoU'],
                                               global_step_transform=_global_step_transform),
                     event_name=Events.EPOCH_COMPLETED)

    @evaluator.on(Events.EPOCH_COMPLETED)
    def save_checkpoint(engine):
        iou = engine.state.metrics['IoU'] * 100.0
        mean_iou = iou.mean()
        name = 'epoch{}_mIoU={:.1f}.pth'.format(trainer.state.epoch, mean_iou)
        file = {'model': model.state_dict(), 'epoch': trainer.state.epoch,
                'optimizer': optimizer.state_dict(), 'args': args}

        torch.save(file, os.path.join(args.output_dir, 'checkpoint_{}'.format(name)))
        torch.save(model.state_dict(), os.path.join(args.output_dir, 'model_{}'.format(name)))

    @trainer.on(Events.STARTED)
    def initialize(engine):
        if args.resume:
            engine.state.epoch = args.start_epoch

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        pbar.log_message('Start Validation - Epoch: [{}/{}]'.format(engine.state.epoch, engine.state.max_epochs))
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        loss = metrics['loss']
        iou = metrics['IoU']
        mean_iou = iou.mean()
        pbar.log_message('Validation results - Epoch: [{}/{}]: Loss: {:.2e}, mIoU: {:.1f}'
                         .format(engine.state.epoch, engine.state.max_epochs, loss, mean_iou * 100.0))

    @trainer.on(Events.EXCEPTION_RAISED)
    def handle_exception(engine, e):
        engine.state.exception_raised = True
        if isinstance(e, KeyboardInterrupt) and (engine.state.iteration > 1):
            engine.terminate()
            warnings.warn("KeyboardInterrupt caught. Exiting gracefully.")

            name = 'epoch{}_exception.pth'.format(trainer.state.epoch)
            file = {'model': model.state_dict(), 'epoch': trainer.state.epoch,
                    'optimizer': optimizer.state_dict()}

            torch.save(file, os.path.join(args.output_dir, 'checkpoint_{}'.format(name)))
            torch.save(model.state_dict(), os.path.join(args.output_dir, 'model_{}'.format(name)))
        else:
            raise e

    print("Start training")
    trainer.run(train_loader, max_epochs=args.epochs)
    tb_logger.close()
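As a side note, the manual torch.save calls in `save_checkpoint` and `handle_exception` could also be delegated to Ignite's ModelCheckpoint handler. A minimal sketch, reusing `trainer`, `model`, `optimizer` and `args` from the snippet above:

# Sketch: periodic checkpointing with ModelCheckpoint instead of manual torch.save.
from ignite.handlers import ModelCheckpoint

checkpoint_handler = ModelCheckpoint(dirname=args.output_dir, filename_prefix='checkpoint',
                                     n_saved=2, require_empty=False)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                          {'model': model, 'optimizer': optimizer})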
def run(output_path, config):
    device = "cuda"
    batch_size = config['batch_size']

    train_loader, test_loader = get_train_test_loaders(dataset_name=config['dataset'],
                                                       path=config['data_path'],
                                                       batch_size=batch_size,
                                                       num_workers=config['num_workers'])

    model = get_model(config['model'])
    model = model.to(device)

    optim_fn = optim.SGD
    if config['with_layca']:
        optim_fn = LaycaSGD

    optimizer = optim_fn(model.parameters(), lr=0.0,
                         momentum=config['momentum'],
                         weight_decay=config['weight_decay'],
                         nesterov=True)
    criterion = nn.CrossEntropyLoss()

    le = len(train_loader)
    milestones_values = [(le * m, v) for m, v in config['lr_milestones_values']]
    scheduler = PiecewiseLinear(optimizer, "lr", milestones_values=milestones_values)

    def _prepare_batch(batch, device, non_blocking):
        x, y = batch
        return (convert_tensor(x, device=device, non_blocking=non_blocking),
                convert_tensor(y, device=device, non_blocking=non_blocking))

    def process_function(engine, batch):
        x, y = _prepare_batch(batch, device=device, non_blocking=True)

        model.train()
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return loss.item()

    trainer = Engine(process_function)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    RunningAverage(output_transform=lambda x: x, epoch_bound=False).attach(trainer, 'batchloss')
    ProgressBar(persist=True, bar_format="").attach(trainer,
                                                    event_name=Events.EPOCH_STARTED,
                                                    closing_event_name=Events.COMPLETED)

    tb_logger = TensorboardLogger(log_dir=output_path)
    tb_logger.attach(trainer,
                     log_handler=tbOutputHandler(tag="train", metric_names='all'),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=tbOptimizerParamsHandler(optimizer, param_name="lr"),
                     event_name=Events.ITERATION_STARTED)
    tb_logger.attach(trainer,
                     log_handler=LayerRotationStatsHandler(model),
                     event_name=Events.EPOCH_STARTED)

    metrics = {
        "accuracy": Accuracy(),
    }

    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def run_validation(engine, val_interval):
        if (engine.state.epoch - 1) % val_interval == 0:
            train_evaluator.run(train_loader)
            evaluator.run(test_loader)

    trainer.add_event_handler(Events.EPOCH_COMPLETED, run_validation, val_interval=2)
    trainer.add_event_handler(Events.COMPLETED, run_validation, val_interval=1)

    tb_logger.attach(train_evaluator,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names='all',
                                                 another_engine=trainer),
                     event_name=Events.COMPLETED)
    tb_logger.attach(evaluator,
                     log_handler=tbOutputHandler(tag="test",
                                                 metric_names='all',
                                                 another_engine=trainer),
                     event_name=Events.COMPLETED)

    def mlflow_batch_metrics_logging(engine, tag):
        step = trainer.state.iteration
        for name, value in engine.state.metrics.items():
            mlflow.log_metric("{} {}".format(tag, name), value, step=step)

    def mlflow_val_metrics_logging(engine, tag):
        step = trainer.state.epoch
        for name in metrics.keys():
            value = engine.state.metrics[name]
            mlflow.log_metric("{} {}".format(tag, name), value, step=step)

    trainer.add_event_handler(Events.ITERATION_COMPLETED, mlflow_batch_metrics_logging, "train")
    train_evaluator.add_event_handler(Events.COMPLETED, mlflow_val_metrics_logging, "train")
    evaluator.add_event_handler(Events.COMPLETED, mlflow_val_metrics_logging, "test")

    trainer.run(train_loader, max_epochs=config['num_epochs'])
    tb_logger.close()
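Since this variant logs batch and validation metrics to MLflow, a caller would typically invoke run() inside an active MLflow run and record the configuration as parameters. A hypothetical entry-point sketch; the config keys mirror those read by run() above, and all values are illustrative only:

# Sketch: hypothetical entry point wrapping run() in an MLflow run.
import mlflow

config = {
    "dataset": "cifar10",
    "data_path": "/tmp/cifar10",
    "model": "resnet18",
    "with_layca": True,
    "batch_size": 128,
    "num_workers": 4,
    "momentum": 0.9,
    "weight_decay": 1e-4,
    "lr_milestones_values": [(0, 0.0), (5, 0.1), (100, 0.0)],
    "num_epochs": 100,
}

with mlflow.start_run():
    mlflow.log_params(config)
    run("/tmp/output", config)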
def run(batch_size=1, log_interval=50, debug_images_interval=10,
        val_dataset_ann_file='~/bigdata/coco/annotations/instances_val2017.json',
        input_checkpoint='', log_dir="/tmp/tensorboard_logs", use_mask=True,
        backbone_name='resnet101'):
    hparam_dict = {
        # 'warmup_iterations': warmup_iterations,
        'batch_size': batch_size,
        # 'test_size': test_size,
        # 'epochs': epochs,
        # 'trainable_layers': trainable_layers,
        # 'load_optimizer': load_optimizer,
        # 'lr': lr,
        # 'momentum': momentum,
        # 'weight_decay': weight_decay,
    }

    # Load the old hparams
    hparam_file = Path(input_checkpoint).parent / 'hparams.pickle'
    try:
        print('Opening hparams file from {}'.format(hparam_file.absolute()))
        with open(hparam_file, 'rb') as f:
            # The protocol version used is detected automatically, so we do not
            # have to specify it.
            data = pickle.load(f)
            print('Updating hparams with {}'.format(data))
            hparam_dict.update(data)
    except FileNotFoundError:
        print('HParam file not found at {}'.format(hparam_file.absolute()))

    print('Params: {}'.format(hparam_dict))

    # Define train and test datasets
    val_loader, labels_enum = get_eval_data_loader(val_dataset_ann_file,
                                                   batch_size,
                                                   configuration_data.get('image_size'),
                                                   use_mask=use_mask)
    val_dataset = list(chain.from_iterable(zip(*batch) for batch in iter(val_loader)))
    coco_api_val_dataset = convert_to_coco_api(val_dataset)
    num_classes = max(labels_enum.keys()) + 1  # number of classes plus one for the background class
    configuration_data['num_classes'] = num_classes
    print('Testing with {} classes...'.format(num_classes))

    # Set the device to GPU if available - otherwise use CPU
    device = torch.cuda.current_device() if torch.cuda.is_available() else torch.device('cpu')
    torch.backends.cudnn.benchmark = True if torch.cuda.is_available() else False  # optimization for fixed input size

    if use_mask:
        print('Loading MaskRCNN Model...')
        model = get_model_instance_segmentation(num_classes,
                                                configuration_data.get('mask_predictor_hidden_layer'))
    else:
        print('Loading FasterRCNN Model...')
        model = get_model_instance_detection(num_classes, backbone_name=backbone_name)
    iou_types = get_iou_types(model)

    # if there is more than one GPU, parallelize the model
    if torch.cuda.device_count() > 1:
        print("{} GPUs were detected - we will use all of them".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    # copy the model to each device
    model.to(device)

    print('Loading model checkpoint from {}'.format(input_checkpoint))
    input_checkpoint = torch.load(input_checkpoint, map_location=torch.device(device))
    model.load_state_dict(input_checkpoint['model'])

    if use_mask:
        comment = 'mask'
    else:
        comment = 'box-{}'.format(backbone_name)
    tb_logger = TensorboardLogger(log_dir=log_dir, comment=comment)
    writer = tb_logger.writer

    # define Ignite's evaluation engine
    evaluator = create_evaluator(model, device)

    coco_ap = CocoAP(coco_api_val_dataset, iou_types)
    coco_ap_05 = CocoAP5(coco_api_val_dataset, iou_types)
    coco_ap_075 = CocoAP75(coco_api_val_dataset, iou_types)
    coco_ap.attach(evaluator, "AP")
    coco_ap_05.attach(evaluator, "AP0.5")
    coco_ap_075.attach(evaluator, "AP0.75")

    tb_logger.attach(evaluator,
                     log_handler=OutputHandler(tag='evaluation',
                                               metric_names=['AP', 'AP0.5', 'AP0.75'],
                                               global_step_transform=global_step_from_engine(evaluator)),
                     event_name=Events.COMPLETED)

    @evaluator.on(Events.STARTED)
    def on_evaluation_started(engine):
        model.eval()
        # engine.state.coco_evaluator = CocoEvaluator(coco_api_val_dataset, iou_types)

    @evaluator.on(Events.ITERATION_COMPLETED)
    def on_eval_iteration_completed(engine):
        images, targets, results = engine.state.output
        if engine.state.iteration % log_interval == 0:
            print("Evaluation: Iteration: {}".format(engine.state.iteration))

        if engine.state.iteration % debug_images_interval == 0:
            print('Saving debug image...')
            for n, debug_image in enumerate(draw_debug_images(images, targets, results)):
                writer.add_image("evaluation/image_{}_{}".format(engine.state.iteration, n),
                                 debug_image, evaluator.state.iteration, dataformats='HWC')
                if 'masks' in targets[n]:
                    writer.add_image("evaluation/image_{}_{}_mask".format(engine.state.iteration, n),
                                     draw_mask(targets[n]), evaluator.state.iteration, dataformats='HW')
                    curr_image_id = int(targets[n]['image_id'])
                    writer.add_image("evaluation/image_{}_{}_predicted_mask".format(engine.state.iteration, n),
                                     draw_mask(results[curr_image_id]).squeeze(), evaluator.state.iteration,
                                     dataformats='HW')
        images = targets = results = engine.state.output = None

    @evaluator.on(Events.COMPLETED)
    def on_evaluation_completed(engine):
        # gather the stats from all processes
        # engine.state.coco_evaluator.synchronize_between_processes()

        # accumulate predictions from all images
        # engine.state.coco_evaluator.accumulate()
        # engine.state.coco_evaluator.summarize()

        # pr_50, pr_75 = get_pr_levels(engine.state.coco_evaluator.coco_eval['bbox'])
        # plot_pr_curve_tensorboard(pr_50, pr_75, writer=writer)

        print('Writing hparams: {}'.format(hparam_dict))
        writer.add_hparams(hparam_dict=hparam_dict,
                           metric_dict={'hparams/AP': coco_ap.ap,
                                        'hparams/AP.5': coco_ap_05.ap5,
                                        'hparams/AP.75': coco_ap_075.ap75})
        coco_ap.write_tensorboard_pr_curve(writer)

    # evaluator.state = State()
    evaluator.run(val_loader)
    writer.close()
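The hparams logging at the end relies on TensorBoard's add_hparams. Stripped of the surrounding Ignite handlers, the call reduces to the following sketch against a plain SummaryWriter (the path and values are placeholders):

# Sketch: minimal add_hparams usage, independent of Ignite.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="/tmp/tensorboard_logs")   # illustrative path
writer.add_hparams(hparam_dict={"batch_size": 1},
                   metric_dict={"hparams/AP": 0.0})       # placeholder metric value
writer.close()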