def run(config_file):
    print("--- Check dataflow --- ")
    print("Load config file ... ")
    config = load_config(config_file)

    seed = config.get("SEED", 2018)
    random.seed(seed)
    torch.manual_seed(seed)

    output = config["OUTPUT_PATH"]
    debug = config.get("DEBUG", False)

    from datetime import datetime
    now = datetime.now()
    log_dir = os.path.join(
        output, "check_dataflow_{}".format(now.strftime("%Y%m%d_%H%M")))
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("Check dataflow")
    setup_logger(logger, os.path.join(log_dir, "check.log"), log_level)

    logger.debug("Setup tensorboard writer")
    writer = SummaryWriter(log_dir=os.path.join(log_dir, "tensorboard"))

    save_conf(config_file, log_dir, logger, writer)

    device = 'cpu'
    if torch.cuda.is_available():
        logger.debug("CUDA is enabled")
        from torch.backends import cudnn
        cudnn.benchmark = True

    logger.debug("Setup data loader")
    data_loader = config["DATA_LOADER"]

    logger.debug("Setup ignite dataflow checker")
    dataflow_checker = create_dataflow_checker()

    logger.debug("Setup handlers")
    # Setup timer to measure dataflow time
    timer = Timer(average=True)
    timer.attach(dataflow_checker,
                 start=Events.EPOCH_STARTED,
                 pause=Events.ITERATION_COMPLETED,
                 resume=Events.ITERATION_STARTED)

    n_classes = 200
    n_batches = len(data_loader)
    n_channels = 3
    # Use builtin dtypes: the np.int/np.float/np.object aliases are removed
    # in modern NumPy
    y_counts_per_batch = np.zeros((n_batches, n_classes), dtype=int)
    x_mins_per_batch = np.zeros((n_batches, n_channels), dtype=float)
    x_maxs_per_batch = np.zeros((n_batches, n_channels), dtype=float)
    x_avgs_per_batch = np.zeros((n_batches, n_channels), dtype=float)
    x_shapes_per_batch = np.empty((n_batches, 1), dtype=object)
    x_dtypes_per_batch = np.empty((n_batches, 1), dtype=object)

    def log_dataflow_iteration(engine, y_counts_per_batch):
        x, y = engine.state.output
        # state.iteration is cumulative across epochs -> wrap to a
        # per-epoch batch index
        curr_iter = (engine.state.iteration - 1) % n_batches
        y_counts_per_batch[curr_iter, :] = np.bincount(y.numpy(),
                                                       minlength=n_classes)
        for i in range(n_channels):
            x_mins_per_batch[curr_iter, i] = x[:, i, :, :].min()
            x_maxs_per_batch[curr_iter, i] = x[:, i, :, :].max()
            x_avgs_per_batch[curr_iter, i] = torch.mean(x[:, i, :, :])
        x_shapes_per_batch[curr_iter, 0] = str(list(x.shape[1:]))
        x_dtypes_per_batch[curr_iter, 0] = type(x).__name__

    dataflow_checker.add_event_handler(Events.ITERATION_COMPLETED,
                                       log_dataflow_iteration,
                                       y_counts_per_batch)

    def log_dataflow_epoch(engine):
        logger.info("One epoch dataflow time (seconds): {}".format(
            timer.value()))

    dataflow_checker.add_event_handler(Events.EPOCH_COMPLETED,
                                       log_dataflow_epoch)

    n_epochs = config["N_EPOCHS"]
    logger.debug("Start dataflow checking: {} epochs".format(n_epochs))
    try:
        dataflow_checker.run(data_loader, max_epochs=n_epochs)
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
        exit(0)
    except Exception as e:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")
        raise e

    logger.debug("Dataflow check ended")
    writer.close()

    logger.debug("Create and write y_counts_per_batch.csv")
    cols = ["class_{}".format(i) for i in range(n_classes)]
    y_counts_df = pd.DataFrame(y_counts_per_batch, columns=cols)
    y_counts_df.to_csv(os.path.join(log_dir, "y_counts_per_batch.csv"),
                       index=False)

    # Save figure of target distribution per batch
    logger.debug("Save figure of target distribution per batch")
    fig = create_fig_target_distribution_per_batch(y_counts_df=y_counts_df,
                                                   n_classes_per_fig=20)
    fig.savefig(os.path.join(log_dir, "target_distribution_per_batch.png"))

    logger.debug("Save figure of total targets distribution")
    fig = create_fig_targets_distribution(y_counts_df, n_classes_per_fig=20)
    fig.savefig(os.path.join(log_dir, "targets_distribution.png"))

    del y_counts_df
    del y_counts_per_batch

    logger.debug("Create and write x_stats_df.csv")
    min_cols = ["b{}_min".format(i) for i in range(n_channels)]
    avg_cols = ["b{}_avg".format(i) for i in range(n_channels)]
    max_cols = ["b{}_max".format(i) for i in range(n_channels)]
    cols = min_cols + avg_cols + max_cols + ["shape", "dtype"]
    x_stats_df = pd.DataFrame(columns=cols,
                              index=np.arange(n_batches),
                              dtype=float)
    x_stats_df[min_cols] = x_mins_per_batch
    x_stats_df[avg_cols] = x_avgs_per_batch
    x_stats_df[max_cols] = x_maxs_per_batch
    x_stats_df["shape"] = x_shapes_per_batch
    x_stats_df["dtype"] = x_dtypes_per_batch
    x_stats_df.to_csv(os.path.join(log_dir, "x_stats_df.csv"), index=False)

    # Save figure with sample mins, avgs, maxs
    logger.debug("Save figure with sample mins, avgs, maxs")
    fig = create_fig_samples_min_avg_max_per_batch(x_stats_df,
                                                   min_cols,
                                                   avg_cols,
                                                   max_cols)
    fig.savefig(os.path.join(log_dir, "samples_min_avg_max_per_batch.png"))

    logger.debug("Save figure with sample shapes")
    fig = create_fig_samples_param_per_batch(x_stats_df, "shape")
    fig.savefig(os.path.join(log_dir, "samples_shape_per_batch.png"))

    logger.debug("Save figure with sample dtypes")
    fig = create_fig_samples_param_per_batch(x_stats_df, "dtype")
    fig.savefig(os.path.join(log_dir, "samples_dtype_per_batch.png"))
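

# ---------------------------------------------------------------------------
# Usage sketch (hypothetical, not part of the project): a minimal Python
# config file for the dataflow check above. The uppercase keys mirror exactly
# what run() reads; the FakeData dataset and transform are placeholders for
# the project's real dataset.
#
# from torch.utils.data import DataLoader
# from torchvision.datasets import FakeData
# from torchvision.transforms import ToTensor
#
# SEED = 2018
# DEBUG = True
# OUTPUT_PATH = "output"
# N_EPOCHS = 1
# DATA_LOADER = DataLoader(
#     FakeData(size=512, image_size=(3, 64, 64), num_classes=200,
#              transform=ToTensor()),
#     batch_size=64, shuffle=True)
# ---------------------------------------------------------------------------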
def run(config_file):
    print("--- iMaterialist 2018 : Meta-learner training --- ")
    print("Load config file ... ")
    config = load_config(config_file)

    seed = config.get("SEED", 2018)
    random.seed(seed)

    output = Path(config["OUTPUT_PATH"])
    model = config["MODEL"]
    model_name = model.__class__.__name__
    debug = config.get("DEBUG", False)

    from datetime import datetime
    now = datetime.now()
    log_dir = output / ("training_meta_{}_{}".format(
        model_name, now.strftime("%Y%m%d_%H%M")))
    assert not log_dir.exists(), \
        "Output logging directory '{}' already exists".format(log_dir)
    log_dir.mkdir(parents=True)

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("iMaterialist 2018: Train meta-learner")
    setup_logger(logger, (log_dir / "train.log").as_posix(), log_level)

    save_conf(config_file, log_dir.as_posix(), logger)

    X = config["X"]
    y = config["Y"]
    n_trials = config["N_TRIALS"]
    scorings = config["SCORINGS"]
    cv = config["CV_SPLIT"]
    estimator_cls = config["MODEL"]
    model_params = config["MODEL_PARAMS"]
    model_hp_params = config["MODEL_HP_PARAMS"]
    model_hp_params.update(model_params)
    fit_params = config["FIT_PARAMS"]
    n_jobs = config["N_JOBS"]

    def hp_score(model_hp_params):
        estimator = estimator_cls(**model_hp_params)
        scores = cross_validate(estimator, X, y,
                                cv=cv,
                                scoring=scorings,
                                fit_params=fit_params,
                                n_jobs=n_jobs,
                                verbose=debug)
        logger.info("CV scores:")
        for scoring in scorings:
            # cross_validate prefixes each scoring key with 'test_'
            logger.info("{} : {}".format(scoring,
                                         scores['test_' + scoring].tolist()))
        # hyperopt expects a scalar loss -> average over the CV folds
        return {'loss': np.mean(scores['test_' + scorings[0]]),
                'status': STATUS_OK}

    logger.debug("Start hyperparameter search: {} trials".format(n_trials))
    try:
        best_params, trials = hp_optimize(hp_score,
                                          model_hp_params,
                                          max_evals=n_trials)
        best_params.update(model_params)
        save_params(best_params, (log_dir / "best_params.json").as_posix())
        logger.info("Best parameters: \n{}".format(best_params))
        logger.info("Best trial : \n{}".format(trials.best_trial))

        logger.info("Train meta model on complete dataset")
        estimator = estimator_cls(**best_params)
        estimator.fit(X, y)
        save_model(estimator, (log_dir / "best_model.pkl").as_posix())
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")

    logger.debug("Training ended")
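

# ---------------------------------------------------------------------------
# Usage sketch (hypothetical): a minimal config for the meta-learner search
# above, assuming hp_optimize() wraps hyperopt's fmin and MODEL_HP_PARAMS is
# a hyperopt search space. The data files, estimator, and space below are
# placeholders, not the project's actual setup.
#
# import numpy as np
# from hyperopt import hp
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import StratifiedKFold
#
# SEED = 2018
# DEBUG = False
# OUTPUT_PATH = "output"
# X = np.load("train_probas.npy")     # stacked base-model predictions
# Y = np.load("train_targets.npy")
# MODEL = RandomForestClassifier      # estimator class, instantiated in run()
# MODEL_PARAMS = {"random_state": SEED}
# MODEL_HP_PARAMS = {"max_depth": hp.choice("max_depth", [4, 8, 12])}
# FIT_PARAMS = {}
# SCORINGS = ["neg_log_loss", "accuracy"]
# CV_SPLIT = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
# N_TRIALS = 20
# N_JOBS = 4
# ---------------------------------------------------------------------------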
def run(config_file):
    print("--- iMaterialist 2018 : Training --- ")
    print("Load config file ... ")
    config = load_config(config_file)

    seed = config.get("SEED", 2018)
    random.seed(seed)
    torch.manual_seed(seed)

    output = Path(config["OUTPUT_PATH"])
    debug = config.get("DEBUG", False)

    from datetime import datetime
    now = datetime.now()
    log_dir = output / ("{}".format(Path(config_file).stem)) / "{}".format(
        now.strftime("%Y%m%d_%H%M"))
    assert not log_dir.exists(), \
        "Output logging directory '{}' already exists".format(log_dir)
    log_dir.mkdir(parents=True)

    shutil.copyfile(config_file, (log_dir / Path(config_file).name).as_posix())

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("iMaterialist 2018: Train")
    setup_logger(logger, (log_dir / "train.log").as_posix(), log_level)

    logger.debug("Setup tensorboard writer")
    writer = SummaryWriter(log_dir=(log_dir / "tensorboard").as_posix())

    save_conf(config_file, log_dir.as_posix(), logger, writer)

    model = config["MODEL"]
    model_name = model.__class__.__name__

    device = config.get("DEVICE", 'cuda')
    if 'cuda' in device:
        assert torch.cuda.is_available(), \
            "Device {} is not compatible with torch.cuda.is_available()".format(device)
        from torch.backends import cudnn
        cudnn.benchmark = True
        logger.debug("CUDA is enabled")
    model = model.to(device)

    logger.debug("Setup train/val dataloaders")
    train_loader, val_loader = config["TRAIN_LOADER"], config["VAL_LOADER"]

    # Setup training subset to run evaluation on:
    indices = np.arange(len(train_loader.sampler))
    np.random.shuffle(indices)
    indices = indices[:len(val_loader.sampler)] \
        if len(val_loader.sampler) < len(train_loader.sampler) else indices
    train_eval_loader = get_train_eval_data_loader(train_loader, indices)

    logger.debug(
        "- train data loader: {} number of batches | {} number of samples"
        .format(len(train_loader), len(train_loader.sampler)))
    logger.debug(
        "- train eval data loader: {} number of batches | {} number of samples"
        .format(len(train_eval_loader), len(train_eval_loader.sampler)))
    logger.debug(
        "- validation data loader: {} number of batches | {} number of samples"
        .format(len(val_loader), len(val_loader.sampler)))

    # write_model_graph(writer, model=model, data_loader=train_loader, device=device)

    optimizer = config["OPTIM"]

    logger.debug("Setup criterion")
    criterion = config["CRITERION"]
    if "cuda" in device and isinstance(criterion, nn.Module):
        criterion = criterion.to(device)

    lr_schedulers = config.get("LR_SCHEDULERS")

    logger.debug("Setup ignite trainer and evaluator")
    trainer = create_supervised_trainer(model, optimizer, criterion,
                                        device=device)

    metrics = {
        'accuracy': CategoricalAccuracy(),
        'precision': Precision(),
        'recall': Recall(),
        'nll': Loss(criterion)
    }
    train_evaluator = create_supervised_evaluator(model, metrics=metrics,
                                                  device=device)

    val_metrics = {
        'accuracy': CategoricalAccuracy(),
        'precision': Precision(),
        'recall': Recall(),
        'nll': Loss(nn.CrossEntropyLoss())
    }
    val_evaluator = create_supervised_evaluator(model, metrics=val_metrics,
                                                device=device)

    logger.debug("Setup handlers")
    log_interval = config.get("LOG_INTERVAL", 100)
    reduce_on_plateau = config.get("REDUCE_LR_ON_PLATEAU")

    # Setup timer to measure training time
    timer = Timer(average=True)
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # avoid shadowing the builtin `iter`
        iteration = (engine.state.iteration - 1) % len(train_loader) + 1
        if iteration % log_interval == 0:
            logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.4f}".format(
                engine.state.epoch, iteration, len(train_loader),
                engine.state.output))
            writer.add_scalar("training/loss_vs_iterations",
                              engine.state.output, engine.state.iteration)

    @trainer.on(Events.EPOCH_STARTED)
    def update_lr_schedulers(engine):
        if lr_schedulers is not None:
            for lr_scheduler in lr_schedulers:
                lr_scheduler.step()

    @trainer.on(Events.EPOCH_STARTED)
    def log_lrs(engine):
        if len(optimizer.param_groups) == 1:
            lr = float(optimizer.param_groups[0]['lr'])
            writer.add_scalar("learning_rate", lr, engine.state.epoch)
            logger.debug("Learning rate: {}".format(lr))
        else:
            for i, param_group in enumerate(optimizer.param_groups):
                lr = float(param_group['lr'])
                logger.debug("Learning rate (group {}): {}".format(i, lr))
                writer.add_scalar("learning_rate_group_{}".format(i),
                                  lr, engine.state.epoch)

    log_images_dir = log_dir / "figures"
    log_images_dir.mkdir(parents=True)

    def log_precision_recall_results(metrics, epoch, mode):
        for metric_name in ['precision', 'recall']:
            value = metrics[metric_name]
            avg_value = torch.mean(value).item()
            writer.add_scalar("{}/avg_{}".format(mode, metric_name),
                              avg_value, epoch)
            # Save metric per class figure
            sorted_values = value.to('cpu').numpy()
            indices = np.argsort(sorted_values)
            sorted_values = sorted_values[indices]
            n_classes = len(sorted_values)
            classes = np.array(
                ["class_{}".format(i) for i in range(n_classes)])
            sorted_classes = classes[indices]
            fig = create_fig_param_per_class(sorted_values, metric_name,
                                             classes=sorted_classes,
                                             n_classes_per_fig=20)
            fname = log_images_dir / ("{}_{}_{}_per_class.png".format(
                mode, epoch, metric_name))
            fig.savefig(fname.as_posix())
            # Add figure in TB
            img = Image.open(fname.as_posix())
            tag = "{}_{}".format(mode, metric_name)
            writer.add_image(tag, np.asarray(img), epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_metrics(engine):
        epoch = engine.state.epoch
        logger.info("One epoch training time (seconds): {}".format(
            timer.value()))
        metrics = train_evaluator.run(train_eval_loader).metrics
        logger.info(
            "Training Results - Epoch: {} Avg accuracy: {:.4f} Avg loss: {:.4f}"
            .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        writer.add_scalar("training/avg_accuracy", metrics['accuracy'], epoch)
        writer.add_scalar("training/avg_error", 1.0 - metrics['accuracy'],
                          epoch)
        writer.add_scalar("training/avg_loss", metrics['nll'], epoch)
        log_precision_recall_results(metrics, epoch, "training")

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        epoch = engine.state.epoch
        metrics = val_evaluator.run(val_loader).metrics
        writer.add_scalar("validation/avg_loss", metrics['nll'], epoch)
        writer.add_scalar("validation/avg_accuracy", metrics['accuracy'],
                          epoch)
        writer.add_scalar("validation/avg_error", 1.0 - metrics['accuracy'],
                          epoch)
        logger.info(
            "Validation Results - Epoch: {} Avg accuracy: {:.4f} Avg loss: {:.4f}"
            .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        log_precision_recall_results(metrics, epoch, "validation")

    if reduce_on_plateau is not None:
        @val_evaluator.on(Events.COMPLETED)
        def update_reduce_on_plateau(engine):
            val_loss = engine.state.metrics['nll']
            reduce_on_plateau.step(val_loss)

    def score_function(engine):
        val_loss = engine.state.metrics['nll']
        # Objects with highest scores will be retained.
        return -val_loss

    # Setup early stopping:
    if "EARLY_STOPPING_KWARGS" in config:
        kwargs = config["EARLY_STOPPING_KWARGS"]
        if 'score_function' not in kwargs:
            kwargs['score_function'] = score_function
        handler = EarlyStopping(trainer=trainer, **kwargs)
        setup_logger(handler._logger, (log_dir / "train.log").as_posix(),
                     log_level)
        val_evaluator.add_event_handler(Events.COMPLETED, handler)

    # Setup model checkpoint:
    best_model_saver = ModelCheckpoint(log_dir.as_posix(),
                                       filename_prefix="model",
                                       score_name="val_loss",
                                       score_function=score_function,
                                       n_saved=5,
                                       atomic=True,
                                       create_dir=True)
    val_evaluator.add_event_handler(Events.COMPLETED, best_model_saver,
                                    {model_name: model})

    last_model_saver = ModelCheckpoint(log_dir.as_posix(),
                                       filename_prefix="checkpoint",
                                       save_interval=1,
                                       n_saved=1,
                                       atomic=True,
                                       create_dir=True)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, last_model_saver,
                              {model_name: model})

    # Setup custom event handlers:
    for (event, handler) in config["TRAINER_CUSTOM_EVENT_HANDLERS"]:
        trainer.add_event_handler(event, handler, val_evaluator, logger)

    for (event, handler) in config["EVALUATOR_CUSTOM_EVENT_HANDLERS"]:
        val_evaluator.add_event_handler(event, handler, trainer, logger)

    n_epochs = config["N_EPOCHS"]
    logger.info("Start training: {} epochs".format(n_epochs))
    try:
        trainer.run(train_loader, max_epochs=n_epochs)
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")

    logger.debug("Training ended")
    writer.close()
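

# ---------------------------------------------------------------------------
# Usage sketch (hypothetical): a minimal training config for run() above.
# Loader construction is elided and every value is a placeholder; the model,
# optimizer and schedulers are examples, not the project's actual choices.
#
# import torch.nn as nn
# from torch.optim import SGD
# from torch.optim.lr_scheduler import MultiStepLR, ReduceLROnPlateau
# from torchvision.models import resnet50
#
# SEED = 2018
# DEBUG = False
# OUTPUT_PATH = "output"
# MODEL = resnet50(num_classes=128)   # placeholder class count
# DEVICE = "cuda"
# TRAIN_LOADER = ...                  # torch DataLoader over the train split
# VAL_LOADER = ...                    # torch DataLoader over the val split
# OPTIM = SGD(MODEL.parameters(), lr=0.1, momentum=0.9)
# CRITERION = nn.CrossEntropyLoss()
# LR_SCHEDULERS = [MultiStepLR(OPTIM, milestones=[10, 20], gamma=0.1)]
# REDUCE_LR_ON_PLATEAU = ReduceLROnPlateau(OPTIM, factor=0.5, patience=3)
# EARLY_STOPPING_KWARGS = {"patience": 10}
# LOG_INTERVAL = 100
# TRAINER_CUSTOM_EVENT_HANDLERS = []
# EVALUATOR_CUSTOM_EVENT_HANDLERS = []
# N_EPOCHS = 30
# ---------------------------------------------------------------------------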
def run(config_file):
    print("--- Tiny ImageNet 200 Playground : Inference --- ")
    print("Load config file ... ")
    config = load_config(config_file)

    seed = config.get("SEED", 2018)
    random.seed(seed)
    torch.manual_seed(seed)

    output = config["OUTPUT_PATH"]
    model = config["MODEL"]
    model_name = model.__class__.__name__
    debug = config.get("DEBUG", False)

    from datetime import datetime
    now = datetime.now()
    log_dir = os.path.join(
        output, "inference_{}_{}".format(model_name,
                                         now.strftime("%Y%m%d_%H%M")))
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("Tiny ImageNet 200: Inference")
    setup_logger(logger, os.path.join(log_dir, "test.log"), log_level)

    logger.debug("Setup tensorboard writer")
    writer = SummaryWriter(log_dir=os.path.join(log_dir, "tensorboard"))

    save_conf(config_file, log_dir, logger, writer)

    device = 'cpu'
    if torch.cuda.is_available():
        logger.debug("CUDA is enabled")
        from torch.backends import cudnn
        cudnn.benchmark = True
        device = 'cuda'
    model = model.to(device)

    logger.debug("Setup test dataloader")
    dataset_path = config["DATASET_PATH"]
    test_data_transform = config["TEST_TRANSFORMS"]
    batch_size = config.get("BATCH_SIZE", 64)
    num_workers = config.get("NUM_WORKERS", 8)
    test_loader = get_test_data_loader(dataset_path,
                                       test_data_transform,
                                       batch_size,
                                       num_workers,
                                       device=device)

    logger.debug("Setup ignite inferencer")
    inferencer = create_inferencer(model, device=device)

    n_tta = config["N_TTA"]

    logger.debug("Setup handlers")
    # Setup timer to measure evaluation time
    timer = Timer(average=True)
    timer.attach(inferencer,
                 start=Events.STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED)

    n_samples = len(test_loader.dataset)
    files = np.zeros((n_samples, ), dtype=object)
    y_probas_tta = np.zeros((n_samples, 200, n_tta))

    @inferencer.on(Events.EPOCH_COMPLETED)
    def log_tta(engine):
        logger.debug("TTA {} / {}".format(engine.state.epoch, n_tta))

    @inferencer.on(Events.ITERATION_COMPLETED)
    def save_results(engine):
        output = engine.state.output
        tta_index = engine.state.epoch - 1
        start_index = ((engine.state.iteration - 1) % len(test_loader)) * batch_size
        end_index = min(start_index + batch_size, n_samples)
        # Move predictions to host memory before converting to NumPy
        # (required when the model runs on CUDA)
        batch_y_probas = output['y_pred'].detach().cpu().numpy()
        y_probas_tta[start_index:end_index, :, tta_index] = batch_y_probas
        if tta_index == 0:
            files[start_index:end_index] = output['files']

    logger.debug("Start inference")
    try:
        inferencer.run(test_loader, max_epochs=n_tta)
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")
        exit(1)

    writer.close()

    # Average probabilities over the TTA rounds:
    y_probas = np.mean(y_probas_tta, axis=-1)
    y_preds = np.argmax(y_probas, axis=-1)

    logger.info("Write submission file")
    submission_filepath = os.path.join(log_dir, "predictions.csv")
    write_submission(files, y_preds, test_loader.dataset.classes,
                     submission_filepath)
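

# ---------------------------------------------------------------------------
# Usage sketch (hypothetical): a minimal inference config for run() above.
# The model would normally be restored from a training checkpoint; the
# checkpoint path and transforms below are placeholders. A random transform
# such as RandomHorizontalFlip is what makes each of the N_TTA passes a
# distinct test-time-augmentation round.
#
# import torch
# from torchvision.models import resnet18
# from torchvision.transforms import Compose, RandomHorizontalFlip, ToTensor
#
# SEED = 2018
# DEBUG = False
# OUTPUT_PATH = "output"
# DATASET_PATH = "/path/to/tiny-imagenet-200"
# MODEL = resnet18(num_classes=200)
# MODEL.load_state_dict(torch.load("output/training_ResNet_xxx/model.pth"))
# TEST_TRANSFORMS = Compose([RandomHorizontalFlip(), ToTensor()])
# BATCH_SIZE = 64
# NUM_WORKERS = 8
# N_TTA = 5
# ---------------------------------------------------------------------------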
def run(config_file):
    print("--- Tiny ImageNet 200 Playground : Training --- ")
    print("Load config file ... ")
    config = load_config(config_file)

    seed = config.get("SEED", 2018)
    random.seed(seed)
    torch.manual_seed(seed)

    output = config["OUTPUT_PATH"]
    model = config["MODEL"]
    model_name = model.__class__.__name__
    debug = config.get("DEBUG", False)

    from datetime import datetime
    now = datetime.now()
    log_dir = os.path.join(
        output, "training_{}_{}".format(model_name,
                                        now.strftime("%Y%m%d_%H%M")))
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("Tiny ImageNet 200: Train")
    setup_logger(logger, os.path.join(log_dir, "train.log"), log_level)

    logger.debug("Setup tensorboard writer")
    writer = SummaryWriter(log_dir=os.path.join(log_dir, "tensorboard"))

    save_conf(config_file, log_dir, logger, writer)

    device = 'cpu'
    if torch.cuda.is_available():
        logger.debug("CUDA is enabled")
        from torch.backends import cudnn
        cudnn.benchmark = True
        device = 'cuda'
    model = model.to(device)

    logger.debug("Setup train/val dataloaders")
    dataset_path = config["DATASET_PATH"]
    train_data_transform = config["TRAIN_TRANSFORMS"]
    val_data_transform = config["VAL_TRANSFORMS"]
    train_batch_size = config.get("BATCH_SIZE", 64)
    val_batch_size = config.get("VAL_BATCH_SIZE", train_batch_size)
    num_workers = config.get("NUM_WORKERS", 8)
    trainval_split = config.get("TRAINVAL_SPLIT", {
        'fold_index': 0,
        'n_splits': 7
    })
    train_loader, val_loader = get_trainval_data_loaders(dataset_path,
                                                         train_data_transform,
                                                         val_data_transform,
                                                         train_batch_size,
                                                         val_batch_size,
                                                         trainval_split,
                                                         num_workers,
                                                         device=device)

    # Setup training subset to run evaluation on:
    indices = np.arange(len(train_loader.dataset))
    np.random.shuffle(indices)
    indices = indices[:len(val_loader.dataset)] \
        if len(val_loader.dataset) < len(train_loader.dataset) else indices
    train_eval_loader = get_train_eval_data_loader(train_loader, indices)

    write_model_graph(writer, model=model, data_loader=train_loader,
                      device=device)

    optimizer = config["OPTIM"]

    logger.debug("Setup criterion")
    criterion = nn.CrossEntropyLoss()
    if 'cuda' in device:
        criterion = criterion.to(device)

    lr_schedulers = config.get("LR_SCHEDULERS")

    logger.debug("Setup ignite trainer and evaluator")
    trainer = create_supervised_trainer(model, optimizer, criterion,
                                        device=device)

    metrics = {
        'accuracy': CategoricalAccuracy(),
        'precision': Precision(),
        'recall': Recall(),
        'nll': Loss(criterion)
    }
    train_evaluator = create_supervised_evaluator(model, metrics=metrics,
                                                  device=device)
    val_evaluator = create_supervised_evaluator(model, metrics=metrics,
                                                device=device)

    logger.debug("Setup handlers")
    log_interval = config.get("LOG_INTERVAL", 100)
    reduce_on_plateau = config.get("REDUCE_LR_ON_PLATEAU")

    # Setup timer to measure training time
    timer = Timer(average=True)
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # avoid shadowing the builtin `iter`
        iteration = (engine.state.iteration - 1) % len(train_loader) + 1
        if iteration % log_interval == 0:
            logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.4f}".format(
                engine.state.epoch, iteration, len(train_loader),
                engine.state.output))
            writer.add_scalar("training/loss_vs_iterations",
                              engine.state.output, engine.state.iteration)

    @trainer.on(Events.EPOCH_STARTED)
    def update_lr_schedulers(engine):
        if lr_schedulers is not None:
            for lr_scheduler in lr_schedulers:
                lr_scheduler.step()

    @trainer.on(Events.EPOCH_STARTED)
    def log_lrs(engine):
        if len(optimizer.param_groups) == 1:
            lr = float(optimizer.param_groups[0]['lr'])
            writer.add_scalar("learning_rate", lr, engine.state.epoch)
            logger.debug("Learning rate: {}".format(lr))
        else:
            for i, param_group in enumerate(optimizer.param_groups):
                lr = float(param_group['lr'])
                logger.debug("Learning rate (group {}): {}".format(i, lr))
                writer.add_scalar("learning_rate_group_{}".format(i),
                                  lr, engine.state.epoch)

    log_images_dir = os.path.join(log_dir, "figures")
    os.makedirs(log_images_dir)

    def log_precision_recall_results(metrics, epoch, mode):
        for metric_name in ['precision', 'recall']:
            value = metrics[metric_name]
            avg_value = torch.mean(value).item()
            writer.add_scalar("{}/avg_{}".format(mode, metric_name),
                              avg_value, epoch)
            # Save metric per class figure
            sorted_values = value.to('cpu').numpy()
            indices = np.argsort(sorted_values)
            sorted_values = sorted_values[indices]
            n_classes = len(sorted_values)
            classes = np.array(
                ["class_{}".format(i) for i in range(n_classes)])
            sorted_classes = classes[indices]
            fig = create_fig_param_per_class(sorted_values, metric_name,
                                             classes=sorted_classes,
                                             n_classes_per_fig=20)
            fname = os.path.join(
                log_images_dir,
                "{}_{}_{}_per_class.png".format(mode, epoch, metric_name))
            fig.savefig(fname)
            # Add figure in TB
            img = Image.open(fname)
            tag = "{}_{}".format(mode, metric_name)
            writer.add_image(tag, np.asarray(img), epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_metrics(engine):
        epoch = engine.state.epoch
        logger.info("One epoch training time (seconds): {}".format(
            timer.value()))
        metrics = train_evaluator.run(train_eval_loader).metrics
        logger.info(
            "Training Results - Epoch: {} Avg accuracy: {:.4f} Avg loss: {:.4f}"
            .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        writer.add_scalar("training/avg_accuracy", metrics['accuracy'], epoch)
        writer.add_scalar("training/avg_error", 1.0 - metrics['accuracy'],
                          epoch)
        writer.add_scalar("training/avg_loss", metrics['nll'], epoch)
        log_precision_recall_results(metrics, epoch, "training")

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        epoch = engine.state.epoch
        metrics = val_evaluator.run(val_loader).metrics
        writer.add_scalar("validation/avg_loss", metrics['nll'], epoch)
        writer.add_scalar("validation/avg_accuracy", metrics['accuracy'],
                          epoch)
        writer.add_scalar("validation/avg_error", 1.0 - metrics['accuracy'],
                          epoch)
        logger.info(
            "Validation Results - Epoch: {} Avg accuracy: {:.4f} Avg loss: {:.4f}"
            .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        log_precision_recall_results(metrics, epoch, "validation")

    if reduce_on_plateau is not None:
        @val_evaluator.on(Events.COMPLETED)
        def update_reduce_on_plateau(engine):
            val_loss = engine.state.metrics['nll']
            reduce_on_plateau.step(val_loss)

    def score_function(engine):
        val_loss = engine.state.metrics['nll']
        # Objects with highest scores will be retained.
        return -val_loss

    # Setup early stopping:
    if "EARLY_STOPPING_KWARGS" in config:
        kwargs = config["EARLY_STOPPING_KWARGS"]
        if 'score_function' not in kwargs:
            kwargs['score_function'] = score_function
        handler = EarlyStopping(trainer=trainer, **kwargs)
        setup_logger(handler._logger, os.path.join(log_dir, "train.log"),
                     log_level)
        val_evaluator.add_event_handler(Events.COMPLETED, handler)

    # Setup model checkpoint:
    best_model_saver = ModelCheckpoint(log_dir,
                                       filename_prefix="model",
                                       score_name="val_loss",
                                       score_function=score_function,
                                       n_saved=5,
                                       atomic=True,
                                       create_dir=True)
    val_evaluator.add_event_handler(Events.COMPLETED, best_model_saver,
                                    {model_name: model})

    last_model_saver = ModelCheckpoint(log_dir,
                                       filename_prefix="checkpoint",
                                       save_interval=1,
                                       n_saved=1,
                                       atomic=True,
                                       create_dir=True)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, last_model_saver,
                              {model_name: model})

    n_epochs = config["N_EPOCHS"]
    logger.info("Start training: {} epochs".format(n_epochs))
    try:
        trainer.run(train_loader, max_epochs=n_epochs)
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")

    logger.debug("Training ended")
    writer.close()
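

# ---------------------------------------------------------------------------
# Usage sketch (hypothetical): a minimal training config for run() above.
# Note the criterion is fixed to nn.CrossEntropyLoss() inside run(), so no
# CRITERION key is needed. Transforms, optimizer and schedule values are
# placeholders.
#
# from torch.optim import SGD
# from torch.optim.lr_scheduler import MultiStepLR
# from torchvision.models import resnet18
# from torchvision.transforms import Compose, RandomHorizontalFlip, ToTensor
#
# SEED = 2018
# DEBUG = False
# OUTPUT_PATH = "output"
# DATASET_PATH = "/path/to/tiny-imagenet-200"
# MODEL = resnet18(num_classes=200)
# TRAIN_TRANSFORMS = Compose([RandomHorizontalFlip(), ToTensor()])
# VAL_TRANSFORMS = Compose([ToTensor()])
# BATCH_SIZE = 64
# VAL_BATCH_SIZE = 64
# NUM_WORKERS = 8
# TRAINVAL_SPLIT = {"fold_index": 0, "n_splits": 7}
# OPTIM = SGD(MODEL.parameters(), lr=0.1, momentum=0.9)
# LR_SCHEDULERS = [MultiStepLR(OPTIM, milestones=[20, 40], gamma=0.1)]
# EARLY_STOPPING_KWARGS = {"patience": 10}
# LOG_INTERVAL = 100
# N_EPOCHS = 50
# ---------------------------------------------------------------------------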
def run(config_file):
    print("--- iMaterialist 2018 : Inference --- ")
    print("Load config file ... ")
    config = load_config(config_file)

    seed = config.get("SEED", 2018)
    random.seed(seed)
    torch.manual_seed(seed)

    output = Path(config["OUTPUT_PATH"])
    debug = config.get("DEBUG", False)

    from datetime import datetime
    now = datetime.now()
    # log_dir = output / "inference_{}_{}".format(model_name, now.strftime("%Y%m%d_%H%M"))
    log_dir = output / ("{}".format(Path(config_file).stem)) / "{}".format(
        now.strftime("%Y%m%d_%H%M"))
    assert not log_dir.exists(), \
        "Output logging directory '{}' already exists".format(log_dir)
    log_dir.mkdir(parents=True)

    shutil.copyfile(config_file, (log_dir / Path(config_file).name).as_posix())

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("iMaterialist 2018: Inference")
    setup_logger(logger, (log_dir / "predict.log").as_posix(), log_level)

    logger.debug("Setup tensorboard writer")
    writer = SummaryWriter(log_dir=(log_dir / "tensorboard").as_posix())

    save_conf(config_file, log_dir.as_posix(), logger, writer)

    model = config["MODEL"]

    device = config.get("DEVICE", 'cuda')
    if 'cuda' in device:
        assert torch.cuda.is_available(), \
            "Device {} is not compatible with torch.cuda.is_available()".format(device)
        from torch.backends import cudnn
        cudnn.benchmark = True
        logger.debug("CUDA is enabled")
    model = model.to(device)

    logger.debug("Setup test dataloader")
    test_loader = config["TEST_LOADER"]

    logger.debug("Setup ignite inferencer")
    inferencer = create_inferencer(model, device=device)

    n_tta = config["N_TTA"]
    n_classes = config["N_CLASSES"]
    batch_size = test_loader.batch_size

    logger.debug("Setup handlers")
    # Setup timer to measure evaluation time
    timer = Timer(average=True)
    timer.attach(inferencer,
                 start=Events.STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED)

    n_samples = len(test_loader.dataset)
    indices = np.zeros((n_samples, ), dtype=int)
    y_probas_tta = np.zeros((n_samples, n_classes, n_tta))

    @inferencer.on(Events.EPOCH_COMPLETED)
    def log_tta(engine):
        logger.debug("TTA {} / {}".format(engine.state.epoch, n_tta))

    @inferencer.on(Events.ITERATION_COMPLETED)
    def save_results(engine):
        output = engine.state.output
        tta_index = engine.state.epoch - 1
        start_index = (
            (engine.state.iteration - 1) % len(test_loader)) * batch_size
        end_index = min(start_index + batch_size, n_samples)
        # Move predictions to host memory before converting to NumPy
        # (required when the model runs on CUDA)
        batch_y_probas = output['y_pred'].detach().cpu().numpy()
        y_probas_tta[start_index:end_index, :, tta_index] = batch_y_probas
        if tta_index == 0:
            indices[start_index:end_index] = output['indices']

    logger.info("Start inference")
    try:
        inferencer.run(test_loader, max_epochs=n_tta)
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
        return
    except Exception as e:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")
        return

    # Average probabilities over the TTA rounds:
    y_probas = np.mean(y_probas_tta, axis=-1)

    if config["SAVE_PROBAS"]:
        logger.info("Write probabilities file")
        probas_filepath = log_dir / "probas.csv"
        write_probas(indices, y_probas, probas_filepath)
    else:
        y_preds = np.argmax(y_probas, axis=-1) + 1  # as labels are one-based
        logger.info("Write submission file")
        submission_filepath = log_dir / "predictions.csv"
        sample_submission_path = config["SAMPLE_SUBMISSION_PATH"]
        write_submission(indices, y_preds, sample_submission_path,
                         submission_filepath)
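

# ---------------------------------------------------------------------------
# Usage sketch (hypothetical): a minimal inference config for run() above.
# The inferencer is expected to yield {'y_pred': ..., 'indices': ...} per
# batch; the model, checkpoint path and loader below are placeholders.
#
# import torch
#
# SEED = 2018
# DEBUG = False
# OUTPUT_PATH = "output"
# MODEL = ...                   # same architecture as used for training
# MODEL.load_state_dict(torch.load("output/train_cfg/xxx/model.pth"))
# DEVICE = "cuda"
# TEST_LOADER = ...             # torch DataLoader over the test set
# N_TTA = 5
# N_CLASSES = 128               # placeholder class count
# SAVE_PROBAS = False
# SAMPLE_SUBMISSION_PATH = "input/sample_submission.csv"
# ---------------------------------------------------------------------------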