Example #1
def run(config_file):
    print("--- Check dataflow  --- ")

    print("Load config file ... ")
    config = load_config(config_file)

    seed = config.get("SEED", 2018)
    random.seed(seed)
    torch.manual_seed(seed)

    output = config["OUTPUT_PATH"]
    debug = config.get("DEBUG", False)

    from datetime import datetime
    now = datetime.now()
    log_dir = os.path.join(
        output, "check_dataflow_{}".format(now.strftime("%Y%m%d_%H%M")))
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("Check dataflow")
    setup_logger(logger, os.path.join(log_dir, "check.log"), log_level)

    logger.debug("Setup tensorboard writer")
    writer = SummaryWriter(log_dir=os.path.join(log_dir, "tensorboard"))

    save_conf(config_file, log_dir, logger, writer)

    device = 'cpu'
    if torch.cuda.is_available():
        logger.debug("CUDA is enabled")
        from torch.backends import cudnn
        cudnn.benchmark = True

    logger.debug("Setup data loader")
    data_loader = config["DATA_LOADER"]

    logger.debug("Setup ignite dataflow checker")
    dataflow_checker = create_dataflow_checker()

    logger.debug("Setup handlers")
    # Setup timer to measure dataflow time
    timer = Timer(average=True)
    timer.attach(dataflow_checker,
                 start=Events.EPOCH_STARTED,
                 pause=Events.ITERATION_COMPLETED,
                 resume=Events.ITERATION_STARTED)

    n_classes = 200
    n_batches = len(data_loader)

    n_channels = 3
    # use builtin dtypes (the np.int / np.float / np.object aliases were removed from NumPy)
    y_counts_per_batch = np.zeros((n_batches, n_classes), dtype=int)
    x_mins_per_batch = np.zeros((n_batches, n_channels), dtype=float)
    x_maxs_per_batch = np.zeros((n_batches, n_channels), dtype=float)
    x_avgs_per_batch = np.zeros((n_batches, n_channels), dtype=float)
    x_shapes_per_batch = np.empty((n_batches, 1), dtype=object)
    x_dtypes_per_batch = np.empty((n_batches, 1), dtype=object)

    def log_dataflow_iteration(engine, y_counts_per_batch):
        x, y = engine.state.output
        curr_iter = engine.state.iteration - 1
        y_counts_per_batch[curr_iter, :] = np.bincount(y.numpy(),
                                                       minlength=n_classes)
        for i in range(n_channels):
            x_mins_per_batch[curr_iter, i] = x[:, i, :, :].min()
            x_maxs_per_batch[curr_iter, i] = x[:, i, :, :].max()
            x_avgs_per_batch[curr_iter, i] = torch.mean(x[:, i, :, :])
        x_shapes_per_batch[curr_iter, 0] = str(list(x.shape[1:]))
        x_dtypes_per_batch[curr_iter, 0] = str(x.dtype)

    dataflow_checker.add_event_handler(Events.ITERATION_COMPLETED,
                                       log_dataflow_iteration,
                                       y_counts_per_batch)

    def log_dataflow_epoch(engine):
        logger.info("One epoch dataflow time (seconds): {}".format(
            timer.value()))

    dataflow_checker.add_event_handler(Events.EPOCH_COMPLETED,
                                       log_dataflow_epoch)

    n_epochs = config["N_EPOCHS"]
    logger.debug("Start dataflow checking: {} epochs".format(n_epochs))
    try:
        dataflow_checker.run(data_loader, max_epochs=n_epochs)
    except KeyboardInterrupt:
        logger.info("Catched KeyboardInterrupt -> exit")
        exit(0)
    except Exception as e:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")
        raise

    logger.debug("Dataflow check is ended")
    writer.close()

    logger.debug("Create and write y_counts_per_batch.csv")
    cols = ["class_{}".format(i) for i in range(n_classes)]
    y_counts_df = pd.DataFrame(y_counts_per_batch, columns=cols)
    y_counts_df.to_csv(os.path.join(log_dir, "y_counts_per_batch.csv"),
                       index=False)

    # Save figure of target distribution per batch
    logger.debug("Save figure of target distribution per batch")
    fig = create_fig_target_distribution_per_batch(y_counts_df=y_counts_df,
                                                   n_classes_per_fig=20)
    fig.savefig(os.path.join(log_dir, "target_distribution_per_batch.png"))

    logger.debug("Save figure of total targets distributions")
    fig = create_fig_targets_distribution(y_counts_df, n_classes_per_fig=20)
    fig.savefig(os.path.join(log_dir, "targets_distribution.png"))
    del y_counts_df
    del y_counts_per_batch

    logger.debug("Create and write x_stats_df.csv")
    min_cols = ["b{}_min".format(i) for i in range(n_channels)]
    avg_cols = ["b{}_avg".format(i) for i in range(n_channels)]
    max_cols = ["b{}_max".format(i) for i in range(n_channels)]
    cols = min_cols + avg_cols + max_cols + ["shape", "dtype"]
    x_stats_df = pd.DataFrame(columns=cols,
                              index=np.arange(n_batches),
                              dtype=float)
    x_stats_df[min_cols] = x_mins_per_batch
    x_stats_df[avg_cols] = x_avgs_per_batch
    x_stats_df[max_cols] = x_maxs_per_batch
    x_stats_df["shape"] = x_shapes_per_batch
    x_stats_df["dtype"] = x_dtypes_per_batch
    x_stats_df.to_csv(os.path.join(log_dir, "x_stats_df.csv"), index=False)

    # Save figure with sample mins, avgs, maxs
    logger.debug("Save figure with sample mins, avgs, maxs")
    fig = create_fig_samples_min_avg_max_per_batch(x_stats_df, min_cols,
                                                   avg_cols, max_cols)
    fig.savefig(os.path.join(log_dir, "samples_min_avg_max_per_batch.png"))

    logger.debug("Save figure with sample shapes")
    fig = create_fig_samples_param_per_batch(x_stats_df, "shape")
    fig.savefig(os.path.join(log_dir, "samples_shape_per_batch.png"))

    logger.debug("Save figure with sample dtypes")
    fig = create_fig_samples_param_per_batch(x_stats_df, "dtype")
    fig.savefig(os.path.join(log_dir, "samples_dtype_per_batch.png"))
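
# A minimal sketch of the `create_dataflow_checker` helper assumed above (its
# definition is not shown here): an ignite Engine whose update function does
# no model work and simply returns each (x, y) batch, so the handlers attached
# in `run` can inspect every iteration's data.
from ignite.engine import Engine

def create_dataflow_checker():
    def _pass_through(engine, batch):
        # The iteration "output" is the raw batch itself.
        return batch
    return Engine(_pass_through)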
Example #2
def run(config_file):
    print("--- iMaterialist 2018 : Meta-learner training --- ")

    print("Load config file ... ")
    config = load_config(config_file)

    seed = config.get("SEED", 2018)
    random.seed(seed)

    output = Path(config["OUTPUT_PATH"])
    model = config["MODEL"]
    # MODEL is the estimator class here (see `estimator_cls` below)
    model_name = getattr(model, "__name__", model.__class__.__name__)
    debug = config.get("DEBUG", False)

    from datetime import datetime
    now = datetime.now()
    log_dir = output / ("training_meta_{}_{}".format(
        model_name, now.strftime("%Y%m%d_%H%M")))
    assert not log_dir.exists(), \
        "Output logging directory '{}' already existing".format(log_dir)
    log_dir.mkdir(parents=True)

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("iMaterialist 2018: Train meta-learner")
    setup_logger(logger, (log_dir / "train.log").as_posix(), log_level)

    save_conf(config_file, log_dir.as_posix(), logger)

    X = config["X"]
    y = config["Y"]

    n_trials = config["N_TRIALS"]
    scorings = config["SCORINGS"]
    cv = config["CV_SPLIT"]
    estimator_cls = config["MODEL"]
    model_params = config["MODEL_PARAMS"]
    model_hp_params = config["MODEL_HP_PARAMS"]
    model_hp_params.update(model_params)
    fit_params = config["FIT_PARAMS"]
    n_jobs = config["N_JOBS"]

    def hp_score(model_hp_params):

        estimator = estimator_cls(**model_hp_params)

        scores = cross_validate(estimator,
                                X,
                                y,
                                cv=cv,
                                scoring=scorings,
                                fit_params=fit_params,
                                n_jobs=n_jobs,
                                verbose=debug)

        logger.info("CV scores:")
        for scoring in scorings:
            logger.info("{} : {}".format(scoring, scores[scoring].tolist()))

        return {'loss': scores[scorings[0]], 'status': STATUS_OK}

    logger.debug("Start training: {} epochs".format(n_trials))
    try:
        best_params, trials = hp_optimize(hp_score,
                                          model_hp_params,
                                          max_evals=n_trials)
        best_params.update(model_params)

        save_params(best_params, (log_dir / "best_params.json").as_posix())

        logger.info("Best parameters: \n{}".format(best_params))
        logger.info("Best trial : \n{}".format(trials.best_trial))

        logger.info("Train meta model on complete dataset")
        estimator = estimator_cls(**best_params)
        estimator.fit(X, y)

        save_model(estimator, (log_dir / "best_model.pkl").as_posix())

    except KeyboardInterrupt:
        logger.info("Catched KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")

    logger.debug("Training is ended")
Example #3
def run(config_file):
    print("--- iMaterialist 2018 : Training --- ")

    print("Load config file ... ")
    config = load_config(config_file)

    seed = config.get("SEED", 2018)
    random.seed(seed)
    torch.manual_seed(seed)

    output = Path(config["OUTPUT_PATH"])
    debug = config.get("DEBUG", False)

    from datetime import datetime
    now = datetime.now()
    log_dir = output / Path(config_file).stem / now.strftime("%Y%m%d_%H%M")
    assert not log_dir.exists(), \
        "Output logging directory '{}' already exists".format(log_dir)
    log_dir.mkdir(parents=True)

    shutil.copyfile(config_file, (log_dir / Path(config_file).name).as_posix())

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("iMaterialist 2018: Train")
    setup_logger(logger, (log_dir / "train.log").as_posix(), log_level)

    logger.debug("Setup tensorboard writer")
    writer = SummaryWriter(log_dir=(log_dir / "tensorboard").as_posix())

    save_conf(config_file, log_dir.as_posix(), logger, writer)

    model = config["MODEL"]
    model_name = model.__class__.__name__

    device = config.get("DEVICE", 'cuda')
    if 'cuda' in device:
        assert torch.cuda.is_available(), \
            "Device '{}' requested but CUDA is not available".format(device)
        from torch.backends import cudnn
        cudnn.benchmark = True
        logger.debug("CUDA is enabled")
        model = model.to(device)

    logger.debug("Setup train/val dataloaders")
    train_loader, val_loader = config["TRAIN_LOADER"], config["VAL_LOADER"]

    # Setup training subset to run evaluation on:
    indices = np.arange(len(train_loader.sampler))
    np.random.shuffle(indices)
    indices = indices[:len(val_loader.sampler)] if len(
        val_loader.sampler) < len(train_loader.sampler) else indices
    train_eval_loader = get_train_eval_data_loader(train_loader, indices)

    logger.debug("- train data loader: {} batches | {} samples".format(
        len(train_loader), len(train_loader.sampler)))
    logger.debug("- train eval data loader: {} batches | {} samples".format(
        len(train_eval_loader), len(train_eval_loader.sampler)))
    logger.debug("- validation data loader: {} batches | {} samples".format(
        len(val_loader), len(val_loader.sampler)))

    # write_model_graph(writer, model=model, data_loader=train_loader, device=device)

    optimizer = config["OPTIM"]

    logger.debug("Setup criterion")
    criterion = config["CRITERION"]
    if "cuda" in device and isinstance(criterion, nn.Module):
        criterion = criterion.to(device)

    lr_schedulers = config.get("LR_SCHEDULERS")

    logger.debug("Setup ignite trainer and evaluator")
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)

    metrics = {
        'accuracy': CategoricalAccuracy(),
        'precision': Precision(),
        'recall': Recall(),
        'nll': Loss(criterion)
    }
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)

    val_metrics = {
        'accuracy': CategoricalAccuracy(),
        'precision': Precision(),
        'recall': Recall(),
        'nll': Loss(nn.CrossEntropyLoss())
    }
    val_evaluator = create_supervised_evaluator(model,
                                                metrics=val_metrics,
                                                device=device)

    logger.debug("Setup handlers")
    log_interval = config.get("LOG_INTERVAL", 100)
    reduce_on_plateau = config.get("REDUCE_LR_ON_PLATEAU")

    # Setup timer to measure training time
    timer = Timer(average=True)
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iteration = (engine.state.iteration - 1) % len(train_loader) + 1
        if iteration % log_interval == 0:
            logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.4f}".format(
                engine.state.epoch, iteration, len(train_loader),
                engine.state.output))

            writer.add_scalar("training/loss_vs_iterations",
                              engine.state.output, engine.state.iteration)

    @trainer.on(Events.EPOCH_STARTED)
    def update_lr_schedulers(engine):
        if lr_schedulers is not None:
            for lr_scheduler in lr_schedulers:
                lr_scheduler.step()

    @trainer.on(Events.EPOCH_STARTED)
    def log_lrs(engine):
        if len(optimizer.param_groups) == 1:
            lr = float(optimizer.param_groups[0]['lr'])
            writer.add_scalar("learning_rate", lr, engine.state.epoch)
            logger.debug("Learning rate: {}".format(lr))
        else:
            for i, param_group in enumerate(optimizer.param_groups):
                lr = float(param_group['lr'])
                logger.debug("Learning rate (group {}): {}".format(i, lr))
                writer.add_scalar("learning_rate_group_{}".format(i), lr,
                                  engine.state.epoch)

    log_images_dir = log_dir / "figures"
    log_images_dir.mkdir(parents=True)

    def log_precision_recall_results(metrics, epoch, mode):
        for metric_name in ['precision', 'recall']:
            value = metrics[metric_name]
            avg_value = torch.mean(value).item()
            writer.add_scalar("{}/avg_{}".format(mode, metric_name), avg_value,
                              epoch)
            # Save metric per class figure
            sorted_values = value.to('cpu').numpy()
            indices = np.argsort(sorted_values)
            sorted_values = sorted_values[indices]
            n_classes = len(sorted_values)
            classes = np.array(
                ["class_{}".format(i) for i in range(n_classes)])
            sorted_classes = classes[indices]
            fig = create_fig_param_per_class(sorted_values,
                                             metric_name,
                                             classes=sorted_classes,
                                             n_classes_per_fig=20)
            fname = log_images_dir / ("{}_{}_{}_per_class.png".format(
                mode, epoch, metric_name))
            fig.savefig(fname.as_posix())
            # Add figure in TB
            img = Image.open(fname.as_posix())
            tag = "{}_{}".format(mode, metric_name)
            writer.add_image(tag, np.asarray(img), epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_metrics(engine):
        epoch = engine.state.epoch
        logger.info("One epoch training time (seconds): {}".format(
            timer.value()))
        metrics = train_evaluator.run(train_eval_loader).metrics
        logger.info(
            "Training Results - Epoch: {}  Avg accuracy: {:.4f} Avg loss: {:.4f}"
            .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        writer.add_scalar("training/avg_accuracy", metrics['accuracy'], epoch)
        writer.add_scalar("training/avg_error", 1.0 - metrics['accuracy'],
                          epoch)
        writer.add_scalar("training/avg_loss", metrics['nll'], epoch)
        log_precision_recall_results(metrics, epoch, "training")

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        epoch = engine.state.epoch
        metrics = val_evaluator.run(val_loader).metrics
        writer.add_scalar("validation/avg_loss", metrics['nll'], epoch)
        writer.add_scalar("validation/avg_accuracy", metrics['accuracy'],
                          epoch)
        writer.add_scalar("validation/avg_error", 1.0 - metrics['accuracy'],
                          epoch)
        logger.info(
            "Validation Results - Epoch: {}  Avg accuracy: {:.4f} Avg loss: {:.4f}"
            .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        log_precision_recall_results(metrics, epoch, "validation")

    if reduce_on_plateau is not None:

        @val_evaluator.on(Events.COMPLETED)
        def update_reduce_on_plateau(engine):
            val_loss = engine.state.metrics['nll']
            reduce_on_plateau.step(val_loss)

    def score_function(engine):
        val_loss = engine.state.metrics['nll']
        # Objects with highest scores will be retained.
        return -val_loss

    # Setup early stopping:
    if "EARLY_STOPPING_KWARGS" in config:
        kwargs = config["EARLY_STOPPING_KWARGS"]
        if 'score_function' not in kwargs:
            kwargs['score_function'] = score_function
        handler = EarlyStopping(trainer=trainer, **kwargs)
        setup_logger(handler._logger, (log_dir / "train.log").as_posix(),
                     log_level)
        val_evaluator.add_event_handler(Events.COMPLETED, handler)

    # Setup model checkpoint:
    best_model_saver = ModelCheckpoint(log_dir.as_posix(),
                                       filename_prefix="model",
                                       score_name="val_loss",
                                       score_function=score_function,
                                       n_saved=5,
                                       atomic=True,
                                       create_dir=True)
    val_evaluator.add_event_handler(Events.COMPLETED, best_model_saver,
                                    {model_name: model})

    last_model_saver = ModelCheckpoint(log_dir.as_posix(),
                                       filename_prefix="checkpoint",
                                       save_interval=1,
                                       n_saved=1,
                                       atomic=True,
                                       create_dir=True)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, last_model_saver,
                              {model_name: model})

    # Setup custom event handlers:
    for (event, handler) in config["TRAINER_CUSTOM_EVENT_HANDLERS"]:
        trainer.add_event_handler(event, handler, val_evaluator, logger)

    for (event, handler) in config["EVALUATOR_CUSTOM_EVENT_HANDLERS"]:
        val_evaluator.add_event_handler(event, handler, trainer, logger)

    n_epochs = config["N_EPOCHS"]
    logger.info("Start training: {} epochs".format(n_epochs))
    try:
        trainer.run(train_loader, max_epochs=n_epochs)
    except KeyboardInterrupt:
        logger.info("Catched KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")

    logger.debug("Training is ended")
    writer.close()
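
# A minimal sketch of the `get_train_eval_data_loader` helper assumed above: it
# rebuilds a DataLoader over the training dataset restricted to `indices`, so
# that training metrics are computed on a subset comparable in size to the
# validation set (pin_memory is an assumption).
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

def get_train_eval_data_loader(train_loader, indices):
    return DataLoader(train_loader.dataset,
                      batch_size=train_loader.batch_size,
                      sampler=SubsetRandomSampler(indices),
                      num_workers=train_loader.num_workers,
                      pin_memory=True)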
Example #4
def run(config_file):

    print("--- Tiny ImageNet 200 Playground : Inference --- ")

    print("Load config file ... ")
    config = load_config(config_file)

    seed = config.get("SEED", 2018)
    random.seed(seed)
    torch.manual_seed(seed)

    output = config["OUTPUT_PATH"]
    model = config["MODEL"]
    model_name = model.__class__.__name__
    debug = config.get("DEBUG", False)

    from datetime import datetime
    now = datetime.now()
    log_dir = os.path.join(output, "inference_{}_{}".format(model_name, now.strftime("%Y%m%d_%H%M")))
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("Tiny ImageNet 200: Inference")
    setup_logger(logger, os.path.join(log_dir, "test.log"), log_level)

    logger.debug("Setup tensorboard writer")
    writer = SummaryWriter(log_dir=os.path.join(log_dir, "tensorboard"))

    save_conf(config_file, log_dir, logger, writer)

    device = 'cpu'
    if torch.cuda.is_available():
        logger.debug("CUDA is enabled")
        from torch.backends import cudnn
        cudnn.benchmark = True
        device = 'cuda'
        model = model.to(device)

    logger.debug("Setup test dataloader")
    dataset_path = config["DATASET_PATH"]
    test_data_transform = config["TEST_TRANSFORMS"]
    batch_size = config.get("BATCH_SIZE", 64)
    num_workers = config.get("NUM_WORKERS", 8)
    test_loader = get_test_data_loader(dataset_path, test_data_transform, batch_size, num_workers, device=device)

    logger.debug("Setup ignite trainer and evaluator")
    inferencer = create_inferencer(model, device=device)

    n_tta = config["N_TTA"]

    logger.debug("Setup handlers")
    # Setup timer to measure evaluation time
    timer = Timer(average=True)
    timer.attach(inferencer,
                 start=Events.STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED)

    n_samples = len(test_loader.dataset)
    files = np.zeros((n_samples, ), dtype=object)
    y_probas_tta = np.zeros((n_samples, 200, n_tta))

    @inferencer.on(Events.EPOCH_COMPLETED)
    def log_tta(engine):
        logger.debug("TTA {} / {}".format(engine.state.epoch, n_tta))

    @inferencer.on(Events.ITERATION_COMPLETED)
    def save_results(engine):
        output = engine.state.output
        tta_index = engine.state.epoch - 1
        start_index = ((engine.state.iteration - 1) % len(test_loader)) * batch_size
        end_index = min(start_index + batch_size, n_samples)
        batch_y_probas = output['y_pred'].detach().cpu().numpy()
        y_probas_tta[start_index:end_index, :, tta_index] = batch_y_probas
        if tta_index == 0:
            files[start_index:end_index] = output['files']

    logger.debug("Start inference")
    try:
        inferencer.run(test_loader, max_epochs=n_tta)
    except KeyboardInterrupt:
        logger.info("Catched KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")
        exit(1)
    writer.close()

    # Average probabilities:
    y_probas = np.mean(y_probas_tta, axis=-1)
    y_preds = np.argmax(y_probas, axis=-1)

    logger.info("Write submission file")
    submission_filepath = os.path.join(log_dir, "predictions.csv")
    write_submission(files, y_preds, test_loader.dataset.classes, submission_filepath)
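
# A minimal sketch of the `create_inferencer` helper assumed above: an ignite
# Engine that runs the model in eval mode under torch.no_grad() and returns
# softmax probabilities together with the sample files expected by
# `save_results` (the `(x, files)` batch structure is an assumption).
import torch
import torch.nn.functional as F
from ignite.engine import Engine

def create_inferencer(model, device='cpu'):
    def _inference(engine, batch):
        x, files = batch
        model.eval()
        with torch.no_grad():
            y_pred = F.softmax(model(x.to(device)), dim=1)
        return {'y_pred': y_pred.cpu(), 'files': files}
    return Engine(_inference)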
Example #5
def run(config_file):

    print("--- Tiny ImageNet 200 Playground : Training --- ")

    print("Load config file ... ")
    config = load_config(config_file)

    seed = config.get("SEED", 2018)
    random.seed(seed)
    torch.manual_seed(seed)

    output = config["OUTPUT_PATH"]
    model = config["MODEL"]
    model_name = model.__class__.__name__
    debug = config.get("DEBUG", False)

    from datetime import datetime
    now = datetime.now()
    log_dir = os.path.join(
        output, "training_{}_{}".format(model_name,
                                        now.strftime("%Y%m%d_%H%M")))
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("Tiny ImageNet 200: Train")
    setup_logger(logger, os.path.join(log_dir, "train.log"), log_level)

    logger.debug("Setup tensorboard writer")
    writer = SummaryWriter(log_dir=os.path.join(log_dir, "tensorboard"))

    save_conf(config_file, log_dir, logger, writer)

    device = 'cpu'
    if torch.cuda.is_available():
        logger.debug("CUDA is enabled")
        from torch.backends import cudnn
        cudnn.benchmark = True
        device = 'cuda'
        model = model.to(device)

    logger.debug("Setup train/val dataloaders")
    dataset_path = config["DATASET_PATH"]
    train_data_transform = config["TRAIN_TRANSFORMS"]
    val_data_transform = config["VAL_TRANSFORMS"]
    train_batch_size = config.get("BATCH_SIZE", 64)
    val_batch_size = config.get("VAL_BATCH_SIZE", train_batch_size)
    num_workers = config.get("NUM_WORKERS", 8)
    trainval_split = config.get("TRAINVAL_SPLIT", {
        'fold_index': 0,
        'n_splits': 7
    })
    train_loader, val_loader = get_trainval_data_loaders(dataset_path,
                                                         train_data_transform,
                                                         val_data_transform,
                                                         train_batch_size,
                                                         val_batch_size,
                                                         trainval_split,
                                                         num_workers,
                                                         device=device)

    indices = np.arange(len(train_loader.dataset))
    np.random.shuffle(indices)
    indices = indices[:len(val_loader.dataset)] if len(
        val_loader.dataset) < len(train_loader.dataset) else indices
    train_eval_loader = get_train_eval_data_loader(train_loader, indices)

    write_model_graph(writer,
                      model=model,
                      data_loader=train_loader,
                      device=device)

    optimizer = config["OPTIM"]

    logger.debug("Setup criterion")
    criterion = nn.CrossEntropyLoss()
    if 'cuda' in device:
        criterion = criterion.to(device)

    lr_schedulers = config.get("LR_SCHEDULERS")

    logger.debug("Setup ignite trainer and evaluator")
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)

    metrics = {
        'accuracy': CategoricalAccuracy(),
        'precision': Precision(),
        'recall': Recall(),
        'nll': Loss(criterion)
    }
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    val_evaluator = create_supervised_evaluator(model,
                                                metrics=metrics,
                                                device=device)

    logger.debug("Setup handlers")
    log_interval = config.get("LOG_INTERVAL", 100)
    reduce_on_plateau = config.get("REDUCE_LR_ON_PLATEAU")

    # Setup timer to measure training time
    timer = Timer(average=True)
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iteration = (engine.state.iteration - 1) % len(train_loader) + 1
        if iteration % log_interval == 0:
            logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.4f}".format(
                engine.state.epoch, iteration, len(train_loader),
                engine.state.output))

            writer.add_scalar("training/loss_vs_iterations",
                              engine.state.output, engine.state.iteration)

    @trainer.on(Events.EPOCH_STARTED)
    def update_lr_schedulers(engine):
        if lr_schedulers is not None:
            for lr_scheduler in lr_schedulers:
                lr_scheduler.step()

    @trainer.on(Events.EPOCH_STARTED)
    def log_lrs(engine):
        if len(optimizer.param_groups) == 1:
            lr = float(optimizer.param_groups[0]['lr'])
            writer.add_scalar("learning_rate", lr, engine.state.epoch)
            logger.debug("Learning rate: {}".format(lr))
        else:
            for i, param_group in enumerate(optimizer.param_groups):
                lr = float(param_group['lr'])
                logger.debug("Learning rate (group {}): {}".format(i, lr))
                writer.add_scalar("learning_rate_group_{}".format(i), lr,
                                  engine.state.epoch)

    log_images_dir = os.path.join(log_dir, "figures")
    os.makedirs(log_images_dir)

    def log_precision_recall_results(metrics, epoch, mode):
        for metric_name in ['precision', 'recall']:
            value = metrics[metric_name]
            avg_value = torch.mean(value).item()
            writer.add_scalar("{}/avg_{}".format(mode, metric_name), avg_value,
                              epoch)
            # Save metric per class figure
            sorted_values = value.to('cpu').numpy()
            indices = np.argsort(sorted_values)
            sorted_values = sorted_values[indices]
            n_classes = len(sorted_values)
            classes = np.array(
                ["class_{}".format(i) for i in range(n_classes)])
            sorted_classes = classes[indices]
            fig = create_fig_param_per_class(sorted_values,
                                             metric_name,
                                             classes=sorted_classes,
                                             n_classes_per_fig=20)
            fname = os.path.join(
                log_images_dir,
                "{}_{}_{}_per_class.png".format(mode, epoch, metric_name))
            fig.savefig(fname)
            # Add figure in TB
            img = Image.open(fname)
            tag = "{}_{}".format(mode, metric_name)
            writer.add_image(tag, np.asarray(img), epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_metrics(engine):
        epoch = engine.state.epoch
        logger.info("One epoch training time (seconds): {}".format(
            timer.value()))
        metrics = train_evaluator.run(train_eval_loader).metrics
        logger.info(
            "Training Results - Epoch: {}  Avg accuracy: {:.4f} Avg loss: {:.4f}"
            .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        writer.add_scalar("training/avg_accuracy", metrics['accuracy'], epoch)
        writer.add_scalar("training/avg_error", 1.0 - metrics['accuracy'],
                          epoch)
        writer.add_scalar("training/avg_loss", metrics['nll'], epoch)
        log_precision_recall_results(metrics, epoch, "training")

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        epoch = engine.state.epoch
        metrics = val_evaluator.run(val_loader).metrics
        writer.add_scalar("validation/avg_loss", metrics['nll'], epoch)
        writer.add_scalar("validation/avg_accuracy", metrics['accuracy'],
                          epoch)
        writer.add_scalar("validation/avg_error", 1.0 - metrics['accuracy'],
                          epoch)
        logger.info(
            "Validation Results - Epoch: {}  Avg accuracy: {:.4f} Avg loss: {:.4f}"
            .format(engine.state.epoch, metrics['accuracy'], metrics['nll']))
        log_precision_recall_results(metrics, epoch, "validation")

    if reduce_on_plateau is not None:

        @val_evaluator.on(Events.COMPLETED)
        def update_reduce_on_plateau(engine):
            val_loss = engine.state.metrics['nll']
            reduce_on_plateau.step(val_loss)

    def score_function(engine):
        val_loss = engine.state.metrics['nll']
        # Objects with highest scores will be retained.
        return -val_loss

    # Setup early stopping:
    if "EARLY_STOPPING_KWARGS" in config:
        kwargs = config["EARLY_STOPPING_KWARGS"]
        if 'score_function' not in kwargs:
            kwargs['score_function'] = score_function
        handler = EarlyStopping(trainer=trainer, **kwargs)
        setup_logger(handler._logger, os.path.join(log_dir, "train.log"),
                     log_level)
        val_evaluator.add_event_handler(Events.COMPLETED, handler)

    # Setup model checkpoint:
    best_model_saver = ModelCheckpoint(log_dir,
                                       filename_prefix="model",
                                       score_name="val_loss",
                                       score_function=score_function,
                                       n_saved=5,
                                       atomic=True,
                                       create_dir=True)
    val_evaluator.add_event_handler(Events.COMPLETED, best_model_saver,
                                    {model_name: model})

    last_model_saver = ModelCheckpoint(log_dir,
                                       filename_prefix="checkpoint",
                                       save_interval=1,
                                       n_saved=1,
                                       atomic=True,
                                       create_dir=True)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, last_model_saver,
                              {model_name: model})

    n_epochs = config["N_EPOCHS"]
    logger.info("Start training: {} epochs".format(n_epochs))
    try:
        trainer.run(train_loader, max_epochs=n_epochs)
    except KeyboardInterrupt:
        logger.info("Catched KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")

    logger.debug("Training is ended")
    writer.close()
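
# A minimal sketch of the `setup_logger` helper used throughout these scripts:
# it attaches a console handler and a file handler sharing a common format
# (the exact format string is an assumption).
import logging

def setup_logger(logger, log_filepath, level=logging.INFO):
    logger.setLevel(level)
    formatter = logging.Formatter(
        "%(asctime)s|%(name)s|%(levelname)s| %(message)s")
    for handler in (logging.StreamHandler(), logging.FileHandler(log_filepath)):
        handler.setLevel(level)
        handler.setFormatter(formatter)
        logger.addHandler(handler)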
Example #6
def run(config_file):

    print("--- iMaterialist 2018 : Inference --- ")

    print("Load config file ... ")
    config = load_config(config_file)

    seed = config.get("SEED", 2018)
    random.seed(seed)
    torch.manual_seed(seed)

    output = Path(config["OUTPUT_PATH"])
    debug = config.get("DEBUG", False)

    from datetime import datetime
    now = datetime.now()
    # log_dir = output / "inference_{}_{}".format(model_name, now.strftime("%Y%m%d_%H%M"))
    log_dir = output / Path(config_file).stem / now.strftime("%Y%m%d_%H%M")
    assert not log_dir.exists(), \
        "Output logging directory '{}' already exists".format(log_dir)
    log_dir.mkdir(parents=True)

    shutil.copyfile(config_file, (log_dir / Path(config_file).name).as_posix())

    log_level = logging.INFO
    if debug:
        log_level = logging.DEBUG
        print("Activated debug mode")

    logger = logging.getLogger("iMaterialist 2018: Inference")
    setup_logger(logger, (log_dir / "predict.log").as_posix(), log_level)

    logger.debug("Setup tensorboard writer")
    writer = SummaryWriter(log_dir=(log_dir / "tensorboard").as_posix())

    save_conf(config_file, log_dir.as_posix(), logger, writer)

    model = config["MODEL"]
    device = config.get("DEVICE", 'cuda')
    if 'cuda' in device:
        assert torch.cuda.is_available(), \
            "Device '{}' requested but CUDA is not available".format(device)
        from torch.backends import cudnn
        cudnn.benchmark = True
        logger.debug("CUDA is enabled")
        model = model.to(device)

    logger.debug("Setup test dataloader")
    test_loader = config["TEST_LOADER"]

    logger.debug("Setup ignite inferencer")
    inferencer = create_inferencer(model, device=device)

    n_tta = config["N_TTA"]
    n_classes = config["N_CLASSES"]
    batch_size = test_loader.batch_size

    logger.debug("Setup handlers")
    # Setup timer to measure evaluation time
    timer = Timer(average=True)
    timer.attach(inferencer,
                 start=Events.STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED)

    n_samples = len(test_loader.dataset)
    indices = np.zeros((n_samples, ), dtype=int)
    y_probas_tta = np.zeros((n_samples, n_classes, n_tta))

    @inferencer.on(Events.EPOCH_COMPLETED)
    def log_tta(engine):
        logger.debug("TTA {} / {}".format(engine.state.epoch, n_tta))

    @inferencer.on(Events.ITERATION_COMPLETED)
    def save_results(engine):
        output = engine.state.output
        tta_index = engine.state.epoch - 1
        start_index = (
            (engine.state.iteration - 1) % len(test_loader)) * batch_size
        end_index = min(start_index + batch_size, n_samples)
        batch_y_probas = output['y_pred'].detach().cpu().numpy()
        y_probas_tta[start_index:end_index, :, tta_index] = batch_y_probas
        if tta_index == 0:
            indices[start_index:end_index] = output['indices']

    logger.info("Start inference")
    try:
        inferencer.run(test_loader, max_epochs=n_tta)
    except KeyboardInterrupt:
        logger.info("Catched KeyboardInterrupt -> exit")
        return
    except Exception as e:  # noqa
        logger.exception("")
        if debug:
            try:
                # open an ipython shell if possible
                import IPython
                IPython.embed()  # noqa
            except ImportError:
                print("Failed to start IPython console")
        return

    # Average probabilities:
    y_probas = np.mean(y_probas_tta, axis=-1)

    if config["SAVE_PROBAS"]:
        logger.info("Write probabilities file")
        probas_filepath = log_dir / "probas.csv"
        write_probas(indices, y_probas, probas_filepath)
    else:
        y_preds = np.argmax(y_probas, axis=-1) + 1  # as labels are one-based
        logger.info("Write submission file")
        submission_filepath = log_dir / "predictions.csv"
        sample_submission_path = config["SAMPLE_SUBMISSION_PATH"]
        write_submission(indices, y_preds, sample_submission_path,
                         submission_filepath)
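
# A minimal sketch of the `write_submission` helper assumed above for the
# iMaterialist format: it loads the provided sample submission, fills in the
# predictions for the rows identified by `indices`, and writes the result
# (the `id`/`predicted` column names are an assumption).
import pandas as pd

def write_submission(indices, y_preds, sample_submission_path, output_filepath):
    df = pd.read_csv(sample_submission_path, index_col='id')
    df.loc[indices, 'predicted'] = y_preds
    df.to_csv(output_filepath)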