def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    """Train a FushionNet with SGD + MultiStepLR, reporting progress via tqdm.

    Args:
        train_batch_size: batch size for the training loader.
        val_batch_size: batch size for the validation loader.
        epochs: number of epochs to train for.
        lr: initial SGD learning rate.
        momentum: SGD momentum.
        log_interval: refresh the progress bar every this many iterations.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = FushionNet()
    # Optionally resume from a checkpoint instead:
    # model = torch.load(SAVE_PATH + "350-0.908.pth")

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Move the model to the target device *before* building the optimizer so
    # the optimizer state is created for the device-resident parameters
    # (same convention as the other training entry points in this file).
    model.to(device)
    optimizer = optim.SGD(model.parameters(),
                          lr=lr,
                          momentum=momentum,
                          weight_decay=2e-6,
                          nesterov=False)
    # Decay lr by 10x at each listed epoch milestone.
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               [50, 100, 150, 200, 250, 300],
                                               0.1)
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        F.nll_loss,
                                        device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy': Accuracy(),
                                                'nll': Loss(F.nll_loss)
                                            },
                                            device=device)

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # 1-based iteration index within the current epoch (renamed from
        # `iter` to avoid shadowing the builtin).
        it = (engine.state.iteration - 1) % len(train_loader) + 1

        if it % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        current_lr = optimizer.param_groups[0]['lr']
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f} Current lr: {:.6f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll, current_lr))
        scheduler.step()

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        # Rewind the bar for the next epoch.
        pbar.n = pbar.last_print_n = 0
        # Snapshot the full model every 10 epochs, tagged with its accuracy.
        if (engine.state.epoch % 10 == 0):
            torch.save(
                model, SAVE_PATH + str(engine.state.epoch) + "-" +
                str(avg_accuracy) + ".pth")

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
def run(train_batch_size, val_batch_size, epochs, lr, momentum):
    """Train ``Net`` with SGD / cross-entropy and report everything to Trains.

    Batch losses, per-epoch metrics, optimizer parameters, weight/gradient
    statistics and the best checkpoint (by validation accuracy) are all
    streamed through a ``TrainsLogger`` task.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Net()
    model.to(device)  # move the model before the optimizer sees its parameters

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()

    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    trainer.logger = setup_logger("Trainer")

    eval_metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model, metrics=eval_metrics, device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model, metrics=eval_metrics, device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    def compute_metrics(engine):
        # Evaluate both splits once per training epoch.
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    trainer.add_event_handler(Events.EPOCH_COMPLETED, compute_metrics)

    trains_logger = TrainsLogger(project_name="examples", task_name="ignite")

    # Raw batch loss, sampled every 100 iterations.
    trains_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )

    # Per-epoch evaluation metrics for both evaluators.
    for tag, evaluator in (("training metrics", train_evaluator), ("validation metrics", validation_evaluator)):
        trains_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    trains_logger.attach_opt_params_handler(
        trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer
    )

    # Weight/gradient scalars every 100 iterations; histograms every 100 epochs.
    trains_logger.attach(
        trainer, log_handler=WeightsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100)
    )
    trains_logger.attach(
        trainer, log_handler=WeightsHistHandler(model), event_name=Events.EPOCH_COMPLETED(every=100)
    )
    trains_logger.attach(
        trainer, log_handler=GradsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100)
    )
    trains_logger.attach(
        trainer, log_handler=GradsHistHandler(model), event_name=Events.EPOCH_COMPLETED(every=100)
    )

    # Keep the single best model by validation accuracy.
    best_ckpt = Checkpoint(
        {"model": model},
        TrainsSaver(),
        n_saved=1,
        score_function=lambda e: e.state.metrics["accuracy"],
        score_name="val_acc",
        filename_prefix="best",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.EPOCH_COMPLETED, best_ckpt)

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    trains_logger.close()
예제 #3
0
def training(local_rank, config):
    """Per-process entry point for (distributed) CIFAR10 QAT training.

    Args:
        local_rank: local process index; used for the logger rank and for
            naming the CUDA device on rank 0.
        config: mutable dict of run settings (seed, output_path, model,
            num_epochs, validate_every, with_clearml, ...). Note this function
            writes back into it (``output_path``, ``num_iters_per_epoch``).
    """
    # Different seed per process so each rank gets distinct random streams.
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-QAT-Training",
                          distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        # Only rank 0 creates the run folder and talks to ClearML/Trains.
        now = datetime.now().strftime("%Y%m%d-%H%M%S")

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    # Downstream setup (e.g. warmup schedules) can read the epoch length.
    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they wont have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        # Evaluate train and test splits; `state.times["COMPLETED"]` is the
        # evaluation wall-clock time reported to log_metrics.
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    # Validate every `validate_every` epochs and once more when training ends.
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store 2 best models by validation accuracy starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        get_save_handler(config),
        filename_prefix="best",
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    # Event filter: checkpointing only kicks in during the second half of training.
    evaluator.add_event_handler(
        Events.COMPLETED(
            lambda *_: trainer.state.epoch > config["num_epochs"] // 2),
        best_model_handler)

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        # Make sure the traceback lands in the run log before re-raising.
        logger.exception("")
        raise e

    if rank == 0:
        tb_logger.close()
예제 #4
0
def fit_model(model, train_loader, test_loader, lr, max_epochs=5):
    """Fit ``model`` with SGD and BCE-with-logits, printing metrics periodically.

    Training metrics are printed every 10 iterations and validation metrics
    every 10 epochs. The model is trained in place and returned.
    """
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    criterion = torch.nn.BCEWithLogitsLoss()

    def threshold_output_transform(output):
        # Binarize raw logits at 0 so Accuracy sees hard predictions.
        preds, targets = output
        preds = torch.heaviside(preds, values=torch.zeros(1))
        return preds, targets

    def prepare_batch(batch, device, non_blocking):
        # Cast to float and give targets an explicit trailing unit dimension.
        features, labels = batch
        labels = torch.unsqueeze(labels.float(), 1)
        return (features.float(), labels)

    def squeeze_y_dims(output):
        # Pass-through hook for shaping (prediction, target) before the loss metric.
        preds, targets = output
        return preds, targets

    trainer = create_supervised_trainer(
        model, optimizer, criterion, prepare_batch=prepare_batch)

    val_metrics = {
        "accuracy": Accuracy(threshold_output_transform),
        "bce": Loss(criterion, output_transform=squeeze_y_dims),
    }

    evaluator = create_supervised_evaluator(
        model, metrics=val_metrics, prepare_batch=prepare_batch)

    @trainer.on(Events.ITERATION_COMPLETED(every=10))
    def report_training_metrics(trainer):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        print(
            f"Training Results - Epoch: {trainer.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f}"
        )

    @trainer.on(Events.EPOCH_COMPLETED(every=10))
    def report_validation_metrics(trainer):
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        print(
            f"Validation Results - Epoch: {trainer.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f}"
        )

    trainer.run(train_loader, max_epochs=max_epochs)

    return model
    def __call__(self, model, train_dataset, val_dataset=None, **_):
        """Train a PyTorch model.

        Driven entirely by ``self.train_params``; sets up an ignite trainer
        with optional scheduler, checkpointing, early stopping, time limit,
        progress reporting and MLflow logging.

        Args:
            model (torch.nn.Module): PyTorch model to train.
            train_dataset (torch.utils.data.Dataset): Dataset used to train.
            val_dataset (torch.utils.data.Dataset, optional): Dataset used to validate.

        Returns:
            trained_model (torch.nn.Module): Trained PyTorch model.
        """
        assert train_dataset is not None
        train_params = self.train_params
        mlflow_logging = self.mlflow_logging

        # MLflow logging is best-effort: silently downgraded if unavailable.
        if mlflow_logging:
            try:
                import mlflow  # NOQA
            except ImportError:
                log.warning(
                    "Failed to import mlflow. MLflow logging is disabled.")
                mlflow_logging = False

        # --- Unpack training parameters (loss_fn and optimizer are mandatory).
        loss_fn = train_params.get("loss_fn")
        assert loss_fn
        epochs = train_params.get("epochs")
        seed = train_params.get("seed")
        optimizer = train_params.get("optimizer")
        assert optimizer
        optimizer_params = train_params.get("optimizer_params", dict())
        # Optionally cap dataset sizes (e.g. for smoke runs).
        train_dataset_size_limit = train_params.get("train_dataset_size_limit")
        if train_dataset_size_limit:
            train_dataset = PartialDataset(train_dataset,
                                           train_dataset_size_limit)
            log.info("train dataset size is set to {}".format(
                len(train_dataset)))

        val_dataset_size_limit = train_params.get("val_dataset_size_limit")
        if val_dataset_size_limit and (val_dataset is not None):
            val_dataset = PartialDataset(val_dataset, val_dataset_size_limit)
            log.info("val dataset size is set to {}".format(len(val_dataset)))

        train_data_loader_params = train_params.get("train_data_loader_params",
                                                    dict())
        val_data_loader_params = train_params.get("val_data_loader_params",
                                                  dict())
        evaluation_metrics = train_params.get("evaluation_metrics")
        evaluate_train_data = train_params.get("evaluate_train_data")
        evaluate_val_data = train_params.get("evaluate_val_data")
        progress_update = train_params.get("progress_update")

        scheduler = train_params.get("scheduler")
        scheduler_params = train_params.get("scheduler_params", dict())

        model_checkpoint = train_params.get("model_checkpoint")
        model_checkpoint_params = train_params.get("model_checkpoint_params")
        early_stopping_params = train_params.get("early_stopping_params")
        time_limit = train_params.get("time_limit")

        cudnn_deterministic = train_params.get("cudnn_deterministic")
        cudnn_benchmark = train_params.get("cudnn_benchmark")

        # --- Reproducibility / cuDNN tuning knobs.
        if seed:
            torch.manual_seed(seed)
            np.random.seed(seed)
        if cudnn_deterministic:
            torch.backends.cudnn.deterministic = cudnn_deterministic
        if cudnn_benchmark:
            torch.backends.cudnn.benchmark = cudnn_benchmark

        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        # `optimizer` is a class/factory; instantiate it on the model's params.
        optimizer_ = optimizer(model.parameters(), **optimizer_params)
        trainer = create_supervised_trainer(model,
                                            optimizer_,
                                            loss_fn=loss_fn,
                                            device=device)

        train_data_loader_params.setdefault("shuffle", True)
        train_data_loader_params.setdefault("drop_last", True)
        train_data_loader_params["batch_size"] = _clip_batch_size(
            train_data_loader_params.get("batch_size", 1), train_dataset,
            "train")
        train_loader = DataLoader(train_dataset, **train_data_loader_params)

        # Exponential moving average of the loss, used for reporting and as
        # the default checkpoint/early-stopping score.
        RunningAverage(output_transform=lambda x: x,
                       alpha=0.98).attach(trainer, "ema_loss")

        # With alpha this close to 0 the "average" effectively tracks the
        # raw per-batch loss.
        RunningAverage(output_transform=lambda x: x,
                       alpha=2**(-1022)).attach(trainer, "batch_loss")

        if scheduler:

            # Mix in metric-style reporting so the scheduled parameter
            # (e.g. lr) appears in engine metrics.
            class ParamSchedulerSavingAsMetric(
                    ParamSchedulerSavingAsMetricMixIn, scheduler):
                pass

            # Default cycle size: `cycle_epochs` worth of iterations.
            cycle_epochs = scheduler_params.pop("cycle_epochs", 1)
            scheduler_params.setdefault("cycle_size",
                                        int(cycle_epochs * len(train_loader)))
            scheduler_params.setdefault("param_name", "lr")
            scheduler_ = ParamSchedulerSavingAsMetric(optimizer_,
                                                      **scheduler_params)
            trainer.add_event_handler(Events.ITERATION_STARTED, scheduler_)

        if evaluate_train_data:
            evaluator_train = create_supervised_evaluator(
                model, metrics=evaluation_metrics, device=device)

        if evaluate_val_data:
            val_data_loader_params["batch_size"] = _clip_batch_size(
                val_data_loader_params.get("batch_size", 1), val_dataset,
                "val")
            val_loader = DataLoader(val_dataset, **val_data_loader_params)
            evaluator_val = create_supervised_evaluator(
                model, metrics=evaluation_metrics, device=device)

        if model_checkpoint_params:
            assert isinstance(model_checkpoint_params, dict)
            minimize = model_checkpoint_params.pop("minimize", True)
            save_interval = model_checkpoint_params.get("save_interval", None)
            if not save_interval:
                # No fixed interval: checkpoint by EMA-loss score instead.
                model_checkpoint_params.setdefault(
                    "score_function",
                    get_score_function("ema_loss", minimize=minimize))
            model_checkpoint_params.setdefault("score_name", "ema_loss")
            mc = model_checkpoint(**model_checkpoint_params)
            trainer.add_event_handler(Events.EPOCH_COMPLETED, mc,
                                      {"model": model})

        if early_stopping_params:
            assert isinstance(early_stopping_params, dict)
            metric = early_stopping_params.pop("metric", None)
            assert (metric is None) or (metric in evaluation_metrics)
            minimize = early_stopping_params.pop("minimize", False)
            if metric:
                assert (
                    "score_function" not in early_stopping_params
                ), "Remove either 'metric' or 'score_function' from early_stopping_params: {}".format(
                    early_stopping_params)
                early_stopping_params["score_function"] = get_score_function(
                    metric, minimize=minimize)

            # Attach early stopping to whichever evaluator actually runs.
            es = EarlyStopping(trainer=trainer, **early_stopping_params)
            if evaluate_val_data:
                evaluator_val.add_event_handler(Events.COMPLETED, es)
            elif evaluate_train_data:
                evaluator_train.add_event_handler(Events.COMPLETED, es)
            elif early_stopping_params:
                # NOTE(review): this condition is always true inside the outer
                # `if early_stopping_params:` guard, so it acts as an `else`.
                log.warning(
                    "Early Stopping is disabled because neither "
                    "evaluate_val_data nor evaluate_train_data is set True.")

        if time_limit:
            assert isinstance(time_limit, (int, float))
            tl = TimeLimit(limit_sec=time_limit)
            trainer.add_event_handler(Events.ITERATION_COMPLETED, tl)

        # --- Progress reporting: tqdm-style ProgressBar or plain log lines.
        pbar = None
        if progress_update:
            if not isinstance(progress_update, dict):
                progress_update = dict()
            progress_update.setdefault("persist", True)
            progress_update.setdefault("desc", "")
            pbar = ProgressBar(**progress_update)
            pbar.attach(trainer, ["ema_loss"])

        else:

            def log_train_metrics(engine):
                log.info("[Epoch: {} | {}]".format(engine.state.epoch,
                                                   engine.state.metrics))

            trainer.add_event_handler(Events.EPOCH_COMPLETED,
                                      log_train_metrics)

        if evaluate_train_data:

            def log_evaluation_train_data(engine):
                evaluator_train.run(train_loader)
                train_report = _get_report_str(engine, evaluator_train,
                                               "Train Data")
                if pbar:
                    pbar.log_message(train_report)
                else:
                    log.info(train_report)

            # `evaluate_train_data` may name an ignite event; per-epoch default.
            eval_train_event = (Events[evaluate_train_data] if isinstance(
                evaluate_train_data, str) else Events.EPOCH_COMPLETED)
            trainer.add_event_handler(eval_train_event,
                                      log_evaluation_train_data)

        if evaluate_val_data:

            def log_evaluation_val_data(engine):
                evaluator_val.run(val_loader)
                val_report = _get_report_str(engine, evaluator_val, "Val Data")
                if pbar:
                    pbar.log_message(val_report)
                else:
                    log.info(val_report)

            eval_val_event = (Events[evaluate_val_data] if isinstance(
                evaluate_val_data, str) else Events.EPOCH_COMPLETED)
            trainer.add_event_handler(eval_val_event, log_evaluation_val_data)

        if mlflow_logging:
            mlflow_logger = MLflowLogger()

            # One-time run parameters (dataset sizes, versions, config dicts).
            logging_params = {
                "train_n_samples": len(train_dataset),
                "train_n_batches": len(train_loader),
                "optimizer": _name(optimizer),
                "loss_fn": _name(loss_fn),
                "pytorch_version": torch.__version__,
                "ignite_version": ignite.__version__,
            }
            logging_params.update(_loggable_dict(optimizer_params,
                                                 "optimizer"))
            logging_params.update(
                _loggable_dict(train_data_loader_params, "train"))
            if scheduler:
                logging_params.update({"scheduler": _name(scheduler)})
                logging_params.update(
                    _loggable_dict(scheduler_params, "scheduler"))

            if evaluate_val_data:
                logging_params.update({
                    "val_n_samples": len(val_dataset),
                    "val_n_batches": len(val_loader),
                })
                logging_params.update(
                    _loggable_dict(val_data_loader_params, "val"))

            mlflow_logger.log_params(logging_params)

            # Per-iteration metrics (plus the scheduled param if any).
            batch_metric_names = ["batch_loss", "ema_loss"]
            if scheduler:
                batch_metric_names.append(scheduler_params.get("param_name"))

            mlflow_logger.attach(
                trainer,
                log_handler=OutputHandler(
                    tag="step",
                    metric_names=batch_metric_names,
                    global_step_transform=global_step_from_engine(trainer),
                ),
                event_name=Events.ITERATION_COMPLETED,
            )

            if evaluate_train_data:
                mlflow_logger.attach(
                    evaluator_train,
                    log_handler=OutputHandler(
                        tag="train",
                        metric_names=list(evaluation_metrics.keys()),
                        global_step_transform=global_step_from_engine(trainer),
                    ),
                    event_name=Events.COMPLETED,
                )
            if evaluate_val_data:
                mlflow_logger.attach(
                    evaluator_val,
                    log_handler=OutputHandler(
                        tag="val",
                        metric_names=list(evaluation_metrics.keys()),
                        global_step_transform=global_step_from_engine(trainer),
                    ),
                    event_name=Events.COMPLETED,
                )

        trainer.run(train_loader, max_epochs=epochs)

        # Close the progress bar defensively; failures here must not mask training.
        try:
            if pbar and pbar.pbar:
                pbar.pbar.close()
        except Exception as e:
            log.error(e, exc_info=True)

        # Reload the latest checkpointed weights before returning.
        model = load_latest_model(model_checkpoint_params)(model)

        return model
예제 #6
0
def main():
    """Train a 3D UNet on synthetic Nifti volumes with MONAI + ignite.

    Generates 40 random image/segmentation pairs in a temp dir (20 train /
    20 val), trains for 5 epochs with Dice loss, and wires up checkpointing,
    stats/TensorBoard logging, validation and early stopping via ignite
    handlers. The temp dir is removed at the end.
    """
    monai.config.print_config()
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # create a temporary directory and 40 random image, mask pairs
    tempdir = tempfile.mkdtemp()
    print(f"generating synthetic data to {tempdir} (this may take a while)")
    for i in range(40):
        im, seg = create_test_image_3d(128,
                                       128,
                                       128,
                                       num_seg_classes=1,
                                       channel_dim=-1)

        n = nib.Nifti1Image(im, np.eye(4))
        nib.save(n, os.path.join(tempdir, f"img{i:d}.nii.gz"))

        n = nib.Nifti1Image(seg, np.eye(4))
        nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz"))

    # First 20 pairs train, last 20 validate.
    images = sorted(glob(os.path.join(tempdir, "img*.nii.gz")))
    segs = sorted(glob(os.path.join(tempdir, "seg*.nii.gz")))
    train_files = [{
        "img": img,
        "seg": seg
    } for img, seg in zip(images[:20], segs[:20])]
    val_files = [{
        "img": img,
        "seg": seg
    } for img, seg in zip(images[-20:], segs[-20:])]

    # define transforms for image and segmentation
    train_transforms = Compose([
        LoadNiftid(keys=["img", "seg"]),
        AsChannelFirstd(keys=["img", "seg"], channel_dim=-1),
        ScaleIntensityd(keys=["img", "seg"]),
        RandCropByPosNegLabeld(keys=["img", "seg"],
                               label_key="seg",
                               size=[96, 96, 96],
                               pos=1,
                               neg=1,
                               num_samples=4),
        RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 2]),
        ToTensord(keys=["img", "seg"]),
    ])
    val_transforms = Compose([
        LoadNiftid(keys=["img", "seg"]),
        AsChannelFirstd(keys=["img", "seg"], channel_dim=-1),
        ScaleIntensityd(keys=["img", "seg"]),
        ToTensord(keys=["img", "seg"]),
    ])

    # define dataset, data loader
    check_ds = monai.data.Dataset(data=train_files, transform=train_transforms)
    # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training
    check_loader = DataLoader(check_ds,
                              batch_size=2,
                              num_workers=4,
                              collate_fn=list_data_collate,
                              pin_memory=torch.cuda.is_available())
    # sanity-check one batch's shapes before committing to training
    check_data = monai.utils.misc.first(check_loader)
    print(check_data["img"].shape, check_data["seg"].shape)

    # create a training data loader
    train_ds = monai.data.Dataset(data=train_files, transform=train_transforms)
    # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training
    train_loader = DataLoader(
        train_ds,
        batch_size=2,
        shuffle=True,
        num_workers=4,
        collate_fn=list_data_collate,
        pin_memory=torch.cuda.is_available(),
    )
    # create a validation data loader
    val_ds = monai.data.Dataset(data=val_files, transform=val_transforms)
    val_loader = DataLoader(val_ds,
                            batch_size=5,
                            num_workers=8,
                            collate_fn=list_data_collate,
                            pin_memory=torch.cuda.is_available())

    # create UNet, DiceLoss and Adam optimizer
    net = monai.networks.nets.UNet(
        dimensions=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    )
    loss = monai.losses.DiceLoss(do_sigmoid=True)
    lr = 1e-3
    opt = torch.optim.Adam(net.parameters(), lr)
    device = torch.device("cuda:0")

    # ignite trainer expects batch=(img, seg) and returns output=loss at every iteration,
    # user can add output_transform to return other values, like: y_pred, y, etc.
    def prepare_batch(batch, device=None, non_blocking=False):
        # Unpack the dict batch into the (input, target) tuple ignite expects.
        return _prepare_batch((batch["img"], batch["seg"]), device,
                              non_blocking)

    trainer = create_supervised_trainer(net,
                                        opt,
                                        loss,
                                        device,
                                        False,
                                        prepare_batch=prepare_batch)

    # adding checkpoint handler to save models (network params and optimizer stats) during training
    checkpoint_handler = ModelCheckpoint("./runs/",
                                         "net",
                                         n_saved=10,
                                         require_empty=False)
    trainer.add_event_handler(event_name=Events.EPOCH_COMPLETED,
                              handler=checkpoint_handler,
                              to_save={
                                  "net": net,
                                  "opt": opt
                              })

    # StatsHandler prints loss at every iteration and print metrics at every epoch,
    # we don't set metrics for trainer here, so just print loss, user can also customize print functions
    # and can use output_transform to convert engine.state.output if it's not loss value
    train_stats_handler = StatsHandler(name="trainer")
    train_stats_handler.attach(trainer)

    # TensorBoardStatsHandler plots loss at every iteration and plots metrics at every epoch, same as StatsHandler
    train_tensorboard_stats_handler = TensorBoardStatsHandler()
    train_tensorboard_stats_handler.attach(trainer)

    validation_every_n_iters = 5
    # set parameters for validation
    metric_name = "Mean_Dice"
    # add evaluation metric to the evaluator engine
    val_metrics = {metric_name: MeanDice(add_sigmoid=True, to_onehot_y=False)}

    # ignite evaluator expects batch=(img, seg) and returns output=(y_pred, y) at every iteration,
    # user can add output_transform to return other values
    evaluator = create_supervised_evaluator(net,
                                            val_metrics,
                                            device,
                                            True,
                                            prepare_batch=prepare_batch)

    @trainer.on(Events.ITERATION_COMPLETED(every=validation_every_n_iters))
    def run_validation(engine):
        evaluator.run(val_loader)

    # add early stopping handler to evaluator
    early_stopper = EarlyStopping(
        patience=4,
        score_function=stopping_fn_from_metric(metric_name),
        trainer=trainer)
    evaluator.add_event_handler(event_name=Events.EPOCH_COMPLETED,
                                handler=early_stopper)

    # add stats event handler to print validation stats via evaluator
    val_stats_handler = StatsHandler(
        name="evaluator",
        output_transform=lambda x:
        None,  # no need to print loss value, so disable per iteration output
        global_epoch_transform=lambda x: trainer.state.epoch,
    )  # fetch global epoch number from trainer
    val_stats_handler.attach(evaluator)

    # add handler to record metrics to TensorBoard at every validation epoch
    val_tensorboard_stats_handler = TensorBoardStatsHandler(
        output_transform=lambda x:
        None,  # no need to plot loss value, so disable per iteration output
        global_epoch_transform=lambda x: trainer.state.iteration,
    )  # fetch global iteration number from trainer
    val_tensorboard_stats_handler.attach(evaluator)

    # add handler to draw the first image and the corresponding label and model output in the last batch
    # here we draw the 3D output as GIF format along the depth axis, every 2 validation iterations.
    val_tensorboard_image_handler = TensorBoardImageHandler(
        batch_transform=lambda batch: (batch["img"], batch["seg"]),
        output_transform=lambda output: predict_segmentation(output[0]),
        global_iter_transform=lambda x: trainer.state.epoch,
    )
    evaluator.add_event_handler(event_name=Events.ITERATION_COMPLETED(every=2),
                                handler=val_tensorboard_image_handler)

    train_epochs = 5
    state = trainer.run(train_loader, train_epochs)
    # clean up the synthetic data directory
    shutil.rmtree(tempdir)
예제 #7
0
def _create_amplitude_evaluator(model):
    """Build an ignite evaluator that tracks the amplitude overlap metric."""
    evaluation_metrics = {"overlap": OverlapMetric()}
    return create_supervised_evaluator(model, metrics=evaluation_metrics)
예제 #8
0
    trainer = engine.create_supervised_trainer(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        device='cuda',
        non_blocking=True,
    )

    evaluator = engine.create_supervised_evaluator(
        model,
        metrics={
            'loss': metrics.Loss(nn.BCELoss()),
            'precision':
            metrics.Precision(thresholded_transform(threshold=0.5)),
            'recall': metrics.Recall(thresholded_transform(threshold=0.5)),
            '[email protected]': IoUMetric(thresholded_transform(threshold=0.3)),
            '[email protected]': IoUMetric(thresholded_transform(threshold=0.5)),
        },
        device='cuda',
        non_blocking=True,
        output_transform=lambda x, y, y_pred:
        (torch.sigmoid(y_pred['out']), y),
    )

    logging.info(f'creating summary writer with tag {args.model_tag}')
    writer = tensorboard.SummaryWriter(log_dir=f'logs/{args.model_tag}')

    logging.info('attaching lr scheduler')
    lr_scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    attach_lr_scheduler(trainer, lr_scheduler, writer)
예제 #9
0
def task_diagnostics(tasks, train_data, val_data, vocabulary, model, args):
    """Probe `model`'s activations with a small diagnostic classifier.

    For every task, activations are extracted (1000 train / 500 validation
    examples per task), a DiagnositicClassifier is trained to predict the
    task id from the activation alone, and per-task plus overall accuracy is
    reported.

    Args:
        tasks: ordered iterable of task names (order fixes the class ids).
        train_data, val_data: per-task datasets keyed by task name.
        vocabulary: per-task vocabularies keyed by task name.
        model: the trained model whose activations are probed (not updated).
        args: provides opt_alg, lr and wdecay for the diagnostic optimizer.

    Returns:
        dict mapping each task name to its diagnostic accuracy, plus an
        "overall" entry with the aggregate validation accuracy.
    """
    # -1 means CPU; otherwise the index of the CUDA device to use.
    devicea = -1
    if torch.cuda.is_available():
        devicea = 0
    train_activations = []
    test_activations = []
    train_labels = []
    test_labels = []

    # Collect (activation, task-id) pairs for every task.
    # assumes evaluate_get_dataset returns (activations, labels) tensors with
    # 1000 train / 500 val rows per task -- TODO confirm against its definition
    for tid,task in enumerate(tasks):
        train_act, train_lab = evaluate_get_dataset(model, task, vocabulary[task],
                                                   train_data[task], 1000, tid)
        test_act, test_lab = evaluate_get_dataset(model, task, vocabulary[task],
                                                  val_data[task], 500, tid)
        train_activations.append(train_act)
        test_activations.append(test_act)
        train_labels.append(train_lab)
        test_labels.append(test_lab)

    train_activations = torch.cat(train_activations, dim=0)
    test_activations = torch.cat(test_activations, dim=0)
    train_labels = torch.cat(train_labels, dim=0)
    test_labels = torch.cat(test_labels, dim=0)
    print("Activations ", train_activations.shape, test_activations.shape, train_labels.shape, test_labels.shape)
    # Datasets
    train_ds = torch.utils.data.TensorDataset(train_activations, train_labels)
    test_ds = torch.utils.data.TensorDataset(test_activations, test_labels)
    train_dl = torch.utils.data.DataLoader(train_ds, batch_size=32)
    # batch_size=2100 is meant to hold the *entire* validation set in a single
    # batch so val_evaluator.state.output below contains all logits;
    # presumably len(tasks) * 500 <= 2100 -- verify when adding tasks.
    test_dl = torch.utils.data.DataLoader(test_ds, batch_size=2100)

    # Models and Optimizer
    diag_model = DiagnositicClassifier(train_activations.size(1), 128, len(tasks))
    if devicea != -1:
        diag_model.cuda(devicea)
    optimizer = utils.get_optimizer(args.opt_alg, diag_model.parameters(), args.lr, args.wdecay)
    criterion = nn.CrossEntropyLoss()

    # ignite training loops
    # NOTE(review): the CUDA branch trains with diag_model.loss_function and
    # drops the Loss metric, while the CPU branch uses `criterion`; confirm
    # this asymmetry is intentional.
    if devicea == -1:
        trainer = create_supervised_trainer(diag_model, optimizer, criterion)
        evaluator = create_supervised_evaluator(diag_model, {"accuracy": Accuracy(), "loss": Loss(criterion)})
        val_evaluator = create_supervised_evaluator(diag_model, {"accuracy": Accuracy(), "loss": Loss(criterion)})
    else:
        trainer = create_supervised_trainer(diag_model, optimizer, diag_model.loss_function, device=devicea)
        evaluator = create_supervised_evaluator(diag_model, metrics={'accuracy': Accuracy()}, device=devicea)
        val_evaluator = create_supervised_evaluator(diag_model, metrics={'accuracy': Accuracy()}, device=devicea)
    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        # Refresh train and validation metrics after every epoch; running
        # val_evaluator also feeds the EarlyStopping handler below.
        evaluator.run(train_dl)
        val_evaluator.run(test_dl)

    def score_function(engine):
        # Higher validation accuracy is better for EarlyStopping.
        return engine.state.metrics['accuracy']

    early_stop_metric = EarlyStopping(patience=20, score_function=score_function, trainer=trainer)
    val_evaluator.add_event_handler(Events.COMPLETED, early_stop_metric)
    trainer.run(train_dl, max_epochs=1000)
    # state.output of the last validation run holds (logits, labels) for the
    # whole validation set thanks to the single large batch above.
    logits, test_labels = val_evaluator.state.output

    _, predicted = torch.max(logits, 1)
    correct_ones = (predicted == test_labels).sum()
    metrics = {}
    # Per-task accuracy: validation rows are laid out 500 per task, in order.
    for i,task in enumerate(tasks):
        start = i*500
        end = (i+1)*500
        correct_this = (predicted[start:end] == test_labels[start:end]).sum()
        metrics[task] = correct_this.item()/500
        #print("Task based accuracy", start, end , task, correct_this)

    metrics["overall"] = val_evaluator.state.metrics["accuracy"]
    print("Diagnostics metric", metrics)
    return metrics
예제 #10
0
def training(local_rank, config):
    """Per-process CIFAR10 training entry point.

    Seeds RNGs per rank, builds dataflow/model/optimizer/criterion, creates a
    trainer plus train/test evaluators, wires TensorBoard logging and
    best-model checkpointing (rank 0 only), and runs the training loop for
    ``config["num_epochs"]`` epochs.

    Args:
        local_rank: process-local rank, used for device-name lookup and
            distributed logger setup.
        config: dict of run options (seed, output_path, stop_iteration,
            validate_every, num_epochs, model/optimizer hyper-parameters, ...).
    """

    rank = idist.get_rank()
    # Offset the seed by rank so each process gets a distinct RNG stream.
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        # Only rank 0 creates the output folder and external experiment logging.
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_trains"]:
            from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }

    # We define two evaluators as they wont have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        # Evaluate on both splits, logging against the trainer's epoch.
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    # Validate every N epochs and once more when training completes.
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="test",
    )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(
                f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        # Log the full traceback through the configured logger (the original
        # did a local `import traceback` and printed to stdout with the bound
        # exception unused), then fall through so TensorBoard gets closed.
        logger.exception("Training loop raised an exception")

    if rank == 0:
        tb_logger.close()
예제 #11
0
def run(args):
    """
    Run the program: build data loaders, model, optimizer and ignite engines,
    then train, validate (and optionally test) an IQA network.

    The best model by validation SROCC is saved to `args.trained_model_file`,
    final test metrics go to `args.save_result_file`, and TensorBoard scalars
    are written under `args.log_dir`.

    Raises:
        ValueError: if `args.model` is not one of the known model names.
    """
    train_loader, val_loader, test_loader, scale = get_data_loaders(args)

    device = torch.device("cuda" if not args.disable_gpu
                          and torch.cuda.is_available() else "cpu")

    lr_ratio = 1  # feature lr / regression lr
    if args.model == 'WaDIQaM-FR' or args.model == 'DIQaM-FR':
        model = FRnet(weighted_average=args.weighted_average)
        if args.resume is not None:
            model.load_state_dict(torch.load(args.resume))
    elif args.model == 'WaDIQaM-NR' or args.model == 'DIQaM-NR':
        model = NRnet(weighted_average=args.weighted_average)
        if args.resume is not None:
            model_dict = model.state_dict()
            if 'FR' in args.resume:
                lr_ratio = 0.1  # set feature lr / regression lr to 1/10
                # Initialize the feature extractor by pretrained FRNet
                pretrained_model = FRnet(
                    weighted_average=args.weighted_average)
                pretrained_model.load_state_dict(torch.load(args.resume))
                pretrained_dict = pretrained_model.state_dict()
                # 1. filter out unnecessary keys
                pretrained_dict = {
                    k: v
                    for k, v in pretrained_dict.items() if k in model_dict
                }
                # 2. overwrite entries in the existing state dict
                model_dict.update(pretrained_dict)
            # 3. load the new state dict
            model.load_state_dict(model_dict)
    else:
        # Fail fast: the original printed a warning and fell through, which
        # would raise an opaque UnboundLocalError on `model` a few lines below.
        raise ValueError('Wrong model name: {}'.format(args.model))

    writer = SummaryWriter(log_dir=args.log_dir)
    model = model.to(device)
    print(model)

    if args.multi_gpu and torch.cuda.device_count() > 1:
        print("Using multiple GPU")
        model = nn.DataParallel(model)
        # batch_size becomes batch_size * torch.cuda.device_count()
        param_module = model.module
    else:
        param_module = model

    # Two parameter groups: the 'fc' regression head trains at the base lr,
    # the feature-extraction layers at lr * lr_ratio.  (This replaces two
    # duplicated if/else branches that differed only in model vs model.module.)
    regression_params = []
    for pname, p in param_module.named_parameters():
        if pname.find('fc') >= 0:
            regression_params.append(p)
    regression_params_id = list(map(id, regression_params))
    features_params = list(
        filter(lambda p: id(p) not in regression_params_id,
               param_module.parameters()))
    optimizer = Adam([{
        'params': regression_params
    }, {
        'params': features_params,
        'lr': args.lr * lr_ratio
    }],
                     lr=args.lr,
                     weight_decay=args.weight_decay)
    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=args.decay_interval,
                                    gamma=args.decay_ratio)
    global best_criterion
    best_criterion = -1  # SROCC >= -1
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        IQALoss(),
                                        device=device)
    evaluator = create_supervised_evaluator(
        model, metrics={'IQA_performance': IQAPerformance()}, device=device)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # `scale` maps the normalized loss back to the original score range.
        writer.add_scalar("training/loss", scale * engine.state.output,
                          engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        SROCC, KROCC, PLCC, RMSE, MAE, OR = metrics['IQA_performance']
        print(
            "Validation Results - Epoch: {} SROCC: {:.4f} KROCC: {:.4f} PLCC: {:.4f} RMSE: {:.4f} MAE: {:.4f} OR: {:.2f}%"
            .format(engine.state.epoch, SROCC, KROCC, PLCC, scale * RMSE,
                    scale * MAE, 100 * OR))
        writer.add_scalar("SROCC/validation", SROCC, engine.state.epoch)
        writer.add_scalar("KROCC/validation", KROCC, engine.state.epoch)
        writer.add_scalar("PLCC/validation", PLCC, engine.state.epoch)
        writer.add_scalar("RMSE/validation", scale * RMSE, engine.state.epoch)
        writer.add_scalar("MAE/validation", scale * MAE, engine.state.epoch)
        writer.add_scalar("OR/validation", OR, engine.state.epoch)

        # NOTE: passing the epoch to step() is deprecated in newer PyTorch but
        # kept for behavioral compatibility with the original schedule.
        scheduler.step(engine.state.epoch)

        global best_criterion
        global best_epoch
        # Save the best model by SROCC, but only after 1/6 of the epochs have
        # elapsed, to skip the noisy early phase.
        if SROCC > best_criterion and engine.state.epoch / args.epochs > 1 / 6:  #
            # if engine.state.epoch/args.epochs > 1/6 and engine.state.epoch % int(args.epochs/10) == 0:
            best_criterion = SROCC
            best_epoch = engine.state.epoch
            try:
                torch.save(model.module.state_dict(), args.trained_model_file)
            except AttributeError:
                # Not wrapped in DataParallel: no `.module` attribute.
                # (Narrowed from a bare `except:` that hid real save errors.)
                torch.save(model.state_dict(), args.trained_model_file)
                # torch.save(model.state_dict(), args.trained_model_file + str(engine.state.epoch))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_testing_results(engine):
        if args.test_during_training:
            evaluator.run(test_loader)
            metrics = evaluator.state.metrics
            SROCC, KROCC, PLCC, RMSE, MAE, OR = metrics['IQA_performance']
            print(
                "Testing Results    - Epoch: {} SROCC: {:.4f} KROCC: {:.4f} PLCC: {:.4f} RMSE: {:.4f} MAE: {:.4f} OR: {:.2f}%"
                .format(engine.state.epoch, SROCC, KROCC, PLCC, scale * RMSE,
                        scale * MAE, 100 * OR))
            writer.add_scalar("SROCC/testing", SROCC, engine.state.epoch)
            writer.add_scalar("KROCC/testing", KROCC, engine.state.epoch)
            writer.add_scalar("PLCC/testing", PLCC, engine.state.epoch)
            writer.add_scalar("RMSE/testing", scale * RMSE, engine.state.epoch)
            writer.add_scalar("MAE/testing", scale * MAE, engine.state.epoch)
            writer.add_scalar("OR/testing", OR, engine.state.epoch)

    @trainer.on(Events.COMPLETED)
    def final_testing_results(engine):
        # Reload the best checkpoint and report final test metrics.
        # NOTE(review): if the model is wrapped in DataParallel, the saved
        # state dict came from `.module` and this direct load may need a
        # matching unwrap -- confirm with the multi-GPU path.
        global best_epoch
        model.load_state_dict(torch.load(args.trained_model_file))
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        SROCC, KROCC, PLCC, RMSE, MAE, OR = metrics['IQA_performance']
        print(
            "Final Test Results - Epoch: {} SROCC: {:.4f} KROCC: {:.4f} PLCC: {:.4f} RMSE: {:.4f} MAE: {:.4f} OR: {:.2f}%"
            .format(best_epoch, SROCC, KROCC, PLCC, scale * RMSE, scale * MAE,
                    100 * OR))
        np.save(args.save_result_file,
                (SROCC, KROCC, PLCC, scale * RMSE, scale * MAE, OR))

    # kick everything off
    trainer.run(train_loader, max_epochs=args.epochs)

    writer.close()
예제 #12
0
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    """Train Net() with SGD, reporting train/validation accuracy and NLL per
    epoch via tqdm, with a per-iteration loss readout on the progress bar."""
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)

    # The trainer updates the model; the evaluator only computes metrics.
    trainer = create_supervised_trainer(model, optimizer, F.nll_loss,
                                        device=device)
    evaluator = create_supervised_evaluator(
        model,
        metrics={'accuracy': Accuracy(), 'nll': Loss(F.nll_loss)},
        device=device)

    desc = "ITERATION - loss: {:.2f}"
    # Progress bar spanning one epoch of the training loader.
    progress = tqdm(initial=0, leave=False, total=len(train_loader),
                    desc=desc.format(0))

    def report_batch_loss(engine):
        # 1-based position of the current iteration within its epoch.
        batch_index = (engine.state.iteration - 1) % len(train_loader) + 1
        if batch_index % log_interval == 0:
            progress.desc = desc.format(engine.state.output)
            progress.update(log_interval)

    def report_train_metrics(engine):
        progress.refresh()
        evaluator.run(train_loader)
        epoch_metrics = evaluator.state.metrics
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, epoch_metrics['accuracy'],
                    epoch_metrics['nll']))

    def report_val_metrics(engine):
        evaluator.run(val_loader)
        epoch_metrics = evaluator.state.metrics
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, epoch_metrics['accuracy'],
                    epoch_metrics['nll']))
        # Rewind the bar so the next epoch starts counting from zero.
        progress.n = progress.last_print_n = 0

    # Registration order matters: train metrics are reported before
    # validation metrics at each epoch boundary.
    trainer.add_event_handler(Events.ITERATION_COMPLETED, report_batch_loss)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, report_train_metrics)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, report_val_metrics)

    trainer.run(train_loader, max_epochs=epochs)
    progress.close()
예제 #13
0
# Select the GPU when one is available, otherwise fall back to the CPU.
gpu = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Use a pretrained RESNET model for transfer learning.
model = torchvision.models.resnet50(pretrained=True)
model.to(gpu)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

trainer = create_supervised_trainer(model, optimizer, criterion, device=gpu)
metrics = {
    "accuracy": Accuracy(),
    "loss": Loss(criterion)
}
# NOTE(review): both evaluators share the same metric objects; that is fine
# while they run sequentially, but verify if the runs ever interleave.
train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=gpu)
val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=gpu)
training_history = {"accuracy":[], "loss":[]}
validation_history = {"accuracy":[], "loss":[]}
last_epoch = []

# Expose a running average of the per-iteration loss on the trainer as "loss".
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")

# EarlyStopping callback: halt training when `score_function` (defined
# elsewhere in this module) stops improving for 10 validation runs.
handler = EarlyStopping(patience=10, score_function=score_function, trainer=trainer)
val_evaluator.add_event_handler(Events.COMPLETED, handler)

# Custom functions below hook into two events: one fired during training and
# one fired during evaluation.
@trainer.on(Events.EPOCH_COMPLETED)
예제 #14
0
def run(train_batch_size, val_batch_size, epochs, learning_rate, weight_decay,
        log_interval, log_dir):
    """Train CP_MixedNet with Adam, logging train/test/validation metrics to
    tqdm and TensorBoard, with early stopping driven by the validation
    evaluator, then save the final weights to "weights_BCI.pt".

    Args:
        train_batch_size, val_batch_size: loader batch sizes (the test loader
            reuses val_batch_size).
        epochs: maximum number of training epochs.
        learning_rate, weight_decay: Adam hyper-parameters.
        log_interval: progress-bar update period, in iterations.
        log_dir: TensorBoard summary directory.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    test_loader = get_test_loader(val_batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("Pytorch Version:", torch.__version__)
    print('device={}'.format(device))

    model = CP_MixedNet()
    writer = create_summary_writer(model, train_loader, log_dir)
    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=weight_decay)
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        F.nll_loss,
                                        device=device)
    # `evaluator` measures train/test metrics; `evaluator_val` is kept
    # separate so its state feeds early stopping on the validation set only.
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy': Accuracy(),
                                                'nll': Loss(F.nll_loss)
                                            },
                                            device=device)
    evaluator_val = create_supervised_evaluator(model,
                                                metrics={
                                                    'accuracy': Accuracy(),
                                                    'nll': Loss(F.nll_loss)
                                                },
                                                device=device)

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # 1-based position of the iteration within the current epoch.
        iter = (engine.state.iteration - 1) % len(train_loader) + 1

        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)
            writer.add_scalar("training/loss", engine.state.output,
                              engine.state.iteration)

    # The three EPOCH_COMPLETED handlers below run in registration order:
    # training metrics, then test metrics, then validation (+ early stopping).
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_test_results(engine):
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Test Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}".
            format(engine.state.epoch, avg_accuracy, avg_nll))

        # Rewind the progress bar for the next epoch.
        pbar.n = pbar.last_print_n = 0
        writer.add_scalar("test/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("test/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    # Early stopping watches `evaluator_val`; `score_function` is defined
    # elsewhere in this module.
    handler = EarlyStopping(patience=400,
                            score_function=score_function,
                            trainer=trainer)
    evaluator_val.add_event_handler(Events.COMPLETED, handler)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):

        evaluator_val.run(val_loader)
        metrics = evaluator_val.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        pbar.n = pbar.last_print_n = 0
        writer.add_scalar("val/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("val/avg_accuracy", avg_accuracy, engine.state.epoch)

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
    writer.close()

    save_model = True
    if (save_model):
        torch.save(model.state_dict(), "weights_BCI.pt")
예제 #15
0
)
device = torch.device("cuda:0")


def prepare_batch(batch, device=None, non_blocking=False):
    """Adapt a dict-style batch to the (input, target) pair ignite expects."""
    pair = (batch['img'], batch['label'])
    return _prepare_batch(pair, device, non_blocking)


metric_name = 'Accuracy'
# add evaluation metric to the evaluator engine
val_metrics = {metric_name: Accuracy()}
# ignite evaluator expects batch=(img, label) and returns output=(y_pred, y) at every iteration,
# user can add output_transform to return other values
# Positional args after `val_metrics` are device and non_blocking=True.
evaluator = create_supervised_evaluator(net,
                                        val_metrics,
                                        device,
                                        True,
                                        prepare_batch=prepare_batch)

# Add stats event handler to print validation stats via evaluator
val_stats_handler = StatsHandler(
    name='evaluator',
    output_transform=lambda x:
    None  # no need to print loss value, so disable per iteration output
)
val_stats_handler.attach(evaluator)

# for the arrary data format, assume the 3rd item of batch data is the meta_data
prediction_saver = ClassificationSaver(
    output_dir='tempdir',
    name='evaluator',
예제 #16
0
def main(cfg, resume_state=None):
    """Train the configured model on the Leddartech dataset.

    Builds dataloaders, model, loss, optimizer and ignite engines from `cfg`,
    optionally resumes from `resume_state` (a saved state-dict path whose
    filename ends with the epoch number), and writes per-epoch states and
    validation metrics under a timestamped results directory.

    Args:
        cfg: configuration dict (TRAINING, NEURAL_NET, optional AUGMENTATION
            and SCHEDULER sections).
        resume_state: optional path to a saved model state dict to resume from.

    Returns:
        Path (str) of the results directory created for this run.
    """

    if 'AUGMENTATION' in cfg:
        if 'INSERT' in cfg['AUGMENTATION']:
            if not os.path.exists(f"{FILEPATH}/inserts_data/"):
                raise Exception(
                    'Insert data not found (data augmentation). Please run python3 prepare_inserts_data.py --cfg=CONFIG_FILE.'
                )

    # Prepare output data directory
    time_str = datetime.datetime.today().strftime("%Y_%m_%d_%H_%M_%S")
    results_directory = f"{FILEPATH}/results/{time_str}"
    os.makedirs(f"{results_directory}/states/")
    cfg['STATES_DIRECTORY'] = f"{results_directory}/states"
    with open(f"{results_directory}/config.yml", "w") as f:
        yaml.dump(cfg, f)

    # Random seed
    if 'SEED' in cfg['TRAINING']:
        torch.manual_seed(cfg['TRAINING']['SEED'])

    # Dataloaders
    dataset = LeddartechDataset(cfg)
    train_indices, valid_indices = train_valid_indices(len(dataset),
                                                       cfg['TRAINING'])
    train_subset = Subset(dataset, train_indices)
    valid_subset = Subset(dataset, valid_indices)
    train_loader = DataLoader(train_subset,
                              batch_size=cfg['TRAINING']['BATCH_SIZE'],
                              num_workers=cfg['TRAINING']['NUM_WORKERS'],
                              shuffle=True,
                              drop_last=True)
    valid_loader = DataLoader(valid_subset,
                              batch_size=cfg['TRAINING']['BATCH_SIZE'],
                              num_workers=cfg['TRAINING']['NUM_WORKERS'],
                              drop_last=True)
    print(
        f"Dataset size: {len(dataset)} | training set: {len(train_subset)} | validation set: {len(valid_subset)}"
    )

    # Model
    in_channels = dataset.check_number_channels()
    model = getattr(models, cfg['NEURAL_NET']['NAME'])(cfg, in_channels)
    print(f"Model size: {model.size_of_net}")
    if cfg['TRAINING']['DEVICE'] == 'cuda' and torch.cuda.device_count(
    ) > 1:  #Multi GPUs
        model = torch.nn.DataParallel(model)
    model.to(cfg['TRAINING']['DEVICE'])
    print(f"Device set to: {cfg['TRAINING']['DEVICE']}")

    # Loss: instantiate the (single) loss class named in the config with its
    # keyword arguments.
    loss_function = list(cfg['TRAINING']['LOSS'].keys())[0]
    loss = getattr(losses,
                   loss_function)(cfg,
                                  **cfg['TRAINING']['LOSS'][loss_function])

    # Optimizer: same pattern -- class name and kwargs come from the config.
    optimizer_function = list(cfg['TRAINING']['OPTIMIZER'].keys())[0]
    optimizer = getattr(torch.optim, optimizer_function)(
        model.parameters(), **cfg['TRAINING']['OPTIMIZER'][optimizer_function])

    # Trainer engine
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        loss,
                                        device=cfg['TRAINING']['DEVICE'])
    pbar = tqdm_logger.ProgressBar(persist=True)
    pbar.attach(trainer, output_transform=lambda x: {'loss': x})

    # Evaluator engine
    eval_metrics = {
        'loss': ignite_loss(loss, device=cfg['TRAINING']['DEVICE'])
    }
    if 'METRICS' in cfg['TRAINING']:
        for metric in cfg['TRAINING']['METRICS']:
            eval_metrics[metric] = getattr(metrics, metric)(
                cfg, **cfg['TRAINING']['METRICS'][metric])
    evaluator = create_supervised_evaluator(model,
                                            metrics=eval_metrics,
                                            device=cfg['TRAINING']['DEVICE'])
    pbar2 = tqdm_logger.ProgressBar(persist=True, desc='Validation')
    pbar2.attach(evaluator)

    # Check for gradient explosion: abort as soon as the loss goes NaN/inf.
    def check_grad(_):
        if not np.isfinite(trainer.state.output):
            print(loss.log)
            raise ValueError("Loss is not finite.")

    trainer.add_event_handler(Events.ITERATION_COMPLETED, check_grad)

    # Learning rate decay
    optimizer.lr_decay_factor = 1

    def lr_decay(_):
        # Exponential decay from 1 toward `factor` with time constant N epochs.
        for param_group in optimizer.param_groups:
            ep = trainer.state.epoch
            N = cfg['TRAINING']['SCHEDULER']['DECAY']['n_epochs']
            f = cfg['TRAINING']['SCHEDULER']['DECAY']['factor']
            optimizer.lr_decay_factor = np.exp(
                -ep / N) + f * (1 - np.exp(-ep / N))
            param_group['lr'] = optimizer.lr_decay_factor * cfg['TRAINING'][
                'OPTIMIZER'][optimizer_function]['lr']
            print(f"learning rate set to: {param_group['lr']}")

    if 'SCHEDULER' in cfg['TRAINING']:
        if 'DECAY' in cfg['TRAINING']['SCHEDULER']:
            trainer.add_event_handler(Events.EPOCH_STARTED, lr_decay)

    def handle_epoch_completed(_):
        # Save the epoch's weights, then validate with augmentation disabled
        # and append the metrics to the run's YAML log.
        torch.save(
            model.state_dict(),
            f"{cfg['STATES_DIRECTORY']}/{cfg['NEURAL_NET']['STATE_ID']}_{trainer.state.epoch:03d}.pt"
        )
        dataset.data_augmentation = False
        evaluator.run(valid_loader)
        dataset.data_augmentation = True
        print('Validation results: ', evaluator.state.metrics)
        with open(f"{results_directory}/{cfg['NEURAL_NET']['STATE_ID']}.yml",
                  "a") as f:
            yaml.dump(
                {f'Epoch {trainer.state.epoch:03d}': evaluator.state.metrics},
                f)

    trainer.add_event_handler(Events.EPOCH_COMPLETED, handle_epoch_completed)

    # Resume training
    def resume_training(trainer):
        # The epoch number is parsed from the state filename suffix
        # (e.g. "..._012.pt" -> 12); iteration count is rebuilt from it.
        if resume_state is not None:
            resume_epoch = int(resume_state.split('_')[-1].split('.')[0])
            model.load_state_dict(torch.load(resume_state))
            trainer.state.iteration = resume_epoch * len(
                trainer.state.dataloader)
            trainer.state.epoch = resume_epoch
        else:
            # Fresh run: truncate/create the per-run metrics log.
            with open(
                    f"{results_directory}/{cfg['NEURAL_NET']['STATE_ID']}.yml",
                    "w") as f:
                pass

    trainer.add_event_handler(Events.STARTED, resume_training)

    # Start training
    dataset.data_augmentation = True
    trainer.run(train_loader, max_epochs=cfg['TRAINING']['EPOCHS'])

    return results_directory
예제 #17
0
파일: main.py 프로젝트: lynphoenix/ignite
def training(local_rank, config):
    """Per-process training entry point (ignite.distributed style).

    Sets up dataflow, model/optimizer/criterion, a trainer and two
    evaluators, TensorBoard logging and best-model saving (rank 0 work is
    guarded), then runs training for `config["num_epochs"]` epochs.

    Args:
        local_rank: process-local rank, used for logger and CUDA device name.
        config: dict of run options (seed, output_path, model, num_epochs,
            validate_every, stop_iteration, ...).
    """
    rank = idist.get_rank()
    # Offset the seed by rank so each process gets a distinct random stream.
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="ImageNet-Training",
                          distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        # Output folder is named by timestamp, or by the stop iteration when
        # this run is a resume-check (see stop_iteration handler below).
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = "stop-on-{}".format(config["stop_iteration"])

        folder_name = "{}_backend-{}-{}_{}".format(config["model"],
                                                   idist.backend(),
                                                   idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_imagenet_dataloader(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_supervised_trainer(model, optimizer, criterion,
                                        lr_scheduler, train_loader.sampler,
                                        config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }

    # We define two evaluators as they wont have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        # Evaluate on both the train and test splits and log their metrics.
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    # Validate every `validate_every` epochs and once more at the very end.
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="test",
    )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info("Stop training on {} iteration".format(
                trainer.state.iteration))
            trainer.terminate()

    @trainer.on(Events.ITERATION_COMPLETED(every=20))
    def print_acc(engine):
        # NOTE(review): `saved_batch_loss` is not a standard ignite state
        # attribute -- presumably set by the project's custom
        # create_supervised_trainer above; confirm against its definition.
        if rank == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}"\
                    .format(engine.state.epoch, engine.state.iteration, len(train_loader),
                            engine.state.saved_batch_loss
                            ))

    # NOTE(review): this catches *any* exception, prints the traceback and
    # continues, so a failed run still closes the TB logger but exits with
    # status 0 -- deliberate best-effort here; `e` is unused.
    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
예제 #18
0
def run(*options, cfg=None):
    """Run training and validation of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options from default.py will be overridden by options loaded from cfg file
        Options passed in via options argument will override option loaded from cfg file

    Args:
        *options (str, int, optional): Options used to override what is loaded from the
                                       config. To see what options are available consult
                                       default.py
        cfg (str, optional): Location of config file to load. Defaults to None.
    """

    update_config(config, options=options, config_file=cfg)

    # Start logging
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    # Fix seeds for reproducibility.
    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)

    # Load the data.
    TrainVoxelLoader = get_voxel_loader(config)

    train_set = TrainVoxelLoader(
        config.DATASET.ROOT,
        config.DATASET.FILENAME,
        split="train",
        window_size=config.WINDOW_SIZE,
        len=config.TRAIN.BATCH_SIZE_PER_GPU * config.TRAIN.BATCH_PER_EPOCH,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
    )
    val_set = TrainVoxelLoader(
        config.DATASET.ROOT,
        config.DATASET.FILENAME,
        split="val",
        window_size=config.WINDOW_SIZE,
        len=config.TRAIN.BATCH_SIZE_PER_GPU * config.TRAIN.BATCH_PER_EPOCH,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
    )

    n_classes = train_set.n_classes

    # set dataset length to batch size to be consistent with 5000 iterations
    # each of size 32 in the original Waldeland implementation
    train_loader = data.DataLoader(
        train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=False,
    )
    val_loader = data.DataLoader(
        val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=False,
    )

    # This is how we import a model for CV - here we're importing a seismic
    # segmentation model.
    model = TextureNet(n_classes=config.DATASET.NUM_CLASSES)

    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=config.TRAIN.LR,
        # momentum=config.TRAIN.MOMENTUM,
        weight_decay=config.TRAIN.WEIGHT_DECAY,
    )

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
        model = model.cuda()

    # One criterion shared by trainer and evaluator. The previous version
    # built two identical CrossEntropyLoss objects ("mean" is already the
    # default reduction), which was pure duplication.
    criterion = torch.nn.CrossEntropyLoss(reduction="mean")

    trainer = create_supervised_trainer(model, optimizer, criterion, prepare_batch=_prepare_batch, device=device)

    # NOTE(review): this progress bar is created and closed but no event
    # handler updates it -- presumably a leftover from a logging handler
    # that was removed; confirm before deleting.
    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0))

    # Model checkpointing: save at every epoch, keep the last 3 snapshots.
    output_dir = path.join(config.OUTPUT_DIR, config.TRAIN.MODEL_DIR)
    checkpoint_handler = ModelCheckpoint(
        output_dir, "model", save_interval=1, n_saved=3, create_dir=True, require_empty=False,
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {config.MODEL.NAME: model})

    def _select_pred_and_mask(model_out):
        # Receive a tuple of (x, y_pred), y and select (prediction, mask)
        # for the metric transforms below.
        y_pred = model_out[0].squeeze()
        y = model_out[1].squeeze()
        return (y_pred.squeeze(), y)

    evaluator = create_supervised_evaluator(
        model,
        metrics={
            "nll": Loss(criterion, device=device),
            "pixa": pixelwise_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device),
            "cacc": class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device),
            "mca": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device),
            "ciou": class_iou(n_classes, output_transform=_select_pred_and_mask, device=device),
            "mIoU": mean_iou(n_classes, output_transform=_select_pred_and_mask, device=device),
        },
        device=device,
        prepare_batch=_prepare_batch,
    )

    # Set the validation run to start on the epoch completion of the training run
    trainer.add_event_handler(Events.EPOCH_COMPLETED, Evaluator(evaluator, val_loader))

    # Single TensorBoard writer (the previous version created a second one
    # with the same log dir and silently shadowed this one).
    summary_writer = create_summary_writer(log_dir=path.join(output_dir, config.LOG_DIR))

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        logging_handlers.log_metrics(
            "Validation results",
            metrics_dict={
                "mIoU": "Avg IoU :",
                "nll": "Avg loss :",
                "pixa": "Pixelwise Accuracy :",
                "mca": "Mean Class Accuracy :",
            },
        ),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        tensorboard_handlers.log_metrics(
            summary_writer,
            trainer,
            "epoch",
            metrics_dict={"mIoU": "Validation/IoU", "nll": "Validation/Loss", "mca": "Validation/MCA",},
        ),
    )

    snapshot_duration = 2

    def snapshot_function():
        # Snapshot on every other training iteration.
        return (trainer.state.iteration % snapshot_duration) == 0

    # Renamed from `checkpoint_handler` so the already-registered
    # ModelCheckpoint above is no longer shadowed by this SnapshotHandler.
    snapshot_handler = SnapshotHandler(
        path.join(output_dir, config.TRAIN.MODEL_DIR),
        config.MODEL.NAME,
        extract_metric_from("mIoU"),
        snapshot_function,
    )
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, snapshot_handler, {"model": model})

    logger.info("Starting training")
    trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH // config.TRAIN.BATCH_PER_EPOCH)
    pbar.close()
예제 #19
0
def main():
    """CLI entry point: train/evaluate a page-classification model.

    Parses command-line options, builds train/valid dataframes (optionally
    augmented with pseudolabels), constructs data loaders and the model,
    then either evaluates (--test-only), writes a submission (--submission),
    or trains with per-epoch checkpointing, validation and best-F1 saving.
    """
    parser = argparse.ArgumentParser()
    arg = parser.add_argument

    arg('clf_gt', help='segmentation predictions')
    # Dataset params
    arg('--test-height', type=int, default=2528)
    arg('--crop-height', type=int, default=768)
    arg('--crop-width', type=int, default=512)
    arg('--scale-aug', type=float, default=0.3)
    arg('--color-hue-aug', type=int, default=7)
    arg('--color-sat-aug', type=int, default=30)
    arg('--color-val-aug', type=int, default=30)
    arg('--n-tta', type=int, default=1)
    arg('--pseudolabels',
        nargs='+',
        help='path to pseudolabels to be added to train')
    arg('--pseudolabels-oversample', type=int, default=1)
    arg('--test-book', help='use only this book for testing and pseudolabels')
    arg('--fold', type=int, default=0)
    arg('--n-folds', type=int, default=5)
    arg('--train-limit', type=int)
    arg('--test-limit', type=int)
    # Model params
    arg('--base', default='resnet50')
    arg('--use-sequences', type=int, default=0)
    arg('--head-dropout', type=float, default=0.5)
    arg('--frozen-start', type=int)
    arg('--head', type=str, default='Head')
    # Training params
    arg('--device', default='cuda', help='device')
    arg('--opt-level', help='pass 01 to use fp16 training with apex')
    arg('--benchmark', type=int)
    arg('--batch-size', default=10, type=int)
    arg('--max-targets', type=int)
    arg('--workers',
        default=8,
        type=int,
        help='number of data loading workers')
    arg('--lr', default=14e-3, type=float, help='initial learning rate')
    arg('--wd', default=1e-4, type=float, help='weight decay')
    arg('--optimizer', default='sgd')
    arg('--accumulation-steps', type=int, default=1)
    arg('--epochs', default=50, type=int, help='number of total epochs to run')
    arg('--repeat-train', type=int, default=6)
    arg('--drop-lr-epoch',
        default=0,
        type=int,
        help='epoch at which to drop lr')
    arg('--cosine', type=int, default=1, help='cosine lr schedule')
    # Misc. params
    arg('--output-dir', help='path where to save')
    arg('--resume', help='resume from checkpoint')
    arg('--test-only', help='Only test the model', action='store_true')
    arg('--submission', help='Create submission', action='store_true')
    arg('--detailed-postfix', default='', help='postfix of detailed file name')
    arg('--print-model', default=1, type=int)
    arg('--dump-features', default=0, type=int)  # for knn, unused
    args = parser.parse_args()
    if args.test_only and args.submission:
        parser.error('pass one of --test-only and --submission')
    print(args)

    # Persist run parameters next to the outputs (unless resuming).
    output_dir = Path(args.output_dir) if args.output_dir else None
    if output_dir:
        output_dir.mkdir(parents=True, exist_ok=True)
        if not args.resume:
            (output_dir / 'params.json').write_text(
                json.dumps(vars(args), indent=4))

    print('Loading data')
    df_train_gt, df_valid_gt = load_train_valid_df(args.fold, args.n_folds)
    df_clf_gt = load_train_df(args.clf_gt)[['labels', 'image_id']]
    if args.submission:
        # Submission mode: predict on everything; pages with empty labels
        # are emitted as empty predictions without running the model.
        df_valid = df_train = df_clf_gt
        empty_index = df_valid['labels'] == ''
        empty_pages = df_valid[empty_index]['image_id'].values
        df_valid = df_valid[~empty_index]
    else:
        # Restrict clf predictions to the fold's train/valid image ids.
        df_train, df_valid = [
            df_clf_gt[df_clf_gt['image_id'].isin(set(df['image_id']))]
            for df in [df_train_gt, df_valid_gt]
        ]
        df_valid = df_valid[df_valid['labels'] != '']
    if args.pseudolabels:
        # Add pseudolabels to train, optionally oversampled and filtered
        # to a single test book.
        df_ps = pd.concat(
            [pd.read_csv(p)[df_train.columns] for p in args.pseudolabels])
        if args.test_book:
            df_ps = df_ps[df_ps['image_id'].apply(
                lambda x: get_book_id(x) == args.test_book)]
        df_train = (
            pd.concat([df_train] +
                      [df_ps] * args.pseudolabels_oversample).reset_index(
                          drop=True))
    if args.test_book:
        df_valid = df_valid[df_valid['image_id'].apply(
            lambda x: get_book_id(x) == args.test_book)]
    if args.train_limit:
        df_train = df_train.sample(n=args.train_limit, random_state=42)
    if args.test_limit:
        df_valid = df_valid.sample(n=args.test_limit, random_state=42)
    gt_by_image_id = {item.image_id: item for item in df_valid_gt.itertuples()}
    print(f'{len(df_train):,} in train, {len(df_valid):,} in valid')
    classes = get_encoded_classes()

    def _get_transforms(*, train: bool):
        # For test-time augmentation, spread test heights linearly over
        # [test_height, test_height * (1 + scale_aug)].
        if not train and args.n_tta > 1:
            test_heights = [
                args.test_height * (1 + s)
                for s in np.linspace(0, args.scale_aug, args.n_tta)
            ]
            print('TTA test heights:', list(map(int, test_heights)))
        else:
            test_heights = [args.test_height]
        return [
            get_transform(
                train=train,
                test_height=test_height,
                crop_width=args.crop_width,
                crop_height=args.crop_height,
                scale_aug=args.scale_aug,
                color_hue_aug=args.color_hue_aug,
                color_sat_aug=args.color_sat_aug,
                color_val_aug=args.color_val_aug,
            ) for test_height in test_heights
        ]

    def make_test_data_loader(df):
        # Evaluation loader: batch size 1, no augmentation/resampling.
        return DataLoader(
            Dataset(
                df=df,
                transforms=_get_transforms(train=False),
                resample_empty=False,
                classes=classes,
            ),
            batch_size=1,
            collate_fn=collate_fn,
            num_workers=args.workers,
        )

    data_loader_test = make_test_data_loader(df_valid)
    if args.dump_features:  # unused
        df_train = df_train[df_train['labels'] != '']
        data_loader_train = make_test_data_loader(df_train)
    else:
        # Training loader: train is repeated --repeat-train times so one
        # "epoch" covers the data several times.
        data_loader_train = DataLoader(
            Dataset(
                df=pd.concat([df_train] * args.repeat_train),
                transforms=_get_transforms(train=True),
                resample_empty=True,
                classes=classes,
            ),
            num_workers=args.workers,
            shuffle=True,
            collate_fn=partial(collate_fn, max_targets=args.max_targets),
            batch_size=args.batch_size,
        )

    print('Creating model')
    fp16 = bool(args.opt_level)
    model: nn.Module = build_model(
        base=args.base,
        head=args.head,
        frozen_start=args.frozen_start,
        fp16=fp16,
        n_classes=len(classes),
        head_dropout=args.head_dropout,
        use_sequences=bool(args.use_sequences),
    )
    if args.print_model:
        print(model)
    device = torch.device(args.device)
    model.to(device)
    if args.benchmark:
        torch.backends.cudnn.benchmark = True

    parameters = model.parameters()
    if args.optimizer == 'adam':
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=args.wd)
    elif args.optimizer == 'sgd':
        optimizer = optim.SGD(parameters,
                              lr=args.lr,
                              weight_decay=args.wd,
                              momentum=0.9)
    else:
        parser.error(f'Unexpected optimzier {args.optimizer}')

    if fp16:
        # Mixed-precision training via NVIDIA apex (opt_level e.g. "O1").
        from apex import amp
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.opt_level)
    loss = nn.CrossEntropyLoss()
    step = epoch = 0
    best_f1 = 0

    if args.resume:
        # Full checkpoints carry optimizer/step/epoch/best_f1; plain
        # state-dicts restore weights only.
        state = torch.load(args.resume, map_location='cpu')
        if 'optimizer' in state:
            optimizer.load_state_dict(state['optimizer'])
            model.load_state_dict(state['model'])
            step = state['step']
            epoch = state['epoch']
            best_f1 = state['best_f1']
        else:
            model.load_state_dict(state)
        del state

    @contextmanager
    def no_benchmark():
        # cudnn benchmark hurts with variable-size eval inputs; disable it
        # for evaluation and restore afterwards if it was requested.
        torch.backends.cudnn.benchmark = False
        yield
        if args.benchmark:
            torch.backends.cudnn.benchmark = True

    if args.dump_features and not args.submission:  # unused
        if not output_dir:
            parser.error('set --output-dir with --dump-features')
        # We also dump test features below
        feature_evaluator = create_supervised_evaluator(
            model,
            device=device,
            prepare_batch=_prepare_batch,
            metrics={'features': GetFeatures(n_tta=args.n_tta)},
        )
        with no_benchmark():
            run_with_pbar(feature_evaluator,
                          data_loader_train,
                          desc='train features')
        torch.save(feature_evaluator.state.metrics['features'],
                   output_dir / 'train_features.pth')

    def get_y_pred_y(output):
        # Adapt model output/targets for the Accuracy/Loss metrics.
        y_pred, y = output
        return get_output(y_pred), get_labels(y)

    metrics = {
        'accuracy': Accuracy(output_transform=get_y_pred_y),
        'loss': Loss(loss, output_transform=get_y_pred_y),
        'predictions': GetPredictions(n_tta=args.n_tta, classes=classes),
        'detailed': GetDetailedPrediction(n_tta=args.n_tta, classes=classes),
    }
    if args.dump_features:
        metrics['features'] = GetFeatures(n_tta=args.n_tta)
    evaluator = create_supervised_evaluator(model,
                                            device=device,
                                            prepare_batch=_prepare_batch,
                                            metrics=metrics)

    def evaluate():
        """Run the evaluator, score predicted boxes vs. ground truth.

        Returns a metrics dict (valid_loss, accuracy, plus whatever
        get_metrics derives from the per-page box scores, e.g. f1).
        """
        with no_benchmark():
            run_with_pbar(evaluator, data_loader_test, desc='evaluate')
        metrics = {
            'valid_loss': evaluator.state.metrics['loss'],
            'accuracy': evaluator.state.metrics['accuracy'],
        }
        scores = []
        for prediction, meta in evaluator.state.metrics['predictions']:
            item = gt_by_image_id[meta['image_id']]
            target_boxes, target_labels = get_target_boxes_labels(item)
            target_boxes = torch.from_numpy(target_boxes)
            pred_centers = np.array([p['center'] for p in prediction])
            pred_labels = [p['cls'] for p in prediction]
            scores.append(
                dict(score_boxes(
                    truth_boxes=from_coco(target_boxes).numpy(),
                    truth_label=target_labels,
                    preds_center=pred_centers,
                    preds_label=np.array(pred_labels),
                ),
                     image_id=item.image_id))
        metrics.update(get_metrics(scores))
        if output_dir:
            pd.DataFrame(evaluator.state.metrics['detailed']).to_csv(
                output_dir / f'detailed{args.detailed_postfix}.csv.gz',
                index=None)
        if args.dump_features:
            f_name = 'test' if args.submission else 'valid'
            torch.save(evaluator.state.metrics['features'],
                       output_dir / f'{f_name}_features.pth')
        return metrics

    def make_submission():
        """Predict on all pages and write the submission CSV files."""
        with no_benchmark():
            run_with_pbar(evaluator, data_loader_test, desc='evaluate')
        submission = []
        for prediction, meta in tqdm.tqdm(
                evaluator.state.metrics['predictions']):
            submission.append(submission_item(meta['image_id'], prediction))
        # Pages that had empty labels get empty predictions.
        submission.extend(
            submission_item(image_id, []) for image_id in empty_pages)
        pd.DataFrame(submission).to_csv(output_dir /
                                        f'submission_{output_dir.name}.csv.gz',
                                        index=None)
        pd.DataFrame(evaluator.state.metrics['detailed']).to_csv(
            output_dir / f'test_detailed{args.detailed_postfix}.csv.gz',
            index=None)
        if args.dump_features:
            torch.save(evaluator.state.metrics['features'],
                       output_dir / 'test_features.pth')

    if args.test_only or args.submission:
        if not args.resume:
            parser.error('please pass --resume when running with --test-only '
                         'or --submission')
        if args.test_only:
            print_metrics(evaluate())
        elif args.submission:
            if not output_dir:
                parser.error('--output-dir required with --submission')
            make_submission()
        return

    trainer = create_supervised_trainer(
        model,
        optimizer,
        loss_fn=lambda y_pred, y: loss(get_output(y_pred), get_labels(y)),
        device=device,
        prepare_batch=_prepare_batch,
        accumulation_steps=args.accumulation_steps,
        fp16=fp16,
    )

    epochs_left = args.epochs - epoch
    epochs_pbar = tqdm.trange(epochs_left)
    epoch_pbar = tqdm.trange(len(data_loader_train))
    # Smooth the displayed/logged loss over the last 20 iterations.
    train_losses = deque(maxlen=20)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(_):
        nonlocal step
        train_losses.append(trainer.state.output)
        smoothed_loss = np.mean(train_losses)
        epoch_pbar.set_postfix(loss=f'{smoothed_loss:.4f}')
        epoch_pbar.update(1)
        step += 1
        if step % 20 == 0 and output_dir:
            json_log_plots.write_event(output_dir,
                                       step=step * args.batch_size,
                                       loss=smoothed_loss)

    @trainer.on(Events.EPOCH_COMPLETED)
    def checkpoint(_):
        # Full checkpoint (resumable: weights + optimizer + counters).
        if output_dir:
            torch.save(
                {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'step': step,
                    'epoch': epoch,
                    'best_f1': best_f1,
                }, output_dir / 'checkpoint.pth')

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(_):
        nonlocal best_f1
        metrics = evaluate()
        if output_dir:
            json_log_plots.write_event(output_dir,
                                       step=step * args.batch_size,
                                       **metrics)
        # Keep the weights with the best validation f1 separately.
        if metrics['f1'] > best_f1:
            best_f1 = metrics['f1']
            if output_dir:
                torch.save(model.state_dict(), output_dir / 'model_best.pth')
        epochs_pbar.set_postfix(
            {k: format_value(v)
             for k, v in metrics.items()})

    @trainer.on(Events.EPOCH_COMPLETED)
    def update_pbars_on_epoch_completion(_):
        nonlocal epoch
        epochs_pbar.update(1)
        epoch_pbar.reset()
        epoch += 1

    # LR schedule: either a step drop or cosine annealing, never both.
    scheduler = None
    if args.drop_lr_epoch and args.cosine:
        parser.error('Choose only one schedule')
    if args.drop_lr_epoch:
        scheduler = StepLR(optimizer, step_size=args.drop_lr_epoch, gamma=0.1)
    if args.cosine:
        scheduler = CosineAnnealingLR(optimizer, epochs_left)
    if scheduler is not None:
        trainer.on(Events.EPOCH_COMPLETED)(lambda _: scheduler.step())

    trainer.run(data_loader_train, max_epochs=epochs_left)
예제 #20
0
def train(model,
          model_name,
          train_dataloader,
          eval_dataloader,
          labels_name,
          trainer_name='ocr',
          backbone_url=None):
    """Train an OCR model with AMP, checkpoint/resume and TensorBoard logs.

    Weight initialization priority: (1) an existing
    `{trainer_name}_{model_name}_checkpoint.pt` (full resume: model,
    optimizer, scaler, lr scheduler); (2) a `{model_name}_backbone.pt`
    transfer checkpoint (neck/fc layers excluded); (3) `backbone_url`
    weights for the backbone only.

    Args:
        model: model whose forward(x, y) returns the training loss.
        model_name: used for checkpoint/log file names.
        train_dataloader: training batches of (images, labels).
        eval_dataloader: evaluation batches for the edit-distance metric.
        labels_name: NOTE(review) -- currently unused in this function;
            kept for interface compatibility.
        trainer_name: namespace for logs/checkpoints (default 'ocr').
        backbone_url: optional URL for pretrained backbone weights.
    """
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    # Gradient scaler for mixed-precision training.
    scaler = torch.cuda.amp.GradScaler()

    def _prepare_batch(batch, device=None, non_blocking=False):
        """Prepare batch for training: pass to a device with options.
        """
        images, labels = batch
        images = images.to(device)
        labels = [label.to(device) for label in labels]
        return (images, labels)

    writer = SummaryWriter(log_dir=f'logs/{trainer_name}/{model_name}')
    # Plateau-based LR decay driven by the training loss (see step_lr below).
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                              factor=0.5,
                                                              patience=250,
                                                              cooldown=100,
                                                              min_lr=1e-6)

    def _update(engine, batch):
        """One AMP training step; the model itself returns the loss."""
        model.train()
        optimizer.zero_grad()
        x, y = _prepare_batch(batch, device=device)
        # Non-AMP equivalent kept for reference:
        # loss = model(x, y)
        # loss.backward()
        # optimizer.step()
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            loss = model(x, y)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        return loss.item()

    trainer = Engine(_update)
    evaluator = create_supervised_evaluator(
        model,
        prepare_batch=_prepare_batch,
        metrics={'edit_distance': EditDistanceMetric()},
        device=device)

    # Restore weights: full checkpoint > transfer backbone file > URL.
    if path.exists(f'{trainer_name}_{model_name}_checkpoint.pt'):
        checkpoint = torch.load(f'{trainer_name}_{model_name}_checkpoint.pt')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scaler.load_state_dict(checkpoint['scaler'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        logging.info(
            f'load checkpoint {trainer_name}_{model_name}_checkpoint.pt')
    elif path.exists(f'{model_name}_backbone.pt'):
        pretrained_dict = torch.load(f'{model_name}_backbone.pt')['model']
        model_dict = model.state_dict()
        # Transfer everything except the task-specific neck/fc layers.
        pretrained_dict = {
            k: v
            for k, v in pretrained_dict.items()
            if k in model_dict and 'neck.' not in k and 'fc.' not in k
        }
        model_dict.update(pretrained_dict)
        model.load_state_dict(model_dict)
        logging.info(f'load transfer parameters from {model_name}_backbone.pt')
    elif backbone_url is not None:
        pretrained_dict = torch.hub.load_state_dict_from_url(backbone_url,
                                                             progress=False)
        model_dict = model.backbone.state_dict()
        pretrained_dict = {
            k: v
            for k, v in pretrained_dict.items() if k in model_dict
        }
        model_dict.update(pretrained_dict)
        model.backbone.load_state_dict(model_dict)
        logging.info(f'load backbone from {backbone_url}')

    early_stop_arr = [0.0]

    def early_stop_score_function(engine):
        """Score for EarlyStopping; strictly increasing while acc < 0.8
        so early stopping effectively never triggers before that."""
        val_acc = engine.state.metrics['edit_distance']
        if val_acc < 0.8:  # do not early stop while acc is less than 0.8
            early_stop_arr[0] += 0.000001
            return early_stop_arr[0]
        return val_acc

    early_stop_handler = EarlyStopping(
        patience=20, score_function=early_stop_score_function, trainer=trainer)
    evaluator.add_event_handler(Events.COMPLETED, early_stop_handler)

    checkpoint_handler = ModelCheckpoint(f'models/{trainer_name}/{model_name}',
                                         model_name,
                                         n_saved=10,
                                         create_dir=True)
    # Periodically save model + optimizer + lr_scheduler + scaler so the
    # resume branch above can restore all of them.
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED(every=1000), checkpoint_handler, {
            'model': model,
            'optimizer': optimizer,
            'lr_scheduler': lr_scheduler,
            'scaler': scaler
        })

    @trainer.on(Events.ITERATION_COMPLETED(every=10))
    def log_training_loss(trainer):
        """Log loss and current LR to the console and TensorBoard."""
        lr = optimizer.param_groups[0]['lr']
        logging.info("Epoch[{}]: {} - Loss: {:.4f}, Lr: {}".format(
            trainer.state.epoch, trainer.state.iteration, trainer.state.output,
            lr))
        writer.add_scalar("training/loss", trainer.state.output,
                          trainer.state.iteration)
        writer.add_scalar("training/learning_rate", lr,
                          trainer.state.iteration)

    @trainer.on(Events.ITERATION_COMPLETED(every=10))
    def step_lr(trainer):
        # Feed the training loss to ReduceLROnPlateau every 10 iterations.
        lr_scheduler.step(trainer.state.output)

    @trainer.on(Events.ITERATION_COMPLETED(every=1000))
    def log_training_results(trainer):
        """Evaluate on the eval set and log the edit-distance metric."""
        evaluator.run(eval_dataloader)
        metrics = evaluator.state.metrics
        logging.info(
            "Eval Results - Epoch[{}]: {} - Avg edit distance: {:.4f}".format(
                trainer.state.epoch, trainer.state.iteration,
                metrics['edit_distance']))
        writer.add_scalar("evaluation/avg_edit_distance",
                          metrics['edit_distance'], trainer.state.iteration)

    @trainer.on(Events.ITERATION_COMPLETED(every=100))
    def read_lr_from_file(trainer):
        # Manual LR override: if lr.txt exists, apply its value to all
        # parameter groups (checked every 100 iterations).
        if path.exists('lr.txt'):
            with open('lr.txt', 'r', encoding='utf-8') as f:
                lr = float(f.read())
            for group in optimizer.param_groups:
                group['lr'] = lr

    trainer.run(train_dataloader, max_epochs=1)
예제 #21
0
def main(dataset_path, batch_size=256, max_epochs=10):
    """Train wide_resnet50_2 with native AMP (autocast + GradScaler).

    Follows the canonical mixed-precision recipe: forward under autocast,
    backward on the scaled loss, then scaler.step/scaler.update. Prints
    train/test metrics once training completes.
    """
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    device = "cuda"

    train_loader, test_loader, eval_train_loader = get_train_eval_loaders(dataset_path, batch_size=batch_size)

    model = wide_resnet50_2(num_classes=100).to(device)
    optimizer = SGD(model.parameters(), lr=0.01)
    criterion = CrossEntropyLoss().to(device)

    scaler = GradScaler()

    def amp_train_step(engine, batch):
        """One AMP optimization step; returns the batch loss as a float."""
        inputs = convert_tensor(batch[0], device, non_blocking=True)
        targets = convert_tensor(batch[1], device, non_blocking=True)

        optimizer.zero_grad()

        # Forward pass runs under autocast so eligible ops execute in fp16.
        # The backward pass must NOT be wrapped in autocast.
        with autocast():
            loss = criterion(model(inputs), targets)

        # backward() on the scaled loss produces scaled gradients.
        scaler.scale(loss).backward()

        # step() unscales first; the optimizer step is skipped automatically
        # when the unscaled gradients contain infs/NaNs.
        scaler.step(optimizer)

        # Adjust the loss scale for the next iteration.
        scaler.update()

        return loss.item()

    trainer = Engine(amp_train_step)
    timer = Timer(average=True)
    timer.attach(trainer, step=Events.EPOCH_COMPLETED)
    ProgressBar(persist=True).attach(trainer, output_transform=lambda out: {"batch loss": out})

    eval_metrics = {"Accuracy": Accuracy(), "Loss": Loss(criterion)}

    evaluator = create_supervised_evaluator(model, metrics=eval_metrics, device=device, non_blocking=True)

    def log_metrics(engine, title):
        """Print every metric from *engine*'s last run, prefixed by *title*."""
        for name in eval_metrics:
            print(f"\t{title} {name}: {engine.state.metrics[name]:.2f}")

    @trainer.on(Events.COMPLETED)
    def run_validation(_):
        # Evaluate on both splits once training is done; the temporary
        # event handler labels each report.
        print(f"- Mean elapsed time for 1 epoch: {timer.value()}")
        print("- Metrics:")
        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Train"):
            evaluator.run(eval_train_loader)

        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Test"):
            evaluator.run(test_loader)

    trainer.run(train_loader, max_epochs=max_epochs)
예제 #22
0
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_dir):
    """Train Net() with SGD/cross-entropy and log everything to TensorBoard.

    Logs batch loss, epoch metrics on both splits, optimizer LR, and
    weight/gradient scalars and histograms under *log_dir*; keeps the two
    best checkpoints ranked by validation accuracy.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    # Attach GPU usage metrics when pynvml is available; purely informational.
    if sys.version_info > (3, ):
        from ignite.contrib.metrics.gpu_info import GpuInfo

        try:
            GpuInfo().attach(trainer)
        except RuntimeError:
            print(
                "INFO: By default, in this example it is possible to log GPU information (used memory, utilization). "
                "As there is no pynvml python package installed, GPU information won't be logged. Otherwise, please "
                "install it : `pip install pynvml`")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    # Separate evaluators so train/validation metrics are tracked independently.
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        # Recompute metrics on both splits at the end of every epoch.
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    tb_logger = TensorboardLogger(log_dir=log_dir)

    # Batch loss plus all attached metrics, every 100 training iterations.
    tb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
        metric_names="all",
    )

    # Epoch-level metrics for both evaluators, indexed by the trainer's step.
    for tag, evaluator in [("training", train_evaluator),
                           ("validation", validation_evaluator)]:
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    # Learning-rate tracking.
    tb_logger.attach_opt_params_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        optimizer=optimizer)

    # Weight/gradient scalars every 100 iterations; histograms every 100 epochs.
    tb_logger.attach(trainer,
                     log_handler=WeightsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED(every=100))

    tb_logger.attach(trainer,
                     log_handler=WeightsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED(every=100))

    tb_logger.attach(trainer,
                     log_handler=GradsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED(every=100))

    tb_logger.attach(trainer,
                     log_handler=GradsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED(every=100))

    def score_function(engine):
        # Higher validation accuracy == better checkpoint.
        return engine.state.metrics["accuracy"]

    # Keep the two best checkpoints by validation accuracy.
    model_checkpoint = ModelCheckpoint(
        log_dir,
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint,
                                           {"model": model})

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    tb_logger.close()
예제 #23
0
def fit_model_multiclass(model,
                         train_loader,
                         test_loader,
                         lr,
                         max_epochs=5,
                         number_of_classes=2):
    """Train *model* with SGD + cross-entropy, reporting via ignite.

    Validation metrics are printed every 10 epochs; the trained model is
    returned.

    Cleanup vs. the original: removed per-batch debug prints from the
    metric/batch transforms, and removed dead code -- two transforms that
    were never referenced (`threshold_output_transform`,
    `trainer_output_shape`) and a `log_training_results` handler whose
    decorator was commented out, so it never ran.

    Args:
        model: torch.nn.Module producing per-class logits.
        train_loader / test_loader: iterables of (x, y) batches.
        lr: SGD learning rate.
        max_epochs: number of training epochs.
        number_of_classes: class count used to one-hot encode targets for
            the multilabel Accuracy metric.
    """
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    def prepare_batch(batch, device, non_blocking):
        # Cast both tensors to long (inputs are presumably index/token
        # tensors -- TODO confirm against the dataset) and drop the trailing
        # singleton dim on the targets.
        x, y = batch
        x = x.to(dtype=torch.long)
        y = y.to(dtype=torch.long)
        y = y.squeeze()
        return (x, y)

    def squeeze_y_dims(output):
        # Loss transform: align dtypes/shapes of predictions and targets.
        prediction, target = output
        return prediction.long(), target.squeeze().long()

    def correct_shape(output):
        # Accuracy(is_multilabel=True) expects one-hot predictions/targets.
        y_pred, y = output
        one_hot_y = torch.nn.functional.one_hot(y,
                                                num_classes=number_of_classes)
        one_hot_y = one_hot_y.squeeze(1)

        # Convert logits into a one-hot argmax matrix.
        argmax = y_pred.argmax(1)
        m = torch.zeros(y_pred.shape).scatter(1, argmax.unsqueeze(1), 1.0)

        return m, one_hot_y

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        prepare_batch=prepare_batch)

    val_metrics = {
        "accuracy": Accuracy(output_transform=correct_shape,
                             is_multilabel=True),
        "loss": Loss(criterion, output_transform=squeeze_y_dims)
    }

    evaluator = create_supervised_evaluator(model,
                                            metrics=val_metrics,
                                            prepare_batch=prepare_batch)

    @trainer.on(Events.EPOCH_COMPLETED(every=10))
    def log_validation_results(trainer):
        # Evaluate on the held-out split every 10 epochs.
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        print(
            f"Validation Results - Epoch: {trainer.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}"
        )

    trainer.run(train_loader, max_epochs=max_epochs)

    return model
예제 #24
0
def run(train_batch_size, val_batch_size, epochs, lr, momentum):
    """Train Net() with SGD/cross-entropy, instrumented with Weights & Biases.

    Logs batch loss, epoch metrics on both splits, optimizer LR, and model
    weights/gradients to W&B; keeps the two best checkpoints (by validation
    accuracy) in the W&B run directory.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    # Separate evaluators so train/validation metrics are tracked independently.
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        # Recompute metrics on both splits at the end of every epoch.
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    # Hyperparameters are recorded in the W&B run config.
    wandb_logger = WandBLogger(
        project="pytorch-ignite-integration",
        name="ignite-mnist-example",
        config={
            "train_batch_size": train_batch_size,
            "val_batch_size": val_batch_size,
            "epochs": epochs,
            "lr": lr,
            "momentum": momentum,
        },
    )

    # Batch loss every 100 training iterations.
    wandb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )

    # Epoch-level metrics for both evaluators, indexed by trainer iteration.
    for tag, evaluator in [("training", train_evaluator),
                           ("validation", validation_evaluator)]:
        wandb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=lambda *_: trainer.state.iteration,
        )

    # Learning-rate tracking plus full weight/gradient watching.
    wandb_logger.attach_opt_params_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        optimizer=optimizer)
    wandb_logger.watch(model, log="all")

    def score_function(engine):
        # Higher validation accuracy == better checkpoint.
        return engine.state.metrics["accuracy"]

    # Keep the two best checkpoints by validation accuracy.
    model_checkpoint = ModelCheckpoint(
        wandb_logger.run.dir,
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint,
                                           {"model": model})

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    wandb_logger.close()
예제 #25
0
    with autocast():
        outputs = model(inputs.cuda())
        loss = criterion(outputs, targets.cuda())

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

    return loss.item()


# Wrap the custom AMP `update` step (defined above) in an ignite Engine.
trainer = Engine(update)
#trainer = create_supervised_trainer(model, optimizer, criterion, device='cuda')
val_metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}
evaluator = create_supervised_evaluator(model, metrics=val_metrics, device='cuda')

# Wrap the raw tensors in a Dataset so DataLoader can batch/shuffle them.
data = Data(x=X,
            y=y)

loader = DataLoader(dataset=data,
                    shuffle=True,
                    batch_size=args.batch_size)


@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(trainer):
    # NOTE(review): metrics are computed on the TRAINING loader; no separate
    # validation split is visible in this script -- confirm this is intended.
    evaluator.run(data=loader)
    metrics = evaluator.state.metrics
    print("Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
          .format(trainer.state.epoch, metrics["accuracy"], metrics["loss"]))
예제 #26
0
    # model
    model = AttnCanAdcrowdNet()
    model = model.to(device)

    # Loss function: summed squared error over the output.
    # `size_average=False` has been deprecated since PyTorch 0.4;
    # reduction='sum' is the exact modern equivalent.
    loss_fn = nn.MSELoss(reduction='sum').to(device)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.decay)

    trainer = create_supervised_trainer(model, optimizer, loss_fn, device=device)
    # Evaluator tracks crowd-counting MAE/MSE plus the raw training loss.
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'mae': CrowdCountingMeanAbsoluteError(),
                                                'mse': CrowdCountingMeanSquaredError(),
                                                'nll': Loss(loss_fn)
                                            }, device=device)
    print(model)

    print(args)


    @trainer.on(Events.ITERATION_COMPLETED(every=50))
    def log_training_loss(trainer):
        # Timestamped loss line every 50 iterations.
        timestamp = get_readable_time()
        print(timestamp + " Epoch[{}] Loss: {:.2f}".format(trainer.state.epoch, trainer.state.output))


    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
예제 #27
0
파일: train.py 프로젝트: aidamash/MetaRec
# TensorBoard writer; also passed to the model for its internal logging.
writer = SummaryWriter(log_dir=log_dir)

# Instantiate the model class object
model = MF(n_user, n_item, k=k, c_kld=c_kld, c_bias=c_bias, writer=writer)

# Use Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Create a supervised trainer (the model supplies its own loss function)
trainer = create_supervised_trainer(model, optimizer, model.loss)

# Use Mean Squared Error as evaluation metric
metrics = {'evaluation': MeanSquaredError()}

# Create a supervised evaluator
evaluator = create_supervised_evaluator(model, metrics=metrics)

# Load the train and test data
train_loader = Loader(train_x, train_y, batchsize=1024)
test_loader = Loader(test_x, test_y, batchsize=1024)


def log_training_loss(engine, log_interval=500):
    """
    Function to log the training loss
    """
    model.itr = engine.state.iteration  # Keep track of iterations
    if model.itr % log_interval == 0:
        fmt = "Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
        # Keep track of epochs and outputs
        msg = fmt.format(engine.state.epoch, engine.state.iteration,
예제 #28
0
trainer = create_supervised_trainer(model, optimizer, criterion, device=device)

# MAE/MSE are computed on the argmax class index (ordinal-style evaluation),
# alongside plain accuracy and the raw loss on the logits.
metrics = {
    "accuracy":
    Accuracy(),
    "MAE":
    MeanAbsoluteError(
        output_transform=lambda out: (torch.max(out[0], dim=1)[1], out[1])),
    "MSE":
    MeanSquaredError(
        output_transform=lambda out: (torch.max(out[0], dim=1)[1], out[1])),
    "loss":
    Loss(loss_fn=criterion)
}

evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)


@trainer.on(Events.ITERATION_COMPLETED)
def log_training_loss(trainer):
    # Per-iteration batch loss.
    print(
        f"Training (Epoch {trainer.state.epoch}): {trainer.state.output:.3f}")


# Best-so-far trackers, updated by the epoch-end validation handler below.
best_epoch = 0
best_val_metrics = {"MAE": np.inf}
best_test_metrics = {"MAE": np.inf}


@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(trainer):
예제 #29
0
def run(epochs, lr, momentum, log_interval):
    """Train Net() with tqdm progress reporting and TensorBoard logging.

    Relies on module-level globals: `tb_logger`, `params`, `trainloader`,
    `testloader`, `predictions_gt_images_handler`, and `task`. Saves the
    final weights to ./cifar_net.pth.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    net = Net().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum)

    trainer = create_supervised_trainer(net,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("trainer")

    val_metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
        "recall": Recall()
    }
    evaluator = create_supervised_evaluator(net,
                                            metrics=val_metrics,
                                            device=device)
    evaluator.logger = setup_logger("evaluator")

    # Attach handler to plot trainer's loss every 100 iterations
    tb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=params.get("loss_report")),
        tag="training",
        output_transform=lambda loss: {"loss": loss},
    )

    # Attach handler to dump evaluator's metrics every epoch completed
    # NOTE: the loop rebinds `evaluator`; its last value is the same
    # validation engine object, so the attachments below are unaffected.
    for tag, evaluator in [("training", trainer), ("validation", evaluator)]:
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names="all",
            global_step_transform=global_step_from_engine(trainer),
        )

    # Attach function to build debug images and report every epoch end
    tb_logger.attach(
        evaluator,
        log_handler=predictions_gt_images_handler,
        event_name=Events.EPOCH_COMPLETED(once=1),
    )

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(trainloader),
                desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        # Refresh the progress-bar caption with the latest batch loss.
        pbar.desc = desc.format(engine.state.output)
        pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        # Metrics on the training split at the end of each epoch.
        pbar.refresh()
        evaluator.run(trainloader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["loss"]
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        # Metrics on the held-out split; also resets the progress bar.
        evaluator.run(testloader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["loss"]
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

        pbar.n = pbar.last_print_n = 0

    @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
    def log_time():
        # NOTE(review): zero-arg handler relies on ignite's flexible handler
        # signatures (no engine parameter) -- confirm the ignite version
        # in use supports this.
        tqdm.write("{} took {} seconds".format(
            trainer.last_event_name.name,
            trainer.state.times[trainer.last_event_name.name],
        ))

    trainer.run(trainloader, max_epochs=epochs)
    pbar.close()

    PATH = "./cifar_net.pth"
    torch.save(net.state_dict(), PATH)

    print("Finished Training")
    print("Task ID number is: {}".format(task.id))
예제 #30
0
def run(train_batch_size,
        epochs,
        lr,
        weight_decay,
        model_name,
        config,
        exp_id,
        log_dir,
        trained_model_file,
        save_result_file,
        disable_gpu=False):
    """Train a CNNIQA(+/++) image-quality model, tracking the SROCC-best weights.

    Validates every epoch, optionally tests during training, and after
    training reloads the SROCC-best checkpoint for a final test pass whose
    results are saved to *save_result_file*. Uses module-level globals
    `best_criterion` / `best_epoch` to communicate across handlers.
    """
    if config['test_ratio']:
        train_loader, val_loader, test_loader = get_data_loaders(
            config, train_batch_size, exp_id)
    else:
        train_loader, val_loader = get_data_loaders(config, train_batch_size,
                                                    exp_id)

    device = torch.device(
        "cuda" if not disable_gpu and torch.cuda.is_available() else "cpu")
    # NOTE(review): both 'CNNIQAplus' and 'CNNIQA' build CNNIQAplusnet here;
    # confirm 'CNNIQA' is not meant to select a different architecture.
    if model_name == 'CNNIQAplus' or model_name == 'CNNIQA':
        model = CNNIQAplusnet(n_distortions=config['n_distortions'],
                              ker_size=config['kernel_size'],
                              n_kers=config['n_kernels'],
                              n1_nodes=config['n1_nodes'],
                              n2_nodes=config['n2_nodes'])
    else:
        model = CNNIQAplusplusnet(n_distortions=config['n_distortions'],
                                  ker_size=config['kernel_size'],
                                  n1_kers=config['n1_kernels'],
                                  pool_size=config['pool_size'],
                                  n2_kers=config['n2_kernels'],
                                  n1_nodes=config['n1_nodes'],
                                  n2_nodes=config['n2_nodes'])
    writer = SummaryWriter(log_dir=log_dir)
    model = model.to(device)
    print(model)
    # if multi_gpu and torch.cuda.device_count() > 1:
    #     model = nn.DataParallel(model)

    optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    global best_criterion
    best_criterion = -1  # SROCC>=-1
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        loss_fn,
                                        device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'IQA_performance':
                                                IQAPerformance(),
                                                'IDC_performance':
                                                IDCPerformance()
                                            },
                                            device=device)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # Per-batch training loss curve.
        writer.add_scalar("training/loss", engine.state.output,
                          engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        # Validate, log all IQA metrics, and checkpoint on SROCC improvement.
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        SROCC, KROCC, PLCC, RMSE, MAE, OR = metrics['IQA_performance']
        Acc = metrics['IDC_performance']
        print(
            "Validation Results - Epoch: {} Acc:  {:.2f}% SROCC: {:.4f} KROCC: {:.4f} PLCC: {:.4f} RMSE: {:.4f} MAE: {:.4f} OR: {:.2f}%"
            .format(engine.state.epoch, 100 * Acc, SROCC, KROCC, PLCC, RMSE,
                    MAE, 100 * OR))
        writer.add_scalar("validation/SROCC", SROCC, engine.state.epoch)
        writer.add_scalar("validation/KROCC", KROCC, engine.state.epoch)
        writer.add_scalar("validation/PLCC", PLCC, engine.state.epoch)
        writer.add_scalar("validation/RMSE", RMSE, engine.state.epoch)
        writer.add_scalar("validation/MAE", MAE, engine.state.epoch)
        writer.add_scalar("validation/OR", OR, engine.state.epoch)
        writer.add_scalar("validation/Acc", Acc, engine.state.epoch)
        global best_criterion
        global best_epoch
        if SROCC > best_criterion:
            best_criterion = SROCC
            best_epoch = engine.state.epoch
            torch.save(model.state_dict(), trained_model_file)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_testing_results(engine):
        # Optional test-set evaluation after every epoch.
        if config["test_ratio"] > 0 and config['test_during_training']:
            evaluator.run(test_loader)
            metrics = evaluator.state.metrics
            SROCC, KROCC, PLCC, RMSE, MAE, OR = metrics['IQA_performance']
            Acc = metrics['IDC_performance']
            print(
                "Testing Results    - Epoch: {} Acc:  {:.2f}% SROCC: {:.4f} KROCC: {:.4f} PLCC: {:.4f} RMSE: {:.4f} MAE: {:.4f} OR: {:.2f}%"
                .format(engine.state.epoch, 100 * Acc, SROCC, KROCC, PLCC,
                        RMSE, MAE, 100 * OR))
            writer.add_scalar("testing/SROCC", SROCC, engine.state.epoch)
            writer.add_scalar("testing/KROCC", KROCC, engine.state.epoch)
            writer.add_scalar("testing/PLCC", PLCC, engine.state.epoch)
            writer.add_scalar("testing/RMSE", RMSE, engine.state.epoch)
            writer.add_scalar("testing/MAE", MAE, engine.state.epoch)
            writer.add_scalar("testing/OR", OR, engine.state.epoch)
            writer.add_scalar("testing/Acc", Acc, engine.state.epoch)

    @trainer.on(Events.COMPLETED)
    def final_testing_results(engine):
        # Reload the SROCC-best checkpoint and run the definitive test pass.
        # NOTE(review): `best_epoch` is only assigned when validation SROCC
        # improves; if it never does, reading it here raises NameError.
        if config["test_ratio"] > 0:
            model.load_state_dict(torch.load(trained_model_file))
            evaluator.run(test_loader)
            metrics = evaluator.state.metrics
            SROCC, KROCC, PLCC, RMSE, MAE, OR = metrics['IQA_performance']
            Acc = metrics['IDC_performance']
            global best_epoch
            print(
                "Final Test Results - Epoch: {} Acc:  {:.2f}% SROCC: {:.4f} KROCC: {:.4f} PLCC: {:.4f} RMSE: {:.4f} MAE: {:.4f} OR: {:.2f}%"
                .format(best_epoch, 100 * Acc, SROCC, KROCC, PLCC, RMSE, MAE,
                        100 * OR))
            np.save(save_result_file, (Acc, SROCC, KROCC, PLCC, RMSE, MAE, OR))

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    writer.close()