Example #1
from ignite.handlers import DiskSaver


def get_save_handler(config):
    # Upload checkpoints to the ClearML server when ClearML tracking is enabled,
    # otherwise write them to the local output directory.
    if config["with_clearml"]:
        from ignite.contrib.handlers.clearml_logger import ClearMLSaver

        return ClearMLSaver(dirname=config["output_path"])

    return DiskSaver(config["output_path"], require_empty=False)
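The handler returned above is what ignite's Checkpoint uses to persist its files. A minimal sketch of the wiring, assuming a trainer, an evaluator exposing an "accuracy" metric, and the same config dict (these surrounding names are assumptions, not part of the example):

from ignite.engine import Events
from ignite.handlers import Checkpoint, global_step_from_engine

# Hypothetical wiring: keep only the best model according to validation accuracy.
best_model_handler = Checkpoint(
    {"model": model},
    get_save_handler(config),
    n_saved=1,
    score_name="val_accuracy",
    score_function=lambda engine: engine.state.metrics["accuracy"],
    global_step_transform=global_step_from_engine(trainer),
)
evaluator.add_event_handler(Events.COMPLETED, best_model_handler)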
Example #2
from ignite.handlers import DiskSaver


def get_save_handler(config):
    # exp_tracking is a project-local helper that reports whether ClearML is available.
    if exp_tracking.has_clearml:
        from ignite.contrib.handlers.clearml_logger import ClearMLSaver

        return ClearMLSaver(dirname=config.output_path.as_posix())

    return DiskSaver(config.output_path.as_posix())
Example #3
import os
from unittest.mock import MagicMock, Mock

import clearml
import pytest
import torch

from ignite.contrib.handlers.clearml_logger import ClearMLSaver
from ignite.engine import Engine, State
from ignite.handlers import Checkpoint


def test_clearml_disk_saver_integration_no_logger():
    model = torch.nn.Module()
    to_save_serializable = {"model": model}

    # Mock out ClearML's current task and model-output binding so that ClearMLSaver
    # falls back to a temporary local checkpoints directory (hence the warning).
    with pytest.warns(
            UserWarning,
            match="ClearMLSaver created a temporary checkpoints directory"):
        clearml.Task.current_task = Mock(return_value=object())
        clearml.binding.frameworks.WeightsFileHandler.create_output_model = MagicMock(
        )
        clearml_saver = ClearMLSaver()
        checkpoint = Checkpoint(to_save=to_save_serializable,
                                save_handler=clearml_saver,
                                n_saved=1)

    trainer = Engine(lambda e, b: None)
    trainer.state = State(epoch=0, iteration=0)
    checkpoint(trainer)
    trainer.state.iteration = 1
    checkpoint(trainer)

    if clearml_saver._atomic:
        assert clearml.binding.frameworks.WeightsFileHandler.create_output_model.call_count == 2
    else:
        saved_files = list(os.listdir(clearml_saver.dirname))
        assert len(saved_files) == 1
        assert saved_files[0] == "model_1.pt"
Example #4
from ignite.handlers import DiskSaver


def get_save_handler(output_path, with_clearml):
    # Same pattern as above: upload to ClearML when requested, otherwise save locally.
    if with_clearml:
        from ignite.contrib.handlers.clearml_logger import ClearMLSaver

        return ClearMLSaver(dirname=output_path)

    return DiskSaver(output_path)
Example #5
    def create_callbacks(self):

        ## SETUP CALLBACKS
        print('[INFO] Creating callback functions for training loop...',
              end='')
        # Early Stopping - stops training if the validation loss does not decrease after 5 epochs
        handler = EarlyStopping(patience=self.config.EARLY_STOPPING_PATIENCE,
                                score_function=score_function_loss,
                                trainer=self.train_engine)
        self.evaluator.add_event_handler(Events.COMPLETED, handler)
        print('Early Stopping ({} epochs)...'.format(
            self.config.EARLY_STOPPING_PATIENCE),
              end='')

        val_checkpointer = Checkpoint(
            {"model": self.model},
            ClearMLSaver(),
            n_saved=1,
            score_function=score_function_acc,
            score_name="val_acc",
            filename_prefix='cub200_{}_ignite_best'.format(
                self.config.MODEL.MODEL_NAME),
            global_step_transform=global_step_from_engine(self.train_engine),
        )
        self.evaluator.add_event_handler(Events.EPOCH_COMPLETED,
                                         val_checkpointer)
        print('Model Checkpointing...', end='')
        print('Done')
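The score_function_loss and score_function_acc helpers referenced above are not shown in this excerpt. A minimal sketch of what such score functions typically look like, assuming the evaluator exposes 'loss' and 'accuracy' metrics (the metric keys are assumptions):

# Hypothetical helpers: EarlyStopping and Checkpoint treat a larger score as better,
# so the validation loss is negated while accuracy is returned as-is.
def score_function_loss(engine):
    return -engine.state.metrics['loss']


def score_function_acc(engine):
    return engine.state.metrics['accuracy']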
Example #6
# idist is ignite.distributed; DummyModel is a small helper model defined in the
# surrounding test module.
def _test_save_model_optimizer_lr_scheduler_with_state_dict(
        device, on_zero_rank=False):

    if idist.get_rank() == 0:
        clearml.Task.current_task = Mock(return_value=object())
        clearml.binding.frameworks.WeightsFileHandler.create_output_model = MagicMock(
        )

    torch.manual_seed(23)

    model = DummyModel().to(device)

    optim = torch.optim.SGD(model.parameters(), lr=0.1)
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.5)

    def update_fn(engine, batch):
        x = torch.rand((4, 2)).to(device)
        optim.zero_grad()
        y = model(x)
        # Below code raises: RuntimeError: torch_xla/csrc/tensor_impl.cpp:144 : XLA tensors do not have storage
        # Probably related to https://github.com/pytorch/xla/issues/2576
        # loss = y.pow(2.0).sum()
        loss = y.sum()
        loss.backward()
        if idist.has_xla_support:
            import torch_xla.core.xla_model as xm

            xm.optimizer_step(optim, barrier=True)
        else:
            optim.step()
        lr_scheduler.step()

    engine = Engine(update_fn)

    to_save = {
        "model": model,
        "optimizer": optim,
        "lr_scheduler": lr_scheduler
    }

    with pytest.warns(
            UserWarning,
            match=r"ClearMLSaver created a temporary checkpoints directory"):
        clearml_saver = ClearMLSaver()

    if (not on_zero_rank) or (on_zero_rank and idist.get_rank() == 0):
        checkpoint = Checkpoint(to_save=to_save,
                                save_handler=clearml_saver,
                                n_saved=1)
        engine.add_event_handler(Events.EPOCH_COMPLETED, checkpoint)

    engine.run([0], max_epochs=4)

    idist.barrier()

    saved_objects = sorted(os.listdir(clearml_saver.dirname))
    # saved_objects is e.g. ['PREFIX_checkpoint_3.pt']
    saved_checkpoint = os.path.join(clearml_saver.dirname, saved_objects[0])

    if idist.has_xla_support:
        device = "cpu"

    loaded_obj = torch.load(saved_checkpoint, map_location=device)
    for f in ["model", "optimizer", "lr_scheduler"]:
        assert f in loaded_obj
    loaded_model_state_dict = loaded_obj["model"]
    loaded_optimizer_state_dict = loaded_obj["optimizer"]
    loaded_lr_scheduler_state_dict = loaded_obj["lr_scheduler"]

    assert isinstance(loaded_model_state_dict, dict)
    assert isinstance(loaded_optimizer_state_dict, dict)
    assert isinstance(loaded_lr_scheduler_state_dict, dict)

    # Explicitly move the model to CPU before comparing state dict tensors
    model_state_dict = model.cpu().state_dict()
    for key in model_state_dict.keys():
        assert key in loaded_model_state_dict
        model_value = model_state_dict[key]
        loaded_model_value = loaded_model_state_dict[key]
        assert (model_value.cpu().numpy() == loaded_model_value.cpu().numpy()
                ).all()

    optim_state_dict = optim.state_dict()
    for key in optim_state_dict.keys():
        assert key in loaded_optimizer_state_dict
        optim_value = optim_state_dict[key]
        loaded_optim_value = loaded_optimizer_state_dict[key]
        if idist.get_rank() == 0:
            assert optim_value == loaded_optim_value

    lr_scheduler_state_dict = lr_scheduler.state_dict()
    for key in lr_scheduler_state_dict.keys():
        assert key in loaded_lr_scheduler_state_dict
        lr_scheduler_value = lr_scheduler_state_dict[key]
        loaded_lr_scheduler_value = loaded_lr_scheduler_state_dict[key]
        assert lr_scheduler_value == loaded_lr_scheduler_value
Example #7
def run(train_batch_size, val_batch_size, epochs, lr, momentum):
    # Net, get_data_loaders and the ClearML logger handlers used below are imported
    # elsewhere in the original example script.
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    clearml_logger = ClearMLLogger(project_name="examples", task_name="ignite")

    clearml_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )

    for tag, evaluator in [("training metrics", train_evaluator),
                           ("validation metrics", validation_evaluator)]:
        clearml_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    clearml_logger.attach_opt_params_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        optimizer=optimizer)

    clearml_logger.attach(trainer,
                          log_handler=WeightsScalarHandler(model),
                          event_name=Events.ITERATION_COMPLETED(every=100))

    clearml_logger.attach(trainer,
                          log_handler=WeightsHistHandler(model),
                          event_name=Events.EPOCH_COMPLETED(every=100))

    clearml_logger.attach(trainer,
                          log_handler=GradsScalarHandler(model),
                          event_name=Events.ITERATION_COMPLETED(every=100))

    clearml_logger.attach(trainer,
                          log_handler=GradsHistHandler(model),
                          event_name=Events.EPOCH_COMPLETED(every=100))

    handler = Checkpoint(
        {"model": model},
        ClearMLSaver(),
        n_saved=1,
        score_function=lambda e: e.state.metrics["accuracy"],
        score_name="val_acc",
        filename_prefix="best",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.EPOCH_COMPLETED, handler)

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    clearml_logger.close()
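For completeness, a hedged command-line entry point that would call run(); the flag names and default values below are assumptions rather than part of the original example:

if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--batch_size", type=int, default=64, help="training batch size")
    parser.add_argument("--val_batch_size", type=int, default=1000, help="validation batch size")
    parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train")
    parser.add_argument("--lr", type=float, default=0.01, help="learning rate")
    parser.add_argument("--momentum", type=float, default=0.5, help="SGD momentum")
    args = parser.parse_args()

    run(args.batch_size, args.val_batch_size, args.epochs, args.lr, args.momentum)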