Example #1
def test_mlflow_logger_experiment_calls(client, mlflow, time, tmpdir):
    """
    Test that the logger calls methods on the mlflow experiment correctly.
    """
    time.return_value = 1

    logger = MLFlowLogger("test",
                          save_dir=tmpdir,
                          artifact_location="my_artifact_location")
    logger._mlflow_client.get_experiment_by_name.return_value = None

    params = {"test": "test_param"}
    logger.log_hyperparams(params)

    logger.experiment.log_param.assert_called_once_with(
        logger.run_id, "test", "test_param")

    metrics = {"some_metric": 10}
    logger.log_metrics(metrics)

    logger.experiment.log_metric.assert_called_once_with(
        logger.run_id, "some_metric", 10, 1000, None)

    logger._mlflow_client.create_experiment.assert_called_once_with(
        name="test", artifact_location="my_artifact_location")
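The `client`, `mlflow`, and `time` arguments are mocks, so the assertions above inspect calls recorded on a patched MLflow client rather than a real tracking server. A plausible setup for those mocks (patch targets assumed, not shown in this snippet) is a stack of `mock.patch` decorators on the logger module:

from unittest import mock

# Assumed patch targets on pytorch_lightning.loggers.mlflow; decorators apply
# bottom-up, so `client` receives the MlflowClient mock, then `mlflow`, then `time`.
@mock.patch("pytorch_lightning.loggers.mlflow.time")
@mock.patch("pytorch_lightning.loggers.mlflow.mlflow")
@mock.patch("pytorch_lightning.loggers.mlflow.MlflowClient")
def test_mlflow_logger_experiment_calls(client, mlflow, time, tmpdir):
    ...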
Example #2
def objective(trial, args):
    params = get_trial_params(trial)
    params['hidden_size'] = 2**params['hidden_size']
    params['acc_grads'] = 2**params['acc_grads']

    early_stopper = EarlyStopping(
        monitor='val_loss', min_delta=0.005, patience=3, mode='min')
    callbacks = [early_stopper, PyTorchLightningPruningCallback(
        trial, monitor="val_loss")]

    if args.model_type == 'attnlstm':
        params['attn_width'] = trial.suggest_int("attn_width", 3, 64)

    if 'split' in args.val_mode:
        dataset_hour = args.data.split('_')[-1]
        logger = MLFlowLogger(
            experiment_name=f'Optuna_{dataset_hour}h_{args.val_mode[-1]}_split')
        print(f'Optuna_{dataset_hour}h_{args.val_mode[-1]}_split')
        val_losses = []
        for _split_id in range(int(args.val_mode[-1])):
            print(f"Split {_split_id} Trial {trial.number}")
            # split_id is reset to the actual split just before _get_data below
            args.__dict__["split_id"] = 0
            for key in params:
                args.__dict__[str(key)] = params.get(key)
            model = LitLSTM(args)
            trainer = Trainer(
                logger=logger,
                callbacks=callbacks,
                **get_trainer_params(args),
            )
            logger.log_hyperparams(model.args)
            args.__dict__["split_id"] = _split_id
            model._get_data(args, data_mode='init')
            trainer.fit(model)
            trainer.test(model, test_dataloaders=model.test_dataloader())
            # logger.finalize()
            val_losses.append(model.metrics['val_loss'])

        # log mean val loss for later retrieval of best model
        mean_val_loss = torch.stack(val_losses).mean()
        logger.log_metrics({"mean_val_loss": mean_val_loss}, step=0)
        logger.finalize()
        return mean_val_loss

    elif args.val_mode == 'full':
        logger = MLFlowLogger(experiment_name='Optuna_full')
        for key in params:
            args.__dict__[str(key)] = params.get(key)
        model = LitLSTM(args)
        trainer = Trainer(
            logger=logger,
            callbacks=callbacks,
            **get_trainer_params(args),
        )
        logger.log_hyperparams(model.args)
        trainer.fit(model)
        trainer.test(model, test_dataloaders=model.test_dataloader())
        model.save_preds_and_targets(to_disk=True)
        logger.finalize()
        return model.metrics['val_loss']
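The objective above is meant to be driven by an Optuna study; a minimal driver sketch, assuming `args` comes from the project's own argument parser, could look like this:

import optuna

# Hypothetical driver; `args` is assumed to come from the project's argument parser.
study = optuna.create_study(direction="minimize",
                            pruner=optuna.pruners.MedianPruner())
study.optimize(lambda trial: objective(trial, args), n_trials=50)
print("Best params:", study.best_trial.params)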
Example #3
def train(args):
    seed_everything(args.seed)
    model = LitLSTM(args)

    logger = MLFlowLogger(experiment_name='Default')

    early_stop_callback = EarlyStopping(
        monitor='val_loss', min_delta=0.005, patience=3,
        verbose=args.verbose, mode='min',
    ) if args.early else None

    trainer = Trainer(
        log_gpu_memory='all' if args.verbose else None,
        track_grad_norm=2 if args.verbose else -1,
        logger=logger,
        weights_summary='full',
        callbacks=[early_stop_callback] if early_stop_callback is not None else [],
        accumulate_grad_batches=args.acc_grads,
        profiler=args.verbose,
        **get_trainer_params(args),
    )

    logger.log_hyperparams(model.args)
    trainer.fit(model)
    trainer.test(model, test_dataloaders=model.test_dataloader())
    model.save_preds_and_targets(to_disk=True)
    logger.finalize()

    return logger.run_id
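Since `train` returns the MLflow run id, the finished run can be inspected afterwards. A minimal sketch, assuming the default local `./mlruns` store used by the logger above:

from mlflow.tracking import MlflowClient

run_id = train(args)          # args from the project's argument parser
client = MlflowClient()       # local ./mlruns store unless MLFLOW_TRACKING_URI is set
run = client.get_run(run_id)
print(run.data.params, run.data.metrics)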
Example #4
def test_mlflow_logger_experiment_calls(client, mlflow, time, tmpdir):
    """
    Test that the logger calls methods on the mlflow experiment correctly.
    """
    time.return_value = 1

    logger = MLFlowLogger('test',
                          save_dir=tmpdir,
                          artifact_location='my_artifact_location')
    logger._mlflow_client.get_experiment_by_name.return_value = None

    params = {'test': 'test_param'}
    logger.log_hyperparams(params)

    logger.experiment.log_param.assert_called_once_with(
        logger.run_id, 'test', 'test_param')

    metrics = {'some_metric': 10}
    logger.log_metrics(metrics)

    logger.experiment.log_metric.assert_called_once_with(
        logger.run_id, 'some_metric', 10, 1000, None)

    logger._mlflow_client.create_experiment.assert_called_once_with(
        name='test',
        artifact_location='my_artifact_location',
    )
Example #5
def test_mlflow_logger_with_long_param_value(client, mlflow, tmpdir):
    """Test that the logger warns and discards a parameter value that is too long for MLflow."""
    logger = MLFlowLogger("test", save_dir=tmpdir)
    value = "test" * 100
    key = "test_param"
    params = {key: value}

    with pytest.warns(RuntimeWarning, match=f"Discard {key}={value}"):
        logger.log_hyperparams(params)
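The `RuntimeWarning` is raised because the value exceeds the maximum parameter length accepted for MLflow (250 characters in older releases; the exact limit depends on the MLflow version). If long values are expected, one option is to truncate them before logging, as in this sketch:

MAX_PARAM_LEN = 250  # assumed limit; newer MLflow releases accept longer values
safe_params = {k: str(v)[:MAX_PARAM_LEN] for k, v in params.items()}
logger.log_hyperparams(safe_params)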
Example #6
def _train(
    task: str,
    ov: List[str],
    do_sweep: bool,
):
    """
    Run training

    Args:
        task: task to run training for
        ov: overwrites for config manager
        do_sweep: determine best empirical parameters for the run
    """
    print(f"Overwrites: {ov}")
    initialize_config_module(config_module="nndet.conf")
    cfg = compose(task, "config.yaml", overrides=ov if ov is not None else [])

    assert cfg.host.parent_data is not None, 'Parent data cannot be None'
    assert cfg.host.parent_results is not None, 'Output dir cannot be None'

    train_dir = init_train_dir(cfg)

    pl_logger = MLFlowLogger(
        experiment_name=cfg["task"],
        tags={
            "host": socket.gethostname(),
            "fold": cfg["exp"]["fold"],
            "task": cfg["task"],
            "job_id": os.getenv('LSB_JOBID', 'no_id'),
            "mlflow.runName": cfg["exp"]["id"],
        },
        save_dir=os.getenv("MLFLOW_TRACKING_URI", "./mlruns"),
    )
    pl_logger.log_hyperparams(
        flatten_mapping(
            {"model": OmegaConf.to_container(cfg["model_cfg"], resolve=True)}))
    pl_logger.log_hyperparams(
        flatten_mapping({
            "trainer":
            OmegaConf.to_container(cfg["trainer_cfg"], resolve=True)
        }))

    logger.remove()
    logger.add(sys.stdout, format="{level} {message}", level="INFO")
    log_file = Path(os.getcwd()) / "train.log"
    logger.add(log_file, level="INFO")
    logger.info(f"Log file at {log_file}")

    meta_data = {}
    meta_data["torch_version"] = str(torch.__version__)
    meta_data["date"] = str(datetime.now())
    meta_data["git"] = log_git(nndet.__path__[0], repo_name="nndet")
    save_json(meta_data, "./meta.json")
    try:
        write_requirements_to_file("requirements.txt")
    except Exception as e:
        logger.error(f"Could not log req: {e}")

    plan_path = Path(str(cfg.host["plan_path"]))
    plan = load_pickle(plan_path)
    save_json(create_debug_plan(plan), "./plan_debug.json")

    data_dir = Path(cfg.host["preprocessed_output_dir"]
                    ) / plan["data_identifier"] / "imagesTr"

    datamodule = Datamodule(
        augment_cfg=OmegaConf.to_container(cfg["augment_cfg"], resolve=True),
        plan=plan,
        data_dir=data_dir,
        fold=cfg["exp"]["fold"],
    )
    module = MODULE_REGISTRY[cfg["module"]](
        model_cfg=OmegaConf.to_container(cfg["model_cfg"], resolve=True),
        trainer_cfg=OmegaConf.to_container(cfg["trainer_cfg"], resolve=True),
        plan=plan,
    )
    callbacks = []
    checkpoint_cb = ModelCheckpoint(
        dirpath=train_dir,
        filename='model_best',
        save_last=True,
        save_top_k=1,
        monitor=cfg["trainer_cfg"]["monitor_key"],
        mode=cfg["trainer_cfg"]["monitor_mode"],
    )
    checkpoint_cb.CHECKPOINT_NAME_LAST = 'model_last'
    callbacks.append(checkpoint_cb)
    callbacks.append(LearningRateMonitor(logging_interval="epoch"))

    OmegaConf.save(cfg, str(Path(os.getcwd()) / "config.yaml"))
    OmegaConf.save(cfg,
                   str(Path(os.getcwd()) / "config_resolved.yaml"),
                   resolve=True)
    save_pickle(plan, train_dir / "plan.pkl")  # backup plan
    splits = load_pickle(
        Path(cfg.host.preprocessed_output_dir) / datamodule.splits_file)
    save_pickle(splits, train_dir / "splits.pkl")

    trainer_kwargs = {}
    if cfg["train"]["mode"].lower() == "resume":
        trainer_kwargs["resume_from_checkpoint"] = train_dir / "model_last.ckpt"

    num_gpus = cfg["trainer_cfg"]["gpus"]
    logger.info(f"Using {num_gpus} GPUs for training")
    plugins = cfg["trainer_cfg"].get("plugins", None)
    logger.info(f"Using {plugins} plugins for training")

    trainer = pl.Trainer(
        gpus=list(range(num_gpus)) if num_gpus > 1 else num_gpus,
        accelerator=cfg["trainer_cfg"]["accelerator"],
        precision=cfg["trainer_cfg"]["precision"],
        amp_backend=cfg["trainer_cfg"]["amp_backend"],
        amp_level=cfg["trainer_cfg"]["amp_level"],
        benchmark=cfg["trainer_cfg"]["benchmark"],
        deterministic=cfg["trainer_cfg"]["deterministic"],
        callbacks=callbacks,
        logger=pl_logger,
        max_epochs=module.max_epochs,
        progress_bar_refresh_rate=None
        if bool(int(os.getenv("det_verbose", 1))) else 0,
        reload_dataloaders_every_epoch=False,
        num_sanity_val_steps=10,
        weights_summary='full',
        plugins=plugins,
        terminate_on_nan=True,  # TODO: make modular
        move_metrics_to_cpu=True,
        **trainer_kwargs)
    trainer.fit(module, datamodule=datamodule)

    if do_sweep:
        case_ids = splits[cfg["exp"]["fold"]]["val"]
        if "debug" in cfg and "num_cases_val" in cfg["debug"]:
            case_ids = case_ids[:cfg["debug"]["num_cases_val"]]

        inference_plan = module.sweep(
            cfg=OmegaConf.to_container(cfg, resolve=True),
            save_dir=train_dir,
            train_data_dir=data_dir,
            case_ids=case_ids,
            run_prediction=True,
        )

        plan["inference_plan"] = inference_plan
        save_pickle(plan, train_dir / "plan_inference.pkl")

        ensembler_cls = module.get_ensembler_cls(
            key="boxes",
            dim=plan["network_dim"])  # TODO: make this configurable
        for restore in [True, False]:
            target_dir = train_dir / "val_predictions" if restore else \
                train_dir / "val_predictions_preprocessed"
            extract_results(
                source_dir=train_dir / "sweep_predictions",
                target_dir=target_dir,
                ensembler_cls=ensembler_cls,
                restore=restore,
                **inference_plan,
            )

        _evaluate(
            task=cfg["task"],
            model=cfg["exp"]["id"],
            fold=cfg["exp"]["fold"],
            test=False,
            do_boxes_eval=True,  # TODO: make this configurable
            do_analyze_boxes=True,  # TODO: make this configurable
        )
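For reference, `_train` is normally wrapped by nnDetection's CLI entrypoint; a direct call with hypothetical arguments would look like:

# Hypothetical invocation; task name and override are placeholders.
_train(task="Task000_Example", ov=["exp.fold=0"], do_sweep=True)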