def cli_main():
    pl.seed_everything(1234)

    # ------------
    # args
    # ------------
    parser = ArgumentParser()
    parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--hidden_dim", type=int, default=128)
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    # ------------
    # data
    # ------------
    dataset = MNIST("",
                    train=True,
                    download=True,
                    transform=transforms.ToTensor())
    mnist_test = MNIST("",
                       train=False,
                       download=True,
                       transform=transforms.ToTensor())
    mnist_train, mnist_val = random_split(dataset, [55000, 5000])

    train_loader = DataLoader(mnist_train, batch_size=args.batch_size)
    val_loader = DataLoader(mnist_val, batch_size=args.batch_size)
    test_loader = DataLoader(mnist_test, batch_size=args.batch_size)

    # ------------
    # model
    # ------------
    model = LitAutoEncoder()

    # ------------
    # logging
    # ------------
    # get azureml run object
    run = Run.get_context()
    # get the tracking uri for the azureml workspace
    mlflow_uri = run.experiment.workspace.get_mlflow_tracking_uri()
    # get the azureml experiment name
    exp_name = run.experiment.name

    mlf_logger = MLFlowLogger(experiment_name=exp_name,
                              tracking_uri=mlflow_uri)
    # link the mlflowlogger run ID to the azureml run ID
    mlf_logger._run_id = run.id

    # ------------
    # training
    # ------------
    trainer = pl.Trainer.from_argparse_args(args, logger=mlf_logger)
    trainer.fit(model, train_loader, val_loader)

    # ------------
    # testing
    # ------------
    result = trainer.test(test_dataloaders=test_loader)
    print(result)
def test_mlflow_run_name_setting(client, mlflow, tmpdir):
    """Test that the run_name argument makes the MLFLOW_RUN_NAME tag."""

    tags = resolve_tags({MLFLOW_RUN_NAME: "run-name-1"})

    # run_name is appended to tags
    logger = MLFlowLogger("test", run_name="run-name-1", save_dir=tmpdir)
    logger = mock_mlflow_run_creation(logger, experiment_id="exp-id")
    _ = logger.experiment
    client.return_value.create_run.assert_called_with(experiment_id="exp-id",
                                                      tags=tags)

    # run_name overrides tags[MLFLOW_RUN_NAME]
    logger = MLFlowLogger("test",
                          run_name="run-name-1",
                          tags={MLFLOW_RUN_NAME: "run-name-2"},
                          save_dir=tmpdir)
    logger = mock_mlflow_run_creation(logger, experiment_id="exp-id")
    _ = logger.experiment
    client.return_value.create_run.assert_called_with(experiment_id="exp-id",
                                                      tags=tags)

    # default run_name (= None) does not append new tag
    logger = MLFlowLogger("test", save_dir=tmpdir)
    logger = mock_mlflow_run_creation(logger, experiment_id="exp-id")
    _ = logger.experiment
    default_tags = resolve_tags(None)
    client.return_value.create_run.assert_called_with(experiment_id="exp-id",
                                                      tags=default_tags)
Example #3
def test_mlflow_logger_with_unexpected_characters(client, mlflow, tmpdir):
    """Test that the logger raises warning with special characters not accepted by MLFlow."""
    logger = MLFlowLogger("test", save_dir=tmpdir)
    metrics = {"[some_metric]": 10}

    with pytest.warns(RuntimeWarning, match="special characters in metric name"):
        logger.log_metrics(metrics)
Example #4
def test_mlflow_logger(tmpdir):
    """Verify that basic functionality of mlflow logger works."""
    tutils.reset_seed()

    hparams = tutils.get_default_hparams()
    model = LightningTestModel(hparams)

    mlflow_dir = os.path.join(tmpdir, 'mlruns')
    logger = MLFlowLogger('test', tracking_uri=f'file:{os.sep * 2}{mlflow_dir}')

    # Test already exists
    logger2 = MLFlowLogger('test', tracking_uri=f'file:{os.sep * 2}{mlflow_dir}')
    _ = logger2.run_id

    # Try logging string
    logger.log_metrics({'acc': 'test'})

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        train_percent_check=0.05,
        logger=logger
    )
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)

    assert result == 1, 'Training failed'
def test_mlflow_logger_exists(tmpdir):
    """ Test launching two independent loggers. """
    logger = MLFlowLogger('test', save_dir=tmpdir)
    # same name leads to same experiment id, but different runs get recorded
    logger2 = MLFlowLogger('test', save_dir=tmpdir)
    assert logger.experiment_id == logger2.experiment_id
    assert logger.run_id != logger2.run_id
    logger3 = MLFlowLogger('new', save_dir=tmpdir)
    assert logger3.experiment_id != logger.experiment_id
def test_mlflow_logger_with_long_param_value(client, mlflow, tmpdir):
    """Test that the logger raises warning with special characters not accepted by MLFlow."""
    logger = MLFlowLogger("test", save_dir=tmpdir)
    value = "test" * 100
    key = "test_param"
    params = {key: value}

    with pytest.warns(RuntimeWarning, match=f"Discard {key}={value}"):
        logger.log_hyperparams(params)
def test_mlflow_logger_exists(client, mlflow, tmpdir):
    """Test launching three independent loggers with either same or different experiment name."""

    run1 = MagicMock()
    run1.info.run_id = "run-id-1"
    run1.info.experiment_id = "exp-id-1"

    run2 = MagicMock()
    run2.info.run_id = "run-id-2"

    run3 = MagicMock()
    run3.info.run_id = "run-id-3"

    # simulate non-existing experiment creation
    client.return_value.get_experiment_by_name = MagicMock(return_value=None)
    client.return_value.create_experiment = MagicMock(
        return_value="exp-id-1")  # experiment_id
    client.return_value.create_run = MagicMock(return_value=run1)

    logger = MLFlowLogger("test", save_dir=tmpdir)
    assert logger._experiment_id is None
    assert logger._run_id is None
    _ = logger.experiment
    assert logger.experiment_id == "exp-id-1"
    assert logger.run_id == "run-id-1"
    logger.experiment.create_experiment.assert_called_once()
    client.reset_mock(return_value=True)

    # simulate existing experiment returns experiment id
    exp1 = MagicMock()
    exp1.experiment_id = "exp-id-1"
    client.return_value.get_experiment_by_name = MagicMock(return_value=exp1)
    client.return_value.create_run = MagicMock(return_value=run2)

    # same name leads to same experiment id, but different runs get recorded
    logger2 = MLFlowLogger("test", save_dir=tmpdir)
    assert logger2.experiment_id == logger.experiment_id
    assert logger2.run_id == "run-id-2"
    assert logger2.experiment.create_experiment.call_count == 0
    logger2.experiment.create_run.assert_called_once()
    client.reset_mock(return_value=True)

    # simulate a 3rd experiment with new name
    client.return_value.get_experiment_by_name = MagicMock(return_value=None)
    client.return_value.create_experiment = MagicMock(return_value="exp-id-3")
    client.return_value.create_run = MagicMock(return_value=run3)

    # logger with new experiment name causes new experiment id and new run id to be created
    logger3 = MLFlowLogger("new", save_dir=tmpdir)
    assert logger3.experiment_id == "exp-id-3" != logger.experiment_id
    assert logger3.run_id == "run-id-3"
Example #8
def train_model(config, gpus, w2v, num_epochs=10):
    # val_Accuracy should be maximized, so both callbacks monitor it with mode="max"
    early_stop_callback = EarlyStopping(monitor="val_Accuracy",
                                        min_delta=0.0,
                                        patience=5,
                                        verbose=True,
                                        mode="max")
    checkpoint_callback = ModelCheckpoint(
        "models/wav2vec_kws/",
        save_top_k=1,
        verbose=True,
        monitor='val_Accuracy',
        mode='max',
    )
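    # report val_Accuracy (as "acc") back to Ray Tune at the end of every validation run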
    tune_callback = TuneReportCallback({"acc": "val_Accuracy"},
                                       on="validation_end")
    logger = TensorBoardLogger("tb_logs", name="wav2vec_kws_tune")
    mlf_logger = MLFlowLogger(experiment_name="wav2vec_kws",
                              tracking_uri="http://192.168.0.32")
    mlflow.pytorch.autolog()
    trainer = pl.Trainer(
        gpus=gpus,
        callbacks=[checkpoint_callback, early_stop_callback, tune_callback],
        logger=[logger, mlf_logger],
        accumulate_grad_batches=4,
        amp_level="O0",
        max_epochs=num_epochs,
        progress_bar_refresh_rate=1,
        log_every_n_steps=1,
        flush_logs_every_n_steps=1)
    config = argparse.Namespace(**config)
    model = Wav2VecKWS(config, w2v)
    trainer.fit(model)
Example #9
def main(args: DictConfig):
    # torch.autograd.set_detect_anomaly(True)

    model = SuperGlueLightning(args)
    mlf_logger = MLFlowLogger(experiment_name='SuperGlue',
                              tracking_uri=args.exp.mlflow_uri)
    mlf_logger.experiment.log_param(
        mlf_logger.run_id, 'checkpoints_path',
        f'{os.getcwd()}/{args.exp.checkpoint_path}')
    checkpoint_callback = ModelCheckpoint(dirpath=args.exp.checkpoint_path,
                                          verbose=True,
                                          save_weights_only=True)

    trainer = Trainer(
        gpus=eval(str(args.exp.gpus)),
        logger=mlf_logger if args.exp.logging else None,
        max_epochs=args.exp.epochs,
        accumulate_grad_batches=args.exp.accumulate_grad_batches,
        checkpoint_callback=checkpoint_callback
        if args.exp.checkpoint else None,
        val_check_interval=args.exp.val_check_interval,
        # limit_val_batches=args.data.val_size,
        auto_lr_find=False,
        accelerator='ddp',
        log_every_n_steps=100,
        num_sanity_val_steps=0,
    )

    trainer.fit(model)
Example #10
def test_mlflow_experiment_id_retrieved_once(client, mlflow, tmpdir):
    """Test that the logger experiment_id retrieved only once."""
    logger = MLFlowLogger("test", save_dir=tmpdir)
    _ = logger.experiment
    _ = logger.experiment
    _ = logger.experiment
    assert logger.experiment.get_experiment_by_name.call_count == 1
Example #11
def test_mlflow_logger_dirs_creation(tmpdir):
    """ Test that the logger creates the folders and files in the right place. """
    if not _MLFLOW_AVAILABLE:
        pytest.xfail("test for explicit file creation requires mlflow dependency to be installed.")

    assert not os.listdir(tmpdir)
    logger = MLFlowLogger('test', save_dir=tmpdir)
    assert logger.save_dir == tmpdir
    assert set(os.listdir(tmpdir)) == {'.trash'}
    run_id = logger.run_id
    exp_id = logger.experiment_id

    # multiple experiment calls should not lead to new experiment folders
    for i in range(2):
        _ = logger.experiment
        assert set(os.listdir(tmpdir)) == {'.trash', exp_id}
        assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'}

    model = EvalModelTemplate()
    trainer = Trainer(
        default_root_dir=tmpdir,
        logger=logger,
        max_epochs=1,
        limit_val_batches=3,
        log_gpu_memory=True,
    )
    trainer.fit(model)
    assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'}
    assert 'epoch' in os.listdir(tmpdir / exp_id / run_id / 'metrics')
    assert set(os.listdir(tmpdir / exp_id / run_id / 'params')) == model.hparams.keys()
    assert trainer.checkpoint_callback.dirpath == (tmpdir / exp_id / run_id / 'checkpoints')
    assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0-step=9.ckpt'}
Example #12
def __init__(self, experiment, name, max_epochs, min_epochs, patience,
             val_check_interval):
    # logger = TensorBoardLogger('../logs', name=name)
    tags = {'mlflow.runName': name}
    logger = MLFlowLogger(experiment, 'file:../logs/mlruns', tags)
    if patience == 0:
        early_stopping = False
    else:
        early_stopping = EarlyStopping(patience=patience,
                                       monitor='val_loss',
                                       mode='min')
    # filepath = pathlib.Path('../logs') / name / f'version_{logger.version}' / 'model'
    filepath = pathlib.Path('../models') / name / 'model'
    model_checkpoint = ModelCheckpoint(str(filepath),
                                       monitor='val_loss',
                                       mode='min')
    super().__init__(
        default_save_path='../logs',
        gpus=1,
        max_epochs=max_epochs,
        min_epochs=min_epochs,
        early_stop_callback=early_stopping,
        logger=logger,
        row_log_interval=100,
        checkpoint_callback=model_checkpoint,
        val_check_interval=val_check_interval,
    )
Example #13
def test_mlflow_logger_dirs_creation(tmpdir):
    """ Test that the logger creates the folders and files in the right place. """
    assert not os.listdir(tmpdir)
    logger = MLFlowLogger('test', save_dir=tmpdir)
    assert logger.save_dir == tmpdir
    assert set(os.listdir(tmpdir)) == {'.trash'}
    run_id = logger.run_id
    exp_id = logger.experiment_id

    # multiple experiment calls should not lead to new experiment folders
    for i in range(2):
        _ = logger.experiment
        assert set(os.listdir(tmpdir)) == {'.trash', exp_id}
        assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'}

    model = EvalModelTemplate()
    trainer = Trainer(default_root_dir=tmpdir,
                      logger=logger,
                      max_epochs=1,
                      limit_val_batches=3)
    trainer.fit(model)
    assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'}
    assert 'epoch' in os.listdir(tmpdir / exp_id / run_id / 'metrics')
    assert set(os.listdir(tmpdir / exp_id / run_id /
                          'params')) == model.hparams.keys()
    assert trainer.checkpoint_callback.dirpath == (tmpdir / exp_id / run_id /
                                                   'checkpoints')
    assert set(os.listdir(
        trainer.checkpoint_callback.dirpath)) == {'epoch=0.ckpt'}
Example #14
def main(hparams) -> None:
    experiment_name = hparams.algo_name

    save_folder = 'model_weights/' + experiment_name
    if not os.path.exists(save_folder):
        os.mkdir(save_folder)
    
    checkpoint_callback = ModelCheckpoint(
                            filepath=save_folder+'/model_{epoch:02d}')

    mlf_logger = MLFlowLogger(
                                experiment_name=experiment_name,
                                tracking_uri="file:./mlruns"
                                )

    # telegram
    token = telegram_config['token']
    user_id = telegram_config['user_id']
    bot = RLBot(token=token, user_id=user_id)
    telegramCallback = TelegramRLCallback(bot)
    trainer = Trainer(checkpoint_callback=checkpoint_callback,
        max_epochs=10000,
        early_stop_callback=False,
        val_check_interval=100,
        logger=mlf_logger,
        callbacks=[telegramCallback],
    )
    model = ValueRL(hparams)
    trainer.fit(model)
Example #15
def test_mlflow_log_dir(client, mlflow, tmpdir):
    """Test that the trainer saves checkpoints in the logger's save dir."""

    # simulate experiment creation with mlflow client mock
    run = MagicMock()
    run.info.run_id = "run-id"
    client.return_value.get_experiment_by_name = MagicMock(return_value=None)
    client.return_value.create_experiment = MagicMock(return_value="exp-id")
    client.return_value.create_run = MagicMock(return_value=run)

    # test construction of default log dir path
    logger = MLFlowLogger("test", save_dir=tmpdir)
    assert logger.save_dir == tmpdir
    assert logger.version == "run-id"
    assert logger.name == "exp-id"

    model = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir,
                      logger=logger,
                      max_epochs=1,
                      limit_train_batches=1,
                      limit_val_batches=3)
    assert trainer.log_dir == logger.save_dir
    trainer.fit(model)
    assert trainer.checkpoint_callback.dirpath == (tmpdir / "exp-id" /
                                                   "run-id" / "checkpoints")
    assert set(os.listdir(
        trainer.checkpoint_callback.dirpath)) == {"epoch=0-step=1.ckpt"}
    assert trainer.log_dir == logger.save_dir
Example #16
def main(hparams):
    engine = Module(hparams)

    mlf_logger = MLFlowLogger(experiment_name=hparams.exp_name,
                              tracking_uri="./mlruns",
                              tags=hparams.tags)

    exp = mlf_logger.experiment.get_experiment_by_name(hparams.exp_name)
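    # build this run's MLflow artifact directory so checkpoints are stored alongside the other run artifacts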
    artifacts_dir = os.path.join(exp.artifact_location, mlf_logger.run_id,
                                 "artifacts")

    checkpoint_callback = ModelCheckpoint(
        filepath=artifacts_dir,
        save_top_k=-1,
        verbose=True,
        monitor="val_loss_avg",
        mode="min",
        prefix="",
    )

    trainer = Trainer(logger=mlf_logger,
                      checkpoint_callback=checkpoint_callback,
                      max_epochs=hparams.num_epochs,
                      gpus=[0])
    trainer.fit(engine)
Example #17
def main():
    load_dotenv('cassava.env')
    seed_everything(SEED)

    root_path = os.getenv('ROOT_PATH')
    train_csv_path = root_path + 'train.csv'
    train_root_path = root_path + 'train_images'

    num_classes = int(os.getenv('NUM_CLASSES', 5))
    num_epoch = int(os.getenv('NUM_EPOCH', 10))
    num_folds = int(os.getenv('NUM_FOLDS', 5))
    batch_size = int(os.getenv('BATCH_SIZE', 16))
    grad_acc = int(os.getenv('GRAD_ACC', 8))

    resize = int(os.getenv('RESIZE', 224))

    normalize = A.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
    train_transform = A.Compose([
        A.HorizontalFlip(),
        A.ShiftScaleRotate(p=1.0),
        A.ColorJitter(brightness=0.1, contrast=0.2, saturation=0.2, hue=0.0, p=1.0, always_apply=False),
        A.RandomResizedCrop(resize, resize, p=1.0, always_apply=True),
        normalize,
        ToTensorV2(p=1.0),
    ], p=1.0)
    test_transform = A.Compose([
        A.Resize(int(resize * 1.5), int(resize * 1.5)),
        normalize,
        ToTensorV2(p=1.0),
    ], p=1.0)
    tta_transform = tta.Compose([
        tta.FiveCrops(resize, resize),
    ])

    criterion = MixedLabelLoss(nn.CrossEntropyLoss(reduction='none'))
    augmentations = [snapmix, ]

    df = pd.read_csv(train_csv_path)
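    # stratified folds keep the distribution of 'label' roughly the same across the splits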
    folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=SEED).split(df['image_id'], df['label'])
    for _fold, (train, test) in enumerate(folds):
        train = df.iloc[train]
        test = df.iloc[test]
        scheduler = optim.lr_scheduler.CosineAnnealingLR

        model = TimmNet('efficientnet_b3a', num_classes, criterion, learning_rate=1e-3, scheduler=scheduler,
                        n_epoch=num_epoch, eta_min=1e-6, augmentations=augmentations, tta_transform=tta_transform)
        dm = DataFrameDataModule(train, train_root_path, test, batch_size=batch_size,
                                 train_transform=train_transform, test_transform=test_transform)

        mlf_logger = MLFlowLogger(
            experiment_name='cassava',
            tracking_uri='file:./cassava'
        )
        trainer = Trainer(gpus=-1, precision=32, deterministic=True, accumulate_grad_batches=grad_acc,
                          profiler='simple', val_check_interval=1.0, logger=mlf_logger, max_epochs=num_epoch)
        trainer.fit(model, datamodule=dm)
Example #18
def test_mlflow_experiment_id_retrieved_once(tmpdir):
    logger = MLFlowLogger('test', save_dir=tmpdir)
    get_experiment_name = logger._mlflow_client.get_experiment_by_name
    with mock.patch.object(MlflowClient,
                           'get_experiment_by_name',
                           wraps=get_experiment_name) as mocked:
        _ = logger.experiment
        _ = logger.experiment
        _ = logger.experiment
        assert mocked.call_count == 1
Example #19
def create_logger() -> Union[bool, LightningLoggerBase]:
    """
    Loosely imitate:
    https://github.com/Azure/azureml-examples/blob/main/tutorials/using-pytorch-lightning/3.log-with-mlflow.ipynb
    """
    run = Run.get_context()
    if isinstance(run, _SubmittedRun):
        experiment = run.experiment
        tracking_uri = experiment.workspace.get_mlflow_tracking_uri()
        exp_name = run.experiment.name
        log.info(
            f"Using MLFlow logger with tracking URI {tracking_uri} and experiment name {exp_name}"
        )
        rv = MLFlowLogger(exp_name, tracking_uri)
        rv._run_id = run.id
    else:
        log.warning("Unable to get AML run context! Logging locally.")
        rv = True

    return rv
def main(config):
    pl.seed_everything(config.seed)
    gpus = [0] if torch.cuda.is_available() else None

    filepath_list_train = generate_pathlist.make_datapath_list(
        config.project_dir + config.train_dir,
    )
    dm = image_datamodule.ImageDataModule(
        filepath_list_train=filepath_list_train,
        filepath_list_test=filepath_list_train,
    )

    discriminator = dcgan.Discriminator()
    generator = dcgan.Generator()
    criterion = nn.BCEWithLogitsLoss(reduction="mean")
    model = GAN(
        discriminator=discriminator,
        generator=generator,
        criterion=criterion,
        **dc.asdict(config),
    )

    mlflow_tags = {}
    mlflow_tags["mlflow.runName"] = config.run_name
    mlflow_tags["mlflow.user"] = config.user
    mlflow_tags["mlflow.source.name"] = str(os.path.abspath(__file__)).replace("/", '\\')
    mlf_logger = MLFlowLogger(
        experiment_name=config.experiment_name,
        tracking_uri=config.tracking_uri,
        tags=mlflow_tags,
    )

    now = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    checkpoint_callback = ModelCheckpoint(
        filepath=f"{config.checkpoint_dir}{now}_{mlf_logger.run_id}",
        save_top_k=None,
        monitor=None,
    )

    trainer = pl.Trainer(
        max_epochs=config.max_epochs,
        logger=mlf_logger,
        gpus=gpus,
        checkpoint_callback=checkpoint_callback,
        resume_from_checkpoint=None,
        )
    trainer.fit(model, datamodule=dm)

    # save to mlflow
    mlf_logger.experiment.log_artifact(mlf_logger.run_id,
                                       config.log_dir + "/" + config.log_normal)
    mlf_logger.experiment.log_artifact(mlf_logger.run_id,
                                       config.log_dir + "/" + config.log_error)
Example #21
def test_mlflow_logger_experiment_calls(client, mlflow, time, tmpdir):
    """
    Test that the logger calls methods on the mlflow experiment correctly.
    """
    time.return_value = 1

    logger = MLFlowLogger("test",
                          save_dir=tmpdir,
                          artifact_location="my_artifact_location")
    logger._mlflow_client.get_experiment_by_name.return_value = None

    params = {"test": "test_param"}
    logger.log_hyperparams(params)

    logger.experiment.log_param.assert_called_once_with(
        logger.run_id, "test", "test_param")

    metrics = {"some_metric": 10}
    logger.log_metrics(metrics)

    logger.experiment.log_metric.assert_called_once_with(
        logger.run_id, "some_metric", 10, 1000, None)

    logger._mlflow_client.create_experiment.assert_called_once_with(
        name="test", artifact_location="my_artifact_location")
Example #22
def test_mlflow_logger_experiment_calls(client, mlflow, time, tmpdir):
    """
    Test that the logger calls methods on the mlflow experiment correctly.
    """
    time.return_value = 1

    logger = MLFlowLogger('test',
                          save_dir=tmpdir,
                          artifact_location='my_artifact_location')
    logger._mlflow_client.get_experiment_by_name.return_value = None

    params = {'test': 'test_param'}
    logger.log_hyperparams(params)

    logger.experiment.log_param.assert_called_once_with(
        logger.run_id, 'test', 'test_param')

    metrics = {'some_metric': 10}
    logger.log_metrics(metrics)

    logger.experiment.log_metric.assert_called_once_with(
        logger.run_id, 'some_metric', 10, 1000, None)

    logger._mlflow_client.create_experiment.assert_called_once_with(
        name='test',
        artifact_location='my_artifact_location',
    )
Example #23
def train(args):
    seed_everything(args.seed)
    model = LitLSTM(args)

    logger = MLFlowLogger(experiment_name='Default')

    early_stop_callback = EarlyStopping(
        monitor='val_loss', min_delta=0.005, patience=3, verbose=args.verbose, mode='min') if args.early else None

    trainer = Trainer(
        log_gpu_memory='all' if args.verbose else None,
        track_grad_norm=2 if args.verbose else -1,
        logger=logger,
        weights_summary='full',
        callbacks=[early_stop_callback] if early_stop_callback is not None else [],
        accumulate_grad_batches=args.acc_grads,
        profiler=args.verbose,
        **get_trainer_params(args),
    )

    logger.log_hyperparams(model.args)
    trainer.fit(model)
    trainer.test(model, test_dataloaders=model.test_dataloader())
    model.save_preds_and_targets(to_disk=True)
    logger.finalize()

    return logger.run_id
Example #24
def main(args: Args):
    train_image = Path(args.train_image)
    train_label = Path(args.train_label)
    test_image = Path(args.test_image)
    test_label = Path(args.test_label)
    data = Mnist(32, 0.9, train_image, train_label, test_image, test_label)
    model = MnistEncoder(28, 64, 3)

    # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.loggers.mlflow.html
    mlflow_logger = MLFlowLogger()
    trainer = pl.Trainer(max_epochs=1, logger=mlflow_logger)

    trainer.fit(model, train_dataloader=data)
Example #25
def test_mlflow_pickle(tmpdir):
    """Verify that pickling trainer with mlflow logger works."""
    tutils.reset_seed()

    mlflow_dir = os.path.join(tmpdir, 'mlruns')
    logger = MLFlowLogger('test',
                          tracking_uri=f'file:{os.sep * 2}{mlflow_dir}')
    trainer_options = dict(default_save_path=tmpdir,
                           max_epochs=1,
                           logger=logger)

    trainer = Trainer(**trainer_options)
    pkl_bytes = pickle.dumps(trainer)
    trainer2 = pickle.loads(pkl_bytes)
    trainer2.logger.log_metrics({'acc': 1.0})
Example #26
def main(config):
    pl.seed_everything(config.seed)
    gpus = [0] if torch.cuda.is_available() else None

    filepath_list_train, label_list_train = generate_pathlist.make_datapath_list(
        config.project_dir + config.train_dir,
        config.project_dir + config.train_label_path)
    filepath_list_test, label_list_test = generate_pathlist.make_datapath_list(
        config.project_dir + config.test_dir,
        config.project_dir + config.test_label_path)
    dm = image_datamodule.ImageDataModule(
        filepath_list_train=filepath_list_train,
        filepath_list_test=filepath_list_test,
        label_list_train=label_list_train,
        label_list_test=label_list_test,
    )

    net = image_network.CNN()
    model = LitClassifier(
        model=net,
        learning_rate=config.learning_rate,
    )

    mlflow_tags = {}
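    # standard MLflow tags: run name, user, and the path of the launching script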
    mlflow_tags["mlflow.runName"] = config.run_name
    mlflow_tags["mlflow.user"] = config.user
    mlflow_tags["mlflow.source.name"] = str(os.path.abspath(__file__)).replace(
        "/", '\\')
    mlf_logger = MLFlowLogger(experiment_name=config.experiment_name,
                              tracking_uri=config.tracking_uri,
                              tags=mlflow_tags)

    trainer = pl.Trainer(
        max_epochs=config.max_epochs,
        logger=mlf_logger,
        gpus=gpus,
        resume_from_checkpoint=None,
    )
    trainer.fit(model, datamodule=dm)
    result = trainer.test(model, datamodule=dm)
    pprint(result)

    mlf_logger.experiment.log_artifact(
        mlf_logger.run_id, config.log_dir + "/" + config.log_normal)
    mlf_logger.experiment.log_artifact(mlf_logger.run_id,
                                       config.log_dir + "/" + config.log_error)
Example #27
def test_mlflow_run_id_setting(client, mlflow, tmpdir):
    """Test that the run_id argument uses the provided run_id."""

    run = MagicMock()
    run.info.run_id = "run-id"
    run.info.experiment_id = "experiment-id"

    # simulate existing run
    client.return_value.get_run = MagicMock(return_value=run)

    # run_id exists uses the existing run
    logger = MLFlowLogger("test", run_id=run.info.run_id, save_dir=tmpdir)
    _ = logger.experiment
    client.return_value.get_run.assert_called_with(run.info.run_id)
    assert logger.experiment_id == run.info.experiment_id
    assert logger.run_id == run.info.run_id
    client.reset_mock(return_value=True)
Example #28
def test_mlflow_logger_dirs_creation(tmpdir):
    """Test that the logger creates the folders and files in the right place."""
    if not _MLFLOW_AVAILABLE:
        pytest.xfail(
            "test for explicit file creation requires mlflow dependency to be installed."
        )

    assert not os.listdir(tmpdir)
    logger = MLFlowLogger("test", save_dir=tmpdir)
    assert logger.save_dir == tmpdir
    assert set(os.listdir(tmpdir)) == {".trash"}
    run_id = logger.run_id
    exp_id = logger.experiment_id

    # multiple experiment calls should not lead to new experiment folders
    for i in range(2):
        _ = logger.experiment
        assert set(os.listdir(tmpdir)) == {".trash", exp_id}
        assert set(os.listdir(tmpdir / exp_id)) == {run_id, "meta.yaml"}

    class CustomModel(BoringModel):
        def training_epoch_end(self, *args, **kwargs):
            super().training_epoch_end(*args, **kwargs)
            self.log("epoch", self.current_epoch)

    model = CustomModel()
    limit_batches = 5
    trainer = Trainer(
        default_root_dir=tmpdir,
        logger=logger,
        max_epochs=1,
        limit_train_batches=limit_batches,
        limit_val_batches=limit_batches,
        log_gpu_memory=True,
    )
    trainer.fit(model)
    assert set(os.listdir(tmpdir / exp_id)) == {run_id, "meta.yaml"}
    assert "epoch" in os.listdir(tmpdir / exp_id / run_id / "metrics")
    assert set(os.listdir(tmpdir / exp_id / run_id /
                          "params")) == model.hparams.keys()
    assert trainer.checkpoint_callback.dirpath == (tmpdir / exp_id / run_id /
                                                   "checkpoints")
    assert os.listdir(trainer.checkpoint_callback.dirpath) == [
        f"epoch=0-step={limit_batches - 1}.ckpt"
    ]
Example #29
def test_mlflow_pickle(tmpdir):
    """Verify that pickling trainer with mlflow logger works."""
    tutils.reset_seed()

    # hparams = tutils.get_hparams()
    # model = LightningTestModel(hparams)

    mlflow_dir = os.path.join(tmpdir, "mlruns")
    logger = MLFlowLogger("test",
                          tracking_uri=f"file:{os.sep * 2}{mlflow_dir}")
    trainer_options = dict(default_save_path=tmpdir,
                           max_epochs=1,
                           logger=logger)

    trainer = Trainer(**trainer_options)
    pkl_bytes = pickle.dumps(trainer)
    trainer2 = pickle.loads(pkl_bytes)
    trainer2.logger.log_metrics({"acc": 1.0})
Example #30
def main():
    parser = ArgumentParser()
    # using this will log all params in mlflow board automatically
    parser = Trainer.add_argparse_args(parser) 
    parser = MLP.add_model_specific_args(parser)
    args = parser.parse_args()

    experiment_name = 'mlp'
    # tb_logger = loggers.TensorBoardLogger('logs')
    mlf_logger = MLFlowLogger(
                                experiment_name=experiment_name,
                                tracking_uri="file:./mlruns"
                                )
    save_folder = 'model_weights/' + experiment_name + '/'
    if not os.path.exists(save_folder):
        os.mkdir(save_folder)
    save_folder = save_folder + mlf_logger.run_id + '/'
    if not os.path.exists(save_folder):
        os.mkdir(save_folder)

    early_stopping = EarlyStopping('val_loss')
    # saves checkpoints to 'save_folder' whenever 'val_loss' has a new min
    checkpoint_callback = ModelCheckpoint(
                            filepath=save_folder+'/model_{epoch:02d}-{val_loss:.2f}')

    # telegram
    token = telegram_config['token']
    user_id = telegram_config['user_id']
    bot = DLBot(token=token, user_id=user_id)
    telegramCallback = TelegramBotCallback(bot)

    model = MLP(args)

    trainer = Trainer(checkpoint_callback=checkpoint_callback,
                        early_stop_callback=early_stopping,
                        fast_dev_run=False,                     # set this to True only to check for bugs
                        max_epochs=1000,
                        resume_from_checkpoint=None,            # change this to model_path
                        logger=mlf_logger,                      # mlflow logger
                        callbacks=[telegramCallback],           # telegrad
                        )

    trainer.fit(model)
    trainer.test()