Example No. 1
def test_hyperopt_ray_mlflow(csv_filename, tmpdir, ray_cluster_4cpu):
    mlflow_uri = f"file://{tmpdir}/mlruns"
    mlflow.set_tracking_uri(mlflow_uri)
    client = MlflowClient(tracking_uri=mlflow_uri)

    num_samples = 2
    config = _get_config(
        {"type": "variant_generator"}, {"type": "ray", "num_samples": num_samples}  # search_alg  # executor
    )

    rel_path = generate_data(config["input_features"], config["output_features"], csv_filename)

    exp_name = "mlflow_test"
    run_hyperopt(config, rel_path, tmpdir, experiment_name=exp_name, callbacks=[MlflowCallback(mlflow_uri)])

    experiment = client.get_experiment_by_name(exp_name)
    assert experiment is not None

    runs = client.search_runs([experiment.experiment_id])
    assert len(runs) > 0

    for run in runs:
        artifacts = [f.path for f in client.list_artifacts(run.info.run_id, "")]
        assert "config.yaml" in artifacts
        assert "model" in artifacts
Example No. 2
def test_hyperopt_ray_mlflow(csv_filename, ray_start_4_cpus, tmpdir):
    mlflow_uri = f'file://{tmpdir}/mlruns'
    mlflow.set_tracking_uri(mlflow_uri)
    client = MlflowClient(tracking_uri=mlflow_uri)

    num_samples = 2
    config = _get_config({
        "type": "ray",
        "num_samples": num_samples
    }, {"type": "ray"})

    rel_path = generate_data(config['input_features'],
                             config['output_features'], csv_filename)

    exp_name = 'mlflow_test'
    run_hyperopt(config,
                 rel_path,
                 experiment_name=exp_name,
                 callbacks=[MlflowCallback(mlflow_uri)])

    experiment = client.get_experiment_by_name(exp_name)
    assert experiment is not None

    runs = client.search_runs([experiment.experiment_id])
    assert len(runs) > 0

    for run in runs:
        artifacts = [
            f.path for f in client.list_artifacts(run.info.run_id, "")
        ]
        assert 'config.yaml' in artifacts
        assert 'model' in artifacts
Example No. 3
def test_mlflow_csv_data_set_save_reload(tmp_path, tracking_uri, dataset,
                                         extension, data, artifact_path):
    mlflow.set_tracking_uri(tracking_uri.as_uri())
    mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri())
    filepath = (tmp_path / "data").with_suffix(extension)

    mlflow_csv_dataset = MlflowArtifactDataSet(
        artifact_path=artifact_path,
        data_set=dict(type=CSVDataSet, filepath=filepath.as_posix()),
    )

    with mlflow.start_run():
        mlflow_csv_dataset.save(data)
        run_id = mlflow.active_run().info.run_id

    # the artifact must be properly uploaded to "mlruns" and reloadable
    run_artifacts = [
        fileinfo.path
        for fileinfo in mlflow_client.list_artifacts(run_id=run_id,
                                                     path=artifact_path)
    ]
    remote_path = (filepath.name if artifact_path is None else
                   (Path(artifact_path) / filepath.name).as_posix())
    assert remote_path in run_artifacts
    assert data.equals(mlflow_csv_dataset.load())
Example No. 4
def test_autolog_logs_expected_data():
    mlflow.paddle.autolog()

    with mlflow.start_run() as run:
        train_model()

    client = MlflowClient()
    data = client.get_run(run.info.run_id).data

    # Testing params are logged
    for param_key, expected_param_value in [("optimizer_name", "Adam"),
                                            ("learning_rate", "0.01")]:
        assert param_key in data.params
        assert data.params[param_key] == expected_param_value

    # Testing metrics are logged
    for metric_key in [
            "batch_size", "loss", "step", "eval_batch_size", "eval_loss",
            "eval_step"
    ]:
        assert metric_key in data.metrics
        metric_history = client.get_metric_history(run.info.run_id, metric_key)
        assert len(metric_history) == NUM_EPOCHS

    # Testing model_summary.txt is saved
    artifacts = client.list_artifacts(run.info.run_id)
    assert any(x.path == "model_summary.txt" for x in artifacts)
Example No. 5
def _parse_runid_ref(parsed: ParseResult, client: MlflowClient):
    runid = parsed.hostname
    run = client.get_run(runid)
    path = parsed.path.lstrip("/")
    if path:
        return (
            "runs:/{}/{}".format(runid, path),
            run.data.tags,
            run.data.params,
        )
    else:
        artifacts = client.list_artifacts(runid)
        if not artifacts:
            raise SpecError("Run {} has no artifacts".format(runid))
        elif len(artifacts) == 1:
            return (
                "runs:/{}/{}".format(runid, artifacts[0].path),
                run.data.tags,
                run.data.params,
            )
        else:
            # TODO allow setting default path from config
            raise SpecError(
                (
                    "Run {} has more than 1 artifact ({})."
                    "Please specify path like "
                    "mlflows://<runid>/path/to/artifact in "
                    "CREATE MODEL or ML_PREDICT"
                ).format(runid, [x.path for x in artifacts])
            )
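A hypothetical usage sketch for the function above (the `mlflows://` scheme, `SpecError`, and `_parse_runid_ref` come from the surrounding project; the run id is illustrative and must exist in the active tracking store):

from urllib.parse import urlparse
from mlflow.tracking import MlflowClient

# "mlflows://<runid>/<path>": the hostname carries the run id, the path the artifact
parsed = urlparse("mlflows://c5218874277e4644a6536affee9b3ba0/model")
uri, tags, params = _parse_runid_ref(parsed, MlflowClient())
print(uri)  # runs:/c5218874277e4644a6536affee9b3ba0/model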
Example No. 6
def test_mlflow_hook_save_pipeline_ml(
    kedro_project_with_mlflow_conf,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
):

    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(
            project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()  # triggers conf setup

        # config_with_base_mlflow_conf is a conftest fixture
        mlflow_hook = MlflowHook()
        mlflow_hook.after_context_created(context)  # setup mlflow config
        runner = SequentialRunner()
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(run_params=dummy_run_params,
                                        pipeline=pipeline_to_run,
                                        catalog=dummy_catalog)
        runner.run(pipeline_to_run, dummy_catalog, session._hook_manager)
        run_id = mlflow.active_run().info.run_id
        mlflow_hook.after_pipeline_run(run_params=dummy_run_params,
                                       pipeline=pipeline_to_run,
                                       catalog=dummy_catalog)
        # test : parameters should have been logged
        mlflow_client = MlflowClient(context.mlflow.server.mlflow_tracking_uri)
        run_data = mlflow_client.get_run(run_id).data

        # all run_params are recorded as tags
        for k, v in dummy_run_params.items():
            if v:
                assert run_data.tags[k] == str(v)

        # params are not recorded because we don't have MlflowHook here
        # and the model should not be logged when it is not a PipelineML
        nb_artifacts = len(mlflow_client.list_artifacts(run_id))
        if isinstance(pipeline_to_run, PipelineML):
            assert nb_artifacts == 1
        else:
            assert nb_artifacts == 0

        if isinstance(pipeline_to_run, PipelineML):
            trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
            assert trained_model.metadata.signature.to_dict() == {
                "inputs": '[{"name": "a", "type": "long"}]',
                "outputs": None,
            }
Example No. 7
def setup_log(cfg):
    import tempfile
    import shutil
    from os.path import join
    from mlflow.tracking import MlflowClient

    # --- SETUP MLFOW ---
    mlflow.set_tracking_uri(cfg['mlflow']['uri'])
    mlflow.set_experiment(cfg['experiment']['name'] if
                          not cfg['script-arguments'].debug else 'DEBUG_RUNS')
    mlflow.pytorch.autolog(log_models=False)
    tags = cfg.experiment.tags.to_dict()
    tags['subexp'] = cfg.experiment['sub-experiment']
    tags['subexpID'] = str(cfg.experiment['sub-experiment-id'])
    run_name = f"{cfg.experiment['sub-experiment']}{cfg.experiment['sub-experiment-id']}-{cfg.trial.id:02}"
    mlflow.start_run(run_name=run_name, tags=tags)

    # --- CREATE TMP ---
    tmp = tempfile.TemporaryDirectory(dir=cfg['script-arguments']['tmp-dir'])

    # --- SAVE CFG ---
    shutil.copy(cfg['script-arguments'].config, join(tmp.name, 'cfg.yaml'))
    mlflow.log_artifact(join(tmp.name, 'cfg.yaml'))
    # Sanity check of artifact saving
    client = MlflowClient()
    artifacts = client.list_artifacts(mlflow.active_run().info.run_id)
    if len(artifacts) != 1 or artifacts[0].path != 'cfg.yaml':
        raise RuntimeError(
            'The sanity check for storing artifacts failed. '
            'Interrupting the script before the training starts.')

    with open(join(tmp.name, 'cfg_extended.yaml'), 'w') as f:
        cfg.to_yaml(f)

    mlflow.log_param('sub-experiment', cfg.experiment['sub-experiment'])
    if cfg.experiment['sub-experiment-id']:
        mlflow.log_param('sub-experiment-id',
                         cfg.experiment['sub-experiment-id'])
    for k, v in cfg.trial.items():
        mlflow.log_param('trial.' + k, v)

    for k, v in cfg['model'].items():
        mlflow.log_param(f'model.{k}', v)
    for k, v in cfg['data-augmentation'].items():
        if isinstance(v, dict):
            for k1, v1 in v.items():
                mlflow.log_param(f'DA.{k} {k1}', v1)
        else:
            mlflow.log_param(f'DA.{k}', v)
    mlflow.log_param('dropout', cfg['hyper-parameters']['drop-out'])
    mlflow.log_param('training.file', cfg.training['dataset-file'])
    mlflow.log_param('training.dataset', cfg.training['training-dataset'])

    return tmp
Example No. 8
def test_mlflow_data_set_save_with_run_id(tmp_path, tracking_uri, df1,
                                          exists_active_run):
    mlflow.set_tracking_uri(tracking_uri.as_uri())
    mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri())
    nb_runs = 0
    # create a first run and get its id
    with mlflow.start_run():
        mlflow.log_param("fake", 2)
        run_id = mlflow.active_run().info.run_id
        nb_runs += 1

    # check behaviour when logging with an already opened run
    if exists_active_run:
        mlflow.start_run()
        active_run_id = mlflow.active_run().info.run_id
        nb_runs += 1

    # then same scenario but the run_id where data is saved is specified
    mlflow_csv_dataset = MlflowArtifactDataSet(
        data_set=dict(type=CSVDataSet,
                      filepath=(tmp_path / "df1.csv").as_posix()),
        run_id=run_id,
    )
    mlflow_csv_dataset.save(df1)

    # same tests as previously, but no new run must have been created
    runs_list = mlflow_client.list_run_infos(experiment_id="0")
    run_artifacts = [
        fileinfo.path
        for fileinfo in mlflow_client.list_artifacts(run_id=run_id)
    ]

    assert len(runs_list) == nb_runs  # no new run must have been created when saving
    assert (mlflow.active_run().info.run_id == active_run_id
            if mlflow.active_run() else True
            )  # if a run was opened before saving, it must be reopened
    assert "df1.csv" in run_artifacts  # the file must exists
    assert df1.equals(mlflow_csv_dataset.load())  # and must loadable

    if exists_active_run:
        mlflow.end_run()
Example No. 9
def test_is_versioned_dataset_logged_correctly_in_mlflow(
        tmp_path, tracking_uri, df1):
    """Check if versioned dataset is logged correctly in MLflow as artifact.

    For versioned datasets just artifacts from current run should be logged.
    """
    mlflow.set_tracking_uri(tracking_uri.as_uri())
    mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri())

    mlflow.start_run()

    run_id = mlflow.active_run().info.run_id
    active_run_id = mlflow.active_run().info.run_id

    mlflow_csv_dataset = MlflowArtifactDataSet(
        data_set=dict(type=CSVDataSet,
                      filepath=(tmp_path / "df1.csv").as_posix(),
                      versioned=True),
        run_id=run_id,
    )
    mlflow_csv_dataset.save(df1)

    run_artifacts = [
        fileinfo.path
        for fileinfo in mlflow_client.list_artifacts(run_id=run_id)
    ]

    # Check if just one artifact was created in given run.
    assert len(run_artifacts) == 1

    artifact_path = mlflow_client.download_artifacts(run_id=run_id,
                                                     path=run_artifacts[0])

    # Check if saved artifact is file and not folder where versioned datasets are stored.
    assert Path(artifact_path).is_file()

    assert (mlflow.active_run().info.run_id == active_run_id
            if mlflow.active_run() else True
            )  # if a run was opened before saving, it must be reopened
    assert df1.equals(mlflow_csv_dataset.load())  # and must be loadable

    mlflow.end_run()
Example No. 10
def test_partitioned_dataset_save_and_reload(tmp_path, tracking_uri,
                                             artifact_path, df1, df2):

    mlflow.set_tracking_uri(tracking_uri.as_uri())
    mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri())

    mlflow_dataset = MlflowArtifactDataSet(
        artifact_path=artifact_path,
        data_set=dict(
            type=PartitionedDataSet,
            path=(tmp_path / "df_dir").as_posix(),
            dataset="pandas.CSVDataSet",
            filename_suffix=".csv",
        ),
    )

    data = dict(df1=df1, df2=df2)

    with mlflow.start_run():
        mlflow_dataset.save(data)
        run_id = mlflow.active_run().info.run_id

    # the artifact must be properly uploaded to "mlruns" and reloadable
    artifact_path_df_dir = f"{artifact_path}/df_dir" if artifact_path else "df_dir"
    run_artifacts = [
        fileinfo.path for fileinfo in mlflow_client.list_artifacts(
            run_id=run_id,
            path=artifact_path_df_dir,
        )
    ]
    for df_name in data.keys():
        remote_path = (f"df_dir/{df_name}.csv" if artifact_path is None else
                       (Path(artifact_path) / "df_dir" /
                        df_name).with_suffix(".csv").as_posix())
        assert remote_path in run_artifacts

    reloaded_data = {
        k: loader()
        for k, loader in mlflow_dataset.load().items()
    }
    for k, df in data.items():
        pd.testing.assert_frame_equal(df, reloaded_data[k])
Example No. 11
def test_mlflow_pipeline_hook_with_different_pipeline_types(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook(conda_env=env_from_dict,
                                       model_name="model")
    runner = SequentialRunner()
    pipeline_hook.before_pipeline_run(run_params=dummy_run_params,
                                      pipeline=pipeline_to_run,
                                      catalog=dummy_catalog)
    runner.run(pipeline_to_run, dummy_catalog, dummy_run_params["run_id"])
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(run_params=dummy_run_params,
                                     pipeline=pipeline_to_run,
                                     catalog=dummy_catalog)
    # test : parameters should have been logged
    mlflow_conf = get_mlflow_config(tmp_path)
    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    run_data = mlflow_client.get_run(run_id).data
    # all run_params are recorded as tags
    for k, v in dummy_run_params.items():
        if v:
            assert run_data.tags[k] == str(v)
    # params are not recorded because we don't have MlflowNodeHook here
    # and the model should not be logged when it is not a PipelineML
    nb_artifacts = len(mlflow_client.list_artifacts(run_id))
    if isinstance(pipeline_to_run, PipelineML):
        assert nb_artifacts == 1
    else:
        assert nb_artifacts == 0
Example No. 12
import json
import os

from mlflow.tracking import MlflowClient

if __name__ == "__main__":

    # Create some artifacts data to preserve
    features = "rooms, zipcode, median_price, school_rating, transport"
    data = {"state": "TX", "Available": 25, "Type": "Detached"}

    # Create couple of artifact files under the directory "data"
    os.makedirs("data", exist_ok=True)
    with open("data/data.json", 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)
    with open("data/features.txt", 'w') as f:
        f.write(features)

    # Create a run under the default experiment (whose id is "0"), and log
    # all files in "data" to root artifact_uri/states
    client = MlflowClient()
    expermient_id = "0"
    run = client.create_run(expermient_id)
    client.log_artifacts(run.info.run_id, "data", artifact_path="states")
    artifacts = client.list_artifacts(run.info.run_id)
    for artifact in artifacts:
        print("artifact: {}".format(artifact.path))
        print("is_dir: {}".format(artifact.is_dir))
    client.set_terminated(run.info.run_id)
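    # A hedged follow-up (not in the original snippet): because the files were logged
    # under artifact_path="states", the root listing above only shows the "states"
    # directory entry; passing path="states" lists the files inside it.
    for nested in client.list_artifacts(run.info.run_id, "states"):
        print("nested artifact: {}".format(nested.path))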


Example No. 13
# Databricks notebook source
import re
from mlflow.tracking import MlflowClient
mlflow_client = MlflowClient()
ci_holder_name = "cet_debris_detection_cicd"
versions = mlflow_client.get_latest_versions(ci_holder_name, stages=["Production"])
assert len(versions) == 1
ci_holder = versions[0]
source_run = mlflow_client.get_run(ci_holder.run_id)
dist_info = [fi for fi in mlflow_client.list_artifacts(source_run.info.run_id, 'dist') if fi.path.endswith('.whl')]
assert len(dist_info) == 1
dist_info = dist_info[0]
lib_path = f"{source_run.info.artifact_uri}/{dist_info.path}"
lib_path = re.sub(r"^dbfs:/", "/dbfs/", lib_path)
job_info = [fi for fi in mlflow_client.list_artifacts(source_run.info.run_id, 'job') if fi.path.endswith('runtime_requirements.txt')]
assert len(job_info) == 1
job_info = job_info[0]
req_path = f"{source_run.info.artifact_uri}/{job_info.path}"
req_path = re.sub(r"^dbfs:/", "/dbfs/", req_path)
print(lib_path)
print(req_path)
%pip install -r $req_path
%pip install -U $lib_path
Example No. 14
def test_mlflow_callback(tmpdir):
    epochs = 2
    batch_size = 8
    num_examples = 32

    input_features = [sequence_feature(reduce_output="sum")]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
        TRAINER: {"epochs": epochs, "batch_size": batch_size},
    }

    data_csv = generate_data(
        input_features, output_features, os.path.join(tmpdir, "train.csv"), num_examples=num_examples
    )
    val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "validation.csv"))
    test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

    mlflow_uri = f"file://{tmpdir}/mlruns"
    mlflow.set_tracking_uri(mlflow_uri)
    client = MlflowClient(tracking_uri=mlflow_uri)

    exp_name = "mlflow_test"
    callback = MlflowCallback()
    wrapped_callback = mock.Mock(wraps=callback)

    model = LudwigModel(config, callbacks=[wrapped_callback], backend=FakeRemoteBackend())
    model.train(training_set=data_csv, validation_set=val_csv, test_set=test_csv, experiment_name=exp_name)
    expected_df, _ = model.predict(test_csv)

    # Check mlflow artifacts
    assert callback.experiment_id is not None
    assert callback.run is not None

    experiment = mlflow.get_experiment_by_name(exp_name)
    assert experiment.experiment_id == callback.experiment_id

    df = mlflow.search_runs([experiment.experiment_id])
    assert len(df) == 1

    run_id = df.run_id[0]
    assert run_id == callback.run.info.run_id

    run = mlflow.get_run(run_id)
    assert run.info.status == "FINISHED"
    assert wrapped_callback.on_trainer_train_setup.call_count == 1
    assert wrapped_callback.on_trainer_train_teardown.call_count == 1

    artifacts = [f.path for f in client.list_artifacts(callback.run.info.run_id, "")]
    local_dir = f"{tmpdir}/local_artifacts"
    os.makedirs(local_dir)

    assert "config.yaml" in artifacts
    local_config_path = client.download_artifacts(callback.run.info.run_id, "config.yaml", local_dir)

    with open(local_config_path) as f:
        config_artifact = yaml.safe_load(f)
    assert config_artifact == config

    model_path = f"runs:/{callback.run.info.run_id}/model"
    loaded_model = mlflow.pyfunc.load_model(model_path)

    assert "ludwig" in loaded_model.metadata.flavors
    flavor = loaded_model.metadata.flavors["ludwig"]

    def compare_features(key):
        assert len(model.config[key]) == len(flavor["ludwig_schema"][key])
        for feature, schema_feature in zip(model.config[key], flavor["ludwig_schema"][key]):
            assert feature["name"] == schema_feature["name"]
            assert feature["type"] == schema_feature["type"]

    compare_features("input_features")
    compare_features("output_features")

    test_df = pd.read_csv(test_csv)
    pred_df = loaded_model.predict(test_df)
    assert pred_df.equals(expected_df)
Example No. 15
class CustomerMlflowClient:
    def __init__(self, tracking_server_uri, experiment_name):
        try:
            self.mlflow_client = MlflowClient(tracking_server_uri)
            logger.info("established mlflow rest-api client")
        except Exception as e:
            logger.error(str(e))

        try:
            self.experiment_id = self.set_experiment(experiment_name)
            logger.info("started mlflow experiment {} with id {}".format(
                experiment_name, self.experiment_id))
        except Exception as e:
            logger.error(str(e))

    def logger(self,
               params,
               metrics,
               local_artifact_path,
               mlflow_artifact_path=None):
        run = self.mlflow_client.create_run(self.experiment_id)
        run_id = run.info.run_id
        logger.info("staring new run with id: {}".format(run_id))
        logger.info("logging parameter to mlflow tracking server")
        self.log_params(run_id, params)
        logger.info("successfully logged parameter to mlflow tracking server")
        logger.info("logging model metrics to mlflow tracking server")
        self.log_metrics(run_id, metrics)
        logger.info(
            "successfully logged model metrics to mlflow tracking server")
        logger.info("logging model artifact to mlflow tracking server")
        self.log_artifact(run_id, local_artifact_path)
        logger.info(
            "successfully logged model artifact to mlflow tracking server")
        logger.info("exiting run with id: {}".format(run_id))

    def set_experiment(self, experiment_name):
        experiment = self.mlflow_client.get_experiment_by_name(experiment_name)
        if experiment is None:
            return self.mlflow_client.create_experiment(experiment_name)
        else:
            return experiment.experiment_id

    def log_params(self, run_id: str, params):
        for key, value in params.items():
            self.mlflow_client.log_param(run_id=run_id, key=key, value=value)

    def log_metrics(self, run_id: str, metrics):
        for key, value in metrics.items():
            self.mlflow_client.log_metric(run_id=run_id, key=key, value=value)

    def log_artifact(self, run_id: str, artifact):
        self.mlflow_client.log_artifact(run_id=run_id, local_path=artifact)

    def get_latest_artifact(self, dest_path):
        run_info = self.mlflow_client.list_run_infos(self.experiment_id)
        latest_run_info = run_info[0]
        file_name = self.mlflow_client.list_artifacts(
            run_id=latest_run_info.run_id)[0].path
        # `path` must be relative to the run's artifact root, not the full artifact URI
        self.mlflow_client.download_artifacts(run_id=latest_run_info.run_id,
                                              path=file_name,
                                              dst_path=dest_path)
        return dest_path + file_name
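A hypothetical usage sketch of the class above (the tracking URI, experiment name, and artifact file are illustrative, not from the original):

custom_client = CustomerMlflowClient("http://localhost:5000", "demo_experiment")
custom_client.logger(params={"alpha": 0.5},
                     metrics={"rmse": 0.82},
                     local_artifact_path="model.pkl")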
Example No. 16
def test_mlflow_callback(tmpdir):
    epochs = 2
    batch_size = 8
    num_examples = 32

    input_features = [sequence_feature(reduce_output='sum')]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'training': {
            'epochs': epochs,
            'batch_size': batch_size
        },
    }

    data_csv = generate_data(input_features,
                             output_features,
                             os.path.join(tmpdir, 'train.csv'),
                             num_examples=num_examples)
    val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'validation.csv'))
    test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'test.csv'))

    mlflow_uri = f'file://{tmpdir}/mlruns'
    mlflow.set_tracking_uri(mlflow_uri)
    client = MlflowClient(tracking_uri=mlflow_uri)

    exp_name = 'mlflow_test'
    callback = MlflowCallback()

    model = LudwigModel(config, callbacks=[callback])
    model.train(training_set=data_csv,
                validation_set=val_csv,
                test_set=test_csv,
                experiment_name=exp_name)
    expected_df, _ = model.predict(test_csv)

    # Check mlflow artifacts
    assert callback.experiment_id is not None
    assert callback.run is not None

    experiment = mlflow.get_experiment_by_name(exp_name)
    assert experiment.experiment_id == callback.experiment_id

    df = mlflow.search_runs([experiment.experiment_id])
    assert len(df) == 1

    run_id = df.run_id[0]
    assert run_id == callback.run.info.run_id

    artifacts = [
        f.path for f in client.list_artifacts(callback.run.info.run_id, "")
    ]
    local_dir = f'{tmpdir}/local_artifacts'
    os.makedirs(local_dir)

    assert 'config.yaml' in artifacts
    local_config_path = client.download_artifacts(callback.run.info.run_id,
                                                  "config.yaml", local_dir)

    with open(local_config_path, 'r') as f:
        config_artifact = yaml.safe_load(f)
    assert config_artifact == config

    model_path = f'runs:/{callback.run.info.run_id}/model'
    loaded_model = mlflow.pyfunc.load_model(model_path)

    assert 'ludwig' in loaded_model.metadata.flavors
    flavor = loaded_model.metadata.flavors['ludwig']

    def compare_features(key):
        assert len(model.config[key]) == len(flavor['ludwig_schema'][key])
        for feature, schema_feature in zip(model.config[key],
                                           flavor['ludwig_schema'][key]):
            assert feature['name'] == schema_feature['name']
            assert feature['type'] == schema_feature['type']

    compare_features('input_features')
    compare_features('output_features')

    test_df = pd.read_csv(test_csv)
    pred_df = loaded_model.predict(test_df)
    assert (pred_df.equals(expected_df))
Example No. 17
def test_mlflow_pipeline_hook_with_different_pipeline_types(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()
    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(run_params=dummy_run_params,
                                      pipeline=pipeline_to_run,
                                      catalog=dummy_catalog)
    runner.run(pipeline_to_run, dummy_catalog)
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(run_params=dummy_run_params,
                                     pipeline=pipeline_to_run,
                                     catalog=dummy_catalog)
    # test : parameters should have been logged
    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    run_data = mlflow_client.get_run(run_id).data

    # all run_params are recorded as tags
    for k, v in dummy_run_params.items():
        if v:
            assert run_data.tags[k] == str(v)

    # params are not recorded because we don't have MlflowNodeHook here
    # and the model should not be logged when it is not a PipelineML
    nb_artifacts = len(mlflow_client.list_artifacts(run_id))
    if isinstance(pipeline_to_run, PipelineML):
        assert nb_artifacts == 1
    else:
        assert nb_artifacts == 0

    # Check if metrics datasets have prefix with its names.
    # for metric
    assert dummy_catalog._data_sets["my_metrics"]._prefix == "my_metrics"
    assert dummy_catalog._data_sets["another_metrics"]._prefix == "foo"

    if isinstance(pipeline_to_run, PipelineML):
        trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
        assert trained_model.metadata.signature.to_dict() == {
            "inputs": '[{"name": "a", "type": "long"}]',
            "outputs": None,
        }
model_name = "lr_trip_duration_model"
run_id = run.info.run_id  #"c5218874277e4644a6536affee9b3ba0"
model_uri = f"runs:/{run_id}/model"

model_details = mlflow.register_model(model_uri, model_name)

# COMMAND ----------

#https://www.mlflow.org/docs/latest/model-registry.html
from mlflow.tracking import MlflowClient

client = MlflowClient()
client.create_registered_model("spark-lr-model")
#client.log_artifacts(run.info.run_id, "/FileStore/spark-model", artifact_path=mlflow.get_artifact_uri())

# COMMAND ----------
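# A hedged sketch (not in the original notebook): the client-API counterpart of
# mlflow.register_model is create_model_version, which attaches the run's logged
# model to the registered name created above.
client.create_model_version(
    name="spark-lr-model",
    source=f"{run.info.artifact_uri}/model",
    run_id=run.info.run_id,
)

# COMMAND ----------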

artifacts = [f.path for f in client.list_artifacts(run.info.run_id, "model")]

print("artifacts: {}".format(artifacts))

# COMMAND ----------

mlflow.get_artifact_uri()

# COMMAND ----------

# MAGIC %fs ls /databricks/mlflow-tracking/

# COMMAND ----------
Example No. 19
# MAGIC %md
# MAGIC Now list all the runs for your experiment using `.list_run_infos()`, which takes your `experiment_id` as a parameter.

# COMMAND ----------

display(client.list_run_infos(experimentID))

# COMMAND ----------

# MAGIC %md
# MAGIC We can list the artifacts for any run by using the `MlflowClient().list_artifacts(run_id)` method:

# COMMAND ----------

client.list_artifacts(runID)

# COMMAND ----------
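
# MAGIC %md
# MAGIC A minimal sketch (assuming `client` and `runID` from the cells above): each `FileInfo` returned by `list_artifacts` exposes a `path` and an `is_dir` flag.

# COMMAND ----------

for artifact in client.list_artifacts(runID):
    print("artifact: {}  is_dir: {}".format(artifact.path, artifact.is_dir))

# COMMAND ----------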

# MAGIC %md
# MAGIC Pull out a few fields and create a pandas DataFrame with it.

# COMMAND ----------

runs = pd.DataFrame([(run.run_id, run.start_time, run.artifact_uri)
                     for run in client.list_run_infos(experimentID)])
runs.columns = ["run_id", "start_time", "artifact_uri"]

display(runs)

# COMMAND ----------