def test_hyperopt_ray_mlflow(csv_filename, tmpdir, ray_cluster_4cpu):
    mlflow_uri = f"file://{tmpdir}/mlruns"
    mlflow.set_tracking_uri(mlflow_uri)
    client = MlflowClient(tracking_uri=mlflow_uri)

    num_samples = 2
    config = _get_config(
        {"type": "variant_generator"},  # search_alg
        {"type": "ray", "num_samples": num_samples},  # executor
    )

    rel_path = generate_data(config["input_features"], config["output_features"], csv_filename)

    exp_name = "mlflow_test"
    run_hyperopt(config, rel_path, tmpdir, experiment_name=exp_name, callbacks=[MlflowCallback(mlflow_uri)])

    experiment = client.get_experiment_by_name(exp_name)
    assert experiment is not None

    runs = client.search_runs([experiment.experiment_id])
    assert len(runs) > 0

    for run in runs:
        artifacts = [f.path for f in client.list_artifacts(run.info.run_id, "")]
        assert "config.yaml" in artifacts
        assert "model" in artifacts
def test_hyperopt_ray_mlflow(csv_filename, ray_start_4_cpus, tmpdir):
    mlflow_uri = f'file://{tmpdir}/mlruns'
    mlflow.set_tracking_uri(mlflow_uri)
    client = MlflowClient(tracking_uri=mlflow_uri)

    num_samples = 2
    config = _get_config({"type": "ray", "num_samples": num_samples}, {"type": "ray"})

    rel_path = generate_data(config['input_features'], config['output_features'], csv_filename)

    exp_name = 'mlflow_test'
    run_hyperopt(config, rel_path, experiment_name=exp_name, callbacks=[MlflowCallback(mlflow_uri)])

    experiment = client.get_experiment_by_name(exp_name)
    assert experiment is not None

    runs = client.search_runs([experiment.experiment_id])
    assert len(runs) > 0

    for run in runs:
        artifacts = [f.path for f in client.list_artifacts(run.info.run_id, "")]
        assert 'config.yaml' in artifacts
        assert 'model' in artifacts
def test_mlflow_csv_data_set_save_reload(tmp_path, tracking_uri, dataset, extension, data, artifact_path):
    mlflow.set_tracking_uri(tracking_uri.as_uri())
    mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri())

    filepath = (tmp_path / "data").with_suffix(extension)

    mlflow_csv_dataset = MlflowArtifactDataSet(
        artifact_path=artifact_path,
        data_set=dict(type=CSVDataSet, filepath=filepath.as_posix()),
    )

    with mlflow.start_run():
        mlflow_csv_dataset.save(data)
        run_id = mlflow.active_run().info.run_id

    # the artifact must be properly uploaded to "mlruns" and reloadable
    run_artifacts = [
        fileinfo.path
        for fileinfo in mlflow_client.list_artifacts(run_id=run_id, path=artifact_path)
    ]
    remote_path = (
        filepath.name
        if artifact_path is None
        else (Path(artifact_path) / filepath.name).as_posix()
    )
    assert remote_path in run_artifacts
    assert data.equals(mlflow_csv_dataset.load())
def test_autolog_logs_expected_data():
    mlflow.paddle.autolog()

    with mlflow.start_run() as run:
        train_model()

    client = MlflowClient()
    data = client.get_run(run.info.run_id).data

    # Testing params are logged
    for param_key, expected_param_value in [("optimizer_name", "Adam"), ("learning_rate", "0.01")]:
        assert param_key in data.params
        assert data.params[param_key] == expected_param_value

    # Testing metrics are logged
    for metric_key in ["batch_size", "loss", "step", "eval_batch_size", "eval_loss", "eval_step"]:
        assert metric_key in data.metrics
        metric_history = client.get_metric_history(run.info.run_id, metric_key)
        assert len(metric_history) == NUM_EPOCHS

    # Testing model_summary.txt is saved
    artifacts = client.list_artifacts(run.info.run_id)
    assert any(x.path == "model_summary.txt" for x in artifacts)
def _parse_runid_ref(parsed: ParseResult, client: MlflowClient):
    runid = parsed.hostname
    run = client.get_run(runid)
    path = parsed.path.lstrip("/")
    if path:
        return (
            "runs:/{}/{}".format(runid, path),
            run.data.tags,
            run.data.params,
        )
    else:
        artifacts = client.list_artifacts(runid)
        if not artifacts:
            raise SpecError("Run {} has no artifacts".format(runid))
        elif len(artifacts) == 1:
            return (
                "runs:/{}/{}".format(runid, artifacts[0].path),
                run.data.tags,
                run.data.params,
            )
        else:
            # TODO allow setting default path from config
            raise SpecError(
                (
                    "Run {} has more than 1 artifact ({}). "
                    "Please specify a path like "
                    "mlflows://<runid>/path/to/artifact in "
                    "CREATE MODEL or ML_PREDICT"
                ).format(runid, [x.path for x in artifacts])
            )
def test_mlflow_hook_save_pipeline_ml(
    kedro_project_with_mlflow_conf,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
):
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()  # triggers conf setup

        # config_with_base_mlflow_conf is a conftest fixture
        mlflow_hook = MlflowHook()
        mlflow_hook.after_context_created(context)  # setup mlflow config

        runner = SequentialRunner()
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )
        runner.run(pipeline_to_run, dummy_catalog, session._hook_manager)

        run_id = mlflow.active_run().info.run_id

        mlflow_hook.after_pipeline_run(
            run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
        )

        # test : parameters should have been logged
        mlflow_client = MlflowClient(context.mlflow.server.mlflow_tracking_uri)
        run_data = mlflow_client.get_run(run_id).data

        # all run_params are recorded as tags
        for k, v in dummy_run_params.items():
            if v:
                assert run_data.tags[k] == str(v)

        # params are not recorded because we don't have MlflowHook here
        # and the model should not be logged when it is not a PipelineML
        nb_artifacts = len(mlflow_client.list_artifacts(run_id))
        if isinstance(pipeline_to_run, PipelineML):
            assert nb_artifacts == 1
        else:
            assert nb_artifacts == 0

        if isinstance(pipeline_to_run, PipelineML):
            trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
            assert trained_model.metadata.signature.to_dict() == {
                "inputs": '[{"name": "a", "type": "long"}]',
                "outputs": None,
            }
def setup_log(cfg):
    import tempfile
    import shutil
    from os.path import join
    from mlflow.tracking import MlflowClient

    # --- SETUP MLFLOW ---
    mlflow.set_tracking_uri(cfg['mlflow']['uri'])
    mlflow.set_experiment(cfg['experiment']['name'] if not cfg['script-arguments'].debug else 'DEBUG_RUNS')
    mlflow.pytorch.autolog(log_models=False)

    tags = cfg.experiment.tags.to_dict()
    tags['subexp'] = cfg.experiment['sub-experiment']
    tags['subexpID'] = str(cfg.experiment['sub-experiment-id'])
    run_name = f"{cfg.experiment['sub-experiment']}{cfg.experiment['sub-experiment-id']}-{cfg.trial.id:02}"
    mlflow.start_run(run_name=run_name, tags=tags)

    # --- CREATE TMP ---
    tmp = tempfile.TemporaryDirectory(dir=cfg['script-arguments']['tmp-dir'])

    # --- SAVE CFG ---
    shutil.copy(cfg['script-arguments'].config, join(tmp.name, 'cfg.yaml'))
    mlflow.log_artifact(join(tmp.name, 'cfg.yaml'))

    # Sanity check of artifact saving
    client = MlflowClient()
    artifacts = client.list_artifacts(mlflow.active_run().info.run_id)
    if len(artifacts) != 1 or artifacts[0].path != 'cfg.yaml':
        raise RuntimeError('The sanity check for storing artifacts failed. '
                           'Interrupting the script before the training starts.')

    with open(join(tmp.name, 'cfg_extended.yaml'), 'w') as f:
        cfg.to_yaml(f)

    mlflow.log_param('sub-experiment', cfg.experiment['sub-experiment'])
    if cfg.experiment['sub-experiment-id']:
        mlflow.log_param('sub-experiment-id', cfg.experiment['sub-experiment-id'])
    for k, v in cfg.trial.items():
        mlflow.log_param('trial.' + k, v)
    for k, v in cfg['model'].items():
        mlflow.log_param(f'model.{k}', v)
    for k, v in cfg['data-augmentation'].items():
        if isinstance(v, dict):
            for k1, v1 in v.items():
                mlflow.log_param(f'DA.{k} {k1}', v1)
        else:
            mlflow.log_param(f'DA.{k}', v)
    mlflow.log_param('dropout', cfg['hyper-parameters']['drop-out'])
    mlflow.log_param('training.file', cfg.training['dataset-file'])
    mlflow.log_param('training.dataset', cfg.training['training-dataset'])

    return tmp
def test_mlflow_data_set_save_with_run_id(tmp_path, tracking_uri, df1, exists_active_run):
    mlflow.set_tracking_uri(tracking_uri.as_uri())
    mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri())

    nb_runs = 0
    # create a first run and get its id
    with mlflow.start_run():
        mlflow.log_param("fake", 2)
        run_id = mlflow.active_run().info.run_id
        nb_runs += 1

    # check behaviour when logging with an already opened run
    if exists_active_run:
        mlflow.start_run()
        active_run_id = mlflow.active_run().info.run_id
        nb_runs += 1

    # then same scenario, but the run_id where data is saved is specified
    mlflow_csv_dataset = MlflowArtifactDataSet(
        data_set=dict(type=CSVDataSet, filepath=(tmp_path / "df1.csv").as_posix()),
        run_id=run_id,
    )
    mlflow_csv_dataset.save(df1)

    # same tests as previously, but no new experiments must have been created
    runs_list = mlflow_client.list_run_infos(experiment_id="0")
    run_artifacts = [
        fileinfo.path for fileinfo in mlflow_client.list_artifacts(run_id=run_id)
    ]

    assert len(runs_list) == nb_runs  # no new run must have been created when saving
    assert (
        mlflow.active_run().info.run_id == active_run_id if mlflow.active_run() else True
    )  # if a run was opened before saving, it must be reopened
    assert "df1.csv" in run_artifacts  # the file must exist
    assert df1.equals(mlflow_csv_dataset.load())  # and must be loadable

    if exists_active_run:
        mlflow.end_run()
def test_is_versioned_dataset_logged_correctly_in_mlflow(tmp_path, tracking_uri, df1):
    """Check that a versioned dataset is logged correctly in MLflow as an artifact.

    For versioned datasets, only artifacts from the current run should be logged.
    """
    mlflow.set_tracking_uri(tracking_uri.as_uri())
    mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri())

    mlflow.start_run()

    run_id = mlflow.active_run().info.run_id
    active_run_id = mlflow.active_run().info.run_id

    mlflow_csv_dataset = MlflowArtifactDataSet(
        data_set=dict(
            type=CSVDataSet,
            filepath=(tmp_path / "df1.csv").as_posix(),
            versioned=True,
        ),
        run_id=run_id,
    )
    mlflow_csv_dataset.save(df1)

    run_artifacts = [
        fileinfo.path for fileinfo in mlflow_client.list_artifacts(run_id=run_id)
    ]

    # Check that just one artifact was created in the given run.
    assert len(run_artifacts) == 1

    artifact_path = mlflow_client.download_artifacts(run_id=run_id, path=run_artifacts[0])

    # Check that the saved artifact is a file, not the folder where versioned datasets are stored.
    assert Path(artifact_path).is_file()

    assert (
        mlflow.active_run().info.run_id == active_run_id if mlflow.active_run() else True
    )  # if a run was opened before saving, it must be reopened
    assert df1.equals(mlflow_csv_dataset.load())  # and must be loadable

    mlflow.end_run()
def test_partitioned_dataset_save_and_reload(tmp_path, tracking_uri, artifact_path, df1, df2):
    mlflow.set_tracking_uri(tracking_uri.as_uri())
    mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri())

    mlflow_dataset = MlflowArtifactDataSet(
        artifact_path=artifact_path,
        data_set=dict(
            type=PartitionedDataSet,
            path=(tmp_path / "df_dir").as_posix(),
            dataset="pandas.CSVDataSet",
            filename_suffix=".csv",
        ),
    )

    data = dict(df1=df1, df2=df2)

    with mlflow.start_run():
        mlflow_dataset.save(data)
        run_id = mlflow.active_run().info.run_id

    # the artifact must be properly uploaded to "mlruns" and reloadable
    artifact_path_df_dir = f"{artifact_path}/df_dir" if artifact_path else "df_dir"
    run_artifacts = [
        fileinfo.path
        for fileinfo in mlflow_client.list_artifacts(
            run_id=run_id,
            path=artifact_path_df_dir,
        )
    ]
    for df_name in data.keys():
        remote_path = (
            f"df_dir/{df_name}.csv"
            if artifact_path is None
            else (Path(artifact_path) / "df_dir" / df_name).with_suffix(".csv").as_posix()
        )
        assert remote_path in run_artifacts

    reloaded_data = {k: loader() for k, loader in mlflow_dataset.load().items()}
    for k, df in data.items():
        pd.testing.assert_frame_equal(df, reloaded_data[k])
def test_mlflow_pipeline_hook_with_different_pipeline_types(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook(conda_env=env_from_dict, model_name="model")
    runner = SequentialRunner()
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    runner.run(pipeline_to_run, dummy_catalog, dummy_run_params["run_id"])
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )

    # test : parameters should have been logged
    mlflow_conf = get_mlflow_config(tmp_path)
    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    run_data = mlflow_client.get_run(run_id).data

    # all run_params are recorded as tags
    for k, v in dummy_run_params.items():
        if v:
            assert run_data.tags[k] == str(v)

    # params are not recorded because we don't have MlflowNodeHook here
    # and the model should not be logged when it is not a PipelineML
    nb_artifacts = len(mlflow_client.list_artifacts(run_id))
    if isinstance(pipeline_to_run, PipelineML):
        assert nb_artifacts == 1
    else:
        assert nb_artifacts == 0
import json
import os

from mlflow.tracking import MlflowClient

if __name__ == "__main__":
    # Create some artifacts data to preserve
    features = "rooms, zipcode, median_price, school_rating, transport"
    data = {"state": "TX", "Available": 25, "Type": "Detached"}

    # Create a couple of artifact files under the directory "data"
    os.makedirs("data", exist_ok=True)
    with open("data/data.json", 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)
    with open("data/features.txt", 'w') as f:
        f.write(features)

    # Create a run under the default experiment (whose id is "0"), and log
    # all files in "data" to root artifact_uri/states
    client = MlflowClient()
    experiment_id = "0"
    run = client.create_run(experiment_id)
    client.log_artifacts(run.info.run_id, "data", artifact_path="states")
    artifacts = client.list_artifacts(run.info.run_id)
    for artifact in artifacts:
        print("artifact: {}".format(artifact.path))
        print("is_dir: {}".format(artifact.is_dir))
    client.set_terminated(run.info.run_id)
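    # A minimal follow-up sketch, not part of the original snippet: assuming the
    # run created above, the files nested under "states" can be listed and pulled
    # back locally with the same client, using only standard MlflowClient calls.
    nested = client.list_artifacts(run.info.run_id, "states")
    for artifact in nested:
        print("nested artifact: {}".format(artifact.path))

    # Download the whole "states" directory into the current working directory.
    local_dir = client.download_artifacts(run.info.run_id, "states", ".")
    print("artifacts downloaded to: {}".format(local_dir))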
# Databricks notebook source
import re

from mlflow.tracking import MlflowClient

mlflow_client = MlflowClient()

ci_holder_name = "cet_debris_detection_cicd"
versions = mlflow_client.get_latest_versions(ci_holder_name, stages=["Production"])
assert len(versions) == 1
ci_holder = versions[0]

source_run = mlflow_client.get_run(ci_holder.run_id)

dist_info = [
    fi
    for fi in mlflow_client.list_artifacts(source_run.info.run_id, 'dist')
    if fi.path.endswith('.whl')
]
assert len(dist_info) == 1
dist_info = dist_info[0]
lib_path = f"{source_run.info.artifact_uri}/{dist_info.path}"
lib_path = re.sub(r"^dbfs:/", "/dbfs/", lib_path)

job_info = [
    fi
    for fi in mlflow_client.list_artifacts(source_run.info.run_id, 'job')
    if fi.path.endswith('runtime_requirements.txt')
]
assert len(job_info) == 1
job_info = job_info[0]
req_path = f"{source_run.info.artifact_uri}/{job_info.path}"
req_path = re.sub(r"^dbfs:/", "/dbfs/", req_path)

print(lib_path)
print(req_path)

%pip install -r $req_path
%pip install -U $lib_path
def test_mlflow_callback(tmpdir):
    epochs = 2
    batch_size = 8
    num_examples = 32

    input_features = [sequence_feature(reduce_output="sum")]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
        TRAINER: {"epochs": epochs, "batch_size": batch_size},
    }

    data_csv = generate_data(
        input_features, output_features, os.path.join(tmpdir, "train.csv"), num_examples=num_examples
    )
    val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "validation.csv"))
    test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

    mlflow_uri = f"file://{tmpdir}/mlruns"
    mlflow.set_tracking_uri(mlflow_uri)
    client = MlflowClient(tracking_uri=mlflow_uri)

    exp_name = "mlflow_test"
    callback = MlflowCallback()
    wrapped_callback = mock.Mock(wraps=callback)

    model = LudwigModel(config, callbacks=[wrapped_callback], backend=FakeRemoteBackend())
    model.train(training_set=data_csv, validation_set=val_csv, test_set=test_csv, experiment_name=exp_name)
    expected_df, _ = model.predict(test_csv)

    # Check mlflow artifacts
    assert callback.experiment_id is not None
    assert callback.run is not None

    experiment = mlflow.get_experiment_by_name(exp_name)
    assert experiment.experiment_id == callback.experiment_id

    df = mlflow.search_runs([experiment.experiment_id])
    assert len(df) == 1

    run_id = df.run_id[0]
    assert run_id == callback.run.info.run_id

    run = mlflow.get_run(run_id)
    assert run.info.status == "FINISHED"
    assert wrapped_callback.on_trainer_train_setup.call_count == 1
    assert wrapped_callback.on_trainer_train_teardown.call_count == 1

    artifacts = [f.path for f in client.list_artifacts(callback.run.info.run_id, "")]
    local_dir = f"{tmpdir}/local_artifacts"
    os.makedirs(local_dir)

    assert "config.yaml" in artifacts
    local_config_path = client.download_artifacts(callback.run.info.run_id, "config.yaml", local_dir)
    with open(local_config_path) as f:
        config_artifact = yaml.safe_load(f)
    assert config_artifact == config

    model_path = f"runs:/{callback.run.info.run_id}/model"
    loaded_model = mlflow.pyfunc.load_model(model_path)

    assert "ludwig" in loaded_model.metadata.flavors
    flavor = loaded_model.metadata.flavors["ludwig"]

    def compare_features(key):
        assert len(model.config[key]) == len(flavor["ludwig_schema"][key])
        for feature, schema_feature in zip(model.config[key], flavor["ludwig_schema"][key]):
            assert feature["name"] == schema_feature["name"]
            assert feature["type"] == schema_feature["type"]

    compare_features("input_features")
    compare_features("output_features")

    test_df = pd.read_csv(test_csv)
    pred_df = loaded_model.predict(test_df)
    assert pred_df.equals(expected_df)
class CustomerMlflowClient:
    def __init__(self, tracking_server_uri, experiment_name):
        try:
            self.mlflow_client = MlflowClient(tracking_server_uri)
            logger.info("established mlflow rest-api client")
        except Exception as e:
            logger.error(str(e))
        try:
            self.experiment_id = self.set_experiment(experiment_name)
            logger.info("started mlflow experiment {} with id {}".format(
                experiment_name, self.experiment_id))
        except Exception as e:
            logger.error(str(e))

    def logger(self, params, metrics, local_artifact_path, mlflow_artifact_path=None):
        run = self.mlflow_client.create_run(self.experiment_id)
        run_id = run.info.run_id
        logger.info("starting new run with id: {}".format(run_id))

        logger.info("logging parameters to mlflow tracking server")
        self.log_params(run_id, params)
        logger.info("successfully logged parameters to mlflow tracking server")

        logger.info("logging model metrics to mlflow tracking server")
        self.log_metrics(run_id, metrics)
        logger.info("successfully logged model metrics to mlflow tracking server")

        logger.info("logging model artifact to mlflow tracking server")
        self.log_artifact(run_id, local_artifact_path)
        logger.info("successfully logged model artifact to mlflow tracking server")

        logger.info("exiting run with id: {}".format(run_id))

    def set_experiment(self, experiment_name):
        experiment = self.mlflow_client.get_experiment_by_name(experiment_name)
        if experiment is None:
            return self.mlflow_client.create_experiment(experiment_name)
        else:
            return experiment.experiment_id

    def log_params(self, run_id: str, params):
        for key, value in params.items():
            self.mlflow_client.log_param(run_id=run_id, key=key, value=value)

    def log_metrics(self, run_id: str, metrics):
        for key, value in metrics.items():
            self.mlflow_client.log_metric(run_id=run_id, key=key, value=value)

    def log_artifact(self, run_id: str, artifact):
        self.mlflow_client.log_artifact(run_id=run_id, local_path=artifact)

    def get_latest_artifact(self, dest_path):
        run_info = self.mlflow_client.list_run_infos(self.experiment_id)
        latest_run_info = run_info[0]
        file_name = self.mlflow_client.list_artifacts(run_id=latest_run_info.run_id)[0].path
        complete_artifact_path = latest_run_info.artifact_uri + '/' + file_name
        self.mlflow_client.download_artifacts(run_id=latest_run_info.run_id,
                                              path=complete_artifact_path,
                                              dst_path=dest_path)
        return dest_path + file_name
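# A minimal usage sketch for the wrapper class above. The tracking URI,
# experiment name, parameter/metric values, and "model.pkl" path are
# illustrative assumptions, not taken from the original snippet.
customer_client = CustomerMlflowClient(
    tracking_server_uri="http://localhost:5000",
    experiment_name="customer-churn",
)
customer_client.logger(
    params={"max_depth": 5, "n_estimators": 100},
    metrics={"rmse": 0.42, "r2": 0.87},
    local_artifact_path="model.pkl",
)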
def test_mlflow_callback(tmpdir):
    epochs = 2
    batch_size = 8
    num_examples = 32

    input_features = [sequence_feature(reduce_output='sum')]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'training': {'epochs': epochs, 'batch_size': batch_size},
    }

    data_csv = generate_data(input_features,
                             output_features,
                             os.path.join(tmpdir, 'train.csv'),
                             num_examples=num_examples)
    val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'validation.csv'))
    test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'test.csv'))

    mlflow_uri = f'file://{tmpdir}/mlruns'
    mlflow.set_tracking_uri(mlflow_uri)
    client = MlflowClient(tracking_uri=mlflow_uri)

    exp_name = 'mlflow_test'
    callback = MlflowCallback()

    model = LudwigModel(config, callbacks=[callback])
    model.train(training_set=data_csv,
                validation_set=val_csv,
                test_set=test_csv,
                experiment_name=exp_name)
    expected_df, _ = model.predict(test_csv)

    # Check mlflow artifacts
    assert callback.experiment_id is not None
    assert callback.run is not None

    experiment = mlflow.get_experiment_by_name(exp_name)
    assert experiment.experiment_id == callback.experiment_id

    df = mlflow.search_runs([experiment.experiment_id])
    assert len(df) == 1

    run_id = df.run_id[0]
    assert run_id == callback.run.info.run_id

    artifacts = [
        f.path for f in client.list_artifacts(callback.run.info.run_id, "")
    ]
    local_dir = f'{tmpdir}/local_artifacts'
    os.makedirs(local_dir)

    assert 'config.yaml' in artifacts
    local_config_path = client.download_artifacts(callback.run.info.run_id, "config.yaml", local_dir)
    with open(local_config_path, 'r') as f:
        config_artifact = yaml.safe_load(f)
    assert config_artifact == config

    model_path = f'runs:/{callback.run.info.run_id}/model'
    loaded_model = mlflow.pyfunc.load_model(model_path)

    assert 'ludwig' in loaded_model.metadata.flavors
    flavor = loaded_model.metadata.flavors['ludwig']

    def compare_features(key):
        assert len(model.config[key]) == len(flavor['ludwig_schema'][key])
        for feature, schema_feature in zip(model.config[key], flavor['ludwig_schema'][key]):
            assert feature['name'] == schema_feature['name']
            assert feature['type'] == schema_feature['type']

    compare_features('input_features')
    compare_features('output_features')

    test_df = pd.read_csv(test_csv)
    pred_df = loaded_model.predict(test_df)
    assert pred_df.equals(expected_df)
def test_mlflow_pipeline_hook_with_different_pipeline_types(
    mocker,
    monkeypatch,
    tmp_path,
    config_dir,
    env_from_dict,
    pipeline_to_run,
    dummy_catalog,
    dummy_run_params,
    dummy_mlflow_conf,
):
    # config_with_base_mlflow_conf is a conftest fixture
    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
    monkeypatch.chdir(tmp_path)
    pipeline_hook = MlflowPipelineHook()
    runner = SequentialRunner()
    pipeline_hook.after_catalog_created(
        catalog=dummy_catalog,
        # `after_catalog_created` is not using any of the arguments below,
        # so we are setting them to empty values.
        conf_catalog={},
        conf_creds={},
        feed_dict={},
        save_version="",
        load_versions="",
        run_id=dummy_run_params["run_id"],
    )
    pipeline_hook.before_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )
    runner.run(pipeline_to_run, dummy_catalog)
    run_id = mlflow.active_run().info.run_id
    pipeline_hook.after_pipeline_run(
        run_params=dummy_run_params, pipeline=pipeline_to_run, catalog=dummy_catalog
    )

    # test : parameters should have been logged
    context = load_context(tmp_path)
    mlflow_conf = get_mlflow_config(context)
    mlflow_client = MlflowClient(mlflow_conf.mlflow_tracking_uri)
    run_data = mlflow_client.get_run(run_id).data

    # all run_params are recorded as tags
    for k, v in dummy_run_params.items():
        if v:
            assert run_data.tags[k] == str(v)

    # params are not recorded because we don't have MlflowNodeHook here
    # and the model should not be logged when it is not a PipelineML
    nb_artifacts = len(mlflow_client.list_artifacts(run_id))
    if isinstance(pipeline_to_run, PipelineML):
        assert nb_artifacts == 1
    else:
        assert nb_artifacts == 0

    # Check that the metrics datasets are prefixed with their names.
    assert dummy_catalog._data_sets["my_metrics"]._prefix == "my_metrics"
    assert dummy_catalog._data_sets["another_metrics"]._prefix == "foo"

    if isinstance(pipeline_to_run, PipelineML):
        trained_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
        assert trained_model.metadata.signature.to_dict() == {
            "inputs": '[{"name": "a", "type": "long"}]',
            "outputs": None,
        }
model_name = "lr_trip_duration_model"
run_id = run.info.run_id  # "c5218874277e4644a6536affee9b3ba0"
model_uri = f"runs:/{run_id}/model"

model_details = mlflow.register_model(model_uri, model_name)

# COMMAND ----------

# https://www.mlflow.org/docs/latest/model-registry.html
from mlflow.tracking import MlflowClient

client = MlflowClient()
client.create_registered_model("spark-lr-model")
# client.log_artifacts(run.info.run_id, "/FileStore/spark-model", artifact_path=mlflow.get_artifact_uri())

# COMMAND ----------

artifacts = [f.path for f in client.list_artifacts(run.info.run_id, "model")]
print("artifacts: {}".format(artifacts))

# COMMAND ----------

mlflow.get_artifact_uri()

# COMMAND ----------

# MAGIC %fs ls /databricks/mlflow-tracking/

# COMMAND ----------
# MAGIC %md
# MAGIC Now list all the runs for your experiment using `.list_run_infos()`, which takes your `experiment_id` as a parameter.

# COMMAND ----------

display(client.list_run_infos(experimentID))

# COMMAND ----------

# MAGIC %md
# MAGIC We can list the artifacts for any run by using the `MlflowClient().list_artifacts(run_id)` method:

# COMMAND ----------

client.list_artifacts(runID)

# COMMAND ----------

# MAGIC %md
# MAGIC Pull out a few fields and create a pandas DataFrame from them.

# COMMAND ----------

runs = pd.DataFrame(
    [(run.run_id, run.start_time, run.artifact_uri) for run in client.list_run_infos(experimentID)]
)
runs.columns = ["run_id", "start_time", "artifact_uri"]
display(runs)

# COMMAND ----------