def test_autologging_disabled_logging_with_or_without_active_run(
    spark_session, format_to_file_path
):
    mlflow.spark.autolog(disable=True)
    data_format = list(format_to_file_path.keys())[0]
    file_path = format_to_file_path[data_format]
    df = (
        spark_session.read.format(data_format)
        .option("header", "true")
        .option("inferSchema", "true")
        .load(file_path)
    )

    # Reading data source before starting a run
    df.filter("number1 > 0").collect()
    df.limit(2).collect()
    df.collect()

    # If there was any tag info collected it will be logged here
    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        time.sleep(1)

    # Confirm nothing was logged.
    run = mlflow.get_run(run_id)
    _assert_spark_data_not_logged(run=run)

    # Reading data source during an active run
    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        df.collect()
        time.sleep(1)
    run = mlflow.get_run(run_id)
    _assert_spark_data_not_logged(run=run)

def test_autologging_disabled_then_enabled(spark_session, format_to_file_path):
    mlflow.spark.autolog(disable=True)
    data_format = list(format_to_file_path.keys())[0]
    file_path = format_to_file_path[data_format]
    df = (
        spark_session.read.format(data_format)
        .option("header", "true")
        .option("inferSchema", "true")
        .load(file_path)
    )

    # Logging is disabled here.
    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        df.collect()
        time.sleep(1)
    run = mlflow.get_run(run_id)
    _assert_spark_data_not_logged(run=run)

    # Logging is enabled here.
    mlflow.spark.autolog(disable=False)
    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        df.filter("number1 > 0").collect()
        time.sleep(1)
    run = mlflow.get_run(run_id)
    _assert_spark_data_logged(run=run, path=file_path, data_format=data_format)

def test_autologging_dedups_multiple_reads_of_same_datasource(
    spark_session, format_to_file_path
):
    mlflow.spark.autolog()
    data_format = list(format_to_file_path.keys())[0]
    file_path = format_to_file_path[data_format]
    df = (
        spark_session.read.format(data_format)
        .option("header", "true")
        .option("inferSchema", "true")
        .load(file_path)
    )

    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        df.collect()
        df.filter("number1 > 0").collect()
        df.limit(2).collect()
        df.collect()
        time.sleep(1)
    run = mlflow.get_run(run_id)
    _assert_spark_data_logged(run=run, path=file_path, data_format=data_format)

    # Test context provider flow
    df.filter("number1 > 0").collect()
    df.limit(2).collect()
    df.collect()
    with mlflow.start_run():
        run_id2 = mlflow.active_run().info.run_id
        time.sleep(1)
    run2 = mlflow.get_run(run_id2)
    _assert_spark_data_logged(run=run2, path=file_path, data_format=data_format)

def load_artifacts(run_id: str, device: torch.device = torch.device("cpu")) -> Dict:
    """Load artifacts for the current model.

    Args:
        run_id (str): ID of the model run to load artifacts from.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Artifacts needed for inference.
    """
    # Load artifacts
    artifact_uri = mlflow.get_run(run_id=run_id).info.artifact_uri.split("file://")[-1]
    params = Namespace(**utils.load_dict(filepath=Path(artifact_uri, "params.json")))
    label_encoder = data.MultiLabelLabelEncoder.load(fp=Path(artifact_uri, "label_encoder.json"))
    tokenizer = data.Tokenizer.load(fp=Path(artifact_uri, "tokenizer.json"))
    model_state = torch.load(Path(artifact_uri, "model.pt"), map_location=device)
    performance = utils.load_dict(filepath=Path(artifact_uri, "performance.json"))

    # Initialize model
    model = models.initialize_model(
        params=params, vocab_size=len(tokenizer), num_classes=len(label_encoder)
    )
    model.load_state_dict(model_state)

    return {
        "params": params,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "performance": performance,
    }

def test_autologging_of_datasources_with_different_formats(
    spark_session, format_to_file_path
):
    mlflow.spark.autolog()
    for data_format, file_path in format_to_file_path.items():
        base_df = (
            spark_session.read.format(data_format)
            .option("header", "true")
            .option("inferSchema", "true")
            .load(file_path)
        )
        base_df.createOrReplaceTempView("temptable")
        table_df0 = spark_session.table("temptable")
        table_df1 = spark_session.sql("SELECT number1, number2 from temptable LIMIT 5")
        dfs = [
            base_df,
            table_df0,
            table_df1,
            base_df.filter("number1 > 0"),
            base_df.select("number1"),
            base_df.limit(2),
            base_df.filter("number1 > 0").select("number1").limit(2),
        ]
        for df in dfs:
            with mlflow.start_run():
                run_id = mlflow.active_run().info.run_id
                df.collect()
                time.sleep(1)
            run = mlflow.get_run(run_id)
            _assert_spark_data_logged(run=run, path=file_path, data_format=data_format)

def flasslit(mlflowrun: str):
    st.set_option('deprecation.showfileUploaderEncoding', False)
    mlflow_run = mlflow.get_run(mlflowrun)
    class_names = json.loads(mlflow_run.data.params.get("class_names"))
    modelpath = os.path.join(mlflow_run.info.artifact_uri, "saved-model")

    uploaded_file = make_sidebar(mlflow_run, class_names)

    image_array = None
    if uploaded_file is not None:
        image = Image.open(uploaded_file)
        st.subheader("Uploaded image information")
        st.write(image)
        st.image(image)

        image = image.resize((28, 28), Image.ANTIALIAS).convert('L')
        st.subheader("Resized image information")
        st.write(image)
        st.image(image)

        image_array = np.array(image)
        image_array = np.expand_dims(image_array, -1)

    logging.info(f"Loading model from {modelpath}")
    loaded_model = load_mlflow_model(modelpath)

    if image_array is not None:
        res = loaded_model.predict(np.array([image_array]))[0]
        res = [float(prob) for prob in res]
        st.subheader("Predicted class probabilities")
        st.write(dict(zip(class_names, res)))

def test_enabling_autologging_before_spark_session_works(disable):
    mlflow.spark.autolog(disable=disable)

    # creating spark session AFTER autolog was enabled
    spark_session = _get_or_create_spark_session()

    rows = [Row(100)]
    schema = StructType([StructField("number2", IntegerType())])
    rdd = spark_session.sparkContext.parallelize(rows)
    df = spark_session.createDataFrame(rdd, schema)
    tempdir = tempfile.mkdtemp()
    filepath = os.path.join(tempdir, "test-data")
    df.write.option("header", "true").format("csv").save(filepath)

    read_df = (
        spark_session.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(filepath)
    )

    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        read_df.collect()
        time.sleep(1)

    run = mlflow.get_run(run_id)
    if disable:
        _assert_spark_data_not_logged(run=run)
    else:
        _assert_spark_data_logged(run=run, path=filepath, data_format="csv")

    shutil.rmtree(tempdir)
    spark_session.stop()

def test_model_is_recorded_when_using_direct_save(spark_model_iris):
    # Patch `is_local_uri` to enforce direct model serialization to DFS
    with mock.patch("mlflow.spark.is_local_uri", return_value=False):
        with mlflow.start_run():
            sparkm.log_model(spark_model=spark_model_iris.model, artifact_path="model")
            current_tags = mlflow.get_run(mlflow.active_run().info.run_id).data.tags
            assert mlflow.utils.mlflow_tags.MLFLOW_LOGGED_MODELS in current_tags

def get_latest_mlrun(params):
    """Get latest mlflow run

    :param params: gdl parameters dictionary
    :return: mlflow run object
    """
    tracking_uri = params['global']['mlflow_uri']
    mlflow.set_tracking_uri(tracking_uri)
    mlexp = mlflow.get_experiment_by_name(params['global']['mlflow_experiment_name'])
    exp_id = mlexp.experiment_id
    try:
        run_ids = [
            x.run_id
            for x in mlflow.list_run_infos(exp_id, max_results=1, order_by=["tag.release DESC"])
        ]
    except AttributeError:
        mlflow_client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri)
        run_ids = [
            x.run_id for x in mlflow_client.list_run_infos(exp_id, run_view_type=3)[0:1]
        ]
    mlrun = mlflow.get_run(run_ids[0])
    return mlrun

def ab_deployment(name, namespace, secret_name, model_a_name, model_a_version,
                  model_b_name, model_b_version, model_a_traffic=50):
    client = MlflowClient()
    model_a_run_id = next(
        mv.run_id
        for mv in client.search_model_versions(f"name='{model_a_name}'")
        if mv.version == f"{model_a_version}"
    )
    model_a_artifact_uri = mlflow.get_run(model_a_run_id).info.artifact_uri
    model_b_run_id = next(
        mv.run_id
        for mv in client.search_model_versions(f"name='{model_b_name}'")
        if mv.version == f"{model_b_version}"
    )
    model_b_artifact_uri = mlflow.get_run(model_b_run_id).info.artifact_uri

    filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), "ab_deployment.json.j2")
    body = Template(open(filename).read()).render(
        name=name,
        namespace=namespace,
        secret_name=secret_name,
        model_a_name=model_a_name,
        model_a_artifact_uri=model_a_artifact_uri,
        model_b_name=model_b_name,
        model_b_artifact_uri=model_b_artifact_uri,
        model_a_traffic=model_a_traffic,
    )

    token = open("/var/run/secrets/kubernetes.io/serviceaccount/token").read()
    headers = {"Authorization": f"Bearer {token}"}
    url = "https://kubernetes.default.svc.cluster.local"
    endpoint = f"/apis/machinelearning.seldon.io/v1alpha2/namespaces/{namespace}/seldondeployments?fieldManager=kubectl-create"
    return requests.post(
        url=url + endpoint,
        json=json.loads(body),
        headers=headers,
        verify=False,
        timeout=30,
    )

def log_tags_and_params(self, remote_run_id):
    run_id = self.get_local_run_id()
    mlflow.set_tracking_uri(self.local_experiment_dir)
    run = mlflow.get_run(run_id=run_id)
    params = run.data.params
    tags = run.data.tags
    self.remote_server.set_tags(remote_run_id, tags)
    self.remote_server.log_params(remote_run_id, params)

def _fit_keras_model_no_active_run(pandas_df, epochs):
    orig_runs = mlflow.search_runs()
    orig_run_ids = set(orig_runs["run_id"])
    _fit_keras(pandas_df, epochs)
    new_runs = mlflow.search_runs()
    new_run_ids = set(new_runs["run_id"])
    assert len(new_run_ids) == len(orig_run_ids) + 1
    run_id = (new_run_ids - orig_run_ids).pop()
    return mlflow.get_run(run_id)

def test_log_params_flatten() -> None:
    with mlflow.start_run() as run:
        params = {"a": {"b": 0}}
        lg.log_params_flatten(params)
        lg.log_params_flatten(params, parent_key="d")
        lg.log_params_flatten(params, sep="_")

    loaded_run = mlflow.get_run(run.info.run_id)
    assert loaded_run.data.params == {"a.b": "0", "a_b": "0", "d.a.b": "0"}

def test_log_metrics_flatten() -> None:
    with mlflow.start_run() as run:
        metrics = {"a": {"b": 0.0}}
        lg.log_metrics_flatten(metrics)
        lg.log_metrics_flatten(metrics, parent_key="d")
        lg.log_metrics_flatten(metrics, sep="_")

    loaded_run = mlflow.get_run(run.info.run_id)
    assert loaded_run.data.metrics == {"a.b": 0.0, "a_b": 0.0, "d.a.b": 0.0}

def __init__(self, run=None, run_id=None):
    assert run is not None or run_id is not None
    self.run = run
    self.run_id = run_id
    if self.run_id is None:
        self.run_id = self.run.info.run_id
    elif self.run is None:
        self.run = mlflow.get_run(self.run_id)

def assert_tag_value_meets_requirements(run_id):
    """
    Verify that the Spark Datasource tag set on the run has been truncated to the
    maximum tag value length allowed by MLflow
    """
    run = mlflow.get_run(run_id)
    assert _SPARK_TABLE_INFO_TAG_NAME in run.data.tags
    table_info_tag = run.data.tags[_SPARK_TABLE_INFO_TAG_NAME]
    assert len(table_info_tag) == MAX_TAG_VAL_LENGTH
    assert table_info_tag.endswith("...")

def test_mlflow_logger():
    logger = MlflowLogger(experiment_name="test-experiment", run_name="test_run", tag1="my-tag")

    pipeline = Pipeline.from_config(
        PipelineConfiguration(
            name="test-pipeline",
            head=TaskHeadConfiguration(type=TextClassification, labels=["A", "B"]),
        )
    )
    trainer = TrainerConfiguration()

    logger.init_train(pipeline, trainer, training=None)
    for epoch in range(0, 10):
        logger.log_epoch_metrics(epoch, metrics={"key": 10 * epoch})

    model_path = mkdtemp()
    metrics = {"metric": 200}
    logger.end_train(TrainingResults(model_path, metrics))

    run = mlflow.get_run(logger._run_id)
    assert run

    # Tags
    assert "test_run" == run.data.tags[mlflow_tags.MLFLOW_RUN_NAME]
    assert "my-tag" == run.data.tags["tag1"]

    # Parameters
    expected_params = {
        "pipeline.features.word.trainable": "True",
        "pipeline.num_parameters": "202",
        "pipeline.num_trainable_parameters": "202",
        "pipeline.features.word.embedding_dim": "50",
        "pipeline.head.type": "biome.text.modules.heads.classification.text_classification.TextClassification",
        "pipeline.head.labels": "['A', 'B']",
        "pipeline.name": "test-pipeline",
        "pipeline.tokenizer.lang": "en",
        "trainer.batch_size": "16",
        "trainer.validation_metric": "-loss",
        "trainer.optimizer.type": "adam",
        "trainer.patience": "2",
        "trainer.num_epochs": "20",
        "trainer.num_serialized_models_to_keep": "1",
        "pipeline.tokenizer.remove_space_tokens": "True",
    }
    assert expected_params == run.data.params

    # Artifacts
    assert os.path.basename(model_path) in os.listdir(urlparse(run.info.artifact_uri).path)

    # Metrics
    for metric in metrics:
        assert metric in run.data.metrics and run.data.metrics[metric] == metrics[metric]

def test_autologging_disabled_logging_datasource_with_different_formats(
    spark_session, format_to_file_path
):
    mlflow.spark.autolog(disable=True)
    for data_format, file_path in format_to_file_path.items():
        df = (
            spark_session.read.format(data_format)
            .option("header", "true")
            .option("inferSchema", "true")
            .load(file_path)
        )
        with mlflow.start_run():
            run_id = mlflow.active_run().info.run_id
            df.collect()
            time.sleep(1)
        run = mlflow.get_run(run_id)
        _assert_spark_data_not_logged(run=run)

def _copy_mlflow_results_to_dir(run_id: str, dir_: str):
    """
    Copy MLflow run artifacts to a directory
    :param run_id:
    :param dir_:
    :return:
    """
    artifact_uri: str = mlflow.get_run(run_id).info.artifact_uri
    url = urllib3.util.parse_url(artifact_uri)
    if url.scheme != 'file':
        raise NotImplementedError('Only local artifact storage is supported')
    result_dir = join(dir_, const.SAGEMAKER_MODEL_SUBDIR)
    shutil.copytree(url.path, join(dir_, result_dir))
    logger.info(f'MLflow run: {run_id} artifacts were copied to {result_dir}')

def test_autologging_multiple_reads_same_run(spark_session, format_to_file_path):
    mlflow.spark.autolog()
    with mlflow.start_run():
        for data_format, file_path in format_to_file_path.items():
            run_id = mlflow.active_run().info.run_id
            df = spark_session.read.format(data_format).load(file_path)
            df.collect()
            time.sleep(1)

        run = mlflow.get_run(run_id)
        assert _SPARK_TABLE_INFO_TAG_NAME in run.data.tags
        table_info_tag = run.data.tags[_SPARK_TABLE_INFO_TAG_NAME]
        assert table_info_tag == "\n".join(
            [
                _get_expected_table_info_row(path, data_format)
                for data_format, path in format_to_file_path.items()
            ]
        )

def test_fetch_create_and_log(tmpdir):
    entry_point_name = "entry_point"
    parameters = {
        "method_name": "string",
    }
    entry_point = _project_spec.EntryPoint(entry_point_name, parameters, "run_model.sh")
    mock_fetched_project = _project_spec.Project(
        env_type="local",
        env_config_path=None,
        entry_points={entry_point_name: entry_point},
        docker_env=None,
        name="my_project",
    )
    experiment_id = mlflow.create_experiment("test_fetch_project")
    expected_dir = tmpdir
    project_uri = "http://someuri/myproject.git"
    user_param = {"method_name": "newton"}
    with mock.patch("mlflow.projects.utils._fetch_project", return_value=expected_dir):
        with mock.patch(
            "mlflow.projects._project_spec.load_project", return_value=mock_fetched_project
        ):
            work_dir = fetch_and_validate_project("", "", entry_point_name, user_param)
            project = load_project(work_dir)
            assert mock_fetched_project == project
            assert expected_dir == work_dir
            # Create a run
            active_run = get_or_create_run(
                run_id=None,
                uri=project_uri,
                experiment_id=experiment_id,
                work_dir=work_dir,
                version=None,
                entry_point=entry_point_name,
                parameters=user_param,
            )
            # check tags
            run = mlflow.get_run(active_run.info.run_id)
            assert MLFLOW_PROJECT_ENTRY_POINT in run.data.tags
            assert MLFLOW_SOURCE_NAME in run.data.tags
            assert entry_point_name == run.data.tags[MLFLOW_PROJECT_ENTRY_POINT]
            assert project_uri == run.data.tags[MLFLOW_SOURCE_NAME]
            assert user_param == run.data.params

def test_autologging_multiple_runs_same_data(spark_session, format_to_file_path):
    mlflow.spark.autolog()
    data_format = list(format_to_file_path.keys())[0]
    file_path = format_to_file_path[data_format]
    df = (
        spark_session.read.format(data_format)
        .option("header", "true")
        .option("inferSchema", "true")
        .load(file_path)
    )
    df.collect()

    for _ in range(2):
        with mlflow.start_run():
            time.sleep(1)
            run_id = mlflow.active_run().info.run_id
            run = mlflow.get_run(run_id)
            _assert_spark_data_logged(run=run, path=file_path, data_format=data_format)

def get_mlflow_logger(output_dir=None, checkpoint_dir=None, mlflow_enable=True):
    if mlflow_enable:
        mlflow_logger = MLflowLogger()
        active_run = mlflow.active_run()
        active_run = mlflow.get_run(active_run.info.run_id)
        if output_dir is not None:
            run_fname = os.path.join(output_dir, RUN_FNAME)
            with open(run_fname, 'w') as f:
                yaml.dump(active_run.to_dictionary(), f)
        if checkpoint_dir is not None and output_dir != checkpoint_dir:
            run_fname = os.path.join(checkpoint_dir, RUN_FNAME)
            with open(run_fname, 'w') as f:
                yaml.dump(active_run.to_dictionary(), f)
        return mlflow_logger
    else:
        return None

def test_mlflow_methods(url, project, model, version, mock_oauth, use_google_oauth):
    _mock_get_project_call(project)
    _mock_get_model_call(project, model)
    _mock_new_model_version_call(model, version)

    merlin.set_url(url, use_google_oauth=use_google_oauth)
    merlin.set_project(project.name)
    merlin.set_model(model.name, model.type)

    with merlin.new_model_version() as v:
        merlin.log_metric("metric", 0.1)
        merlin.log_param("param", "value")
        merlin.set_tag("tag", "value")
        run_id = v.mlflow_run_id

    run = mlflow.get_run(run_id=run_id)
    assert run.data.metrics["metric"] == 0.1
    assert run.data.params["param"] == "value"
    assert run.data.tags["tag"] == "value"

def get_current_config(default=None):
    """
    Get configuration defined in the current mlflow run
    :return:
    """
    global configs
    active_run = mlflow.active_run()
    if active_run in configs.keys():
        return configs[active_run]

    if not active_run:
        pads = get_current_pads()
        if pads.config:
            return pads.config
        else:
            return default

    run = mlflow.get_run(active_run.info.run_id)
    if CONFIG_NAME in run.data.tags:
        configs[active_run] = ast.literal_eval(run.data.tags[CONFIG_NAME])
        return configs[active_run]
    return default

def run_logging_operations():
    with mlflow.start_run() as run:
        mlflow.log_param("p", "param")
        mlflow.log_metric("m", 1.0)
        mlflow.set_tag("t", "tag")
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=MockModel(),
            registered_model_name="mock",
        )

    runs = mlflow.search_runs(experiment_ids=["0"], order_by=["param.start_time DESC"])
    run = mlflow.get_run(runs["run_id"][0])

    # Ensure the following migration scripts work correctly:
    # - cfd24bdc0731_update_run_status_constraint_with_killed.py
    # - 0a8213491aaa_drop_duplicate_killed_constraint.py
    client = mlflow.tracking.MlflowClient()
    client.set_terminated(run_id=run.info.run_id, status="KILLED")

def test_execute_solid_with_mlflow_resource():
    run_id_holder = {}

    params = {"learning_rate": "0.01", "n_estimators": "10"}
    extra_tags = {"super": "experiment"}

    @solid(required_resource_keys={"mlflow"})
    def solid1(_):
        mlflow.log_params(params)
        run_id_holder["solid1_run_id"] = mlflow.active_run().info.run_id

    @solid(required_resource_keys={"mlflow"})
    def solid2(_, _arg1):
        run_id_holder["solid2_run_id"] = mlflow.active_run().info.run_id

    @pipeline(mode_defs=[ModeDefinition(resource_defs={"mlflow": mlflow_tracking})])
    def mlf_pipeline():
        solid2(solid1())

    result = execute_pipeline(
        mlf_pipeline,
        run_config={
            "resources": {
                "mlflow": {
                    "config": {
                        "experiment_name": "my_experiment",
                        "extra_tags": extra_tags,
                    }
                }
            }
        },
    )
    assert result.success

    assert run_id_holder["solid1_run_id"] == run_id_holder["solid2_run_id"]

    run = mlflow.get_run(run_id_holder["solid1_run_id"])
    assert run.data.params == params
    assert set(extra_tags.items()).issubset(run.data.tags.items())

    assert mlflow.get_experiment_by_name("my_experiment")

def get_latest_run(experiment_id, tags=None, status="FINISHED", custom_query=None):
    """Get the latest MLflow run that matches the parameters.

    Params:
        tags: dictionary of tag name, value pairs. Note that a run without a supplied tag
            will not get matched in any case.
        custom_query: string to be added to the query in addition to the tag and status clauses
    """
    query = f"attributes.status = '{status}'"
    if tags is not None:
        tags_query = [f"tags.`{key}` = '{value}'" for key, value in tags.items()]
        tags_query = " and ".join(tags_query)
        query = f"{query} and {tags_query}"
    if custom_query is not None:
        query = f"{query} and {custom_query}"

    latest_run = mlflow.get_run(
        mlflow.search_runs(
            experiment_ids=[experiment_id],
            run_view_type=ViewType.ACTIVE_ONLY,
            filter_string=query,
            max_results=1,
        )
        .loc[0]
        .run_id
    )
    return latest_run

def load_artifacts(
    run_id: str,
    device: torch.device = torch.device("cpu"),
) -> Dict:
    """Load artifacts for a particular `run_id`.

    Args:
        run_id (str): ID of the run to load model artifacts from.
        device (torch.device): Device to run model on. Defaults to CPU.

    Returns:
        Artifacts needed for inference.
    """
    # Download run artifacts into a temporary directory and load them
    client = mlflow.tracking.MlflowClient()
    with tempfile.TemporaryDirectory() as fp:
        client.download_artifacts(run_id=run_id, path="", dst_path=fp)
        label_encoder = data.LabelEncoder.load(fp=Path(fp, "label_encoder.json"))
        tokenizer = data.Tokenizer.load(fp=Path(fp, "tokenizer.json"))
        model_state = torch.load(Path(fp, "model.pt"), map_location=device)
        performance = utils.load_dict(filepath=Path(fp, "performance.json"))

    # Load model
    run = mlflow.get_run(run_id=run_id)
    args = Namespace(**run.data.params)
    model = models.initialize_model(
        args=args, vocab_size=len(tokenizer), num_classes=len(label_encoder)
    )
    model.load_state_dict(model_state)

    return {
        "args": args,
        "label_encoder": label_encoder,
        "tokenizer": tokenizer,
        "model": model,
        "performance": performance,
    }

def test_autologging_slow_api_requests(spark_session, format_to_file_path):
    import mlflow.utils.rest_utils

    orig = mlflow.utils.rest_utils.http_request

    def _slow_api_req_mock(*args, **kwargs):
        if kwargs.get("method") == "POST":
            print("Sleeping, %s, %s" % (args, kwargs))
            time.sleep(1)
        return orig(*args, **kwargs)

    mlflow.spark.autolog()
    with mlflow.start_run():
        # Mock slow API requests to log Spark datasource information
        with mock.patch("mlflow.utils.rest_utils.http_request") as http_request_mock:
            http_request_mock.side_effect = _slow_api_req_mock
            run_id = mlflow.active_run().info.run_id
            for data_format, file_path in format_to_file_path.items():
                df = (
                    spark_session.read.format(data_format)
                    .option("header", "true")
                    .option("inferSchema", "true")
                    .load(file_path)
                )
                df.collect()
            # Sleep a bit prior to ending the run to guarantee that the Python process can pick
            # up on datasource read events (simulate the common case of doing work, e.g. model
            # training, on the DataFrame after reading from it)
            time.sleep(1)

    # Python subscriber threads should pick up the active run at the time they're notified
    # & make API requests against that run, even if those requests are slow.
    time.sleep(5)

    run = mlflow.get_run(run_id)
    assert _SPARK_TABLE_INFO_TAG_NAME in run.data.tags
    table_info_tag = run.data.tags[_SPARK_TABLE_INFO_TAG_NAME]
    assert table_info_tag == "\n".join(
        [
            _get_expected_table_info_row(path, data_format)
            for data_format, path in format_to_file_path.items()
        ]
    )