def _should_log_model(spark_model):
    """Decide whether autologging should log ``spark_model``.

    :param spark_model: A fitted pyspark ML model (``pyspark.ml.base.Model``).
    :return: True when the model's fully-qualified class name matches
             ``_log_model_allowlist`` (exactly, or via a trailing-``*`` wildcard
             entry) and every nested sub-model also passes this check.
    """
    from pyspark.ml.base import Model

    # TODO: Handle PipelineModel/CrossValidatorModel/TrainValidationSplitModel
    class_name = _get_fully_qualified_class_name(spark_model)
    should_log = class_name in _log_model_allowlist
    if not should_log:
        # only support one trailing *: an allowlist entry "pkg.*" matches any
        # class name starting with "pkg.".
        should_log = any(
            name.endswith("*") and class_name.startswith(name[:-1])
            for name in _log_model_allowlist
        )
    if not should_log:
        return False

    if class_name == "pyspark.ml.classification.OneVsRestModel":
        # presumably all per-class sub-models share one classifier type, so the
        # first is checked as representative — TODO confirm
        return _should_log_model(spark_model.models[0])
    elif class_name == "pyspark.ml.pipeline.PipelineModel":
        # Every fitted stage (a Model) must itself be loggable.
        return all(
            _should_log_model(stage)
            for stage in spark_model.stages
            if isinstance(stage, Model)
        )
    elif _is_parameter_search_model(spark_model):
        # For parameter-search results only the best model is considered.
        return _should_log_model(spark_model.bestModel)
    else:
        # Iterate param values only — keys are irrelevant to this check.
        # Transformers are logged by default as the same behavior as PipelineModel
        return all(
            _should_log_model(param_value)
            for param_value in _get_param_map(spark_model).values()
            if isinstance(param_value, Model)
        )
def fit_mlflow(original, self, *args, **kwargs):
    """Autologging wrapper around ``Estimator.fit``.

    Logs pre-/post-training metadata around the wrapped ``original`` call.

    :param original: The unpatched ``fit`` implementation.
    :param self: The pyspark estimator instance being fit.
    :return: Whatever ``original`` returns (the fitted model, or a model
             iterator when a list/tuple of params is passed).
    """
    params = get_method_call_arg_value(1, "params", None, args, kwargs)

    # Do not perform autologging on direct calls to fit() for featurizers.
    # Note that featurizers will be autologged when they're fit as part of a Pipeline.
    if _get_fully_qualified_class_name(self).startswith("pyspark.ml.feature."):
        return original(self, *args, **kwargs)
    elif isinstance(params, (list, tuple)):
        # skip the case params is a list or tuple, this case it will call
        # fitMultiple and return a model iterator
        _logger.warning(_get_warning_msg_for_fit_call_with_a_list_of_params(self))
        return original(self, *args, **kwargs)
    else:
        # we need generate estimator param map so we call `self.copy(params)` to construct
        # an estimator with the extra params.
        from pyspark.storagelevel import StorageLevel

        estimator = self.copy(params) if params is not None else self
        _log_pretraining_metadata(estimator, params)
        # Cache the training DataFrame so post-training logging passes do not
        # recompute it.
        input_training_df = args[0].persist(StorageLevel.MEMORY_AND_DISK)
        try:
            spark_model = original(self, *args, **kwargs)
            _log_posttraining_metadata(estimator, spark_model, params, input_training_df)
        finally:
            # Always release the cached DataFrame — previously a failure in
            # fit() or in post-training logging left it persisted.
            input_training_df.unpersist()
        return spark_model
def gen_evaluator_info(self, evaluator):
    """
    Generate evaluator information, include evaluator class name and params.
    """
    # Truncate keys/values so the param map fits MLflow entity size limits.
    truncated_params = _truncate_dict(
        _get_param_map(evaluator), MAX_ENTITY_KEY_LENGTH, MAX_PARAM_VAL_LENGTH
    )
    return {
        "evaluator_class": _get_fully_qualified_class_name(evaluator),
        "params": truncated_params,
    }
def _get_estimator_info_tags(estimator):
    """
    :return: A dictionary of MLflow run tag keys and values describing the
             specified estimator.
    """
    tags = {}
    # Short class name plus the fully-qualified class path.
    tags["estimator_name"] = estimator.__class__.__name__
    tags["estimator_class"] = _get_fully_qualified_class_name(estimator)
    return tags
def _should_log_model(spark_model):
    """Return True if ``spark_model`` is eligible for autologging per the
    ``_log_model_allowlist``."""
    # TODO: Handle PipelineModel/CrossValidatorModel/TrainValidationSplitModel
    class_name = _get_fully_qualified_class_name(spark_model)
    if class_name not in _log_model_allowlist:
        return False
    if class_name == "pyspark.ml.classification.OneVsRestModel":
        # Recurse into the first sub-model.
        return _should_log_model(spark_model.models[0])
    return True
def _should_log_model(spark_model):
    """Return True if ``spark_model`` (including every fitted pipeline stage)
    is eligible for autologging per the ``_log_model_allowlist``."""
    from pyspark.ml.base import Model

    # TODO: Handle PipelineModel/CrossValidatorModel/TrainValidationSplitModel
    class_name = _get_fully_qualified_class_name(spark_model)
    if class_name not in _log_model_allowlist:
        return False
    if class_name == "pyspark.ml.classification.OneVsRestModel":
        # Recurse into the first sub-model.
        return _should_log_model(spark_model.models[0])
    if class_name == "pyspark.ml.pipeline.PipelineModel":
        # Every fitted stage (i.e. each Model) must itself be loggable.
        fitted_stages = [s for s in spark_model.stages if isinstance(s, Model)]
        return all(_should_log_model(stage) for stage in fitted_stages)
    return True
def save(self, path):
    """Write the evaluation results to the specified local filesystem path"""
    os.makedirs(path, exist_ok=True)

    # Dump scalar metrics as a single JSON document.
    with open(os.path.join(path, "metrics.json"), "w") as fp:
        json.dump(self.metrics, fp)

    # Persist a per-artifact metadata index (uri + class) so artifacts can be
    # reconstructed with the right class when loaded back.
    artifacts_metadata = {
        artifact_name: {
            "uri": artifact.uri,
            "class_name": _get_fully_qualified_class_name(artifact),
        }
        for artifact_name, artifact in self.artifacts.items()
    }
    with open(os.path.join(path, "artifacts_metadata.json"), "w") as fp:
        json.dump(artifacts_metadata, fp)

    artifacts_dir = os.path.join(path, "artifacts")
    # Use makedirs(exist_ok=True) for consistency with the destination root
    # above, so re-saving over an existing directory no longer raises
    # FileExistsError (previously os.mkdir).
    os.makedirs(artifacts_dir, exist_ok=True)

    for artifact_name, artifact in self.artifacts.items():
        artifact._save(os.path.join(artifacts_dir, artifact_name))
def _should_log_model(spark_model):
    """Return True when ``spark_model`` and all of its nested sub-models are in
    the autologging allowlist.

    :param spark_model: A fitted pyspark ML model (``pyspark.ml.base.Model``).
    :return: True iff the class name is allowlisted and, recursively, every
             nested Model (OneVsRest sub-model, pipeline stage, best model of a
             parameter search, or Model-valued param) passes this check too.
    """
    from pyspark.ml.base import Model

    # TODO: Handle PipelineModel/CrossValidatorModel/TrainValidationSplitModel
    class_name = _get_fully_qualified_class_name(spark_model)
    if class_name in _log_model_allowlist:
        if class_name == "pyspark.ml.classification.OneVsRestModel":
            # presumably every per-class sub-model shares one classifier type,
            # so the first is checked as representative — TODO confirm
            return _should_log_model(spark_model.models[0])
        elif class_name == "pyspark.ml.pipeline.PipelineModel":
            return all(
                _should_log_model(stage)
                for stage in spark_model.stages
                if isinstance(stage, Model)
            )
        elif _is_parameter_search_model(spark_model):
            # Only the best model from a parameter search is considered.
            return _should_log_model(spark_model.bestModel)
        else:
            # Iterate param values only — keys are not needed (was .items()
            # with the key discarded).
            # Transformers are logged by default as the same behavior as PipelineModel
            return all(
                _should_log_model(param_value)
                for param_value in _get_param_map(spark_model).values()
                if isinstance(param_value, Model)
            )
    else:
        return False
def fit_mlflow(original, self, *args, **kwargs):
    """Autologging wrapper around ``Estimator.fit``: records pre- and
    post-training metadata around the wrapped ``original`` call."""
    # Extract the optional ``params`` argument, passed either as the second
    # positional argument or as the ``params`` keyword.
    if len(args) > 1:
        params = args[1]
    elif "params" in kwargs:
        params = kwargs["params"]
    else:
        params = None

    # Do not perform autologging on direct calls to fit() for featurizers.
    # Note that featurizers will be autologged when they're fit as part of a Pipeline.
    if _get_fully_qualified_class_name(self).startswith("pyspark.ml.feature."):
        return original(self, *args, **kwargs)

    if isinstance(params, (list, tuple)):
        # skip the case params is a list or tuple, this case it will call
        # fitMultiple and return a model iterator
        _logger.warning(_get_warning_msg_for_fit_call_with_a_list_of_params(self))
        return original(self, *args, **kwargs)

    estimator = self.copy(params)
    _log_pretraining_metadata(estimator, params)
    spark_model = original(self, *args, **kwargs)
    _log_posttraining_metadata(estimator, spark_model, params)
    return spark_model
def test_get_fully_qualified_class_name():
    """A locally defined class resolves to '<module>.Foo'."""

    class Foo:
        pass

    instance = Foo()
    expected = f"{__name__}.Foo"
    assert _get_fully_qualified_class_name(instance) == expected
def save_model(
    xgb_model,
    path,
    conda_env=None,
    code_paths=None,
    mlflow_model=None,
    signature: ModelSignature = None,
    input_example: ModelInputExample = None,
    pip_requirements=None,
    extra_pip_requirements=None,
):
    """
    Save an XGBoost model to a path on the local file system.

    :param xgb_model: XGBoost model (an instance of `xgboost.Booster`_ or models
                      that implement the `scikit-learn API`_) to be saved.
    :param path: Local path where the model is to be saved.
    :param conda_env: {{ conda_env }}
    :param code_paths: A list of local filesystem paths to Python file dependencies (or directories
                       containing file dependencies). These files are *prepended* to the system
                       path when the model is loaded.
    :param mlflow_model: :py:mod:`mlflow.models.Model` this flavor is being added to.
    :param signature: :py:class:`ModelSignature <mlflow.models.ModelSignature>`
                      describes model input and output :py:class:`Schema <mlflow.types.Schema>`.
                      The model signature can be :py:func:`inferred <mlflow.models.infer_signature>`
                      from datasets with valid model input (e.g. the training dataset with target
                      column omitted) and valid model output (e.g. model predictions generated on
                      the training dataset), for example:

                      .. code-block:: python

                        from mlflow.models.signature import infer_signature

                        train = df.drop_column("target_label")
                        predictions = ...  # compute model predictions
                        signature = infer_signature(train, predictions)
    :param input_example: Input example provides one or several instances of valid
                          model input. The example can be used as a hint of what data to
                          feed the model. The given example will be converted to a
                          Pandas DataFrame and then serialized to json using the
                          Pandas split-oriented format. Bytes are base64-encoded.
    :param pip_requirements: {{ pip_requirements }}
    :param extra_pip_requirements: {{ extra_pip_requirements }}
    """
    import xgboost as xgb

    _validate_env_arguments(conda_env, pip_requirements, extra_pip_requirements)

    path = os.path.abspath(path)
    _validate_and_prepare_target_save_path(path)
    # Copy user-supplied code dependencies under the model directory.
    code_dir_subpath = _validate_and_copy_code_paths(code_paths, path)

    if mlflow_model is None:
        mlflow_model = Model()
    if signature is not None:
        mlflow_model.signature = signature
    if input_example is not None:
        _save_example(mlflow_model, input_example, path)

    model_data_subpath = "model.xgb"
    model_data_path = os.path.join(path, model_data_subpath)
    # Save an XGBoost model
    xgb_model.save_model(model_data_path)
    xgb_model_class = _get_fully_qualified_class_name(xgb_model)
    # Register both the generic pyfunc flavor and the native xgboost flavor
    # in the MLmodel metadata.
    pyfunc.add_to_model(
        mlflow_model,
        loader_module="mlflow.xgboost",
        data=model_data_subpath,
        env=_CONDA_ENV_FILE_NAME,
        code=code_dir_subpath,
    )
    mlflow_model.add_flavor(
        FLAVOR_NAME,
        xgb_version=xgb.__version__,
        data=model_data_subpath,
        model_class=xgb_model_class,
        code=code_dir_subpath,
    )
    mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))

    if conda_env is None:
        if pip_requirements is None:
            default_reqs = get_default_pip_requirements()
            # To ensure `_load_pyfunc` can successfully load the model during the dependency
            # inference, `mlflow_model.save` must be called beforehand to save an MLmodel file.
            inferred_reqs = mlflow.models.infer_pip_requirements(
                path,
                FLAVOR_NAME,
                fallback=default_reqs,
            )
            default_reqs = sorted(set(inferred_reqs).union(default_reqs))
        else:
            default_reqs = None
        conda_env, pip_requirements, pip_constraints = _process_pip_requirements(
            default_reqs,
            pip_requirements,
            extra_pip_requirements,
        )
    else:
        conda_env, pip_requirements, pip_constraints = _process_conda_env(conda_env)

    # Write the resolved conda environment spec alongside the model.
    with open(os.path.join(path, _CONDA_ENV_FILE_NAME), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    # Save `constraints.txt` if necessary
    if pip_constraints:
        write_to(os.path.join(path, _CONSTRAINTS_FILE_NAME), "\n".join(pip_constraints))

    # Save `requirements.txt`
    write_to(os.path.join(path, _REQUIREMENTS_FILE_NAME), "\n".join(pip_requirements))