Пример #1
0
    def _log_pretraining_metadata(estimator, *args, **kwargs):  # pylint: disable=unused-argument
        """
        Records metadata (e.g., params and tags) for a scikit-learn estimator prior to training.
        This is intended to be invoked within a patched scikit-learn training routine
        (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of an active
        MLflow run that can be referenced via the fluent Tracking API.

        :param estimator: The scikit-learn estimator for which to log metadata.
        :param args: The arguments passed to the scikit-learn training routine (e.g.,
                     `fit()`, `fit_transform()`, ...).
        :param kwargs: The keyword arguments passed to the scikit-learn training routine.
        """
        # Deep parameter logging includes parameters from children of a given
        # estimator. For some meta estimators (e.g., pipelines), recording
        # these parameters is desirable. For parameter search estimators,
        # however, child estimators act as seeds for the parameter search
        # process; accordingly, we avoid logging initial, untuned parameters
        # for these seed estimators.
        should_log_params_deeply = not _is_parameter_search_estimator(
            estimator)
        # Chunk model parameters to avoid hitting the log_batch API limit
        for chunk in _chunk_dict(
                estimator.get_params(deep=should_log_params_deeply),
                chunk_size=MAX_PARAMS_TAGS_PER_BATCH,
        ):
            truncated = _truncate_dict(chunk, MAX_ENTITY_KEY_LENGTH,
                                       MAX_PARAM_VAL_LENGTH)
            try_mlflow_log(mlflow.log_params, truncated)

        try_mlflow_log(mlflow.set_tags, _get_estimator_info_tags(estimator))
Пример #2
0
    def _log_pretraining_metadata(autologging_client, estimator, *args,
                                  **kwargs):  # pylint: disable=unused-argument
        """
        Records metadata (e.g., params and tags) for a scikit-learn estimator prior to training.
        This is intended to be invoked within a patched scikit-learn training routine
        (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of an active
        MLflow run that can be referenced via the fluent Tracking API.

        :param autologging_client: An instance of `MlflowAutologgingQueueingClient` used for
                                   efficiently logging run data to MLflow Tracking.
        :param estimator: The scikit-learn estimator for which to log metadata.
        :param args: The arguments passed to the scikit-learn training routine (e.g.,
                     `fit()`, `fit_transform()`, ...).
        :param kwargs: The keyword arguments passed to the scikit-learn training routine.
        """
        # Deep parameter logging includes parameters from children of a given
        # estimator. For some meta estimators (e.g., pipelines), recording
        # these parameters is desirable. For parameter search estimators,
        # however, child estimators act as seeds for the parameter search
        # process; accordingly, we avoid logging initial, untuned parameters
        # for these seed estimators.
        should_log_params_deeply = not _is_parameter_search_estimator(
            estimator)
        run_id = mlflow.active_run().info.run_id
        autologging_client.log_params(
            run_id=mlflow.active_run().info.run_id,
            params=estimator.get_params(deep=should_log_params_deeply),
        )
        autologging_client.set_tags(
            run_id=run_id,
            tags=_get_estimator_info_tags(estimator),
        )
Пример #3
0
    def _log_posttraining_metadata(estimator, *args, **kwargs):
        """
        Records metadata for a scikit-learn estimator after training has completed.
        This is intended to be invoked within a patched scikit-learn training routine
        (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of an active
        MLflow run that can be referenced via the fluent Tracking API.

        :param estimator: The scikit-learn estimator for which to log metadata.
        :param args: The arguments passed to the scikit-learn training routine (e.g.,
                     `fit()`, `fit_transform()`, ...).
        :param kwargs: The keyword arguments passed to the scikit-learn training routine.
        """
        if hasattr(estimator, "score"):
            try:
                score_args = _get_args_for_score(estimator.score,
                                                 estimator.fit, args, kwargs)
                training_score = estimator.score(*score_args)
            except Exception as e:
                msg = (
                    estimator.score.__qualname__ +
                    " failed. The 'training_score' metric will not be recorded. Scoring error: "
                    + str(e))
                _logger.warning(msg)
            else:
                try_mlflow_log(mlflow.log_metric, "training_score",
                               training_score)

        # log common metrics and artifacts for estimators (classifier, regressor)
        _log_specialized_estimator_content(estimator,
                                           mlflow.active_run().info.run_id,
                                           args, kwargs)

        def get_input_example():
            # Fetch an input example using the first several rows of the array-like
            # training data supplied to the training routine (e.g., `fit()`)
            fit_arg_names = _get_arg_names(estimator.fit)
            X_var_name, y_var_name = fit_arg_names[:2]
            input_example = _get_Xy(args, kwargs, X_var_name,
                                    y_var_name)[0][:INPUT_EXAMPLE_SAMPLE_ROWS]
            return input_example

        def infer_model_signature(input_example):
            if not hasattr(estimator, "predict"):
                raise Exception(
                    "the trained model does not specify a `predict` function, "
                    + "which is required in order to infer the signature")

            return infer_signature(input_example,
                                   estimator.predict(input_example))

        if log_models:
            # Will only resolve `input_example` and `signature` if `log_models` is `True`.
            input_example, signature = resolve_input_example_and_signature(
                get_input_example,
                infer_model_signature,
                log_input_examples,
                log_model_signatures,
                _logger,
            )

            try_mlflow_log(
                log_model,
                estimator,
                artifact_path="model",
                signature=signature,
                input_example=input_example,
            )

        if _is_parameter_search_estimator(estimator):
            if hasattr(estimator, "best_estimator_") and log_models:
                try_mlflow_log(
                    log_model,
                    estimator.best_estimator_,
                    artifact_path="best_estimator",
                    signature=signature,
                    input_example=input_example,
                )

            if hasattr(estimator, "best_score_"):
                try_mlflow_log(mlflow.log_metric, "best_cv_score",
                               estimator.best_score_)

            if hasattr(estimator, "best_params_"):
                best_params = {
                    "best_{param_name}".format(param_name=param_name):
                    param_value
                    for param_name, param_value in
                    estimator.best_params_.items()
                }
                try_mlflow_log(mlflow.log_params, best_params)

            if hasattr(estimator, "cv_results_"):
                try:
                    # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
                    # information is consistent with the parent run
                    child_tags = context_registry.resolve_tags()
                    child_tags.update({MLFLOW_AUTOLOGGING: FLAVOR_NAME})
                    _create_child_runs_for_parameter_search(
                        cv_estimator=estimator,
                        parent_run=mlflow.active_run(),
                        child_tags=child_tags,
                    )
                except Exception as e:

                    msg = (
                        "Encountered exception during creation of child runs for parameter search."
                        " Child runs may be missing. Exception: {}".format(
                            str(e)))
                    _logger.warning(msg)

                try:
                    cv_results_df = pd.DataFrame.from_dict(
                        estimator.cv_results_)
                    _log_parameter_search_results_as_artifact(
                        cv_results_df,
                        mlflow.active_run().info.run_id)
                except Exception as e:

                    msg = (
                        "Failed to log parameter search results as an artifact."
                        " Exception: {}".format(str(e)))
                    _logger.warning(msg)
Пример #4
0
    def _log_posttraining_metadata(estimator, *args, **kwargs):
        """
        Records metadata for a scikit-learn estimator after training has completed.
        This is intended to be invoked within a patched scikit-learn training routine
        (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of an active
        MLflow run that can be referenced via the fluent Tracking API.

        :param estimator: The scikit-learn estimator for which to log metadata.
        :param args: The arguments passed to the scikit-learn training routine (e.g.,
                     `fit()`, `fit_transform()`, ...).
        :param kwargs: The keyword arguments passed to the scikit-learn training routine.
        """
        if hasattr(estimator, "score"):
            try:
                score_args = _get_args_for_score(estimator.score,
                                                 estimator.fit, args, kwargs)
                training_score = estimator.score(*score_args)
            except Exception as e:  # pylint: disable=broad-except
                msg = (
                    estimator.score.__qualname__ +
                    " failed. The 'training_score' metric will not be recorded. Scoring error: "
                    + str(e))
                _logger.warning(msg)
            else:
                try_mlflow_log(mlflow.log_metric, "training_score",
                               training_score)

        # log common metrics and artifacts for estimators (classifier, regressor)
        _log_specialized_estimator_content(estimator,
                                           mlflow.active_run().info.run_id,
                                           args, kwargs)

        input_example = None
        signature = None
        if hasattr(estimator, "predict"):
            try:
                # Fetch an input example using the first several rows of the array-like
                # training data supplied to the training routine (e.g., `fit()`)
                SAMPLE_ROWS = 5
                fit_arg_names = _get_arg_names(estimator.fit)
                X_var_name, y_var_name = fit_arg_names[:2]
                input_example = _get_Xy(args, kwargs, X_var_name,
                                        y_var_name)[0][:SAMPLE_ROWS]

                model_output = estimator.predict(input_example)
                signature = infer_signature(input_example, model_output)
            except Exception as e:  # pylint: disable=broad-except
                input_example = None
                msg = "Failed to infer an input example and model signature: " + str(
                    e)
                _logger.warning(msg)

        try_mlflow_log(
            log_model,
            estimator,
            artifact_path="model",
            signature=signature,
            input_example=input_example,
        )

        if _is_parameter_search_estimator(estimator):
            if hasattr(estimator, "best_estimator_"):
                try_mlflow_log(
                    log_model,
                    estimator.best_estimator_,
                    artifact_path="best_estimator",
                    signature=signature,
                    input_example=input_example,
                )

            if hasattr(estimator, "best_params_"):
                best_params = {
                    "best_{param_name}".format(param_name=param_name):
                    param_value
                    for param_name, param_value in
                    estimator.best_params_.items()
                }
                try_mlflow_log(mlflow.log_params, best_params)

            if hasattr(estimator, "cv_results_"):
                try:
                    # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
                    # information is consistent with the parent run
                    environment_tags = context_registry.resolve_tags()
                    _create_child_runs_for_parameter_search(
                        cv_estimator=estimator,
                        parent_run=mlflow.active_run(),
                        child_tags=environment_tags,
                    )
                except Exception as e:  # pylint: disable=broad-except

                    msg = (
                        "Encountered exception during creation of child runs for parameter search."
                        " Child runs may be missing. Exception: {}".format(
                            str(e)))
                    _logger.warning(msg)

                try:
                    cv_results_df = pd.DataFrame.from_dict(
                        estimator.cv_results_)
                    _log_parameter_search_results_as_artifact(
                        cv_results_df,
                        mlflow.active_run().info.run_id)
                except Exception as e:  # pylint: disable=broad-except

                    msg = (
                        "Failed to log parameter search results as an artifact."
                        " Exception: {}".format(str(e)))
                    _logger.warning(msg)
Пример #5
0
    def _log_posttraining_metadata(autologging_client, estimator, *args,
                                   **kwargs):
        """
        Records metadata for a scikit-learn estimator after training has completed.
        This is intended to be invoked within a patched scikit-learn training routine
        (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of an active
        MLflow run that can be referenced via the fluent Tracking API.

        :param autologging_client: An instance of `MlflowAutologgingQueueingClient` used for
                                   efficiently logging run data to MLflow Tracking.
        :param estimator: The scikit-learn estimator for which to log metadata.
        :param args: The arguments passed to the scikit-learn training routine (e.g.,
                     `fit()`, `fit_transform()`, ...).
        :param kwargs: The keyword arguments passed to the scikit-learn training routine.
        """
        def infer_model_signature(input_example):
            if not hasattr(estimator, "predict"):
                raise Exception(
                    "the trained model does not specify a `predict` function, "
                    + "which is required in order to infer the signature")

            return infer_signature(input_example,
                                   estimator.predict(input_example))

        (X, y_true,
         sample_weight) = _get_args_for_metrics(estimator.fit, args, kwargs)

        # log common metrics and artifacts for estimators (classifier, regressor)
        logged_metrics = _log_estimator_content(
            autologging_client=autologging_client,
            estimator=estimator,
            prefix=_TRAINING_PREFIX,
            run_id=mlflow.active_run().info.run_id,
            X=X,
            y_true=y_true,
            sample_weight=sample_weight,
        )
        if y_true is None and not logged_metrics:
            _logger.warning(
                "Training metrics will not be recorded because training labels were not specified."
                " To automatically record training metrics, provide training labels as inputs to"
                " the model training function.")

        def get_input_example():
            # Fetch an input example using the first several rows of the array-like
            # training data supplied to the training routine (e.g., `fit()`)
            input_example = X[:INPUT_EXAMPLE_SAMPLE_ROWS]
            return input_example

        if log_models:
            # Will only resolve `input_example` and `signature` if `log_models` is `True`.
            input_example, signature = resolve_input_example_and_signature(
                get_input_example,
                infer_model_signature,
                log_input_examples,
                log_model_signatures,
                _logger,
            )

            log_model(
                estimator,
                artifact_path="model",
                signature=signature,
                input_example=input_example,
            )

        if _is_parameter_search_estimator(estimator):
            if hasattr(estimator, "best_estimator_") and log_models:
                log_model(
                    estimator.best_estimator_,
                    artifact_path="best_estimator",
                    signature=signature,
                    input_example=input_example,
                )

            if hasattr(estimator, "best_score_"):
                autologging_client.log_metrics(
                    run_id=mlflow.active_run().info.run_id,
                    metrics={"best_cv_score": estimator.best_score_},
                )

            if hasattr(estimator, "best_params_"):
                best_params = {
                    "best_{param_name}".format(param_name=param_name):
                    param_value
                    for param_name, param_value in
                    estimator.best_params_.items()
                }
                autologging_client.log_params(
                    run_id=mlflow.active_run().info.run_id,
                    params=best_params,
                )

            if hasattr(estimator, "cv_results_"):
                try:
                    # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
                    # information is consistent with the parent run
                    child_tags = context_registry.resolve_tags()
                    child_tags.update({MLFLOW_AUTOLOGGING: FLAVOR_NAME})
                    _create_child_runs_for_parameter_search(
                        autologging_client=autologging_client,
                        cv_estimator=estimator,
                        parent_run=mlflow.active_run(),
                        max_tuning_runs=max_tuning_runs,
                        child_tags=child_tags,
                    )
                except Exception as e:

                    msg = (
                        "Encountered exception during creation of child runs for parameter search."
                        " Child runs may be missing. Exception: {}".format(
                            str(e)))
                    _logger.warning(msg)

                try:
                    cv_results_df = pd.DataFrame.from_dict(
                        estimator.cv_results_)
                    _log_parameter_search_results_as_artifact(
                        cv_results_df,
                        mlflow.active_run().info.run_id)
                except Exception as e:

                    msg = (
                        "Failed to log parameter search results as an artifact."
                        " Exception: {}".format(str(e)))
                    _logger.warning(msg)