def test_only_score_contains_sample_weight():
    """Autologged scoring passes ``sample_weight=None`` when only `score` accepts it."""
    mlflow.sklearn.autolog()

    from sklearn.gaussian_process import GaussianProcessRegressor

    # Sanity-check the model choice for this scenario:
    # `fit` must NOT accept `sample_weight` while `score` does.
    assert "sample_weight" not in _get_arg_names(GaussianProcessRegressor.fit)
    assert "sample_weight" in _get_arg_names(GaussianProcessRegressor.score)

    score_recorder = mock.Mock()

    def mock_score(self, X, y, sample_weight=None):  # pylint: disable=unused-argument
        score_recorder(X, y, sample_weight)
        return 0

    # The replacement must preserve the real `score` signature exactly,
    # since autologging inspects it to build the call.
    assert inspect.signature(GaussianProcessRegressor.score) == inspect.signature(mock_score)
    GaussianProcessRegressor.score = mock_score

    model = GaussianProcessRegressor()
    X, y = get_iris()

    with mlflow.start_run() as run:
        model.fit(X, y)
        # Autologging should have invoked `score` with a None sample weight.
        score_recorder.assert_called_once_with(X, y, None)

    run_id = run.info.run_id
    params, metrics, tags, artifacts = get_run_data(run_id)
    assert params == truncate_dict(stringify_dict_values(model.get_params(deep=True)))
    assert {TRAINING_SCORE: model.score(X, y)}.items() <= metrics.items()
    assert tags == get_expected_class_tags(model)
    assert MODEL_DIR in artifacts
    assert_predict_equal(load_model_by_run_id(run_id), model, X)
def test_call_fit_with_arguments_score_does_not_accept():
    """`fit()` arguments that `score()` does not accept must not be forwarded to it."""
    mlflow.sklearn.autolog()

    from sklearn.linear_model import SGDRegressor

    # Sanity-check the model choice for this scenario:
    # `fit` accepts `intercept_init`, `score` does not.
    assert "intercept_init" in _get_arg_names(SGDRegressor.fit)
    assert "intercept_init" not in _get_arg_names(SGDRegressor.score)

    mock_obj = mock.Mock()

    def mock_score(self, X, y, sample_weight=None):  # pylint: disable=unused-argument
        mock_obj(X, y, sample_weight)
        return 0

    # The replacement must preserve the real `score` signature exactly,
    # since autologging inspects it to build the call.
    assert inspect.signature(SGDRegressor.score) == inspect.signature(mock_score)
    SGDRegressor.score = mock_score

    model = SGDRegressor()
    X, y = get_iris()

    with mlflow.start_run() as run:
        model.fit(X, y, intercept_init=0)
        # `intercept_init` must have been dropped from the `score` call.
        mock_obj.assert_called_once_with(X, y, None)

    # Fix: use the public `run.info` accessor instead of the private
    # `run._info` attribute, consistent with the sibling tests.
    run_id = run.info.run_id
    params, metrics, tags, artifacts = get_run_data(run_id)
    assert params == truncate_dict(stringify_dict_values(model.get_params(deep=True)))
    assert metrics == {TRAINING_SCORE: model.score(X, y)}
    assert tags == get_expected_class_tags(model)
    assert MODEL_DIR in artifacts
    assert_predict_equal(load_model_by_run_id(run_id), model, X)
def get_input_example():
    """Build an input example from the head of the array-like training data
    that was supplied to the training routine (e.g., `fit()`)."""
    # The first two parameters of `fit` name the feature matrix and target.
    X_var_name, y_var_name = _get_arg_names(estimator.fit)[:2]
    X = _get_Xy(args, kwargs, X_var_name, y_var_name)[0]
    return X[:INPUT_EXAMPLE_SAMPLE_ROWS]
def test_both_fit_and_score_contain_sample_weight(sample_weight_passed_as):
    """`sample_weight` given to `fit` (positionally or by keyword) reaches `score`."""
    mlflow.sklearn.autolog()

    from sklearn.linear_model import SGDRegressor

    # ensure that we use an appropriate model for this test:
    # both `fit` and `score` must accept `sample_weight`.
    assert "sample_weight" in _get_arg_names(SGDRegressor.fit)
    assert "sample_weight" in _get_arg_names(SGDRegressor.score)

    score_recorder = mock.Mock()

    def mock_score(self, X, y, sample_weight=None):  # pylint: disable=unused-argument
        score_recorder(X, y, sample_weight)
        return 0

    # The replacement must preserve the real `score` signature exactly,
    # since autologging inspects it to build the call.
    assert inspect.signature(SGDRegressor.score) == inspect.signature(mock_score)
    SGDRegressor.score = mock_score

    model = SGDRegressor()
    X, y = get_iris()
    sample_weight = abs(np.random.randn(len(X)))

    with mlflow.start_run() as run:
        if sample_weight_passed_as == "positional":
            model.fit(X, y, None, None, sample_weight)
        elif sample_weight_passed_as == "keyword":
            model.fit(X, y, sample_weight=sample_weight)
        # Either way, the weight must be forwarded to `score`.
        score_recorder.assert_called_once_with(X, y, sample_weight)

    run_id = run.info.run_id
    params, metrics, tags, artifacts = get_run_data(run_id)
    assert params == truncate_dict(stringify_dict_values(model.get_params(deep=True)))
    assert {TRAINING_SCORE: model.score(X, y)}.items() <= metrics.items()
    assert tags == get_expected_class_tags(model)
    assert MODEL_DIR in artifacts
    assert_predict_equal(load_model_by_run_id(run_id), model, X)
def _log_posttraining_metadata(estimator, *args, **kwargs):
    """
    Records metadata for a scikit-learn estimator after training has completed.
    This is intended to be invoked within a patched scikit-learn training
    routine (e.g., `fit()`, `fit_transform()`, ...) and assumes the existence of
    an active MLflow run that can be referenced via the fluent Tracking API.

    :param estimator: The scikit-learn estimator for which to log metadata.
    :param args: The arguments passed to the scikit-learn training routine
                 (e.g., `fit()`, `fit_transform()`, ...).
    :param kwargs: The keyword arguments passed to the scikit-learn training
                   routine.
    """
    # Best-effort computation of the training score: build `score()` arguments
    # from the ones supplied to `fit()`, and skip the metric (with a warning)
    # if scoring fails for any reason.
    if hasattr(estimator, "score"):
        try:
            score_args = _get_args_for_score(estimator.score, estimator.fit, args, kwargs)
            training_score = estimator.score(*score_args)
        except Exception as e:  # pylint: disable=broad-except
            msg = (
                estimator.score.__qualname__
                + " failed. The 'training_score' metric will not be recorded. Scoring error: "
                + str(e))
            _logger.warning(msg)
        else:
            # Only logged when scoring succeeded.
            try_mlflow_log(mlflow.log_metric, "training_score", training_score)

    # log common metrics and artifacts for estimators (classifier, regressor)
    _log_specialized_estimator_content(estimator, mlflow.active_run().info.run_id, args, kwargs)

    # Input example and model signature are optional extras for `log_model`;
    # both remain None if inference below fails or `predict` is unavailable.
    input_example = None
    signature = None
    if hasattr(estimator, "predict"):
        try:
            # Fetch an input example using the first several rows of the array-like
            # training data supplied to the training routine (e.g., `fit()`)
            SAMPLE_ROWS = 5
            fit_arg_names = _get_arg_names(estimator.fit)
            # The first two `fit` parameters name the feature matrix and target.
            X_var_name, y_var_name = fit_arg_names[:2]
            input_example = _get_Xy(args, kwargs, X_var_name, y_var_name)[0][:SAMPLE_ROWS]

            model_output = estimator.predict(input_example)
            signature = infer_signature(input_example, model_output)
        except Exception as e:  # pylint: disable=broad-except
            # Reset so a partially-built example is not logged without a signature.
            input_example = None
            msg = "Failed to infer an input example and model signature: " + str(e)
            _logger.warning(msg)

    # Log the fitted estimator itself (signature/input_example may be None).
    try_mlflow_log(
        log_model,
        estimator,
        artifact_path="model",
        signature=signature,
        input_example=input_example,
    )

    # Extra logging for parameter-search estimators (e.g., GridSearchCV):
    # the refitted best estimator, its parameters, and per-candidate child runs.
    if _is_parameter_search_estimator(estimator):
        if hasattr(estimator, "best_estimator_"):
            try_mlflow_log(
                log_model,
                estimator.best_estimator_,
                artifact_path="best_estimator",
                signature=signature,
                input_example=input_example,
            )

        if hasattr(estimator, "best_params_"):
            # Prefix each best parameter name with "best_" to avoid colliding
            # with the estimator's own logged parameters.
            best_params = {
                "best_{param_name}".format(param_name=param_name): param_value
                for param_name, param_value in estimator.best_params_.items()
            }
            try_mlflow_log(mlflow.log_params, best_params)

        if hasattr(estimator, "cv_results_"):
            try:
                # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
                # information is consistent with the parent run
                environment_tags = context_registry.resolve_tags()
                _create_child_runs_for_parameter_search(
                    cv_estimator=estimator,
                    parent_run=mlflow.active_run(),
                    child_tags=environment_tags,
                )
            except Exception as e:  # pylint: disable=broad-except
                msg = (
                    "Encountered exception during creation of child runs for parameter search."
                    " Child runs may be missing. Exception: {}".format(str(e)))
                _logger.warning(msg)

            # Persist the full cross-validation results table as an artifact;
            # failures here are non-fatal as well.
            try:
                cv_results_df = pd.DataFrame.from_dict(estimator.cv_results_)
                _log_parameter_search_results_as_artifact(
                    cv_results_df, mlflow.active_run().info.run_id)
            except Exception as e:  # pylint: disable=broad-except
                msg = (
                    "Failed to log parameter search results as an artifact."
                    " Exception: {}".format(str(e)))
                _logger.warning(msg)