def test_evaluate_custom_metric_success():
    eval_df = pd.DataFrame({"prediction": [1.2, 1.9, 3.2], "target": [1, 2, 3]})
    metrics = _get_regressor_metrics(eval_df["target"], eval_df["prediction"])

    # A custom metric function that returns metrics only (no artifacts).
    def example_custom_metric(_, given_metrics):
        return {
            "example_count_times_1_point_5": given_metrics["example_count"] * 1.5,
            "sum_on_label_minus_5": given_metrics["sum_on_label"] - 5,
            "example_np_metric_1": np.float32(123.2),
            "example_np_metric_2": np.ulonglong(10000000),
        }

    res_metrics, res_artifacts = _evaluate_custom_metric(
        _CustomMetric(example_custom_metric, "example_custom_metric", 0, ""), eval_df, metrics
    )
    assert res_metrics == {
        "example_count_times_1_point_5": metrics["example_count"] * 1.5,
        "sum_on_label_minus_5": metrics["sum_on_label"] - 5,
        "example_np_metric_1": np.float32(123.2),
        "example_np_metric_2": np.ulonglong(10000000),
    }
    assert res_artifacts is None

    # A custom metric function that returns both metrics and artifacts.
    def example_custom_metric_with_artifacts(given_df, given_metrics):
        return (
            {
                "example_count_times_1_point_5": given_metrics["example_count"] * 1.5,
                "sum_on_label_minus_5": given_metrics["sum_on_label"] - 5,
                "example_np_metric_1": np.float32(123.2),
                "example_np_metric_2": np.ulonglong(10000000),
            },
            {
                "pred_target_abs_diff": np.abs(given_df["prediction"] - given_df["target"]),
                "example_dictionary_artifact": {"a": 1, "b": 2},
            },
        )

    res_metrics_2, res_artifacts_2 = _evaluate_custom_metric(
        _CustomMetric(
            example_custom_metric_with_artifacts, "example_custom_metric_with_artifacts", 0, ""
        ),
        eval_df,
        metrics,
    )
    assert res_metrics_2 == {
        "example_count_times_1_point_5": metrics["example_count"] * 1.5,
        "sum_on_label_minus_5": metrics["sum_on_label"] - 5,
        "example_np_metric_1": np.float32(123.2),
        "example_np_metric_2": np.ulonglong(10000000),
    }
    assert "pred_target_abs_diff" in res_artifacts_2
    assert res_artifacts_2["pred_target_abs_diff"].equals(
        np.abs(eval_df["prediction"] - eval_df["target"])
    )
    assert "example_dictionary_artifact" in res_artifacts_2
    assert res_artifacts_2["example_dictionary_artifact"] == {"a": 1, "b": 2}
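

# Illustrative parametrization for the lambda test below. The concrete cases are
# assumed examples, not taken from the original suite: each lambda returns a
# malformed result and is paired with the exception context expected from
# _evaluate_custom_metric.
@pytest.mark.parametrize(
    ("fn", "expectation"),
    [
        (
            lambda eval_df, _: 3,
            pytest.raises(MlflowException, match="did not return in an expected format"),
        ),
        (
            lambda eval_df, _: ("stuff", 3),
            pytest.raises(MlflowException, match="did not return in an expected format"),
        ),
        (
            lambda eval_df, _: {123: 123},
            pytest.raises(
                MlflowException,
                match="did not return metrics as a dictionary of string metric names",
            ),
        ),
    ],
)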
def test_evaluate_custom_metric_lambda(fn, expectation):
    eval_df = pd.DataFrame({"prediction": [1.2, 1.9, 3.2], "target": [1, 2, 3]})
    metrics = _get_regressor_metrics(eval_df["target"], eval_df["prediction"])
    with expectation:
        _evaluate_custom_metric(_CustomMetric(fn, fn.__name__, 0, ""), eval_df, metrics)


def test_evaluate_custom_metric_incorrect_return_formats():
    eval_df = pd.DataFrame({"prediction": [1.2, 1.9, 3.2], "target": [1, 2, 3]})
    metrics = _get_regressor_metrics(eval_df["target"], eval_df["prediction"])

    def dummy_fn(*_):
        pass

    with pytest.raises(
        MlflowException,
        match=f"'{dummy_fn.__name__}' (.*) returned None",
    ):
        _evaluate_custom_metric(_CustomMetric(dummy_fn, "dummy_fn", 0, ""), eval_df, metrics)

    def incorrect_return_type_1(*_):
        return 3

    def incorrect_return_type_2(*_):
        return "stuff", 3

    for test_fn in (
        incorrect_return_type_1,
        incorrect_return_type_2,
    ):
        with pytest.raises(
            MlflowException,
            match=f"'{test_fn.__name__}' (.*) did not return in an expected format",
        ):
            _evaluate_custom_metric(
                _CustomMetric(test_fn, test_fn.__name__, 0, ""), eval_df, metrics
            )

    def non_str_metric_name(*_):
        return {123: 123, "a": 32.1, "b": 3}

    def non_numerical_metric_value(*_):
        return {"stuff": 12, "non_numerical_metric": "123"}

    for test_fn in (
        non_str_metric_name,
        non_numerical_metric_value,
    ):
        with pytest.raises(
            MlflowException,
            match=f"'{test_fn.__name__}' (.*) did not return metrics as a dictionary of "
            "string metric names with numerical values",
        ):
            _evaluate_custom_metric(
                _CustomMetric(test_fn, test_fn.__name__, 0, ""), eval_df, metrics
            )

    def non_str_artifact_name(*_):
        return {"a": 32.1, "b": 3}, {1: [1, 2, 3]}

    with pytest.raises(
        MlflowException,
        match=f"'{non_str_artifact_name.__name__}' (.*) did not return artifacts as a "
        "dictionary of string artifact names with their corresponding objects",
    ):
        _evaluate_custom_metric(
            _CustomMetric(non_str_artifact_name, non_str_artifact_name.__name__, 0, ""),
            eval_df,
            metrics,
        )