def test_spark_udf_autofills_column_names_with_schema(spark):
    """Check how a spark UDF maps positional columns onto a named signature.

    The logged model echoes the column names it receives, so the returned
    value shows which names the UDF assigned to the supplied columns.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # One copy of the received column names per input row.
            return [model_input.columns] * len(model_input)

    named_inputs = Schema([ColSpec("long", name) for name in ("a", "b", "c")])
    signature = ModelSignature(inputs=named_inputs, outputs=Schema([ColSpec("integer")]))

    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        model_uri = "runs:/{}/model".format(run.info.run_id)
        udf = mlflow.pyfunc.spark_udf(spark, model_uri, result_type=ArrayType(StringType()))

        pandas_frame = pd.DataFrame(
            columns=["a", "b", "c", "d"],
            data={"a": [1], "b": [2], "c": [3], "d": [4]},
        )
        data = spark.createDataFrame(pandas_frame)

        # Fewer columns than the signature declares: evaluation must fail.
        with pytest.raises(pyspark.sql.utils.PythonException):
            data.withColumn("res1", udf("a", "b")).select("res1").toPandas()

        # Exactly the declared columns.
        exact = data.withColumn("res2", udf("a", "b", "c")).select("res2").toPandas()
        assert exact["res2"][0] == ["a", "b", "c"]

        # One extra column: the model still only sees the declared ones.
        surplus = data.withColumn("res4", udf("a", "b", "c", "d")).select("res4").toPandas()
        assert surplus["res4"][0] == ["a", "b", "c"]
def test_spark_udf_autofills_no_arguments(spark):
    """Check calling a spark UDF with no arguments for several signatures.

    Covers three models: one with a named column signature, one with an
    unnamed column signature, and one logged without any signature.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # One copy of the received column names per input row.
            return [model_input.columns] * len(model_input)

    string_array = ArrayType(StringType())
    integer_output = Schema([ColSpec("integer")])

    signature = ModelSignature(
        inputs=Schema([ColSpec("long", name) for name in ("a", "b", "c")]),
        outputs=integer_output,
    )

    good_data = spark.createDataFrame(
        pd.DataFrame(
            columns=["a", "b", "c", "d"],
            data={"a": [1], "b": [2], "c": [3], "d": [4]},
        )
    )

    # --- model with a named signature -------------------------------------
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        model_uri = "runs:/{}/model".format(run.info.run_id)
        udf = mlflow.pyfunc.spark_udf(spark, model_uri, result_type=string_array)

        res = good_data.withColumn("res", udf()).select("res").toPandas()
        assert res["res"][0] == ["a", "b", "c"]

        with pytest.raises(
            pyspark.sql.utils.PythonException,
            match=r"Model input is missing columns. Expected 3 input columns",
        ):
            good_data.withColumn("res", udf("b", "c")).select("res").toPandas()

        # this dataframe won't work because it's missing column a
        bad_data = spark.createDataFrame(
            pd.DataFrame(
                columns=["x", "b", "c", "d"],
                data={"x": [1], "b": [2], "c": [3], "d": [4]},
            )
        )
        with pytest.raises(AnalysisException, match=r"cannot resolve 'a' given input columns"):
            bad_data.withColumn("res", udf())

    # --- model whose signature has no column names ------------------------
    nameless_signature = ModelSignature(
        inputs=Schema([ColSpec("long"), ColSpec("long"), ColSpec("long")]),
        outputs=integer_output,
    )
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=nameless_signature)
        model_uri = "runs:/{}/model".format(run.info.run_id)
        udf = mlflow.pyfunc.spark_udf(spark, model_uri, result_type=string_array)
        with pytest.raises(
            MlflowException,
            match=r"Cannot apply udf because no column names specified",
        ):
            good_data.withColumn("res", udf())

    # --- model without signature -------------------------------------------
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel())
        model_uri = "runs:/{}/model".format(run.info.run_id)
        udf = mlflow.pyfunc.spark_udf(spark, model_uri, result_type=string_array)
        with pytest.raises(MlflowException, match="Attempting to apply udf on zero columns"):
            good_data.withColumn("res", udf()).select("res").toPandas()
def test_parse_with_schema(pandas_df_with_all_types):
    """Check that ``parse_json_input`` applies the schema's types while parsing.

    The first half round-trips a frame covering every column type through the
    "split" and "records" orientations. The second half pins down the lossy
    coercions that currently happen when the payload does not fit the schema.
    """
    schema = Schema([ColSpec(c, c) for c in pandas_df_with_all_types.columns])
    df = _shuffle_pdf(pandas_df_with_all_types)
    json_str = json.dumps(df.to_dict(orient="split"), cls=NumpyEncoder)
    df = pyfunc_scoring_server.parse_json_input(json_str, orient="split", schema=schema)
    json_str = json.dumps(df.to_dict(orient="records"), cls=NumpyEncoder)
    df = pyfunc_scoring_server.parse_json_input(json_str, orient="records", schema=schema)
    assert schema == infer_signature(df[schema.input_names()]).inputs

    # The current behavior with pandas json parse with type hints is weird. In some cases, the
    # types are forced ignoring overflow and loss of precision:
    bad_df = (
        '{ "columns":["bad_integer", "bad_float", "bad_string", "bad_boolean"], '
        '"data":[ [9007199254740991.0, 1.1, 1, 1.5], '
        "[9007199254740992.0, 9007199254740992.0, 2, 0], "
        '[9007199254740994.0, 3.3, 3, "some arbitrary string"] ] }'
    )
    # NB: "good_float" is declared in the schema but absent from the payload.
    schema = Schema(
        [
            ColSpec("integer", "bad_integer"),
            ColSpec("float", "bad_float"),
            ColSpec("float", "good_float"),
            ColSpec("string", "bad_string"),
            ColSpec("boolean", "bad_boolean"),
        ]
    )
    df = pyfunc_scoring_server.parse_json_input(bad_df, orient="split", schema=schema)

    # Unfortunately, the current behavior of pandas parse is to force numbers to int32 even if
    # they don't fit:
    assert df["bad_integer"].dtype == np.int32
    assert all(df["bad_integer"] == [-2147483648, -2147483648, -2147483648])

    # The same goes for floats:
    assert df["bad_float"].dtype == np.float32
    assert all(df["bad_float"] == np.array([1.1, 9007199254740992, 3.3], dtype=np.float32))

    # However bad string is recognized as int64.
    # The builtin ``object`` replaces the ``np.object`` alias, which newer
    # NumPy releases no longer provide.
    assert all(df["bad_string"] == np.array([1, 2, 3], dtype=object))

    # Boolean is forced - zero and empty string is false, everything else is true.
    # The builtin ``bool`` replaces the ``np.bool`` alias for the same reason.
    assert df["bad_boolean"].dtype == bool
    assert all(df["bad_boolean"] == [True, False, True])
def _enforce_schema(pdf: pandas.DataFrame, input_schema: Schema):
    """
    Validate ``pdf`` against ``input_schema`` and return a conforming DataFrame.

    Column names: when the schema names its columns, every declared column must
    be present in the input; the result contains exactly the declared columns in
    schema order and extra input columns are dropped. When the schema is
    unnamed, only the column count is checked and the leading input columns are
    used.

    Column types: each selected column is passed through ``_enforce_type``
    together with the type the schema declares for it.

    :param pdf: Input data; a list is first converted to a DataFrame.
    :param input_schema: Schema the input has to satisfy.
    :return: A new DataFrame holding the selected, type-enforced columns.
    :raises MlflowException: If the input is neither a DataFrame nor a list, or
                             if columns required by the schema are missing.
    """
    if isinstance(pdf, list):
        pdf = pandas.DataFrame(pdf)
    if not isinstance(pdf, pandas.DataFrame):
        raise MlflowException(
            "Expected input to be DataFrame or list. Found: %s" % type(pdf).__name__
        )

    declared_count = len(input_schema.columns)
    if not input_schema.has_column_names():
        # The model signature does not specify column names => we can only verify column count.
        if len(pdf.columns) < declared_count:
            raise MlflowException(
                (
                    "Model input is missing input columns. The model signature declares "
                    "{0} input columns but the provided input only has "
                    "{1} columns. Note: the columns were not named in the signature so we can "
                    "only verify their count."
                ).format(declared_count, len(pdf.columns))
            )
        col_names = pdf.columns[:declared_count]
    else:
        col_names = input_schema.column_names()
        declared = set(col_names)
        provided = set(pdf.columns)
        # Report names in their original order (schema order for missing columns,
        # input order for extra ones) to keep the message easy to read.
        missing_cols = [name for name in col_names if name not in provided]
        extra_cols = [name for name in pdf.columns if name not in declared]
        if missing_cols:
            raise MlflowException(
                "Model input is missing columns {0}."
                " Note that there were extra columns: {1}".format(missing_cols, extra_cols)
            )

    col_types = input_schema.column_types()
    new_pdf = pandas.DataFrame()
    for name, col_type in zip(col_names, col_types):
        new_pdf[name] = _enforce_type(name, pdf[name], col_type)
    return new_pdf
def test_missing_value_hint_is_displayed_when_it_should():
    """Check when the "missing values" hint is appended to a type-mismatch error.

    All three inputs are rejected for a schema column declared as ``integer``.
    The hint is expected only for the integer data that contains a ``None``.
    """
    m = Model()
    input_schema = Schema([ColSpec("integer", "a")])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    hint = "Hint: the type mismatch is likely caused by missing values."

    # Integer values with a missing entry: the hint is shown.
    pdf = pd.DataFrame(data=[[1], [None]], columns=["a"])
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value.message)
    assert hint in str(ex.value.message)

    # Non-integral values with a missing entry: no hint.
    pdf = pd.DataFrame(data=[[1.5], [None]], columns=["a"])
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    # Inspect the exception message itself rather than str(ex): the latter is
    # the repr of pytest's ExceptionInfo wrapper, which may be abbreviated.
    assert "Incompatible input types" in str(ex.value.message)
    assert hint not in str(ex.value.message)

    # Integral values stored as float64 without missing entries: no hint.
    pdf = pd.DataFrame(data=[[1], [2]], columns=["a"], dtype=np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value.message)
    assert hint not in str(ex.value.message)
def test_serving_model_with_schema(pandas_df_with_all_types):
    """Check that a served model receives columns typed according to its signature.

    The model answers with ``[column name, dtype string]`` pairs for the frame
    it received; both JSON orientations must reproduce the fixture's dtypes.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            return [[k, str(v)] for k, v in model_input.dtypes.items()]

    schema = Schema([ColSpec(c, c) for c in pandas_df_with_all_types.columns])
    df = _shuffle_pdf(pandas_df_with_all_types)
    expected = [[name, str(dtype)] for name, dtype in pandas_df_with_all_types.dtypes.items()]

    with TempDir(chdr=True):
        with mlflow.start_run() as run:
            mlflow.pyfunc.log_model(
                "model", python_model=TestModel(), signature=ModelSignature(schema)
            )
        model_uri = "runs:/{}/model".format(run.info.run_id)

        # (payload, content type): shuffled frame in "split" orientation first,
        # then the unshuffled fixture in "records" orientation.
        requests = (
            (
                df.to_dict(orient="split"),
                pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED,
            ),
            (
                pandas_df_with_all_types.to_dict(orient="records"),
                pyfunc_scoring_server.CONTENT_TYPE_JSON_RECORDS_ORIENTED,
            ),
        )
        for payload, content_type in requests:
            response = pyfunc_serve_and_score_model(
                model_uri=model_uri,
                data=json.dumps(payload, cls=NumpyEncoder),
                content_type=content_type,
                extra_args=["--no-conda"],
            )
            assert json.loads(response.content) == expected
def test_tensor_schema_enforcement_no_col_names():
    """Check enforcement of a single unnamed float32 tensor spec of shape (-1, 3)."""
    m = Model()
    input_schema = Schema([TensorSpec(np.dtype(np.float32), (-1, 3))])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())

    rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
    test_data = np.array(rows, dtype=np.float32)

    # Can call with numpy array of correct shape
    assert np.array_equal(pyfunc_model.predict(test_data), test_data)

    # Or can call with a dataframe
    assert np.array_equal(pyfunc_model.predict(pd.DataFrame(test_data)), test_data)

    # Can not call with a list, nor with a dict
    unnamed_signature_error = (
        "This model contains a tensor-based model signature with no input names"
    )
    for rejected_input in (rows, {"blah": test_data}):
        with pytest.raises(MlflowException, match=unnamed_signature_error):
            pyfunc_model.predict(rejected_input)

    # Can not call with a np.ndarray of a wrong shape
    wrong_shape = np.array([[1.0, 2.0], [4.0, 5.0]])
    with pytest.raises(
        MlflowException,
        match=re.escape("Shape of input (2, 2) does not match expected shape (-1, 3)"),
    ):
        pyfunc_model.predict(wrong_shape)

    # Can not call with a np.ndarray of a wrong type
    with pytest.raises(
        MlflowException,
        match="dtype of input uint32 does not match expected dtype float32",
    ):
        pyfunc_model.predict(test_data.astype(np.uint32))

    # Can call with a np.ndarray with more elements along variable axis
    test_data2 = np.array(rows + [[7.0, 8.0, 9.0]], dtype=np.float32)
    assert np.array_equal(pyfunc_model.predict(test_data2), test_data2)

    # Can not call with an empty ndarray
    with pytest.raises(
        MlflowException,
        match=re.escape("Shape of input () does not match expected shape (-1, 3)"),
    ):
        pyfunc_model.predict(np.ndarray([]))
def test_parse_tf_serving_dictionary():
    """Check ``parse_tf_serving_input`` on named inputs, with and without a schema.

    The same data is sent once in the row-wise "instances" format and once in
    the columnar "inputs" format; both must produce the same name -> array dict.
    """

    def assert_result(result, expected_result):
        assert result.keys() == expected_result.keys()
        for key, actual in result.items():
            assert (actual == expected_result[key]).all()

    # instances are correctly aggregated to dict of input name -> tensor
    instances_payload = {
        "instances": [
            {"a": "s1", "b": 1.1, "c": [1, 2, 3]},
            {"a": "s2", "b": 2.2, "c": [4, 5, 6]},
            {"a": "s3", "b": 3.3, "c": [7, 8, 9]},
        ]
    }
    # input provided as a dict
    inputs_payload = {
        "inputs": {
            "a": ["s1", "s2", "s3"],
            "b": [1.1, 2.2, 3.3],
            "c": [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        }
    }

    names = np.array(["s1", "s2", "s3"])
    matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    expected_result_no_schema = {
        "a": names,
        "b": np.array([1.1, 2.2, 3.3], dtype="float64"),
        "c": np.array(matrix, dtype="int64"),
    }
    expected_result_schema = {
        "a": names,
        "b": np.array([1.1, 2.2, 3.3], dtype="float32"),
        "c": np.array(matrix, dtype="int32"),
    }
    schema = Schema(
        [
            TensorSpec(np.dtype("object"), [-1], "a"),
            TensorSpec(np.dtype("float32"), [-1], "b"),
            TensorSpec(np.dtype("int32"), [-1], "c"),
        ]
    )

    for payload in (instances_payload, inputs_payload):
        # Without Schema
        assert_result(parse_tf_serving_input(payload), expected_result_no_schema)
        # With Schema
        assert_result(parse_tf_serving_input(payload, schema), expected_result_schema)
def parse_csv_input(csv_input, schema: Schema = None):
    """
    Parse CSV content into a Pandas DataFrame.

    Any exception raised while parsing is caught and reported through
    ``_handle_serving_error`` with the ``BAD_REQUEST`` error code.

    :param csv_input: A CSV-formatted string representation of a Pandas DataFrame, or a stream
                      containing such a string representation.
    :param schema: Optional schema specification to be used during parsing. When given, its
                   input names and pandas types are passed to ``pandas.read_csv`` as the
                   per-column ``dtype`` mapping.
    :return: The parsed DataFrame.
    """
    try:
        if schema is not None:
            column_dtypes = dict(zip(schema.input_names(), schema.pandas_types()))
            return pd.read_csv(csv_input, dtype=column_dtypes)
        return pd.read_csv(csv_input)
    except Exception:
        # NOTE(review): presumably _handle_serving_error raises; otherwise this
        # function falls through and returns None - confirm against its definition.
        _handle_serving_error(
            error_message=(
                "Failed to parse input as a Pandas DataFrame. Ensure that the input is"
                " a valid CSV-formatted Pandas DataFrame produced using the"
                " `pandas.DataFrame.to_csv()` method."
            ),
            error_code=BAD_REQUEST,
        )
def test_spark_udf_with_datetime_columns(spark):
    """Check that a spark UDF accepts timestamp and date columns for "datetime" inputs."""

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # One copy of the received column names per input row.
            return [model_input.columns] * len(model_input)

    datetime_inputs = Schema([ColSpec("datetime", "timestamp"), ColSpec("datetime", "date")])
    signature = ModelSignature(inputs=datetime_inputs, outputs=Schema([ColSpec("integer")]))

    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        model_uri = "runs:/{}/model".format(run.info.run_id)
        udf = mlflow.pyfunc.spark_udf(spark, model_uri, result_type=ArrayType(StringType()))

        data = spark.range(10).selectExpr(
            "current_timestamp() as timestamp",
            "current_date() as date",
        )
        scored = data.withColumn("res", udf("timestamp", "date")).select("res")
        res = scored.toPandas()
        assert res["res"][0] == ["timestamp", "date"]
def test_schema_enforcement_single_named_tensor_schema():
    """Check enforcement of a schema holding one named uint64 tensor of shape (-1, 2)."""
    m = Model()
    input_schema = Schema([TensorSpec(np.dtype(np.uint64), (-1, 2), "a")])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())

    tensor = np.array([[0, 0], [1, 1]], dtype=np.uint64)
    inp = {"a": tensor}

    # First the dictionary itself (sanity check), then the bare np.ndarray, which
    # is expected to be converted to the same dictionary.
    for model_input in (inp, tensor):
        res = pyfunc_model.predict(model_input)
        assert res == inp
        expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
        actual_types = {name: value.dtype for name, value in res.items()}
        assert expected_types == actual_types

    # test list does not work
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict([[0, 0], [1, 1]])
    assert "Model is missing inputs ['a']" in str(ex)
def test_parse_tf_serving_single_array():
    """Check ``parse_tf_serving_input`` on a bare nested list.

    The same 2x3x3 list is sent under the "instances" key and under the
    "inputs" key, without a schema, with an unnamed tensor schema and with a
    named tensor schema.
    """

    def assert_result(result, expected_result):
        assert (result == expected_result).all()

    # values for each column are properly converted to a tensor
    arr = [
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        [[3, 2, 1], [6, 5, 4], [9, 8, 7]],
    ]
    payloads = ({"instances": arr}, {"inputs": arr})
    as_int64 = np.array(arr, dtype="int64")
    as_float32 = np.array(arr, dtype="float32")

    # Without schema
    for payload in payloads:
        parsed = parse_tf_serving_input(payload)
        assert parsed.shape == (2, 3, 3)
        assert_result(parsed, as_int64)

    # Unnamed schema
    schema = Schema([TensorSpec(np.dtype("float32"), [-1])])
    for payload in payloads:
        assert_result(parse_tf_serving_input(payload, schema), as_float32)

    # named schema
    schema = Schema([TensorSpec(np.dtype("float32"), [-1], "a")])
    for payload in payloads:
        parsed = parse_tf_serving_input(payload, schema)
        assert isinstance(parsed, dict)
        assert len(parsed.keys()) == 1 and "a" in parsed
        assert_result(parsed["a"], as_float32)
def test_schema_enforcement_named_tensor_schema_1d():
    """Check enforcement of two named 1-d tensor specs for DataFrame and dict input."""
    m = Model()
    input_schema = Schema(
        [
            TensorSpec(np.dtype(np.uint64), (-1,), "a"),
            TensorSpec(np.dtype(np.float32), (-1,), "b"),
        ]
    )
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())

    pdf = pd.DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"])
    pdf["a"] = pdf["a"].astype(np.uint64)
    # Cast column "b" from its own values (both columns hold the same numbers).
    pdf["b"] = pdf["b"].astype(np.float32)
    d_inp = {
        "a": np.array(pdf["a"], dtype=np.uint64),
        "b": np.array(pdf["b"], dtype=np.float32),
    }

    # test dataframe input works for 1d tensor specs and input is converted to dict
    res = pyfunc_model.predict(pdf)
    assert _compare_exact_tensor_dict_input(res, d_inp)
    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types

    # test that dictionary works too
    res = pyfunc_model.predict(d_inp)
    # Compare with the tensor-dict helper: plain ``==`` on dicts of arrays only
    # succeeds when the very same array objects are returned.
    assert _compare_exact_tensor_dict_input(res, d_inp)
    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types
def test_schema_enforcement_no_col_names():
    """Check enforcement of a column schema of three unnamed ``double`` columns."""

    class TestModel:
        @staticmethod
        def predict(pdf):
            return pdf

    m = Model()
    input_schema = Schema([ColSpec("double"), ColSpec("double"), ColSpec("double")])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())

    test_data = [[1.0, 2.0, 3.0]]
    expected = pd.DataFrame(test_data)

    # Can call with just a list
    assert pyfunc_model.predict(test_data).equals(expected)

    # Or can call with a DataFrame without column names
    assert pyfunc_model.predict(pd.DataFrame(test_data)).equals(expected)

    # Or can call with a np.ndarray
    as_ndarray = pd.DataFrame(test_data).values
    assert pyfunc_model.predict(as_ndarray).equals(expected)

    # Or with column names!
    pdf = pd.DataFrame(data=test_data, columns=["a", "b", "c"])
    assert pyfunc_model.predict(pdf).equals(pdf)

    # Must provide the right number of arguments
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict([[1.0, 2.0]])
    assert "the provided input only has 2 columns." in str(ex)

    # Must provide the right types
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict([[1, 2, 3]])
    assert "Can not safely convert int64 to float64" in str(ex)

    # Can only provide data type that can be converted to dataframe...
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict({1, 2, 3})
    assert "Expected input to be DataFrame or list. Found: set" in str(ex)

    # dictionaries of str -> list/nparray work
    d = {"a": [1.0], "b": [2.0], "c": [3.0]}
    assert pyfunc_model.predict(d).equals(pd.DataFrame(d))
def test_dataframe_from_json():
    """Check ``_dataframe_from_json`` for column schemas and tensor schemas.

    Every case is run for both the "split" and the "records" orientation.
    """
    orientations = ("split", "records")

    column_order = [
        "boolean",
        "string",
        "float",
        "double",
        "integer",
        "long",
        "binary",
        "date_string",
    ]
    source = pd.DataFrame(
        {
            "boolean": [True, False, True],
            "string": ["a", "b", "c"],
            "float": np.array([1.2, 2.3, 3.4], dtype=np.float32),
            "double": np.array([1.2, 2.3, 3.4], dtype=np.float64),
            "integer": np.array([3, 4, 5], dtype=np.int32),
            "long": np.array([3, 4, 5], dtype=np.int64),
            "binary": [bytes([1, 2, 3]), bytes([4, 5]), bytes([6])],
            "date_string": ["2018-02-03", "1996-03-02", "2021-03-05"],
        },
        columns=column_order,
    )

    # JSON cannot carry raw bytes, so the binary column is sent base64 encoded.
    jsonable_df = pd.DataFrame(source, copy=True)
    jsonable_df["binary"] = jsonable_df["binary"].map(base64.b64encode)

    schema = Schema(
        [
            ColSpec("boolean", "boolean"),
            ColSpec("string", "string"),
            ColSpec("float", "float"),
            ColSpec("double", "double"),
            ColSpec("integer", "integer"),
            ColSpec("long", "long"),
            ColSpec("binary", "binary"),
            ColSpec("string", "date_string"),
        ]
    )
    for orient in orientations:
        parsed = _dataframe_from_json(
            jsonable_df.to_json(orient=orient), pandas_orient=orient, schema=schema
        )
        assert parsed.equals(source)

    # try parsing with tensor schema
    tensor_schema = Schema(
        [
            TensorSpec(np.dtype("bool"), [-1], "boolean"),
            TensorSpec(np.dtype("str"), [-1], "string"),
            TensorSpec(np.dtype("float32"), [-1], "float"),
            TensorSpec(np.dtype("float64"), [-1], "double"),
            TensorSpec(np.dtype("int32"), [-1], "integer"),
            TensorSpec(np.dtype("int64"), [-1], "long"),
            TensorSpec(np.dtype(bytes), [-1], "binary"),
        ]
    )
    for orient in orientations:
        parsed = _dataframe_from_json(
            jsonable_df.to_json(orient=orient), pandas_orient=orient, schema=tensor_schema
        )
        # NB: tensor schema does not automatically decode base64 encoded bytes.
        assert parsed.equals(jsonable_df)

    # Test parse with TensorSchema with a single tensor
    tensor_schema = Schema([TensorSpec(np.dtype("float32"), [-1, 3])])
    source = pd.DataFrame(
        {
            "a": np.array([1, 2, 3], dtype=np.float32),
            "b": np.array([4.1, 5.2, 6.3], dtype=np.float32),
            "c": np.array([7, 8, 9], dtype=np.float32),
        },
        columns=["a", "b", "c"],
    )
    for orient in orientations:
        reparsed = _dataframe_from_json(
            source.to_json(orient=orient), pandas_orient=orient, schema=tensor_schema
        )
        assert source.equals(reparsed)
# Prepare dataset try: repo_url = "https://raw.githubusercontent.com/prinz-nussknacker" csv_url = f"{repo_url}/banksim1/master/bs140513_032310.csv" data = pd.read_csv(csv_url, sep=",", quotechar="'", header=0) except Exception as e: logger.exception("Could not read CSV file: {}".format(e)) exit(1) data.dropna() data = data.drop(["step", "customer", "zipcodeOri", "merchant", "zipMerchant"], axis="columns") input_schema = Schema([ ColSpec("string", "age"), ColSpec("string", "gender"), ColSpec("string", "category"), ColSpec("double", "amount") ]) output_schema = Schema([ColSpec("integer")]) signature = ModelSignature(inputs=input_schema, outputs=output_schema) # Prepare train and test sets data_x = data.drop(["fraud"], axis="columns") data_y = data[["fraud"]] train_x, test_x, train_y, test_y = train_test_split(data_x, data_y) with mlflow.start_run(): # Define pipeline numeric_features = ['amount'] numeric_transformer = Pipeline( steps=[('imputer',
def test_schema_enforcement():
    """Check column-based schema enforcement performed by ``PyFuncModel.predict``.

    The wrapped model returns its input unchanged, so the prediction result is
    the frame produced by schema enforcement. In the numbered cases,
    "X -> Y" means: data of type X is supplied for a column declared as Y.
    After every case the touched column is restored to its declared type so
    that each case exercises exactly one column.
    """

    class TestModel(object):
        @staticmethod
        def predict(pdf):
            return pdf

    m = Model()
    input_schema = Schema(
        [
            ColSpec("integer", "a"),
            ColSpec("long", "b"),
            ColSpec("float", "c"),
            ColSpec("double", "d"),
            ColSpec("boolean", "e"),
            ColSpec("string", "g"),
            ColSpec("binary", "f"),
        ]
    )
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    # Columns are deliberately listed in a different order than in the schema.
    # The builtin ``object`` is used instead of the ``np.object`` alias, which
    # newer NumPy releases no longer provide.
    pdf = pd.DataFrame(
        data=[[1, 2, 3, 4, True, "x", bytes([1])]],
        columns=["b", "d", "a", "c", "e", "g", "f"],
        dtype=object,
    )
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.int64)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.float64)

    # test that missing column raises
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf[["b", "d", "a", "e", "g", "f"]])
    assert "Model input is missing columns" in str(ex)

    # test that extra column is ignored
    pdf["x"] = 1

    # test that columns are reordered, extra column is ignored
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.column_names()]).all())
    expected_types = dict(zip(input_schema.column_names(), input_schema.pandas_types()))
    actual_types = res.dtypes.to_dict()
    assert expected_types == actual_types

    # Test conversions
    # 1. long -> integer raises
    pdf["a"] = pdf["a"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 2. integer -> long works
    pdf["b"] = pdf["b"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.column_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 3. double -> float raises
    pdf["c"] = pdf["c"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 4. float -> double works
    pdf["d"] = pdf["d"].astype(np.float32)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    # Restore the declared type so the next case is judged on column "c" alone.
    pdf["d"] = pdf["d"].astype(np.float64)

    # 5. ints -> floats raises
    pdf["c"] = pdf["c"].astype(np.int32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["c"] = pdf["c"].astype(np.float32)

    pdf["d"] = pdf["d"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["d"] = pdf["d"].astype(np.float64)

    # 6. floats -> ints raises
    pdf["a"] = pdf["a"].astype(np.float32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.int32)

    pdf["b"] = pdf["b"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["b"] = pdf["b"].astype(np.int64)

    # 7. objects work
    for column in ("b", "d", "e", "f", "g"):
        pdf[column] = pdf[column].astype(object)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
def test_column_schema_enforcement():
    """Check column-based schema enforcement, including datetime columns.

    In the numbered cases, "X -> Y" means: data of type X is supplied for a
    column declared as Y. Each result that is asserted on comes from the
    predict call of that same case.
    """
    m = Model()
    input_schema = Schema(
        [
            ColSpec("integer", "a"),
            ColSpec("long", "b"),
            ColSpec("float", "c"),
            ColSpec("double", "d"),
            ColSpec("boolean", "e"),
            ColSpec("string", "g"),
            ColSpec("binary", "f"),
            ColSpec("datetime", "h"),
        ]
    )
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    # Columns are deliberately listed in a different order than in the schema.
    # The builtin ``object`` is used instead of the ``np.object`` alias, which
    # newer NumPy releases no longer provide.
    pdf = pd.DataFrame(
        data=[[1, 2, 3, 4, True, "x", bytes([1]), "2021-01-01 00:00:00.1234567"]],
        columns=["b", "d", "a", "c", "e", "g", "f", "h"],
        dtype=object,
    )
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.int64)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.float64)
    # Spell out the unit: newer pandas releases reject a unit-less datetime64.
    pdf["h"] = pdf["h"].astype("datetime64[ns]")

    # test that missing column raises
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf[["b", "d", "a", "e", "g", "f", "h"]])
    assert "Model is missing inputs" in str(ex)

    # test that extra column is ignored
    pdf["x"] = 1

    # test that columns are reordered, extra column is ignored
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    expected_types = dict(zip(input_schema.input_names(), input_schema.pandas_types()))
    # MLflow datetime type in input_schema does not encode precision, so add it for assertions
    expected_types["h"] = np.dtype("datetime64[ns]")
    actual_types = res.dtypes.to_dict()
    assert expected_types == actual_types

    # Test conversions
    # 1. long -> integer raises
    pdf["a"] = pdf["a"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 2. integer -> long works
    pdf["b"] = pdf["b"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 3. unsigned int -> long works
    pdf["b"] = pdf["b"].astype(np.uint32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 4. unsigned int -> int raises
    pdf["a"] = pdf["a"].astype(np.uint32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 5. double -> float raises
    pdf["c"] = pdf["c"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 6. float -> double works, double -> float does not
    pdf["d"] = pdf["d"].astype(np.float32)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    pdf["d"] = pdf["d"].astype(np.float64)

    pdf["c"] = pdf["c"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 7. int -> float raises
    pdf["c"] = pdf["c"].astype(np.int32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 8. int -> double works
    pdf["d"] = pdf["d"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types

    # 9. long -> double raises
    pdf["d"] = pdf["d"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["d"] = pdf["d"].astype(np.float64)

    # 10. any float -> any int raises: float32 -> integer, float64 -> integer
    pdf["a"] = pdf["a"].astype(np.float32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)

    pdf["a"] = pdf["a"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 11. double -> long raises
    pdf["b"] = pdf["b"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["b"] = pdf["b"].astype(np.int64)

    # 12. objects work
    for column in ("b", "d", "e", "f", "g"):
        pdf[column] = pdf[column].astype(object)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types

    # 13. datetime64[D] (date only) -> datetime64[x] works
    pdf["h"] = pdf["h"].astype("datetime64[D]")
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    pdf["h"] = pdf["h"].astype("datetime64[s]")

    # 14. np.ndarrays can be converted to dataframe but have no columns
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf.values)
    assert "Model is missing inputs" in str(ex)

    # 15. dictionaries of str -> list/nparray work
    arr = np.array([1, 2, 3])
    d = {
        "a": arr.astype("int32"),
        "b": arr.astype("int64"),
        "c": arr.astype("float32"),
        "d": arr.astype("float64"),
        "e": [True, False, True],
        "g": ["a", "b", "c"],
        "f": [bytes(0), bytes(1), bytes(1)],
        "h": np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64),
    }
    res = pyfunc_model.predict(d)
    assert res.dtypes.to_dict() == expected_types

    # 16. dictionaries of str -> list[list] fail
    d = {
        "a": [arr.astype("int32")],
        "b": [arr.astype("int64")],
        "c": [arr.astype("float32")],
        "d": [arr.astype("float64")],
        "e": [[True, False, True]],
        "g": [["a", "b", "c"]],
        "f": [[bytes(0), bytes(1), bytes(1)]],
        "h": [np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64)],
    }
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(d)
    assert "Incompatible input types" in str(ex)

    # 17. conversion to dataframe fails (columns of unequal length)
    d = {
        "a": [1],
        "b": [1, 2],
        "c": [1, 2, 3],
    }
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(d)
    assert (
        "This model contains a column-based signature, which suggests a DataFrame input."
        in str(ex)
    )
def test_tensor_multi_named_schema_enforcement():
    """Exercise schema enforcement for a signature of several named tensors.

    The signature declares three named tensor inputs ("a", "b", "c") with
    partially variable shapes. The test feeds ``PyFuncModel.predict`` a series
    of inputs and checks that:

    * a dictionary missing a required name is rejected;
    * an extra dictionary key is dropped from the result;
    * variable (-1) axes accept different sizes;
    * dtype mismatches, shape mismatches, list inputs, lists as dictionary
      values, empty arrays and DataFrames are rejected with the expected
      messages.
    """
    m = Model()
    input_schema = Schema(
        [
            TensorSpec(np.dtype(np.uint64), (-1, 5), "a"),
            TensorSpec(np.dtype(np.short), (-1, 2), "b"),
            TensorSpec(np.dtype(np.float32), (2, -1, 2), "c"),
        ]
    )
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    # Name -> dtype mapping declared by the schema; reused by the dtype checks.
    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
    inp = {
        "a": np.array([[0, 0, 0, 0, 0], [1, 1, 1, 1, 1]], dtype=np.uint64),
        "b": np.array([[0, 0], [1, 1], [2, 2]], dtype=np.short),
        "c": np.array([[[0, 0], [1, 1]], [[2, 2], [3, 3]]], dtype=np.float32),
    }

    # test that missing column raises
    # NOTE: the dictionary itself must be passed. `dict.pop` returns the
    # removed value, so passing its result would hand predict a bare ndarray
    # and never exercise the missing-key path.
    inp1 = dict(inp)
    del inp1["b"]
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp1)
    assert "Model is missing inputs ['b']" in str(ex.value)

    # test that extra column is removed
    inp2 = dict(inp)
    inp2["x"] = 1
    res = pyfunc_model.predict(inp2)
    assert res.keys() == inp.keys()
    # Compare with the array-aware helper: `==` on dicts of ndarrays only
    # works when both sides hold the very same array objects.
    assert _compare_exact_tensor_dict_input(res, inp)
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types

    # test that variable axes are supported
    inp3 = {
        "a": np.array([[0, 0, 0, 0, 0], [1, 1, 1, 1, 1], [2, 2, 2, 2, 2]], dtype=np.uint64),
        "b": np.array([[0, 0], [1, 1]], dtype=np.short),
        "c": np.array([[[0, 0]], [[2, 2]]], dtype=np.float32),
    }
    res = pyfunc_model.predict(inp3)
    assert _compare_exact_tensor_dict_input(res, inp3)
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types

    # test that type casting is not supported
    inp4 = dict(inp)
    inp4["a"] = inp4["a"].astype(np.int32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp4)
    assert "dtype of input int32 does not match expected dtype uint64" in str(ex.value)

    # test wrong shape
    # "a" uses the schema dtype (uint64) so that only its shape is invalid;
    # np.uint has a platform-dependent width.
    inp5 = {
        "a": np.array([[0, 0, 0, 0]], dtype=np.uint64),
        "b": np.array([[0, 0], [1, 1]], dtype=np.short),
        "c": np.array([[[0, 0]]], dtype=np.float32),
    }
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp5)
    assert "Shape of input (1, 4) does not match expected shape (-1, 5)" in str(ex.value)

    # test non-dictionary input
    inp6 = [
        np.array([[0, 0, 0, 0, 0]], dtype=np.uint64),
        np.array([[0, 0], [1, 1]], dtype=np.short),
        np.array([[[0, 0]]], dtype=np.float32),
    ]
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp6)
    assert "Model is missing inputs ['a', 'b', 'c']." in str(ex.value)

    # test empty ndarray does not work
    inp7 = dict(inp)
    inp7["a"] = np.array([])
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp7)
    assert "Shape of input (0,) does not match expected shape" in str(ex.value)

    # test dictionary of str -> list does not work
    inp8 = {k: list(v) for k, v in inp.items()}
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp8)
    message = str(ex.value)
    assert "This model contains a tensor-based model signature with input names" in message
    assert (
        "suggests a dictionary input mapping input name to a numpy array, but a dict"
        " with value type <class 'list'> was found"
    ) in message

    # test dataframe input fails at shape enforcement
    pdf = pd.DataFrame(
        data=[[1, 2, 3]],
        columns=["a", "b", "c"],
    )
    pdf["a"] = pdf["a"].astype(np.uint64)
    pdf["b"] = pdf["b"].astype(np.short)
    pdf["c"] = pdf["c"].astype(np.float32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Shape of input (1,) does not match expected shape (-1, 5)" in str(ex.value)