def test_spark_udf_autofills_column_names_with_schema(spark):
    """When a model has a signature with named inputs, the Spark UDF should
    select/reorder the DataFrame columns by those names, ignoring extras."""

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # Echo the column names the model actually received, one row per input row.
            return [model_input.columns] * len(model_input)

    signature = ModelSignature(
        inputs=Schema([ColSpec("long", "a"), ColSpec("long", "b"), ColSpec("long", "c")]),
        outputs=Schema([ColSpec("integer")]),
    )
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        udf = mlflow.pyfunc.spark_udf(
            spark, "runs:/{}/model".format(run.info.run_id), result_type=ArrayType(StringType())
        )
        spark_df = spark.createDataFrame(
            pd.DataFrame(
                columns=["a", "b", "c", "d"], data={"a": [1], "b": [2], "c": [3], "d": [4]}
            )
        )

        # Too few columns: the signature demands three inputs, so this must fail.
        with pytest.raises(pyspark.sql.utils.PythonException):
            res = spark_df.withColumn("res1", udf("a", "b")).select("res1").toPandas()

        # Exactly matching columns pass through under their schema names.
        res = spark_df.withColumn("res2", udf("a", "b", "c")).select("res2").toPandas()
        assert res["res2"][0] == ["a", "b", "c"]

        # An extra column ("d") is silently dropped; the model still sees a, b, c.
        res = spark_df.withColumn("res4", udf("a", "b", "c", "d")).select("res4").toPandas()
        assert res["res4"][0] == ["a", "b", "c"]
def test_spark_udf_autofills_no_arguments(spark):
    """Calling the Spark UDF with zero arguments should auto-fill columns from
    the model signature; missing columns, nameless signatures, and absent
    signatures each produce their own specific error."""

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            return [model_input.columns] * len(model_input)

    signature = ModelSignature(
        inputs=Schema([ColSpec("long", "a"), ColSpec("long", "b"), ColSpec("long", "c")]),
        outputs=Schema([ColSpec("integer")]),
    )
    good_data = spark.createDataFrame(
        pd.DataFrame(columns=["a", "b", "c", "d"], data={"a": [1], "b": [2], "c": [3], "d": [4]})
    )

    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        udf = mlflow.pyfunc.spark_udf(
            spark, "runs:/{}/model".format(run.info.run_id), result_type=ArrayType(StringType())
        )
        # Zero-argument call: column names come straight from the signature.
        res = good_data.withColumn("res", udf()).select("res").toPandas()
        assert res["res"][0] == ["a", "b", "c"]

        # Explicitly passing a subset of the required columns must fail.
        with pytest.raises(
            pyspark.sql.utils.PythonException,
            match=r"Model input is missing columns. Expected 3 input columns",
        ):
            res = good_data.withColumn("res", udf("b", "c")).select("res").toPandas()

        # this dataframe won't work because it's missing column a
        bad_data = spark.createDataFrame(
            pd.DataFrame(
                columns=["x", "b", "c", "d"], data={"x": [1], "b": [2], "c": [3], "d": [4]}
            )
        )
        with pytest.raises(AnalysisException, match=r"cannot resolve 'a' given input columns"):
            bad_data.withColumn("res", udf())

    # A signature without input names cannot drive auto-fill.
    nameless_signature = ModelSignature(
        inputs=Schema([ColSpec("long"), ColSpec("long"), ColSpec("long")]),
        outputs=Schema([ColSpec("integer")]),
    )
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=nameless_signature)
        udf = mlflow.pyfunc.spark_udf(
            spark, "runs:/{}/model".format(run.info.run_id), result_type=ArrayType(StringType())
        )
        with pytest.raises(
            MlflowException,
            match=r"Cannot apply udf because no column names specified",
        ):
            good_data.withColumn("res", udf())

    with mlflow.start_run() as run:
        # model without signature
        mlflow.pyfunc.log_model("model", python_model=TestModel())
        udf = mlflow.pyfunc.spark_udf(
            spark, "runs:/{}/model".format(run.info.run_id), result_type=ArrayType(StringType())
        )
        with pytest.raises(MlflowException, match="Attempting to apply udf on zero columns"):
            res = good_data.withColumn("res", udf()).select("res").toPandas()
def test_schema_enforcement_no_col_names():
    """A signature whose columns are unnamed should accept positional input:
    lists, nameless DataFrames, ndarrays, named DataFrames, and dicts."""

    class TestModel(object):
        @staticmethod
        def predict(pdf):
            return pdf

    meta = Model()
    meta.signature = ModelSignature(
        inputs=Schema([ColSpec("double"), ColSpec("double"), ColSpec("double")])
    )
    model = PyFuncModel(model_meta=meta, model_impl=TestModel())
    test_data = [[1.0, 2.0, 3.0]]

    # A plain list of rows is accepted.
    assert model.predict(test_data).equals(pd.DataFrame(test_data))
    # So is a DataFrame with default (integer) column labels.
    assert model.predict(pd.DataFrame(test_data)).equals(pd.DataFrame(test_data))
    # So is a bare ndarray.
    assert model.predict(pd.DataFrame(test_data).values).equals(pd.DataFrame(test_data))
    # Named columns also work — names are simply not checked.
    named_df = pd.DataFrame(data=test_data, columns=["a", "b", "c"])
    assert model.predict(named_df).equals(named_df)

    # Arity is still enforced.
    with pytest.raises(MlflowException) as exc_info:
        model.predict([[1.0, 2.0]])
    assert "the provided input only has 2 columns." in str(exc_info)

    # Types are still enforced.
    with pytest.raises(MlflowException) as exc_info:
        model.predict([[1, 2, 3]])
    assert "Can not safely convert int64 to float64" in str(exc_info)

    # Inputs that cannot become a DataFrame are rejected.
    with pytest.raises(MlflowException) as exc_info:
        model.predict({1, 2, 3})
    assert "Expected input to be DataFrame or list. Found: set" in str(exc_info)

    # Dictionaries of str -> list/ndarray work.
    col_dict = {"a": [1.0], "b": [2.0], "c": [3.0]}
    assert model.predict(col_dict).equals(pd.DataFrame(col_dict))
def test_serving_model_with_schema(pandas_df_with_all_types):
    """Scoring-server round trip: both split- and records-oriented JSON should
    reach the model with the signature's dtypes, regardless of column order."""

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            return [[k, str(v)] for k, v in model_input.dtypes.items()]

    schema = Schema([ColSpec(c, c) for c in pandas_df_with_all_types.columns])
    df = _shuffle_pdf(pandas_df_with_all_types)
    with TempDir(chdr=True):
        with mlflow.start_run() as run:
            mlflow.pyfunc.log_model(
                "model", python_model=TestModel(), signature=ModelSignature(schema)
            )
            expected = [[k, str(v)] for k, v in pandas_df_with_all_types.dtypes.items()]
            # Exercise both JSON orientations against the same logged model;
            # the shuffled frame goes through "split", the original through "records".
            payloads = [
                (df.to_dict(orient="split"),
                 pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED),
                (pandas_df_with_all_types.to_dict(orient="records"),
                 pyfunc_scoring_server.CONTENT_TYPE_JSON_RECORDS_ORIENTED),
            ]
            for payload, content_type in payloads:
                response = pyfunc_serve_and_score_model(
                    model_uri="runs:/{}/model".format(run.info.run_id),
                    data=json.dumps(payload, cls=NumpyEncoder),
                    content_type=content_type,
                    extra_args=["--no-conda"],
                )
                assert json.loads(response.content) == expected
def test_missing_value_hint_is_displayed_when_it_should():
    """The missing-value hint should appear only when an integer column fails
    type checking because NaN forced it to float — not for ordinary float
    mismatches."""
    meta = Model()
    meta.signature = ModelSignature(inputs=Schema([ColSpec("integer", "a")]))
    model = PyFuncModel(model_meta=meta, model_impl=TestModel())

    # Integer column with None: pandas upcasts to float, which should trigger the hint.
    frame = pd.DataFrame(data=[[1], [None]], columns=["a"])
    with pytest.raises(MlflowException) as exc_info:
        model.predict(frame)
    hint = "Hint: the type mismatch is likely caused by missing values."
    assert "Incompatible input types" in str(exc_info.value.message)
    assert hint in str(exc_info.value.message)

    # Genuinely-float data with a None: mismatch, but no misleading hint.
    frame = pd.DataFrame(data=[[1.5], [None]], columns=["a"])
    with pytest.raises(MlflowException) as exc_info:
        model.predict(frame)
    assert "Incompatible input types" in str(exc_info)
    assert hint not in str(exc_info.value.message)

    # Float dtype without any missing values: mismatch, no hint either.
    frame = pd.DataFrame(data=[[1], [2]], columns=["a"], dtype=np.float64)
    with pytest.raises(MlflowException) as exc_info:
        model.predict(frame)
    assert "Incompatible input types" in str(exc_info.value.message)
    assert hint not in str(exc_info.value.message)
def test_spark_udf_with_datetime_columns(spark):
    """Spark timestamp and date columns should both satisfy a 'datetime'
    ColSpec when passed through the pyfunc Spark UDF."""

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            return [model_input.columns] * len(model_input)

    signature = ModelSignature(
        inputs=Schema([ColSpec("datetime", "timestamp"), ColSpec("datetime", "date")]),
        outputs=Schema([ColSpec("integer")]),
    )
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        udf = mlflow.pyfunc.spark_udf(
            spark, "runs:/{}/model".format(run.info.run_id), result_type=ArrayType(StringType())
        )
        # Build a frame with one TIMESTAMP and one DATE column.
        dates_df = spark.range(10).selectExpr(
            "current_timestamp() as timestamp", "current_date() as date"
        )
        result = dates_df.withColumn("res", udf("timestamp", "date")).select("res").toPandas()
        assert result["res"][0] == ["timestamp", "date"]
def test_parse_with_schema(pandas_df_with_all_types):
    """JSON payloads parsed with a schema should round-trip dtypes, and the
    pandas type-hinted parse should coerce bad values in its (documented but
    surprising) lossy way.

    Fixes: ``np.object`` / ``np.bool`` are deprecated aliases removed in
    numpy >= 1.24 — replaced with the builtin ``object`` / ``bool``, which is
    what the aliases always meant.
    """
    schema = Schema([ColSpec(c, c) for c in pandas_df_with_all_types.columns])
    df = _shuffle_pdf(pandas_df_with_all_types)
    json_str = json.dumps(df.to_dict(orient="split"), cls=NumpyEncoder)
    df = pyfunc_scoring_server.parse_json_input(json_str, orient="split", schema=schema)
    json_str = json.dumps(df.to_dict(orient="records"), cls=NumpyEncoder)
    df = pyfunc_scoring_server.parse_json_input(json_str, orient="records", schema=schema)
    assert schema == infer_signature(df[schema.input_names()]).inputs

    # The current behavior with pandas json parse with type hints is weird. In some cases, the
    # types are forced ignoring overflow and loss of precision:
    bad_df = """{
      "columns":["bad_integer", "bad_float", "bad_string", "bad_boolean"],
      "data":[
        [9007199254740991.0, 1.1,                1, 1.5],
        [9007199254740992.0, 9007199254740992.0, 2, 0],
        [9007199254740994.0, 3.3,                3, "some arbitrary string"]
      ]
    }"""
    # NOTE(review): the schema below declares "good_float", which has no
    # matching column in bad_df — presumably exercising tolerance of extra
    # schema entries; confirm against parse_json_input's contract.
    schema = Schema([
        ColSpec("integer", "bad_integer"),
        ColSpec("float", "bad_float"),
        ColSpec("float", "good_float"),
        ColSpec("string", "bad_string"),
        ColSpec("boolean", "bad_boolean"),
    ])
    df = pyfunc_scoring_server.parse_json_input(bad_df, orient="split", schema=schema)

    # Unfortunately, the current behavior of pandas parse is to force numbers to int32 even if
    # they don't fit:
    assert df["bad_integer"].dtype == np.int32
    assert all(df["bad_integer"] == [-2147483648, -2147483648, -2147483648])
    # The same goes for floats:
    assert df["bad_float"].dtype == np.float32
    assert all(df["bad_float"] == np.array([1.1, 9007199254740992, 3.3], dtype=np.float32))
    # However bad string is recognized as int64:
    assert all(df["bad_string"] == np.array([1, 2, 3], dtype=object))
    # Boolean is forced - zero and empty string is false, everything else is true:
    assert df["bad_boolean"].dtype == bool
    assert all(df["bad_boolean"] == [True, False, True])
def test_schema_enforcement():
    """Column-name schema enforcement: columns are reordered, extras dropped,
    safe widenings (int->long, float->double, unsigned->wider) accepted, and
    narrowing or cross-kind conversions rejected.

    Fixes over the original:
    - ``np.object`` (removed in numpy >= 1.24) replaced by builtin ``object``.
    - Case 4 asserted ``"Incompatible input types" in str(ex)`` on a *stale*
      exception from case 3 after a successful predict — removed.
    - Case 4 "restored" column d to int64 instead of its declared float64.
    """

    class TestModel(object):
        @staticmethod
        def predict(pdf):
            return pdf

    m = Model()
    input_schema = Schema([
        ColSpec("integer", "a"),
        ColSpec("long", "b"),
        ColSpec("float", "c"),
        ColSpec("double", "d"),
        ColSpec("boolean", "e"),
        ColSpec("string", "g"),
        ColSpec("binary", "f"),
    ])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    pdf = pd.DataFrame(
        data=[[1, 2, 3, 4, True, "x", bytes([1])]],
        columns=["b", "d", "a", "c", "e", "g", "f"],
        dtype=object,
    )
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.int64)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.float64)

    # test that missing column raises
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf[["b", "d", "a", "e", "g", "f"]])
    assert "Model input is missing columns" in str(ex)

    # test that extra column is ignored
    pdf["x"] = 1

    # test that columns are reordered, extra column is ignored
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.column_names()]).all())
    expected_types = dict(zip(input_schema.column_names(), input_schema.pandas_types()))
    actual_types = res.dtypes.to_dict()
    assert expected_types == actual_types

    # Test conversions
    # 1. long -> integer raises
    pdf["a"] = pdf["a"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 2. integer -> long works
    pdf["b"] = pdf["b"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.column_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 3. double -> float raises
    pdf["c"] = pdf["c"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 4. float -> double works
    pdf["d"] = pdf["d"].astype(np.float32)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    # Restore d to its declared double type (the original set int64 here and
    # re-asserted on the stale exception from case 3 — both removed/fixed).
    pdf["d"] = pdf["d"].astype(np.float64)

    # 5. floats -> ints raises
    pdf["c"] = pdf["c"].astype(np.int32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["d"] = pdf["d"].astype(np.float64)

    # 6. ints -> floats raises
    pdf["a"] = pdf["a"].astype(np.float32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["b"] = pdf["b"].astype(np.int64)

    # 7. objects work
    pdf["b"] = pdf["b"].astype(object)
    pdf["d"] = pdf["d"].astype(object)
    pdf["e"] = pdf["e"].astype(object)
    pdf["f"] = pdf["f"].astype(object)
    pdf["g"] = pdf["g"].astype(object)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
def test_column_schema_enforcement():
    """Column-based schema enforcement across all supported dtypes, including
    datetime, plus the non-DataFrame input paths (ndarray, dicts).

    Fixes over the original:
    - ``np.object`` (removed in numpy >= 1.24) replaced by builtin ``object``.
    - Case 6 re-asserted on a *stale* exception after a successful predict — removed.
    - Case 8 dropped the ``res =`` assignment and asserted on stale results — fixed.
    - Duplicate "# 10." numbering and a fully duplicated float64-b check — deduplicated.
    """
    m = Model()
    input_schema = Schema([
        ColSpec("integer", "a"),
        ColSpec("long", "b"),
        ColSpec("float", "c"),
        ColSpec("double", "d"),
        ColSpec("boolean", "e"),
        ColSpec("string", "g"),
        ColSpec("binary", "f"),
        ColSpec("datetime", "h"),
    ])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    pdf = pd.DataFrame(
        data=[[1, 2, 3, 4, True, "x", bytes([1]), "2021-01-01 00:00:00.1234567"]],
        columns=["b", "d", "a", "c", "e", "g", "f", "h"],
        dtype=object,
    )
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.int64)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.float64)
    pdf["h"] = pdf["h"].astype(np.datetime64)

    # test that missing column raises
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf[["b", "d", "a", "e", "g", "f", "h"]])
    assert "Model is missing inputs" in str(ex)

    # test that extra column is ignored
    pdf["x"] = 1

    # test that columns are reordered, extra column is ignored
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    expected_types = dict(zip(input_schema.input_names(), input_schema.pandas_types()))
    # MLflow datetime type in input_schema does not encode precision, so add it for assertions
    expected_types["h"] = np.dtype("datetime64[ns]")
    actual_types = res.dtypes.to_dict()
    assert expected_types == actual_types

    # Test conversions
    # 1. long -> integer raises
    pdf["a"] = pdf["a"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 2. integer -> long works
    pdf["b"] = pdf["b"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 3. unsigned int -> long works
    pdf["b"] = pdf["b"].astype(np.uint32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 4. unsigned int -> int raises
    pdf["a"] = pdf["a"].astype(np.uint32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 5. double -> float raises
    pdf["c"] = pdf["c"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 6. float -> double works, double -> float does not
    pdf["d"] = pdf["d"].astype(np.float32)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    pdf["d"] = pdf["d"].astype(np.float64)
    pdf["c"] = pdf["c"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 7. int -> float raises
    pdf["c"] = pdf["c"].astype(np.int32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 8. int -> double works
    pdf["d"] = pdf["d"].astype(np.int32)
    res = pyfunc_model.predict(pdf)  # original dropped `res =` and asserted stale data
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types

    # 9. long -> double raises
    pdf["d"] = pdf["d"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["d"] = pdf["d"].astype(np.float64)

    # 10. any float -> any int raises
    pdf["a"] = pdf["a"].astype(np.float32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["b"] = pdf["b"].astype(np.int64)

    # 11. objects work
    pdf["b"] = pdf["b"].astype(object)
    pdf["d"] = pdf["d"].astype(object)
    pdf["e"] = pdf["e"].astype(object)
    pdf["f"] = pdf["f"].astype(object)
    pdf["g"] = pdf["g"].astype(object)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types

    # 12. datetime64[D] (date only) -> datetime64[x] works
    pdf["h"] = pdf["h"].astype("datetime64[D]")
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    pdf["h"] = pdf["h"].astype("datetime64[s]")

    # 13. np.ndarrays can be converted to dataframe but have no columns
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf.values)
    assert "Model is missing inputs" in str(ex)

    # 14. dictionaries of str -> list/nparray work
    arr = np.array([1, 2, 3])
    d = {
        "a": arr.astype("int32"),
        "b": arr.astype("int64"),
        "c": arr.astype("float32"),
        "d": arr.astype("float64"),
        "e": [True, False, True],
        "g": ["a", "b", "c"],
        "f": [bytes(0), bytes(1), bytes(1)],
        "h": np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64),
    }
    res = pyfunc_model.predict(d)
    assert res.dtypes.to_dict() == expected_types

    # 15. dictionaries of str -> list[list] fail
    d = {
        "a": [arr.astype("int32")],
        "b": [arr.astype("int64")],
        "c": [arr.astype("float32")],
        "d": [arr.astype("float64")],
        "e": [[True, False, True]],
        "g": [["a", "b", "c"]],
        "f": [[bytes(0), bytes(1), bytes(1)]],
        "h": [np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64)],
    }
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(d)
    assert "Incompatible input types" in str(ex)

    # 16. conversion to dataframe fails
    d = {
        "a": [1],
        "b": [1, 2],
        "c": [1, 2, 3],
    }
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(d)
    assert "This model contains a column-based signature, which suggests a DataFrame input." in str(
        ex
    )
# Prepare dataset
try:
    repo_url = "https://raw.githubusercontent.com/prinz-nussknacker"
    csv_url = f"{repo_url}/banksim1/master/bs140513_032310.csv"
    data = pd.read_csv(csv_url, sep=",", quotechar="'", header=0)
except Exception as e:
    logger.exception("Could not read CSV file: {}".format(e))
    exit(1)

# BUG FIX: DataFrame.dropna() returns a new frame and was previously discarded,
# so rows with missing values were never actually removed.
data = data.dropna()
data = data.drop(["step", "customer", "zipcodeOri", "merchant", "zipMerchant"], axis="columns")

# Model signature: three categorical string features plus the transaction amount,
# predicting an integer fraud label.
input_schema = Schema([
    ColSpec("string", "age"),
    ColSpec("string", "gender"),
    ColSpec("string", "category"),
    ColSpec("double", "amount"),
])
output_schema = Schema([ColSpec("integer")])
signature = ModelSignature(inputs=input_schema, outputs=output_schema)

# Prepare train and test sets
data_x = data.drop(["fraud"], axis="columns")
data_y = data[["fraud"]]
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y)

with mlflow.start_run():
    # Define pipeline
    # NOTE(review): this chunk is truncated here — the rest of the pipeline
    # definition continues outside this view.
    numeric_features = ['amount']
def test_dataframe_from_json():
    """_dataframe_from_json should round-trip all supported dtypes under both
    'split' and 'records' orients, with column-based and tensor-based schemas."""
    source = pd.DataFrame(
        {
            "boolean": [True, False, True],
            "string": ["a", "b", "c"],
            "float": np.array([1.2, 2.3, 3.4], dtype=np.float32),
            "double": np.array([1.2, 2.3, 3.4], dtype=np.float64),
            "integer": np.array([3, 4, 5], dtype=np.int32),
            "long": np.array([3, 4, 5], dtype=np.int64),
            "binary": [bytes([1, 2, 3]), bytes([4, 5]), bytes([6])],
            "date_string": ["2018-02-03", "1996-03-02", "2021-03-05"],
        },
        columns=[
            "boolean",
            "string",
            "float",
            "double",
            "integer",
            "long",
            "binary",
            "date_string",
        ],
    )
    # JSON cannot carry raw bytes, so the binary column travels base64-encoded.
    jsonable_df = pd.DataFrame(source, copy=True)
    jsonable_df["binary"] = jsonable_df["binary"].map(base64.b64encode)

    schema = Schema([
        ColSpec("boolean", "boolean"),
        ColSpec("string", "string"),
        ColSpec("float", "float"),
        ColSpec("double", "double"),
        ColSpec("integer", "integer"),
        ColSpec("long", "long"),
        ColSpec("binary", "binary"),
        ColSpec("string", "date_string"),
    ])
    # Column schema decodes base64 back into bytes, recovering `source` exactly.
    for orient in ("split", "records"):
        parsed = _dataframe_from_json(
            jsonable_df.to_json(orient=orient), pandas_orient=orient, schema=schema
        )
        assert parsed.equals(source)

    # try parsing with tensor schema
    tensor_schema = Schema([
        TensorSpec(np.dtype("bool"), [-1], "boolean"),
        TensorSpec(np.dtype("str"), [-1], "string"),
        TensorSpec(np.dtype("float32"), [-1], "float"),
        TensorSpec(np.dtype("float64"), [-1], "double"),
        TensorSpec(np.dtype("int32"), [-1], "integer"),
        TensorSpec(np.dtype("int64"), [-1], "long"),
        TensorSpec(np.dtype(bytes), [-1], "binary"),
    ])
    for orient in ("split", "records"):
        parsed = _dataframe_from_json(
            jsonable_df.to_json(orient=orient), pandas_orient=orient, schema=tensor_schema
        )
        # NB: tensor schema does not automatically decode base64 encoded bytes.
        assert parsed.equals(jsonable_df)

    # Test parse with TensorSchema with a single tensor
    tensor_schema = Schema([TensorSpec(np.dtype("float32"), [-1, 3])])
    source = pd.DataFrame(
        {
            "a": np.array([1, 2, 3], dtype=np.float32),
            "b": np.array([4.1, 5.2, 6.3], dtype=np.float32),
            "c": np.array([7, 8, 9], dtype=np.float32),
        },
        columns=["a", "b", "c"],
    )
    for orient in ("split", "records"):
        parsed = _dataframe_from_json(
            source.to_json(orient=orient), pandas_orient=orient, schema=tensor_schema
        )
        assert source.equals(parsed)
def test_parse_tf_serving_dictionary():
    """TF-serving 'instances' and 'inputs' payloads should both aggregate to a
    dict of input name -> ndarray, with or without a (tensor or column) schema."""

    def assert_result(result, expected_result):
        assert result.keys() == expected_result.keys()
        for key in result:
            assert (result[key] == expected_result[key]).all()
            assert result[key].dtype == expected_result[key].dtype

    # instances are correctly aggregated to dict of input name -> tensor
    tfserving_input = {
        "instances": [
            {"a": "s1", "b": 1.1, "c": [1, 2, 3]},
            {"a": "s2", "b": 2.2, "c": [4, 5, 6]},
            {"a": "s3", "b": 3.3, "c": [7, 8, 9]},
        ]
    }

    # Without a schema, dtypes fall out of numpy's default inference.
    expected_result_no_schema = {
        "a": np.array(["s1", "s2", "s3"]),
        "b": np.array([1.1, 2.2, 3.3]),
        "c": np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    }
    assert_result(parse_tf_serving_input(tfserving_input), expected_result_no_schema)

    # With a schema, values are cast to the declared dtypes.
    tensor_schema = Schema([
        TensorSpec(np.dtype("str"), [-1], "a"),
        TensorSpec(np.dtype("float32"), [-1], "b"),
        TensorSpec(np.dtype("int32"), [-1], "c"),
    ])
    df_schema = Schema([
        ColSpec("string", "a"),
        ColSpec("float", "b"),
        ColSpec("integer", "c"),
    ])
    expected_result_schema = {
        "a": np.array(["s1", "s2", "s3"], dtype=np.dtype("str")),
        "b": np.array([1.1, 2.2, 3.3], dtype="float32"),
        "c": np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype="int32"),
    }
    assert_result(parse_tf_serving_input(tfserving_input, tensor_schema), expected_result_schema)
    # A column-based schema produces the same casts.
    assert_result(parse_tf_serving_input(tfserving_input, df_schema), expected_result_schema)

    # input provided as a dict
    tfserving_input = {
        "inputs": {
            "a": ["s1", "s2", "s3"],
            "b": [1.1, 2.2, 3.3],
            "c": [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        }
    }
    assert_result(parse_tf_serving_input(tfserving_input), expected_result_no_schema)
    assert_result(parse_tf_serving_input(tfserving_input, tensor_schema), expected_result_schema)
    assert_result(parse_tf_serving_input(tfserving_input, df_schema), expected_result_schema)
warnings.filterwarnings("ignore") model_id = int(sys.argv[2]) # Prepare dataset try: csv_url = ("https://raw.githubusercontent.com/prinz-nussknacker/banksim1/master/bs140513_032310.csv") data = pd.read_csv(csv_url, sep=",", quotechar="'", header=0) except Exception as e: logger.exception("Could not read CSV file: {}".format(e)) exit(1) data.dropna() data = data.drop(["step", "customer", "zipcodeOri", "merchant", "zipMerchant"], axis="columns") input_schema = Schema([ ColSpec("string", "age"), ColSpec("string", "gender"), ColSpec("string", "category"), ColSpec("double", "amount") ]) output_schema = Schema([ ColSpec("integer") ]) signature = ModelSignature(inputs=input_schema, outputs=output_schema) # Prepare train and test sets data_x = data.drop(["fraud"], axis="columns") data_y = data[["fraud"]] train_x, test_x, train_y, test_y = train_test_split(data_x, data_y) with mlflow.start_run():