Пример #1
0
def test_spark_udf_autofills_column_names_with_schema(spark):
    """Apply a Spark UDF for a model whose signature names columns a, b, c.

    The UDF is invoked with two, three and four positional columns: the
    two-column call must fail, the other two must report exactly a, b, c.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # Echo back the column labels the model received, once per row.
            return [model_input.columns] * len(model_input)

    named_inputs = Schema([ColSpec("long", "a"), ColSpec("long", "b"), ColSpec("long", "c")])
    signature = ModelSignature(inputs=named_inputs, outputs=Schema([ColSpec("integer")]))

    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        model_uri = "runs:/{}/model".format(run.info.run_id)
        predict_udf = mlflow.pyfunc.spark_udf(
            spark, model_uri, result_type=ArrayType(StringType())
        )
        pandas_input = pd.DataFrame(
            columns=["a", "b", "c", "d"], data={"a": [1], "b": [2], "c": [3], "d": [4]}
        )
        frame = spark.createDataFrame(pandas_input)

        # Fewer columns than the signature declares.
        with pytest.raises(pyspark.sql.utils.PythonException):
            frame.withColumn("res1", predict_udf("a", "b")).select("res1").toPandas()

        # Exactly the declared columns.
        exact = frame.withColumn("res2", predict_udf("a", "b", "c")).select("res2").toPandas()
        assert exact["res2"][0] == ["a", "b", "c"]

        # One extra column: the model is still expected to see only a, b, c.
        extra = frame.withColumn("res4", predict_udf("a", "b", "c", "d")).select("res4").toPandas()
        assert extra["res4"][0] == ["a", "b", "c"]
Пример #2
0
def test_spark_udf_autofills_no_arguments(spark):
    """Call a Spark UDF with no column arguments under three kinds of signature.

    Covered cases: a signature with named columns, a signature with unnamed
    columns, and a model logged without any signature.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # Echo back the column labels the model received, once per row.
            return [model_input.columns] * len(model_input)

    string_array = ArrayType(StringType())
    integer_output = Schema([ColSpec("integer")])
    signature = ModelSignature(
        inputs=Schema([ColSpec("long", "a"), ColSpec("long", "b"), ColSpec("long", "c")]),
        outputs=integer_output,
    )

    good_pandas = pd.DataFrame(
        columns=["a", "b", "c", "d"], data={"a": [1], "b": [2], "c": [3], "d": [4]}
    )
    good_data = spark.createDataFrame(good_pandas)

    # --- signature with named columns ---
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        model_uri = "runs:/{}/model".format(run.info.run_id)
        udf = mlflow.pyfunc.spark_udf(spark, model_uri, result_type=string_array)

        autofilled = good_data.withColumn("res", udf()).select("res").toPandas()
        assert autofilled["res"][0] == ["a", "b", "c"]

        # Passing only two of the three declared columns must fail at execution time.
        with pytest.raises(
            pyspark.sql.utils.PythonException,
            match=r"Model input is missing columns. Expected 3 input columns",
        ):
            good_data.withColumn("res", udf("b", "c")).select("res").toPandas()

        # this dataframe won't work because it's missing column a
        bad_pandas = pd.DataFrame(
            columns=["x", "b", "c", "d"], data={"x": [1], "b": [2], "c": [3], "d": [4]}
        )
        bad_data = spark.createDataFrame(bad_pandas)
        with pytest.raises(AnalysisException, match=r"cannot resolve 'a' given input columns"):
            bad_data.withColumn("res", udf())

    # --- signature whose columns have no names ---
    nameless_signature = ModelSignature(
        inputs=Schema([ColSpec("long"), ColSpec("long"), ColSpec("long")]),
        outputs=Schema([ColSpec("integer")]),
    )
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=nameless_signature)
        model_uri = "runs:/{}/model".format(run.info.run_id)
        udf = mlflow.pyfunc.spark_udf(spark, model_uri, result_type=string_array)
        with pytest.raises(
            MlflowException,
            match=r"Cannot apply udf because no column names specified",
        ):
            good_data.withColumn("res", udf())

    # --- model without signature ---
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel())
        model_uri = "runs:/{}/model".format(run.info.run_id)
        udf = mlflow.pyfunc.spark_udf(spark, model_uri, result_type=string_array)
        with pytest.raises(MlflowException, match="Attempting to apply udf on zero columns"):
            good_data.withColumn("res", udf()).select("res").toPandas()
Пример #3
0
def test_schema_enforcement_no_col_names():
    """Check schema enforcement for a signature of three unnamed double columns.

    The wrapped model returns its input unchanged, so every successful
    ``predict`` call is compared against the DataFrame that was passed in.
    """

    class TestModel(object):
        @staticmethod
        def predict(pdf):
            return pdf

    m = Model()
    input_schema = Schema([ColSpec("double"), ColSpec("double"), ColSpec("double")])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    test_data = [[1.0, 2.0, 3.0]]

    # Can call with just a list
    assert pyfunc_model.predict(test_data).equals(pd.DataFrame(test_data))

    # Or can call with a DataFrame without column names
    assert pyfunc_model.predict(pd.DataFrame(test_data)).equals(pd.DataFrame(test_data))

    # Or can call with a np.ndarray
    assert pyfunc_model.predict(pd.DataFrame(test_data).values).equals(pd.DataFrame(test_data))

    # Or with column names!
    pdf = pd.DataFrame(data=test_data, columns=["a", "b", "c"])
    assert pyfunc_model.predict(pdf).equals(pdf)

    # NOTE: messages are checked on the raised exception (ex.value) rather than on
    # the pytest ExceptionInfo wrapper, whose string form varies between pytest versions.

    # Must provide the right number of arguments
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict([[1.0, 2.0]])
    assert "the provided input only has 2 columns." in str(ex.value)

    # Must provide the right types
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict([[1, 2, 3]])
    assert "Can not safely convert int64 to float64" in str(ex.value)

    # Can only provide data type that can be converted to dataframe...
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict({1, 2, 3})
    assert "Expected input to be DataFrame or list. Found: set" in str(ex.value)

    # Dictionaries of str -> list/nparray work
    d = {"a": [1.0], "b": [2.0], "c": [3.0]}
    assert pyfunc_model.predict(d).equals(pd.DataFrame(d))
Пример #4
0
def test_serving_model_with_schema(pandas_df_with_all_types):
    """Serve a model logged with a schema and score it with two JSON layouts.

    The model reports the dtype of every column it receives; both the
    split-oriented and the records-oriented request must yield the dtypes of
    the ``pandas_df_with_all_types`` fixture.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            return [[k, str(v)] for k, v in model_input.dtypes.items()]

    schema = Schema([ColSpec(c, c) for c in pandas_df_with_all_types.columns])
    df = _shuffle_pdf(pandas_df_with_all_types)

    # (frame to send, pandas orient, request content type)
    requests_to_send = [
        (df, "split", pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED),
        (
            pandas_df_with_all_types,
            "records",
            pyfunc_scoring_server.CONTENT_TYPE_JSON_RECORDS_ORIENTED,
        ),
    ]

    with TempDir(chdr=True):
        with mlflow.start_run() as run:
            mlflow.pyfunc.log_model(
                "model", python_model=TestModel(), signature=ModelSignature(schema)
            )
        for frame, orient, content_type in requests_to_send:
            payload = json.dumps(frame.to_dict(orient=orient), cls=NumpyEncoder)
            response = pyfunc_serve_and_score_model(
                model_uri="runs:/{}/model".format(run.info.run_id),
                data=payload,
                content_type=content_type,
                extra_args=["--no-conda"],
            )
            response_json = json.loads(response.content)
            assert response_json == [
                [k, str(v)] for k, v in pandas_df_with_all_types.dtypes.items()
            ]
Пример #5
0
def test_missing_value_hint_is_displayed_when_it_should():
    """Check when the "missing values" hint is appended to a type-mismatch error.

    For a single integer column ``a`` the hint is expected only for input
    ``[[1], [None]]``; it must be absent for ``[[1.5], [None]]`` and for
    float64 data without missing values.
    """
    m = Model()
    input_schema = Schema([ColSpec("integer", "a")])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    hint = "Hint: the type mismatch is likely caused by missing values."

    def rejection_message(pdf):
        """Return the message of the MlflowException that predict(pdf) must raise."""
        with pytest.raises(MlflowException) as ex:
            pyfunc_model.predict(pdf)
        # Always read the message from the exception itself so that all three
        # cases are checked against the same text.
        return str(ex.value.message)

    # Integers with a missing value: mismatch reported together with the hint.
    message = rejection_message(pd.DataFrame(data=[[1], [None]], columns=["a"]))
    assert "Incompatible input types" in message
    assert hint in message

    # Non-integral floats with a missing value: mismatch reported without the hint.
    message = rejection_message(pd.DataFrame(data=[[1.5], [None]], columns=["a"]))
    assert "Incompatible input types" in message
    assert hint not in message

    # Explicit float64 data with no missing values: mismatch reported without the hint.
    message = rejection_message(pd.DataFrame(data=[[1], [2]], columns=["a"], dtype=np.float64))
    assert "Incompatible input types" in message
    assert hint not in message
Пример #6
0
def test_spark_udf_with_datetime_columns(spark):
    """Apply a Spark UDF to a timestamp column and a date column.

    Both columns are declared as ``datetime`` in the model signature; the
    model echoes the column names it received.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # Echo back the column labels the model received, once per row.
            return [model_input.columns] * len(model_input)

    datetime_inputs = Schema([ColSpec("datetime", "timestamp"), ColSpec("datetime", "date")])
    signature = ModelSignature(inputs=datetime_inputs, outputs=Schema([ColSpec("integer")]))

    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        model_uri = "runs:/{}/model".format(run.info.run_id)
        predict_udf = mlflow.pyfunc.spark_udf(
            spark, model_uri, result_type=ArrayType(StringType())
        )
        frame = spark.range(10).selectExpr(
            "current_timestamp() as timestamp", "current_date() as date"
        )

        scored = frame.withColumn("res", predict_udf("timestamp", "date")).select("res")
        collected = scored.toPandas()
        assert collected["res"][0] == ["timestamp", "date"]
Пример #7
0
def test_parse_with_schema(pandas_df_with_all_types):
    """Parse JSON input against a schema, including values that do not fit it.

    The first half round-trips the all-types fixture through the split and
    records orients and checks that the inferred signature matches the schema.
    The second half pins the current coercion behaviour for ill-typed values.
    """
    schema = Schema([ColSpec(c, c) for c in pandas_df_with_all_types.columns])
    df = _shuffle_pdf(pandas_df_with_all_types)
    json_str = json.dumps(df.to_dict(orient="split"), cls=NumpyEncoder)
    df = pyfunc_scoring_server.parse_json_input(json_str, orient="split", schema=schema)
    json_str = json.dumps(df.to_dict(orient="records"), cls=NumpyEncoder)
    df = pyfunc_scoring_server.parse_json_input(json_str, orient="records", schema=schema)
    assert schema == infer_signature(df[schema.input_names()]).inputs

    # The current behavior with pandas json parse with type hints is weird. In some cases, the
    # types are forced ignoring overflow and loss of precision:

    bad_df = """{
      "columns":["bad_integer", "bad_float", "bad_string", "bad_boolean"],
      "data":[
        [9007199254740991.0, 1.1,                1, 1.5],
        [9007199254740992.0, 9007199254740992.0, 2, 0],
        [9007199254740994.0, 3.3,                3, "some arbitrary string"]
      ]
    }"""
    schema = Schema([
        ColSpec("integer", "bad_integer"),
        ColSpec("float", "bad_float"),
        ColSpec("float", "good_float"),
        ColSpec("string", "bad_string"),
        ColSpec("boolean", "bad_boolean"),
    ])
    df = pyfunc_scoring_server.parse_json_input(bad_df, orient="split", schema=schema)
    # Unfortunately, the current behavior of pandas parse is to force numbers to int32 even if
    # they don't fit:
    assert df["bad_integer"].dtype == np.int32
    assert all(df["bad_integer"] == [-2147483648, -2147483648, -2147483648])

    # The same goes for floats:
    assert df["bad_float"].dtype == np.float32
    assert all(df["bad_float"] == np.array([1.1, 9007199254740992, 3.3], dtype=np.float32))
    # However bad string is recognized as int64:
    # (builtin `object` is used because the `np.object` alias no longer exists in recent NumPy)
    assert all(df["bad_string"] == np.array([1, 2, 3], dtype=object))

    # Boolean is forced - zero and empty string is false, everything else is true:
    # (builtin `bool` is used because the `np.bool` alias was deprecated as well)
    assert df["bad_boolean"].dtype == bool
    assert all(df["bad_boolean"] == [True, False, True])
Пример #8
0
def test_schema_enforcement():
    """Check that ``PyFuncModel.predict`` enforces a named column schema.

    The wrapped model returns its input unchanged. A one-row DataFrame is
    mutated column by column; after each rejected case the column is restored
    to its schema type so that every case is checked in isolation.
    """

    class TestModel(object):
        @staticmethod
        def predict(pdf):
            return pdf

    m = Model()
    input_schema = Schema([
        ColSpec("integer", "a"),
        ColSpec("long", "b"),
        ColSpec("float", "c"),
        ColSpec("double", "d"),
        ColSpec("boolean", "e"),
        ColSpec("string", "g"),
        ColSpec("binary", "f"),
    ])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    # Builtin `object` replaces the `np.object` alias, which recent NumPy no longer provides.
    pdf = pd.DataFrame(
        data=[[1, 2, 3, 4, True, "x", bytes([1])]],
        columns=["b", "d", "a", "c", "e", "g", "f"],
        dtype=object,
    )
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.int64)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.float64)

    def assert_rejected(model_input, expected_message="Incompatible input types"):
        """Assert that predict() raises MlflowException mentioning expected_message."""
        with pytest.raises(MlflowException) as ex:
            pyfunc_model.predict(model_input)
        # Check the exception itself; str() of the pytest ExceptionInfo wrapper
        # varies between pytest versions.
        assert expected_message in str(ex.value)

    # test that missing column raises
    assert_rejected(pdf[["b", "d", "a", "e", "g", "f"]], "Model input is missing columns")

    # test that extra column is ignored
    pdf["x"] = 1

    # test that columns are reordered, extra column is ignored
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.column_names()]).all())

    expected_types = dict(zip(input_schema.column_names(), input_schema.pandas_types()))
    assert res.dtypes.to_dict() == expected_types

    # Test conversions
    # 1. long -> integer raises
    pdf["a"] = pdf["a"].astype(np.int64)
    assert_rejected(pdf)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 2. integer -> long works
    pdf["b"] = pdf["b"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.column_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 3. double -> float raises
    pdf["c"] = pdf["c"].astype(np.float64)
    assert_rejected(pdf)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 4. float -> double works
    pdf["d"] = pdf["d"].astype(np.float32)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    # Restore the schema type so the next case fails because of column "c" only.
    pdf["d"] = pdf["d"].astype(np.float64)

    # 5. ints -> floats raises
    pdf["c"] = pdf["c"].astype(np.int32)
    assert_rejected(pdf)
    pdf["c"] = pdf["c"].astype(np.float32)

    pdf["d"] = pdf["d"].astype(np.int64)
    assert_rejected(pdf)
    pdf["d"] = pdf["d"].astype(np.float64)

    # 6. floats -> ints raises
    pdf["a"] = pdf["a"].astype(np.float32)
    assert_rejected(pdf)
    pdf["a"] = pdf["a"].astype(np.int32)

    pdf["b"] = pdf["b"].astype(np.float64)
    assert_rejected(pdf)
    pdf["b"] = pdf["b"].astype(np.int64)

    # 7. objects work
    for column in ("b", "d", "e", "f", "g"):
        pdf[column] = pdf[column].astype(object)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
Пример #9
0
def test_column_schema_enforcement():
    """Exercise column-based schema enforcement in ``PyFuncModel.predict``.

    A one-row DataFrame matching an eight-column schema is mutated one column
    at a time. Each numbered case states whether the conversion from the
    input dtype to the schema type is expected to work or to raise; after a
    case the touched column is restored so the next case starts from valid
    input. The final cases cover ndarray and dict inputs.
    """
    m = Model()
    input_schema = Schema([
        ColSpec("integer", "a"),
        ColSpec("long", "b"),
        ColSpec("float", "c"),
        ColSpec("double", "d"),
        ColSpec("boolean", "e"),
        ColSpec("string", "g"),
        ColSpec("binary", "f"),
        ColSpec("datetime", "h"),
    ])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    # Builtin `object` replaces the `np.object` alias, which recent NumPy no longer provides.
    pdf = pd.DataFrame(
        data=[[1, 2, 3, 4, True, "x", bytes([1]), "2021-01-01 00:00:00.1234567"]],
        columns=["b", "d", "a", "c", "e", "g", "f", "h"],
        dtype=object,
    )
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.int64)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.float64)
    pdf["h"] = pdf["h"].astype(np.datetime64)

    def assert_rejected(model_input, expected_message="Incompatible input types"):
        """Assert that predict() raises MlflowException mentioning expected_message."""
        with pytest.raises(MlflowException) as ex:
            pyfunc_model.predict(model_input)
        # Check the exception itself; str() of the pytest ExceptionInfo wrapper
        # varies between pytest versions.
        assert expected_message in str(ex.value)

    # test that missing column raises
    assert_rejected(pdf[["b", "d", "a", "e", "g", "f", "h"]], "Model is missing inputs")

    # test that extra column is ignored
    pdf["x"] = 1

    # test that columns are reordered, extra column is ignored
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())

    expected_types = dict(zip(input_schema.input_names(), input_schema.pandas_types()))
    # MLflow datetime type in input_schema does not encode precision, so add it for assertions
    expected_types["h"] = np.dtype("datetime64[ns]")
    assert res.dtypes.to_dict() == expected_types

    # Test conversions
    # 1. long -> integer raises
    pdf["a"] = pdf["a"].astype(np.int64)
    assert_rejected(pdf)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 2. integer -> long works
    pdf["b"] = pdf["b"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 3. unsigned int -> long works
    pdf["b"] = pdf["b"].astype(np.uint32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 4. unsigned int -> int raises
    pdf["a"] = pdf["a"].astype(np.uint32)
    assert_rejected(pdf)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 5. double -> float raises
    pdf["c"] = pdf["c"].astype(np.float64)
    assert_rejected(pdf)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 6. float -> double works, double -> float does not
    pdf["d"] = pdf["d"].astype(np.float32)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    pdf["d"] = pdf["d"].astype(np.float64)
    pdf["c"] = pdf["c"].astype(np.float64)
    assert_rejected(pdf)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 7. int -> float raises
    pdf["c"] = pdf["c"].astype(np.int32)
    assert_rejected(pdf)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 8. int -> double works
    pdf["d"] = pdf["d"].astype(np.int32)
    # Capture the result so the assertions below check this prediction rather
    # than the one left over from case 6.
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types

    # 9. long -> double raises
    pdf["d"] = pdf["d"].astype(np.int64)
    assert_rejected(pdf)
    pdf["d"] = pdf["d"].astype(np.float64)

    # 10. any float -> any int raises
    pdf["a"] = pdf["a"].astype(np.float32)
    assert_rejected(pdf)
    pdf["a"] = pdf["a"].astype(np.float64)
    assert_rejected(pdf)
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.float64)
    assert_rejected(pdf)
    pdf["b"] = pdf["b"].astype(np.int64)

    # 11. objects work
    for column in ("b", "d", "e", "f", "g"):
        pdf[column] = pdf[column].astype(object)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types

    # 12. datetime64[D] (date only) -> datetime64[x] works
    pdf["h"] = pdf["h"].astype("datetime64[D]")
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    pdf["h"] = pdf["h"].astype("datetime64[s]")

    # 13. np.ndarrays can be converted to dataframe but have no columns
    assert_rejected(pdf.values, "Model is missing inputs")

    # 14. dictionaries of str -> list/nparray work
    arr = np.array([1, 2, 3])
    d = {
        "a": arr.astype("int32"),
        "b": arr.astype("int64"),
        "c": arr.astype("float32"),
        "d": arr.astype("float64"),
        "e": [True, False, True],
        "g": ["a", "b", "c"],
        "f": [bytes(0), bytes(1), bytes(1)],
        "h": np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64),
    }
    res = pyfunc_model.predict(d)
    assert res.dtypes.to_dict() == expected_types

    # 15. dictionaries of str -> list[list] fail
    d = {
        "a": [arr.astype("int32")],
        "b": [arr.astype("int64")],
        "c": [arr.astype("float32")],
        "d": [arr.astype("float64")],
        "e": [[True, False, True]],
        "g": [["a", "b", "c"]],
        "f": [[bytes(0), bytes(1), bytes(1)]],
        "h": [np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64)],
    }
    assert_rejected(d)

    # 16. conversion to dataframe fails
    d = {
        "a": [1],
        "b": [1, 2],
        "c": [1, 2, 3],
    }
    assert_rejected(
        d,
        "This model contains a column-based signature, which suggests a DataFrame input.",
    )
Пример #10
0
# Prepare dataset: download the CSV, clean it, and split it for training.
repo_url = "https://raw.githubusercontent.com/prinz-nussknacker"
csv_url = f"{repo_url}/banksim1/master/bs140513_032310.csv"
try:
    data = pd.read_csv(csv_url, sep=",", quotechar="'", header=0)
except Exception as e:
    # Top-level boundary of the script: log the failure (with traceback) and
    # stop with a non-zero exit status.
    logger.exception("Could not read CSV file: %s", e)
    raise SystemExit(1)

# DataFrame.dropna() returns a new frame, so the result has to be kept;
# calling it without assignment leaves rows with missing values in place.
data = data.dropna()
data = data.drop(["step", "customer", "zipcodeOri", "merchant", "zipMerchant"],
                 axis="columns")

# Model signature: four named input columns and one unnamed integer output.
input_schema = Schema([
    ColSpec("string", "age"),
    ColSpec("string", "gender"),
    ColSpec("string", "category"),
    ColSpec("double", "amount")
])
output_schema = Schema([ColSpec("integer")])
signature = ModelSignature(inputs=input_schema, outputs=output_schema)

# Prepare train and test sets ("fraud" is the target column)
data_x = data.drop(["fraud"], axis="columns")
data_y = data[["fraud"]]
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y)

with mlflow.start_run():
    # Define pipeline
    numeric_features = ['amount']
Пример #11
0
def test_dataframe_from_json():
    """Round-trip DataFrames through JSON with ``_dataframe_from_json``.

    Three schema flavours are covered, each with the "split" and "records"
    orients: a column schema, a tensor schema with one named tensor per
    column, and a tensor schema consisting of a single unnamed tensor.
    """
    orients = ("split", "records")
    column_order = [
        "boolean",
        "string",
        "float",
        "double",
        "integer",
        "long",
        "binary",
        "date_string",
    ]
    source = pd.DataFrame(
        {
            "boolean": [True, False, True],
            "string": ["a", "b", "c"],
            "float": np.array([1.2, 2.3, 3.4], dtype=np.float32),
            "double": np.array([1.2, 2.3, 3.4], dtype=np.float64),
            "integer": np.array([3, 4, 5], dtype=np.int32),
            "long": np.array([3, 4, 5], dtype=np.int64),
            "binary": [bytes([1, 2, 3]), bytes([4, 5]), bytes([6])],
            "date_string": ["2018-02-03", "1996-03-02", "2021-03-05"],
        },
        columns=column_order,
    )

    # Raw bytes are not JSON-serializable, so the binary column is base64-encoded.
    jsonable_df = pd.DataFrame(source, copy=True)
    jsonable_df["binary"] = jsonable_df["binary"].map(base64.b64encode)

    # Every column is typed after its own name, except date_string which is a string.
    col_specs = [ColSpec(name, name) for name in column_order[:-1]]
    col_specs.append(ColSpec("string", "date_string"))
    schema = Schema(col_specs)
    for orient in orients:
        parsed = _dataframe_from_json(
            jsonable_df.to_json(orient=orient), pandas_orient=orient, schema=schema
        )
        assert parsed.equals(source)

    # try parsing with tensor schema
    tensor_columns = [
        (np.dtype("bool"), "boolean"),
        (np.dtype("str"), "string"),
        (np.dtype("float32"), "float"),
        (np.dtype("float64"), "double"),
        (np.dtype("int32"), "integer"),
        (np.dtype("int64"), "long"),
        (np.dtype(bytes), "binary"),
    ]
    tensor_schema = Schema([TensorSpec(dtype, [-1], name) for dtype, name in tensor_columns])
    for orient in orients:
        parsed = _dataframe_from_json(
            jsonable_df.to_json(orient=orient), pandas_orient=orient, schema=tensor_schema
        )
        # NB: tensor schema does not automatically decode base64 encoded bytes.
        assert parsed.equals(jsonable_df)

    # Test parse with TensorSchema with a single tensor
    tensor_schema = Schema([TensorSpec(np.dtype("float32"), [-1, 3])])
    source = pd.DataFrame(
        {
            "a": np.array([1, 2, 3], dtype=np.float32),
            "b": np.array([4.1, 5.2, 6.3], dtype=np.float32),
            "c": np.array([7, 8, 9], dtype=np.float32),
        },
        columns=["a", "b", "c"],
    )
    for orient in orients:
        parsed = _dataframe_from_json(
            source.to_json(orient=orient), pandas_orient=orient, schema=tensor_schema
        )
        assert source.equals(parsed)
Пример #12
0
def test_parse_tf_serving_dictionary():
    """Parse TF-serving style requests given as "instances" and as "inputs".

    Each request format is parsed without a schema, with a tensor schema and
    with a column schema; the two schema variants must give the same result.
    """

    def assert_result(result, expected_result):
        # Same keys, and for each key the same values and the same dtype.
        assert result.keys() == expected_result.keys()
        for key, actual in result.items():
            wanted = expected_result[key]
            assert (actual == wanted).all()
            assert actual.dtype == wanted.dtype

    rows = [
        ("s1", 1.1, [1, 2, 3]),
        ("s2", 2.2, [4, 5, 6]),
        ("s3", 3.3, [7, 8, 9]),
    ]
    # instances are correctly aggregated to dict of input name -> tensor
    instances_request = {"instances": [{"a": a, "b": b, "c": c} for a, b, c in rows]}
    # input provided as a dict
    inputs_request = {
        "inputs": {
            "a": ["s1", "s2", "s3"],
            "b": [1.1, 2.2, 3.3],
            "c": [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        }
    }

    expected_result_no_schema = {
        "a": np.array(["s1", "s2", "s3"]),
        "b": np.array([1.1, 2.2, 3.3]),
        "c": np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    }
    expected_result_schema = {
        "a": np.array(["s1", "s2", "s3"], dtype=np.dtype("str")),
        "b": np.array([1.1, 2.2, 3.3], dtype="float32"),
        "c": np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype="int32"),
    }

    schema = Schema([
        TensorSpec(np.dtype("str"), [-1], "a"),
        TensorSpec(np.dtype("float32"), [-1], "b"),
        TensorSpec(np.dtype("int32"), [-1], "c"),
    ])
    df_schema = Schema([
        ColSpec("string", "a"),
        ColSpec("float", "b"),
        ColSpec("integer", "c"),
    ])

    for tfserving_input in (instances_request, inputs_request):
        # Without Schema
        assert_result(parse_tf_serving_input(tfserving_input), expected_result_no_schema)
        # With Schema
        assert_result(parse_tf_serving_input(tfserving_input, schema), expected_result_schema)
        # With df Schema
        assert_result(parse_tf_serving_input(tfserving_input, df_schema), expected_result_schema)
Пример #13
0
# Silence all warnings for the duration of the script.
warnings.filterwarnings("ignore")
# The model id is read from the second command-line argument.
model_id = int(sys.argv[2])

# Prepare dataset: download the CSV, clean it, and split it for training.
csv_url = "https://raw.githubusercontent.com/prinz-nussknacker/banksim1/master/bs140513_032310.csv"
try:
    data = pd.read_csv(csv_url, sep=",", quotechar="'", header=0)
except Exception as e:
    # Top-level boundary of the script: log the failure (with traceback) and
    # stop with a non-zero exit status.
    logger.exception("Could not read CSV file: %s", e)
    sys.exit(1)

# DataFrame.dropna() returns a new frame, so the result has to be kept;
# calling it without assignment leaves rows with missing values in place.
data = data.dropna()
data = data.drop(["step", "customer", "zipcodeOri", "merchant", "zipMerchant"], axis="columns")

# Model signature: four named input columns and one unnamed integer output.
input_schema = Schema([
    ColSpec("string", "age"),
    ColSpec("string", "gender"),
    ColSpec("string", "category"),
    ColSpec("double", "amount")
])
output_schema = Schema([
    ColSpec("integer")
])
signature = ModelSignature(inputs=input_schema, outputs=output_schema)

# Prepare train and test sets ("fraud" is the target column)
data_x = data.drop(["fraud"], axis="columns")
data_y = data[["fraud"]]
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y)

with mlflow.start_run():