示例#1
0
def infer_signature(model_input: Any, model_output: MlflowInferableDataset=None) -> ModelSignature:
    """
    Build an MLflow model signature from example model input and, optionally, example output.

    A signature describes the model's input and output as data frames whose columns may be
    named and whose data types are taken from :py:class:`mlflow.types.DataType`. An exception is
    raised when the supplied data holds incompatible types or is not in one of the supported
    formats below.

    Supported input formats:
      - pandas.DataFrame
      - dictionary of { name -> numpy.ndarray}
      - numpy.ndarray
      - pyspark.sql.DataFrame

    Element types must be mappable to one of :py:class:`mlflow.types.DataType`.

    NOTE: Multidimensional (>2d) arrays (aka tensors) are not supported at this time.

    :param model_input: Valid input to the model, e.g. (a subset of) the training dataset.
    :param model_output: Valid model output, e.g. model predictions for the (subset of) training
                         dataset. When ``None``, the signature carries no output schema.
    :return: ModelSignature
    """
    input_schema = _infer_schema(model_input)
    # The output schema is optional: only infer it when predictions were supplied.
    if model_output is None:
        output_schema = None
    else:
        output_schema = _infer_schema(model_output)
    return ModelSignature(input_schema, output_schema)
示例#2
0
def test_spark_schema_inference(pandas_df_with_all_types):
    """Schema inferred from a Spark DataFrame must match the one inferred from pandas."""
    import pyspark
    from pyspark.sql.types import _parse_datatype_string, StructField, StructType

    def _nullable_field(data_type):
        # pyspark _parse_datatype_string() expects "timestamp" instead of "datetime"
        if data_type == DataType.datetime:
            return StructField("datetime", _parse_datatype_string("timestamp"),
                               True)
        return StructField(data_type.name,
                           _parse_datatype_string(data_type.name), True)

    # Only the non-"_ext" columns take part in this test.
    basic_df = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"])

    pandas_schema = _infer_schema(basic_df)
    assert pandas_schema == Schema(
        [ColSpec(name, name) for name in basic_df.columns])

    session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    spark_schema = StructType(
        [_nullable_field(t) for t in pandas_schema.input_types()])
    spark_df = session.createDataFrame(basic_df, schema=spark_schema)

    spark_inferred = _infer_schema(spark_df)
    assert spark_inferred == Schema(
        [ColSpec(name, name) for name in basic_df.columns])
示例#3
0
def test_get_tensor_shape(dict_of_ndarrays):
    """Check ``_get_tensor_shape`` for default, explicit, ``None`` and invalid variable dimensions."""
    # Without an explicit variable dimension, the first dimension is reported as -1.
    assert all(-1 == _get_tensor_shape(tensor)[0]
               for tensor in dict_of_ndarrays.values())

    data = dict_of_ndarrays["4D"]
    # Specify variable dimension
    for i in range(-4, 4):
        assert _get_tensor_shape(data, i)[i] == -1

    # Specify None: no dimension may be reported as variable. Each dimension is
    # checked individually; comparing the whole shape object to -1 is always
    # "not equal" and would therefore assert nothing.
    assert all(dim != -1 for dim in _get_tensor_shape(data, None))

    # Out of bounds
    with pytest.raises(
            MlflowException,
            match="The specified variable_dimension 10 is out of bounds"):
        _get_tensor_shape(data, 10)
    with pytest.raises(
            MlflowException,
            match="The specified variable_dimension -10 is out of bounds"):
        _get_tensor_shape(data, -10)

    # Dictionary values that are not numpy arrays are rejected by schema inference.
    with pytest.raises(
            TypeError,
            match="Data in the dictionary must be of type numpy.ndarray"):
        _infer_schema({"x": 1})
示例#4
0
def test_schema_inference_on_dictionary(dict_of_ndarrays):
    """A dict of ndarrays yields one named TensorSpec per entry; non-ndarray values raise."""
    # test dictionary
    inferred = _infer_schema(dict_of_ndarrays)
    expected_specs = []
    for key, array in dict_of_ndarrays.items():
        expected_specs.append(
            TensorSpec(array.dtype, _get_tensor_shape(array), key))
    assert inferred == Schema(expected_specs)

    # test exception is raised if non-numpy data in dictionary
    for invalid in ({"x": 1}, {"x": [1]}):
        with pytest.raises(TypeError):
            _infer_schema(invalid)
示例#5
0
def test_spark_type_mapping(pandas_df_with_all_types):
    """Verify the DataType -> Spark type mapping and ``Schema.as_spark_schema``."""
    import pyspark
    from pyspark.sql.types import (
        BooleanType,
        IntegerType,
        LongType,
        FloatType,
        DoubleType,
        StringType,
        BinaryType,
        TimestampType,
    )
    from pyspark.sql.types import StructField, StructType

    # Each MLflow data type must convert to an instance of the paired Spark type.
    expected_spark_classes = [
        (DataType.boolean, BooleanType),
        (DataType.integer, IntegerType),
        (DataType.long, LongType),
        (DataType.float, FloatType),
        (DataType.double, DoubleType),
        (DataType.string, StringType),
        (DataType.binary, BinaryType),
        (DataType.datetime, TimestampType),
    ]
    for mlflow_type, spark_class in expected_spark_classes:
        assert isinstance(mlflow_type.to_spark(), spark_class)

    basic_df = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"])
    schema = _infer_schema(basic_df)
    named_fields = [
        StructField(t.name, t.to_spark(), True) for t in schema.input_types()
    ]
    expected_spark_schema = StructType(named_fields)
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()

    # Round trip: a Spark frame built with the converted schema infers the same schema.
    session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    spark_df = session.createDataFrame(basic_df, schema=actual_spark_schema)
    round_tripped = _infer_schema(spark_df)
    assert schema == round_tripped

    # test unnamed columns
    unnamed_schema = Schema([ColSpec(col.type) for col in schema.inputs])
    positional_fields = [
        StructField(str(position), t.to_spark(), True)
        for position, t in enumerate(unnamed_schema.input_types())
    ]
    expected_unnamed = StructType(positional_fields)
    actual_unnamed = unnamed_schema.as_spark_schema()
    assert expected_unnamed.jsonValue() == actual_unnamed.jsonValue()

    # test single unnamed column is mapped to just a single spark type
    single_column_schema = Schema([ColSpec(DataType.integer)])
    assert isinstance(single_column_schema.as_spark_schema(), IntegerType)
示例#6
0
def test_schema_inference_on_dataframe(pandas_df_with_all_types):
    """Infer schemas for the basic columns and for the ``*_ext`` columns separately."""
    # Basic columns: the test expects each column name to double as its type name.
    basic_df = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"])
    basic_schema = _infer_schema(basic_df)
    assert basic_schema == Schema(
        [ColSpec(name, name) for name in basic_df.columns])

    # "_ext" columns: expected to map onto boolean / long / string.
    ext_df = pandas_df_with_all_types[[
        "boolean_ext", "integer_ext", "string_ext"
    ]].copy()
    expected_ext_types = [
        (DataType.boolean, "boolean_ext"),
        (DataType.long, "integer_ext"),
        (DataType.string, "string_ext"),
    ]
    expected_schema = Schema(
        [ColSpec(col_type, col_name) for col_type, col_name in expected_ext_types])
    assert _infer_schema(ext_df) == expected_schema
示例#7
0
def test_spark_schema_inference(pandas_df_with_all_types):
    """Schema inferred from a Spark DataFrame must match the one inferred from pandas."""
    import pyspark
    from pyspark.sql.types import _parse_datatype_string, StructField, StructType

    # Only the non-"_ext" columns take part in this test.
    basic_df = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"]
    )

    pandas_schema = _infer_schema(basic_df)
    assert pandas_schema == Schema([ColSpec(name, name) for name in basic_df.columns])

    session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    # Build one nullable Spark field per inferred column type, parsed from the type's name.
    fields = []
    for column_type in pandas_schema.column_types():
        spark_type = _parse_datatype_string(column_type.name)
        fields.append(StructField(column_type.name, spark_type, True))
    spark_df = session.createDataFrame(basic_df, schema=StructType(fields))

    spark_inferred = _infer_schema(spark_df)
    assert spark_inferred == Schema([ColSpec(name, name) for name in basic_df.columns])
示例#8
0
def test_schema_inference_on_dictionary(pandas_df_with_all_types):
    """A dict of column arrays infers one typed, named column per key; other values raise."""
    # test dictionary
    arrays_by_name = {}
    for name in pandas_df_with_all_types.columns:
        arrays_by_name[name] = pandas_df_with_all_types[name].values
    schema = _infer_schema(arrays_by_name)

    inferred_types = dict(zip(schema.column_names(), schema.column_types()))
    # The test expects each column name to be the name of its DataType member.
    expected_types = {
        name: DataType[name]
        for name in pandas_df_with_all_types.columns
    }
    assert inferred_types == expected_types

    # test exception is raised if non-numpy data in dictionary
    for invalid in ({"x": 1}, {"x": [1]}):
        with pytest.raises(TypeError):
            _infer_schema(invalid)
示例#9
0
 def test_dtype(nparray, dtype):
     """Infer a 1-D tensor schema, round-trip its spec through a dict, then enforce it."""
     inferred = _infer_schema(nparray)
     expected = Schema([TensorSpec(np.dtype(dtype), (-1, ))])
     assert inferred == expected

     # The spec must survive serialization to a dict and back.
     tensor_spec = inferred.inputs[0]
     round_tripped = TensorSpec.from_json_dict(**tensor_spec.to_dict())
     assert tensor_spec == round_tripped

     # Enforcing the spec on the original array must yield a numpy array.
     assert isinstance(_enforce_tensor_spec(nparray, tensor_spec), np.ndarray)
示例#10
0
def test_get_schema_type(dict_of_ndarrays):
    """A tensor schema reports numpy types and rejects column, pandas and Spark conversions."""
    schema = _infer_schema(dict_of_ndarrays)
    expected_numpy_types = ["float64"] * 4
    assert expected_numpy_types == schema.numpy_types()

    # (method name, expected error message) for each conversion that must be refused.
    rejected_conversions = (
        ("column_types", "TensorSpec only supports numpy types"),
        ("pandas_types", "TensorSpec only supports numpy types"),
        ("as_spark_schema", "TensorSpec cannot be converted to spark dataframe"),
    )
    for method_name, message in rejected_conversions:
        with pytest.raises(MlflowException, match=message):
            getattr(schema, method_name)()
示例#11
0
def test_get_tensor_shape(dict_of_ndarrays):
    """Check ``_get_tensor_shape`` for default, explicit, ``None`` and invalid variable dimensions."""
    # Without an explicit variable dimension, the first dimension is reported as -1.
    assert all(-1 == _get_tensor_shape(tensor)[0] for tensor in dict_of_ndarrays.values())

    data = dict_of_ndarrays["4D"]
    # Specify variable dimension
    for i in range(-4, 4):
        assert _get_tensor_shape(data, i)[i] == -1

    # Specify None: no dimension may be reported as variable. Each dimension is checked
    # individually; comparing the whole shape object to -1 is always "not equal" and would
    # therefore assert nothing.
    assert all(dim != -1 for dim in _get_tensor_shape(data, None))

    # Out of bounds
    with pytest.raises(MlflowException):
        _get_tensor_shape(data, 10)
    with pytest.raises(MlflowException):
        _get_tensor_shape(data, -10)

    # Dictionary values that are not numpy arrays are rejected by schema inference.
    with pytest.raises(TypeError):
        _infer_schema({"x": 1})
示例#12
0
def test_schema_inference_on_numpy_array(pandas_df_with_all_types):
    """1-D numpy arrays infer a single unnamed TensorSpec with shape ``(-1,)`` and the array dtype."""

    def _assert_vector_schema(array, expected_dtype):
        # Inference runs first, then the expected single-tensor schema is built and compared.
        assert _infer_schema(array) == Schema(
            [TensorSpec(expected_dtype, (-1, ))])

    for column_name in pandas_df_with_all_types:
        values = pandas_df_with_all_types[column_name].to_numpy()
        inferred = _infer_schema(values)
        assert inferred == Schema([TensorSpec(type=values.dtype, shape=(-1, ))])

    # test boolean
    _assert_vector_schema(np.array([True, False, True], dtype=np.bool_),
                          np.dtype(np.bool_))

    # test bytes
    _assert_vector_schema(np.array([bytes([1])], dtype=np.bytes_),
                          np.dtype("S1"))

    # test (u)ints
    integer_types = (np.uint8, np.int8, np.uint16, np.int16, np.uint32,
                     np.int32, np.uint64, np.int64)
    for int_type in integer_types:
        _assert_vector_schema(np.array([1, 2, 3], dtype=int_type),
                              np.dtype(int_type))

    # test floats; float128 is only exercised where this numpy build provides it
    float_types = [np.float16, np.float32, np.float64]
    if hasattr(np, "float128"):
        float_types.append(np.float128)
    for float_type in float_types:
        _assert_vector_schema(np.array([1.1, 2.2, 3.3], dtype=float_type),
                              np.dtype(float_type))
示例#13
0
def test_that_schema_inference_with_tensors_raises_exception():
    """Multidimensional array input must be rejected, whether bare, in a DataFrame cell or in a dict."""
    tensor_inputs = (
        np.array([[[1, 2, 3]]], dtype=np.int64),
        pd.DataFrame({"x": [np.array([[1, 2, 3]], dtype=np.int64)]}),
        {"x": np.array([[1, 2, 3]], dtype=np.int64)},
    )
    for candidate in tensor_inputs:
        with pytest.raises(MlflowException):
            _infer_schema(candidate)
示例#14
0
def test_schema_inference_on_dataframe(pandas_df_with_all_types):
    """Every DataFrame column must infer a ColSpec whose type and name both equal the column name."""
    inferred = _infer_schema(pandas_df_with_all_types)
    expected_columns = []
    for name in pandas_df_with_all_types.columns:
        expected_columns.append(ColSpec(name, name))
    assert inferred == Schema(expected_columns)
示例#15
0
def test_schema_inference_on_basic_numpy(pandas_df_with_all_types):
    """Each DataFrame column, converted to numpy, infers one ``(-1,)`` TensorSpec of its dtype."""
    for column_name in pandas_df_with_all_types:
        values = pandas_df_with_all_types[column_name].to_numpy()
        inferred = _infer_schema(values)
        expected = Schema([TensorSpec(type=values.dtype, shape=(-1, ))])
        assert inferred == expected
示例#16
0
def test_schema_inference_on_pandas_series():
    """Check the single-column schema inferred from pandas Series of various numpy dtypes."""

    def _infer_from_series(values, dtype):
        # Wrap the raw values in a typed numpy array, then a Series, and infer its schema.
        return _infer_schema(pd.Series(np.array(values, dtype=dtype)))

    def _one_column(column_type):
        return Schema([ColSpec(column_type)])

    # test objects
    object_cases = [
        (["a"], DataType.string),
        ([bytes([1])], DataType.binary),
        ([bytearray([1]), None], DataType.binary),
        ([True, None], DataType.string),
        ([1.1, None], DataType.double),
    ]
    for values, expected_type in object_cases:
        schema = _infer_from_series(values, object)
        assert schema == _one_column(expected_type)

    # test bytes
    schema = _infer_from_series([bytes([1])], np.bytes_)
    assert schema == _one_column(DataType.binary)

    # test string
    schema = _infer_from_series(["a"], str)
    assert schema == _one_column(DataType.string)

    # test boolean
    schema = _infer_from_series([True], bool)
    assert schema == _one_column(DataType.boolean)

    # test ints
    for int_type in (np.uint8, np.uint16, np.int8, np.int16, np.int32):
        schema = _infer_from_series([1, 2, 3], int_type)
        assert schema == _one_column("integer")

    # test longs
    for long_type in (np.uint32, np.int64):
        schema = _infer_from_series([1, 2, 3], long_type)
        assert schema == _one_column("long")

    # unsigned long is unsupported
    with pytest.raises(MlflowException, match="Unsupported numpy data type"):
        _infer_from_series([1, 2, 3], np.uint64)

    # test floats
    for float_type in (np.float16, np.float32):
        schema = _infer_from_series([1.1, 2.2, 3.3], float_type)
        assert schema == _one_column("float")

    # test doubles
    schema = _infer_from_series([1.1, 2.2, 3.3], np.float64)
    assert schema == _one_column("double")

    # test datetime
    timestamps = [
        "2021-01-01 00:00:00", "2021-02-02 00:00:00", "2021-03-03 12:00:00"
    ]
    schema = _infer_from_series(timestamps, "datetime64")
    assert schema == _one_column("datetime")

    # unsupported
    if hasattr(np, "float128"):
        with pytest.raises(MlflowException,
                           match="Unsupported numpy data type"):
            _infer_from_series([1, 2, 3], np.float128)
示例#17
0
def test_schema_inference_on_numpy_array(pandas_df_with_all_types):
    """Check the column-based schema inferred from numpy arrays of various dtypes.

    The builtin ``object``, ``str`` and ``bool`` are used as dtypes instead of the
    ``np.object`` / ``np.str`` / ``np.bool`` aliases. Those aliases were plain
    references to the builtins, were deprecated in NumPy 1.20 and removed in
    NumPy 1.24, where accessing them raises ``AttributeError``.
    """
    # drop int and float as we lose type size information when storing as objects and defaults are
    # 64b.
    pandas_df_with_all_types = pandas_df_with_all_types.drop(
        columns=["integer", "float"])
    schema = _infer_schema(pandas_df_with_all_types.values)
    assert schema == Schema(
        [ColSpec(x) for x in pandas_df_with_all_types.columns])

    # test objects
    schema = _infer_schema(np.array(["a"], dtype=object))
    assert schema == Schema([ColSpec(DataType.string)])
    schema = _infer_schema(np.array([bytes([1])], dtype=object))
    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(np.array([bytearray([1]), None], dtype=object))
    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(np.array([True, None], dtype=object))
    assert schema == Schema([ColSpec(DataType.boolean)])
    schema = _infer_schema(np.array([1.1, None], dtype=object))
    assert schema == Schema([ColSpec(DataType.double)])

    # test bytes
    schema = _infer_schema(np.array([bytes([1])], dtype=np.bytes_))
    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(np.array([bytearray([1])], dtype=np.bytes_))
    assert schema == Schema([ColSpec(DataType.binary)])

    # test string
    schema = _infer_schema(np.array(["a"], dtype=str))
    assert schema == Schema([ColSpec(DataType.string)])

    # test boolean
    schema = _infer_schema(np.array([True], dtype=bool))
    assert schema == Schema([ColSpec(DataType.boolean)])

    # test ints
    for t in [np.uint8, np.uint16, np.int8, np.int16, np.int32]:
        schema = _infer_schema(np.array([1, 2, 3], dtype=t))
        assert schema == Schema([ColSpec("integer")])

    # test longs
    for t in [np.uint32, np.int64]:
        schema = _infer_schema(np.array([1, 2, 3], dtype=t))
        assert schema == Schema([ColSpec("long")])

    # unsigned long is unsupported
    with pytest.raises(MlflowException):
        _infer_schema(np.array([1, 2, 3], dtype=np.uint64))

    # test floats
    for t in [np.float16, np.float32]:
        schema = _infer_schema(np.array([1.1, 2.2, 3.3], dtype=t))
        assert schema == Schema([ColSpec("float")])

    # test doubles
    schema = _infer_schema(np.array([1.1, 2.2, 3.3], dtype=np.float64))
    assert schema == Schema([ColSpec("double")])

    # unsupported; float128 only exists on some numpy builds, hence the guard
    if hasattr(np, "float128"):
        with pytest.raises(MlflowException):
            _infer_schema(np.array([1, 2, 3], dtype=np.float128))
示例#18
0
def test_get_sparse_matrix_data_type_and_shape(dict_of_sparse_matrix):
    """Every sparse matrix in the fixture must infer a float64 tensor of shape ``(-1, 8)``."""
    for matrix in dict_of_sparse_matrix.values():
        inferred = _infer_schema(matrix)
        reported_types = inferred.numpy_types()
        assert reported_types == ["float64"]
        reported_shape = _get_tensor_shape(matrix)
        assert reported_shape == (-1, 8)