예제 #1
0
def test_model_signature_with_colspec():
    """Check ``ModelSignature`` equality and its dict/JSON round trip.

    All signatures share the same unnamed boolean + binary input columns;
    they differ only in their outputs.
    """

    def make_inputs():
        # A fresh Schema per call so no two signatures share one object.
        return Schema([ColSpec(DataType.boolean), ColSpec(DataType.binary)])

    def make_outputs(first_type):
        # Two unnamed output columns; only the first column's type varies.
        return Schema([
            ColSpec(name=None, type=first_type),
            ColSpec(name=None, type=DataType.double),
        ])

    def json_round_trip(sig):
        # Serialize through to_dict + JSON text and rebuild via from_dict.
        payload = json.dumps(sig.to_dict())
        return ModelSignature.from_dict(json.loads(payload))

    # Signatures built from equal schemas compare equal.
    base = ModelSignature(inputs=make_inputs(),
                          outputs=make_outputs(DataType.double))
    twin = ModelSignature(inputs=make_inputs(),
                          outputs=make_outputs(DataType.double))
    assert base == twin

    # Changing a single output column type breaks equality.
    float_first = ModelSignature(inputs=make_inputs(),
                                 outputs=make_outputs(DataType.float))
    assert float_first != base

    # A full signature survives the JSON round trip.
    assert base == json_round_trip(base)

    # So does a signature that declares no outputs.
    inputs_only = ModelSignature(inputs=make_inputs(), outputs=None)
    assert inputs_only == json_round_trip(inputs_only)
예제 #2
0
def test_schema_inference_on_pandas_series():
    """Check the single-column schema ``_infer_schema`` yields for a ``pd.Series``.

    Covers object-dtype series (including ones holding ``None``), bytes,
    string, boolean, the integer/long and float/double width splits, and the
    dtypes that are expected to be rejected with ``MlflowException``.

    The builtins ``object``, ``str`` and ``bool`` are used as dtypes instead of
    the ``np.object`` / ``np.str`` / ``np.bool`` aliases: those aliases were
    deprecated in NumPy 1.20 and removed in 1.24, and always referred to the
    very same builtins.
    """
    # test objects
    schema = _infer_schema(pd.Series(np.array(["a"], dtype=object)))
    assert schema == Schema([ColSpec(DataType.string)])
    schema = _infer_schema(pd.Series(np.array([bytes([1])], dtype=object)))
    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(
        pd.Series(np.array([bytearray([1]), None], dtype=object)))
    assert schema == Schema([ColSpec(DataType.binary)])
    # For a Series, an object column mixing a bool with None is expected to be
    # reported as string (not boolean).
    schema = _infer_schema(pd.Series(np.array([True, None], dtype=object)))
    assert schema == Schema([ColSpec(DataType.string)])
    schema = _infer_schema(pd.Series(np.array([1.1, None], dtype=object)))
    assert schema == Schema([ColSpec(DataType.double)])

    # test bytes
    schema = _infer_schema(pd.Series(np.array([bytes([1])], dtype=np.bytes_)))
    assert schema == Schema([ColSpec(DataType.binary)])

    # test string
    schema = _infer_schema(pd.Series(np.array(["a"], dtype=str)))
    assert schema == Schema([ColSpec(DataType.string)])

    # test boolean
    schema = _infer_schema(pd.Series(np.array([True], dtype=bool)))
    assert schema == Schema([ColSpec(DataType.boolean)])

    # test ints
    for t in [np.uint8, np.uint16, np.int8, np.int16, np.int32]:
        schema = _infer_schema(pd.Series(np.array([1, 2, 3], dtype=t)))
        assert schema == Schema([ColSpec("integer")])

    # test longs
    for t in [np.uint32, np.int64]:
        schema = _infer_schema(pd.Series(np.array([1, 2, 3], dtype=t)))
        assert schema == Schema([ColSpec("long")])

    # unsigned long is unsupported
    with pytest.raises(MlflowException):
        _infer_schema(pd.Series(np.array([1, 2, 3], dtype=np.uint64)))

    # test floats
    for t in [np.float16, np.float32]:
        schema = _infer_schema(pd.Series(np.array([1.1, 2.2, 3.3], dtype=t)))
        assert schema == Schema([ColSpec("float")])

    # test doubles
    schema = _infer_schema(
        pd.Series(np.array([1.1, 2.2, 3.3], dtype=np.float64)))
    assert schema == Schema([ColSpec("double")])

    # unsupported (np.float128 only exists on some platforms, hence the guard)
    if hasattr(np, "float128"):
        with pytest.raises(MlflowException):
            _infer_schema(pd.Series(np.array([1, 2, 3], dtype=np.float128)))
예제 #3
0
def _infer_schema(data: Any) -> Schema:
    """
    Build an MLflow :py:class:`Schema` describing ``data``.

    Dictionaries of numpy arrays, bare numpy arrays and scipy csc/csr matrices are described
    with :py:class:`TensorSpec` entries (element type, shape and, for dictionaries, the key as
    the tensor name). pandas and Spark inputs are described with :py:class:`ColSpec` entries
    (column type and, for data frames, the column name).

    Accepted containers:
      - pandas.DataFrame or pandas.Series
      - dictionary of { name -> numpy.ndarray}
      - numpy.ndarray
      - pyspark.sql.DataFrame
      - csc/csr matrix

    When the resulting schema is column based and contains an integer or long column, a
    warning about integers being unable to represent missing values is emitted.

    :param data: Dataset to infer from.

    :return: Schema

    :raises TypeError: if ``data`` is not one of the accepted containers, or if a dictionary
                       value is not a ``numpy.ndarray``.
    """
    from scipy.sparse import csr_matrix, csc_matrix

    if isinstance(data, dict):
        tensor_specs = []
        for key, value in data.items():
            if not isinstance(value, np.ndarray):
                raise TypeError(
                    "Data in the dictionary must be of type numpy.ndarray")
            # One named tensor per dictionary entry.
            spec = TensorSpec(
                type=clean_tensor_type(value.dtype),
                shape=_get_tensor_shape(value),
                name=key,
            )
            tensor_specs.append(spec)
        schema = Schema(tensor_specs)
    elif isinstance(data, pd.Series):
        # A series is a single, unnamed column.
        series_type = _infer_pandas_column(data)
        schema = Schema([ColSpec(type=series_type)])
    elif isinstance(data, pd.DataFrame):
        col_specs = []
        for label in data.columns:
            col_specs.append(
                ColSpec(type=_infer_pandas_column(data[label]), name=label))
        schema = Schema(col_specs)
    elif isinstance(data, np.ndarray):
        # A bare array is a single, unnamed tensor.
        array_spec = TensorSpec(type=clean_tensor_type(data.dtype),
                                shape=_get_tensor_shape(data))
        schema = Schema([array_spec])
    elif isinstance(data, (csc_matrix, csr_matrix)):
        # Sparse matrices take their element type from the stored values.
        sparse_spec = TensorSpec(type=clean_tensor_type(data.data.dtype),
                                 shape=_get_tensor_shape(data))
        schema = Schema([sparse_spec])
    elif _is_spark_df(data):
        spark_specs = []
        for field in data.schema.fields:
            spark_specs.append(
                ColSpec(type=_infer_spark_type(field.dataType),
                        name=field.name))
        schema = Schema(spark_specs)
    else:
        raise TypeError(
            "Expected one of (pandas.DataFrame, numpy array, "
            "dictionary of (name -> numpy.ndarray), pyspark.sql.DataFrame) "
            "but got '{}'".format(type(data)))

    # The integer hint only applies to column-based schemas.
    if schema.is_tensor_spec():
        return schema

    integer_types = (DataType.integer, DataType.long)
    if any(t in integer_types for t in schema.input_types()):
        warnings.warn(
            "Hint: Inferred schema contains integer column(s). Integer columns in "
            "Python cannot represent missing values. If your input data contains "
            "missing values at inference time, it will be encoded as floats and will "
            "cause a schema enforcement error. The best way to avoid this problem is "
            "to infer the model schema based on a realistic data sample (training "
            "dataset) that includes missing values. Alternatively, you can declare "
            "integer columns as doubles (float64) whenever these columns may have "
            "missing values. See `Handling Integers With Missing Values "
            "<https://www.mlflow.org/docs/latest/models.html#"
            "handling-integers-with-missing-values>`_ for more details.",
            stacklevel=2,
        )
    return schema
예제 #4
0
def test_col_spec():
    """Exercise ``ColSpec`` equality, type validation and dict/JSON round trips."""

    def via_json(spec):
        # Rebuild a ColSpec from the JSON text of its dict form.
        return ColSpec(**json.loads(json.dumps(spec.to_dict())))

    # The type may be given as a string or as a DataType member.
    str_a = ColSpec("string", "a")
    enum_a = ColSpec(DataType.string, "a")
    int_a = ColSpec(DataType.integer, "a")
    assert str_a != int_a  # same name, different type
    str_b = ColSpec(DataType.string, "b")
    assert str_b != str_a  # same type, different name
    assert str_a == enum_a

    # An unknown type name is rejected with a descriptive message.
    with pytest.raises(MlflowException) as ex:
        ColSpec("unsupported")
    assert "Unsupported type 'unsupported'" in ex.value.message

    # Named column: to_dict output feeds straight back into the constructor.
    rebuilt = ColSpec(**str_a.to_dict())
    assert rebuilt == str_a
    assert via_json(str_a) == str_a

    # Unnamed column: omitting the name equals passing None explicitly.
    unnamed = ColSpec("string")
    unnamed_explicit = ColSpec("string", None)
    assert unnamed == unnamed_explicit
    assert via_json(unnamed) == unnamed
예제 #5
0
파일: utils.py 프로젝트: nash-lian/mlflow
def _infer_schema(data: Any) -> Schema:
    """
    Infer an MLflow schema from a dataset.

    This method captures the column names and data types from the user data. The signature
    represents model input and output as data frames with (optionally) named columns and data
    type specified as one of types defined in :py:class:`DataType`. This method will raise
    an exception if the user data contains incompatible types or is not passed in one of the
    supported formats (containers).

    The input should be one of these:
      - pandas.DataFrame or pandas.Series
      - dictionary of { name -> numpy.ndarray}
      - numpy.ndarray
      - pyspark.sql.DataFrame

    The element types should be mappable to one of :py:class:`mlflow.models.signature.DataType`.

    NOTE: Multidimensional (>2d) arrays (aka tensors) are not supported at this time.

    When the inferred schema contains an integer or long column, a warning is emitted about
    integers being unable to represent missing values.

    :param data: Dataset to infer from.

    :return: Schema

    :raises TypeError: if ``data`` is not one of the supported containers, if a dictionary
                       value is not a ``numpy.ndarray``, or if a (non-object) numpy array is
                       0-dimensional.
    :raises TensorsNotSupportedException: if a dictionary value is not 1-dimensional, or a
                                          numpy array has more than 2 dimensions.
    """

    if isinstance(data, dict):
        res = []
        for col, ary in data.items():
            if not isinstance(ary, np.ndarray):
                raise TypeError(
                    "Data in the dictionary must be of type numpy.ndarray")
            if len(ary.shape) != 1:
                raise TensorsNotSupportedException(
                    "Data in the dictionary must be 1-dimensional, "
                    "got shape {}".format(ary.shape))
            # Each dictionary entry becomes one named column.
            res.append(ColSpec(type=_infer_numpy_array(ary), name=col))
        schema = Schema(res)
    elif isinstance(data, pd.Series):
        schema = Schema([ColSpec(type=_infer_pandas_column(data))])
    elif isinstance(data, pd.DataFrame):
        schema = Schema([
            ColSpec(type=_infer_pandas_column(data[col]), name=col)
            for col in data.columns
        ])
    elif isinstance(data, np.ndarray):
        if len(data.shape) > 2:
            raise TensorsNotSupportedException(
                "Attempting to infer schema from numpy array with "
                "shape {}".format(data.shape))
        # Compare against the builtin ``object``: the ``np.object`` alias was deprecated in
        # NumPy 1.20 and removed in 1.24, and was the same builtin all along.
        if data.dtype == object:
            # Object arrays carry no usable dtype, so go through pandas and infer each
            # column from its values.
            data = pd.DataFrame(data).infer_objects()
            schema = Schema([
                ColSpec(type=_infer_numpy_array(data[col].values))
                for col in data.columns
            ])
        elif len(data.shape) == 1:
            schema = Schema([ColSpec(type=_infer_numpy_dtype(data.dtype))])
        elif len(data.shape) == 2:
            # A homogeneous 2d array has one unnamed column per array column, all sharing
            # the array's dtype.
            schema = Schema([ColSpec(type=_infer_numpy_dtype(data.dtype))] *
                            data.shape[1])
        else:
            # 0-dimensional arrays used to fall through every branch and fail later with an
            # UnboundLocalError on ``schema``; reject them explicitly instead.
            raise TypeError(
                "Attempting to infer schema from a 0-dimensional numpy array; "
                "expected a 1-dimensional or 2-dimensional array")
    elif _is_spark_df(data):
        schema = Schema([
            ColSpec(type=_infer_spark_type(field.dataType), name=field.name)
            for field in data.schema.fields
        ])
    else:
        raise TypeError(
            "Expected one of (pandas.DataFrame, numpy array, "
            "dictionary of (name -> numpy.ndarray), pyspark.sql.DataFrame) "
            "but got '{}'".format(type(data)))
    if any(t in (DataType.integer, DataType.long)
           for t in schema.column_types()):
        warnings.warn(
            "Hint: Inferred schema contains integer column(s). Integer columns in "
            "Python cannot represent missing values. If your input data contains "
            "missing values at inference time, it will be encoded as floats and will "
            "cause a schema enforcement error. The best way to avoid this problem is "
            "to infer the model schema based on a realistic data sample (training "
            "dataset) that includes missing values. Alternatively, you can declare "
            "integer columns as doubles (float64) whenever these columns may have "
            "missing values. See `Handling Integers With Missing Values "
            "<https://www.mlflow.org/docs/latest/models.html#"
            "handling-integers-with-missing-values>`_ for more details.",
            stacklevel=2,
        )
    return schema
예제 #6
0
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
import mlflow
import mlflow.sklearn
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec


# Fit a random forest on the iris data, held in a frame with named feature columns.
iris = datasets.load_iris()
iris_train = pd.DataFrame(data=iris.data, columns=iris.feature_names)
clf = RandomForestClassifier(random_state=0, max_depth=7)
clf.fit(iris_train, iris.target)

# Declare the model signature by hand: four named double inputs and one
# unnamed long output.
input_schema = Schema([
    ColSpec("double", feature_name)
    for feature_name in (
        "sepal length (cm)",
        "sepal width (cm)",
        "petal length (cm)",
        "petal width (cm)",
    )
])
output_schema = Schema([ColSpec("long")])
signature = ModelSignature(inputs=input_schema, outputs=output_schema)

# Log the fitted model together with the explicit signature.
mlflow.sklearn.log_model(clf, "iris_rf", signature=signature)
예제 #7
0
def test_schema_inference_on_numpy_array(pandas_df_with_all_types):
    """Check the schema ``_infer_schema`` yields for numpy arrays.

    Covers a 2d object array taken from the ``pandas_df_with_all_types``
    fixture, 1d object arrays (including ones holding ``None``), bytes,
    string, boolean, the integer/long and float/double width splits, and the
    dtypes that are expected to be rejected with ``MlflowException``.

    The builtins ``object``, ``str`` and ``bool`` are used as dtypes instead of
    the ``np.object`` / ``np.str`` / ``np.bool`` aliases: those aliases were
    deprecated in NumPy 1.20 and removed in 1.24, and always referred to the
    very same builtins.
    """
    # drop int and float as we lose type size information when storing as objects and defaults are
    # 64b.
    pandas_df_with_all_types = pandas_df_with_all_types.drop(
        columns=["integer", "float"])
    schema = _infer_schema(pandas_df_with_all_types.values)
    # The expected columns are unnamed; each remaining column label is used as
    # the expected type name.
    assert schema == Schema(
        [ColSpec(x) for x in pandas_df_with_all_types.columns])

    # test objects
    schema = _infer_schema(np.array(["a"], dtype=object))
    assert schema == Schema([ColSpec(DataType.string)])
    schema = _infer_schema(np.array([bytes([1])], dtype=object))
    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(np.array([bytearray([1]), None], dtype=object))
    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(np.array([True, None], dtype=object))
    assert schema == Schema([ColSpec(DataType.boolean)])
    schema = _infer_schema(np.array([1.1, None], dtype=object))
    assert schema == Schema([ColSpec(DataType.double)])

    # test bytes
    schema = _infer_schema(np.array([bytes([1])], dtype=np.bytes_))

    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(np.array([bytearray([1])], dtype=np.bytes_))
    assert schema == Schema([ColSpec(DataType.binary)])

    # test string
    schema = _infer_schema(np.array(["a"], dtype=str))
    assert schema == Schema([ColSpec(DataType.string)])

    # test boolean
    schema = _infer_schema(np.array([True], dtype=bool))
    assert schema == Schema([ColSpec(DataType.boolean)])

    # test ints
    for t in [np.uint8, np.uint16, np.int8, np.int16, np.int32]:
        schema = _infer_schema(np.array([1, 2, 3], dtype=t))
        assert schema == Schema([ColSpec("integer")])

    # test longs
    for t in [np.uint32, np.int64]:
        schema = _infer_schema(np.array([1, 2, 3], dtype=t))
        assert schema == Schema([ColSpec("long")])

    # unsigned long is unsupported
    with pytest.raises(MlflowException):
        _infer_schema(np.array([1, 2, 3], dtype=np.uint64))

    # test floats
    for t in [np.float16, np.float32]:
        schema = _infer_schema(np.array([1.1, 2.2, 3.3], dtype=t))
        assert schema == Schema([ColSpec("float")])

    # test doubles
    schema = _infer_schema(np.array([1.1, 2.2, 3.3], dtype=np.float64))
    assert schema == Schema([ColSpec("double")])

    # unsupported (np.float128 only exists on some platforms, hence the guard)
    if hasattr(np, "float128"):
        with pytest.raises(MlflowException):
            _infer_schema(np.array([1, 2, 3], dtype=np.float128))
예제 #8
0
def test_schema_inference_on_dataframe(pandas_df_with_all_types):
    """Check that a data frame yields one named ``ColSpec`` per column.

    Each column label of the fixture is used both as the expected type and as
    the expected column name.
    """
    inferred = _infer_schema(pandas_df_with_all_types)
    expected_columns = []
    for label in pandas_df_with_all_types.columns:
        expected_columns.append(ColSpec(label, label))
    assert inferred == Schema(expected_columns)