def infer_signature(
    model_input: Any, model_output: MlflowInferableDataset = None
) -> ModelSignature:
    """
    Build an MLflow model signature from example model input and, optionally, model output.

    A signature describes the model input and output as data frames with (optionally) named
    columns whose data types are among those defined in :py:class:`mlflow.types.DataType`.
    An exception is raised when the supplied data contains incompatible types or is not in
    one of the supported formats.

    Supported formats:
      - pandas.DataFrame
      - dictionary of { name -> numpy.ndarray}
      - numpy.ndarray
      - pyspark.sql.DataFrame

    Element types must be mappable to one of :py:class:`mlflow.types.DataType`.

    NOTE: Multidimensional (>2d) arrays (aka tensors) are not supported at this time.

    :param model_input: Valid input to the model, e.g. (a subset of) the training dataset.
    :param model_output: Valid model output, e.g. model predictions for the (subset of)
                         training dataset. When ``None``, the signature has no output schema.
    :return: ModelSignature
    """
    input_schema = _infer_schema(model_input)
    # The output schema is optional: inference only runs when predictions were supplied.
    if model_output is None:
        output_schema = None
    else:
        output_schema = _infer_schema(model_output)
    return ModelSignature(input_schema, output_schema)
def test_spark_schema_inference(pandas_df_with_all_types):
    """Schema inference must agree between a pandas frame and a Spark frame built from it."""
    import pyspark
    from pyspark.sql.types import _parse_datatype_string, StructField, StructType

    def _as_struct_field(mlflow_type):
        # pyspark _parse_datatype_string() expects "timestamp" instead of "datetime"
        if mlflow_type == DataType.datetime:
            return StructField("datetime", _parse_datatype_string("timestamp"), True)
        return StructField(mlflow_type.name, _parse_datatype_string(mlflow_type.name), True)

    # The extension-typed columns are excluded from this test.
    basic_df = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"]
    )
    pandas_schema = _infer_schema(basic_df)
    assert pandas_schema == Schema([ColSpec(name, name) for name in basic_df.columns])

    session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    spark_schema = StructType([_as_struct_field(t) for t in pandas_schema.input_types()])
    spark_df = session.createDataFrame(basic_df, schema=spark_schema)

    spark_inferred = _infer_schema(spark_df)
    assert spark_inferred == Schema([ColSpec(name, name) for name in basic_df.columns])
def test_get_tensor_shape(dict_of_ndarrays):
    """
    Exercise ``_get_tensor_shape`` for default, explicit, ``None`` and invalid variable
    dimensions, and check that ``_infer_schema`` rejects non-ndarray dictionary values.
    """
    # With no variable dimension given, the first dimension is reported as -1.
    assert all(-1 == _get_tensor_shape(tensor)[0] for tensor in dict_of_ndarrays.values())

    data = dict_of_ndarrays["4D"]

    # Specify variable dimension (negative indices included).
    for i in range(-4, 4):
        assert _get_tensor_shape(data, i)[i] == -1

    # Specify None: inspect each dimension individually. Comparing the whole shape tuple
    # against -1 (as the check used to do) is always true and therefore verifies nothing.
    assert all(dim != -1 for dim in _get_tensor_shape(data, None))

    # Out of bounds
    with pytest.raises(
        MlflowException, match="The specified variable_dimension 10 is out of bounds"
    ):
        _get_tensor_shape(data, 10)
    with pytest.raises(
        MlflowException, match="The specified variable_dimension -10 is out of bounds"
    ):
        _get_tensor_shape(data, -10)

    # Dictionary values that are not numpy arrays are rejected.
    with pytest.raises(
        TypeError, match="Data in the dictionary must be of type numpy.ndarray"
    ):
        _infer_schema({"x": 1})
def test_schema_inference_on_dictionary(dict_of_ndarrays):
    """A dictionary of ndarrays infers to one named TensorSpec per entry."""
    # test dictionary
    inferred = _infer_schema(dict_of_ndarrays)
    expected_specs = []
    for key, array in dict_of_ndarrays.items():
        expected_specs.append(TensorSpec(array.dtype, _get_tensor_shape(array), key))
    assert inferred == Schema(expected_specs)

    # test exception is raised if non-numpy data in dictionary
    for not_an_ndarray in (1, [1]):
        with pytest.raises(TypeError):
            _infer_schema({"x": not_an_ndarray})
def test_spark_type_mapping(pandas_df_with_all_types):
    """Check DataType -> Spark type conversion and Schema -> Spark schema conversion."""
    import pyspark
    from pyspark.sql.types import (
        BooleanType,
        IntegerType,
        LongType,
        FloatType,
        DoubleType,
        StringType,
        BinaryType,
        TimestampType,
    )
    from pyspark.sql.types import StructField, StructType

    # Each MLflow data type paired with the Spark type it must convert to.
    spark_type_for = (
        (DataType.boolean, BooleanType),
        (DataType.integer, IntegerType),
        (DataType.long, LongType),
        (DataType.float, FloatType),
        (DataType.double, DoubleType),
        (DataType.string, StringType),
        (DataType.binary, BinaryType),
        (DataType.datetime, TimestampType),
    )
    for mlflow_type, spark_cls in spark_type_for:
        assert isinstance(mlflow_type.to_spark(), spark_cls)

    # The extension-typed columns are excluded from this test.
    basic_df = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"]
    )
    named_schema = _infer_schema(basic_df)
    named_fields = [StructField(t.name, t.to_spark(), True) for t in named_schema.input_types()]
    expected_spark_schema = StructType(named_fields)
    actual_spark_schema = named_schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()

    # Round trip: a Spark frame created with the converted schema infers to the same schema.
    session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    spark_df = session.createDataFrame(basic_df, schema=actual_spark_schema)
    round_tripped = _infer_schema(spark_df)
    assert named_schema == round_tripped

    # test unnamed columns
    unnamed_schema = Schema([ColSpec(col.type) for col in named_schema.inputs])
    positional_fields = [
        StructField(str(position), t.to_spark(), True)
        for position, t in enumerate(unnamed_schema.input_types())
    ]
    expected_spark_schema = StructType(positional_fields)
    actual_spark_schema = unnamed_schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()

    # test single unnamed column is mapped to just a single spark type
    single_column = Schema([ColSpec(DataType.integer)])
    assert isinstance(single_column.as_spark_schema(), IntegerType)
def test_schema_inference_on_dataframe(pandas_df_with_all_types):
    """Infer schemas separately for the basic-typed and the extension-typed columns."""
    ext_column_names = ["boolean_ext", "integer_ext", "string_ext"]

    # Basic columns: the fixture names each column after its expected type.
    basic_types = pandas_df_with_all_types.drop(columns=ext_column_names)
    basic_schema = _infer_schema(basic_types)
    assert basic_schema == Schema([ColSpec(name, name) for name in basic_types.columns])

    # Extension columns: expected types are listed explicitly.
    ext_types = pandas_df_with_all_types[ext_column_names].copy()
    expected_ext_types = (
        ("boolean_ext", DataType.boolean),
        ("integer_ext", DataType.long),
        ("string_ext", DataType.string),
    )
    expected_schema = Schema([ColSpec(dtype, name) for name, dtype in expected_ext_types])
    ext_schema = _infer_schema(ext_types)
    assert ext_schema == expected_schema
def test_spark_schema_inference(pandas_df_with_all_types):
    """Schema inference must agree between a pandas frame and a Spark frame built from it."""
    import pyspark
    from pyspark.sql.types import _parse_datatype_string, StructField, StructType

    # The extension-typed columns are excluded from this test.
    basic_df = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"]
    )
    pandas_schema = _infer_schema(basic_df)
    assert pandas_schema == Schema([ColSpec(name, name) for name in basic_df.columns])

    session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())

    # Build one nullable Spark field per inferred column type, named after the type.
    fields = []
    for column_type in pandas_schema.column_types():
        fields.append(
            StructField(column_type.name, _parse_datatype_string(column_type.name), True)
        )
    spark_df = session.createDataFrame(basic_df, schema=StructType(fields))

    spark_inferred = _infer_schema(spark_df)
    assert spark_inferred == Schema([ColSpec(name, name) for name in basic_df.columns])
def test_schema_inference_on_dictionary(pandas_df_with_all_types):
    """A dictionary of column arrays infers to the column names and types of the frame."""
    # test dictionary
    column_names = pandas_df_with_all_types.columns
    arrays_by_name = {}
    for name in column_names:
        arrays_by_name[name] = pandas_df_with_all_types[name].values

    inferred = _infer_schema(arrays_by_name)
    actual_types = dict(zip(inferred.column_names(), inferred.column_types()))
    # The fixture names each column after the DataType member it should infer to.
    expected_types = {name: DataType[name] for name in column_names}
    assert actual_types == expected_types

    # test exception is raised if non-numpy data in dictionary
    for not_an_ndarray in (1, [1]):
        with pytest.raises(TypeError):
            _infer_schema({"x": not_an_ndarray})
def test_dtype(nparray, dtype):
    """Infer a tensor schema for ``nparray``, round-trip its spec, and enforce it."""
    inferred = _infer_schema(nparray)
    expected = Schema([TensorSpec(np.dtype(dtype), (-1,))])
    assert inferred == expected

    # The spec must survive a to_dict / from_json_dict round trip unchanged.
    original_spec = inferred.inputs[0]
    rebuilt_spec = TensorSpec.from_json_dict(**original_spec.to_dict())
    assert original_spec == rebuilt_spec

    # Enforcing the spec on the source array must yield an ndarray.
    assert isinstance(_enforce_tensor_spec(nparray, original_spec), np.ndarray)
def test_get_schema_type(dict_of_ndarrays):
    """A tensor schema exposes numpy types only; column/pandas/spark views must raise."""
    schema = _infer_schema(dict_of_ndarrays)
    assert ["float64"] * 4 == schema.numpy_types()

    # Accessors that are expected to fail on a tensor schema, with the expected message.
    rejected_accessors = (
        ("column_types", "TensorSpec only supports numpy types"),
        ("pandas_types", "TensorSpec only supports numpy types"),
        ("as_spark_schema", "TensorSpec cannot be converted to spark dataframe"),
    )
    for accessor_name, expected_message in rejected_accessors:
        with pytest.raises(MlflowException, match=expected_message):
            getattr(schema, accessor_name)()
def test_get_tensor_shape(dict_of_ndarrays):
    """
    Exercise ``_get_tensor_shape`` for default, explicit, ``None`` and invalid variable
    dimensions, and check that ``_infer_schema`` rejects non-ndarray dictionary values.
    """
    # With no variable dimension given, the first dimension is reported as -1.
    assert all(-1 == _get_tensor_shape(tensor)[0] for tensor in dict_of_ndarrays.values())

    data = dict_of_ndarrays["4D"]

    # Specify variable dimension (negative indices included).
    for i in range(-4, 4):
        assert _get_tensor_shape(data, i)[i] == -1

    # Specify None: inspect each dimension individually. Comparing the whole shape tuple
    # against -1 (as the check used to do) is always true and therefore verifies nothing.
    assert all(dim != -1 for dim in _get_tensor_shape(data, None))

    # Out of bounds
    with pytest.raises(MlflowException):
        _get_tensor_shape(data, 10)
    with pytest.raises(MlflowException):
        _get_tensor_shape(data, -10)

    # Dictionary values that are not numpy arrays are rejected.
    with pytest.raises(TypeError):
        _infer_schema({"x": 1})
def test_schema_inference_on_numpy_array(pandas_df_with_all_types):
    """1-d numpy arrays infer to a single unnamed TensorSpec with a variable first dimension."""

    def _check_1d_tensor(array, expected_dtype):
        inferred = _infer_schema(array)
        assert inferred == Schema([TensorSpec(type=expected_dtype, shape=(-1,))])

    # Every fixture column, converted to numpy, keeps its own dtype.
    for column_name in pandas_df_with_all_types:
        column_values = pandas_df_with_all_types[column_name].to_numpy()
        _check_1d_tensor(column_values, column_values.dtype)

    # test boolean
    _check_1d_tensor(np.array([True, False, True], dtype=np.bool_), np.dtype(np.bool_))

    # test bytes
    _check_1d_tensor(np.array([bytes([1])], dtype=np.bytes_), np.dtype("S1"))

    # test (u)ints
    integer_types = (
        np.uint8,
        np.int8,
        np.uint16,
        np.int16,
        np.uint32,
        np.int32,
        np.uint64,
        np.int64,
    )
    for int_type in integer_types:
        _check_1d_tensor(np.array([1, 2, 3], dtype=int_type), np.dtype(int_type))

    # test floats
    for float_type in (np.float16, np.float32, np.float64):
        _check_1d_tensor(np.array([1.1, 2.2, 3.3], dtype=float_type), np.dtype(float_type))

    # float128 is only checked when this numpy build provides it.
    if hasattr(np, "float128"):
        _check_1d_tensor(
            np.array([1.1, 2.2, 3.3], dtype=np.float128), np.dtype(np.float128)
        )
def test_that_schema_inference_with_tensors_raises_exception():
    """Multidimensional data must be rejected whether bare, in a DataFrame, or in a dict."""
    tensor_3d = np.array([[[1, 2, 3]]], dtype=np.int64)
    frame_with_2d_cell = pd.DataFrame({"x": [np.array([[1, 2, 3]], dtype=np.int64)]})
    dict_with_2d_value = {"x": np.array([[1, 2, 3]], dtype=np.int64)}

    for unsupported_input in (tensor_3d, frame_with_2d_cell, dict_with_2d_value):
        with pytest.raises(MlflowException):
            _infer_schema(unsupported_input)
def test_schema_inference_on_dataframe(pandas_df_with_all_types):
    """Each fixture column is named after the type it is expected to infer to."""
    inferred = _infer_schema(pandas_df_with_all_types)
    expected_columns = [ColSpec(name, name) for name in pandas_df_with_all_types.columns]
    assert inferred == Schema(expected_columns)
def test_schema_inference_on_basic_numpy(pandas_df_with_all_types):
    """Each fixture column, as a numpy array, infers to one TensorSpec of its own dtype."""
    for column_name in pandas_df_with_all_types:
        column_values = pandas_df_with_all_types[column_name].to_numpy()
        inferred = _infer_schema(column_values)
        expected = Schema([TensorSpec(type=column_values.dtype, shape=(-1,))])
        assert inferred == expected
def test_schema_inference_on_pandas_series():
    """A pandas Series infers to a single unnamed ColSpec determined by its element type."""

    def _infer_from(values, dtype):
        return _infer_schema(pd.Series(np.array(values, dtype=dtype)))

    def _check(values, dtype, expected_type):
        inferred = _infer_from(values, dtype)
        assert inferred == Schema([ColSpec(expected_type)])

    # test objects
    _check(["a"], object, DataType.string)
    _check([bytes([1])], object, DataType.binary)
    _check([bytearray([1]), None], object, DataType.binary)
    _check([True, None], object, DataType.string)
    _check([1.1, None], object, DataType.double)

    # test bytes
    _check([bytes([1])], np.bytes_, DataType.binary)

    # test string
    _check(["a"], str, DataType.string)

    # test boolean
    _check([True], bool, DataType.boolean)

    # test ints
    for int_type in (np.uint8, np.uint16, np.int8, np.int16, np.int32):
        _check([1, 2, 3], int_type, "integer")

    # test longs
    for long_type in (np.uint32, np.int64):
        _check([1, 2, 3], long_type, "long")

    # unsigned long is unsupported
    with pytest.raises(MlflowException, match="Unsupported numpy data type"):
        _infer_from([1, 2, 3], np.uint64)

    # test floats
    for float_type in (np.float16, np.float32):
        _check([1.1, 2.2, 3.3], float_type, "float")

    # test doubles
    _check([1.1, 2.2, 3.3], np.float64, "double")

    # test datetime
    timestamps = ["2021-01-01 00:00:00", "2021-02-02 00:00:00", "2021-03-03 12:00:00"]
    _check(timestamps, "datetime64", "datetime")

    # unsupported (only checked when this numpy build provides float128)
    if hasattr(np, "float128"):
        with pytest.raises(MlflowException, match="Unsupported numpy data type"):
            _infer_from([1, 2, 3], np.float128)
def test_schema_inference_on_numpy_array(pandas_df_with_all_types):
    """
    Check column-based schema inference on numpy arrays of every supported element type.

    The builtin ``object``, ``str`` and ``bool`` are used as dtypes instead of the
    ``np.object`` / ``np.str`` / ``np.bool`` aliases: those aliases were plain synonyms for
    the builtins, were deprecated in NumPy 1.20 and raise ``AttributeError`` from NumPy 1.24.
    """
    # drop int and float as we lose type size information when storing as objects and
    # defaults are 64b.
    pandas_df_with_all_types = pandas_df_with_all_types.drop(columns=["integer", "float"])
    schema = _infer_schema(pandas_df_with_all_types.values)
    assert schema == Schema([ColSpec(x) for x in pandas_df_with_all_types.columns])

    # test objects
    schema = _infer_schema(np.array(["a"], dtype=object))
    assert schema == Schema([ColSpec(DataType.string)])
    schema = _infer_schema(np.array([bytes([1])], dtype=object))
    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(np.array([bytearray([1]), None], dtype=object))
    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(np.array([True, None], dtype=object))
    assert schema == Schema([ColSpec(DataType.boolean)])
    schema = _infer_schema(np.array([1.1, None], dtype=object))
    assert schema == Schema([ColSpec(DataType.double)])

    # test bytes
    schema = _infer_schema(np.array([bytes([1])], dtype=np.bytes_))
    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(np.array([bytearray([1])], dtype=np.bytes_))
    assert schema == Schema([ColSpec(DataType.binary)])

    # test string
    schema = _infer_schema(np.array(["a"], dtype=str))
    assert schema == Schema([ColSpec(DataType.string)])

    # test boolean
    schema = _infer_schema(np.array([True], dtype=bool))
    assert schema == Schema([ColSpec(DataType.boolean)])

    # test ints
    for t in [np.uint8, np.uint16, np.int8, np.int16, np.int32]:
        schema = _infer_schema(np.array([1, 2, 3], dtype=t))
        assert schema == Schema([ColSpec("integer")])

    # test longs
    for t in [np.uint32, np.int64]:
        schema = _infer_schema(np.array([1, 2, 3], dtype=t))
        assert schema == Schema([ColSpec("long")])

    # unsigned long is unsupported
    with pytest.raises(MlflowException):
        _infer_schema(np.array([1, 2, 3], dtype=np.uint64))

    # test floats
    for t in [np.float16, np.float32]:
        schema = _infer_schema(np.array([1.1, 2.2, 3.3], dtype=t))
        assert schema == Schema([ColSpec("float")])

    # test doubles
    schema = _infer_schema(np.array([1.1, 2.2, 3.3], dtype=np.float64))
    assert schema == Schema([ColSpec("double")])

    # unsupported (only checked when this numpy build provides float128)
    if hasattr(np, "float128"):
        with pytest.raises(MlflowException):
            _infer_schema(np.array([1, 2, 3], dtype=np.float128))
def test_get_sparse_matrix_data_type_and_shape(dict_of_sparse_matrix):
    """Every sparse matrix in the fixture infers as float64 with shape (-1, 8)."""
    expected_numpy_types = ["float64"]
    expected_shape = (-1, 8)
    for matrix in dict_of_sparse_matrix.values():
        inferred = _infer_schema(matrix)
        assert inferred.numpy_types() == expected_numpy_types
        assert _get_tensor_shape(matrix) == expected_shape