def test_input_examples(pandas_df_with_all_types):
    """Save input examples with ``_Example`` and read them back as data frames.

    Exercises a data frame, a dict of column arrays, the frame's numpy values
    and a dict of scalars, and checks that the two array inputs below raise
    ``TensorsNotSupportedException``.
    """
    frame = pandas_df_with_all_types
    signature = infer_signature(frame)

    # Data frame holding every supported data type.
    with TempDir() as tmp:
        saved = _Example(frame)
        saved.save(tmp.path())
        artifact = tmp.path(saved.info["artifact_path"])
        with open(artifact, "r") as handle:
            payload = json.load(handle)
        assert set(payload.keys()) == {"columns", "data"}
        with_schema = _dataframe_from_json(artifact, schema=signature.inputs)
        assert (frame == with_schema).all().all()
        # Without a schema only the binary column is expected to differ.
        without_schema = _dataframe_from_json(artifact)
        lhs = with_schema.drop(columns=["binary"])
        rhs = without_schema.drop(columns=["binary"])
        assert (lhs == rhs).all().all()

    # Same content, supplied as a dict of column name -> values.
    with TempDir() as tmp:
        as_dict = {column: frame[column].values for column in frame.columns}
        saved = _Example(as_dict)
        saved.save(tmp.path())
        artifact = tmp.path(saved.info["artifact_path"])
        round_tripped = _dataframe_from_json(artifact, signature.inputs)
        assert (frame == round_tripped).all().all()

    # Same content, supplied as the frame's numpy values.
    raw_values = frame.values
    signature = infer_signature(raw_values)
    with TempDir() as tmp:
        saved = _Example(raw_values)
        saved.save(tmp.path())
        artifact = tmp.path(saved.info["artifact_path"])
        with open(artifact, "r") as handle:
            payload = json.load(handle)
        assert set(payload.keys()) == {"data"}
        parsed_values = _dataframe_from_json(artifact, schema=signature.inputs).values
        assert (raw_values == parsed_values).all().all()

    # A 3-D array is rejected.
    with TempDir() as tmp:
        cube = np.array([[[1, 2, 3]]])
        with pytest.raises(TensorsNotSupportedException):
            _Example(cube)

    # A dict whose values are 2-D arrays is rejected as well.
    with TempDir() as tmp:
        matrix = np.array([[1, 2, 3]])
        with pytest.raises(TensorsNotSupportedException):
            _Example({"x": matrix, "y": matrix})

    # Dict of scalars.
    with TempDir() as tmp:
        scalars = {"a": 1, "b": "abc"}
        saved = _Example(scalars)
        saved.save(tmp.path())
        artifact = tmp.path(saved.info["artifact_path"])
        first_record = _dataframe_from_json(artifact).to_dict(orient="records")[0]
        assert scalars == first_record
def test_model_log():
    """Log a model with signature and input example, then check the reloaded metadata."""
    expected_flavors = {
        "flavor1": {"a": 1, "b": 2},
        "flavor2": {"x": 1, "y": 2},
    }
    with TempDir(chdr=True) as tmp:
        experiment_id = mlflow.create_experiment("test")
        input_columns = [ColSpec("integer", "x"), ColSpec("integer", "y")]
        output_columns = [ColSpec(name=None, type="double")]
        sig = ModelSignature(inputs=Schema(input_columns), outputs=Schema(output_columns))
        input_example = {"x": 1, "y": 2}

        with mlflow.start_run(experiment_id=experiment_id) as r:
            Model.log("some/path", TestFlavor, signature=sig, input_example=input_example)

        model_uri = "runs:/{}/some/path".format(r.info.run_id)
        local_path = _download_artifact_from_uri(model_uri, output_path=tmp.path(""))
        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))

        assert loaded_model.run_id == r.info.run_id
        assert loaded_model.artifact_path == "some/path"
        assert loaded_model.flavors == expected_flavors
        assert loaded_model.signature == sig

        # The example is stored next to the model; read it back without a schema.
        example_file = os.path.join(
            local_path, loaded_model.saved_input_example_info["artifact_path"]
        )
        first_record = _dataframe_from_json(example_file).to_dict(orient="records")[0]
        assert first_record == input_example
def test_model_log_with_input_example_succeeds():
    """Log a model with a mixed-type data frame example and read the example back."""
    with TempDir(chdr=True) as tmp:
        input_columns = [
            ColSpec("integer", "a"),
            ColSpec("string", "b"),
            ColSpec("boolean", "c"),
            ColSpec("string", "d"),
            ColSpec("datetime", "e"),
        ]
        sig = ModelSignature(
            inputs=Schema(input_columns),
            outputs=Schema([ColSpec(name=None, type="double")]),
        )
        row = {
            "a": np.int32(1),
            "b": "test string",
            "c": True,
            "d": date.today(),
            "e": np.datetime64("2020-01-01T00:00:00"),
        }
        input_example = pd.DataFrame(row, index=[0])

        local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example)
        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
        example_file = os.path.join(
            local_path, loaded_model.saved_input_example_info["artifact_path"]
        )
        restored = _dataframe_from_json(example_file, schema=sig.inputs)

        # The date column comes back as a string, so compare against ISO text.
        input_example["d"] = input_example["d"].apply(lambda day: day.isoformat())
        assert restored.equals(input_example)
def _read_example(mlflow_model: Model, path: str):
    """
    Load the input example stored alongside a model.

    The example type recorded in the model metadata selects the reader: tensor input
    for ``ndarray``, sparse matrix for ``sparse_matrix_csc`` / ``sparse_matrix_csr``
    and data frame for ``dataframe``. The model signature's input schema, when the
    model has a signature, is passed to the tensor and data frame readers.

    Raises FileNotFoundError if there is model metadata but the example file is missing,
    and MlflowException if the recorded example type is not one of the types above.

    :param mlflow_model: Model metadata.
    :param path: Path to the model directory.
    :return: Input example or None if the model has no example.
    """
    example_info = mlflow_model.saved_input_example_info
    if example_info is None:
        # Model was saved without an example.
        return None

    example_type = example_info["type"]
    sparse_types = ("sparse_matrix_csc", "sparse_matrix_csr")
    supported_types = ("dataframe", "ndarray") + sparse_types
    if example_type not in supported_types:
        raise MlflowException(
            "This version of mlflow can not load example of type {}".format(example_type)
        )

    signature = mlflow_model.signature
    input_schema = None if signature is None else signature.inputs
    example_path = os.path.join(path, example_info["artifact_path"])

    if example_type == "ndarray":
        return _read_tensor_input_from_json(example_path, schema=input_schema)
    if example_type in sparse_types:
        return _read_sparse_matrix_from_json(example_path, example_type)
    return _dataframe_from_json(example_path, schema=input_schema, precise_float=True)
def test_model_log():
    """Log a model via the shared helper and check the reloaded metadata.

    Also checks that the loaded model has no ``databricks_runtime`` attribute.
    """
    expected_flavors = {
        "flavor1": {"a": 1, "b": 2},
        "flavor2": {"x": 1, "y": 2},
    }
    with TempDir(chdr=True) as tmp:
        input_columns = [ColSpec("integer", "x"), ColSpec("integer", "y")]
        output_columns = [ColSpec(name=None, type="double")]
        sig = ModelSignature(
            inputs=Schema(input_columns),
            outputs=Schema(output_columns),
        )
        input_example = {"x": 1, "y": 2}

        local_path, r = _log_model_with_signature_and_example(tmp, sig, input_example)
        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))

        assert loaded_model.run_id == r.info.run_id
        assert loaded_model.artifact_path == "some/path"
        assert loaded_model.flavors == expected_flavors
        assert loaded_model.signature == sig

        example_file = os.path.join(
            local_path, loaded_model.saved_input_example_info["artifact_path"]
        )
        first_record = _dataframe_from_json(example_file).to_dict(orient="records")[0]
        assert first_record == input_example
        assert not hasattr(loaded_model, "databricks_runtime")
def test_model_log_with_databricks_runtime():
    """Log a model while the runtime lookup is patched and check it is recorded."""
    dbr = "8.3.x-snapshot-gpu-ml-scala2.12"
    expected_flavors = {
        "flavor1": {"a": 1, "b": 2},
        "flavor2": {"x": 1, "y": 2},
    }
    runtime_patch = mock.patch("mlflow.models.model.get_databricks_runtime", return_value=dbr)
    with TempDir(chdr=True) as tmp:
        # The patch is entered after the temp dir, as in a combined ``with`` statement.
        with runtime_patch:
            input_columns = [ColSpec("integer", "x"), ColSpec("integer", "y")]
            output_columns = [ColSpec(name=None, type="double")]
            sig = ModelSignature(
                inputs=Schema(input_columns),
                outputs=Schema(output_columns),
            )
            input_example = {"x": 1, "y": 2}

            local_path, r = _log_model_with_signature_and_example(tmp, sig, input_example)
            loaded_model = Model.load(os.path.join(local_path, "MLmodel"))

            assert loaded_model.run_id == r.info.run_id
            assert loaded_model.artifact_path == "some/path"
            assert loaded_model.flavors == expected_flavors
            assert loaded_model.signature == sig

            example_file = os.path.join(
                local_path, loaded_model.saved_input_example_info["artifact_path"]
            )
            first_record = _dataframe_from_json(example_file).to_dict(orient="records")[0]
            assert first_record == input_example
            assert loaded_model.databricks_runtime == dbr
def test_input_examples_with_nan(df_with_nan, dict_of_ndarrays_with_nans):
    """Save and re-read examples that contain NaN values, as a frame and as arrays."""

    def _equal_or_both_nan(left, right):
        # NaN never equals itself, so ``v != v`` is True exactly where v is NaN.
        return (left == right) | ((left != left) & (right != right))

    # Data frame with NaN values in it.
    frame_signature = infer_signature(df_with_nan)
    with TempDir() as tmp:
        saved = _Example(df_with_nan)
        saved.save(tmp.path())
        artifact = tmp.path(saved.info["artifact_path"])
        with open(artifact, "r") as handle:
            payload = json.load(handle)
        assert set(payload.keys()) == {"columns", "data"}

        with_schema = _dataframe_from_json(artifact, schema=frame_signature.inputs)
        assert _equal_or_both_nan(df_with_nan, with_schema).all().all()

        # Without a schema only the binary column is expected to differ.
        without_schema = _dataframe_from_json(artifact)
        lhs = with_schema.drop(columns=["binary"])
        rhs = without_schema.drop(columns=["binary"])
        assert _equal_or_both_nan(lhs, rhs).all().all()

    # Multidimensional arrays, one example per entry.
    for key in dict_of_ndarrays_with_nans:
        array_example = dict_of_ndarrays_with_nans[key]
        array_signature = infer_signature(array_example)
        with TempDir() as tmp:
            saved = _Example(array_example)
            saved.save(tmp.path())
            artifact = tmp.path(saved.info["artifact_path"])

            typed = _read_tensor_input_from_json(artifact, schema=array_signature.inputs)
            assert np.array_equal(typed, array_example, equal_nan=True)

            # Without a schema/dtype the NaN positions come back as None.
            untyped = _read_tensor_input_from_json(artifact)
            expected_untyped = np.where(np.isnan(array_example), None, array_example)
            assert np.array_equal(untyped, expected_untyped)
def test_model_info():
    """Check the object returned by ``Model.log`` against the run and the reloaded model."""
    expected_flavors = {
        "flavor1": {"a": 1, "b": 2},
        "flavor2": {"x": 1, "y": 2},
    }
    with TempDir(chdr=True) as tmp:
        input_columns = [ColSpec("integer", "x"), ColSpec("integer", "y")]
        output_columns = [ColSpec(name=None, type="double")]
        sig = ModelSignature(
            inputs=Schema(input_columns),
            outputs=Schema(output_columns),
        )
        input_example = {"x": 1, "y": 2}

        experiment_id = mlflow.create_experiment("test")
        with mlflow.start_run(experiment_id=experiment_id) as run:
            model_info = Model.log(
                "some/path", TestFlavor, signature=sig, input_example=input_example
            )

        run_id = run.info.run_id
        local_path = _download_artifact_from_uri(
            "runs:/{}/some/path".format(run_id), output_path=tmp.path("")
        )

        assert model_info.run_id == run_id
        assert model_info.artifact_path == "some/path"
        assert model_info.model_uri == "runs:/{}/some/path".format(run_id)

        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
        assert model_info.utc_time_created == loaded_model.utc_time_created
        assert model_info.model_uuid == loaded_model.model_uuid
        assert model_info.flavors == expected_flavors

        example_file = os.path.join(
            local_path, model_info.saved_input_example_info["artifact_path"]
        )
        first_record = _dataframe_from_json(example_file).to_dict(orient="records")[0]
        assert first_record == input_example

        assert model_info.signature_dict == sig.to_dict()
        logged_version = Version(model_info.mlflow_version)
        loaded_version = Version(loaded_model.mlflow_version)
        assert logged_version == loaded_version
def _read_example(mlflow_model: Model, path: str):
    """
    Load the data frame input example stored alongside a model.

    Only examples recorded with type ``dataframe`` can be read; any other recorded
    type raises MlflowException. The model signature's input schema, when the model
    has a signature, is used while parsing.

    Raises IO Exception if there is model metadata but the example file is missing.

    :param mlflow_model: Model metadata.
    :param path: Path to the model directory.
    :return: Input example or None if the model has no example.
    """
    example_info = mlflow_model.saved_input_example_info
    if example_info is None:
        # Model was saved without an example.
        return None

    example_type = example_info["type"]
    if example_type != "dataframe":
        message = "This version of mlflow can not load example of type {}".format(example_type)
        raise MlflowException(message)

    signature = mlflow_model.signature
    input_schema = None if signature is None else signature.inputs
    example_path = os.path.join(path, example_info["artifact_path"])
    return _dataframe_from_json(example_path, schema=input_schema, precise_float=True)
def parse_json_input(json_input, orient="split", schema: Schema = None):
    """
    Parse JSON input into a Pandas DataFrame, reporting failures as a serving error.

    Any exception raised while parsing is passed on to ``_handle_serving_error`` with
    the ``MALFORMED_REQUEST`` error code and a message naming the expected orient.

    :param json_input: A JSON-formatted string representation of a Pandas DataFrame, or a stream
                       containing such a string representation.
    :param orient: The Pandas DataFrame orientation of the JSON input. This is either 'split'
                   or 'records'.
    :param schema: Optional schema specification to be used during parsing.
    :return: The result of ``_dataframe_from_json`` for the given input.
    """
    # pylint: disable=broad-except
    try:
        parsed = _dataframe_from_json(json_input, pandas_orient=orient, schema=schema)
    except Exception:
        template = (
            "Failed to parse input as a Pandas DataFrame. Ensure that the input is"
            " a valid JSON-formatted Pandas DataFrame with the `{orient}` orient"
            " produced using the `pandas.DataFrame.to_json(..., orient='{orient}')`"
            " method."
        )
        _handle_serving_error(
            error_message=template.format(orient=orient),
            error_code=MALFORMED_REQUEST,
        )
    else:
        return parsed
def test_model_log_with_input_example_succeeds():
    """Log a model with a mixed-type data frame example in a fresh run and read it back."""
    with TempDir(chdr=True) as tmp:
        experiment_id = mlflow.create_experiment("test")
        input_columns = [
            ColSpec("integer", "a"),
            ColSpec("string", "b"),
            ColSpec("boolean", "c"),
            ColSpec("string", "d"),
            ColSpec("datetime", "e"),
        ]
        sig = ModelSignature(
            inputs=Schema(input_columns),
            outputs=Schema([ColSpec(name=None, type="double")]),
        )
        row = {
            "a": np.int32(1),
            "b": "test string",
            "c": True,
            "d": date.today(),
            "e": np.datetime64("2020-01-01T00:00:00"),
        }
        input_example = pd.DataFrame(row, index=[0])

        with mlflow.start_run(experiment_id=experiment_id) as r:
            Model.log("some/path", TestFlavor, signature=sig, input_example=input_example)

        model_uri = "runs:/{}/some/path".format(r.info.run_id)
        local_path = _download_artifact_from_uri(model_uri, output_path=tmp.path(""))
        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
        example_file = os.path.join(
            local_path, loaded_model.saved_input_example_info["artifact_path"]
        )
        restored = _dataframe_from_json(example_file, schema=sig.inputs)

        # The date column comes back as a string, so compare against ISO text.
        input_example["d"] = input_example["d"].apply(lambda day: day.isoformat())
        assert restored.equals(input_example)
def test_dataframe_from_json():
    """Parse ``split`` and ``records`` JSON with column-based and tensor-based schemas."""
    orients = ("split", "records")
    column_order = [
        "boolean",
        "string",
        "float",
        "double",
        "integer",
        "long",
        "binary",
        "date_string",
    ]
    source = pd.DataFrame(
        {
            "boolean": [True, False, True],
            "string": ["a", "b", "c"],
            "float": np.array([1.2, 2.3, 3.4], dtype=np.float32),
            "double": np.array([1.2, 2.3, 3.4], dtype=np.float64),
            "integer": np.array([3, 4, 5], dtype=np.int32),
            "long": np.array([3, 4, 5], dtype=np.int64),
            "binary": [bytes([1, 2, 3]), bytes([4, 5]), bytes([6])],
            "date_string": ["2018-02-03", "1996-03-02", "2021-03-05"],
        },
        columns=column_order,
    )
    # Bytes are base64-encoded before the frame is dumped to JSON.
    jsonable_df = pd.DataFrame(source, copy=True)
    jsonable_df["binary"] = jsonable_df["binary"].map(base64.b64encode)

    column_types = [
        ("boolean", "boolean"),
        ("string", "string"),
        ("float", "float"),
        ("double", "double"),
        ("integer", "integer"),
        ("long", "long"),
        ("binary", "binary"),
        ("string", "date_string"),
    ]
    schema = Schema([ColSpec(col_type, col_name) for col_type, col_name in column_types])
    for orient in orients:
        parsed = _dataframe_from_json(
            jsonable_df.to_json(orient=orient), pandas_orient=orient, schema=schema
        )
        assert parsed.equals(source)

    # Try parsing with a tensor schema.
    tensor_types = [
        ("bool", "boolean"),
        ("str", "string"),
        ("float32", "float"),
        ("float64", "double"),
        ("int32", "integer"),
        ("int64", "long"),
        (bytes, "binary"),
    ]
    tensor_schema = Schema(
        [TensorSpec(np.dtype(kind), [-1], col_name) for kind, col_name in tensor_types]
    )
    for orient in orients:
        parsed = _dataframe_from_json(
            jsonable_df.to_json(orient=orient), pandas_orient=orient, schema=tensor_schema
        )
        # NB: tensor schema does not automatically decode base64 encoded bytes.
        assert parsed.equals(jsonable_df)

    # Test parsing with a tensor schema that holds a single tensor.
    tensor_schema = Schema([TensorSpec(np.dtype("float32"), [-1, 3])])
    source = pd.DataFrame(
        {
            "a": np.array([1, 2, 3], dtype=np.float32),
            "b": np.array([4.1, 5.2, 6.3], dtype=np.float32),
            "c": np.array([7, 8, 9], dtype=np.float32),
        },
        columns=["a", "b", "c"],
    )
    for orient in orients:
        parsed = _dataframe_from_json(
            source.to_json(orient=orient), pandas_orient=orient, schema=tensor_schema
        )
        assert source.equals(parsed)
def test_input_examples(pandas_df_with_all_types, dict_of_ndarrays):
    """Save input examples with ``_Example`` and read them back.

    Exercises a data frame, a dict of column arrays, single-column arrays,
    multidimensional arrays and a dict of scalars, and checks that a list of
    arrays raises ``TensorsNotSupportedException``.
    """

    def _assert_array_round_trip(array_example):
        # Save one array example and compare it with what the tensor reader returns.
        with TempDir() as tmp:
            saved = _Example(array_example)
            saved.save(tmp.path())
            artifact = tmp.path(saved.info["artifact_path"])
            restored = _read_tensor_input_from_json(artifact)
            assert np.array_equal(restored, array_example)

    frame = pandas_df_with_all_types
    signature = infer_signature(frame)

    # Data frame holding every supported data type.
    with TempDir() as tmp:
        saved = _Example(frame)
        saved.save(tmp.path())
        artifact = tmp.path(saved.info["artifact_path"])
        with open(artifact, "r") as handle:
            payload = json.load(handle)
        assert set(payload.keys()) == {"columns", "data"}
        with_schema = _dataframe_from_json(artifact, schema=signature.inputs)
        assert (frame == with_schema).all().all()
        # Without a schema only the binary column is expected to differ.
        without_schema = _dataframe_from_json(artifact)
        lhs = with_schema.drop(columns=["binary"])
        rhs = without_schema.drop(columns=["binary"])
        assert (lhs == rhs).all().all()

    # NB: Drop columns that cannot be encoded by proto_json_utils.pyNumpyEncoder
    encodable = frame.drop(columns=["boolean_ext", "integer_ext", "string_ext"])

    # Same content, supplied as a dict of column name -> values.
    with TempDir() as tmp:
        as_dict = {column: encodable[column].values for column in encodable.columns}
        saved = _Example(as_dict)
        saved.save(tmp.path())
        artifact = tmp.path(saved.info["artifact_path"])
        restored_dict = _read_tensor_input_from_json(artifact)
        assert as_dict.keys() == restored_dict.keys()
        # Binary values are stored as base64 encoded strings, so their content is not
        # compared; the key check above already shows the binary input was stored.
        del as_dict["binary"]
        for column, expected in as_dict.items():
            assert np.array_equal(expected, restored_dict[column])

    # Each non-binary column, supplied as a numpy array.
    without_binary = frame.drop(columns=["binary"])
    for column in without_binary:
        _assert_array_round_trip(without_binary[column].to_numpy())

    # Multidimensional arrays.
    for key in dict_of_ndarrays:
        _assert_array_round_trip(dict_of_ndarrays[key])

    # A list of multidimensional arrays is rejected.
    matrix = np.array([[1, 2, 3]])
    with pytest.raises(TensorsNotSupportedException):
        _Example([matrix, matrix])

    # Dict of scalars.
    with TempDir() as tmp:
        scalars = {"a": 1, "b": "abc"}
        saved = _Example(scalars)
        saved.save(tmp.path())
        artifact = tmp.path(saved.info["artifact_path"])
        first_record = _dataframe_from_json(artifact).to_dict(orient="records")[0]
        assert scalars == first_record