def test_categorical_model_can_be_loaded_and_evaluated_as_pyfunc(
        saved_tf_categorical_model, model_path):
    """Save a categorical TensorFlow SavedModel with MLflow and score it via pyfunc.

    The frame predicted by the reloaded pyfunc wrapper is compared against the
    fixture's ``expected_results_df``.
    """
    fixture = saved_tf_categorical_model
    mlflow.tensorflow.save_model(
        tf_saved_model_dir=fixture.path,
        tf_meta_graph_tags=fixture.meta_graph_tags,
        tf_signature_def_key=fixture.signature_def_key,
        path=model_path,
    )

    loaded_wrapper = pyfunc.load_pyfunc(model_path)
    actual_df = loaded_wrapper.predict(fixture.inference_df)

    pandas.testing.assert_frame_equal(
        actual_df,
        fixture.expected_results_df,
        check_less_precise=6,
    )
def test_model_export(spark_model_iris, model_path, spark_custom_env):
    """Save a Spark ML model and check three ways of scoring the saved copy.

    The saved model is scored as a native Spark model, as a pyfunc model and as
    a pyfunc Spark UDF; each result must equal ``spark_model_iris.predictions``.
    """
    sparkm.save_model(
        spark_model_iris.model, path=model_path, conda_env=spark_custom_env)

    # Path 1: native Spark ML model loaded back from disk.
    spark_reloaded = sparkm.load_model(model_uri=model_path)
    scored_df = spark_reloaded.transform(spark_model_iris.spark_df)
    native_preds = [row.prediction for row in scored_df.select("prediction").collect()]
    assert spark_model_iris.predictions == native_preds

    # Path 2: generic pyfunc wrapper scoring a pandas DataFrame.
    pyfunc_model = pyfunc.load_pyfunc(model_path)
    pyfunc_preds = pyfunc_model.predict(spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == pyfunc_preds

    # Path 3: pyfunc exposed as a Spark UDF.
    udf_preds = score_model_as_udf(
        model_uri=model_path, pandas_df=spark_model_iris.pandas_df)
    assert spark_model_iris.predictions == udf_preds

    assert os.path.exists(sparkm.DFS_TMP)
def test_sklearn_model_save_load(xgb_sklearn_model, model_path):
    """Round-trip an XGBoost sklearn-API model through the xgboost and pyfunc flavors."""
    original = xgb_sklearn_model.model
    mlflow.xgboost.save_model(xgb_model=original, path=model_path)

    native_reloaded = mlflow.xgboost.load_model(model_uri=model_path)
    pyfunc_reloaded = pyfunc.load_pyfunc(model_uri=model_path)

    # The natively reloaded model must agree with the in-memory model.
    expected = original.predict(xgb_sklearn_model.inference_dataframe)
    native_preds = native_reloaded.predict(xgb_sklearn_model.inference_dataframe)
    np.testing.assert_array_almost_equal(expected, native_preds)

    # The pyfunc wrapper must agree with the natively reloaded model.
    native_preds_again = native_reloaded.predict(xgb_sklearn_model.inference_dataframe)
    pyfunc_preds = pyfunc_reloaded.predict(xgb_sklearn_model.inference_dataframe)
    np.testing.assert_array_almost_equal(native_preds_again, pyfunc_preds)
def test_model_save_load(self):
    """Save the KNN model with the sklearn flavor and reload it natively and as pyfunc."""
    with TempDir(chdr=True, remove_on_exit=True) as tmp:
        # A raw pickle of the model is also written next to the MLflow model directory.
        pickle_file = tmp.path("knn.pkl")
        with open(pickle_file, "wb") as handle:
            pickle.dump(self._knn, handle)

        model_dir = tmp.path("knn")
        sklearn.save_model(self._knn, path=model_dir)

        native_model = sklearn.load_model(model_dir)
        native_preds = native_model.predict(self._X)
        np.testing.assert_array_equal(self._knn_predict, native_preds)

        # The saved sklearn model must also load as a valid pyfunc model.
        generic_model = pyfunc.load_pyfunc(model_dir)
        generic_preds = generic_model.predict(self._X)
        np.testing.assert_array_equal(self._knn_predict, generic_preds)
def test_model_save_load(cb_model, model_path):
    """Round-trip a CatBoost model through the catboost and pyfunc flavors."""
    trained, frame = cb_model
    mlflow.catboost.save_model(cb_model=trained, path=model_path)

    # Native flavor: reloaded predictions must match the in-memory model.
    native = mlflow.catboost.load_model(model_uri=model_path)
    in_memory_preds = trained.predict(frame)
    native_preds = native.predict(frame)
    np.testing.assert_array_almost_equal(in_memory_preds, native_preds)

    # pyfunc flavor: must match the natively reloaded model.
    generic = pyfunc.load_pyfunc(model_uri=model_path)
    native_preds_again = native.predict(frame)
    generic_preds = generic.predict(frame)
    np.testing.assert_array_almost_equal(native_preds_again, generic_preds)
def test_load_pyfunc_loads_torch_model_using_pickle_module_specified_at_save_time(
        module_scoped_subclassed_model, model_path):
    """Check that ``pyfunc.load_pyfunc`` passes the saved pickle module to ``torch.load``.

    ``importlib.import_module`` is patched with a recording pass-through so the
    test can also assert that the pickle module's name was imported.
    """
    custom_pickle_module = pickle
    mlflow.pytorch.save_model(
        path=model_path,
        pytorch_model=module_scoped_subclassed_model,
        conda_env=None,
        pickle_module=custom_pickle_module,
    )

    # Capture the real importer before patching so the spy can delegate to it.
    real_import_module = importlib.import_module
    seen_module_names = []

    def track_module_imports(module_name):
        seen_module_names.append(module_name)
        return real_import_module(module_name)

    with mock.patch("importlib.import_module") as import_mock:
        with mock.patch("torch.load") as torch_load_mock:
            import_mock.side_effect = track_module_imports
            pyfunc.load_pyfunc(model_path)

    torch_load_mock.assert_called_with(mock.ANY, pickle_module=custom_pickle_module)
    assert custom_pickle_module.__name__ in seen_module_names
def test_model_log(tmpdir):
    """Log a Spark ML pipeline with MLflow and reload it as pyfunc and as a Spark model.

    The scenario runs twice: once relying on ``log_model`` to start a run and
    once with a run started explicitly beforehand.
    """
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])

    # Build a pandas frame from the iris feature matrix plus a label column.
    iris = datasets.load_iris()
    features = iris.data
    labels = iris.target
    pandas_df = pd.DataFrame(features, columns=iris.feature_names)
    pandas_df['label'] = pd.Series(labels)

    session_builder = pyspark.sql.SparkSession.builder
    session_builder = session_builder.config(
        key="spark_session.python.worker.reuse", value=True)
    session_builder = session_builder.master("local-cluster[2, 1, 1024]")
    spark_session = session_builder.getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)

    model_path = tmpdir.mkdir("model")

    assembler = VectorAssembler(inputCols=iris.feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])

    # Fit the pipeline and record reference predictions from the in-memory model.
    model = pipeline.fit(spark_df)
    reference_df = model.transform(spark_df)
    preds1 = [row.prediction for row in reference_df.select("prediction").collect()]

    old_tracking_uri = tracking.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        try:
            tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            if should_start_run:
                tracking.start_run()
            sparkm.log_model(artifact_path="model", spark_model=model)
            run_id = tracking.active_run().info.run_uuid

            # Reload as pyfunc and score the pandas frame.
            pyfunc_model = pyfunc.load_pyfunc("model", run_id=run_id)
            preds2 = pyfunc_model.predict(pandas_df)
            assert preds1 == preds2

            # Reload as a native Spark model and score the Spark frame.
            reloaded_model = sparkm.load_model("model", run_id=run_id)
            reloaded_df = reloaded_model.transform(spark_df)
            preds3 = [
                row.prediction for row in reloaded_df.select("prediction").collect()
            ]
            assert preds1 == preds3
        finally:
            # End the run, restore the tracking URI and drop this iteration's store.
            tracking.end_run()
            tracking.set_tracking_uri(old_tracking_uri)
            shutil.rmtree(tracking_dir)
def test_model_save_load(sklearn_knn_model, model_path):
    """Round-trip a scikit-learn KNN model through the sklearn and pyfunc flavors."""
    original = sklearn_knn_model.model
    mlflow.sklearn.save_model(sk_model=original, path=model_path)

    native = mlflow.sklearn.load_model(model_uri=model_path)
    generic = pyfunc.load_pyfunc(model_uri=model_path)

    # Natively reloaded model vs. the in-memory model (exact equality).
    in_memory_preds = original.predict(sklearn_knn_model.inference_data)
    native_preds = native.predict(sklearn_knn_model.inference_data)
    np.testing.assert_array_equal(in_memory_preds, native_preds)

    # pyfunc wrapper vs. the natively reloaded model (exact equality).
    native_preds_again = native.predict(sklearn_knn_model.inference_data)
    generic_preds = generic.predict(sklearn_knn_model.inference_data)
    np.testing.assert_array_equal(native_preds_again, generic_preds)
def test_sagemaker_docker_model_scoring_with_default_conda_env(lgb_model, model_path):
    """Compare SageMaker-container scoring of a LightGBM model with local pyfunc scoring.

    The model is saved with ``conda_env=None``.
    """
    mlflow.lightgbm.save_model(
        lgb_model=lgb_model.model, path=model_path, conda_env=None)
    local_pyfunc = pyfunc.load_pyfunc(model_uri=model_path)

    response = score_model_in_sagemaker_docker_container(
        model_uri=model_path,
        data=lgb_model.inference_dataframe,
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED,
        flavor=mlflow.pyfunc.FLAVOR_NAME,
    )
    container_preds = pd.DataFrame(json.loads(response.content))

    local_preds = pd.DataFrame(local_pyfunc.predict(lgb_model.inference_dataframe))
    pandas.testing.assert_frame_equal(
        container_preds,
        local_preds,
        check_dtype=False,
        check_less_precise=6,
    )
def serve(model_uri, port, host, no_conda):
    """
    Serve a pyfunc model saved with MLflow by launching a webserver on the specified
    host and port. For information about the input data formats accepted by the webserver,
    see the following documentation:
    https://www.mlflow.org/docs/latest/models.html#pyfunc-deployment.
    """
    model_dir = _download_artifact_from_uri(artifact_uri=model_uri)
    env_file = _load_model_env(path=model_dir)

    if no_conda or env_file is None:
        # Serve from the current environment.
        server_app = scoring_server.init(load_pyfunc(model_dir))
        server_app.run(port=port, host=host)
        return None

    # The model has an environment file and conda was not disabled: hand over to
    # the conda re-run helper.
    return _rerun_in_conda(os.path.join(model_dir, env_file))
def test_model_save_load(lgb_model, model_path):
    """Round-trip a LightGBM model through the lightgbm and pyfunc flavors."""
    original = lgb_model.model
    mlflow.lightgbm.save_model(lgb_model=original, path=model_path)

    native = mlflow.lightgbm.load_model(model_uri=model_path)
    generic = pyfunc.load_pyfunc(model_uri=model_path)

    # Natively reloaded model vs. the in-memory model.
    in_memory_preds = original.predict(lgb_model.inference_dataframe)
    native_preds = native.predict(lgb_model.inference_dataframe)
    np.testing.assert_array_almost_equal(in_memory_preds, native_preds)

    # pyfunc wrapper vs. the natively reloaded model.
    native_preds_again = native.predict(lgb_model.inference_dataframe)
    generic_preds = generic.predict(lgb_model.inference_dataframe)
    np.testing.assert_array_almost_equal(native_preds_again, generic_preds)
def test_model_save_load(pd_model, model_path):
    """Round-trip a Paddle model and compare its outputs with the pyfunc wrapper's.

    Both the in-memory model and the natively reloaded model are called
    directly on the inference data and compared to pyfunc predictions to five
    decimal places.
    """
    mlflow.paddle.save_model(pd_model=pd_model.model, path=model_path)

    native = mlflow.paddle.load_model(model_uri=model_path)
    generic = pyfunc.load_pyfunc(model_uri=model_path)

    # In-memory model vs. pyfunc.
    in_memory_out = pd_model.model(pd_model.inference_dataframe)
    generic_out = generic.predict(pd_model.inference_dataframe)
    np.testing.assert_array_almost_equal(in_memory_out, generic_out, decimal=5)

    # Natively reloaded model vs. pyfunc.
    native_out = native(pd_model.inference_dataframe)
    generic_out_again = generic.predict(pd_model.inference_dataframe)
    np.testing.assert_array_almost_equal(native_out, generic_out_again, decimal=5)
def serve(model_path, run_id, port, host, no_conda):
    """
    Serve a PythonFunction model saved with MLflow.

    If a ``run_id`` is specified, ``model-path`` is treated as an artifact path within that run;
    otherwise it is treated as a local path.
    """
    # With a run id, resolve the artifact path through the run's model log directory.
    resolved_path = _get_model_log_dir(model_path, run_id) if run_id else model_path
    env_file = _load_model_env(resolved_path)

    if no_conda or env_file is None:
        # Serve from the current environment.
        server_app = scoring_server.init(load_pyfunc(resolved_path))
        server_app.run(port=port, host=host)
        return None

    # An environment file exists and conda was not disabled: hand over to the
    # conda re-run helper.
    return _rerun_in_conda(os.path.join(resolved_path, env_file))
def test_diviner_pyfunc_group_predict_prophet(grouped_prophet, model_path, diviner_data):
    """Check that pyfunc group prediction equals ``predict_groups`` on the original model.

    The group keys are taken from the first and last rows of the fixture data.
    """
    # For each selected row, take the single value of every key column and
    # bundle the values into one group-key tuple.
    groups = [
        tuple(
            list(column_values.values())[0]
            for column_values in (
                diviner_data.df[diviner_data.key_columns].iloc[[row_pos]].to_dict().values()
            )
        )
        for row_pos in [0, -1]
    ]

    mlflow.diviner.save_model(diviner_model=grouped_prophet, path=model_path)
    loaded_pyfunc_model = pyfunc.load_pyfunc(model_uri=model_path)

    expected = grouped_prophet.predict_groups(groups=groups, horizon=10, frequency="D")

    # The pyfunc wrapper takes its prediction configuration as a one-row DataFrame.
    config_frame = pd.DataFrame(
        {"groups": [groups], "horizon": 10, "frequency": "D"}, index=[0])
    actual = loaded_pyfunc_model.predict(config_frame)

    pd.testing.assert_frame_equal(expected, actual)
def predict(model_path, run_id, input_path, output_path):
    """
    Loads a Pandas DataFrame and runs a PythonFunction model saved with MLflow against it.
    This method will return the prediction results as a CSV-formatted Pandas DataFrame.

    If a run_id is specified, MODEL_PATH is treated as an artifact path within that run;
    otherwise it is treated as a local path.
    """
    if run_id:
        model_path = _get_model_log_dir(model_path, run_id)
    model = load_pyfunc(model_path)
    df = pandas.read_csv(input_path)
    result = model.predict(df)
    result_df = pandas.DataFrame(data=result)

    if output_path:
        # Write under a context manager so the file is flushed and closed even if
        # serialization fails; previously the handle was never closed.
        with open(output_path, 'w') as out_stream:
            result_df.to_csv(out_stream, header=False, index=False)
    else:
        # No output path given: emit the CSV on standard output.
        result_df.to_csv(sys.stdout, header=False, index=False)
def test_log_saved_model(self):
    """Export the DNN estimator, log it with MLflow, and score the export as pyfunc."""
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        # Point tracking at a directory inside the temp folder so it is removed
        # together with the rest of the test artifacts.
        previous_tracking_uri = tracking.get_tracking_uri()
        tracking_dir = os.path.abspath(tmp.path("mlruns"))
        tracking.set_tracking_uri("file://%s" % tracking_dir)
        tracking.start_run()
        try:
            # Map each feature name to a float placeholder.
            feature_spec = {
                name: tf.placeholder("float", name=name, shape=[150])
                for name in self._feature_names
            }
            # Receiver function used when exporting the estimator.
            receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
                feature_spec)

            export_root = tmp.path("model")
            os.makedirs(export_root)
            os.makedirs(tmp.path("hello"))

            # Export the estimator; the export call returns bytes.
            saved_model_path = self._dnn.export_savedmodel(
                export_root, receiver_fn).decode("utf-8")

            # Log the exported model.
            tensorflow.log_saved_model(saved_model_dir=saved_model_path,
                                       signature_def_key="predict",
                                       artifact_path=tmp.path("hello"))

            # Load the export as a pyfunc and predict on the feature frame.
            pyfunc_model = pyfunc.load_pyfunc(saved_model_path)
            predicted = pyfunc_model.predict(
                pandas.DataFrame(data=self._X, columns=self._feature_names))

            expected_rows = [entry['predictions'] for entry in self._dnn_predict]
            actual_rows = [row for _, row in predicted.iterrows()]

            # The pyfunc predictions must equal the estimator's own predictions.
            np.testing.assert_array_equal(expected_rows, actual_rows)
        finally:
            # End the run and restore the previous tracking location.
            tracking.end_run()
            tracking.set_tracking_uri(previous_tracking_uri)
def test_model_retrain_built_in_high_level_api(
    pd_model_built_in_high_level_api, model_path, model_retrain_path
):
    """Save a Paddle high-level model for training, reload, retrain, and re-save it.

    Also checks the ``TypeError`` cases of ``mlflow.paddle.load_model`` when a
    ``model`` argument is supplied, and that the retrained model, its native
    reload and its pyfunc reload agree to five decimal places.
    """
    model = pd_model_built_in_high_level_api.model
    mlflow.paddle.save_model(pd_model=model, path=model_path, training=True)

    training_dataset, test_dataset = get_dataset_built_in_high_level_api()

    # Reload the training checkpoint into a fresh network and continue training.
    model_retrain = paddle.Model(UCIHousing())
    model_retrain = mlflow.paddle.load_model(model_uri=model_path, model=model_retrain)
    # NOTE(review): the optimizer is built from ``model.parameters()`` (the original
    # model), not ``model_retrain.parameters()`` -- confirm this is intentional.
    optim = paddle.optimizer.Adam(learning_rate=0.015, parameters=model.parameters())
    model_retrain.prepare(optim, paddle.nn.MSELoss())
    model_retrain.fit(training_dataset, epochs=6, batch_size=8, verbose=1)

    mlflow.paddle.save_model(pd_model=model_retrain, path=model_retrain_path, training=False)

    # Loading the model saved with training=False into a supplied model must fail.
    with pytest.raises(TypeError, match="This model can't be loaded"):
        mlflow.paddle.load_model(model_uri=model_retrain_path, model=model_retrain)

    # A ``model`` argument of the wrong type must be rejected.
    error_model = 0
    error_model_type = type(error_model)
    expected_message = "Invalid object type `{}` for `model`, must be `paddle.Model`".format(
        error_model_type
    )
    with pytest.raises(TypeError, match=expected_message):
        mlflow.paddle.load_model(model_uri=model_retrain_path, model=error_model)

    reloaded_pd_model = mlflow.paddle.load_model(model_uri=model_retrain_path)
    reloaded_pyfunc = pyfunc.load_pyfunc(model_uri=model_retrain_path)

    # Keep only the first element of every dataset item for the array-based calls.
    low_level_test_dataset = [item[0] for item in test_dataset]

    retrained_out = np.array(model_retrain.predict(test_dataset)).squeeze()
    pyfunc_out = np.array(reloaded_pyfunc.predict(np.array(low_level_test_dataset))).squeeze()
    np.testing.assert_array_almost_equal(retrained_out, pyfunc_out, decimal=5)

    native_out = np.array(reloaded_pd_model(np.array(low_level_test_dataset))).squeeze()
    pyfunc_out_again = np.array(
        reloaded_pyfunc.predict(np.array(low_level_test_dataset))
    ).squeeze()
    np.testing.assert_array_almost_equal(native_out, pyfunc_out_again, decimal=5)
def test_spark_udf(self):
    """Score the model through a Spark UDF and compare with direct pyfunc scoring."""
    pandas_df = self._pandas_df
    spark_df = self.spark.createDataFrame(pandas_df)

    # Apply the pyfunc UDF across all feature columns.
    predict_udf = spark_udf(self.spark, self._model_path, result_type="integer")
    scored_df = spark_df.withColumn("prediction", predict_udf(*self._pandas_df.columns))
    spark_results = scored_df.collect()

    # Reference: run the same model directly on the pandas frame.
    direct_model = load_pyfunc(self._model_path)
    pandas_results = direct_model.predict(pandas_df)

    self.assertEqual(178, len(pandas_results))
    self.assertEqual(178, len(spark_results))
    for idx in range(len(pandas_results)):
        direct_value = pandas_results[idx]
        self.assertEqual(self._predict[idx], direct_value)
        self.assertEqual(direct_value, spark_results[idx]['prediction'])
def test_gbt():
    """Run the dataset download and GBT regression projects, then score the logged model."""
    old_uri = tracking.get_tracking_uri()
    # Arguments shared by both ``run`` invocations.
    shared_run_kwargs = dict(
        mode="local", cluster_spec=None, git_username=None, git_password=None,
        use_conda=True, storage_dir=None)
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        try:
            diamonds = tmp.path("diamonds")
            artifacts = tmp.path("artifacts")
            os.mkdir(diamonds)
            os.mkdir(artifacts)
            tracking.set_tracking_uri(artifacts)
            mlflow.set_experiment("test-experiment")

            # Download the diamonds dataset via mlflow run.
            run(".", entry_point="main", version=None,
                parameters={"dest-dir": diamonds},
                **shared_run_kwargs)

            # Run the main gbt app via mlflow.
            gbt_parameters = {
                "train": os.path.join(diamonds, "train_diamonds.parquet"),
                "test": os.path.join(diamonds, "test_diamonds.parquet"),
                "n-trees": 10,
                "m-depth": 3,
                "learning-rate": .1,
                "loss": "rmse",
                "label-col": "price",
            }
            submitted_run = run(
                "apps/gbt-regression", entry_point="main", version=None,
                parameters=gbt_parameters,
                **shared_run_kwargs)

            loaded_model = load_pyfunc("model", run_id=submitted_run.run_id)
            features = pandas.read_parquet(os.path.join(diamonds, "test_diamonds.parquet"))
            # Drop the price column so only the features are used to predict.
            features = features.drop(columns="price")

            predictions = loaded_model.predict(features)

            # Make sure the data is of the right type.
            assert isinstance(predictions[0], numpy.float32)
        finally:
            tracking.set_tracking_uri(old_uri)
def get_or_load(archive_path):
    """Return the cached ``(loaded_model, local_model_path)`` tuple for ``archive_path``.

    ``archive_path`` is a path returned by add_local_model(). If the model is
    already in ``SparkModelCache._models`` the cached tuple is returned and the
    hit counter is incremented; otherwise the archive is extracted, the model
    is loaded with ``load_pyfunc`` and the tuple is stored in the cache.
    """
    cache = SparkModelCache._models
    if archive_path not in cache:
        local_model_dir = _SparkDirectoryDistributor.get_or_extract(archive_path)
        # The import is deliberately local (a supposed cyclic import): this must
        # work on the Spark Executors (i.e., don't try to pickle the load_model
        # function).
        from mlflow.pyfunc import load_pyfunc  # pylint: disable=cyclic-import
        cache[archive_path] = (load_pyfunc(local_model_dir), local_model_dir)
    else:
        SparkModelCache._cache_hits += 1
    return cache[archive_path]
def helper(self, feature_spec, tmp, estimator, df):
    """
    Export ``estimator`` as a SavedModel, log it with MLflow, load the export
    back as a pyfunc and return its predictions on ``df``.
    """
    receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(feature_spec)

    export_root = tmp.path("model")
    os.makedirs(export_root)

    # Export the estimator; the export call returns bytes.
    exported_path = estimator.export_savedmodel(export_root, receiver_fn).decode("utf-8")

    # Log the exported TensorFlow model.
    tensorflow.log_saved_model(
        saved_model_dir=exported_path,
        signature_def_key="predict",
        artifact_path="hello",
    )

    # Load the export as a pyfunc and predict on the dataset.
    pyfunc_model = pyfunc.load_pyfunc(exported_path)
    return pyfunc_model.predict(df)
def test_model_save_load(self):
    """Save a pickled KNN as a pyfunc model using this file as the loader module.

    Checks that the ``Model`` configuration survives the save and that the
    reloaded pyfunc reproduces the reference predictions.
    """
    with TempDir() as tmp:
        pickle_file = tmp.path("knn.pkl")
        with open(pickle_file, "wb") as handle:
            pickle.dump(self._knn, handle)

        model_dir = tmp.path("knn")
        model_conf = Model(run_id="test", artifact_path="testtest")
        # Strip the last three characters of the file name to get the module name.
        this_module = os.path.basename(__file__)[:-3]
        pyfunc.save_model(dst_path=model_dir,
                          data_path=pickle_file,
                          loader_module=this_module,
                          code_path=[__file__],
                          model=model_conf)

        # The MLmodel file written by save_model must round-trip the configuration.
        reloaded_conf = Model.load(os.path.join(model_dir, "MLmodel"))
        print("m1", model_conf.__dict__)
        print("m2", reloaded_conf.__dict__)
        assert model_conf.__dict__ == reloaded_conf.__dict__

        loaded_model = pyfunc.load_pyfunc(model_dir)
        predictions = loaded_model.predict(self._X)
        np.testing.assert_array_equal(self._knn_predict, predictions)
def get_or_load(archive_path):
    """Given a path returned by add_local_model(), this method will return the loaded model.

    If this Python process ever loaded the model before, we will reuse that copy
    and increment the cache-hit counter. Otherwise the archive is resolved
    through ``SparkFiles``, unzipped into a fresh temporary directory, loaded
    with ``load_pyfunc`` and stored in ``SparkModelCache._models``.
    """
    if archive_path in SparkModelCache._models:
        SparkModelCache._cache_hits += 1
        return SparkModelCache._models[archive_path]

    local_path = SparkFiles.get(archive_path)
    # The extraction directory is not removed here; the loaded model is kept in
    # the cache after this call returns.
    temp_dir = tempfile.mkdtemp()
    # The context manager closes the archive even if extraction raises.
    with zipfile.ZipFile(local_path, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    # We must rely on a supposed cyclic import here because we want this behavior
    # on the Spark Executors (i.e., don't try to pickle the load_model function).
    from mlflow.pyfunc import load_pyfunc  # pylint: disable=cyclic-import
    SparkModelCache._models[archive_path] = load_pyfunc(temp_dir)
    return SparkModelCache._models[archive_path]
def predict(model_uri, input_path, output_path, no_conda):
    """
    Load a pandas DataFrame and runs a python_function model saved with MLflow against it.
    Return the prediction results as a CSV-formatted pandas DataFrame.
    """
    local_model_path = _download_artifact_from_uri(artifact_uri=model_uri)
    model_env_file = _load_model_env(path=local_model_path)
    if not no_conda and model_env_file is not None:
        # The model has an environment file and conda was not disabled: hand over
        # to the conda re-run helper.
        conda_env_path = os.path.join(local_model_path, model_env_file)
        return _rerun_in_conda(conda_env_path)

    model = load_pyfunc(local_model_path)
    df = pandas.read_csv(input_path)
    result = model.predict(df)
    result_df = pandas.DataFrame(data=result)

    if output_path:
        # Write under a context manager so the file is flushed and closed even if
        # serialization fails; previously the handle was never closed.
        with open(output_path, 'w') as out_stream:
            result_df.to_csv(out_stream, header=False, index=False)
    else:
        # No output path given: emit the CSV on standard output.
        result_df.to_csv(sys.stdout, header=False, index=False)
def main(argv):
    """Train a DNN regressor, export and log it with MLflow, then predict via pyfunc.

    Builds, trains and evaluates a tf.estimator on the Boston housing data,
    exports it for inference, logs the exported model with MLflow, and loads
    the logged model back as a PyFunc to make predictions. ``argv`` is not
    used by this function.
    """
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.boston_housing.load_data()

    # All features are passed as one numeric column whose width is the number
    # of columns in the training matrix.
    n_features = x_train.shape[1]
    feat_cols = [tf.feature_column.numeric_column(key="features", shape=(n_features,))]
    feat_spec = {
        "features": tf.placeholder("float", name="features", shape=[None, n_features])}

    hidden_units = [50, 20]
    steps = 1000

    regressor = tf.estimator.DNNRegressor(hidden_units=hidden_units,
                                          feature_columns=feat_cols)
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        {"features": x_train}, y_train, num_epochs=None, shuffle=True)

    with mlflow.start_run() as run:
        print(run)
        mlflow.log_param("Hidden Units", hidden_units)
        mlflow.log_param("Steps", steps)
        regressor.train(train_input_fn, steps=steps)

        # Compute mean squared error on the test split.
        test_input_fn = tf.estimator.inputs.numpy_input_fn(
            {"features": x_test}, y_test, num_epochs=None, shuffle=True)
        evaluation = regressor.evaluate(test_input_fn, steps=steps)
        mlflow.log_metric("Mean Square Error", evaluation['average_loss'])

        # Build a receiver function for exporting.
        receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(feat_spec)
        export_dir = tempfile.mkdtemp()
        try:
            saved_estimator_path = regressor.export_savedmodel(
                export_dir, receiver_fn).decode("utf-8")

            # Log the saved model.
            mlflow.tensorflow.log_model(tf_saved_model_dir=saved_estimator_path,
                                        tf_meta_graph_tags=[tag_constants.SERVING],
                                        tf_signature_def_key="predict",
                                        artifact_path="model")

            # Reload the logged model as a Python Function and predict with it.
            pyfunc_model = pyfunc.load_pyfunc(mlflow.get_artifact_uri('model'))
            feature_frame = pd.DataFrame(data=x_test, columns=["features"] * n_features)
            predict_df = pyfunc_model.predict(feature_frame)
            predict_df['original_labels'] = y_test
            print(predict_df)
        finally:
            # Always remove the temporary export directory.
            shutil.rmtree(export_dir)
def serve(model_path, run_id, port, host, no_conda):
    """
    Serve a pyfunc model saved with MLflow by launching a webserver on the specified
    host and port. For information about the input data formats accepted by the webserver,
    see the following documentation:
    https://www.mlflow.org/docs/latest/models.html#pyfunc-deployment.

    If a ``run_id`` is specified, ``model-path`` is treated as an artifact path within that run;
    otherwise it is treated as a local path.
    """
    # With a run id, resolve the artifact path through the run's model log directory.
    resolved_path = _get_model_log_dir(model_path, run_id) if run_id else model_path
    env_file = _load_model_env(resolved_path)

    if no_conda or env_file is None:
        # Serve from the current environment.
        server_app = scoring_server.init(load_pyfunc(resolved_path))
        server_app.run(port=port, host=host)
        return None

    # An environment file exists and conda was not disabled: hand over to the
    # conda re-run helper.
    return _rerun_in_conda(os.path.join(resolved_path, env_file))
def test_load_model_succeeds_when_data_is_model_file_instead_of_directory(
    module_scoped_subclassed_model, model_path, data
):
    """
    Verify that a PyTorch model whose pyfunc ``data`` entry points at the
    serialized model file (the layout used by older MLflow versions) still
    loads. The current layout stores a directory holding the serialized model
    file together with pickle module information; this test rewrites the
    logged model's configuration to the older, file-based form before loading.
    """
    artifact_path = "pytorch_model"
    with mlflow.start_run():
        mlflow.pytorch.log_model(
            artifact_path=artifact_path,
            pytorch_model=module_scoped_subclassed_model,
            conda_env=None,
        )
        run_model_uri = "runs:/{run_id}/{artifact_path}".format(
            run_id=mlflow.active_run().info.run_id, artifact_path=artifact_path
        )
        model_path = _download_artifact_from_uri(run_model_uri)

    model_conf_path = os.path.join(model_path, "MLmodel")
    model_conf = Model.load(model_conf_path)
    pyfunc_conf = model_conf.flavors.get(pyfunc.FLAVOR_NAME)
    assert pyfunc_conf is not None

    # The logged model currently stores its data as a directory containing the
    # serialized model file.
    model_data_path = os.path.join(model_path, pyfunc_conf[pyfunc.DATA])
    assert os.path.exists(model_data_path)
    assert mlflow.pytorch._SERIALIZED_TORCH_MODEL_FILE_NAME in os.listdir(model_data_path)

    # Point the data entry directly at the serialized file and persist the change.
    serialized_file = os.path.join(
        model_data_path, mlflow.pytorch._SERIALIZED_TORCH_MODEL_FILE_NAME
    )
    pyfunc_conf[pyfunc.DATA] = serialized_file
    model_conf.save(model_conf_path)

    loaded_pyfunc = pyfunc.load_pyfunc(model_path)
    actual = loaded_pyfunc.predict(data[0])
    expected = pd.DataFrame(_predict(model=module_scoped_subclassed_model, data=data))
    np.testing.assert_array_almost_equal(actual, expected, decimal=4)
def test_model_save_load_built_in_high_level_api(pd_model_built_in_high_level_api, model_path):
    """Round-trip a Paddle high-level-API model through the paddle and pyfunc flavors."""
    model = pd_model_built_in_high_level_api.model
    test_dataset = pd_model_built_in_high_level_api.inference_dataframe
    mlflow.paddle.save_model(pd_model=model, path=model_path)

    native = mlflow.paddle.load_model(model_uri=model_path)
    generic = pyfunc.load_pyfunc(model_uri=model_path)

    # Keep only the first element of every dataset item for the array-based calls.
    low_level_test_dataset = [item[0] for item in test_dataset]

    # In-memory model vs. pyfunc.
    in_memory_out = np.array(model.predict(test_dataset)).squeeze()
    generic_out = np.array(generic.predict(np.array(low_level_test_dataset))).squeeze()
    np.testing.assert_array_almost_equal(in_memory_out, generic_out, decimal=5)

    # Natively reloaded model vs. pyfunc.
    native_out = np.array(native(np.array(low_level_test_dataset))).squeeze()
    generic_out_again = np.array(
        generic.predict(np.array(low_level_test_dataset))
    ).squeeze()
    np.testing.assert_array_almost_equal(native_out, generic_out_again, decimal=5)
def test_model_export(tmpdir):
    """Save a Spark ML pipeline and score it natively, as pyfunc, and in a SageMaker container.

    Finally asserts that the Spark DFS temp directory exists and is empty.
    """
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])

    # Build a pandas frame from the iris feature matrix plus a label column.
    iris = datasets.load_iris()
    features = iris.data
    labels = iris.target
    pandas_df = pd.DataFrame(features, columns=iris.feature_names)
    pandas_df['label'] = pd.Series(labels)

    session_builder = pyspark.sql.SparkSession.builder
    session_builder = session_builder.config(
        key="spark_session.python.worker.reuse", value=True)
    session_builder = session_builder.master("local-cluster[2, 1, 1024]")
    spark_session = session_builder.getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)

    model_dir = str(tmpdir.mkdir("model"))

    assembler = VectorAssembler(inputCols=iris.feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])

    # Fit the pipeline and record reference predictions from the in-memory model.
    model = pipeline.fit(spark_df)
    reference_df = model.transform(spark_df)
    preds1 = [row.prediction for row in reference_df.select("prediction").collect()]

    sparkm.save_model(model, path=model_dir, conda_env=conda_env)

    # Native Spark reload.
    reloaded_model = sparkm.load_model(path=model_dir)
    reloaded_df = reloaded_model.transform(spark_df)
    preds1_1 = [
        row.prediction for row in reloaded_df.select("prediction").collect()
    ]
    assert preds1 == preds1_1

    # pyfunc reload.
    pyfunc_model = pyfunc.load_pyfunc(model_dir)
    preds2 = pyfunc_model.predict(pandas_df)
    assert preds1 == preds2

    # Scoring inside the SageMaker docker container.
    preds3 = score_model_in_sagemaker_docker_container(
        model_path=model_dir, data=pandas_df)
    assert preds1 == preds3

    assert os.path.exists(sparkm.DFS_TMP)
    print(os.listdir(sparkm.DFS_TMP))
    assert not os.listdir(sparkm.DFS_TMP)
def test_model_export(spark_model_iris, model_path, spark_conda_env):
    """Save a Spark ML model and score it natively, as pyfunc, and in a SageMaker container.

    Finally asserts that the Spark DFS temp directory exists and is non-empty.
    """
    # Reference predictions from the in-memory model.
    reference_df = spark_model_iris.model.transform(spark_model_iris.training_df)
    preds1 = [row.prediction for row in reference_df.select("prediction").collect()]

    sparkm.save_model(spark_model_iris.model, path=model_path, conda_env=spark_conda_env)

    # Native Spark reload.
    reloaded_model = sparkm.load_model(path=model_path)
    reloaded_df = reloaded_model.transform(spark_model_iris.training_df)
    preds1_1 = [
        row.prediction for row in reloaded_df.select("prediction").collect()
    ]
    assert preds1 == preds1_1

    # pyfunc reload.
    pyfunc_model = pyfunc.load_pyfunc(model_path)
    preds2 = pyfunc_model.predict(spark_model_iris.inference_df)
    assert preds1 == preds2

    # Scoring inside the SageMaker docker container.
    preds3 = score_model_in_sagemaker_docker_container(
        model_path=model_path, data=spark_model_iris.inference_df)
    assert preds1 == preds3

    assert os.path.exists(sparkm.DFS_TMP)
    print(os.listdir(sparkm.DFS_TMP))
    # We expect not to delete the DFS tempdir.
    assert os.listdir(sparkm.DFS_TMP)