class TestSparkMLVectorAssembler(unittest.TestCase):
    """Tests for the Spark-ML VectorAssembler to PyTorch converter."""

    # Test VectorAssembler
    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()),
                     reason="Spark-ML test requires pyspark and pandas")
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"),
                     reason="Spark-ML test requires torch >= 1.6.0")
    def test_vectorassembler_converter(self):
        """The converted assembler must reproduce Spark's assembled vectors on iris."""
        dataset = load_iris()
        feature_names = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
        # Keep only the four feature columns; the label is not assembled.
        iris_pdf = pd.DataFrame(
            data=np.c_[dataset["data"], dataset["target"]],
            columns=feature_names + ["target"],
        )[feature_names]
        spark_df = sql.createDataFrame(iris_pdf)

        assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
        hb_model = convert(assembler, "torch", spark_df)
        self.assertTrue(hb_model is not None)

        # Spark reference output: densify each assembled vector row.
        expected = assembler.transform(spark_df).toPandas()
        expected["features"] = expected["features"].map(lambda v: np.array(v.toArray()))
        expected_np = expected["features"].to_numpy()

        actual_np = hb_model.transform(iris_pdf)
        np.testing.assert_allclose(np.vstack(expected_np), actual_np,
                                   rtol=1e-06, atol=1e-06)
class TestSparkMLDiscretizers(unittest.TestCase):
    """Tests for the Spark-ML QuantileDiscretizer to PyTorch converter."""

    # Test QuantileDiscretizer
    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()),
                     reason="Spark-ML test requires pyspark and pandas")
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"),
                     reason="Spark-ML test requires torch >= 1.6.0")
    def test_quantilediscretizer_converter(self):
        """The converted discretizer must bucket sepal_length exactly like Spark."""
        dataset = load_iris()
        feature_names = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
        iris_pdf = pd.DataFrame(data=np.c_[dataset["data"], dataset["target"]],
                                columns=feature_names + ["target"])
        # Only the discretized column is needed on the Spark side.
        spark_df = sql.createDataFrame(iris_pdf).select("sepal_length")

        discretizer = QuantileDiscretizer(inputCol="sepal_length",
                                          outputCol="sepal_length_bucket",
                                          numBuckets=2)
        model = discretizer.fit(spark_df)
        hb_model = convert(model, "torch", spark_df)
        self.assertTrue(hb_model is not None)

        expected = model.transform(spark_df).select("sepal_length_bucket").toPandas()
        actual = hb_model.transform(iris_pdf[["sepal_length"]])
        np.testing.assert_allclose(expected.to_numpy(), actual,
                                   rtol=1e-06, atol=1e-06)
class TestSparkMLLinear(unittest.TestCase):
    """Tests for Spark-ML linear classifier converters."""

    def _test_linear(self, classes, model_class):
        """Fit *model_class* on random data with *classes* labels and compare
        predicted probabilities between Spark and the converted torch model."""
        n_features, n_total = 10, 100
        np.random.seed(0)
        warnings.filterwarnings("ignore")
        X = np.array(np.random.rand(n_total, n_features), dtype=np.float32)
        y = np.random.randint(classes, size=(n_total, 1))
        rows = np.concatenate([y, X], axis=1).reshape(n_total, -1)
        spark_df = sql.createDataFrame(
            [(int(row[0]), Vectors.dense(row[1:])) for row in rows],
            schema=["label", "features"])

        model = model_class().fit(spark_df)
        sample = spark_df.select("features").limit(10)
        hb_model = convert(model, "torch", sample)
        self.assertTrue(hb_model is not None)

        expected = np.array(
            model.transform(spark_df).select("probability").collect()
        ).reshape(-1, classes)
        np.testing.assert_allclose(expected, hb_model.predict_proba(X),
                                   rtol=1e-06, atol=1e-06)

    # pyspark.ml.LogisticRegression with two classes
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"),
                     reason="Spark-ML test requires torch >= 1.6.0")
    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()),
                     reason="Spark-ML test requires pyspark and pandas")
    def test_logistic_regression_binary(self):
        self._test_linear(2, model_class=LogisticRegression)

    # pyspark.ml.LogisticRegression with multi_class
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"),
                     reason="Spark-ML test requires torch >= 1.6.0")
    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()),
                     reason="Spark-ML test requires pyspark and pandas")
    def test_logistic_regression_multi_class(self):
        self._test_linear(5, model_class=LogisticRegression)
class TestSparkMLPipeline(unittest.TestCase):
    """Tests conversion of multi-stage Spark-ML Pipeline models to PyTorch."""

    @unittest.skipIf(not sparkml_installed(),
                     reason="Spark-ML test requires pyspark")
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"),
                     reason="Spark-ML test requires torch >= 1.6.0")
    def test_pipeline_1(self):
        """Single-stage pipeline (LogisticRegression) on random binary data."""
        n_features, n_total, classes = 10, 100, 2
        np.random.seed(0)
        warnings.filterwarnings("ignore")
        X = np.array(np.random.rand(n_total, n_features), dtype=np.float32)
        y = np.random.randint(classes, size=(n_total, 1))
        rows = np.concatenate([y, X], axis=1).reshape(n_total, -1)
        spark_df = sql.createDataFrame(
            [(int(row[0]), Vectors.dense(row[1:])) for row in rows],
            schema=["label", "features"])

        pipeline = Pipeline(stages=[LogisticRegression()])
        model = pipeline.fit(spark_df)
        sample = spark_df.select("features").limit(1)
        hb_model = convert(model, "torch", sample)
        self.assertTrue(hb_model is not None)

        # Class predictions must match.
        expected_pred = np.array(
            model.transform(spark_df).select("prediction").collect()).reshape(-1)
        np.testing.assert_allclose(expected_pred, hb_model.predict(X),
                                   rtol=1e-06, atol=1e-06)
        # Class probabilities must match.
        expected_prob = np.array(
            model.transform(spark_df).select("probability").collect()
        ).reshape(-1, classes)
        np.testing.assert_allclose(expected_prob, hb_model.predict_proba(X),
                                   rtol=1e-06, atol=1e-06)

    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"),
                     reason="Spark-ML test requires torch >= 1.6.0")
    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()),
                     reason="Spark-ML test requires pyspark and pandas")
    def test_pipeline2(self):
        """Pipeline of QuantileDiscretizer + VectorAssembler + LogisticRegression."""
        dataset = load_iris()
        base_features = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
        iris_pdf = pd.DataFrame(data=np.c_[dataset["data"], dataset["target"]],
                                columns=base_features + ["label"])
        spark_df = sql.createDataFrame(iris_pdf)

        discretizer = QuantileDiscretizer(inputCol="sepal_length",
                                          outputCol="sepal_length_bucket",
                                          numBuckets=2)
        assembler = VectorAssembler(inputCols=["sepal_length_bucket"] + base_features,
                                    outputCol="features")
        pipeline = Pipeline(stages=[discretizer, assembler, LogisticRegression()])
        model = pipeline.fit(spark_df)

        # Score on the raw feature columns only.
        spark_df = spark_df.select(base_features)
        iris_pdf = iris_pdf[base_features]
        hb_model = convert(model, "torch", spark_df)
        self.assertTrue(hb_model is not None)

        expected_pred = np.array(
            model.transform(spark_df).select("prediction").collect()).reshape(-1)
        np.testing.assert_allclose(expected_pred, hb_model.predict(iris_pdf),
                                   rtol=1e-06, atol=1e-06)
        expected_prob = np.array(
            model.transform(spark_df).select("probability").collect()).reshape(-1, 3)
        np.testing.assert_allclose(expected_prob, hb_model.predict_proba(iris_pdf),
                                   rtol=1e-06, atol=1e-05)

    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()),
                     reason="Spark-ML test requires pyspark and pandas")
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"),
                     reason="Spark-ML test requires torch >= 1.6.0")
    def test_pipeline3(self):
        """Pipeline with two discretizers, an assembler and LogisticRegression."""
        dataset = load_iris()
        base_features = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
        iris_pdf = pd.DataFrame(data=np.c_[dataset["data"], dataset["target"]],
                                columns=base_features + ["label"])
        spark_df = sql.createDataFrame(iris_pdf)

        discretizer_length = QuantileDiscretizer(inputCol="sepal_length",
                                                 outputCol="sepal_length_bucket",
                                                 numBuckets=2)
        discretizer_width = QuantileDiscretizer(inputCol="sepal_width",
                                                outputCol="sepal_width_bucket",
                                                numBuckets=2)
        assembler = VectorAssembler(
            inputCols=["sepal_length_bucket", "sepal_width_bucket"] + base_features,
            outputCol="features")
        pipeline = Pipeline(stages=[discretizer_length, discretizer_width,
                                    assembler, LogisticRegression()])
        model = pipeline.fit(spark_df)

        # Score on the raw feature columns only.
        spark_df = spark_df.select(base_features)
        iris_pdf = iris_pdf[base_features]
        hb_model = convert(model, "torch", spark_df)
        self.assertTrue(hb_model is not None)

        expected_pred = np.array(
            model.transform(spark_df).select("prediction").collect()).reshape(-1)
        np.testing.assert_allclose(expected_pred, hb_model.predict(iris_pdf),
                                   rtol=1e-06, atol=1e-06)
        expected_prob = np.array(
            model.transform(spark_df).select("probability").collect()).reshape(-1, 3)
        np.testing.assert_allclose(expected_prob, hb_model.predict_proba(iris_pdf),
                                   rtol=1e-06, atol=1e-05)
""" Tests Spark-ML Pipeline converters """ import unittest import warnings import numpy as np import torch from sklearn.datasets import load_iris from hummingbird.ml._utils import sparkml_installed, pandas_installed from hummingbird.ml import convert from distutils.version import LooseVersion if sparkml_installed(): from pyspark.sql import SparkSession, SQLContext from pyspark.ml import Pipeline from pyspark.ml.linalg import Vectors from pyspark.ml.classification import LogisticRegression from pyspark.ml.feature import QuantileDiscretizer, VectorAssembler spark = SparkSession.builder.master("local[*]").config( "spark.driver.bindAddress", "127.0.0.1").getOrCreate() sql = SQLContext(spark) if pandas_installed(): import pandas as pd class TestSparkMLPipeline(unittest.TestCase): @unittest.skipIf(not sparkml_installed(),
class TestBackends(unittest.TestCase):
    """Tests for the hummingbird backend registry and the save/load paths of
    the torch, torch.jit, TVM and ONNX containers.

    Fixes over the previous revision: removed the duplicated
    ``model = model =`` assignment in ``test_onnx_no_test_data_long``, replaced
    a copy-pasted misleading comment in ``test_onnx_no_test_data_string``, and
    factored the repeated GBM-training and saved-config-rewrite boilerplate
    into private helpers.
    """

    # ------------------------------------------------------------------ helpers

    def _make_trained_gbdt(self, cast_to_float32=True):
        """Fit a small GradientBoostingClassifier on fixed-seed random binary
        data and return ``(model, X)``; ``X`` is float32 unless disabled."""
        warnings.filterwarnings("ignore")
        model = GradientBoostingClassifier(n_estimators=10, max_depth=10)
        np.random.seed(0)
        X = np.random.rand(100, 200)
        if cast_to_float32:
            X = np.array(X, dtype=np.float32)
        y = np.random.randint(2, size=100)
        model.fit(X, y)
        return model, X

    def _rewrite_saved_config(self, model_dir, mutate):
        """Apply *mutate* to the version-configuration lines of an unpacked
        saved model directory, write the result back and re-create the zip."""
        from hummingbird.ml.operator_converters import constants

        config_path = os.path.join(model_dir,
                                   constants.SAVE_LOAD_MODEL_CONFIGURATION_PATH)
        with open(config_path, "r") as file:
            configuration = file.readlines()
        configuration = mutate(configuration)
        os.remove(config_path)
        with open(config_path, "w") as file:
            file.writelines(configuration)
        shutil.make_archive(model_dir, "zip", model_dir)

    # -------------------------------------------------------------------- tests

    # Test backends are browsable
    def test_backends(self):
        warnings.filterwarnings("ignore")
        self.assertTrue(len(hummingbird.ml.backends) > 0)

    # Test backends are not case sensitive
    def test_backends_case_sensitive(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "tOrCh")
        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict_proba(X),
                                   hb_model.predict_proba(X),
                                   rtol=1e-06, atol=1e-06)

    # Test pytorch is still a valid backend name
    def test_backends_pytorch(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "pytOrCh")
        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict_proba(X),
                                   hb_model.predict_proba(X),
                                   rtol=1e-06, atol=1e-06)

    # Test pytorch save and load
    def test_pytorch_save_load(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "torch")
        self.assertIsNotNone(hb_model)
        hb_model.save("pt-tmp")
        hb_model_loaded = hummingbird.ml.TorchContainer.load("pt-tmp")
        np.testing.assert_allclose(hb_model_loaded.predict_proba(X),
                                   hb_model.predict_proba(X),
                                   rtol=1e-06, atol=1e-06)
        os.remove("pt-tmp.zip")

    # Test pytorch save and generic load
    def test_pytorch_save_generic_load(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "torch")
        self.assertIsNotNone(hb_model)
        hb_model.save("pt-tmp")
        hb_model_loaded = hummingbird.ml.load("pt-tmp")
        np.testing.assert_allclose(hb_model_loaded.predict_proba(X),
                                   hb_model.predict_proba(X),
                                   rtol=1e-06, atol=1e-06)
        os.remove("pt-tmp.zip")

    # Loading the same saved model twice must work.
    def test_pytorch_save_load_load(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "torch")
        self.assertIsNotNone(hb_model)
        hb_model.save("pt-tmp")
        hummingbird.ml.load("pt-tmp")
        hummingbird.ml.load("pt-tmp")
        os.remove("pt-tmp.zip")

    def test_pytorch_save_load_more_versions(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "torch")
        self.assertIsNotNone(hb_model)
        hb_model.save("pt-tmp")
        shutil.unpack_archive("pt-tmp.zip", "pt-tmp", format="zip")
        # Adding a new library does not create problems.
        self._rewrite_saved_config("pt-tmp", lambda cfg: cfg + ["\nlibx=1.3"])
        hummingbird.ml.load("pt-tmp")
        os.remove("pt-tmp.zip")

    def test_pytorch_save_load_less_versions(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "torch")
        self.assertIsNotNone(hb_model)
        hb_model.save("pt-tmp")
        shutil.unpack_archive("pt-tmp.zip", "pt-tmp", format="zip")
        # Removing a library does not create problems (keep only the last line).
        self._rewrite_saved_config("pt-tmp", lambda cfg: cfg[-1])
        hummingbird.ml.load("pt-tmp")
        os.remove("pt-tmp.zip")

    def test_pytorch_save_load_different_versions(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "torch")
        self.assertIsNotNone(hb_model)
        hb_model.save("pt-tmp")
        shutil.unpack_archive("pt-tmp.zip", "pt-tmp", format="zip")
        # Changing the version of a library does not create problems.
        self._rewrite_saved_config(
            "pt-tmp", lambda cfg: ["hummingbird=0.0.0.1\n"] + cfg[1:])
        hummingbird.ml.load("pt-tmp")
        os.remove("pt-tmp.zip")

    # Test torchscript save and load
    def test_torchscript_save_load(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "torch.jit", X)
        self.assertIsNotNone(hb_model)
        hb_model.save("ts-tmp")
        hb_model_loaded = hummingbird.ml.TorchContainer.load("ts-tmp")
        np.testing.assert_allclose(hb_model_loaded.predict_proba(X),
                                   hb_model.predict_proba(X),
                                   rtol=1e-06, atol=1e-06)
        os.remove("ts-tmp.zip")

    # Test torchscript save and generic load
    def test_torchscript_save_generic_load(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "torch.jit", X)
        self.assertIsNotNone(hb_model)
        hb_model.save("ts-tmp")
        hb_model_loaded = hummingbird.ml.load("ts-tmp")
        np.testing.assert_allclose(hb_model_loaded.predict_proba(X),
                                   hb_model.predict_proba(X),
                                   rtol=1e-06, atol=1e-06)
        os.remove("ts-tmp.zip")

    def test_load_fails_bad_path(self):
        # Asserts for bad path with extension
        self.assertRaises(AssertionError, hummingbird.ml.load, "nonsense.zip")
        self.assertRaises(AssertionError,
                          hummingbird.ml.TorchContainer.load, "nonsense.zip")
        # Asserts for bad path with no extension
        self.assertRaises(AssertionError, hummingbird.ml.load, "nonsense")
        self.assertRaises(AssertionError,
                          hummingbird.ml.TorchContainer.load, "nonsense")

    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()),
        reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS")
    def test_load_fails_bad_path_onnx(self):
        self.assertRaises(AssertionError,
                          hummingbird.ml.ONNXContainer.load, "nonsense.zip")
        self.assertRaises(AssertionError,
                          hummingbird.ml.ONNXContainer.load, "nonsense")

    @unittest.skipIf(not tvm_installed(),
                     reason="TVM test requires TVM installed")
    def test_load_fails_bad_path_tvm(self):
        self.assertRaises(AssertionError,
                          hummingbird.ml.TVMContainer.load, "nonsense.zip")
        self.assertRaises(AssertionError,
                          hummingbird.ml.TVMContainer.load, "nonsense")

    # Test not supported backends
    def test_unsupported_backend(self):
        model, _ = self._make_trained_gbdt()
        # Test scala backend rises an exception
        self.assertRaises(MissingBackend, hummingbird.ml.convert, model, "scala")

    # Test torchscript requires test_data
    def test_torchscript_test_data(self):
        model, _ = self._make_trained_gbdt()
        # torchscript conversion requires test_input
        self.assertRaises(RuntimeError, hummingbird.ml.convert, model, "torch.jit")

    # Test TVM requires test_data
    @unittest.skipIf(not tvm_installed(),
                     reason="TVM test requires TVM installed")
    def test_tvm_test_data(self):
        model, _ = self._make_trained_gbdt()
        # tvm conversion requires test_input
        self.assertRaises(RuntimeError, hummingbird.ml.convert, model, "tvm")

    # Test tvm save and load
    @unittest.skipIf(not tvm_installed(),
                     reason="TVM test requires TVM installed")
    def test_tvm_save_load(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "tvm", X)
        self.assertIsNotNone(hb_model)
        hb_model.save("tvm-tmp")
        hb_model_loaded = hummingbird.ml.TVMContainer.load("tvm-tmp")
        np.testing.assert_allclose(hb_model_loaded.predict_proba(X),
                                   hb_model.predict_proba(X),
                                   rtol=1e-06, atol=1e-06)
        os.remove("tvm-tmp.zip")

    # Test tvm save and generic load
    @unittest.skipIf(not tvm_installed(),
                     reason="TVM test requires TVM installed")
    def test_tvm_save_generic_load(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "tvm", X)
        self.assertIsNotNone(hb_model)
        hb_model.save("tvm-tmp")
        hb_model_loaded = hummingbird.ml.load("tvm-tmp")
        np.testing.assert_allclose(hb_model_loaded.predict_proba(X),
                                   hb_model.predict_proba(X),
                                   rtol=1e-06, atol=1e-06)
        os.remove("tvm-tmp.zip")

    # Test tvm save and load zip file
    @unittest.skipIf(not tvm_installed(),
                     reason="TVM test requires TVM installed")
    def test_tvm_save_load_zip(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "tvm", X)
        self.assertIsNotNone(hb_model)
        hb_model.save("tvm-tmp.zip")
        hb_model_loaded = hummingbird.ml.TVMContainer.load("tvm-tmp.zip")
        np.testing.assert_allclose(hb_model_loaded.predict_proba(X),
                                   hb_model.predict_proba(X),
                                   rtol=1e-06, atol=1e-06)
        os.remove("tvm-tmp.zip")

    @unittest.skipIf(not tvm_installed(),
                     reason="TVM test requires TVM installed")
    def test_tvm_save_load_load(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "tvm", X)
        self.assertIsNotNone(hb_model)
        hb_model.save("tvm-tmp.zip")
        hummingbird.ml.TVMContainer.load("tvm-tmp.zip")
        hummingbird.ml.TVMContainer.load("tvm-tmp.zip")
        os.remove("tvm-tmp.zip")

    @unittest.skipIf(not tvm_installed(),
                     reason="TVM test requires TVM installed")
    def test_tvm_save_load_no_versions(self):
        from hummingbird.ml.operator_converters import constants

        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "tvm", X)
        self.assertIsNotNone(hb_model)
        hb_model.save("tvm-tmp")
        shutil.unpack_archive("tvm-tmp.zip", "tvm-tmp", format="zip")
        # Removing the configuration file with the versions does not create problems.
        os.remove(
            os.path.join("tvm-tmp",
                         constants.SAVE_LOAD_MODEL_CONFIGURATION_PATH))
        hummingbird.ml.load("tvm-tmp")
        os.remove("tvm-tmp.zip")

    # Test onnx requires test_data or initial_types
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()),
        reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS")
    def test_onnx_no_test_data_float(self):
        model, X = self._make_trained_gbdt()
        # Create ONNX-ML model
        onnx_ml_model = convert_sklearn(
            model,
            initial_types=[("input",
                            FloatTensorType([X.shape[0], X.shape[1]]))],
            target_opset=11)
        # With fully-specified initial_types, no test_data is required.
        hb_model = hummingbird.ml.convert(onnx_ml_model, "onnx")
        assert hb_model

    # Test onnx 0 shape input
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()),
        reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS")
    def test_onnx_zero_shape_input(self):
        model, X = self._make_trained_gbdt(cast_to_float32=False)
        # Create ONNX-ML model with a 0 (dynamic) batch dimension.
        onnx_ml_model = convert_sklearn(
            model,
            initial_types=[("input", DoubleTensorType([0, X.shape[1]]))],
            target_opset=11)
        hb_model = hummingbird.ml.convert(onnx_ml_model, "onnx")
        assert hb_model

    # Test onnx no test_data, double input
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()),
        reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS")
    def test_onnx_no_test_data_double(self):
        model, X = self._make_trained_gbdt(cast_to_float32=False)
        # Create ONNX-ML model
        onnx_ml_model = convert_sklearn(
            model,
            initial_types=[("input",
                            DoubleTensorType([X.shape[0], X.shape[1]]))],
            target_opset=11)
        hb_model = hummingbird.ml.convert(onnx_ml_model, "onnx")
        assert hb_model

    # Test onnx no test_data, long input
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()),
        reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS")
    def test_onnx_no_test_data_long(self):
        warnings.filterwarnings("ignore")
        # Fixed: previous revision had a duplicated "model = model =" assignment.
        model = StandardScaler(with_mean=True, with_std=True)
        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.int64)
        model.fit(X)
        # Create ONNX-ML model
        onnx_ml_model = convert_sklearn(
            model,
            initial_types=[("input",
                            Int64TensorType([X.shape[0], X.shape[1]]))],
            target_opset=11)
        hb_model = hummingbird.ml.convert(onnx_ml_model, "onnx")
        assert hb_model

    # Test onnx no test_data, int input
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()),
        reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS")
    def test_onnx_no_test_data_int(self):
        warnings.filterwarnings("ignore")
        model = OneHotEncoder()
        X = np.array([[1, 2, 3]], dtype=np.int32)
        model.fit(X)
        # Create ONNX-ML model
        onnx_ml_model = convert_sklearn(
            model,
            initial_types=[("input",
                            Int32TensorType([X.shape[0], X.shape[1]]))],
            target_opset=11)
        hb_model = hummingbird.ml.convert(onnx_ml_model, "onnx")
        assert hb_model

    # Test onnx no test_data, string input
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()),
        reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS")
    def test_onnx_no_test_data_string(self):
        warnings.filterwarnings("ignore")
        model = OneHotEncoder()
        X = np.array([["a", "b", "c"]])
        model.fit(X)
        # Create ONNX-ML model
        onnx_ml_model = convert_sklearn(
            model,
            initial_types=[("input",
                            StringTensorType([X.shape[0], X.shape[1]]))],
            target_opset=11)
        # String inputs without test_data are unsupported: conversion must raise.
        self.assertRaises(RuntimeError, hummingbird.ml.convert,
                          onnx_ml_model, "onnx")

    # Test ONNX save and load
    @unittest.skipIf(not onnx_runtime_installed(),
                     reason="ONNX test requires ORT")
    def test_onnx_save_load(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "onnx", X)
        self.assertIsNotNone(hb_model)
        hb_model.save("onnx-tmp")
        hb_model_loaded = hummingbird.ml.ONNXContainer.load("onnx-tmp")
        np.testing.assert_allclose(hb_model_loaded.predict_proba(X),
                                   hb_model.predict_proba(X),
                                   rtol=1e-06, atol=1e-06)
        os.remove("onnx-tmp.zip")

    # Test ONNX save and generic load
    @unittest.skipIf(not onnx_runtime_installed(),
                     reason="ONNX test requires ORT")
    def test_onnx_save_generic_load(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "onnx", X)
        self.assertIsNotNone(hb_model)
        hb_model.save("onnx-tmp")
        hb_model_loaded = hummingbird.ml.load("onnx-tmp")
        np.testing.assert_allclose(hb_model_loaded.predict_proba(X),
                                   hb_model.predict_proba(X),
                                   rtol=1e-06, atol=1e-06)
        os.remove("onnx-tmp.zip")

    # Test ONNX save and repeated generic load
    @unittest.skipIf(not onnx_runtime_installed(),
                     reason="ONNX test requires ORT")
    def test_onnx_save_load_load(self):
        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "onnx", X)
        self.assertIsNotNone(hb_model)
        hb_model.save("onnx-tmp")
        hummingbird.ml.load("onnx-tmp")
        hummingbird.ml.load("onnx-tmp")
        os.remove("onnx-tmp.zip")

    @unittest.skipIf(not onnx_runtime_installed(),
                     reason="ONNX test requires ORT")
    def test_onnx_save_load_no_versions(self):
        from hummingbird.ml.operator_converters import constants

        model, X = self._make_trained_gbdt()
        hb_model = hummingbird.ml.convert(model, "onnx", X)
        self.assertIsNotNone(hb_model)
        hb_model.save("onnx-tmp")
        shutil.unpack_archive("onnx-tmp.zip", "onnx-tmp", format="zip")
        # Removing the configuration file with the versions does not create problems.
        os.remove(
            os.path.join("onnx-tmp",
                         constants.SAVE_LOAD_MODEL_CONFIGURATION_PATH))
        hummingbird.ml.load("onnx-tmp")
        os.remove("onnx-tmp.zip")

    # Test for when the user forgets to add a target (ex: convert(model, output)
    # rather than convert(model, 'torch')) due to API change
    def test_forgotten_backend_string(self):
        from sklearn.preprocessing import LabelEncoder

        model = LabelEncoder()
        data = np.array([1, 4, 5, 2, 0, 2], dtype=np.int32)
        model.fit(data)
        self.assertRaises(ValueError, hummingbird.ml.convert, model,
                          [("input", Int32TensorType([6, 1]))])

    # Test ONNX
    @unittest.skipIf(not onnx_runtime_installed(),
                     reason="ONNX test requires ORT")
    def test_onnx(self):
        import lightgbm as lgb

        # Create some random data for binary classification.
        num_classes = 2
        X = np.array(np.random.rand(10000, 28), dtype=np.float32)
        y = np.random.randint(num_classes, size=10000)

        model = lgb.LGBMClassifier()
        model.fit(X, y)

        # LightGBM to "onnx" conversion must raise without proper tooling.
        self.assertRaises(RuntimeError, hummingbird.ml.convert, model, "onnx")

    # Test Spark UDF
    @unittest.skipIf(
        os.name == "nt" or not sparkml_installed()
        or LooseVersion(pyspark.__version__) < LooseVersion("3"),
        reason="UDF Test requires spark >= 3",
    )
    def test_udf_torch(self):
        X, y = load_iris(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=77, test_size=0.2,
        )
        spark_df = sql_context.createDataFrame(pd.DataFrame(data=X_train))
        sql_context.registerDataFrameAsTable(spark_df, "IRIS")

        model = GradientBoostingClassifier(n_estimators=10)
        model.fit(X_train, y_train)
        hb_model = hummingbird.ml.convert(model, "torch")

        # Broadcast the model.
        broadcasted_model = spark.sparkContext.broadcast(hb_model)

        # UDF definition.
        @pandas_udf("long")
        def udf_hb_predict(
                iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
            model = broadcasted_model.value
            for args in iterator:
                data_unmangled = pd.concat([feature for feature in args], axis=1)
                predictions = model.predict(data_unmangled)
                yield pd.Series(np.array(predictions))

        # Register the UDF.
        sql_context.udf.register("PREDICT", udf_hb_predict)

        # Run the query.
        sql_context.sql(
            "SELECT SUM(prediction) FROM (SELECT PREDICT(*) as prediction FROM IRIS)"
        ).show()

    @unittest.skipIf(
        os.name == "nt" or not sparkml_installed()
        or LooseVersion(pyspark.__version__) < LooseVersion("3"),
        reason="UDF Test requires spark >= 3",
    )
    def test_udf_torch_jit_broadcast(self):
        import pickle

        X, y = load_iris(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=77, test_size=0.2,
        )
        spark_df = sql_context.createDataFrame(pd.DataFrame(data=X_train))
        sql_context.registerDataFrameAsTable(spark_df, "IRIS")

        model = GradientBoostingClassifier(n_estimators=10)
        model.fit(X_train, y_train)
        hb_model = hummingbird.ml.convert(model, "torch.jit", X_test)

        # Broadcast the model returns an error.
        self.assertRaises(pickle.PickleError, spark.sparkContext.broadcast,
                          hb_model)

    @unittest.skipIf(
        os.name == "nt" or not sparkml_installed()
        or LooseVersion(pyspark.__version__) < LooseVersion("3"),
        reason="UDF Test requires spark >= 3",
    )
    def test_udf_torch_jit_spark_file(self):
        # NOTE(review): dill appears unused here — confirm pyspark does not rely
        # on it being importable before removing.
        import dill
        import torch.jit

        X, y = load_iris(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=77, test_size=0.2,
        )
        spark_df = sql_context.createDataFrame(pd.DataFrame(data=X_train))
        sql_context.registerDataFrameAsTable(spark_df, "IRIS")

        model = GradientBoostingClassifier(n_estimators=10)
        model.fit(X_train, y_train)
        hb_model = hummingbird.ml.convert(model, "torch.jit", X_test)

        # Save the file locally.
        if os.path.exists("deployed_model.zip"):
            os.remove("deployed_model.zip")
        torch.jit.save(hb_model.model, "deployed_model.zip")
        hb_model._model = None

        # Share the model using spark file and broadcast the container.
        spark.sparkContext.addFile("deployed_model.zip")
        broadcasted_container = spark.sparkContext.broadcast(hb_model)

        # UDF definition.
        @pandas_udf("long")
        def udf_hb_predict(
                iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
            location = SparkFiles.get("deployed_model.zip")
            torch_model = torch.jit.load(location)
            container = broadcasted_container.value
            container._model = torch_model
            model = container
            for args in iterator:
                data_unmangled = pd.concat([feature for feature in args], axis=1)
                predictions = model.predict(data_unmangled.values)
                yield pd.Series(np.array(predictions))

        # Register the UDF.
        sql_context.udf.register("PREDICT", udf_hb_predict)

        # Run the query.
        sql_context.sql(
            "SELECT SUM(prediction) FROM (SELECT PREDICT(*) as prediction FROM IRIS)"
        ).show()

        os.remove("deployed_model.zip")