class TestSparkMLVectorAssembler(unittest.TestCase):
    """Conversion check for the Spark-ML ``VectorAssembler`` stage."""

    # Test VectorAssembler
    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()), reason="Spark-ML test requires pyspark and pandas")
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Spark-ML test requires torch >= 1.6.0")
    def test_vectorassembler_converter(self):
        feature_names = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
        iris = load_iris()
        full_frame = pd.DataFrame(
            data=np.c_[iris["data"], iris["target"]],
            columns=feature_names + ["target"],
        )
        # Only the four feature columns are handed to Spark and to the converted model.
        pd_df = full_frame[["sepal_length", "sepal_width", "petal_length", "petal_width"]]
        spark_df = sql.createDataFrame(pd_df)

        assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
        torch_model = convert(assembler, "torch", spark_df)
        self.assertIsNotNone(torch_model)

        # Spark returns one vector object per row; turn each into a NumPy array
        # so the rows can be stacked into a single 2-D matrix for comparison.
        expected = assembler.transform(spark_df).toPandas()
        expected["features"] = expected["features"].map(lambda vec: np.array(vec.toArray()))
        expected_rows = expected["features"].to_numpy()

        actual = torch_model.transform(pd_df)
        np.testing.assert_allclose(np.vstack(expected_rows), actual, rtol=1e-06, atol=1e-06)
class TestSparkMLDiscretizers(unittest.TestCase):
    """Conversion check for the Spark-ML ``QuantileDiscretizer`` stage."""

    # Test QuantileDiscretizer
    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()), reason="Spark-ML test requires pyspark and pandas")
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Spark-ML test requires torch >= 1.6.0")
    def test_quantilediscretizer_converter(self):
        column_names = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
        iris = load_iris()
        pd_df = pd.DataFrame(
            data=np.c_[iris["data"], iris["target"]],
            columns=column_names + ["target"],
        )
        # The discretizer is fitted and evaluated on the single "sepal_length" column.
        spark_df = sql.createDataFrame(pd_df).select("sepal_length")

        discretizer = QuantileDiscretizer(
            inputCol="sepal_length",
            outputCol="sepal_length_bucket",
            numBuckets=2,
        )
        fitted = discretizer.fit(spark_df)

        torch_model = convert(fitted, "torch", spark_df)
        self.assertIsNotNone(torch_model)

        expected = fitted.transform(spark_df).select("sepal_length_bucket").toPandas()
        actual = torch_model.transform(pd_df[["sepal_length"]])
        np.testing.assert_allclose(expected.to_numpy(), actual, rtol=1e-06, atol=1e-06)
class TestSparkMLLinear(unittest.TestCase):
    """Conversion checks for Spark-ML linear classifiers."""

    def _test_linear(self, classes, model_class):
        """Fit ``model_class`` on random data and compare class probabilities.

        Args:
            classes: Number of distinct labels drawn for the random targets.
            model_class: Spark-ML estimator class, instantiated with no arguments.
        """
        n_total, n_features = 100, 10
        np.random.seed(0)
        warnings.filterwarnings("ignore")

        X = np.array(np.random.rand(n_total, n_features), dtype=np.float32)
        y = np.random.randint(classes, size=(n_total, 1))

        # Put the label in column 0 so each row can be split into (label, features).
        labelled_rows = np.concatenate([y, X], axis=1).reshape(n_total, -1)
        records = ((int(row[0]), Vectors.dense(row[1:])) for row in labelled_rows)
        df = sql.createDataFrame(records, schema=["label", "features"])

        fitted = model_class().fit(df)

        sample_df = df.select("features").limit(10)
        torch_model = convert(fitted, "torch", sample_df)
        self.assertIsNotNone(torch_model)

        expected_proba = np.array(fitted.transform(df).select("probability").collect()).reshape(-1, classes)
        np.testing.assert_allclose(
            expected_proba,
            torch_model.predict_proba(X),
            rtol=1e-06,
            atol=1e-06,
        )

    # pyspark.ml.LogisticRegression with two classes
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Spark-ML test requires torch >= 1.6.0")
    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()), reason="Spark-ML test requires pyspark and pandas")
    def test_logistic_regression_binary(self):
        self._test_linear(2, model_class=LogisticRegression)

    # pyspark.ml.LogisticRegression with multi_class
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Spark-ML test requires torch >= 1.6.0")
    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()), reason="Spark-ML test requires pyspark and pandas")
    def test_logistic_regression_multi_class(self):
        self._test_linear(5, model_class=LogisticRegression)
class TestExtraConf(unittest.TestCase):
    """Tests for ``extra_config`` options and for batched conversion on each backend.

    The private ``_fit_*`` and ``_check_*`` helpers hold the setup shared by the
    tests. Every test still states its own backend, row count, whether a
    remainder size is passed, and which prediction methods are compared.
    """

    # Column names used by the pandas-based batch tests.
    _IRIS_COLUMNS = ["vA", "vB", "vC"]

    # Prediction methods compared for each estimator family.
    _REGRESSOR_METHODS = ("predict",)
    _CLASSIFIER_METHODS = ("predict", "predict_proba")
    _IFOREST_METHODS = ("predict", "decision_function", "score_samples")
    _TRANSFORMER_METHODS = ("transform",)

    @staticmethod
    def _random_data(n_rows, n_cols=200, num_classes=None):
        """Return seeded random ``(X, y)``.

        The global NumPy seed is reset to 0, then ``X`` (float32, shape
        ``(n_rows, n_cols)``) is drawn, then ``y`` (integers below
        ``num_classes``). ``y`` is ``None`` and nothing is drawn for it when
        ``num_classes`` is ``None``.
        """
        np.random.seed(0)
        X = np.array(np.random.rand(n_rows, n_cols), dtype=np.float32)
        y = None if num_classes is None else np.random.randint(num_classes, size=n_rows)
        return X, y

    @classmethod
    def _fit_gradient_boosting(cls, estimator_class, n_rows):
        """Fit ``estimator_class(n_estimators=10, max_depth=10)`` on random two-label data.

        Returns:
            The fitted model and the feature matrix it was trained on.
        """
        model = estimator_class(n_estimators=10, max_depth=10)
        X, y = cls._random_data(n_rows, num_classes=2)
        model.fit(X, y)
        return model, X

    @classmethod
    def _fit_isolation_forest(cls, n_rows):
        """Fit a small ``IsolationForest`` on random data; return ``(model, X)``."""
        model = IsolationForest(n_estimators=10, max_samples=2)
        X, y = cls._random_data(n_rows, num_classes=2)
        model.fit(X, y)
        return model, X

    @classmethod
    def _fit_scaler(cls, n_rows):
        """Fit a mean/std ``StandardScaler`` on random data; return ``(model, X)``."""
        model = StandardScaler(with_mean=True, with_std=True)
        X, _ = cls._random_data(n_rows)
        model.fit(X)
        return model, X

    @staticmethod
    def _fit_small_lgbm():
        """Fit a 3-tree LightGBM regressor on a fixed 3x2 dataset; return ``(model, X)``."""
        X = np.array([[0, 1], [1, 1], [2, 0]], dtype=np.float32)
        y = np.array([100, -10, 50], dtype=np.float32)
        model = lgb.LGBMRegressor(n_estimators=3, min_child_samples=1)
        model.fit(X, y)
        return model, X

    @classmethod
    def _fit_random_lgbm(cls):
        """Fit a 10-tree LightGBM regressor on 100x20 random data; return ``(model, X)``."""
        X, y = cls._random_data(100, n_cols=20, num_classes=2)
        model = lgb.LGBMRegressor(n_estimators=10)
        model.fit(X, y)
        return model, X

    def _assert_same_outputs(self, model, hb_model, X, methods):
        """Assert that every method in ``methods`` agrees between the two models on ``X``."""
        for method in methods:
            np.testing.assert_allclose(
                getattr(model, method)(X), getattr(hb_model, method)(X), rtol=1e-06, atol=1e-06
            )

    def _check_batch(self, model, backend, X, methods, pass_remainder=True, batch_size=10):
        """Convert ``model`` with ``convert_batch`` and compare its outputs on all of ``X``.

        Args:
            model: Fitted estimator to convert.
            backend: Backend name handed to ``convert_batch``.
            X: Full feature matrix; its first ``batch_size`` rows are the sample input.
            methods: Names of the prediction methods to compare.
            pass_remainder: When true, ``X.shape[0] % batch_size`` is passed as
                ``remainder_size``; when false the argument is omitted entirely.
            batch_size: Number of rows in the sample input.
        """
        sample = X[:batch_size, :]
        if pass_remainder:
            hb_model = hummingbird.ml.convert_batch(
                model, backend, sample, remainder_size=X.shape[0] % batch_size
            )
        else:
            hb_model = hummingbird.ml.convert_batch(model, backend, sample)

        self.assertIsNotNone(hb_model)
        self._assert_same_outputs(model, hb_model, X, methods)

    @classmethod
    def _fit_iris_pipeline(cls, n_rows=None):
        """Fit a passthrough + gradient-boosting pipeline on the first three iris features.

        Args:
            n_rows: Number of leading iris rows to use; ``None`` uses all rows.

        Returns:
            The fitted pipeline, the NumPy feature matrix, and the same features
            as a pandas ``DataFrame`` with the ``_IRIS_COLUMNS`` names.
        """
        # Imported here because only the pandas-guarded tests call this helper.
        import pandas

        iris = datasets.load_iris()
        X = iris.data[:n_rows, :3]
        y = iris.target[:n_rows]
        X_train = pandas.DataFrame(X, columns=cls._IRIS_COLUMNS)
        pipeline = Pipeline(
            steps=[
                ("preprocessor", ColumnTransformer(transformers=[], remainder="passthrough")),
                ("classifier", GradientBoostingClassifier(n_estimators=10, max_depth=10)),
            ]
        )
        pipeline.fit(X_train, y)
        return pipeline, X, X_train

    def _check_pandas_batch(self, backend, pipeline, X, X_train, model=None, batch_size=10):
        """Batch-convert with a pandas sample input and compare ``predict_proba``.

        Args:
            backend: Backend name handed to ``convert_batch``.
            pipeline: Fitted scikit-learn pipeline providing the expected output.
            X: NumPy feature matrix the sample ``DataFrame`` is sliced from.
            X_train: ``DataFrame`` both models are evaluated on.
            model: Object to convert; defaults to ``pipeline`` itself.
            batch_size: Number of rows in the sample input.
        """
        import pandas

        sample = pandas.DataFrame(X[:batch_size], columns=self._IRIS_COLUMNS)
        hb_model = hummingbird.ml.convert_batch(
            pipeline if model is None else model, backend, sample, X.shape[0] % batch_size
        )

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(
            pipeline.predict_proba(X_train), hb_model.predict_proba(X_train), rtol=1e-06, atol=1e-06,
        )

    # Test default number of threads. It will only work on mac after 1.6 https://github.com/pytorch/pytorch/issues/43036
    @unittest.skipIf(
        sys.platform == "darwin" and LooseVersion(torch.__version__) <= LooseVersion("1.6.0"),
        reason="PyTorch has a bug on mac related to multi-threading",
    )
    def test_torch_deafault_n_threads(self):
        warnings.filterwarnings("ignore")
        model, _ = self._fit_gradient_boosting(GradientBoostingClassifier, 100)

        hb_model = hummingbird.ml.convert(model, "torch")

        self.assertIsNotNone(hb_model)
        self.assertEqual(torch.get_num_threads(), psutil.cpu_count(logical=False))
        self.assertEqual(torch.get_num_interop_threads(), 1)

    # Test one thread in pytorch.
    @unittest.skipIf(
        sys.platform == "darwin" and LooseVersion(torch.__version__) > LooseVersion("1.6.0"),
        reason="Setting threading multi times will break on mac",
    )
    def test_torch_one_thread(self):
        warnings.filterwarnings("ignore")
        model, _ = self._fit_gradient_boosting(GradientBoostingClassifier, 100)

        hb_model = hummingbird.ml.convert(model, "torch", extra_config={constants.N_THREADS: 1})

        self.assertIsNotNone(hb_model)
        self.assertEqual(torch.get_num_threads(), 1)
        self.assertEqual(torch.get_num_interop_threads(), 1)

    # Test default number of threads onnx.
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    def test_onnx_deafault_n_threads(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_gradient_boosting(GradientBoostingClassifier, 100)

        # Create ONNX-ML model
        onnx_ml_model = convert_sklearn(
            model, initial_types=[("input", FloatTensorType([X.shape[0], X.shape[1]]))], target_opset=9
        )

        hb_model = hummingbird.ml.convert(onnx_ml_model, "onnx", X)

        self.assertIsNotNone(hb_model)
        session_options = hb_model._session.get_session_options()
        self.assertEqual(session_options.intra_op_num_threads, psutil.cpu_count(logical=False))
        self.assertEqual(hb_model._session.get_session_options().inter_op_num_threads, 1)

    # Test one thread onnx.
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    def test_onnx_one_thread(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_gradient_boosting(GradientBoostingClassifier, 100)

        hb_model = hummingbird.ml.convert(model, "onnx", X, extra_config={constants.N_THREADS: 1})

        self.assertIsNotNone(hb_model)
        self.assertEqual(hb_model._session.get_session_options().intra_op_num_threads, 1)
        self.assertEqual(hb_model._session.get_session_options().inter_op_num_threads, 1)

    # Test pytorch regressor with batching.
    def test_torch_regression_batch(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_gradient_boosting(GradientBoostingRegressor, 100)
        self._check_batch(model, "torch", X, self._REGRESSOR_METHODS)

    # Test pytorch classifier with batching.
    def test_torch_classification_batch(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_gradient_boosting(GradientBoostingClassifier, 100)
        self._check_batch(model, "torch", X, self._CLASSIFIER_METHODS)

    # Test pytorch iforest with batching.
    def test_torch_iforest_batch(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_isolation_forest(100)
        self._check_batch(model, "torch", X, self._IFOREST_METHODS)

    # Test pytorch regressor with batching and uneven rows.
    def test_torch_batch_regression_uneven(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_gradient_boosting(GradientBoostingRegressor, 105)
        self._check_batch(model, "torch", X, self._REGRESSOR_METHODS)

    # Test pytorch classification with batching and uneven rows.
    def test_torch_batch_classification_uneven(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_gradient_boosting(GradientBoostingClassifier, 105)
        # Only the predicted labels are compared in this test.
        self._check_batch(model, "torch", X, ("predict",))

    # Test pytorch transform with batching and uneven rows.
    def test_torch_batch_transform(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_scaler(105)
        self._check_batch(model, "torch", X, self._TRANSFORMER_METHODS)

    # Test torchscript regression with batching.
    def test_torchscript_regression_batch(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_gradient_boosting(GradientBoostingRegressor, 103)
        self._check_batch(model, "torch.jit", X, self._REGRESSOR_METHODS)

    # Test torchscript classification with batching.
    def test_torchscript_classification_batch(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_gradient_boosting(GradientBoostingClassifier, 103)
        self._check_batch(model, "torch.jit", X, self._CLASSIFIER_METHODS)

    # Test torchscript iforest with batching.
    def test_torchscript_iforest_batch(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_isolation_forest(103)
        self._check_batch(model, "torch.jit", X, self._IFOREST_METHODS)

    # Test torchscript transform with batching and uneven rows.
    def test_torchscript_batch_transform(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_scaler(101)
        self._check_batch(model, "torch.jit", X, self._TRANSFORMER_METHODS)

    # Test onnx transform with batching and uneven rows.
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    def test_onnx_batch_transform(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_scaler(101)
        self._check_batch(model, "onnx", X, self._TRANSFORMER_METHODS)

    # Test onnx regression with batching.
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    def test_onnx_regression_batch(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_gradient_boosting(GradientBoostingRegressor, 103)
        self._check_batch(model, "onnx", X, self._REGRESSOR_METHODS)

    # Test onnx classification with batching.
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    def test_onnx_classification_batch(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_gradient_boosting(GradientBoostingClassifier, 103)
        self._check_batch(model, "onnx", X, self._CLASSIFIER_METHODS)

    # Test onnx iforest with batching.
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    def test_onnx_iforest_batch(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_isolation_forest(103)
        self._check_batch(model, "onnx", X, self._IFOREST_METHODS)

    # Test tvm transform with batching.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_batch_transform(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_scaler(100)
        self._check_batch(model, "tvm", X, self._TRANSFORMER_METHODS, pass_remainder=False)

    # Test tvm regression with batching.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_regression_batch(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_gradient_boosting(GradientBoostingRegressor, 103)
        self._check_batch(model, "tvm", X, self._REGRESSOR_METHODS)

    # Test tvm classification with batching.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_classification_batch(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_gradient_boosting(GradientBoostingClassifier, 100)
        self._check_batch(model, "tvm", X, self._CLASSIFIER_METHODS, pass_remainder=False)

    # Test tvm iforest with batching.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_iforest_batch(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_isolation_forest(100)
        self._check_batch(model, "tvm", X, self._IFOREST_METHODS, pass_remainder=False)

    # Test tvm transform with batching and uneven number of records.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_batch_remainder_transform(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_scaler(105)
        self._check_batch(model, "tvm", X, self._TRANSFORMER_METHODS)

    # Test tvm regression with batching and uneven number of records.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_regression_remainder_batch(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_gradient_boosting(GradientBoostingRegressor, 105)
        self._check_batch(model, "tvm", X, self._REGRESSOR_METHODS)

    # Test tvm classification with batching and uneven number of records.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_classification_remainder_batch(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_gradient_boosting(GradientBoostingClassifier, 105)
        self._check_batch(model, "tvm", X, self._CLASSIFIER_METHODS)

    # Test tvm iforest with batching and uneven number of records.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_iforest_remainder_batch(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_isolation_forest(105)
        self._check_batch(model, "tvm", X, self._IFOREST_METHODS)

    # Test batch with pandas.
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pandas_batch(self):
        pipeline, X, X_train = self._fit_iris_pipeline(149)
        self._check_pandas_batch("torch", pipeline, X, X_train)

    # Test batch with pandas ts.
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pandas_batch_ts(self):
        pipeline, X, X_train = self._fit_iris_pipeline(149)
        self._check_pandas_batch("torch.jit", pipeline, X, X_train)

    # Test batch with pandas onnx.
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    @unittest.skipIf(not onnx_runtime_installed(), reason="ONNXML test require ONNX and ORT")
    def test_pandas_batch_onnx(self):
        pipeline, X, X_train = self._fit_iris_pipeline(149)
        self._check_pandas_batch("onnx", pipeline, X, X_train)

    # Test batch with pandas from onnxml.
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    def test_pandas_batch_onnxml(self):
        # This test uses every iris row, unlike the other pandas batch tests.
        pipeline, X, X_train = self._fit_iris_pipeline()

        # Create ONNX-ML model
        onnx_ml_model = convert_sklearn(
            pipeline,
            initial_types=[(name, DoubleTensorType([X.shape[0], 1])) for name in self._IRIS_COLUMNS],
            target_opset=9,
        )

        self._check_pandas_batch("onnx", pipeline, X, X_train, model=onnx_ml_model)

    # Test batch with pandas tvm.
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    @unittest.skipIf(not tvm_installed(), reason="TVM test requires TVM")
    def test_pandas_batch_tvm(self):
        pipeline, X, X_train = self._fit_iris_pipeline(149)
        self._check_pandas_batch("tvm", pipeline, X, X_train)

    # Check converter with model name set as extra_config.
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    @unittest.skipIf(not lightgbm_installed(), reason="LightGBM test requires LightGBM installed")
    def test_lightgbm_pytorch_extra_config(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_small_lgbm()

        # Create ONNX-ML model
        onnx_ml_model = convert_lightgbm(
            model, initial_types=[("input", FloatTensorType([X.shape[0], X.shape[1]]))], target_opset=9
        )

        # Create ONNX model
        model_name = "hummingbird.ml.test.lightgbm"
        onnx_model = hummingbird.ml.convert(onnx_ml_model, "onnx", extra_config={constants.ONNX_OUTPUT_MODEL_NAME: model_name})

        # A unittest assertion is used so the check is not stripped under ``python -O``.
        self.assertEqual(onnx_model.model.graph.name, model_name)

    # Test max fuse depth configuration in TVM.
    @unittest.skipIf(not tvm_installed(), reason="TVM test requires TVM installed")
    @unittest.skipIf(not lightgbm_installed(), reason="LightGBM test requires LightGBM installed")
    def test_tvm_max_fuse(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_small_lgbm()

        hb_model = hummingbird.ml.convert(model, "tvm", X, extra_config={constants.TVM_MAX_FUSE_DEPTH: 30})

        self.assertIsNotNone(hb_model)
        self._assert_same_outputs(model, hb_model, X, self._REGRESSOR_METHODS)

    # Test TVM without padding returns an error if sizes don't match.
    @unittest.skipIf(not tvm_installed(), reason="TVM test requires TVM installed")
    @unittest.skipIf(not lightgbm_installed(), reason="LightGBM test requires LightGBM installed")
    def test_tvm_no_padding(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_random_lgbm()

        hb_model = hummingbird.ml.convert(model, "tvm", X)

        self.assertIsNotNone(hb_model)
        # The model was converted for 100 rows, so a 98-row input must be rejected.
        self.assertRaises(AssertionError, hb_model.predict, X[:98])

    # Test padding in TVM.
    @unittest.skipIf(not tvm_installed(), reason="TVM test requires TVM installed")
    @unittest.skipIf(not lightgbm_installed(), reason="LightGBM test requires LightGBM installed")
    def test_tvm_padding(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_random_lgbm()

        hb_model = hummingbird.ml.convert(model, "tvm", X, extra_config={constants.TVM_PAD_INPUT: True})

        self.assertIsNotNone(hb_model)
        self._assert_same_outputs(model, hb_model, X[:98], self._REGRESSOR_METHODS)

    # Test padding in TVM does not create problems when not necessary.
    @unittest.skipIf(not tvm_installed(), reason="TVM test requires TVM installed")
    @unittest.skipIf(not lightgbm_installed(), reason="LightGBM test requires LightGBM installed")
    def test_tvm_padding_2(self):
        warnings.filterwarnings("ignore")
        model, X = self._fit_small_lgbm()

        hb_model = hummingbird.ml.convert(model, "tvm", X, extra_config={constants.TVM_PAD_INPUT: True})

        self.assertIsNotNone(hb_model)
        self._assert_same_outputs(model, hb_model, X, self._REGRESSOR_METHODS)

    # Test max string length.
    def test_max_str_length(self):
        model = LabelEncoder()
        data = [
            "paris",
            "tokyo",
            "amsterdam",
            "tokyo",
        ]
        model.fit(data)

        torch_model = hummingbird.ml.convert(model, "torch", extra_config={constants.MAX_STRING_LENGTH: 20})

        np.testing.assert_allclose(model.transform(data), torch_model.transform(data), rtol=1e-06, atol=1e-06)
class TestSparkMLPipeline(unittest.TestCase):
    """Compare fitted Spark-ML pipelines with their Hummingbird torch conversions."""

    # Column names assigned to the four iris measurements.
    _IRIS_FEATURES = ("sepal_length", "sepal_width", "petal_length", "petal_width")

    def _load_iris_frames(self):
        """Return iris (four features plus ``label``) as a pandas and a Spark DataFrame."""
        iris = load_iris()
        column_names = list(self._IRIS_FEATURES) + ["label"]
        pandas_frame = pd.DataFrame(data=np.c_[iris["data"], iris["target"]], columns=column_names)
        return pandas_frame, sql.createDataFrame(pandas_frame)

    def _assert_matches_spark(self, spark_model, spark_frame, torch_model, torch_input, n_classes, proba_atol):
        """Assert the converted model reproduces Spark's predictions and probabilities.

        ``spark_frame`` is fed to the Spark model and ``torch_input`` to the
        converted one. Labels are compared with atol=1e-06; probabilities are
        reshaped to ``(-1, n_classes)`` and compared with ``proba_atol``.
        """
        self.assertIsNotNone(torch_model)

        spark_labels = np.array(spark_model.transform(spark_frame).select("prediction").collect())
        np.testing.assert_allclose(
            spark_labels.reshape(-1),
            torch_model.predict(torch_input),
            rtol=1e-06,
            atol=1e-06,
        )

        spark_probs = np.array(spark_model.transform(spark_frame).select("probability").collect())
        np.testing.assert_allclose(
            spark_probs.reshape(-1, n_classes),
            torch_model.predict_proba(torch_input),
            rtol=1e-06,
            atol=proba_atol,
        )

    @unittest.skipIf(not sparkml_installed(), reason="Spark-ML test requires pyspark")
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Spark-ML test requires torch >= 1.6.0")
    def test_pipeline_1(self):
        """Single-stage pipeline: logistic regression on random dense vectors."""
        n_features, n_total, classes = 10, 100, 2
        np.random.seed(0)
        warnings.filterwarnings("ignore")

        X = np.array(np.random.rand(n_total, n_features), dtype=np.float32)
        y = np.random.randint(classes, size=(n_total, 1))

        # Label goes in column 0, followed by the features.
        stacked = np.concatenate([y, X], axis=1).reshape(n_total, -1)
        rows = [(int(row[0]), Vectors.dense(row[1:])) for row in stacked]
        spark_frame = sql.createDataFrame(rows, schema=["label", "features"])

        fitted = Pipeline(stages=[LogisticRegression()]).fit(spark_frame)

        # A single row is handed to convert() as the sample input.
        sample = spark_frame.select("features").limit(1)
        torch_model = convert(fitted, "torch", sample)

        self._assert_matches_spark(fitted, spark_frame, torch_model, X, n_classes=classes, proba_atol=1e-06)

    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Spark-ML test requires torch >= 1.6.0")
    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()), reason="Spark-ML test requires pyspark and pandas")
    def test_pipeline2(self):
        """Pipeline of one QuantileDiscretizer, a VectorAssembler and logistic regression."""
        pandas_frame, spark_frame = self._load_iris_frames()
        raw_columns = list(self._IRIS_FEATURES)

        bucketizer = QuantileDiscretizer(inputCol="sepal_length", outputCol="sepal_length_bucket", numBuckets=2)
        assembler = VectorAssembler(inputCols=["sepal_length_bucket"] + raw_columns, outputCol="features")
        fitted = Pipeline(stages=[bucketizer, assembler, LogisticRegression()]).fit(spark_frame)

        # Only the raw measurement columns are passed at conversion/inference time.
        spark_inputs = spark_frame.select(raw_columns)
        pandas_inputs = pandas_frame[raw_columns]
        torch_model = convert(fitted, "torch", spark_inputs)

        self._assert_matches_spark(fitted, spark_inputs, torch_model, pandas_inputs, n_classes=3, proba_atol=1e-05)

    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()), reason="Spark-ML test requires pyspark and pandas")
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Spark-ML test requires torch >= 1.6.0")
    def test_pipeline3(self):
        """Pipeline of two QuantileDiscretizers, a VectorAssembler and logistic regression."""
        pandas_frame, spark_frame = self._load_iris_frames()
        raw_columns = list(self._IRIS_FEATURES)

        length_buckets = QuantileDiscretizer(inputCol="sepal_length", outputCol="sepal_length_bucket", numBuckets=2)
        width_buckets = QuantileDiscretizer(inputCol="sepal_width", outputCol="sepal_width_bucket", numBuckets=2)
        assembler = VectorAssembler(
            inputCols=["sepal_length_bucket", "sepal_width_bucket"] + raw_columns,
            outputCol="features",
        )
        stages = [length_buckets, width_buckets, assembler, LogisticRegression()]
        fitted = Pipeline(stages=stages).fit(spark_frame)

        # Only the raw measurement columns are passed at conversion/inference time.
        spark_inputs = spark_frame.select(raw_columns)
        pandas_inputs = pandas_frame[raw_columns]
        torch_model = convert(fitted, "torch", spark_inputs)

        self._assert_matches_spark(fitted, spark_inputs, torch_model, pandas_inputs, n_classes=3, proba_atol=1e-05)
from hummingbird.ml._utils import sparkml_installed, pandas_installed from hummingbird.ml import convert from distutils.version import LooseVersion if sparkml_installed(): from pyspark.sql import SparkSession, SQLContext from pyspark.ml import Pipeline from pyspark.ml.linalg import Vectors from pyspark.ml.classification import LogisticRegression from pyspark.ml.feature import QuantileDiscretizer, VectorAssembler spark = SparkSession.builder.master("local[*]").config( "spark.driver.bindAddress", "127.0.0.1").getOrCreate() sql = SQLContext(spark) if pandas_installed(): import pandas as pd class TestSparkMLPipeline(unittest.TestCase): @unittest.skipIf(not sparkml_installed(), reason="Spark-ML test requires pyspark") @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Spark-ML test requires torch >= 1.6.0") def test_pipeline_1(self): n_features = 10 n_total = 100 classes = 2 np.random.seed(0) warnings.filterwarnings("ignore") X = np.random.rand(n_total, n_features)
class TestProphet(unittest.TestCase):
    """Compare Prophet's trend forecast with the Hummingbird-converted model."""

    def _get_data(self):
        """Return the Peyton Manning example dataset as a pandas DataFrame.

        The CSV is read from ``tests/resources``; when the file is missing it
        is first downloaded from the Prophet GitHub repository.
        """
        local_path = "tests/resources"
        local_data = os.path.join(local_path, "example_wp_log_peyton_manning.csv")
        url = "https://raw.githubusercontent.com/facebook/prophet/master/examples/example_wp_log_peyton_manning.csv"
        if not os.path.isfile(local_data):
            # The directory may already exist (e.g. it holds other resources)
            # even though this particular file is missing.
            os.makedirs(local_path, exist_ok=True)
            urlretrieve(url, local_data)
        return pd.read_csv(local_data)

    @unittest.skipIf(not (pandas_installed() and prophet_installed()), reason="Test requires Prophet and Pandas")
    def test_prophet_trend(self):
        """Torch backend: the converted model reproduces Prophet's ``trend`` column."""
        df = self._get_data()

        m = Prophet()
        m.fit(df)

        # Convert with Hummingbird.
        hb_model = hummingbird.ml.convert(m, "torch")

        # Predictions.
        future = m.make_future_dataframe(periods=365)
        prophet_trend = m.predict(future)["trend"].values
        hb_trend = hb_model.predict(future)

        np.testing.assert_allclose(prophet_trend, hb_trend, rtol=1e-06, atol=1e-06)

    @unittest.skipIf(
        not (pandas_installed() and prophet_installed()),
        reason="Test requires Prophet, Pandas and ONNX runtime.",
    )
    @unittest.skipIf(
        LooseVersion(torch.__version__) < LooseVersion("1.8.1"),
        reason="Test requires Torch 1.8.1.",
    )
    @unittest.skipIf(
        not onnx_runtime_installed() or LooseVersion(onnxruntime.__version__) < LooseVersion("1.7.0"),
        reason="Prophet test requires onnxruntime => 1.7.0",
    )
    def test_prophet_trend_onnx(self):
        """ONNX backend: the converted model reproduces Prophet's ``trend`` column."""
        df = self._get_data()

        m = Prophet()
        m.fit(df)

        future = m.make_future_dataframe(periods=365)
        # Subtract the Unix epoch from the frame's values, cast to int64 and
        # divide by 1e9 to obtain the numeric input given to the ONNX model.
        future_np = (future.values - np.datetime64("1970-01-01T00:00:00.000000000")).astype(np.int64) / 1000000000

        # Convert with Hummingbird.
        hb_model = hummingbird.ml.convert(m, "onnx", future_np)

        # Predictions.
        prophet_trend = m.predict(future)["trend"]
        hb_trend = hb_model.predict(future_np)

        # NOTE(review): this writes "prophet.onnx" into the current working
        # directory on every run; it looks like a debugging leftover - confirm
        # whether the file is needed before removing.
        import onnx

        onnx.save(hb_model.model, "prophet.onnx")

        np.testing.assert_allclose(prophet_trend, hb_trend, rtol=1e-06, atol=1e-06)
class TestSklearnPipeline(unittest.TestCase):
    """Check that Hummingbird conversions of scikit-learn pipelines match the originals."""

    # ------------------------------------------------------------------
    # Shared helpers (names start with "_" so they are not collected as tests).
    # ------------------------------------------------------------------
    def _assert_same_transform(self, model, data):
        """Convert *model* to torch and compare ``transform`` outputs on *data*."""
        torch_model = hummingbird.ml.convert(model, "torch")
        self.assertIsNotNone(torch_model)
        np.testing.assert_allclose(
            model.transform(data),
            torch_model.transform(data),
            rtol=1e-06,
            atol=1e-06,
        )

    def _check_double_scaler(self, data):
        """Fit one StandardScaler on *data* and test a pipeline that applies it twice."""
        scaler = StandardScaler()
        scaler.fit(data)
        model = Pipeline([("scaler1", scaler), ("scaler2", scaler)])
        self._assert_same_transform(model, data)

    def _iris_frame(self):
        """Return ``(X_train, y_train)`` built from the first three iris features.

        ``X_train`` has columns vA, vB, vC plus two derived integer columns
        (vcat from vA, vcat2 from vB); ``y_train`` is the iris target modulo 2.
        """
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        return X_train, y % 2

    def _check_column_transformer(self, numeric_features, categorical_features=None, **transformer_options):
        """Fit ColumnTransformer + LogisticRegression and compare ``predict_proba``.

        Args:
            numeric_features: column selector routed through a StandardScaler.
            categorical_features: column selector routed through a
                OneHotEncoder; when None no "cat" transformer is added.
            **transformer_options: extra keyword arguments forwarded to
                ColumnTransformer (e.g. ``transformer_weights``, ``remainder``).
        """
        X_train, y_train = self._iris_frame()

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
        transformers = [("num", numeric_transformer, numeric_features)]
        if categorical_features is not None:
            categorical_transformer = Pipeline(
                steps=[("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))]
            )
            transformers.append(("cat", categorical_transformer, categorical_features))

        preprocessor = ColumnTransformer(transformers=transformers, **transformer_options)
        model = Pipeline(steps=[("precprocessor", preprocessor), ("classifier", classifier)])
        model.fit(X_train, y_train)

        X_test = X_train[:11]
        torch_model = hummingbird.ml.convert(model, "torch")
        self.assertIsNotNone(torch_model)

        # The sklearn model receives the DataFrame, the converted one its values.
        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    def _fit_scaled_forest(self, n_features):
        """Fit scaler + RandomForestRegressor on random data with *n_features* columns.

        Returns ``(pipeline, X, X_test)`` where ``X_test`` is ``X`` split into
        a tuple of single-column arrays (one model input per feature).
        """
        # Seed so the random data is reproducible, as the other tests in this file do.
        np.random.seed(0)
        X = np.random.rand(100, n_features)
        y = np.random.randint(1000, size=100)

        scaler_transformer = Pipeline(steps=[("scaler", StandardScaler())])
        preprocessor = ColumnTransformer(
            transformers=[("scaling", scaler_transformer, list(range(n_features)))]
        )
        model = RandomForestRegressor(n_estimators=10, max_depth=9)
        pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
        pipeline.fit(X, y)

        X_test = tuple(np.split(X, n_features, axis=1))
        return pipeline, X, X_test

    # ------------------------------------------------------------------
    # Transformer-only pipelines
    # ------------------------------------------------------------------
    def test_pipeline(self):
        """Two chained scalers on float32 data written with integer literals."""
        data = np.array([[0, 0], [0, 0], [1, 1], [1, 1]], dtype=np.float32)
        self._check_double_scaler(data)

    def test_pipeline2(self):
        """Two chained scalers on float32 data written with float literals."""
        data = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32)
        self._check_double_scaler(data)

    def test_combine_inputs_union_in_pipeline(self):
        """A FeatureUnion nested inside a Pipeline."""
        from sklearn.preprocessing import StandardScaler
        from sklearn.pipeline import Pipeline

        data = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32)
        model = Pipeline([
            ("scaler1", StandardScaler()),
            ("union", FeatureUnion([("scaler2", StandardScaler()), ("scaler3", MinMaxScaler())])),
        ])
        model.fit(data)
        self._assert_same_transform(model, data)

    def test_combine_inputs_floats_ints(self):
        """Two chained scalers on a plain list mixing int and float columns."""
        data = [[0, 0.0], [0, 0.0], [1, 1.0], [1, 1.0]]
        self._check_double_scaler(data)

    # ------------------------------------------------------------------
    # ColumnTransformer pipelines
    # ------------------------------------------------------------------
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_1(self):
        """Numeric transformer only, on columns vA, vB, vC."""
        self._check_column_transformer([0, 1, 2])

    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer(self):
        """Numeric (vA, vB, vC) and categorical (vcat, vcat2) transformers."""
        self._check_column_transformer([0, 1, 2], [3, 4])

    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_weights(self):
        """Same as above, with transformer weights."""
        self._check_column_transformer([0, 1, 2], [3, 4], transformer_weights={"num": 2, "cat": 3})

    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_drop(self):
        """Column vC is not selected; remainder="drop" with transformer weights."""
        self._check_column_transformer(
            [0, 1],  # ["vA", "vB"]
            [3, 4],  # ["vcat", "vcat2"]
            transformer_weights={"num": 2, "cat": 3},
            remainder="drop",
        )

    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_drop_noweights(self):
        """Column vC is not selected; remainder="drop" without weights."""
        self._check_column_transformer([0, 1], [3, 4], remainder="drop")

    @unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_passthrough(self):
        """Column vC is not selected; remainder="passthrough" with transformer weights."""
        self._check_column_transformer(
            [0, 1],  # ["vA", "vB"]
            [3, 4],  # ["vcat", "vcat2"]
            transformer_weights={"num": 2, "cat": 3},
            remainder="passthrough",
        )

    @unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_passthrough_noweights(self):
        """Column vC is not selected; remainder="passthrough" without weights."""
        self._check_column_transformer([0, 1], [3, 4], remainder="passthrough")

    @unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_passthrough_slice(self):
        """Columns given as slices; remainder="passthrough" with transformer weights."""
        # Python slices exclude the stop index, so each slice below selects
        # exactly one column; the other three columns fall into the remainder.
        self._check_column_transformer(
            slice(0, 1),  # ["vA"]
            slice(3, 4),  # ["vcat"]
            transformer_weights={"num": 2, "cat": 3},
            remainder="passthrough",
        )

    # ------------------------------------------------------------------
    # ONNX backend with one graph input per feature
    # ------------------------------------------------------------------
    @unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not onnx_runtime_installed(), reason="Test requires ORT installed")
    def test_pipeline_many_inputs(self):
        """18 single-column inputs: one graph input each, predictions match sklearn."""
        n_features = 18
        pipeline, X, X_test = self._fit_scaled_forest(n_features)

        hb_model = hummingbird.ml.convert(pipeline, "onnx", X_test)

        self.assertEqual(len(hb_model.model.graph.input), n_features)
        np.testing.assert_allclose(
            pipeline.predict(X),
            np.array(hb_model.predict(X_test)).flatten(),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not onnx_runtime_installed(), reason="Test requires ORT installed")
    def test_pipeline_many_inputs_with_schema(self):
        """INPUT_NAMES / OUTPUT_NAMES in extra_config name the ONNX graph inputs/outputs."""
        n_features = 5
        input_column_names = ["A", "B", "C", "D", "E"]
        output_column_names = ["score"]
        pipeline, _, X_test = self._fit_scaled_forest(n_features)

        extra_config = {
            constants.INPUT_NAMES: input_column_names,
            constants.OUTPUT_NAMES: output_column_names,
        }
        hb_model = hummingbird.ml.convert(pipeline, "onnx", X_test, extra_config=extra_config)

        graph_inputs = [graph_input.name for graph_input in hb_model.model.graph.input]
        graph_outputs = [graph_output.name for graph_output in hb_model.model.graph.output]

        self.assertEqual(len(hb_model.model.graph.input), n_features)
        self.assertEqual(graph_inputs, input_column_names)
        self.assertEqual(graph_outputs, output_column_names)
class TestSklearnLinearClassifiers(unittest.TestCase):
    """Compare scikit-learn linear models with their Hummingbird torch conversions."""

    # LogisticRegression test function to be parameterized
    def _test_logistic_regression(self, num_classes, solver="liblinear", multi_class="auto", labels_shift=0):
        """Fit LogisticRegression on random data and compare ``predict_proba``.

        Args:
            num_classes: number of distinct labels to draw.
            solver: solver passed to LogisticRegression.
            multi_class: passed to LogisticRegression only when num_classes > 2.
            labels_shift: constant added to every label.
        """
        if num_classes > 2:
            model = LogisticRegression(solver=solver, multi_class=multi_class, fit_intercept=True)
        else:
            # Forward the requested solver for binary problems too, so that
            # callers such as test_logistic_regression_bi_lbfgs really run it.
            # multi_class is left at the estimator default in this branch.
            model = LogisticRegression(solver=solver, fit_intercept=True)

        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100) + labels_shift

        model.fit(X, y)
        torch_model = hummingbird.ml.convert(model, "torch")
        self.assertTrue(torch_model is not None)
        np.testing.assert_allclose(model.predict_proba(X), torch_model.predict_proba(X), rtol=1e-6, atol=1e-6)

    # LogisticRegression binary
    def test_logistic_regression_bi(self):
        self._test_logistic_regression(2)

    # LogisticRegression multiclass with auto
    def test_logistic_regression_multi_auto(self):
        self._test_logistic_regression(3)

    # LogisticRegression with class labels shifted
    def test_logistic_regression_shifted_classes(self):
        self._test_logistic_regression(3, labels_shift=2)

    # LogisticRegression with multi+ovr
    def test_logistic_regression_multi_ovr(self):
        self._test_logistic_regression(3, multi_class="ovr")

    # LogisticRegression with multi+multinomial+sag
    def test_logistic_regression_multi_multin_sag(self):
        warnings.filterwarnings("ignore")
        # this will not converge due to small test size
        self._test_logistic_regression(3, multi_class="multinomial", solver="sag")

    # LogisticRegression binary lbfgs
    def test_logistic_regression_bi_lbfgs(self):
        warnings.filterwarnings("ignore")
        # this will not converge due to small test size
        self._test_logistic_regression(2, solver="lbfgs")

    # LogisticRegression with multi+lbfgs
    def test_logistic_regression_multi_lbfgs(self):
        warnings.filterwarnings("ignore")
        # this will not converge due to small test size
        self._test_logistic_regression(3, solver="lbfgs")

    # LogisticRegression with multi+multinomial+lbfgs
    def test_logistic_regression_multi_multin_lbfgs(self):
        warnings.filterwarnings("ignore")
        # this will not converge due to small test size
        self._test_logistic_regression(3, multi_class="multinomial", solver="lbfgs")

    # LogisticRegression with multi+ovr+lbfgs
    def test_logistic_regression_multi_ovr_lbfgs(self):
        warnings.filterwarnings("ignore")
        # this will not converge due to small test size
        self._test_logistic_regression(3, multi_class="ovr", solver="lbfgs")

    # LinearRegression test function to be parameterized
    def _test_linear_regression(self, y_input):
        """Fit LinearRegression on random float32 features and *y_input*; compare ``predict``."""
        model = LinearRegression()

        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = y_input

        model.fit(X, y)
        torch_model = hummingbird.ml.convert(model, "torch")
        self.assertTrue(torch_model is not None)
        np.testing.assert_allclose(model.predict(X), torch_model.predict(X), rtol=1e-6, atol=1e-6)

    # LinearRegression with ints
    def test_linear_regression_int(self):
        np.random.seed(0)
        self._test_linear_regression(np.random.randint(2, size=100))

    # LinearRegression with floats
    def test_linear_regression_float(self):
        np.random.seed(0)
        self._test_linear_regression(np.random.rand(100))

    # RidgeCV test function to be parameterized
    def _test_ridge_cv(self, y_input):
        """Fit RidgeCV on random float32 features and *y_input*; compare ``predict``."""
        model = RidgeCV()

        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = y_input

        model.fit(X, y)
        torch_model = hummingbird.ml.convert(model, "torch")
        self.assertTrue(torch_model is not None)
        np.testing.assert_allclose(model.predict(X), torch_model.predict(X), rtol=1e-6, atol=1e-6)

    # RidgeCV with ints
    def test_ridge_cv_int(self):
        np.random.seed(0)
        self._test_ridge_cv(np.random.randint(2, size=100))

    # RidgeCV with floats
    def test_ridge_cv_float(self):
        np.random.seed(0)
        self._test_ridge_cv(np.random.rand(100))

    # LogisticRegressionCV test function to be parameterized
    def _test_logistic_regression_cv(self, num_classes, solver="liblinear", multi_class="auto", labels_shift=0):
        """Fit LogisticRegressionCV on random data and compare ``predict_proba``.

        Args:
            num_classes: number of distinct labels to draw.
            solver: solver passed to LogisticRegressionCV.
            multi_class: passed to LogisticRegressionCV only when num_classes > 2.
            labels_shift: constant added to every label.
        """
        if num_classes > 2:
            model = LogisticRegressionCV(solver=solver, multi_class=multi_class, fit_intercept=True)
        else:
            # Forward the requested solver for binary problems as well;
            # multi_class is left at the estimator default in this branch.
            model = LogisticRegressionCV(solver=solver, fit_intercept=True)

        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100) + labels_shift

        model.fit(X, y)
        torch_model = hummingbird.ml.convert(model, "torch")
        self.assertTrue(torch_model is not None)
        np.testing.assert_allclose(model.predict_proba(X), torch_model.predict_proba(X), rtol=1e-6, atol=1e-6)

    # LogisticRegressionCV with 2 classes
    def test_logistic_regression_cv_bi(self):
        self._test_logistic_regression_cv(2)

    # LogisticRegressionCV with 3 classes
    def test_logistic_regression_cv_multi(self):
        self._test_logistic_regression_cv(3)

    # LogisticRegressionCV with shifted classes
    def test_logistic_regression_cv_shifted_classes(self):
        self._test_logistic_regression_cv(3, labels_shift=2)

    # LogisticRegressionCV with multi+ovr
    def test_logistic_regression_cv_multi_ovr(self):
        self._test_logistic_regression_cv(3, multi_class="ovr")

    # LogisticRegressionCV with multi+multinomial
    def test_logistic_regression_cv_multi_multin(self):
        warnings.filterwarnings("ignore")
        # this will not converge due to small test size
        self._test_logistic_regression_cv(3, multi_class="multinomial", solver="sag")

    # SGDClassifier test function to be parameterized
    def _test_sgd_classifier(self, num_classes):
        """Fit SGDClassifier(loss="log") on random data and compare ``predict_proba``."""
        model = SGDClassifier(loss="log")

        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100)

        model.fit(X, y)
        torch_model = hummingbird.ml.convert(model, "torch")
        self.assertTrue(torch_model is not None)
        np.testing.assert_allclose(model.predict_proba(X), torch_model.predict_proba(X), rtol=1e-6, atol=1e-6)

    # SGDClassifier with 2 classes
    def test_sgd_classifier_bi(self):
        self._test_sgd_classifier(2)

    # SGDClassifier with 3 classes
    def test_sgd_classifier_multi(self):
        self._test_sgd_classifier(3)

    # SGDClassifier with modified huber loss
    @unittest.skipIf(
        LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Modified Huber loss test requires torch >= 1.6.0"
    )
    def test_modified_huber(self):
        """Binary modified-Huber SGD, evaluated on points not used for fitting."""
        X = np.array([[-0.5, -1], [-1, -1], [-0.1, -0.1], [0.1, -0.2], [0.5, 1], [1, 1], [0.1, 0.1], [-0.1, 0.2]])
        Y = np.array([1, 1, 1, 1, 2, 2, 2, 2])

        model = SGDClassifier(loss="modified_huber", max_iter=1000, tol=1e-3)
        model.fit(X, Y)

        # Use Hummingbird to convert the model to PyTorch
        hb_model = hummingbird.ml.convert(model, "torch")

        inputs = [[-1, -1], [1, 1], [-0.2, 0.1], [0.2, -0.1]]
        np.testing.assert_allclose(model.predict_proba(inputs), hb_model.predict_proba(inputs), rtol=1e-6, atol=1e-6)

    @unittest.skipIf(
        LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Modified Huber loss test requires torch >= 1.6.0"
    )
    def test_modified_huber2(self):
        """Binary modified-Huber SGD, evaluated on the training points."""
        X = np.array([[-0.5, -1], [-1, -1], [-0.1, -0.1], [0.1, -0.2], [0.5, 1], [1, 1], [0.1, 0.1], [-0.1, 0.2]])
        Y = np.array([1, 1, 1, 1, 2, 2, 2, 2])

        model = SGDClassifier(loss="modified_huber", max_iter=1000, tol=1e-3)
        model.fit(X, Y)

        # Use Hummingbird to convert the model to PyTorch
        hb_model = hummingbird.ml.convert(model, "torch")

        np.testing.assert_allclose(model.predict_proba(X), hb_model.predict_proba(X), rtol=1e-6, atol=1e-6)

    # SGDClassifier with modified huber loss multiclass
    @unittest.skipIf(
        LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Modified Huber loss test requires torch >= 1.6.0"
    )
    def test_modified_huber_multi(self):
        """Three-class modified-Huber SGD, evaluated on points not used for fitting."""
        X = np.array([[-0.5, -1], [-1, -1], [-0.1, -0.1], [0.1, -0.2], [0.5, 1], [1, 1], [0.1, 0.1], [-0.1, 0.2]])
        Y = np.array([0, 1, 1, 1, 2, 2, 2, 2])

        model = SGDClassifier(loss="modified_huber", max_iter=1000, tol=1e-3)
        model.fit(X, Y)

        # Use Hummingbird to convert the model to PyTorch
        hb_model = hummingbird.ml.convert(model, "torch")

        inputs = [[-1, -1], [1, 1], [-0.2, 0.1], [0.2, -0.1]]
        np.testing.assert_allclose(model.predict_proba(inputs), hb_model.predict_proba(inputs), rtol=1e-6, atol=1e-6)

    # Failure cases
def test_sklearn_linear_model_raises_wrong_type(self):
    # A classifier fitted on float labels must be rejected by the converter.
    np.random.seed(0)
    X = np.random.rand(100, 200)
    X = np.array(X, dtype=np.float32)
    y = np.random.randint(3, size=100).astype(np.float32)  # y must be int, not float, should error

    # Silence warnings only for the duration of this test: a bare
    # warnings.filterwarnings("ignore") would permanently alter the
    # process-wide filter list and hide warnings in every later test.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        model = SGDClassifier().fit(X, y)
        self.assertRaises(RuntimeError, hummingbird.ml.convert, model, "torch")


# Float 64 data tests
def test_float64_linear_regression(self):
    # X is left as the float64 array produced by np.random.rand.
    model = LinearRegression()

    np.random.seed(0)
    X = np.random.rand(100, 200)
    y = np.random.randint(2, size=100)

    model.fit(X, y)

    torch_model = hummingbird.ml.convert(model, "torch")
    self.assertIsNotNone(torch_model)
    np.testing.assert_allclose(model.predict(X), torch_model.predict(X), rtol=1e-6, atol=1e-6)


def test_float64_sgd_classifier(self):
    # X is left as the float64 array produced by np.random.rand; only the
    # predicted labels are compared here.
    model = SGDClassifier(loss="log")

    np.random.seed(0)
    num_classes = 3
    X = np.random.rand(100, 200)
    y = np.random.randint(num_classes, size=100)

    model.fit(X, y)

    torch_model = hummingbird.ml.convert(model, "torch")
    self.assertIsNotNone(torch_model)
    np.testing.assert_allclose(model.predict(X), torch_model.predict(X), rtol=1e-6, atol=1e-6)


# Multioutput regression tests
def test_multioutput_linear_regression(self):
    # Exercise a single target as well as several targets.
    for n_targets in [1, 2, 7]:
        model = LinearRegression()
        X, y = datasets.make_regression(
            n_samples=100, n_features=10, n_informative=5, n_targets=n_targets, random_state=2021
        )
        model.fit(X, y)

        torch_model = hummingbird.ml.convert(model, "torch")
        self.assertIsNotNone(torch_model)
        np.testing.assert_allclose(model.predict(X), torch_model.predict(X), rtol=1e-5, atol=1e-5)


# Test Pandas input
@unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
def test_logistic_regression_pandas(self):
    # The model is fitted on, and queried with, a pandas DataFrame.
    model = LogisticRegression(solver="liblinear")

    data = datasets.load_iris()
    X, y = data.data[:, :3], data.target
    X = X.astype(np.float32)
    X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
    X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
    X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
    y_train = y % 2  # fold the three iris classes into a binary target

    model.fit(X_train, y_train)

    hb_model = hummingbird.ml.convert(model, "torch")
    self.assertIsNotNone(hb_model)
    np.testing.assert_allclose(model.predict(X_train), hb_model.predict(X_train), rtol=1e-6, atol=1e-6)
    np.testing.assert_allclose(
        model.predict_proba(X_train), hb_model.predict_proba(X_train), rtol=1e-6, atol=1e-6
    )


# Test TorchScript backend.
def test_logistic_regression_ts(self):
    # The "torch.jit" backend is given X as the sample input at conversion.
    model = LogisticRegression(solver="liblinear")

    data = datasets.load_iris()
    X, y = data.data, data.target
    X = X.astype(np.float32)

    model.fit(X, y)

    ts_model = hummingbird.ml.convert(model, "torch.jit", X)
    self.assertIsNotNone(ts_model)
    np.testing.assert_allclose(model.predict(X), ts_model.predict(X), rtol=1e-6, atol=1e-6)
    np.testing.assert_allclose(model.predict_proba(X), ts_model.predict_proba(X), rtol=1e-6, atol=1e-6)


# Test TVM backends.
@unittest.skipIf(not (tvm_installed()), reason="TVM tests require TVM")
def test_sgd_classifier_tvm(self):
    model = SGDClassifier(loss="log")

    np.random.seed(0)
    num_classes = 3
    X = np.random.rand(100, 200)
    X = np.array(X, dtype=np.float32)
    y = np.random.randint(num_classes, size=100)

    model.fit(X, y)

    tvm_model = hummingbird.ml.convert(model, "tvm", X)
    self.assertIsNotNone(tvm_model)
    np.testing.assert_allclose(model.predict(X), tvm_model.predict(X), rtol=1e-6, atol=1e-6)
    np.testing.assert_allclose(model.predict_proba(X), tvm_model.predict_proba(X), rtol=1e-6, atol=1e-6)


@unittest.skipIf(not (tvm_installed()), reason="TVM tests require TVM")
def test_lr_tvm(self):
    model = LinearRegression()

    np.random.seed(0)
    # Exclusive upper bound of the integer regression targets (this is a
    # regressor, so there are no classes).
    target_upper_bound = 1000
    X = np.random.rand(100, 200)
    X = np.array(X, dtype=np.float32)
    y = np.random.randint(target_upper_bound, size=100)

    model.fit(X, y)

    tvm_model = hummingbird.ml.convert(model, "tvm", X, extra_config={constants.TVM_MAX_FUSE_DEPTH: 30})
    self.assertIsNotNone(tvm_model)

    # Note the looser absolute tolerance (1e-3) compared with the other tests.
    np.testing.assert_allclose(model.predict(X), tvm_model.predict(X), rtol=1e-6, atol=1e-3)
class TestSklearnPipeline(unittest.TestCase):
    """Compare Hummingbird conversions of scikit-learn pipelines with scikit-learn itself."""

    # Every numerical comparison in this class uses the same tolerances.
    _RTOL = 1e-06
    _ATOL = 1e-06

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _double_scaler_pipeline(data):
        """Return a pipeline applying one StandardScaler, fitted on ``data``, twice in a row."""
        scaler = StandardScaler()
        scaler.fit(data)
        return Pipeline([("scaler1", scaler), ("scaler2", scaler)])

    def _assert_same_transform(self, model, data):
        """Convert the fitted ``model`` to torch and compare both ``transform(data)`` results."""
        torch_model = hummingbird.ml.convert(model, "torch")
        self.assertIsNotNone(torch_model)
        np.testing.assert_allclose(
            model.transform(data), torch_model.transform(data), rtol=self._RTOL, atol=self._ATOL,
        )

    @staticmethod
    def _iris_frame(dtype=None):
        """
        Build the DataFrame and binary target shared by the ColumnTransformer tests.

        The frame holds the first three iris features ("vA", "vB", "vC") plus two
        integer columns derived from them ("vcat", "vcat2").

        Args:
            dtype: optional numpy dtype the feature matrix is converted to first.

        Returns:
            A ``(X_train, y_train)`` tuple.
        """
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        if dtype is not None:
            X = np.array(X, dtype)
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = iris.target % 2  # fold the three iris classes into a binary target
        return X_train, y_train

    @staticmethod
    def _column_transformer_pipeline(numeric_features, categorical_features=None, **column_transformer_kwargs):
        """
        Build the unfitted ColumnTransformer + LogisticRegression pipeline.

        Args:
            numeric_features: column selector routed to a StandardScaler.
            categorical_features: column selector routed to a OneHotEncoder;
                ``None`` leaves the categorical branch out entirely.
            **column_transformer_kwargs: forwarded unchanged to ``ColumnTransformer``
                (the tests pass ``transformer_weights`` and ``remainder``).

        Returns:
            The unfitted ``Pipeline``.
        """
        transformers = [("num", Pipeline(steps=[("scaler", StandardScaler())]), numeric_features)]
        if categorical_features is not None:
            # The encoder's sparse-output flag is left at its default (sparse):
            # the keyword was renamed from `sparse` to `sparse_output` in newer
            # scikit-learn releases, so spelling it out ties the test to one
            # range of versions.
            onehot = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])
            transformers.append(("cat", onehot, categorical_features))
        preprocessor = ColumnTransformer(transformers=transformers, **column_transformer_kwargs)

        classifier = LogisticRegression(
            C=0.01,
            # False/True compare and hash equal to the 0/1 labels in the target.
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )
        return Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])

    def _check_column_transformer(self, model, backend="torch", pandas_input=False, dtype=None):
        """
        Fit ``model`` on the iris frame, convert it, and compare ``predict_proba``.

        Args:
            model: unfitted pipeline from ``_column_transformer_pipeline``.
            backend: Hummingbird backend name passed to ``convert``.
            pandas_input: when True the DataFrame slice is passed to ``convert`` as
                sample input and used for prediction; when False the model is
                converted without sample input and queried with the raw
                ``.values`` array.
            dtype: optional dtype forwarded to ``_iris_frame``.
        """
        X_train, y_train = self._iris_frame(dtype)
        model.fit(X_train, y_train)

        X_test = X_train[:11]
        if pandas_input:
            converted = hummingbird.ml.convert(model, backend, X_test)
            converted_input = X_test
        else:
            converted = hummingbird.ml.convert(model, backend)
            converted_input = X_test.values

        self.assertIsNotNone(converted)
        np.testing.assert_allclose(
            model.predict_proba(X_test),
            converted.predict_proba(converted_input),
            rtol=self._RTOL,
            atol=self._ATOL,
        )

    @staticmethod
    def _fit_scaled_forest(n_features):
        """
        Fit a scaler + random-forest regression pipeline on random data.

        Returns:
            ``(pipeline, X, X_split)`` where ``X_split`` is ``X`` cut into one
            single-column array per feature.
        """
        X = np.random.rand(100, n_features)
        y = np.random.randint(1000, size=100)

        scaler_transformer = Pipeline(steps=[("scaler", StandardScaler())])
        preprocessor = ColumnTransformer(
            transformers=[("scaling", scaler_transformer, list(range(n_features)))]
        )
        model = RandomForestRegressor(n_estimators=10, max_depth=9)
        pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
        pipeline.fit(X, y)

        return pipeline, X, tuple(np.split(X, n_features, axis=1))

    # ------------------------------------------------------------------
    # Transformer-only pipelines
    # ------------------------------------------------------------------

    def test_pipeline(self):
        data = np.array([[0, 0], [0, 0], [1, 1], [1, 1]], dtype=np.float32)
        self._assert_same_transform(self._double_scaler_pipeline(data), data)

    def test_pipeline2(self):
        data = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32)
        self._assert_same_transform(self._double_scaler_pipeline(data), data)

    def test_combine_inputs_union_in_pipeline(self):
        data = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32)
        model = Pipeline([
            ("scaler1", StandardScaler()),
            ("union", FeatureUnion([("scaler2", StandardScaler()), ("scaler3", MinMaxScaler())])),
        ])
        model.fit(data)
        self._assert_same_transform(model, data)

    def test_combine_inputs_floats_ints(self):
        # Plain nested list mixing an int column with a float column.
        data = [[0, 0.0], [0, 0.0], [1, 1.0], [1, 1.0]]
        self._assert_same_transform(self._double_scaler_pipeline(data), data)

    # ------------------------------------------------------------------
    # ColumnTransformer pipelines
    # ------------------------------------------------------------------

    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_1(self):
        # Numeric branch only: columns 0-2 are "vA", "vB", "vC".
        self._check_column_transformer(self._column_transformer_pipeline([0, 1, 2]))

    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_string(self):
        """
        Titanic pipeline: imputed and scaled numeric columns plus a one-hot encoded ``pclass``.

        ``pclass`` is cast to int64 before fitting, so the converted model is
        checked against scikit-learn's predictions.

        NOTE(review): the previous docstring carried a TODO saying strings were
        not yet supported in this context and that the test should raise. The
        code does not expect an error; confirm which behavior is intended.
        """
        titanic_url = "https://raw.githubusercontent.com/amueller/scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv"
        try:
            data = pandas.read_csv(titanic_url)
        except OSError as err:
            # urllib's URLError/HTTPError derive from OSError: no network access
            # says nothing about the converter, so skip rather than error out.
            self.skipTest("Could not download the Titanic dataset: {}".format(err))

        X = data.drop("survived", axis=1)
        y = data["survived"]

        # SimpleImputer on strings is not available in the ONNX-ML
        # specifications, so missing values are filled beforehand.
        # Assign the result back instead of calling fillna(inplace=True) on
        # the column selection: the chained in-place form is not guaranteed to
        # update X (it does not under pandas Copy-on-Write).
        X["pclass"] = X["pclass"].fillna("missing")

        # Fixed seed so that any failure is reproducible.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

        numeric_features = ["age", "fare"]
        numeric_transformer = Pipeline(
            steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
        )

        categorical_features = ["pclass"]
        categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ])

        clf = Pipeline(
            steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(solver="liblinear"))]
        )

        # Columns that are not used by the pipeline.
        to_drop = [
            "parch", "sibsp", "cabin", "ticket", "name", "body", "home.dest", "boat", "sex", "embarked",
        ]

        X_train = X_train.copy()
        X_test = X_test.copy()
        X_train["pclass"] = X_train["pclass"].astype(np.int64)
        X_test["pclass"] = X_test["pclass"].astype(np.int64)
        X_train = X_train.drop(columns=to_drop)
        X_test = X_test.drop(columns=to_drop)

        clf.fit(X_train, y_train)

        torch_model = hummingbird.ml.convert(clf, "torch", X_test)
        self.assertIsNotNone(torch_model)

        np.testing.assert_allclose(
            clf.predict(X_test), torch_model.predict(X_test), rtol=self._RTOL, atol=self._ATOL,
        )

    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer(self):
        # Columns 0-2 are "vA", "vB", "vC"; columns 3-4 are "vcat", "vcat2".
        self._check_column_transformer(self._column_transformer_pipeline([0, 1, 2], [3, 4]))

    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_pandas(self):
        self._check_column_transformer(
            self._column_transformer_pipeline([0, 1, 2], [3, 4]), pandas_input=True,
        )

    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_pandas_ts(self):
        # If we don't use float32 here, with python 3.5 and torch 1.5.1 will fail.
        self._check_column_transformer(
            self._column_transformer_pipeline([0, 1, 2], [3, 4]),
            backend="torch.jit",
            pandas_input=True,
            dtype=np.float32,
        )

    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_weights(self):
        self._check_column_transformer(
            self._column_transformer_pipeline(
                [0, 1, 2], [3, 4], transformer_weights={"num": 2, "cat": 3},
            )
        )

    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_weights_pandas(self):
        self._check_column_transformer(
            self._column_transformer_pipeline(
                [0, 1, 2], [3, 4], transformer_weights={"num": 2, "cat": 3},
            ),
            pandas_input=True,
        )

    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_drop(self):
        # Column 2 ("vC") is selected by neither branch and is dropped.
        self._check_column_transformer(
            self._column_transformer_pipeline(
                [0, 1], [3, 4], transformer_weights={"num": 2, "cat": 3}, remainder="drop",
            )
        )

    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_drop_noweights(self):
        self._check_column_transformer(
            self._column_transformer_pipeline([0, 1], [3, 4], remainder="drop")
        )

    @unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_passthrough(self):
        # Column 2 ("vC") is selected by neither branch and is passed through.
        self._check_column_transformer(
            self._column_transformer_pipeline(
                [0, 1], [3, 4], transformer_weights={"num": 2, "cat": 3}, remainder="passthrough",
            )
        )

    @unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_passthrough_noweights(self):
        self._check_column_transformer(
            self._column_transformer_pipeline([0, 1], [3, 4], remainder="passthrough")
        )

    @unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pipeline_column_transformer_passthrough_slice(self):
        # Slice stops are exclusive, so each slice selects a single column and
        # the remaining three columns go through the passthrough remainder.
        numeric_features = slice(0, 1)  # ["vA"]
        categorical_features = slice(3, 4)  # ["vcat"]
        self._check_column_transformer(
            self._column_transformer_pipeline(
                numeric_features,
                categorical_features,
                transformer_weights={"num": 2, "cat": 3},
                remainder="passthrough",
            )
        )

    # ------------------------------------------------------------------
    # Pipelines ending in a regressor
    # ------------------------------------------------------------------

    # Taken from https://github.com/microsoft/hummingbird/issues/388
    def test_pipeline_pca_rf(self):
        X, y = make_regression(
            n_samples=1000, n_features=8, n_informative=5, n_targets=1, random_state=0, shuffle=True,
        )
        pca = PCA(n_components=8, svd_solver="randomized", whiten=True)
        clf = make_pipeline(
            StandardScaler(), pca, RandomForestRegressor(n_estimators=10, max_depth=30, random_state=0),
        )
        clf.fit(X, y)

        model = hummingbird.ml.convert(clf, "pytorch")

        # A single all-zero sample is enough to reproduce the reported issue.
        zero_sample = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
        prediction_sk = clf.predict(zero_sample)
        prediction_hb = model.predict(zero_sample)

        np.testing.assert_allclose(prediction_sk, prediction_hb, rtol=self._RTOL, atol=self._ATOL)

    @unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not onnx_runtime_installed(), reason="Test requires ORT installed")
    def test_pipeline_many_inputs(self):
        n_features = 18
        pipeline, X, X_test = self._fit_scaled_forest(n_features)

        hb_model = hummingbird.ml.convert(pipeline, "onnx", X_test)

        # One graph input per column; assertEqual (unlike a bare assert) is not
        # stripped under ``python -O`` and reports both values on failure.
        self.assertEqual(len(hb_model.model.graph.input), n_features)

        np.testing.assert_allclose(
            pipeline.predict(X),
            np.array(hb_model.predict(X_test)).flatten(),
            rtol=self._RTOL,
            atol=self._ATOL,
        )

    @unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not onnx_runtime_installed(), reason="Test requires ORT installed")
    def test_pipeline_many_inputs_with_schema(self):
        n_features = 5
        input_column_names = ["A", "B", "C", "D", "E"]
        output_column_names = ["score"]

        pipeline, _, X_test = self._fit_scaled_forest(n_features)

        extra_config = {
            constants.INPUT_NAMES: input_column_names,
            constants.OUTPUT_NAMES: output_column_names,
        }
        hb_model = hummingbird.ml.convert(pipeline, "onnx", X_test, extra_config=extra_config)

        graph_inputs = [graph_input.name for graph_input in hb_model.model.graph.input]
        graph_outputs = [graph_output.name for graph_output in hb_model.model.graph.output]

        self.assertEqual(len(hb_model.model.graph.input), n_features)
        self.assertEqual(graph_inputs, input_column_names)
        self.assertEqual(graph_outputs, output_column_names)
class TestSklearnPipeline(unittest.TestCase): def test_pipeline(self): data = np.array([[0, 0], [0, 0], [1, 1], [1, 1]], dtype=np.float32) scaler = StandardScaler() scaler.fit(data) model = Pipeline([("scaler1", scaler), ("scaler2", scaler)]) torch_model = hummingbird.ml.convert(model, "torch") self.assertTrue(torch_model is not None) np.testing.assert_allclose( model.transform(data), torch_model.transform(data), rtol=1e-06, atol=1e-06, ) def test_pipeline2(self): data = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32) scaler = StandardScaler() scaler.fit(data) model = Pipeline([("scaler1", scaler), ("scaler2", scaler)]) torch_model = hummingbird.ml.convert(model, "torch") self.assertTrue(torch_model is not None) np.testing.assert_allclose( model.transform(data), torch_model.transform(data), rtol=1e-06, atol=1e-06, ) def test_combine_inputs_union_in_pipeline(self): from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline data = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32) model = Pipeline([ ("scaler1", StandardScaler()), ("union", FeatureUnion([("scaler2", StandardScaler()), ("scaler3", MinMaxScaler())])), ]) model.fit(data) torch_model = hummingbird.ml.convert(model, "torch") self.assertTrue(torch_model is not None) np.testing.assert_allclose( model.transform(data), torch_model.transform(data), rtol=1e-06, atol=1e-06, ) def test_combine_inputs_floats_ints(self): data = [[0, 0.0], [0, 0.0], [1, 1.0], [1, 1.0]] scaler = StandardScaler() scaler.fit(data) model = Pipeline([("scaler1", scaler), ("scaler2", scaler)]) torch_model = hummingbird.ml.convert(model, "torch") self.assertTrue(torch_model is not None) np.testing.assert_allclose( model.transform(data), torch_model.transform(data), rtol=1e-06, atol=1e-06, ) @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed") def test_pipeline_column_transformer_1(self): iris = datasets.load_iris() X = 
iris.data[:, :3] y = iris.target X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"]) X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2) X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4) y_train = y % 2 numeric_features = [0, 1, 2] # ["vA", "vB", "vC"] classifier = LogisticRegression( C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3, ) numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())]) preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numeric_features)]) model = Pipeline(steps=[("precprocessor", preprocessor), ("classifier", classifier)]) model.fit(X_train, y_train) X_test = X_train[:11] torch_model = hummingbird.ml.convert(model, "torch") self.assertTrue(torch_model is not None) np.testing.assert_allclose( model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, atol=1e-06, ) @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed") def test_pipeline_column_transformer(self): iris = datasets.load_iris() X = iris.data[:, :3] y = iris.target X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"]) X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2) X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4) y_train = y % 2 numeric_features = [0, 1, 2] # ["vA", "vB", "vC"] categorical_features = [3, 4] # ["vcat", "vcat2"] classifier = LogisticRegression( C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3, ) numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())]) categorical_transformer = Pipeline( steps=[("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))]) preprocessor = ColumnTransformer(transformers=[ ("num", numeric_transformer, numeric_features), ("cat", categorical_transformer, categorical_features), ]) model = Pipeline(steps=[("precprocessor", 
preprocessor), ("classifier", classifier)]) model.fit(X_train, y_train) X_test = X_train[:11] torch_model = hummingbird.ml.convert(model, "torch") self.assertTrue(torch_model is not None) np.testing.assert_allclose( model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, atol=1e-06, ) @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed") def test_pipeline_column_transformer_weights(self): iris = datasets.load_iris() X = iris.data[:, :3] y = iris.target X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"]) X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2) X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4) y_train = y % 2 numeric_features = [0, 1, 2] # ["vA", "vB", "vC"] categorical_features = [3, 4] # ["vcat", "vcat2"] classifier = LogisticRegression( C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3, ) numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())]) categorical_transformer = Pipeline( steps=[("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))]) preprocessor = ColumnTransformer( transformers=[ ("num", numeric_transformer, numeric_features), ("cat", categorical_transformer, categorical_features), ], transformer_weights={ "num": 2, "cat": 3 }, ) model = Pipeline(steps=[("precprocessor", preprocessor), ("classifier", classifier)]) model.fit(X_train, y_train) X_test = X_train[:11] torch_model = hummingbird.ml.convert(model, "torch") self.assertTrue(torch_model is not None) np.testing.assert_allclose( model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, atol=1e-06, ) @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed") def test_pipeline_column_transformer_drop(self): iris = datasets.load_iris() X = iris.data[:, :3] y = iris.target X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"]) X_train["vcat"] = 
X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2) X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4) y_train = y % 2 numeric_features = [0, 1] # ["vA", "vB"] categorical_features = [3, 4] # ["vcat", "vcat2"] classifier = LogisticRegression( C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3, ) numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())]) categorical_transformer = Pipeline( steps=[("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))]) preprocessor = ColumnTransformer( transformers=[ ("num", numeric_transformer, numeric_features), ("cat", categorical_transformer, categorical_features), ], transformer_weights={ "num": 2, "cat": 3 }, remainder="drop", ) model = Pipeline(steps=[("precprocessor", preprocessor), ("classifier", classifier)]) model.fit(X_train, y_train) X_test = X_train[:11] torch_model = hummingbird.ml.convert(model, "torch") self.assertTrue(torch_model is not None) np.testing.assert_allclose( model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, atol=1e-06, ) @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed") def test_pipeline_column_transformer_drop_noweights(self): iris = datasets.load_iris() X = iris.data[:, :3] y = iris.target X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"]) X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2) X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4) y_train = y % 2 numeric_features = [0, 1] # ["vA", "vB"] categorical_features = [3, 4] # ["vcat", "vcat2"] classifier = LogisticRegression( C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3, ) numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())]) categorical_transformer = Pipeline( steps=[("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))]) preprocessor = ColumnTransformer( 
transformers=[ ("num", numeric_transformer, numeric_features), ("cat", categorical_transformer, categorical_features), ], remainder="drop", ) model = Pipeline(steps=[("precprocessor", preprocessor), ("classifier", classifier)]) model.fit(X_train, y_train) X_test = X_train[:11] torch_model = hummingbird.ml.convert(model, "torch") self.assertTrue(torch_model is not None) np.testing.assert_allclose( model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, atol=1e-06, ) @unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19") @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed") def test_pipeline_column_transformer_passthrough(self): iris = datasets.load_iris() X = iris.data[:, :3] y = iris.target X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"]) X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2) X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4) y_train = y % 2 numeric_features = [0, 1] # ["vA", "vB"] categorical_features = [3, 4] # ["vcat", "vcat2"] classifier = LogisticRegression( C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3, ) numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())]) categorical_transformer = Pipeline( steps=[("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))]) preprocessor = ColumnTransformer( transformers=[ ("num", numeric_transformer, numeric_features), ("cat", categorical_transformer, categorical_features), ], transformer_weights={ "num": 2, "cat": 3 }, remainder="passthrough", ) model = Pipeline(steps=[("precprocessor", preprocessor), ("classifier", classifier)]) model.fit(X_train, y_train) X_test = X_train[:11] torch_model = hummingbird.ml.convert(model, "torch") self.assertTrue(torch_model is not None) np.testing.assert_allclose( model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, 
atol=1e-06, ) @unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19") @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed") def test_pipeline_column_transformer_passthrough_noweights(self): iris = datasets.load_iris() X = iris.data[:, :3] y = iris.target X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"]) X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2) X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4) y_train = y % 2 numeric_features = [0, 1] # ["vA", "vB"] categorical_features = [3, 4] # ["vcat", "vcat2"] classifier = LogisticRegression( C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3, ) numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())]) categorical_transformer = Pipeline( steps=[("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))]) preprocessor = ColumnTransformer( transformers=[ ("num", numeric_transformer, numeric_features), ("cat", categorical_transformer, categorical_features), ], remainder="passthrough", ) model = Pipeline(steps=[("precprocessor", preprocessor), ("classifier", classifier)]) model.fit(X_train, y_train) X_test = X_train[:11] torch_model = hummingbird.ml.convert(model, "torch") self.assertTrue(torch_model is not None) np.testing.assert_allclose( model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, atol=1e-06, ) @unittest.skipIf(ColumnTransformer is None, reason="ColumnTransformer not available in 0.19") @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed") def test_pipeline_column_transformer_passthrough_slice(self): iris = datasets.load_iris() X = iris.data[:, :3] y = iris.target X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"]) X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2) X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4) y_train = y 
% 2 numeric_features = slice(0, 1) # ["vA", "vB"] categorical_features = slice(3, 4) # ["vcat", "vcat2"] classifier = LogisticRegression( C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="liblinear", tol=1e-3, ) numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())]) categorical_transformer = Pipeline( steps=[("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"))]) preprocessor = ColumnTransformer( transformers=[ ("num", numeric_transformer, numeric_features), ("cat", categorical_transformer, categorical_features), ], transformer_weights={ "num": 2, "cat": 3 }, remainder="passthrough", ) model = Pipeline(steps=[("precprocessor", preprocessor), ("classifier", classifier)]) model.fit(X_train, y_train) X_test = X_train[:11] torch_model = hummingbird.ml.convert(model, "torch") self.assertTrue(torch_model is not None) np.testing.assert_allclose( model.predict_proba(X_test), torch_model.predict_proba(X_test.values), rtol=1e-06, atol=1e-06, )