import os
import sys

sys.path.insert(0, sys.argv[1])
os.environ['PYSPARK_PYTHON'] = sys.executable

import unittest

from pysparkling.context import H2OContext
from pysparkling.conf import H2OConf
from pyspark.sql import SparkSession

import unit_test_utils
import generic_test_utils


class H2OConfTest(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        cls._cloud_name = generic_test_utils.unique_cloud_name("h2o_conf_test")
        cls._conf = unit_test_utils.get_default_spark_conf(cls._spark_options_from_params). \
            set("spark.ext.h2o.cloud.name", cls._cloud_name)
        cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()
        cls._hc = H2OContext.getOrCreate(cls._spark, H2OConf(cls._spark).set_num_of_external_h2o_nodes(1))

    # Test passing h2o_conf to H2OContext
    def test_h2o_conf(self):
        self.assertEquals(self._hc.get_conf().cloud_name(), self._cloud_name,
                          "Configuration property cloud_name should match")


if __name__ == '__main__':
    generic_test_utils.run_tests([H2OConfTest], file_name="py_unit_tests_conf_report")
        df = self._spark.createDataFrame(data)
        hf = hc.as_h2o_frame(df)
        # Modify the H2O frame - this should invalidate the internal cache
        hf['c3'] = 3
        # Now try to convert the modified H2O frame back to a Spark data frame
        dfe = hc.as_spark_frame(hf)
        self.assertEquals(dfe.count(), len(data), "Number of rows should match")
        self.assertEquals(len(dfe.columns), 3, "Number of columns should match")
        self.assertEquals(dfe.collect(), [Row(c1=1, c2='first', c3=3), Row(c1=2, c2='second', c3=3)])

    def test_sparse_data_conversion(self):
        data = [(float(x), SparseVector(50000, {x: float(x)})) for x in range(1, 90)]
        df = self._spark.sparkContext.parallelize(data).toDF()
        t0 = time.time()
        self._hc.as_h2o_frame(df)
        t1 = time.time()
        total = t1 - t0
        assert total < 10  # The conversion should not take longer than 10 seconds


if __name__ == '__main__':
    generic_test_utils.run_tests([FrameTransformationsTest], file_name="py_unit_tests_conversions_report")
import os
import sys

sys.path.insert(0, sys.argv[1])
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import unittest

from integ_test_utils import *
from generic_test_utils import run_tests


class YarnIntegTestSuite(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        conf = get_default_spark_conf(cls._spark_options_from_params)
        conf["spark.master"] = "local[*]"
        conf["spark.submit.pyFiles"] = sys.argv[1]
        # Configure YARN environment
        conf["spark.yarn.max.executor.failures"] = "1"  # If an executor fails, fail the test
        conf["spark.executor.instances"] = "1"
        cls._conf = conf

    def test_xgboost_medium(self):
        return_code = launch(self._conf, "examples/scripts/tests/xgboost_test_medium.py")
        self.assertTrue(return_code == 0,
                        "Process ended in a wrong way. It ended with return code " + str(return_code))

    def test_chicago_crime(self):
        return_code = launch(self._conf, "examples/scripts/ChicagoCrimeDemo.py")
        self.assertTrue(return_code == 0,
                        "Process ended in a wrong way. It ended with return code " + str(return_code))


if __name__ == '__main__':
    run_tests([YarnIntegTestSuite], file_name="py_integ_yarn_tests_report")
import unittest

from pyspark.sql import SparkSession
# Imports restored for completeness; H2OMOJOModel is assumed here to come from pysparkling.ml.
from pysparkling.ml import H2OMOJOModel

import unit_test_utils
import generic_test_utils


class H2OMojoPredictionsTest(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        cls._cloud_name = generic_test_utils.unique_cloud_name("h2o_mojo_predictions_test")
        cls._spark = SparkSession.builder.config(conf=unit_test_utils.get_default_spark_conf()).getOrCreate()

    # Test predictions on an H2O MOJO
    def test_h2o_mojo_predictions(self):
        # Try loading the MOJO and predicting with it without starting an H2O Context
        mojo = H2OMOJOModel.create_from_mojo("../ml/src/test/resources/binom_model_prostate.mojo")
        prostate_frame = self._spark.read.csv("file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
                                              header=True)
        mojo.predict(prostate_frame).repartition(1).collect()

    def test_h2o_mojo_predictions_unseen_categoricals(self):
        mojo = H2OMOJOModel.create_from_mojo("../ml/src/test/resources/deep_learning_airlines_categoricals.zip")
        mojo.setConvertUnknownCategoricalLevelsToNa(True)
        d = [{'sepal_len': 5.1, 'sepal_wid': 3.5, 'petal_len': 1.4, 'petal_wid': 0.2,
              'class': 'Missing_categorical'}]
        df = self._spark.createDataFrame(d)
        data = mojo.transform(df).collect()[0]

        assert data["class"] == "Missing_categorical"
        assert data["petal_len"] == 1.4
        assert data["petal_wid"] == 0.2
        assert data["sepal_len"] == 5.1
        assert data["sepal_wid"] == 3.5
        assert data["prediction_output"][0] == 5.240174068202646


if __name__ == '__main__':
    generic_test_utils.run_tests([H2OMojoPredictionsTest], file_name="py_unit_tests_mojo_predictions_report")
    def test_s3n_import(self):
        fr = h2o.import_file("s3n://data.h2o.ai/h2o-open-tour/2016-nyc/weather.csv")
        assert fr.ncol == 27
        assert fr.nrow == 9768

    def test_s3a_import(self):
        fr = h2o.import_file("s3a://data.h2o.ai/h2o-open-tour/2016-nyc/weather.csv")
        assert fr.ncol == 27
        assert fr.nrow == 9768

    def s3_import_export(self, scheme):
        local_frame = h2o.import_file("/home/0xdiag/smalldata/logreg/prostate.csv")
        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S")
        unique_suffix = str(uuid.uuid4())
        s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                  timestamp + "." + unique_suffix + ".csv.zip"
        h2o.export_file(local_frame, s3_path)

        s3_frame = h2o.import_file(s3_path)
        assert_frame_equal(local_frame.as_data_frame(), s3_frame.as_data_frame())

    def test_s3a_import_export(self):
        self.s3_import_export("s3a")

    @unittest.skip("skip")
    def test_s3n_import_export(self):
        self.s3_import_export("s3n")


if __name__ == '__main__':
    generic_test_utils.run_tests([HadoopSmokeTestSuite], file_name="py_hadoop_smoke_tests_report")
        pipeline.write().overwrite().save("file://" + os.path.abspath("build/test_dai_pipeline_as_spark_pipeline"))
        loaded_pipeline = Pipeline.load("file://" + os.path.abspath("build/test_dai_pipeline_as_spark_pipeline"))

        # Train the pipeline model
        model = loaded_pipeline.fit(prostate_frame)

        model.write().overwrite().save("file://" + os.path.abspath("build/test_dai_pipeline_as_spark_pipeline_model"))
        loaded_model = PipelineModel.load("file://" + os.path.abspath("build/test_dai_pipeline_as_spark_pipeline_model"))

        preds = loaded_model.transform(prostate_frame).repartition(1).select(mojo.select_prediction_udf("AGE")).take(5)

        assert preds[0][0] == 65.36320409515132
        assert preds[1][0] == 64.96902128114817
        assert preds[2][0] == 64.96721023747583
        assert preds[3][0] == 65.78772654671035
        assert preds[4][0] == 66.11327967814829


if __name__ == '__main__':
    generic_test_utils.run_tests([H2OMojoPipelineTest], file_name="py_unit_tests_mojo_pipeline_report")
        env = IntegTestEnv()
        env.set_spark_master("local[*]")
        env.conf("spark.ext.h2o.port.base", 63331)

        return_code = launch(env, "examples/pipelines/ham_or_spam_multi_algo.py", "automl")
        self.assertTrue(return_code == 0,
                        "Process ended in a wrong way. It ended with return code " + str(return_code))

    def test_import_pysparkling_standalone_app(self):
        env = IntegTestEnv()
        env.set_spark_master("local[*]")
        env.conf("spark.ext.h2o.port.base", 63331)

        return_code = launch(env, "examples/scripts/tests/pysparkling_ml_import_overrides_spark_test.py")
        self.assertTrue(return_code == 0,
                        "Process ended in a wrong way. It ended with return code " + str(return_code))


if __name__ == '__main__':
    generic_test_utils.run_tests([LocalIntegTestSuite], file_name="py_integ_local_tests_report")
        env.set_spark_master("local[*]")
        # Configure YARN environment
        env.conf("spark.yarn.max.executor.failures", 1)  # If an executor fails, fail the test
        env.conf("spark.executor.instances", 1)
        env.conf("spark.executor.memory", "2g")
        env.conf("spark.ext.h2o.port.base", 63331)
        env.conf("spark.driver.memory", "2g")

        return_code = launch(env, "examples/scripts/tests/xgboost_test_medium.py")
        self.assertTrue(return_code == 0,
                        "Process ended in a wrong way. It ended with return code " + str(return_code))

    def test_chicago_crime(self):
        env = IntegTestEnv()
        env.set_spark_master("local[*]")
        # Configure YARN environment
        env.conf("spark.yarn.max.executor.failures", 1)  # If an executor fails, fail the test
        env.conf("spark.executor.instances", 1)
        env.conf("spark.executor.memory", "2g")
        env.conf("spark.ext.h2o.port.base", 63331)
        env.conf("spark.driver.memory", "2g")

        return_code = launch(env, "examples/scripts/ChicagoCrimeDemo.py")
        self.assertTrue(return_code == 0,
                        "Process ended in a wrong way. It ended with return code " + str(return_code))


if __name__ == '__main__':
    generic_test_utils.run_tests([YarnIntegTestSuite], file_name="py_integ_yarn_tests_report")
            featuresCols=["sepal_len", "sepal_wid", "petal_len", "petal_wid"],
            withDetailedPredictionCol=True)
        model = algo.fit(self.dataset)
        transformed = model.transform(self.dataset)
        self.assertEquals(transformed.select("detailed_prediction.cluster").head()[0], 0,
                          "Prediction should match")
        self.assertEquals(len(transformed.select("detailed_prediction.distances").head()[0]), 3,
                          "Size of distances array should match")

    def testUserPoints(self):
        algo = H2OKMeans(
            splitRatio=0.8,
            seed=1,
            k=3,
            featuresCols=["sepal_len", "sepal_wid", "petal_len", "petal_wid"],
            userPoints=[[4.9, 3.0, 1.4, 0.2], [5.6, 2.5, 3.9, 1.1], [6.5, 3.0, 5.2, 2.0]])
        model = algo.fit(self.dataset)
        self.assertEquals(model.transform(self.dataset).select("prediction").head()[0], 0,
                          "Prediction should match")


if __name__ == '__main__':
    generic_test_utils.run_tests([H2OKMeansTestSuite], file_name="py_unit_tests_kmeans_report")
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Integration tests for pySparkling for spark running in Standalone mode
"""
import generic_test_utils
from integ_test_utils import *
import unittest


class StandaloneIntegTestSuite(unittest.TestCase):
    pass


if __name__ == '__main__':
    generic_test_utils.run_tests([StandaloneIntegTestSuite], file_name="py_integ_standalone_tests_report")
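# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite): StandaloneIntegTestSuite
# above is an empty placeholder. A standalone-mode test could reuse the same
# helpers the local and YARN suites use (get_default_spark_conf, launch, and an
# exit-code assertion). The class name, the standalone master URL, and the choice
# of launched script below are assumptions made only to show the shape such a
# test might take; they are not from the original code.
# ------------------------------------------------------------------------------
# import sys
# import unittest
#
# from integ_test_utils import *
#
#
# class StandaloneIntegTestSuiteSketch(unittest.TestCase):
#
#     @classmethod
#     def setUpClass(cls):
#         conf = get_default_spark_conf(cls._spark_options_from_params)
#         conf["spark.master"] = "spark://localhost:7077"  # assumed standalone master URL
#         conf["spark.submit.pyFiles"] = sys.argv[1]
#         cls._conf = conf
#
#     def test_chicago_crime(self):
#         # ChicagoCrimeDemo.py is already exercised by the other suites in this repo
#         return_code = launch(self._conf, "examples/scripts/ChicagoCrimeDemo.py")
#         self.assertTrue(return_code == 0,
#                         "Process ended in a wrong way. It ended with return code " + str(return_code))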
    def testProducedMOJOModelAndLoadedMOJOModelReturnsSameResult(self):
        targetEncoder = H2OTargetEncoder(labelCol="CAPSULE", inputCols=["RACE", "DPROS", "DCAPS"])
        pipeline = Pipeline(stages=[targetEncoder])
        producedModel = pipeline.fit(self._trainingDataset)
        path = "file://" + os.path.abspath("build/testProducedMOJOModelAndLoadedMOJOModelReturnsSameResult")
        producedModel.write().overwrite().save(path)
        loadedModel = PipelineModel.load(path)

        transformedByProducedModel = producedModel.transform(self._testingDataset)
        transformedByLoadedModel = loadedModel.transform(self._testingDataset)

        unit_test_utils.assert_data_frames_are_identical(transformedByProducedModel, transformedByLoadedModel)

    def testTargetEncoderModelWithDisabledNoiseAndTargetEncoderMOJOModelTransformTheTrainingDatasetSameWay(self):
        targetEncoder = H2OTargetEncoder() \
            .setInputCols(["RACE", "DPROS", "DCAPS"]) \
            .setLabelCol("CAPSULE") \
            .setHoldoutStrategy("None") \
            .setNoise(0.0)
        targetEncoderModel = targetEncoder.fit(self._trainingDataset)

        transformedByModel = targetEncoderModel.transformTrainingDataset(self._trainingDataset)
        transformedByMOJOModel = targetEncoderModel.transform(self._trainingDataset)

        unit_test_utils.assert_data_frames_are_identical(transformedByModel, transformedByMOJOModel)


if __name__ == '__main__':
    generic_test_utils.run_tests([H2OTargetEncoderTestSuite], file_name="py_unit_tests_target_encoder_report")
    @classmethod
    def setUpClass(cls):
        conf = get_default_spark_conf(cls._spark_options_from_params)
        conf["spark.master"] = "local[*]"
        conf["spark.submit.pyFiles"] = sys.argv[1]
        cls._conf = conf

    def test_pipeline_gbm_mojo(self):
        return_code = launch(self._conf, "examples/pipelines/ham_or_spam_multi_algo.py", param="gbm")
        self.assertTrue(return_code == 0,
                        "Process ended in a wrong way. It ended with return code " + str(return_code))

    def test_pipeline_deep_learning(self):
        return_code = launch(self._conf, "examples/pipelines/ham_or_spam_multi_algo.py", param="dl")
        self.assertTrue(return_code == 0,
                        "Process ended in a wrong way. It ended with return code " + str(return_code))

    def test_pipeline_xgboost(self):
        return_code = launch(self._conf, "examples/pipelines/ham_or_spam_multi_algo.py", param="xgboost")
        self.assertTrue(return_code == 0,
                        "Process ended in a wrong way. It ended with return code " + str(return_code))

    def test_pipeline_automl(self):
        return_code = launch(self._conf, "examples/pipelines/ham_or_spam_multi_algo.py", param="automl")
        self.assertTrue(return_code == 0,
                        "Process ended in a wrong way. It ended with return code " + str(return_code))

    def test_import_pysparkling_standalone_app(self):
        return_code = launch(self._conf, "examples/scripts/tests/pysparkling_ml_import_overrides_spark_test.py")
        self.assertTrue(return_code == 0,
                        "Process ended in a wrong way. It ended with return code " + str(return_code))


if __name__ == '__main__':
    run_tests([LocalIntegTestSuite], file_name="py_integ_local_tests_report")