def test_mojo_dai_pipeline_serialize(self):
    """Round-trip a MOJO-backed Spark pipeline through save/load and check predictions.

    The MOJO is wrapped in a single-stage Spark ``Pipeline``; the pipeline is
    written and reloaded, fitted on the prostate data, and the fitted model is
    written and reloaded as well. The first five "AGE" predictions of the
    reloaded model are then compared against fixed reference values.
    """
    mojo_uri = "file://" + os.path.abspath(
        "../ml/src/test/resources/mojo2data/pipeline.mojo")
    mojo = H2OMOJOPipelineModel.createFromMojo(mojo_uri)

    csv_uri = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostate = self._spark.read.csv(csv_uri, header=True)

    # Single-stage Spark pipeline whose only step is the mojo pipeline.
    pipeline_uri = "file://" + os.path.abspath(
        "build/test_dai_pipeline_as_spark_pipeline")
    Pipeline(stages=[mojo]).write().overwrite().save(pipeline_uri)
    restored_pipeline = Pipeline.load(pipeline_uri)

    # Fit the reloaded pipeline, then persist and reload the fitted model.
    fitted = restored_pipeline.fit(prostate)
    model_uri = "file://" + os.path.abspath(
        "build/test_dai_pipeline_as_spark_pipeline_model")
    fitted.write().overwrite().save(model_uri)
    restored_model = PipelineModel.load(model_uri)

    scored = restored_model.transform(prostate).repartition(1)
    rows = scored.select(mojo.selectPredictionUDF("AGE")).take(5)

    expected_ages = (
        65.36320409515132,
        64.96902128114817,
        64.96721023747583,
        65.78772654671035,
        66.11327967814829,
    )
    # Index explicitly so that a short result still fails with IndexError.
    for idx, age in enumerate(expected_ages):
        assert rows[idx][0] == age
def test_h2o_mojo_pipeline_predictions(self):
    """Check MOJO pipeline predictions with named output columns switched off.

    The mojo is loaded directly (no H2O Context is started in this test) and
    used to score the prostate data. The first five predictions are verified
    twice: via the raw ``prediction.preds`` column and via the prediction UDF
    for "AGE". Finally the first output name is checked to be "AGE".
    """
    mojo_uri = "file://" + os.path.abspath(
        "../ml/src/test/resources/mojo2data/pipeline.mojo")
    mojo = H2OMOJOPipelineModel.create_from_mojo(mojo_uri)
    mojo.set_named_mojo_output_columns(False)

    csv_uri = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostate_frame = self._spark.read.csv(csv_uri, header=True)
    scored = mojo.predict(prostate_frame).repartition(1)

    expected_ages = (
        65.36320409515132,
        64.96902128114817,
        64.96721023747583,
        65.78772654671035,
        66.11327967814829,
    )

    # Plain column selection: each row holds the preds array, first element is AGE.
    raw_rows = scored.select("prediction.preds").take(5)
    for idx, age in enumerate(expected_ages):
        assert raw_rows[idx][0][0] == age

    # Selection through the UDF yields the scalar directly.
    udf_rows = scored.select(mojo.select_prediction_udf("AGE")).take(5)
    for idx, age in enumerate(expected_ages):
        assert udf_rows[idx][0] == age

    assert mojo.get_output_names()[0] == "AGE"
def test_h2o_mojo_pipeline_predictions_with_named_cols(self):
    """Check the first five "AGE" predictions of the MOJO pipeline with default settings.

    The mojo is loaded directly (no H2O Context is started in this test),
    applied to the prostate data with ``transform``, and the values selected
    through the prediction UDF are compared against fixed reference values.
    """
    mojo_uri = "file://" + os.path.abspath(
        "../ml/src/test/resources/mojo2data/pipeline.mojo")
    mojo = H2OMOJOPipelineModel.createFromMojo(mojo_uri)

    csv_uri = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostate = self._spark.read.csv(csv_uri, header=True)

    scored = mojo.transform(prostate).repartition(1)
    rows = scored.select(mojo.selectPredictionUDF("AGE")).take(5)

    expected_ages = (
        65.36320409515132,
        64.96902128114817,
        64.96721023747583,
        65.78772654671035,
        66.11327967814829,
    )
    # Index explicitly so that a short result still fails with IndexError.
    for idx, age in enumerate(expected_ages):
        assert rows[idx][0] == age
def mojoModelScoring(sparkSession, scoreFrame, mojoFile, selectionColumns=None,
                     outColumns=None, clusterResource=None):
    """
    Performs scoring on the dataset provided against the mojo file passed
    to this scoring function

    Syntax:
        status, message, df = mojoModelScoring(sparkSession, scoreFrame, mojoFile)

    Args:
        sparkSession (sparkSession context) : sparkSession context; its
                                              sparkContext receives
                                              clusterResource via addPyFile
        scoreFrame (pyspark.sql.dataframe)  : dataframe to be scored against
        mojoFile (str)                      : path of the mojo file; it is
                                              appended to "file://" before
                                              being loaded
        selectionColumns (list)             : list of column names that should
                                              be passed to the model, when set
                                              to None, all columns are
                                              considered (Default: None)
        outColumns (list)                   : new names for the columns that
                                              the transform adds (columns of
                                              the result that are not in
                                              scoreFrame). Names are paired
                                              positionally; surplus entries on
                                              either side are ignored. When
                                              empty or None nothing is renamed
                                              (Default: None)
        clusterResource (str)               : when set, handed to
                                              sparkContext.addPyFile before
                                              pysparkling is imported
                                              (Default: None)

    Returns:
        status (bool)               : True/False based on execution of the function
        message                     : on success a fixed str; on failure the
                                      exception object that was caught
        df (pyspark.sql.dataframe)  : scored data as a pyspark dataframe, or
                                      None on failure
    """
    if clusterResource:
        sparkSession.sparkContext.addPyFile(clusterResource)
    # Deliberately imported only after addPyFile has run.
    # NOTE(review): presumably clusterResource is what ships pysparkling - confirm.
    from pysparkling.ml import H2OMOJOPipelineModel

    try:
        # read the mojo file from the provided path
        mojo = H2OMOJOPipelineModel.createFromMojo("file://" + mojoFile)

        if selectionColumns:
            inputFrame = scoreFrame.select(*selectionColumns)
        else:
            inputFrame = scoreFrame
        transformFrame = mojo.transform(inputFrame)

        if not outColumns:
            return True, "dataset scored against mojo file", transformFrame

        # Columns introduced by the transform, in the order they appear.
        inputColumns = set(scoreFrame.columns)
        colsToBeRenamed = [
            name for name in transformFrame.columns if name not in inputColumns
        ]

        # Rename through the DataFrame API directly. Building the call chain
        # as a string and eval()-ing it breaks on names containing quotes and
        # would execute arbitrary code embedded in a column name.
        finalFrame = transformFrame
        for oldName, newName in zip(colsToBeRenamed, outColumns):
            finalFrame = finalFrame.withColumnRenamed(oldName, newName)
        return True, "dataset scored against mojo file", finalFrame
    except Exception as e:
        # Best-effort API: failures are reported through the return tuple
        # rather than propagated to the caller.
        print(
            "Error occured while scoring the mojo file {} on the provided dataset:\n{}"
            .format(mojoFile, e))
        return False, e, None
def test_feature_types_on_h2o_mojo_pipeline():
    """Verify the feature types reported by the prostate MOJO pipeline.

    Loads the mojo, reads ``getFeatureTypes()`` and asserts the type recorded
    for each expected feature as well as the total number of features.
    """
    mojo = H2OMOJOPipelineModel.createFromMojo(
        "file://" + os.path.abspath("../ml/src/test/resources/mojo2data/pipeline.mojo"))
    types = mojo.getFeatureTypes()

    expected_types = {
        "DPROS": "Int32",
        "GLEASON": "Int32",
        "VOL": "Float64",
        "DCAPS": "Int32",
        "PSA": "Float64",
        "CAPSULE": "Int32",
        "RACE": "Int32",
        "ID": "Int32",
    }
    # These comparisons must be asserted; as bare expressions they are
    # evaluated and discarded, so the test could never fail.
    for feature, expected in expected_types.items():
        assert types[feature] == expected

    # NOTE(review): only 8 distinct features are checked above (the original
    # listed "VOL" twice) while the expected count is 9 - confirm whether a
    # feature is missing from the list or the count should be 8.
    assert len(types) == 9
def testMojoPipelineProtoBackendWithoutError(spark):
    """Smoke test: the proto-based MOJO pipeline scores a small frame without raising.

    Builds a ten-row dataframe with columns pclass, sex, age, ticket, fare and
    cabin, runs it through the mojo and collects the result. Nothing is
    asserted about the predictions; the test passes when no exception occurs.
    """
    mojo_uri = "file://" + os.path.abspath(
        "../ml/src/test/resources/proto_based_pipeline.mojo")
    mojo = H2OMOJOPipelineModel.createFromMojo(mojo_uri)

    column_names = ['pclass', 'sex', 'age', 'ticket', 'fare', 'cabin']
    rows = [
        (2.0, 'male', 0.41670000553131104, 111361, 6.449999809265137, 'A19'),
        (1.0, 'female', 0.33329999446868896, 110413, 6.4375, 'A14'),
        (1.0, 'female', 0.16670000553131104, 111320, 6.237500190734863, 'A21'),
        (1.0, 'female', 2.0, 111361, 6.237500190734863, 'A20'),
        (3.0, 'female', 1.0, 110152, 6.75, 'A14'),
        (1.0, 'male', 0.666700005531311, 110489, 6.85830020904541, 'A10'),
        (3.0, 'male', 0.33329999446868896, 111320, 0.0, 'A11'),
        (3.0, 'male', 2.0, 110413, 6.85830020904541, 'A24'),
        (1.0, 'female', 1.0, 110489, 3.170799970626831, 'A21'),
        (1.0, 'female', 0.33329999446868896, 111240, 0.0, 'A14'),
    ]

    frame = spark.createDataFrame(spark.sparkContext.parallelize(rows), column_names)
    # collect() forces the scoring to actually run.
    mojo.transform(frame).collect()
def test_h2o_mojo_pipeline_predictions(prostateDataset):
    """Check MOJO pipeline predictions when named output columns are disabled.

    The mojo is loaded with ``H2OMOJOSettings(namedMojoOutputColumns=False)``
    (no H2O Context is started in this test) and applied to ``prostateDataset``.
    The first five predictions are verified twice: via the raw
    ``prediction.preds`` column and via the prediction UDF for "AGE".
    """
    mojo_uri = "file://" + os.path.abspath(
        "../ml/src/test/resources/mojo2data/pipeline.mojo")
    mojo = H2OMOJOPipelineModel.createFromMojo(
        mojo_uri, H2OMOJOSettings(namedMojoOutputColumns=False))

    scored = mojo.transform(prostateDataset).repartition(1)

    expected_ages = (
        65.36320409515132,
        64.96902128114817,
        64.96721023747583,
        65.78772654671035,
        66.11327967814829,
    )

    # Plain column selection: each row holds the preds array, first element is AGE.
    raw_rows = scored.select("prediction.preds").take(5)
    for idx, age in enumerate(expected_ages):
        assert raw_rows[idx][0][0] == age

    # Selection through the UDF yields the scalar directly.
    udf_rows = scored.select(mojo.selectPredictionUDF("AGE")).take(5)
    for idx, age in enumerate(expected_ages):
        assert udf_rows[idx][0] == age