예제 #1
0
    def test_mojo_dai_pipeline_serialize(self):
        """Round-trip a DAI mojo through Spark Pipeline / PipelineModel
        persistence and verify the predictions survive both save/load cycles."""
        mojoPath = "file://" + os.path.abspath(
            "../ml/src/test/resources/mojo2data/pipeline.mojo")
        mojo = H2OMOJOPipelineModel.createFromMojo(mojoPath)
        dataset = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True)

        # Build a Spark pipeline whose only stage is the mojo, persist the
        # unfitted pipeline definition, and read it back.
        pipelinePath = "file://" + os.path.abspath(
            "build/test_dai_pipeline_as_spark_pipeline")
        Pipeline(stages=[mojo]).write().overwrite().save(pipelinePath)
        loadedPipeline = Pipeline.load(pipelinePath)

        # Fit the reloaded pipeline, then round-trip the fitted model as well.
        fitted = loadedPipeline.fit(dataset)
        modelPath = "file://" + os.path.abspath(
            "build/test_dai_pipeline_as_spark_pipeline_model")
        fitted.write().overwrite().save(modelPath)
        loadedModel = PipelineModel.load(modelPath)

        preds = loadedModel.transform(dataset).repartition(1).select(
            mojo.selectPredictionUDF("AGE")).take(5)

        expected = [65.36320409515132, 64.96902128114817, 64.96721023747583,
                    65.78772654671035, 66.11327967814829]
        for row, value in zip(preds, expected):
            assert row[0] == value
    def test_h2o_mojo_pipeline_predictions(self):
        """Score the prostate dataset with a DAI mojo using positional
        (non-named) output columns and check that both ways of selecting the
        prediction agree with the expected values.

        Scoring is deliberately done without starting an H2O Context.
        """
        mojo = H2OMOJOPipelineModel.create_from_mojo(
            "file://" +
            os.path.abspath("../ml/src/test/resources/mojo2data/pipeline.mojo")
        )
        # Switch to positional (array-based) output instead of named columns.
        mojo.set_named_mojo_output_columns(False)
        dataset = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True)
        scored = mojo.predict(dataset).repartition(1)

        expected = [65.36320409515132, 64.96902128114817, 64.96721023747583,
                    65.78772654671035, 66.11327967814829]

        # Struct-field selection: each row carries an array of predictions.
        arraySelection = scored.select("prediction.preds").take(5)
        for row, value in zip(arraySelection, expected):
            assert row[0][0] == value

        # UDF-based selection yields the scalar prediction directly.
        udfSelection = scored.select(mojo.select_prediction_udf("AGE")).take(5)
        for row, value in zip(udfSelection, expected):
            assert row[0] == value

        assert mojo.get_output_names()[0] == "AGE"
    def test_h2o_mojo_pipeline_predictions_with_named_cols(self):
        """Score the prostate dataset with a DAI mojo using the default named
        output columns and verify the first five AGE predictions.

        Scoring is deliberately done without starting an H2O Context.
        """
        mojo = H2OMOJOPipelineModel.createFromMojo(
            "file://" + os.path.abspath("../ml/src/test/resources/mojo2data/pipeline.mojo"))
        dataset = self._spark.read.csv(
            "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True)
        preds = mojo.transform(dataset).repartition(1) \
            .select(mojo.selectPredictionUDF("AGE")).take(5)

        expected = [65.36320409515132, 64.96902128114817, 64.96721023747583,
                    65.78772654671035, 66.11327967814829]
        for row, value in zip(preds, expected):
            assert row[0] == value
예제 #4
0
def mojoModelScoring(sparkSession,
                     scoreFrame,
                     mojoFile,
                     selectionColumns=None,
                     outColumns=None,
                     clusterResource=None):
    """
    Performs scoring on the dataset provided against the mojo file passed to this scoring function

    Syntax:
        status, message, df = mojoModelScoring(sparkSession, scoreFrame, mojoFile)

    Args:
        sparkSession (SparkSession)        : active Spark session
        scoreFrame (pyspark.sql.dataframe) : dataframe to be scored against
        mojoFile (str)                     : path to the mojo pipeline file to score with
        selectionColumns (list)            : list of column names that should be considered to model, when set to None, all columns are
                                                considered (Default: None)
        outColumns (list)                  : new names for the columns the mojo adds to the
                                                frame, applied in order; extra entries on
                                                either side are ignored (Default: None)
        clusterResource (str)              : optional .py file shipped to the cluster via
                                                addPyFile before scoring (Default: None)

    Returns:
        status (bool)                      : True/False based on execution of the function
        message (str)                      : message from execution of the function
        df (pyspark.sql.dataframe)         : scored data as a pyspark dataframe, or None on failure
    """
    if clusterResource:
        sparkSession.sparkContext.addPyFile(clusterResource)
    # Imported lazily so addPyFile (above) can ship the dependency first.
    from pysparkling.ml import H2OMOJOPipelineModel

    try:
        # Load the mojo pipeline from the provided file path.
        mojo = H2OMOJOPipelineModel.createFromMojo("file://" + mojoFile)
        inputFrame = (scoreFrame.select(*selectionColumns)
                      if selectionColumns else scoreFrame)
        transformFrame = mojo.transform(inputFrame)
        if outColumns:
            # Columns added by the mojo are those absent from the input frame.
            addedColumns = [c for c in transformFrame.columns
                            if c not in scoreFrame.columns]
            # Rename them in order. A plain loop replaces the original
            # eval(reduce(...)) string construction, which was fragile
            # (broke on column names containing quotes) and a security smell.
            # zip truncates to the shorter list, matching the old behavior.
            for oldName, newName in zip(addedColumns, outColumns):
                transformFrame = transformFrame.withColumnRenamed(oldName,
                                                                  newName)
        return True, "dataset scored against mojo file", transformFrame
    except Exception as e:
        print(
            "Error occured while scoring the mojo file {} on the provided dataset:\n{}"
            .format(mojoFile, e))
        return False, e, None
예제 #5
0
def test_feature_types_on_h2o_mojo_pipeline():
    """Verify the feature-name -> type mapping reported by a DAI mojo pipeline.

    Bug fix: the original comparisons were bare expressions without the
    `assert` keyword, so their results were silently discarded and the test
    could never fail regardless of what getFeatureTypes() returned.
    """
    mojo = H2OMOJOPipelineModel.createFromMojo(
        "file://" +
        os.path.abspath("../ml/src/test/resources/mojo2data/pipeline.mojo"))
    types = mojo.getFeatureTypes()

    expected = {
        "DPROS": "Int32",
        "GLEASON": "Int32",
        "VOL": "Float64",
        "DCAPS": "Int32",
        "PSA": "Float64",
        "CAPSULE": "Int32",
        "RACE": "Int32",
        "ID": "Int32",
    }
    for feature, featureType in expected.items():
        assert types[feature] == featureType
    # 9 features in total (AGE is the prediction target, not listed above).
    assert len(types) == 9
예제 #6
0
def testMojoPipelineProtoBackendWithoutError(spark):
    """Smoke test: scoring with a proto-backend mojo pipeline must not raise."""
    mojo = H2OMOJOPipelineModel.createFromMojo(
        "file://" +
        os.path.abspath("../ml/src/test/resources/proto_based_pipeline.mojo"))

    columns = ['pclass', 'sex', 'age', 'ticket', 'fare', 'cabin']
    rows = [
        (2.0, 'male', 0.41670000553131104, 111361, 6.449999809265137, 'A19'),
        (1.0, 'female', 0.33329999446868896, 110413, 6.4375, 'A14'),
        (1.0, 'female', 0.16670000553131104, 111320, 6.237500190734863, 'A21'),
        (1.0, 'female', 2.0, 111361, 6.237500190734863, 'A20'),
        (3.0, 'female', 1.0, 110152, 6.75, 'A14'),
        (1.0, 'male', 0.666700005531311, 110489, 6.85830020904541, 'A10'),
        (3.0, 'male', 0.33329999446868896, 111320, 0.0, 'A11'),
        (3.0, 'male', 2.0, 110413, 6.85830020904541, 'A24'),
        (1.0, 'female', 1.0, 110489, 3.170799970626831, 'A21'),
        (1.0, 'female', 0.33329999446868896, 111240, 0.0, 'A14')
    ]
    df = spark.createDataFrame(spark.sparkContext.parallelize(rows), columns)

    # Materialize the scored frame; the test passes if no error is raised.
    mojo.transform(df).collect()
예제 #7
0
def test_h2o_mojo_pipeline_predictions(prostateDataset):
    """Score the prostate dataset with a DAI mojo configured for positional
    output columns and check both prediction-selection styles agree.

    Scoring is deliberately done without starting an H2O Context.
    """
    mojoPath = "file://" + os.path.abspath(
        "../ml/src/test/resources/mojo2data/pipeline.mojo")
    # Positional (array-based) output instead of named columns.
    settings = H2OMOJOSettings(namedMojoOutputColumns=False)
    mojo = H2OMOJOPipelineModel.createFromMojo(mojoPath, settings)

    scored = mojo.transform(prostateDataset).repartition(1)

    expected = [65.36320409515132, 64.96902128114817, 64.96721023747583,
                65.78772654671035, 66.11327967814829]

    # Struct-field selection: each row carries an array of predictions.
    arraySelection = scored.select("prediction.preds").take(5)
    for row, value in zip(arraySelection, expected):
        assert row[0][0] == value

    # UDF-based selection yields the scalar prediction directly.
    udfSelection = scored.select(mojo.selectPredictionUDF("AGE")).take(5)
    for row, value in zip(udfSelection, expected):
        assert row[0] == value