Example #1
0
    def test_h2o_mojo_model_serialization_in_pipeline(self):
        """Round-trip a Pipeline containing a MOJO stage through save/load,
        then fit it and round-trip the resulting PipelineModel as well."""
        mojo_path = "file://" + os.path.abspath(
            "../ml/src/test/resources/binom_model_prostate.mojo")
        mojo = H2OMOJOModel.create_from_mojo(mojo_path)
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True)

        pipeline_path = "file://" + os.path.abspath(
            "build/test_spark_pipeline_model_mojo")
        model_path = "file://" + os.path.abspath(
            "build/test_spark_pipeline_model_mojo_model")

        pipeline = Pipeline(stages=[mojo])

        # Persist and reload the unfitted pipeline.
        pipeline.write().overwrite().save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)

        model = loaded_pipeline.fit(prostate_frame)

        # Persist and reload the fitted model; loading must not raise.
        model.write().overwrite().save(model_path)
        PipelineModel.load(model_path)
 def test_h2o_mojo_predictions(self):
     """Load a MOJO and score a frame without starting an H2O Context."""
     model_uri = "file://" + os.path.abspath(
         "../ml/src/test/resources/binom_model_prostate.mojo")
     data_uri = "file://" + unit_test_utils.locate(
         "smalldata/prostate/prostate.csv")

     mojo = H2OMOJOModel.create_from_mojo(model_uri)
     frame = self._spark.read.csv(data_uri, header=True)

     # Scoring must complete without an H2O Context being up.
     mojo.predict(frame).repartition(1).collect()
 def test_h2o_mojo_predictions_unseen_categoricals(self):
     """A row with an unseen categorical level passes through scoring when
     setConvertUnknownCategoricalLevelsToNa(True) is set, keeping the input
     columns intact and producing the expected prediction.
     """
     # Load via an absolute file:// URI like every other MOJO load in this
     # suite; a bare relative path silently depends on the working directory.
     mojo = H2OMOJOModel.create_from_mojo("file://" + os.path.abspath(
         "../ml/src/test/resources/deep_learning_airlines_categoricals.zip"))
     mojo.setConvertUnknownCategoricalLevelsToNa(True)
     d =[{'sepal_len':5.1, 'sepal_wid':3.5, 'petal_len':1.4, 'petal_wid':0.2, 'class':'Missing_categorical'}]
     df = self._spark.createDataFrame(d)
     data = mojo.transform(df).collect()[0]
     # Input columns are preserved unchanged in the scored output.
     assert data["class"] == "Missing_categorical"
     assert data["petal_len"] == 1.4
     assert data["petal_wid"] == 0.2
     assert data["sepal_len"] == 5.1
     assert data["sepal_wid"] == 3.5
     # Deterministic prediction for this fixed MOJO and input row.
     assert data["prediction_output"][0] == 5.240174068202646
Example #4
0
    def test_h2o_mojo_unsupervised(self):
        """Score a single-column frame with an isolation-forest MOJO."""
        mojo_path = "file://" + os.path.abspath(
            "../ml/src/test/resources/isolation_forest.mojo")
        mojo = H2OMOJOModel.create_from_mojo(mojo_path)

        scoring_row = Row("V1")

        # One-row frame built from an RDD of Row objects.
        rdd = self._spark.sparkContext.parallelize([(5.1,)])
        df = self._spark.createDataFrame(
            rdd.map(lambda values: scoring_row(*values)))

        # Prediction must complete without raising.
        mojo.predict(df).repartition(1).collect()
Example #5
0
def load_mojo_model(local_dir, filename, extension=""):
    """
    Load a saved H2OMOJOModel (usable with Spark without a running
    H2O Sparkling session).

    :param string local_dir: Local directory where the model is saved
    :param string filename: Filename with which the model is saved
    :param string extension: Extension to the filename with which the model is saved
    :return: the loaded ``H2OMOJOModel``
    """
    import os.path

    from pysparkling.ml import H2OMOJOModel
    # os.path.join copes with a trailing separator on local_dir,
    # unlike naive "dir + '/' + name" concatenation.
    return H2OMOJOModel.create_from_mojo(
        os.path.join(local_dir, filename + extension))
Example #6
0
    def test_load_mojo_deeplearning(self):
        """MOJO predictions must equal those of a freshly trained DL model."""
        from pysparkling.ml import H2OMOJOModel, H2ODeepLearning
        mojo_path = "file://" + os.path.abspath(
            "../ml/src/test/resources/deep_learning_prostate.mojo")
        mojo = H2OMOJOModel.create_from_mojo(mojo_path)
        prostate_frame = self._hc.as_spark_frame(
            h2o.upload_file(
                unit_test_utils.locate("smalldata/prostate/prostate.csv")))

        estimator = H2ODeepLearning(seed=42, reproducible=True,
                                    predictionCol="CAPSULE")
        fitted = estimator.fit(prostate_frame)

        rows_from_mojo = mojo.predict(prostate_frame).repartition(1).collect()
        rows_from_model = fitted.transform(
            prostate_frame).repartition(1).collect()

        # Same number of rows, and every row identical pairwise.
        assert len(rows_from_mojo) == len(rows_from_model)
        for mojo_row, model_row in zip(rows_from_mojo, rows_from_model):
            assert mojo_row == model_row
Example #7
0
    def test_load_mojo_gbm(self):
        """MOJO predictions must equal those of a freshly trained GBM model."""
        from pysparkling.ml import H2OMOJOModel, H2OGBM
        mojo_path = "file://" + os.path.abspath(
            "../ml/src/test/resources/binom_model_prostate.mojo")
        mojo = H2OMOJOModel.create_from_mojo(mojo_path)
        prostate_frame = self._hc.as_spark_frame(
            h2o.upload_file(
                unit_test_utils.locate("smalldata/prostate/prostate.csv")))

        estimator = H2OGBM(ntrees=2, seed=42, distribution="bernoulli",
                           predictionCol="capsule")
        fitted = estimator.fit(prostate_frame)

        rows_from_mojo = mojo.predict(prostate_frame).repartition(1).collect()
        rows_from_model = fitted.transform(
            prostate_frame).repartition(1).collect()

        # Same number of rows, and every row identical pairwise.
        assert len(rows_from_mojo) == len(rows_from_model)
        for mojo_row, model_row in zip(rows_from_mojo, rows_from_model):
            assert mojo_row == model_row
    def test_h2o_mojo_predictions_unseen_categoricals(self):
        """An unseen categorical level is converted to NA and still scored."""
        mojo_path = "file://" + os.path.abspath(
            "../ml/src/test/resources/deep_learning_airlines_categoricals.zip")
        mojo = H2OMOJOModel.create_from_mojo(mojo_path)
        mojo.setConvertUnknownCategoricalLevelsToNa(True)

        scoring_row = Row("sepal_len", "sepal_wid", "petal_len",
                          "petal_wid", "class")
        rdd = self._spark.sparkContext.parallelize(
            [(5.1, 3.5, 1.4, 0.2, "Missing_categorical")])
        df = self._spark.createDataFrame(
            rdd.map(lambda values: scoring_row(*values)))

        scored = mojo.transform(df).collect()[0]
        # Input columns are preserved unchanged in the scored output.
        assert scored["class"] == "Missing_categorical"
        assert scored["petal_len"] == 1.4
        assert scored["petal_wid"] == 0.2
        assert scored["sepal_len"] == 5.1
        assert scored["sepal_wid"] == 3.5
        # Deterministic prediction for this fixed MOJO and input row.
        assert scored["prediction_output"][0] == 5.240174068202646