def test_grid_gbm_in_spark_pipeline(self):
        """Round-trip a GBM grid-search pipeline and its fitted model via disk.

        The unfitted pipeline is saved and reloaded, fitted on the prostate
        CSV, and the resulting pipeline model is saved and reloaded again
        before being used for scoring.  Nothing is asserted on the output;
        the test passes as long as no step raises.
        """
        csv_reader = self._spark.read
        dataset_uri = "file://" + unit_test_utils.locate(
            "smalldata/prostate/prostate.csv")
        prostate_frame = csv_reader.csv(
            dataset_uri, header=True, inferSchema=True)

        seed_grid = {"_seed": [1, 2, 3]}
        algo = H2OGridSearch(
            labelCol="AGE",
            hyperParameters=seed_grid,
            ratio=0.8,
            algo=H2OGBM(),
            strategy="RandomDiscrete",
            maxModels=3,
            maxRuntimeSecs=60,
            selectBestModelBy="RMSE",
        )

        # Persist the *unfitted* pipeline first, then fit the reloaded copy.
        pipeline_uri = "file://" + os.path.abspath("build/grid_gbm_pipeline")
        unfitted = Pipeline(stages=[algo])
        pipeline_writer = unfitted.write().overwrite()
        pipeline_writer.save(pipeline_uri)
        model = Pipeline.load(pipeline_uri).fit(prostate_frame)

        # Persist the fitted model and score with the reloaded copy.
        model_uri = "file://" + os.path.abspath(
            "build/grid_gbm_pipeline_model")
        model_writer = model.write().overwrite()
        model_writer.save(model_uri)
        loaded_model = PipelineModel.load(model_uri)

        # count() forces the transformation to actually execute.
        scored = loaded_model.transform(prostate_frame)
        scored.count()
예제 #2
0
def testGetAlgoViaSetter():
    """Regression test for SW-2276 with the algorithm supplied via setAlgo().

    The third consecutive ``getAlgo()`` call used to fail, so the getter is
    invoked twice as a warm-up and the third result is checked.
    """
    grid = H2OGridSearch(
        hyperParameters={"seed": [1, 2, 3]},
        strategy="RandomDiscrete",
        maxModels=3,
        maxRuntimeSecs=60,
        selectBestModelBy="RMSE",
    )
    configured_gbm = (
        H2OGBM()
        .setNtrees(100)
        .setLabelCol("AGE")
        .setSplitRatio(0.8)
    )
    grid.setAlgo(configured_gbm)

    # First two calls only exercise the getter; their results are unused.
    for _ in range(2):
        grid.getAlgo()

    third_result = grid.getAlgo()
    assert third_result.getNtrees() == 100
예제 #3
0
def testGetGridModels(prostateDataset):
    """Fit a grid over three seed values and expect three grid models."""
    search_settings = {
        "hyperParameters": {"seed": [1, 2, 3]},
        "algo": H2OGBM(splitRatio=0.8, labelCol="AGE"),
        "strategy": "RandomDiscrete",
        "maxModels": 3,
        "maxRuntimeSecs": 60,
        "selectBestModelBy": "RMSE",
    }
    grid = H2OGridSearch(**search_settings)
    grid.fit(prostateDataset)

    trained_models = grid.getGridModels()
    model_count = len(trained_models)
    assert model_count == 3
예제 #4
0
def testGetGridModelsNoParams(prostateDataset):
    """Check the grid-model parameter table when no hyper-parameters are set.

    The test expects a single row whose only column is the MOJO model id.
    """
    base_gbm = H2OGBM(labelCol="AGE", splitRatio=0.8)
    grid = H2OGridSearch(
        algo=base_gbm,
        strategy="RandomDiscrete",
        maxModels=3,
        maxRuntimeSecs=60,
        selectBestModelBy="RMSE",
    )
    grid.fit(prostateDataset)

    params = grid.getGridModelsParams()
    row_count = params.count()
    assert row_count == 1
    column_names = params.columns
    assert column_names == ['MOJO Model ID']
    # Pull the rows to the driver to make sure the frame can be materialized.
    params.collect()
예제 #5
0
def testGetGridModelsParams(prostateDataset):
    """Check the grid-model parameter table for a three-value seed grid.

    The test expects one row per seed and the columns
    ``'MOJO Model ID'`` and ``'seed'``, in that order.
    """
    seed_grid = {"seed": [1, 2, 3]}
    base_gbm = H2OGBM(splitRatio=0.8, labelCol="AGE")
    grid = H2OGridSearch(
        hyperParameters=seed_grid,
        algo=base_gbm,
        strategy="RandomDiscrete",
        maxModels=3,
        maxRuntimeSecs=60,
        selectBestModelBy="RMSE",
    )
    grid.fit(prostateDataset)

    params = grid.getGridModelsParams()
    row_count = params.count()
    assert row_count == 3
    column_names = params.columns
    assert column_names == ['MOJO Model ID', 'seed']
    # Pull the rows to the driver to make sure the frame can be materialized.
    params.collect()
예제 #6
0
def testGetAlgoViaConstructor():
    """Regression test for SW-2276 with the algorithm passed to the constructor.

    The third consecutive ``getAlgo()`` call used to fail, so the getter is
    invoked twice as a warm-up and the third result is checked.
    """
    grid = H2OGridSearch(
        hyperParameters={"seed": [1, 2, 3]},
        algo=H2OGBM(labelCol="AGE", ntrees=100, splitRatio=0.8),
        strategy="RandomDiscrete",
        maxModels=3,
        maxRuntimeSecs=60,
        selectBestModelBy="RMSE",
    )

    # First two calls only exercise the getter; their results are unused.
    for _ in range(2):
        grid.getAlgo()

    third_result = grid.getAlgo()
    assert third_result.getNtrees() == 100
예제 #7
0
def testGetGridModelsMetrics(prostateDataset):
    """Check the grid-model metrics table for a three-value seed grid.

    The test expects one row per seed and the MOJO model id followed by the
    MSE, MeanResidualDeviance, R2 and RMSE columns.
    """
    expected_columns = [
        'MOJO Model ID', 'MSE', 'MeanResidualDeviance', 'R2', 'RMSE'
    ]
    search_settings = {
        "hyperParameters": {"seed": [1, 2, 3]},
        "algo": H2OGBM(labelCol="AGE", splitRatio=0.8),
        "strategy": "RandomDiscrete",
        "maxModels": 3,
        "maxRuntimeSecs": 60,
        "selectBestModelBy": "RMSE",
    }
    grid = H2OGridSearch(**search_settings)
    grid.fit(prostateDataset)

    metrics = grid.getGridModelsMetrics()
    row_count = metrics.count()
    assert row_count == 3
    assert metrics.columns == expected_columns
    # Pull the rows to the driver to make sure the frame can be materialized.
    metrics.collect()
    def test_load_mojo_gbm(self):
        """Compare predictions of a stored GBM MOJO with a freshly trained GBM.

        A MOJO is loaded from the test resources, a GBM is trained on the
        prostate data with fixed settings, and both score the same frame.
        The test requires the same number of prediction rows and row-by-row
        equality.

        NOTE(review): this presumably relies on the stored MOJO having been
        produced with the same GBM settings used below -- confirm against
        the resource file.
        """
        from pysparkling.ml import H2OMOJOModel, H2OGBM
        mojo = H2OMOJOModel.create_from_mojo(
            "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
        prostate_frame = self._hc.as_spark_frame(
            h2o.upload_file(unit_test_utils.locate("smalldata/prostate/prostate.csv")))

        gbm = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", labelCol="capsule")

        model = gbm.fit(prostate_frame)

        # A single partition keeps both collected results in one partition.
        pred_mojo = mojo.predict(prostate_frame).repartition(1).collect()
        pred_model = model.transform(prostate_frame).repartition(1).collect()

        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest.TestCase in Python 3.12.
        self.assertEqual(len(pred_mojo), len(pred_model))
        # Lengths are already known to match, so zip pairs every row.
        for mojo_row, model_row in zip(pred_mojo, pred_model):
            self.assertEqual(mojo_row, model_row)
예제 #9
0
def testPipelineSerializationGBM(prostateDataset):
    """Run the shared grid-search tester with a GBM whose label column is AGE.

    All checks are delegated to ``gridSearchTester``, which is defined
    outside this snippet.
    """
    gbm_with_label = H2OGBM().setLabelCol("AGE")
    gridSearchTester(gbm_with_label, prostateDataset)