def testPipelineWithTargetEncoderIsSerializable():
    """A TargetEncoder + GBM pipeline must survive a save/load round trip with
    all target-encoder parameters and the GBM label/feature columns intact."""
    encoder = H2OTargetEncoder(
        foldCol="ID",
        labelCol="CAPSULE",
        inputCols=["RACE", "DPROS", "DCAPS"],
        outputCols=["RACE_out", "DPROS_out", "DCAPS_out"],
        holdoutStrategy="KFold",
        blendedAvgEnabled=True,
        blendedAvgInflectionPoint=15.0,
        blendedAvgSmoothing=25.0,
        noise=0.05,
        noiseSeed=123)
    estimator = H2OGBM().setLabelCol("CAPSULE").setFeaturesCols(encoder.getOutputCols())
    pipeline = Pipeline(stages=[encoder, estimator])
    path = "file://" + os.path.abspath("build/testPipelineWithTargetEncoderIsSerializable")
    pipeline.write().overwrite().save(path)
    reloaded = Pipeline.load(path)
    [loadedEncoder, loadedEstimator] = reloaded.getStages()
    assertTargetEncoderAndMOJOModelParamsAreEqual(encoder, loadedEncoder)
    assert estimator.getLabelCol() == loadedEstimator.getLabelCol()
    assert estimator.getFeaturesCols() == loadedEstimator.getFeaturesCols()
def testPipelineSerialization(craiglistDataset):
    """A text-classification pipeline and its fitted model must survive
    save/load and keep producing identical predictions on the test split."""
    [trainingDataset, testingDataset] = craiglistDataset.randomSplit([0.9, 0.1], 42)
    tokenizer = RegexTokenizer(inputCol="jobtitle", minTokenLength=2, outputCol="tokenized")
    stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="stopWordsRemoved")
    w2v = H2OWord2Vec(sentSampleRate=0,
                      epochs=10,
                      inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="w2v")
    gbm = H2OGBM(labelCol="category", featuresCols=[w2v.getOutputCol()])
    pipelinePath = "file://" + os.path.abspath("build/w2v_pipeline")
    modelPath = "file://" + os.path.abspath("build/w2v_pipeline_model")
    Pipeline(stages=[tokenizer, stopWordsRemover, w2v, gbm]).write().overwrite().save(pipelinePath)
    loadedPipeline = Pipeline.load(pipelinePath)
    model = loadedPipeline.fit(trainingDataset)
    expected = model.transform(testingDataset)
    model.write().overwrite().save(modelPath)
    loadedModel = PipelineModel.load(modelPath)
    result = loadedModel.transform(testingDataset)
    unit_test_utils.assert_data_frames_are_identical(expected, result)
def testPipelineWithTargetEncoderTransformsTrainingAndTestingDatasetWithoutException(trainingDataset, testingDataset):
    """Smoke test: fitting and transforming through TargetEncoder + GBM must not raise."""
    stages = [
        H2OTargetEncoder(labelCol="CAPSULE", inputCols=["RACE", "DPROS", "DCAPS"]),
        H2OGBM(labelCol="CAPSULE"),
    ]
    fitted = Pipeline(stages=stages).fit(trainingDataset)
    # collect() forces full evaluation so any transform-time error surfaces here
    fitted.transform(testingDataset).collect()
def testMonotoneConstraintsGetProperlyPropagatedToJavaBackend():
    """monotoneConstraints set in Python must arrive unchanged on the Java side."""
    estimator = H2OGBM(monotoneConstraints={"District": -1, "Group": 1})
    estimator._transfer_params_to_java()
    javaConstraints = estimator._java_obj.getMonotoneConstraints()
    # the Java map stores the constraint directions as doubles
    assert javaConstraints.apply("District") == -1.0
    assert javaConstraints.apply("Group") == 1.0
def testGetGridModelsNoParams(prostateDataset):
    """With no hyper parameters the params frame holds a single row with only
    the model-id column."""
    search = H2OGridSearch(labelCol="AGE",
                           splitRatio=0.8,
                           algo=H2OGBM(),
                           strategy="RandomDiscrete",
                           maxModels=3,
                           maxRuntimeSecs=60,
                           selectBestModelBy="RMSE")
    search.fit(prostateDataset)
    paramsFrame = search.getGridModelsParams()
    assert paramsFrame.count() == 1
    assert paramsFrame.columns == ['MOJO Model ID']
    paramsFrame.collect()  # try materializing
def testGetGridModels(prostateDataset):
    """A grid over three seed values must yield exactly three models."""
    search = H2OGridSearch(hyperParameters={"seed": [1, 2, 3]},
                           algo=H2OGBM(splitRatio=0.8, labelCol="AGE"),
                           strategy="RandomDiscrete",
                           maxModels=3,
                           maxRuntimeSecs=60,
                           selectBestModelBy="RMSE")
    search.fit(prostateDataset)
    assert len(search.getGridModels()) == 3
def testGetAlgoViaSetter():
    """Regression test for SW-2276: the third consecutive getAlgo() call used
    to fail when the algorithm was supplied through setAlgo()."""
    search = H2OGridSearch(hyperParameters={"seed": [1, 2, 3]},
                           strategy="RandomDiscrete",
                           maxModels=3,
                           maxRuntimeSecs=60,
                           selectBestModelBy="RMSE")
    search.setAlgo(H2OGBM().setNtrees(100).setLabelCol("AGE").setSplitRatio(0.8))
    # call getAlgo() three times in total; the bug only appeared on the third call
    search.getAlgo()
    search.getAlgo()
    assert search.getAlgo().getNtrees() == 100
def gbmModelWithOffset(dataset):
    """Fixture: train a tweedie GBM that uses an offset column on the claims data."""
    estimator = H2OGBM(distribution="tweedie",
                       ntrees=600,
                       maxDepth=1,
                       minRows=1,
                       learnRate=0.1,
                       minSplitImprovement=0,
                       featuresCols=["District", "Group", "Age"],
                       labelCol="Claims",
                       offsetCol="Offset")
    return estimator.fit(dataset)
def testGetAlgoViaConstructor():
    """Regression test for SW-2276: the third consecutive getAlgo() call used
    to fail when the algorithm was supplied through the constructor."""
    search = H2OGridSearch(hyperParameters={"seed": [1, 2, 3]},
                           algo=H2OGBM(labelCol="AGE", ntrees=100, splitRatio=0.8),
                           strategy="RandomDiscrete",
                           maxModels=3,
                           maxRuntimeSecs=60,
                           selectBestModelBy="RMSE")
    # call getAlgo() three times in total; the bug only appeared on the third call
    search.getAlgo()
    search.getAlgo()
    assert search.getAlgo().getNtrees() == 100
def testGetGridModelsParams(prostateDataset):
    """The params frame has one row per grid model and columns for the model id
    plus every hyper parameter that was searched."""
    search = H2OGridSearch(hyperParameters={"seed": [1, 2, 3]},
                           algo=H2OGBM(splitRatio=0.8, labelCol="AGE"),
                           strategy="RandomDiscrete",
                           maxModels=3,
                           maxRuntimeSecs=60,
                           selectBestModelBy="RMSE")
    search.fit(prostateDataset)
    paramsFrame = search.getGridModelsParams()
    assert paramsFrame.count() == 3
    assert paramsFrame.columns == ['MOJO Model ID', 'seed']
    paramsFrame.collect()  # try materializing
def testLoadAndTrainMojo(prostateDataset):
    """A MOJO exported earlier and a freshly trained GBM with the same settings
    must produce identical predictions row by row."""
    mojo = H2OMOJOModel.createFromMojo(
        "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
    gbm = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", labelCol="capsule")
    model = gbm.fit(prostateDataset)
    # repartition(1) pins a deterministic row order for the pairwise comparison
    predMojo = mojo.transform(prostateDataset).repartition(1).collect()
    predModel = model.transform(prostateDataset).repartition(1).collect()
    assert len(predMojo) == len(predModel)
    # idiomatic pairwise iteration instead of indexing with range(len(...))
    for rowMojo, rowModel in zip(predMojo, predModel):
        assert rowMojo == rowModel
def testGetGridModelsMetrics(prostateDataset):
    """The metrics frame has one row per grid model and the expected metric columns."""
    search = H2OGridSearch(hyperParameters={"seed": [1, 2, 3]},
                           algo=H2OGBM(labelCol="AGE", splitRatio=0.8),
                           strategy="RandomDiscrete",
                           maxModels=3,
                           maxRuntimeSecs=60,
                           selectBestModelBy="RMSE")
    search.fit(prostateDataset)
    metricsFrame = search.getGridModelsMetrics()
    assert metricsFrame.count() == 3
    expectedColumns = ['MOJO Model ID', 'MSE', 'MeanResidualDeviance', 'R2', 'RMSE']
    assert metricsFrame.columns == expectedColumns
    metricsFrame.collect()  # try materializing
def test_load_mojo_gbm(self):
    """A pre-exported MOJO and a freshly trained GBM with the same settings
    must predict identically on the prostate data."""
    from pysparkling.ml import H2OMOJOModel, H2OGBM
    mojo = H2OMOJOModel.create_from_mojo(
        "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
    prostate_frame = self._hc.as_spark_frame(
        h2o.upload_file(unit_test_utils.locate("smalldata/prostate/prostate.csv")))
    gbm = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", predictionCol="capsule")
    model = gbm.fit(prostate_frame)
    # repartition(1) pins a deterministic row order for the pairwise comparison
    pred_mojo = mojo.predict(prostate_frame).repartition(1).collect()
    pred_model = model.transform(prostate_frame).repartition(1).collect()
    assert len(pred_mojo) == len(pred_model)
    # idiomatic pairwise iteration instead of indexing with range(len(...))
    for row_mojo, row_model in zip(pred_mojo, pred_model):
        assert row_mojo == row_model
def testMonotoneConstraintsGetProperlyPropagatedFromJavaBackend():
    # Verifies that _transfer_params_from_java() overwrites Python-side values
    # with whatever the Java backend currently holds.
    gbm = H2OGBM(monotoneConstraints={"District": -1, "Group": 1})
    # push the constructor values to Java: Java now holds District=-1, Group=1
    gbm._transfer_params_to_java()
    # overwrite the Python side only; Java still holds the original values
    gbm.setMonotoneConstraints({"District": 1, "Group": -1})
    constraints = gbm.getMonotoneConstraints()
    assert constraints["District"] == 1.0
    assert constraints["Group"] == -1.0
    # pulling from Java must restore the values that were transferred earlier
    gbm._transfer_params_from_java()
    constraints = gbm.getMonotoneConstraints()
    assert constraints["District"] == -1.0
    assert constraints["Group"] == 1.0
def h2o_gbm(df, label, columns, **kargs):
    """Fit an H2O GBM on *df* and return (predictions dataframe, fitted model).

    The label column is string-indexed and the feature columns are assembled
    into a vector before training; the raw p1 probability is thresholded at
    0.5 to produce a binary "prediction" column.
    """
    H2OContext.getOrCreate(Spark.instance.spark)
    df_sti = string_to_index(df, input_cols=label)
    df_va = vector_assembler(df_sti, input_cols=columns)
    # renamed from `h2o_gbm` so the local no longer shadows this function's own name
    estimator = H2OGBM(ratio=0.8, seed=1, featuresCols=columns, labelCol=label, **kargs)
    model = estimator.fit(df_va)
    df_raw = model.transform(df_va)
    df_pred = df_raw.withColumn(
        "prediction",
        when(df_raw.prediction_output["p1"] > 0.5, 1.0).otherwise(0.0))
    return df_pred, model
def testDomainColumns(prostateDataset):
    """getDomainValues() exposes categorical levels only for the binomial label;
    all numeric columns report no domain.

    The original version also loaded an unrelated MOJO file into a variable
    that was never used — that dead I/O has been removed.
    """
    gbm = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", labelCol="capsule")
    model = gbm.fit(prostateDataset)
    domainValues = model.getDomainValues()
    assert domainValues["DPROS"] is None
    assert domainValues["DCAPS"] is None
    assert domainValues["VOL"] is None
    assert domainValues["AGE"] is None
    assert domainValues["PSA"] is None
    # the bernoulli label is categorical with levels "0"/"1"
    assert domainValues["capsule"] == ["0", "1"]
    assert domainValues["RACE"] is None
    assert domainValues["ID"] is None
def testPipelineSerialization(prostateDataset):
    """A grid search wrapped in a Pipeline must survive save/load both before
    fitting (Pipeline) and after fitting (PipelineModel)."""
    search = H2OGridSearch(labelCol="AGE",
                           hyperParameters={"_seed": [1, 2, 3]},
                           splitRatio=0.8,
                           algo=H2OGBM(),
                           strategy="RandomDiscrete",
                           maxModels=3,
                           maxRuntimeSecs=60,
                           selectBestModelBy="RMSE")
    pipelinePath = "file://" + os.path.abspath("build/grid_gbm_pipeline")
    modelPath = "file://" + os.path.abspath("build/grid_gbm_pipeline_model")
    Pipeline(stages=[search]).write().overwrite().save(pipelinePath)
    loadedPipeline = Pipeline.load(pipelinePath)
    model = loadedPipeline.fit(prostateDataset)
    model.write().overwrite().save(modelPath)
    loadedModel = PipelineModel.load(modelPath)
    # count() forces evaluation so any transform-time error surfaces here
    loadedModel.transform(prostateDataset).count()
outputCol="filtered", stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"], caseSensitive=False) ## Hash the words hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(), outputCol="wordToIndex", numFeatures=1 << 10) ## Create inverse document frequencies model idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4) if algo == "gbm": ## Create GBM model algoStage = H2OGBM(ratio=0.8, seed=1, featuresCols=[idf.getOutputCol()], predictionCol="label") elif algo == "dl": ## Create H2ODeepLearning model algoStage = H2ODeepLearning(epochs=10, seed=1, l1=0.001, l2=0.0, hidden=[200, 200], featuresCols=[idf.getOutputCol()], predictionCol="label") elif algo == "automl": ## Create H2OAutoML model algoStage = H2OAutoML( convertUnknownCategoricalLevelsToNa=True, maxRuntimeSecs=60, # 1 minutes
def testGBMParameters(prostateDataset):
    """Parameters set on the estimator must be reflected on the fitted model."""
    featureColumns = ['AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA']
    estimator = H2OGBM(seed=1,
                       labelCol="CAPSULE",
                       featuresCols=featureColumns,
                       monotoneConstraints={'AGE': 1, 'RACE': -1})
    fittedModel = estimator.fit(prostateDataset)
    compareParameterValues(estimator, fittedModel)
def testParams():
    """Construct H2OGBM with every supported parameter given explicitly and
    verify each getter reflects the value that was passed in.

    Comparisons against None/True/False use identity / truthiness instead of
    the original non-idiomatic ``== None`` / ``== True`` forms.
    """
    gbm = H2OGBM(modelId=None,
                 splitRatio=1.0,
                 labelCol="label",
                 weightCol=None,
                 featuresCols=[],
                 allStringColumnsToCategorical=True,
                 columnsToCategorical=[],
                 nfolds=0,
                 keepCrossValidationPredictions=False,
                 keepCrossValidationFoldAssignment=False,
                 parallelizeCrossValidation=True,
                 seed=-1,
                 distribution="Auto",
                 ntrees=50,
                 maxDepth=5,
                 minRows=10.0,
                 nbins=20,
                 nbinsCats=1024,
                 minSplitImprovement=1e-5,
                 histogramType="AUTO",
                 r2Stopping=1,
                 nbinsTopLevel=1 << 10,
                 buildTreeOneNode=False,
                 scoreTreeInterval=0,
                 sampleRate=1.0,
                 sampleRatePerClass=None,
                 colSampleRateChangePerLevel=1.0,
                 colSampleRatePerTree=1.0,
                 learnRate=0.1,
                 learnRateAnnealing=1.0,
                 colSampleRate=1.0,
                 maxAbsLeafnodePred=1,
                 predNoiseBandwidth=0.0,
                 convertUnknownCategoricalLevelsToNa=False,
                 foldCol=None,
                 predictionCol="prediction",
                 detailedPredictionCol="detailed_prediction",
                 withDetailedPredictionCol=False,
                 convertInvalidNumbersToNa=False)
    assert gbm.getModelId() is None
    assert gbm.getSplitRatio() == 1.0
    assert gbm.getLabelCol() == "label"
    assert gbm.getWeightCol() is None
    assert gbm.getFeaturesCols() == []
    assert gbm.getAllStringColumnsToCategorical()
    assert gbm.getColumnsToCategorical() == []
    assert gbm.getNfolds() == 0
    assert not gbm.getKeepCrossValidationPredictions()
    assert not gbm.getKeepCrossValidationFoldAssignment()
    assert gbm.getParallelizeCrossValidation()
    assert gbm.getSeed() == -1
    # NOTE(review): "Auto" appears to be upper-cased by the backend — the
    # original test also expected "AUTO" here.
    assert gbm.getDistribution() == "AUTO"
    assert gbm.getNtrees() == 50
    assert gbm.getMaxDepth() == 5
    assert gbm.getMinRows() == 10.0
    assert gbm.getNbins() == 20
    assert gbm.getNbinsCats() == 1024
    assert gbm.getMinSplitImprovement() == 1e-5
    assert gbm.getHistogramType() == "AUTO"
    assert gbm.getR2Stopping() == 1
    assert gbm.getNbinsTopLevel() == 1 << 10
    assert not gbm.getBuildTreeOneNode()
    assert gbm.getScoreTreeInterval() == 0
    assert gbm.getSampleRate() == 1.0
    assert gbm.getSampleRatePerClass() is None
    assert gbm.getColSampleRateChangePerLevel() == 1.0
    assert gbm.getColSampleRatePerTree() == 1.0
    assert gbm.getLearnRate() == 0.1
    assert gbm.getLearnRateAnnealing() == 1.0
    assert gbm.getColSampleRate() == 1.0
    assert gbm.getMaxAbsLeafnodePred() == 1
    assert gbm.getPredNoiseBandwidth() == 0.0
    assert not gbm.getConvertUnknownCategoricalLevelsToNa()
    assert gbm.getFoldCol() is None
    assert gbm.getPredictionCol() == "prediction"
    assert gbm.getDetailedPredictionCol() == "detailed_prediction"
    assert not gbm.getWithDetailedPredictionCol()
    assert not gbm.getConvertInvalidNumbersToNa()
def testPipelineSerializationGBM(prostateDataset):
    """Run the shared grid-search serialization scenario with a labeled GBM."""
    algo = H2OGBM().setLabelCol("AGE")
    gridSearchTester(algo, prostateDataset)
def gbmModel(prostateDataset):
    """Fixture: train a small binomial GBM on the prostate data."""
    estimator = H2OGBM(ntrees=2, seed=42, distribution="bernoulli", labelCol="capsule")
    return estimator.fit(prostateDataset)
stopWordsRemover = StopWordsRemover( inputCol=tokenizer.getOutputCol(), outputCol="filtered", stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"], caseSensitive=False) ## Hash the words hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(), outputCol="wordToIndex", numFeatures=1 << 10) ## Create inverse document frequencies model idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4) gbm = H2OGBM(splitRatio=0.8, seed=1, featuresCols=[idf.getOutputCol()], labelCol="label") dl = H2ODeepLearning(epochs=10, seed=1, l1=0.001, l2=0.0, hidden=[200, 200], featuresCols=[idf.getOutputCol()], labelCol="label") automl = H2OAutoML( convertUnknownCategoricalLevelsToNa=True, maxRuntimeSecs=60 * 100, # 100 minutes maxModels=10, seed=1,
inputCol=tokenizer.getOutputCol(), outputCol="filtered", stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"], caseSensitive=False) ## Hash the words hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(), outputCol="wordToIndex", numFeatures=1 << 10) ## Create inverse document frequencies model idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4) ## Create GBM model gbm = H2OGBM(ratio=0.8, featuresCols=[idf.getOutputCol()], predictionCol="label") ## Remove all helper columns colPruner = ColumnPruner(columns=[ idf.getOutputCol(), hashingTF.getOutputCol(), stopWordsRemover.getOutputCol(), tokenizer.getOutputCol() ]) ## Create the pipeline by defining all the stages pipeline = Pipeline( stages=[tokenizer, stopWordsRemover, hashingTF, idf, gbm, colPruner]) ## Train the pipeline model
def testPipelineSerializationGBM(prostateDataset):
    """Run the shared grid-search serialization scenario with a default GBM."""
    algo = H2OGBM()
    gridSearchTester(algo, prostateDataset)