def testPropagationOfPredictionCol(prostateDataset):
    """Check that a custom ``predictionCol`` name reaches the transformed output.

    Fits an ``H2OGAM`` configured with an explicit prediction column name on
    ``prostateDataset`` and asserts that the frame returned by the fitted
    model's ``transform`` contains a column with that name.
    """
    predictionCol = "my_prediction_col_name"
    algo = H2OGAM(
        featuresCols=["DPROS", "DCAPS", "RACE", "GLEASON"],
        gamCols=["PSA", "AGE"],
        labelCol="CAPSULE",
        seed=1,
        splitRatio=0.8,
        predictionCol=predictionCol,
    )

    model = algo.fit(prostateDataset)
    columns = model.transform(prostateDataset).columns

    # Assert membership directly instead of comparing the result to ``True``;
    # on failure pytest then reports the column name and the actual columns.
    assert predictionCol in columns
def testH2OGAMRegressorBehavesTheSameAsGenericH2OGAMOnNumericLabelColumn(
        prostateDataset):
    """Compare ``H2OGAMRegressor`` output with that of the generic ``H2OGAM``.

    The dataset is split 0.9 / 0.1 with seed 42. Both estimators are
    configured through ``setParamtersForProblemSpecificTests``, fitted on the
    training part and applied to the testing part; the two resulting data
    frames must be identical.
    """
    trainingData, testingData = prostateDataset.randomSplit([0.9, 0.1], 42)

    # Reference predictions come from the generic estimator.
    genericGam = setParamtersForProblemSpecificTests(H2OGAM())
    expected = genericGam.fit(trainingData).transform(testingData)

    # Predictions under test come from the problem-specific regressor.
    regressor = setParamtersForProblemSpecificTests(H2OGAMRegressor())
    actual = regressor.fit(trainingData).transform(testingData)

    unit_test_utils.assert_data_frames_are_identical(expected, actual)
def testH2OGAMClassifierBehavesTheSameAsGenericH2OGAMOnStringLabelColumn(
        prostateDataset):
    """Compare ``H2OGAMClassifier`` output with generic ``H2OGAM`` on a string label.

    The dataset is split 0.9 / 0.1 with seed 42. The generic estimator is
    fitted on the training part with ``CAPSULE`` cast to string, while the
    classifier is fitted on the unmodified training part. Both fitted models
    transform the same testing part and the results must be identical.
    """
    trainingData, testingData = prostateDataset.randomSplit([0.9, 0.1], 42)

    # The generic estimator sees the label column as a string.
    stringLabelTrainingData = trainingData.withColumn(
        "CAPSULE", col("CAPSULE").cast("string"))
    genericGam = setParamtersForProblemSpecificTests(H2OGAM())
    expected = genericGam.fit(stringLabelTrainingData).transform(testingData)

    # The classifier is trained on the original (uncast) training data.
    gamClassifier = setParamtersForProblemSpecificTests(H2OGAMClassifier())
    actual = gamClassifier.fit(trainingData).transform(testingData)

    unit_test_utils.assert_data_frames_are_identical(expected, actual)
def testPipelineSerialization(prostateDataset):
    """Round-trip a GAM pipeline and its fitted model through save and load.

    Saves an unfitted ``Pipeline`` holding a single ``H2OGAM`` stage, loads it
    back, fits the loaded pipeline on ``prostateDataset``, saves and reloads
    the fitted model, and finally runs ``transform(...).count()`` with the
    reloaded model. The test makes no explicit assertion; it passes when none
    of these steps raises.
    """
    gamStage = H2OGAM(
        featuresCols=["DPROS", "DCAPS", "RACE", "GLEASON"],
        gamCols=["PSA", "AGE"],
        labelCol="CAPSULE",
        seed=1,
        splitRatio=0.8,
    )

    # Unfitted pipeline: write to disk, then read it back.
    pipelineUri = "file://" + os.path.abspath("build/gam_pipeline")
    Pipeline(stages=[gamStage]).write().overwrite().save(pipelineUri)
    restoredPipeline = Pipeline.load(pipelineUri)

    # Fitted model: train with the restored pipeline, write, then read back.
    fittedModel = restoredPipeline.fit(prostateDataset)
    modelUri = "file://" + os.path.abspath("build/gam_pipeline_model")
    fittedModel.write().overwrite().save(modelUri)
    restoredModel = PipelineModel.load(modelUri)

    # Calling count() triggers evaluation of the transformed frame.
    restoredModel.transform(prostateDataset).count()
def testPipelineSerializationGAM(prostateDataset):
    """Run ``gridSearchTester`` with an ``H2OGAM`` on ``prostateDataset``.

    The estimator uses ``AGE`` as the label column and ``PSA`` as the only
    GAM column.

    NOTE(review): the test name mentions pipeline serialization, but the body
    only delegates to ``gridSearchTester`` - confirm what that helper checks.
    """
    gam = H2OGAM().setLabelCol("AGE").setGamCols(["PSA"])
    gridSearchTester(gam, prostateDataset)
def createInitialGamDefinition():
    """Build the ``H2OGAM`` estimator returned to callers of this helper.

    The estimator uses ``CAPSULE`` as the label column, ``PSA`` and ``AGE`` as
    GAM columns, seed 1 and a split ratio of 0.8. Feature columns are read
    from the ``featuresCols`` name, which is defined outside this function.
    """
    gamDefinition = H2OGAM(
        featuresCols=featuresCols,
        labelCol="CAPSULE",
        seed=1,
        splitRatio=0.8,
        gamCols=["PSA", "AGE"],
    )
    return gamDefinition