def testTargetEncoderModelProduceSameResultsRegardlessSpecificationOfOutputCols(trainingDataset, testingDataset):
    """Check that setting explicit output columns does not change the encoded data.

    Two encoders with the same input columns, label column, holdout strategy
    and zero noise are fitted on ``trainingDataset``. The first one sets no
    output columns; its ``RACE_te``/``DPROS_te``/``DCAPS_te`` columns are
    renamed to the ``*_out`` names that the second encoder sets explicitly.
    The two resulting data frames must be identical.
    """

    def fitAndEncodeTestingDataset(encoder):
        # Fit on the training data, then run the testing data through the
        # model's transformTrainingDataset method.
        fittedModel = encoder.fit(trainingDataset)
        return fittedModel.transformTrainingDataset(testingDataset)

    encoderWithoutOutputCols = (
        H2OTargetEncoder()
        .setInputCols(["RACE", "DPROS", "DCAPS"])
        .setLabelCol("CAPSULE")
        .setHoldoutStrategy("None")
        .setNoise(0.0)
    )
    expectedFrame = fitAndEncodeTestingDataset(encoderWithoutOutputCols)
    # Align the column names with the ones set explicitly on the second encoder.
    columnRenames = (
        ("RACE_te", "RACE_out"),
        ("DPROS_te", "DPROS_out"),
        ("DCAPS_te", "DCAPS_out"),
    )
    for currentName, newName in columnRenames:
        expectedFrame = expectedFrame.withColumnRenamed(currentName, newName)

    encoderWithOutputCols = (
        H2OTargetEncoder()
        .setInputCols(["RACE", "DPROS", "DCAPS"])
        .setOutputCols(["RACE_out", "DPROS_out", "DCAPS_out"])
        .setLabelCol("CAPSULE")
        .setHoldoutStrategy("None")
        .setNoise(0.0)
    )
    actualFrame = fitAndEncodeTestingDataset(encoderWithOutputCols)

    unit_test_utils.assert_data_frames_are_identical(expectedFrame, actualFrame)
def testPipelineWithTargetEncoderIsSerializable():
    """Save an unfitted target-encoder + GBM pipeline and compare the reloaded stages.

    The pipeline is written under ``build/`` (overwriting any previous copy),
    loaded back, and the loaded stages are checked against the originals.
    """
    encoder = H2OTargetEncoder(
        foldCol="ID",
        labelCol="CAPSULE",
        inputCols=["RACE", "DPROS", "DCAPS"],
        outputCols=["RACE_out", "DPROS_out", "DCAPS_out"],
        holdoutStrategy="KFold",
        blendedAvgEnabled=True,
        blendedAvgInflectionPoint=15.0,
        blendedAvgSmoothing=25.0,
        noise=0.05,
        noiseSeed=123,
    )
    # The GBM consumes exactly the columns the encoder is configured to emit.
    encodedCols = encoder.getOutputCols()
    estimator = H2OGBM().setLabelCol("CAPSULE").setFeaturesCols(encodedCols)

    savePath = "file://" + os.path.abspath("build/testPipelineWithTargetEncoderIsSerializable")
    Pipeline(stages=[encoder, estimator]).write().overwrite().save(savePath)

    restoredEncoder, restoredEstimator = Pipeline.load(savePath).getStages()

    assertTargetEncoderAndMOJOModelParamsAreEqual(encoder, restoredEncoder)
    assert estimator.getLabelCol() == restoredEstimator.getLabelCol()
    assert estimator.getFeaturesCols() == restoredEstimator.getFeaturesCols()
def createTargetEncoder():
    """Build an ``H2OTargetEncoder`` configured through its setters.

    Returns:
        The encoder with input columns RACE/DPROS/DCAPS, label column CAPSULE,
        holdout strategy "None", blended average disabled, noise 5.0 and
        noise seed 42.
    """
    encoder = (
        H2OTargetEncoder()
        .setInputCols(["RACE", "DPROS", "DCAPS"])
        .setLabelCol("CAPSULE")
        .setHoldoutStrategy("None")
        .setBlendedAvgEnabled(False)
        .setNoise(5.0)
        .setNoiseSeed(42)
    )
    return encoder
def testPipelineWithTargetEncoderTransformsTrainingAndTestingDatasetWithoutException(trainingDataset, testingDataset):
    """Fit a target-encoder + GBM pipeline and transform the testing data.

    There is no explicit assertion: the test passes when fitting on
    ``trainingDataset`` and collecting the transformed ``testingDataset``
    both finish without raising.
    """
    stages = [
        H2OTargetEncoder(labelCol="CAPSULE", inputCols=["RACE", "DPROS", "DCAPS"]),
        H2OGBM(labelCol="CAPSULE"),
    ]
    fittedPipeline = Pipeline(stages=stages).fit(trainingDataset)
    # collect() forces the transformation to actually be executed.
    fittedPipeline.transform(testingDataset).collect()
def testTargetEncoderConstructorParametersGetPropagatedToLoadedMOJOModel(self):
    """Check that constructor-supplied parameters survive a save/load round trip.

    The encoder is fitted inside a single-stage pipeline on
    ``self._trainingDataset``; the fitted pipeline is saved under ``build/``,
    loaded back, and its only stage is compared with the original encoder.
    """
    encoder = H2OTargetEncoder(
        foldCol="ID",
        labelCol="CAPSULE",
        inputCols=["RACE", "DPROS", "DCAPS"],
        holdoutStrategy="KFold",
        blendedAvgEnabled=True,
        blendedAvgInflectionPoint=15.0,
        blendedAvgSmoothing=25.0,
        noise=0.05,
        noiseSeed=123,
    )
    fittedPipeline = Pipeline(stages=[encoder]).fit(self._trainingDataset)

    savePath = "file://" + os.path.abspath(
        "build/testTargetEncoderConstructorParametersGetPropagatedToLoadedMOJOModel")
    fittedPipeline.write().overwrite().save(savePath)

    # The pipeline has a single stage, so index 0 is the loaded encoder model.
    restoredEncoderModel = PipelineModel.load(savePath).stages[0]

    self.assertTargetEncoderAndMOJOModelParamsAreEqual(encoder, restoredEncoderModel)
def testTargetEncoderMOJOModelCouldBeSavedAndLoaded(trainingDataset, testingDataset):
    """Check that a saved and reloaded encoder model transforms data identically.

    The model fitted on ``trainingDataset`` is written under ``build/`` and
    loaded through ``H2OTargetEncoderMOJOModel.load``; both the in-memory and
    the loaded model then transform ``testingDataset`` and the outputs are
    compared.
    """
    encoder = H2OTargetEncoder(
        foldCol="ID",
        labelCol="CAPSULE",
        inputCols=["RACE", "DPROS", "DCAPS"],
        outputCols=["RACE_out", "DPROS_out", "DCAPS_out"],
    )
    fittedModel = encoder.fit(trainingDataset)

    savePath = "file://" + os.path.abspath("build/testTargetEncoderMOJOModelCouldBeSavedAndLoaded")
    fittedModel.write().overwrite().save(savePath)
    restoredModel = H2OTargetEncoderMOJOModel.load(savePath)

    fromFittedModel = fittedModel.transform(testingDataset)
    fromRestoredModel = restoredModel.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(fromFittedModel, fromRestoredModel)
def testTargetEncoderModelWithDisabledNoiseAndTargetEncoderMOJOModelTransformTheTrainingDatasetSameWay(trainingDataset):
    """Compare ``transformTrainingDataset`` with ``transform`` on the training data.

    The encoder uses holdout strategy "None" and zero noise, and the input
    columns are given as groups: ``["RACE"]`` and ``["DPROS", "DCAPS"]``.
    Both methods of the fitted model are applied to ``trainingDataset`` and
    must yield identical data frames.
    """
    encoder = (
        H2OTargetEncoder()
        .setInputCols([["RACE"], ["DPROS", "DCAPS"]])
        .setLabelCol("CAPSULE")
        .setHoldoutStrategy("None")
        .setNoise(0.0)
    )
    fittedModel = encoder.fit(trainingDataset)

    viaTrainingTransform = fittedModel.transformTrainingDataset(trainingDataset)
    viaPlainTransform = fittedModel.transform(trainingDataset)

    unit_test_utils.assert_data_frames_are_identical(viaTrainingTransform, viaPlainTransform)
def testProducedMOJOModelAndLoadedMOJOModelReturnsSameResult(trainingDataset, testingDataset):
    """Check that a fitted pipeline and its reloaded copy transform data identically.

    A single-stage pipeline holding a target encoder (input columns grouped as
    ``["RACE"]`` and ``["DPROS", "DCAPS"]``) is fitted on ``trainingDataset``,
    saved under ``build/`` and loaded back. Both pipeline models transform
    ``testingDataset`` and the outputs are compared.
    """
    encoder = H2OTargetEncoder(
        labelCol="CAPSULE",
        inputCols=[["RACE"], ["DPROS", "DCAPS"]],
    )
    fittedPipeline = Pipeline(stages=[encoder]).fit(trainingDataset)

    savePath = "file://" + os.path.abspath("build/testProducedMOJOModelAndLoadedMOJOModelReturnsSameResult")
    fittedPipeline.write().overwrite().save(savePath)
    restoredPipeline = PipelineModel.load(savePath)

    fromFittedPipeline = fittedPipeline.transform(testingDataset)
    fromRestoredPipeline = restoredPipeline.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(fromFittedPipeline, fromRestoredPipeline)
def testTargetEncoderSetterParametersGetPropagatedToLoadedMOJOModel(self):
    """Check that setter-supplied parameters survive a save/load round trip.

    The encoder is configured through its setters, fitted inside a
    single-stage pipeline on ``self._trainingDataset``, saved under
    ``build/`` and loaded back; the loaded stage is then compared with the
    original encoder.
    """
    encoder = (
        H2OTargetEncoder()
        .setFoldCol("ID")
        .setLabelCol("CAPSULE")
        .setInputCols(["RACE", "DPROS", "DCAPS"])
        .setHoldoutStrategy("KFold")
        .setBlendedAvgEnabled(True)
        .setBlendedAvgInflectionPoint(15.0)
        .setBlendedAvgSmoothing(25.0)
        .setNoise(0.05)
        .setNoiseSeed(123)
    )
    fittedPipeline = Pipeline(stages=[encoder]).fit(self._trainingDataset)

    savePath = "file://" + os.path.abspath(
        "build/testTargetEncoderSetterParametersGetPropagatedToLoadedMOJOModel")
    fittedPipeline.write().overwrite().save(savePath)

    # The pipeline has a single stage, so index 0 is the loaded encoder model.
    restoredEncoderModel = PipelineModel.load(savePath).stages[0]

    self.assertTargetEncoderAndMOJOModelParamsAreEqual(encoder, restoredEncoderModel)