예제 #1
0
def testTargetEncoderModelProduceSameResultsRegardlessSpecificationOfOutputCols(
        trainingDataset, testingDataset):
    """Compare encodings produced with default and with explicit output column names.

    Two encoders sharing the same input columns, label column, holdout
    strategy ("None") and noise (0.0) are fitted on ``trainingDataset``. Only
    the second one sets its output column names. The ``*_te`` columns of the
    first result (presumably the encoder's default output names) are renamed
    to the ``*_out`` names used by the second, and both frames must match.
    """
    encodedColumns = ("RACE", "DPROS", "DCAPS")

    def fitAndTransformTestingDataset(encoder):
        # NOTE(review): the testing data is passed through
        # transformTrainingDataset, exactly as in the original - confirm intended.
        fittedModel = encoder.fit(trainingDataset)
        return fittedModel.transformTrainingDataset(testingDataset)

    encoderWithDefaultNames = (
        H2OTargetEncoder()
        .setInputCols(list(encodedColumns))
        .setLabelCol("CAPSULE")
        .setHoldoutStrategy("None")
        .setNoise(0.0)
    )
    frameWithDefaultNames = fitAndTransformTestingDataset(encoderWithDefaultNames)
    # Align the column names with the explicitly configured ones before comparing.
    for column in encodedColumns:
        frameWithDefaultNames = frameWithDefaultNames.withColumnRenamed(
            column + "_te", column + "_out")

    encoderWithCustomNames = (
        H2OTargetEncoder()
        .setInputCols(list(encodedColumns))
        .setOutputCols([column + "_out" for column in encodedColumns])
        .setLabelCol("CAPSULE")
        .setHoldoutStrategy("None")
        .setNoise(0.0)
    )
    frameWithCustomNames = fitAndTransformTestingDataset(encoderWithCustomNames)

    unit_test_utils.assert_data_frames_are_identical(
        frameWithDefaultNames, frameWithCustomNames)
예제 #2
0
def testPipelineWithTargetEncoderIsSerializable():
    """Save an unfitted target-encoder + GBM pipeline, reload it and compare stages.

    The pipeline is written to ``build/testPipelineWithTargetEncoderIsSerializable``
    (as a ``file://`` URI), loaded back, and the parameters of both reloaded
    stages are checked against the stages that were saved.
    """
    encoderSettings = dict(
        foldCol="ID",
        labelCol="CAPSULE",
        inputCols=["RACE", "DPROS", "DCAPS"],
        outputCols=["RACE_out", "DPROS_out", "DCAPS_out"],
        holdoutStrategy="KFold",
        blendedAvgEnabled=True,
        blendedAvgInflectionPoint=15.0,
        blendedAvgSmoothing=25.0,
        noise=0.05,
        noiseSeed=123,
    )
    targetEncoder = H2OTargetEncoder(**encoderSettings)
    # The GBM consumes exactly the columns the encoder is configured to emit.
    gbm = (
        H2OGBM()
        .setLabelCol("CAPSULE")
        .setFeaturesCols(targetEncoder.getOutputCols())
    )

    saveDir = os.path.abspath("build/testPipelineWithTargetEncoderIsSerializable")
    path = "file://" + saveDir
    Pipeline(stages=[targetEncoder, gbm]).write().overwrite().save(path)
    loadedTargetEncoder, loadedGbm = Pipeline.load(path).getStages()

    assertTargetEncoderAndMOJOModelParamsAreEqual(targetEncoder, loadedTargetEncoder)
    assert gbm.getLabelCol() == loadedGbm.getLabelCol()
    assert gbm.getFeaturesCols() == loadedGbm.getFeaturesCols()
예제 #3
0
 def createTargetEncoder():
     """Return a new H2OTargetEncoder with a fixed configuration.

     Input columns are RACE, DPROS and DCAPS, the label column is CAPSULE,
     the holdout strategy is "None", blended average is disabled, noise is
     5.0 and the noise seed is 42.
     """
     encoder = (
         H2OTargetEncoder()
         .setInputCols(["RACE", "DPROS", "DCAPS"])
         .setLabelCol("CAPSULE")
         .setHoldoutStrategy("None")
         .setBlendedAvgEnabled(False)
         .setNoise(5.0)
         .setNoiseSeed(42)
     )
     return encoder
예제 #4
0
def testPipelineWithTargetEncoderTransformsTrainingAndTestingDatasetWithoutException(trainingDataset, testingDataset):
    """Fit a target-encoder + GBM pipeline and collect the transformed testing data.

    Nothing is asserted about the values: the test passes when fitting on
    ``trainingDataset`` and collecting the transformed ``testingDataset``
    complete without raising.
    """
    stages = [
        H2OTargetEncoder(labelCol="CAPSULE", inputCols=["RACE", "DPROS", "DCAPS"]),
        H2OGBM(labelCol="CAPSULE"),
    ]
    fittedPipeline = Pipeline(stages=stages).fit(trainingDataset)

    transformed = fittedPipeline.transform(testingDataset)
    transformed.collect()
    def testTargetEncoderConstructorParametersGetPropagatedToLoadedMOJOModel(self):
        """Check that constructor-supplied encoder parameters survive save and load.

        A single-stage pipeline holding the encoder is fitted on
        ``self._trainingDataset``, saved under ``build/`` as a ``file://`` URI
        and reloaded. The first stage of the reloaded pipeline model is then
        compared with the original encoder's parameters.
        """
        targetEncoder = H2OTargetEncoder(
            foldCol="ID",
            labelCol="CAPSULE",
            inputCols=["RACE", "DPROS", "DCAPS"],
            holdoutStrategy="KFold",
            blendedAvgEnabled=True,
            blendedAvgInflectionPoint=15.0,
            blendedAvgSmoothing=25.0,
            noise=0.05,
            noiseSeed=123,
        )
        fittedPipeline = Pipeline(stages=[targetEncoder]).fit(self._trainingDataset)

        saveDir = os.path.abspath("build/testTargetEncoderConstructorParametersGetPropagatedToLoadedMOJOModel")
        path = "file://" + saveDir
        fittedPipeline.write().overwrite().save(path)

        # The pipeline has exactly one stage, so index 0 is the fitted encoder stage.
        mojoModel = PipelineModel.load(path).stages[0]

        self.assertTargetEncoderAndMOJOModelParamsAreEqual(targetEncoder, mojoModel)
예제 #6
0
def testTargetEncoderMOJOModelCouldBeSavedAndLoaded(trainingDataset, testingDataset):
    """Check that a saved and reloaded encoder model transforms data like the original.

    The encoder is fitted on ``trainingDataset``, the fitted model is written
    under ``build/`` as a ``file://`` URI and loaded back through
    ``H2OTargetEncoderMOJOModel.load``. Both models transform
    ``testingDataset`` and the two results must be identical.
    """
    encoder = H2OTargetEncoder(
        foldCol="ID",
        labelCol="CAPSULE",
        inputCols=["RACE", "DPROS", "DCAPS"],
        outputCols=["RACE_out", "DPROS_out", "DCAPS_out"],
    )
    fittedModel = encoder.fit(trainingDataset)

    saveDir = os.path.abspath("build/testTargetEncoderMOJOModelCouldBeSavedAndLoaded")
    path = "file://" + saveDir
    fittedModel.write().overwrite().save(path)
    reloadedModel = H2OTargetEncoderMOJOModel.load(path)

    unit_test_utils.assert_data_frames_are_identical(
        fittedModel.transform(testingDataset),
        reloadedModel.transform(testingDataset),
    )
예제 #7
0
def testTargetEncoderModelWithDisabledNoiseAndTargetEncoderMOJOModelTransformTheTrainingDatasetSameWay(trainingDataset):
    """Check that both transform methods agree on the training data when noise is 0.0.

    The encoder uses grouped input columns (``["RACE"]`` and
    ``["DPROS", "DCAPS"]``), label column CAPSULE and holdout strategy "None".
    The fitted model's ``transformTrainingDataset`` and ``transform`` are both
    applied to ``trainingDataset`` and their outputs must be identical.
    """
    encoder = (
        H2OTargetEncoder()
        .setInputCols([["RACE"], ["DPROS", "DCAPS"]])
        .setLabelCol("CAPSULE")
        .setHoldoutStrategy("None")
        .setNoise(0.0)
    )
    fittedModel = encoder.fit(trainingDataset)

    unit_test_utils.assert_data_frames_are_identical(
        fittedModel.transformTrainingDataset(trainingDataset),
        fittedModel.transform(trainingDataset),
    )
예제 #8
0
def testProducedMOJOModelAndLoadedMOJOModelReturnsSameResult(trainingDataset, testingDataset):
    """Check that a fitted pipeline and its reloaded copy transform data identically.

    A single-stage pipeline with a target encoder over grouped input columns
    is fitted on ``trainingDataset``, saved under ``build/`` as a ``file://``
    URI and reloaded as a ``PipelineModel``. Both models transform
    ``testingDataset`` and the resulting frames must be identical.
    """
    encoder = H2OTargetEncoder(
        labelCol="CAPSULE",
        inputCols=[["RACE"], ["DPROS", "DCAPS"]],
    )
    fittedPipeline = Pipeline(stages=[encoder]).fit(trainingDataset)

    saveDir = os.path.abspath("build/testProducedMOJOModelAndLoadedMOJOModelReturnsSameResult")
    path = "file://" + saveDir
    fittedPipeline.write().overwrite().save(path)
    reloadedPipeline = PipelineModel.load(path)

    unit_test_utils.assert_data_frames_are_identical(
        fittedPipeline.transform(testingDataset),
        reloadedPipeline.transform(testingDataset),
    )
    def testTargetEncoderSetterParametersGetPropagatedToLoadedMOJOModel(self):
        """Check that setter-supplied encoder parameters survive save and load.

        The encoder is configured through its setters only, wrapped in a
        single-stage pipeline and fitted on ``self._trainingDataset``. The
        fitted pipeline is saved under ``build/`` as a ``file://`` URI and
        reloaded, and its first stage is compared with the original encoder's
        parameters.
        """
        targetEncoder = (
            H2OTargetEncoder()
            .setFoldCol("ID")
            .setLabelCol("CAPSULE")
            .setInputCols(["RACE", "DPROS", "DCAPS"])
            .setHoldoutStrategy("KFold")
            .setBlendedAvgEnabled(True)
            .setBlendedAvgInflectionPoint(15.0)
            .setBlendedAvgSmoothing(25.0)
            .setNoise(0.05)
            .setNoiseSeed(123)
        )
        fittedPipeline = Pipeline(stages=[targetEncoder]).fit(self._trainingDataset)

        saveDir = os.path.abspath("build/testTargetEncoderSetterParametersGetPropagatedToLoadedMOJOModel")
        path = "file://" + saveDir
        fittedPipeline.write().overwrite().save(path)

        # The pipeline has exactly one stage, so index 0 is the fitted encoder stage.
        mojoModel = PipelineModel.load(path).stages[0]

        self.assertTargetEncoderAndMOJOModelParamsAreEqual(targetEncoder, mojoModel)