예제 #1
0
def testMOJOModelReturnsSameResultAsBinaryModelWhenOffsetColumnsIsSet(
        hc, dataset):
    """Check that a MOJO exported from an offset-trained GBM scores like the binary model.

    A tweedie GBM is trained on the 80% split with ``Offset`` as the offset
    column, exported as a MOJO into ``build/`` and loaded back through
    ``H2OMOJOModel``. The ``prediction`` column the MOJO produces on the 20%
    split must be identical to the binary model's predictions, and the MOJO
    model must report ``Offset`` as its offset column.
    """
    trainingDataset, testingDataset = dataset.randomSplit([0.8, 0.2], 1)
    trainingFrame = hc.as_h2o_frame(trainingDataset)
    testingFrame = hc.as_h2o_frame(testingDataset)

    estimatorParams = {
        "distribution": "tweedie",
        "ntrees": 600,
        "max_depth": 1,
        "min_rows": 1,
        "learn_rate": 0.1,
        "min_split_improvement": 0,
    }
    gbm = H2OGradientBoostingEstimator(**estimatorParams)
    gbm.train(
        x=["District", "Group", "Age"],
        y="Claims",
        training_frame=trainingFrame,
        offset_column="Offset",
    )

    outputDirectory = os.path.abspath("build/")
    mojoFile = gbm.download_mojo(path=outputDirectory, get_genmodel_jar=False)
    print(mojoFile)
    mojoModel = H2OMOJOModel.createFromMojo("file://" + mojoFile)

    binaryPredictions = gbm.predict(testingFrame)
    binaryModelResult = hc.as_spark_frame(binaryPredictions)
    mojoTransformed = mojoModel.transform(testingDataset)
    mojoResult = mojoTransformed.select("prediction")

    unit_test_utils.assert_data_frames_are_identical(binaryModelResult,
                                                     mojoResult)
    offsetColumn = mojoModel.getOffsetCol()
    assert offsetColumn == "Offset", "Offset column must be propagated to the MOJO model."
예제 #2
0
def testTargetEncoderModelProduceSameResultsRegardlessSpecificationOfOutputCols(
        trainingDataset, testingDataset):
    """Check that custom output column names do not change target-encoder results.

    Two encoders with the same input columns, label column, holdout strategy
    ``"None"`` and zero noise are fitted on ``trainingDataset``. One keeps the
    default ``*_te`` output names (renamed to ``*_out`` afterwards), the other
    sets the ``*_out`` names explicitly. Both results of
    ``transformTrainingDataset(testingDataset)`` must be identical.
    """
    def fitAndTransformTestingDataset(encoder):
        fittedEncoder = encoder.fit(trainingDataset)
        return fittedEncoder.transformTrainingDataset(testingDataset)

    targetEncoderDefaultOutputCols = (
        H2OTargetEncoder()
        .setInputCols(["RACE", "DPROS", "DCAPS"])
        .setLabelCol("CAPSULE")
        .setHoldoutStrategy("None")
        .setNoise(0.0)
    )
    dataFrameDefaultOutputCols = fitAndTransformTestingDataset(
        targetEncoderDefaultOutputCols)
    # Align the default "<input>_te" names with the custom names used below.
    renames = [
        ("RACE_te", "RACE_out"),
        ("DPROS_te", "DPROS_out"),
        ("DCAPS_te", "DCAPS_out"),
    ]
    for defaultName, customName in renames:
        dataFrameDefaultOutputCols = dataFrameDefaultOutputCols.withColumnRenamed(
            defaultName, customName)

    targetEncoderCustomOutputCols = (
        H2OTargetEncoder()
        .setInputCols(["RACE", "DPROS", "DCAPS"])
        .setOutputCols(["RACE_out", "DPROS_out", "DCAPS_out"])
        .setLabelCol("CAPSULE")
        .setHoldoutStrategy("None")
        .setNoise(0.0)
    )
    dataFrameCustomOutputCols = fitAndTransformTestingDataset(
        targetEncoderCustomOutputCols)

    unit_test_utils.assert_data_frames_are_identical(
        dataFrameDefaultOutputCols, dataFrameCustomOutputCols)
예제 #3
0
def testPipelineSerialization(craiglistDataset):
    """Round-trip a tokenizer / stop-words / word2vec / GBM pipeline through disk.

    The unfitted pipeline is saved under ``build/w2v_pipeline``, loaded back
    and fitted on the 90% split. The fitted model is then saved under
    ``build/w2v_pipeline_model`` and reloaded; the reloaded model must
    transform the 10% split exactly as the in-memory model does.
    """
    trainingDataset, testingDataset = craiglistDataset.randomSplit([0.9, 0.1], 42)
    pipelineUri = "file://" + os.path.abspath("build/w2v_pipeline")
    pipelineModelUri = "file://" + os.path.abspath("build/w2v_pipeline_model")

    tokenizer = RegexTokenizer(
        inputCol="jobtitle",
        minTokenLength=2,
        outputCol="tokenized",
    )
    stopWordsRemover = StopWordsRemover(
        inputCol=tokenizer.getOutputCol(),
        outputCol="stopWordsRemoved",
    )
    w2v = H2OWord2Vec(
        sentSampleRate=0,
        epochs=10,
        inputCol=stopWordsRemover.getOutputCol(),
        outputCol="w2v",
    )
    gbm = H2OGBM(labelCol="category", featuresCols=[w2v.getOutputCol()])
    stages = [tokenizer, stopWordsRemover, w2v, gbm]

    Pipeline(stages=stages).write().overwrite().save(pipelineUri)
    restoredPipeline = Pipeline.load(pipelineUri)
    fittedModel = restoredPipeline.fit(trainingDataset)
    expected = fittedModel.transform(testingDataset)

    fittedModel.write().overwrite().save(pipelineModelUri)
    restoredModel = PipelineModel.load(pipelineModelUri)
    result = restoredModel.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
예제 #4
0
def testH2OAutoMLRegressorBehavesTheSameAsGenericH2OAutoMLOnNumericLabelColumn(prostateDataset):
    """Check that ``H2OAutoMLRegressor`` and ``H2OAutoML`` give identical output.

    Both estimators are configured through ``setParametersForTesting``, fitted
    on the same 90% split and applied to the same 10% split; the two
    transformed data frames must be identical.
    """
    trainingDataset, testingDataset = prostateDataset.randomSplit([0.9, 0.1], 42)

    genericAutoML = setParametersForTesting(H2OAutoML())
    referenceDataset = genericAutoML.fit(trainingDataset).transform(testingDataset)

    regressor = setParametersForTesting(H2OAutoMLRegressor())
    result = regressor.fit(trainingDataset).transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(referenceDataset, result)
예제 #5
0
def testH2OAutoMLClassifierBehavesTheSameAsGenericH2OAutoMLOnStringLabelColumn(prostateDataset):
    """Check that ``H2OAutoMLClassifier`` matches ``H2OAutoML`` on a string label.

    The generic ``H2OAutoML`` is fitted on the training split with ``CAPSULE``
    cast to string, while ``H2OAutoMLClassifier`` is fitted on the training
    split as-is. Both models must produce identical data frames for the
    testing split.
    """
    trainingDataset, testingDataset = prostateDataset.randomSplit([0.9, 0.1], 42)

    genericAutoML = setParametersForTesting(H2OAutoML())
    capsuleAsString = col("CAPSULE").cast("string")
    stringLabelTrainingDataset = trainingDataset.withColumn("CAPSULE", capsuleAsString)
    referenceDataset = genericAutoML.fit(stringLabelTrainingDataset).transform(testingDataset)

    classifier = setParametersForTesting(H2OAutoMLClassifier())
    result = classifier.fit(trainingDataset).transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(referenceDataset, result)
예제 #6
0
def testTargetEncoderMOJOModelCouldBeSavedAndLoaded(trainingDataset, testingDataset):
    """Check that a fitted target-encoder model survives a save/load round trip.

    The model fitted on ``trainingDataset`` is written under ``build/`` and
    read back with ``H2OTargetEncoderMOJOModel.load``; the loaded model must
    transform ``testingDataset`` exactly as the original model does.
    """
    encoderParams = {
        "foldCol": "ID",
        "labelCol": "CAPSULE",
        "inputCols": ["RACE", "DPROS", "DCAPS"],
        "outputCols": ["RACE_out", "DPROS_out", "DCAPS_out"],
    }
    fittedModel = H2OTargetEncoder(**encoderParams).fit(trainingDataset)

    modelUri = "file://" + os.path.abspath("build/testTargetEncoderMOJOModelCouldBeSavedAndLoaded")
    fittedModel.write().overwrite().save(modelUri)
    restoredModel = H2OTargetEncoderMOJOModel.load(modelUri)

    expected = fittedModel.transform(testingDataset)
    result = restoredModel.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
예제 #7
0
def testTargetEncoderModelWithDisabledNoiseAndTargetEncoderMOJOModelTransformTheTrainingDatasetSameWay(trainingDataset):
    """Check ``transformTrainingDataset`` and ``transform`` agree without noise or holdout.

    The encoder uses grouped input columns, holdout strategy ``"None"`` and
    noise ``0.0``. Applying the fitted model to ``trainingDataset`` through
    either method must yield identical data frames.
    """
    targetEncoder = (
        H2OTargetEncoder()
        .setInputCols([["RACE"], ["DPROS", "DCAPS"]])
        .setLabelCol("CAPSULE")
        .setHoldoutStrategy("None")
        .setNoise(0.0)
    )
    fittedModel = targetEncoder.fit(trainingDataset)

    viaTrainingTransform = fittedModel.transformTrainingDataset(trainingDataset)
    viaRegularTransform = fittedModel.transform(trainingDataset)

    unit_test_utils.assert_data_frames_are_identical(viaTrainingTransform, viaRegularTransform)
예제 #8
0
def testProducedMOJOModelAndLoadedMOJOModelReturnsSameResult(trainingDataset, testingDataset):
    """Check that a saved and reloaded target-encoder pipeline model scores identically.

    A single-stage pipeline holding an ``H2OTargetEncoder`` is fitted, written
    under ``build/`` and loaded as a ``PipelineModel``; both the fitted and the
    loaded model must transform ``testingDataset`` to identical data frames.
    """
    encoderStage = H2OTargetEncoder(
        labelCol="CAPSULE",
        inputCols=[["RACE"], ["DPROS", "DCAPS"]],
    )
    producedModel = Pipeline(stages=[encoderStage]).fit(trainingDataset)

    modelUri = "file://" + os.path.abspath("build/testProducedMOJOModelAndLoadedMOJOModelReturnsSameResult")
    producedModel.write().overwrite().save(modelUri)
    loadedModel = PipelineModel.load(modelUri)

    fromProducedModel = producedModel.transform(testingDataset)
    fromLoadedModel = loadedModel.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(fromProducedModel, fromLoadedModel)
def testH2OFrameOfSpecificTypeToDataframe(spark, hc, data, sparkType):
    """Check that an H2OFrame converts to a Spark frame with the expected values and type.

    ``data`` is loaded twice: as a single-column Spark data frame whose
    non-nullable column ``A`` has type ``sparkType``, and as an ``H2OFrame``
    with the same column name. Converting the H2O frame with
    ``hc.asSparkFrame`` must reproduce both the content and the ``dtypes`` of
    the Spark data frame.
    """
    columnName = 'A'
    columnField = StructField(columnName, sparkType, False)
    schema = StructType([columnField])

    # Each value becomes a one-element row so it matches the single-column schema.
    rows = [(value,) for value in data]
    originalDF = spark.createDataFrame(rows, schema)
    frame = h2o.H2OFrame(data, column_names=[columnName])

    transformedDF = hc.asSparkFrame(frame)

    unit_test_utils.assert_data_frames_are_identical(originalDF, transformedDF)
    assert originalDF.dtypes == transformedDF.dtypes
예제 #10
0
def testGridSearchWithDRFRegressorBehavesTheSameAsGridSearchWithGenericDRFOnNumericLabelColumn(
        prostateDataset):
    """Check that grid search over ``H2ODRFRegressor`` matches grid search over ``H2ODRF``.

    Both grids come from ``createGridForProblemSpecificTesting``, are fitted on
    the same 90% split and applied to the same 10% split; the transformed data
    frames must be identical.
    """
    trainingDataset, testingDataset = prostateDataset.randomSplit([0.9, 0.1], 42)

    genericGrid = createGridForProblemSpecificTesting(H2ODRF())
    referenceDataset = genericGrid.fit(trainingDataset).transform(testingDataset)

    regressorGrid = createGridForProblemSpecificTesting(H2ODRFRegressor())
    result = regressorGrid.fit(trainingDataset).transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(referenceDataset, result)
예제 #11
0
def testLoadAndTrainMojo(hc, spark):
    """Compare a reference MOJO from test resources with a freshly trained one.

    The prostate CSV is read into Spark, converted to an H2O frame with
    ``CAPSULE`` turned into a factor, and used to train a bernoulli GBM
    (2 trees, seed 42). The MOJO downloaded from that model must transform the
    Spark data frame exactly as the reference ``binom_model_prostate.mojo`` does.
    """
    referenceMojoUri = "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo")
    referenceMojo = H2OMOJOModel.createFromMojo(referenceMojoUri)

    prostateCsvUri = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    df = spark.read.csv(prostateCsvUri, header=True, inferSchema=True)

    frame = hc.asH2OFrame(df)
    frame["CAPSULE"] = frame["CAPSULE"].asfactor()
    gbm = H2OGradientBoostingEstimator(
        distribution="bernoulli",
        ntrees=2,
        seed=42,
    )
    gbm.train(y="CAPSULE", training_frame=frame)

    outputDirectory = os.path.abspath("build/")
    mojoFile = gbm.download_mojo(path=outputDirectory, get_genmodel_jar=False)
    trainedMojo = H2OMOJOModel.createFromMojo("file://" + mojoFile)

    expect = referenceMojo.transform(df)
    result = trainedMojo.transform(df)

    unit_test_utils.assert_data_frames_are_identical(expect, result)
예제 #12
0
def testPipelineSerialization(prostateDataset):
    """Round-trip an isolation-forest pipeline and its fitted model through disk.

    The unfitted pipeline is saved under ``build/isolation_forest_pipeline``,
    loaded back and fitted. The fitted model is saved under
    ``build/isolation_forest_pipeline_model`` and reloaded; the reloaded model
    must transform ``prostateDataset`` exactly as the in-memory model does.
    """
    pipelineUri = "file://" + os.path.abspath("build/isolation_forest_pipeline")
    pipelineModelUri = "file://" + os.path.abspath("build/isolation_forest_pipeline_model")

    isolationForest = H2OIsolationForest(seed=1)
    Pipeline(stages=[isolationForest]).write().overwrite().save(pipelineUri)

    restoredPipeline = Pipeline.load(pipelineUri)
    fittedModel = restoredPipeline.fit(prostateDataset)
    expected = fittedModel.transform(prostateDataset)

    fittedModel.write().overwrite().save(pipelineModelUri)
    restoredModel = PipelineModel.load(pipelineModelUri)
    result = restoredModel.transform(prostateDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
예제 #13
0
def testGridSearchWithDRFClassifierBehavesTheSameAsGridSearchWithGenericDRFOnStringLabelColumn(
        prostateDataset):
    """Check that grid search over ``H2ODRFClassifier`` matches ``H2ODRF`` on a string label.

    The generic ``H2ODRF`` grid is fitted on the training split with
    ``CAPSULE`` cast to string, while the ``H2ODRFClassifier`` grid is fitted
    on the training split as-is. Both models must produce identical data
    frames for the testing split.
    """
    trainingDataset, testingDataset = prostateDataset.randomSplit([0.9, 0.1], 42)

    genericGrid = createGridForProblemSpecificTesting(H2ODRF())
    capsuleAsString = col("CAPSULE").cast("string")
    stringLabelTrainingDataset = trainingDataset.withColumn("CAPSULE", capsuleAsString)
    referenceDataset = genericGrid.fit(stringLabelTrainingDataset).transform(testingDataset)

    classifierGrid = createGridForProblemSpecificTesting(H2ODRFClassifier())
    result = classifierGrid.fit(trainingDataset).transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(referenceDataset, result)
예제 #14
0
def testPipelineSerialization(heartDataset):
    """Round-trip a CoxPH pipeline and its fitted model through disk.

    The unfitted pipeline is saved under ``build/cox_ph_pipeline``, loaded back
    and fitted on ``heartDataset``. The fitted model is saved under
    ``build/cox_ph_pipeline_model`` and reloaded; the reloaded model must
    transform ``heartDataset`` exactly as the in-memory model does.
    """
    features = ['age', 'year', 'surgery', 'transplant', 'start', 'stop']
    algo = H2OCoxPH(labelCol="event",
                    featuresCols=features,
                    startCol='start',
                    stopCol='stop')

    pipelinePath = "file://" + os.path.abspath("build/cox_ph_pipeline")
    # The fitted model gets its own directory so that it does not overwrite
    # the serialized unfitted pipeline stored at pipelinePath.
    pipelineModelPath = "file://" + os.path.abspath("build/cox_ph_pipeline_model")

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save(pipelinePath)
    loadedPipeline = Pipeline.load(pipelinePath)
    model = loadedPipeline.fit(heartDataset)
    expected = model.transform(heartDataset)

    model.write().overwrite().save(pipelineModelPath)
    loadedModel = PipelineModel.load(pipelineModelPath)
    result = loadedModel.transform(heartDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
예제 #15
0
 def _fit(self, dataset):
     """Assert on the incoming dataset and return a ``DummyTransformer``.

     ``dataset`` must be identical to ``expected`` and must differ in values
     from ``unexpected``. Both names are free variables defined outside this
     method; their definitions are not visible in this excerpt.
     """
     unit_test_utils.assert_data_frames_are_identical(expected, dataset)
     unit_test_utils.assert_data_frames_have_different_values(unexpected, dataset)
     return DummyTransformer()