def createInitialDeepLearningDefinition():
    """Return the baseline H2ODeepLearning estimator definition.

    The estimator is created with a fixed seed (42), ``reproducible=True``,
    "CAPSULE" as the label column, four feature columns and ``hidden=[3]``.
    """
    settings = dict(
        seed=42,
        reproducible=True,
        labelCol="CAPSULE",
        featuresCols=["AGE", "RACE", "DPROS", "DCAPS"],
        hidden=[3],
    )
    return H2ODeepLearning(**settings)
Пример #2
0
def testLoadAndTrainMojo(prostateDataset):
    """Check that a stored deep-learning MOJO and a freshly fitted model agree.

    A MOJO is loaded from the test resources, an H2ODeepLearning estimator
    (seed 42, reproducible, label column "CAPSULE") is fitted on
    ``prostateDataset``, and the rows produced by both ``transform`` calls
    are compared one by one.
    """
    mojoPath = os.path.abspath(
        "../ml/src/test/resources/deep_learning_prostate.mojo")
    loadedMojo = H2OMOJOModel.createFromMojo("file://" + mojoPath)

    estimator = H2ODeepLearning(seed=42, reproducible=True, labelCol="CAPSULE")
    trainedModel = estimator.fit(prostateDataset)

    # Both results are collapsed to one partition before collecting, exactly
    # as the comparison below relies on positional pairing of rows.
    mojoRows = loadedMojo.transform(prostateDataset).repartition(1).collect()
    modelRows = trainedModel.transform(prostateDataset).repartition(1).collect()

    assert len(mojoRows) == len(modelRows)
    for mojoRow, modelRow in zip(mojoRows, modelRows):
        assert mojoRow == modelRow
Пример #3
0
def testParams():
    """Verify that every constructor argument is returned by its getter.

    An H2ODeepLearning instance is built with an explicit value for each
    parameter listed below, and each ``get<Param>()`` accessor is then
    asserted to hand back the same value.
    """
    dl = H2ODeepLearning(modelId=None,
                         splitRatio=1.0,
                         labelCol="label",
                         weightCol=None,
                         featuresCols=[],
                         allStringColumnsToCategorical=True,
                         columnsToCategorical=[],
                         nfolds=0,
                         keepCrossValidationPredictions=False,
                         keepCrossValidationFoldAssignment=False,
                         parallelizeCrossValidation=True,
                         seed=-1,
                         distribution="AUTO",
                         epochs=10.0,
                         l1=0.0,
                         l2=0.0,
                         hidden=[200, 200],
                         reproducible=False,
                         convertUnknownCategoricalLevelsToNa=False,
                         foldCol=None,
                         predictionCol="prediction",
                         detailedPredictionCol="detailed_prediction",
                         withDetailedPredictionCol=False,
                         convertInvalidNumbersToNa=False)

    # None is a singleton: compare by identity so that an object with a
    # permissive __eq__ cannot satisfy the check by accident.
    assert dl.getModelId() is None
    assert dl.getSplitRatio() == 1.0
    assert dl.getLabelCol() == "label"
    assert dl.getWeightCol() is None
    assert dl.getFeaturesCols() == []
    assert dl.getAllStringColumnsToCategorical() == True
    assert dl.getColumnsToCategorical() == []
    assert dl.getNfolds() == 0
    assert dl.getKeepCrossValidationPredictions() == False
    assert dl.getKeepCrossValidationFoldAssignment() == False
    assert dl.getParallelizeCrossValidation() == True
    assert dl.getSeed() == -1
    assert dl.getDistribution() == "AUTO"
    assert dl.getEpochs() == 10.0
    assert dl.getL1() == 0.0
    assert dl.getL2() == 0.0
    assert dl.getHidden() == [200, 200]
    assert dl.getReproducible() == False
    assert dl.getConvertUnknownCategoricalLevelsToNa() == False
    assert dl.getFoldCol() is None
    assert dl.getPredictionCol() == "prediction"
    assert dl.getDetailedPredictionCol() == "detailed_prediction"
    assert dl.getWithDetailedPredictionCol() == False
    assert dl.getConvertInvalidNumbersToNa() == False
Пример #4
0
    def test_load_mojo_deeplearning(self):
        """Compare predictions of a stored MOJO with those of a newly fitted model.

        The prostate CSV is uploaded and converted to a Spark frame, an
        H2ODeepLearning estimator is fitted on it, and the collected output
        of ``mojo.predict`` is compared row by row with the collected output
        of ``model.transform``.
        """
        from pysparkling.ml import H2OMOJOModel, H2ODeepLearning

        mojo_path = os.path.abspath("../ml/src/test/resources/deep_learning_prostate.mojo")
        loaded_mojo = H2OMOJOModel.create_from_mojo("file://" + mojo_path)

        csv_path = unit_test_utils.locate("smalldata/prostate/prostate.csv")
        prostate_frame = self._hc.as_spark_frame(h2o.upload_file(csv_path))

        estimator = H2ODeepLearning(seed=42, reproducible=True, predictionCol="CAPSULE")
        fitted_model = estimator.fit(prostate_frame)

        mojo_rows = loaded_mojo.predict(prostate_frame).repartition(1).collect()
        model_rows = fitted_model.transform(prostate_frame).repartition(1).collect()

        assert len(mojo_rows) == len(model_rows)
        for mojo_row, model_row in zip(mojo_rows, model_rows):
            assert mojo_row == model_row
Пример #5
0
    def h2o_deeplearning(df, label, columns, **kargs):
        """Fit an H2ODeepLearning model on ``df`` and return thresholded predictions.

        :param df: input dataframe; passed through ``string_to_index`` and
            ``vector_assembler`` before fitting.
        :param label: column handed to ``string_to_index`` and used as ``labelCol``.
        :param columns: columns handed to ``vector_assembler`` and used as
            ``featuresCols``.
        :param kargs: extra keyword arguments forwarded to ``H2ODeepLearning``.
        :return: tuple ``(df_pred, model)`` where ``df_pred`` is the transformed
            dataframe with a "prediction" column of 1.0 / 0.0.
        """
        # Called for its effect only; the returned context is not used here.
        H2OContext.getOrCreate(Spark.instance.spark)

        indexed = string_to_index(df, input_cols=label)
        assembled = vector_assembler(indexed, input_cols=columns)

        estimator = H2ODeepLearning(
            epochs=10,
            seed=1,
            l1=0.001,
            l2=0.0,
            hidden=[200, 200],
            featuresCols=columns,
            labelCol=label,
            **kargs
        )
        model = estimator.fit(assembled)
        scored = model.transform(assembled)

        # "prediction" becomes 1.0 where the "p1" entry of prediction_output
        # exceeds 0.5, and 0.0 otherwise.
        above_threshold = scored.prediction_output["p1"] > 0.5
        df_pred = scored.withColumn("prediction", when(above_threshold, 1.0).otherwise(0.0))

        return df_pred, model
Пример #6
0
    outputCol="filtered",
    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
    caseSensitive=False)

## Hash the output of the stop-words remover into term-frequency features
hashingTF = HashingTF(
    inputCol=stopWordsRemover.getOutputCol(),
    outputCol="wordToIndex",
    numFeatures=1 << 10
)

## Inverse document frequency stage reading the hashed column
idf = IDF(
    inputCol=hashingTF.getOutputCol(),
    outputCol="tf_idf",
    minDocFreq=4
)

## H2ODeepLearning stage using the IDF output column as its only feature
dl = H2ODeepLearning(
    epochs=10,
    l1=0.001,
    l2=0.0,
    hidden=[200, 200],
    featuresCols=[idf.getOutputCol()],
    predictionCol="label"
)

## Column pruner listing the output column of every helper stage
colPruner = ColumnPruner(
    columns=[
        idf.getOutputCol(),
        hashingTF.getOutputCol(),
        stopWordsRemover.getOutputCol(),
        tokenizer.getOutputCol()
    ]
)

## Assemble the pipeline from all of the stages
pipeline = Pipeline(
    stages=[
        tokenizer,
        stopWordsRemover,
        hashingTF,
        idf,
        dl,
        colPruner
    ]
)
Пример #7
0
def testPipelineSerializationDeepLearning(prostateDataset):
    """Run ``gridSearchTester`` with an H2ODeepLearning whose label column is "AGE"."""
    # NOTE(review): another function with this exact name is defined directly
    # after this one; if both live in the same module the later definition
    # replaces this one -- confirm which variant is intended.
    algorithm = H2ODeepLearning().setLabelCol("AGE")
    gridSearchTester(algorithm, prostateDataset)
def testPipelineSerializationDeepLearning(prostateDataset):
    """Run ``gridSearchTester`` with a default-constructed H2ODeepLearning."""
    algorithm = H2ODeepLearning()
    gridSearchTester(algorithm, prostateDataset)
Пример #9
0
def testDeepLearningParameters(prostateDataset):
    """Fit H2ODeepLearning on ``prostateDataset`` and compare its parameters with the model's.

    The estimator uses seed 1, "CAPSULE" as the label column and five
    feature columns; the estimator and the fitted model are then passed to
    ``compareParameterValues``.
    """
    featureColumns = ['AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA']
    estimator = H2ODeepLearning(
        seed=1,
        labelCol="CAPSULE",
        featuresCols=featureColumns,
    )
    fittedModel = estimator.fit(prostateDataset)
    compareParameterValues(estimator, fittedModel)
## Inverse document frequency stage reading the hashed column
idf = IDF(
    inputCol=hashingTF.getOutputCol(),
    outputCol="tf_idf",
    minDocFreq=4
)

if algo == "gbm":
    ## Create GBM model
    algoStage = H2OGBM(splitRatio=0.8,
                       seed=1,
                       featuresCols=[idf.getOutputCol()],
                       labelCol="label")
elif algo == "dl":
    ## Create H2ODeepLearning model
    algoStage = H2ODeepLearning(epochs=10,
                                seed=1,
                                l1=0.001,
                                l2=0.0,
                                hidden=[200, 200],
                                featuresCols=[idf.getOutputCol()],
                                labelCol="label")
elif algo == "automl":
    ## Create H2OAutoML model
    algoStage = H2OAutoML(
        convertUnknownCategoricalLevelsToNa=True,
        maxRuntimeSecs=60 * 100,  # 100 minutes
        maxModels=3,
        seed=1,
        labelCol="label")
elif algo == "xgboost":
    ## Create H2OXGBoost model
    algoStage = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True,
                           featuresCols=[idf.getOutputCol()],