Exemplo n.º 1
0
def testXGBoostParameters(prostateDataset):
    """Fit a constrained H2OXGBoost and check its parameters against the model.

    The estimator is configured with five feature columns, monotone
    constraints on AGE and RACE and two interaction-constraint groups, fitted
    on ``prostateDataset``, and then handed together with the fitted model to
    ``compareParameterValues``.
    """
    settings = dict(
        seed=1,
        labelCol="CAPSULE",
        featuresCols=['AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA'],
        monotoneConstraints={'AGE': 1, 'RACE': -1},
        interactionConstraints=[['AGE', 'RACE', 'DPROS'], ['DCAPS', 'PSA']],
    )
    estimator = H2OXGBoost(**settings)
    fitted = estimator.fit(prostateDataset)
    compareParameterValues(estimator, fitted)
Exemplo n.º 2
0
def testFitXgboostWithoutError(prostateDataset):
    """Smoke test: fitting and scoring with H2OXGBoost must not raise.

    A two-tree bernoulli model is trained on ``prostateDataset``; the same
    dataset is then transformed, repartitioned to a single partition and
    collected. The collected rows are discarded - there are no assertions.
    """
    estimator = H2OXGBoost(
        labelCol="capsule",
        distribution="bernoulli",
        seed=42,
        ntrees=2,
    )
    fitted = estimator.fit(prostateDataset)

    scored = fitted.transform(prostateDataset)
    scored.repartition(1).collect()
Exemplo n.º 3
0
    def h2o_xgboost(df, label, columns, **kargs):
        """Train an H2OXGBoost model on ``df`` and return binary predictions.

        :param df: input dataframe.
        :param label: name of the label column; it is passed through
            ``string_to_index`` before training.
        :param columns: feature column names; they are passed through
            ``vector_assembler`` and used as ``featuresCols``.
        :param kargs: extra keyword arguments forwarded to ``H2OXGBoost``.
        :return: tuple ``(df_pred, model)`` where ``df_pred`` is the scored
            dataframe with a ``prediction`` column set to 1.0 when
            ``prediction_output["p1"]`` is greater than 0.5 and 0.0 otherwise.
        """
        # Make sure an H2O context exists for the shared Spark session.
        H2OContext.getOrCreate(Spark.instance.spark)

        indexed = string_to_index(df, input_cols=label)
        assembled = vector_assembler(indexed, input_cols=columns)

        estimator = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True,
                               featuresCols=columns,
                               labelCol=label,
                               **kargs)
        model = estimator.fit(assembled)
        scored = model.transform(assembled)

        # Turn the "p1" value into a hard 0.0 / 1.0 label.
        is_positive = scored.prediction_output["p1"] > 0.5
        df_pred = scored.withColumn("prediction", when(is_positive, 1.0).otherwise(0.0))

        return df_pred, model
Exemplo n.º 4
0
def testPipelineSerializationXGBoost(prostateDataset):
    """Run ``gridSearchTester`` with an H2OXGBoost whose label column is "AGE"."""
    estimator = H2OXGBoost().setLabelCol("AGE")
    gridSearchTester(estimator, prostateDataset)
def testPipelineSerializationXGBoost(prostateDataset):
    """Run ``gridSearchTester`` with a default-configured H2OXGBoost."""
    estimator = H2OXGBoost()
    gridSearchTester(estimator, prostateDataset)
                     seed=1,
                     l1=0.001,
                     l2=0.0,
                     hidden=[200, 200],
                     featuresCols=[idf.getOutputCol()],
                     labelCol="label")

# AutoML estimator: reads the label from the "label" column, is limited to
# 10 models and a 6000 second runtime, and uses a fixed seed.
automl = H2OAutoML(
    convertUnknownCategoricalLevelsToNa=True,
    maxRuntimeSecs=60 * 100,  # 100 minutes
    maxModels=10,
    seed=1,
    labelCol="label")

# XGBoost estimator: its only feature column is the output column of `idf`
# (`idf` is defined outside this excerpt); the label is read from "label".
xgboost = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True,
                     featuresCols=[idf.getOutputCol()],
                     labelCol="label")

# `load` is defined outside this excerpt.
# NOTE(review): presumably it returns the training dataframe - confirm.
data = load()


def trainPipelineModel(idf, hashingTF, stopWordsRemover, tokenizer, algoStage,
                       data):
    ## Remove all helper columns
    colPruner = ColumnPruner(columns=[
        idf.getOutputCol(),
        hashingTF.getOutputCol(),
        stopWordsRemover.getOutputCol(),
        tokenizer.getOutputCol()
    ])
Exemplo n.º 7
0
#

from pysparkling import *
from pyspark.sql import SparkSession
import h2o

from pysparkling.ml import H2OXGBoost

# End-to-end example: start Spark and H2O, train an H2OXGBoost classifier on
# the prostate dataset, score the held-out split, then shut everything down.
# The nested try/finally blocks guarantee that the H2O context and the Spark
# session are stopped even when a step in between fails.

# Start Cluster
spark = SparkSession.builder.appName("App name").getOrCreate()
try:
    hc = H2OContext.getOrCreate()
    try:
        # SparkConf.get returns None when the key is not set (e.g. local mode
        # or dynamic allocation); int(None) would raise TypeError, so the
        # cluster size is only verified when an executor count is configured.
        executorInstances = spark.sparkContext.getConf().get(
            "spark.executor.instances")
        if executorInstances is not None:
            assert h2o.cluster().cloud_size == int(executorInstances)

        # Prepare Data
        frame = h2o.import_file(
            "https://raw.githubusercontent.com/h2oai/sparkling-water/master/examples/smalldata/prostate/prostate.csv"
        )
        sparkDF = hc.asSparkFrame(frame)
        # CAPSULE is cast to string before training.
        # NOTE(review): presumably so it is treated as a categorical label - confirm.
        sparkDF = sparkDF.withColumn("CAPSULE", sparkDF.CAPSULE.cast("string"))
        # No seed is passed, so the split differs from run to run.
        [trainingDF, testingDF] = sparkDF.randomSplit([0.8, 0.2])

        # Train Model
        estimator = H2OXGBoost(labelCol="CAPSULE")
        model = estimator.fit(trainingDF)

        # Run Predictions
        model.transform(testingDF).collect()
    finally:
        hc.stop()
finally:
    spark.stop()
Exemplo n.º 8
0
                                l2=0.0,
                                hidden=[200, 200],
                                featuresCols=[idf.getOutputCol()],
                                predictionCol="label")
elif algo == "automl":
    ## Create H2OAutoML model
    algoStage = H2OAutoML(
        convertUnknownCategoricalLevelsToNa=True,
        maxRuntimeSecs=60 * 100,  # 100 minutes
        maxModels=3,
        seed=1,
        predictionCol="label")
elif algo == "xgboost":
    ## Create H2OXGBoost model
    algoStage = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True,
                           featuresCols=[idf.getOutputCol()],
                           predictionCol="label")
## Remove all helper columns, i.e. the output column of every
## feature-preparation stage
colPruner = ColumnPruner(columns=list(
    stage.getOutputCol()
    for stage in (idf, hashingTF, stopWordsRemover, tokenizer)))

## Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[
    tokenizer,
    stopWordsRemover,
    hashingTF,
    idf,
    algoStage,
    colPruner,
])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available, this call stores the pipeline
## to a local file in the current directory. In case HDFS & Hadoop are available, this call stores the pipeline to HDFS home
Exemplo n.º 9
0
def testParams():
    """Check that every H2OXGBoost constructor argument is returned by its getter.

    The expected values live in a single table. The estimator is built from a
    deep copy of that table, and each parameter ``fooBar`` is then read back
    through ``getFooBar()`` and compared with the table entry. Keeping one
    table prevents the constructor call and the assertions from drifting
    apart, and the assertion message names the offending parameter.
    """
    import copy

    expected = {
        "modelId": None,
        "splitRatio": 1.0,
        "labelCol": "label",
        "weightCol": None,
        "featuresCols": [],
        "allStringColumnsToCategorical": True,
        "columnsToCategorical": [],
        "nfolds": 0,
        "keepCrossValidationPredictions": False,
        "keepCrossValidationFoldAssignment": False,
        "parallelizeCrossValidation": True,
        "seed": -1,
        "distribution": "AUTO",
        "convertUnknownCategoricalLevelsToNa": False,
        "quietMode": True,
        "ntrees": 50,
        "nEstimators": 0,
        "maxDepth": 6,
        "minRows": 1.0,
        "minChildWeight": 1.0,
        "learnRate": 0.3,
        "eta": 0.3,
        "learnRateAnnealing": 1.0,
        "sampleRate": 1.0,
        "subsample": 1.0,
        "colSampleRate": 1.0,
        "colSampleByLevel": 1.0,
        "colSampleRatePerTree": 1.0,
        "colSampleByTree": 1.0,
        "maxAbsLeafnodePred": 0.0,
        "maxDeltaStep": 0.0,
        "scoreTreeInterval": 0,
        "initialScoreInterval": 4000,
        "scoreInterval": 4000,
        "minSplitImprovement": 0.0,
        "gamma": 0.0,
        "nthread": -1,
        "maxBins": 256,
        "maxLeaves": 0,
        "minSumHessianInLeaf": 100.0,
        "minDataInLeaf": 0.0,
        "treeMethod": "auto",
        "growPolicy": "depthwise",
        "booster": "gbtree",
        "dmatrixType": "auto",
        "regLambda": 0.0,
        "regAlpha": 0.0,
        "sampleType": "uniform",
        "normalizeType": "tree",
        "rateDrop": 0.0,
        "oneDrop": False,
        "skipDrop": 0.0,
        "gpuId": 0,
        "backend": "auto",
        "foldCol": None,
        "predictionCol": "prediction",
        "detailedPredictionCol": "detailed_prediction",
        "withDetailedPredictionCol": False,
        "convertInvalidNumbersToNa": False,
    }

    # Pass copies so the list-valued expectations stay independent of
    # whatever the estimator does with its arguments.
    xgboost = H2OXGBoost(**copy.deepcopy(expected))

    for name, value in expected.items():
        # Getter naming convention: "get" + parameter name with its first
        # letter upper-cased (e.g. nEstimators -> getNEstimators).
        getter = getattr(xgboost, "get" + name[0].upper() + name[1:])
        actual = getter()
        assert actual == value, \
            "{}: expected {!r}, got {!r}".format(name, value, actual)
Exemplo n.º 10
0
# Notebook export: `spark_frame` is created in an earlier cell that is not
# part of this excerpt. Print its first four rows.
spark_frame.show(4)


# In[8]:


# The return value of describe() is not used here.
# NOTE(review): presumably it was displayed as the notebook cell's output - confirm.
spark_frame.describe()


# # 3. Train the model

# In[10]:


from pysparkling.ml import H2OXGBoost
# "AGE" is passed as predictionCol.
# NOTE(review): presumably this names the column to learn in the Sparkling
# Water version this notebook targets; other snippets use labelCol - confirm.
estimator = H2OXGBoost(predictionCol="AGE")
model = estimator.fit(spark_frame)


# # 4. Run Predictions

# In[12]:


# Score the same frame that the model was trained on.
predictions = model.transform(spark_frame)


# In[13]:


# Print the first four rows of the scored frame.
predictions.show(4)
Exemplo n.º 11
0
 def createInitialXGBoostDefinition():
     """Build the baseline H2OXGBoost estimator.

     Uses ``featureCols`` from the enclosing scope as feature columns,
     "CAPSULE" as the label column, seed 1 and a split ratio of 0.8.
     """
     settings = {
         "featuresCols": featureCols,
         "labelCol": "CAPSULE",
         "seed": 1,
         "splitRatio": 0.8,
     }
     return H2OXGBoost(**settings)