def testParams():
    automl = H2OAutoML(featuresCols=[],
                       labelCol="label",
                       allStringColumnsToCategorical=True,
                       columnsToCategorical=[],
                       splitRatio=1.0,
                       foldCol=None,
                       weightCol=None,
                       ignoredCols=[],
                       includeAlgos=["XGbooST"],
                       excludeAlgos=["DRF", "DeePLeArNING"],
                       projectName="test",
                       maxRuntimeSecs=3600.0,
                       stoppingRounds=3,
                       stoppingTolerance=0.001,
                       stoppingMetric="AUTO",
                       nfolds=5,
                       convertUnknownCategoricalLevelsToNa=True,
                       seed=-1,
                       sortMetric="AUTO",
                       balanceClasses=False,
                       classSamplingFactors=None,
                       maxAfterBalanceSize=5.0,
                       keepCrossValidationPredictions=True,
                       keepCrossValidationModels=True,
                       maxModels=0,
                       predictionCol="prediction",
                       detailedPredictionCol="detailed_prediction",
                       withDetailedPredictionCol=False,
                       convertInvalidNumbersToNa=False)

    assert automl.getFeaturesCols() == []
    assert automl.getLabelCol() == "label"
    assert automl.getAllStringColumnsToCategorical() == True
    assert automl.getColumnsToCategorical() == []
    assert automl.getSplitRatio() == 1.0
    assert automl.getFoldCol() == None
    assert automl.getWeightCol() == None
    assert automl.getIgnoredCols() == []
    # Algo names are deliberately passed in mixed case above; the getters
    # verify that they are normalized to canonical casing.
    assert automl.getIncludeAlgos() == ["XGBoost"]
    assert automl.getExcludeAlgos() == ["DRF", "DeepLearning"]
    assert automl.getProjectName() == "test"
    assert automl.getMaxRuntimeSecs() == 3600.0
    assert automl.getStoppingRounds() == 3
    assert automl.getStoppingTolerance() == 0.001
    assert automl.getStoppingMetric() == "AUTO"
    assert automl.getNfolds() == 5
    assert automl.getConvertUnknownCategoricalLevelsToNa() == True
    assert automl.getSeed() == -1
    assert automl.getSortMetric() == "AUTO"
    assert automl.getBalanceClasses() == False
    assert automl.getClassSamplingFactors() == None
    assert automl.getMaxAfterBalanceSize() == 5.0
    assert automl.getKeepCrossValidationPredictions() == True
    assert automl.getKeepCrossValidationModels() == True
    assert automl.getMaxModels() == 0
    assert automl.getPredictionCol() == "prediction"
    assert automl.getDetailedPredictionCol() == "detailed_prediction"
    assert automl.getWithDetailedPredictionCol() == False
    assert automl.getConvertInvalidNumbersToNa() == False
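
## A minimal sketch of a setter-driven counterpart to the test above, assuming
## the same setter naming pattern shown in getAlgorithmForGetLeaderboardTesting()
## below (setLabelCol/setMaxModels are assumptions based on that pattern):
def testSetters():
    automl = H2OAutoML()
    automl.setLabelCol("label")
    automl.setMaxModels(10)
    assert automl.getLabelCol() == "label"
    assert automl.getMaxModels() == 10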
def h2o_automl(df, label, columns, **kargs):
    H2OContext.getOrCreate(Spark.instance.spark)

    df_sti = string_to_index(df, input_cols=label)
    df_va = vector_assembler(df_sti, input_cols=columns)
    automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=True,
                       maxRuntimeSecs=60,  # 1 minute
                       seed=1,
                       maxModels=3,
                       labelCol=label + "_index",
                       **kargs)

    model = automl.fit(df_va)
    df_raw = model.transform(df_va)

    ## Binarize the model output at a 0.5 threshold
    df_pred = df_raw.withColumn("prediction",
                                when(df_raw.prediction_output["value"] > 0.5, 1.0)
                                .otherwise(0.0))

    return df_pred, model
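
## Hypothetical usage of the helper above, assuming a Spark DataFrame `df`
## with a string label column "label" and feature columns "a" and "b"
## (all of these names are illustrative, not from the original snippet):
df_pred, model = h2o_automl(df, label="label", columns=["a", "b"])
df_pred.select("label_index", "prediction").show()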
## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

## Create H2OAutoML model
automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=False,
                   seed=1,
                   maxRuntimeSecs=300,  # 5 minutes
                   predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[idf.getOutputCol(),
                                  hashingTF.getOutputCol(),
                                  stopWordsRemover.getOutputCol(),
                                  tokenizer.getOutputCol()])

## Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, automl, colPruner])

## Train the pipeline model
data = load()
model = pipeline.fit(data)

##
## Make predictions on unlabeled data
##

## Spam detector
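## A minimal sketch of such a detector, assuming a SparkSession named `spark`
## and that the fitted AutoML stage writes class probabilities into a
## `prediction_output` struct (the column layout and the 0.4 threshold are
## assumptions, not confirmed by the snippet above):
def isSpam(smsText, model, hamThreshold=0.4):
    smsTextDF = spark.createDataFrame([(smsText,)], ["text"])  # one-row frame holding the message
    prediction = model.transform(smsTextDF)                    # run the whole pipeline on the message
    return prediction.first()["prediction_output"]["p1"] > hamThreshold

print(isSpam("Michal, h2oworld party tonight in MV?", model))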
if algo == "gbm":
    ## Create H2OGBM model
    algoStage = H2OGBM(splitRatio=0.8,
                       seed=1,
                       featuresCols=[idf.getOutputCol()],
                       predictionCol="label")
elif algo == "dl":
    ## Create H2ODeepLearning model
    algoStage = H2ODeepLearning(epochs=10,
                                seed=1,
                                l1=0.001,
                                l2=0.0,
                                hidden=[200, 200],
                                featuresCols=[idf.getOutputCol()],
                                predictionCol="label")
elif algo == "automl":
    ## Create H2OAutoML model
    algoStage = H2OAutoML(convertUnknownCategoricalLevelsToNa=True,
                          maxRuntimeSecs=60,  # 1 minute
                          seed=1,
                          predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[idf.getOutputCol(),
                                  hashingTF.getOutputCol(),
                                  stopWordsRemover.getOutputCol(),
                                  tokenizer.getOutputCol()])

## Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])
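
## Train the pipeline model, mirroring the AutoML example above
## (`load()` is assumed to return the labeled training DataFrame):
data = load()
model = pipeline.fit(data)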
def getAlgorithmForGetLeaderboardTesting():
    automl = H2OAutoML(labelCol="CAPSULE", ignoredCols=["ID"])
    automl.setExcludeAlgos(["GLM"])
    automl.setMaxModels(5)
    automl.setSortMetric("AUC")
    return automl
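
## A minimal sketch of exercising the leaderboard, assuming a prostate-style
## training DataFrame named `trainingDF` with CAPSULE and ID columns, and that
## getLeaderboard() becomes available on the estimator once fit() has run:
automl = getAlgorithmForGetLeaderboardTesting()
model = automl.fit(trainingDF)
automl.getLeaderboard().show(truncate=False)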
gbm = H2OGBM(splitRatio=0.8,
             seed=1,
             featuresCols=[idf.getOutputCol()],
             labelCol="label")

dl = H2ODeepLearning(epochs=10,
                     seed=1,
                     l1=0.001,
                     l2=0.0,
                     hidden=[200, 200],
                     featuresCols=[idf.getOutputCol()],
                     labelCol="label")

automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=True,
                   maxRuntimeSecs=60 * 100,  # 100 minutes
                   maxModels=10,
                   seed=1,
                   labelCol="label")

xgboost = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True,
                     featuresCols=[idf.getOutputCol()],
                     labelCol="label")

data = load()


def trainPipelineModel(idf, hashingTF, stopWordsRemover, tokenizer, algoStage, data):
    ## Remove all helper columns
    colPruner = ColumnPruner(columns=[idf.getOutputCol(),
                                      hashingTF.getOutputCol(),
                                      stopWordsRemover.getOutputCol(),
                                      tokenizer.getOutputCol()])
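    ## A plausible continuation, assuming the same pipeline assembly as the
    ## examples above (the fit/return step is not part of the original snippet):
    pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])
    return pipeline.fit(data)


## Hypothetical usage: train one pipeline model per configured algorithm
for algoStage in [gbm, dl, automl, xgboost]:
    model = trainPipelineModel(idf, hashingTF, stopWordsRemover, tokenizer, algoStage, data)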