Пример #1
0
class ColumnPruner(H2OStageBase, JavaTransformer):
    """Transformer stage backed by the JVM class
    ``ai.h2o.sparkling.ml.features.ColumnPruner``.

    Declares two params, ``keep`` (boolean) and ``columns`` (list of strings),
    together with their getters and setters.
    """

    keep = Param(
        Params._dummy(),
        "keep",
        "keep the specified columns in the frame",
        H2OTypeConverters.toBoolean())

    columns = Param(
        Params._dummy(),
        "columns",
        "specified columns",
        H2OTypeConverters.toListString())

    @keyword_only
    def __init__(self, keep=False, columns=[]):
        """Create the backing Java object and apply the supplied keyword arguments.

        The ``keep`` and ``columns`` arguments are not referenced directly in
        this body; the values to apply are obtained from
        ``Utils.getInputKwargs(self)``.
        """
        Initializer.load_sparkling_jar()
        super(ColumnPruner, self).__init__()
        java_class_name = "ai.h2o.sparkling.ml.features.ColumnPruner"
        self._java_obj = self._new_java_obj(java_class_name, self.uid)
        # Defaults are pulled from the Java side first, then overridden by the
        # keyword arguments that were passed to the constructor.
        self._setDefaultValuesFromJava()
        self._set(**Utils.getInputKwargs(self))

    # --- keep ---------------------------------------------------------------
    def getKeep(self):
        """Return the value of the ``keep`` param (set value or default)."""
        return self.getOrDefault(self.keep)

    def setKeep(self, value):
        """Set the ``keep`` param and return the result of ``_set``."""
        return self._set(keep=value)

    # --- columns ------------------------------------------------------------
    def getColumns(self):
        """Return the value of the ``columns`` param (set value or default)."""
        return self.getOrDefault(self.columns)

    def setColumns(self, value):
        """Set the ``columns`` param and return the result of ``_set``."""
        return self._set(columns=value)
Пример #2
0
class H2OAutoMLParams(H2OCommonSupervisedParams, HasMonotoneConstraints):
    """Param declarations for AutoML, with one getter and one setter per param.

    Every getter returns ``self.getOrDefault(<param>)`` and every setter
    forwards the value to ``self._set(<param>=value)`` and returns its result.
    """
    ##
    # Param definitions
    ##
    ignoredCols = Param(
        Params._dummy(),
        "ignoredCols",
        "Ignored column names",
        H2OTypeConverters.toListString())

    includeAlgos = Param(
        Params._dummy(),
        "includeAlgos",
        "Algorithms to include when using automl",
        H2OTypeConverters.toEnumListString("ai.h2o.automl.Algo"))

    excludeAlgos = Param(
        Params._dummy(),
        "excludeAlgos",
        "Algorithms to exclude when using automl",
        H2OTypeConverters.toEnumListString("ai.h2o.automl.Algo"))

    projectName = Param(
        Params._dummy(),
        "projectName",
        "identifier for models that should be grouped together in the leaderboard "
        "(e.g., airlines and iris)",
        H2OTypeConverters.toNullableString())

    maxRuntimeSecs = Param(
        Params._dummy(),
        "maxRuntimeSecs",
        "Maximum time in seconds for automl to be running",
        H2OTypeConverters.toFloat())

    stoppingRounds = Param(
        Params._dummy(),
        "stoppingRounds",
        "Stopping rounds",
        H2OTypeConverters.toInt())

    stoppingTolerance = Param(
        Params._dummy(),
        "stoppingTolerance",
        "Stopping tolerance",
        H2OTypeConverters.toFloat())

    stoppingMetric = Param(
        Params._dummy(),
        "stoppingMetric",
        "Stopping metric",
        H2OTypeConverters.toEnumString("hex.ScoreKeeper$StoppingMetric"))

    sortMetric = Param(
        Params._dummy(),
        "sortMetric",
        "Sort metric for the AutoML leaderboard",
        H2OTypeConverters.toEnumString("ai.h2o.sparkling.ml.algos.H2OAutoMLSortMetric"))

    balanceClasses = Param(
        Params._dummy(),
        "balanceClasses",
        "Balance classes",
        H2OTypeConverters.toBoolean())

    classSamplingFactors = Param(
        Params._dummy(),
        "classSamplingFactors",
        "Class sampling factors",
        H2OTypeConverters.toNullableListFloat())

    maxAfterBalanceSize = Param(
        Params._dummy(),
        "maxAfterBalanceSize",
        "Max after balance size",
        H2OTypeConverters.toFloat())

    keepCrossValidationPredictions = Param(
        Params._dummy(),
        "keepCrossValidationPredictions",
        "Keep cross validation predictions",
        H2OTypeConverters.toBoolean())

    keepCrossValidationModels = Param(
        Params._dummy(),
        "keepCrossValidationModels",
        "Keep cross validation models",
        H2OTypeConverters.toBoolean())

    maxModels = Param(
        Params._dummy(),
        "maxModels",
        "Max models to train in AutoML",
        H2OTypeConverters.toInt())

    ##
    # Getters
    ##
    def getIgnoredCols(self):
        return self.getOrDefault(self.ignoredCols)

    # NOTE(review): no `tryMutations` Param is declared in this class; unless a
    # base class provides it, this getter fails on attribute lookup. Confirm
    # against H2OCommonSupervisedParams / HasMonotoneConstraints.
    def getTryMutations(self):
        return self.getOrDefault(self.tryMutations)

    def getExcludeAlgos(self):
        return self.getOrDefault(self.excludeAlgos)

    def getIncludeAlgos(self):
        return self.getOrDefault(self.includeAlgos)

    def getProjectName(self):
        return self.getOrDefault(self.projectName)

    def getMaxRuntimeSecs(self):
        return self.getOrDefault(self.maxRuntimeSecs)

    def getStoppingRounds(self):
        return self.getOrDefault(self.stoppingRounds)

    def getStoppingTolerance(self):
        return self.getOrDefault(self.stoppingTolerance)

    def getStoppingMetric(self):
        return self.getOrDefault(self.stoppingMetric)

    def getSortMetric(self):
        return self.getOrDefault(self.sortMetric)

    def getBalanceClasses(self):
        return self.getOrDefault(self.balanceClasses)

    def getClassSamplingFactors(self):
        return self.getOrDefault(self.classSamplingFactors)

    def getMaxAfterBalanceSize(self):
        return self.getOrDefault(self.maxAfterBalanceSize)

    def getKeepCrossValidationPredictions(self):
        return self.getOrDefault(self.keepCrossValidationPredictions)

    def getKeepCrossValidationModels(self):
        return self.getOrDefault(self.keepCrossValidationModels)

    def getMaxModels(self):
        return self.getOrDefault(self.maxModels)

    ##
    # Setters
    ##
    def setIgnoredCols(self, value):
        return self._set(ignoredCols=value)

    # NOTE(review): same concern as getTryMutations - `tryMutations` is not
    # declared among the Params of this class; verify it exists on a base class.
    def setTryMutations(self, value):
        return self._set(tryMutations=value)

    def setIncludeAlgos(self, value):
        return self._set(includeAlgos=value)

    def setExcludeAlgos(self, value):
        return self._set(excludeAlgos=value)

    def setProjectName(self, value):
        return self._set(projectName=value)

    def setMaxRuntimeSecs(self, value):
        return self._set(maxRuntimeSecs=value)

    def setStoppingRounds(self, value):
        return self._set(stoppingRounds=value)

    def setStoppingTolerance(self, value):
        return self._set(stoppingTolerance=value)

    def setStoppingMetric(self, value):
        return self._set(stoppingMetric=value)

    def setSortMetric(self, value):
        return self._set(sortMetric=value)

    def setBalanceClasses(self, value):
        return self._set(balanceClasses=value)

    def setClassSamplingFactors(self, value):
        return self._set(classSamplingFactors=value)

    def setMaxAfterBalanceSize(self, value):
        return self._set(maxAfterBalanceSize=value)

    def setKeepCrossValidationPredictions(self, value):
        return self._set(keepCrossValidationPredictions=value)

    def setKeepCrossValidationModels(self, value):
        return self._set(keepCrossValidationModels=value)

    def setMaxModels(self, value):
        return self._set(maxModels=value)
Пример #3
0
class H2OCommonParams(H2OBaseMOJOParams):
    """Declares ``validationDataFrame``, ``splitRatio`` and
    ``columnsToCategorical`` params with their getters and setters, and adds
    setters for params whose names are not declared in this class (the
    in-file comment says they are defined on MOJO as well).
    """
    ##
    # Param definitions
    ##
    validationDataFrame = Param(
        Params._dummy(), "validationDataFrame",
        "A data frame dedicated for a validation of the trained model. If the parameters is not set,"
        + "a validation frame created via the 'splitRatio' parameter.",
        H2OTypeConverters.toNullableDataFrame())

    splitRatio = Param(
        Params._dummy(), "splitRatio",
        "Accepts values in range [0, 1.0] which determine how large part of dataset is used for training"
        " and for validation. For example, 0.8 -> 80% training 20% validation.",
        H2OTypeConverters.toFloat())

    columnsToCategorical = Param(
        Params._dummy(), "columnsToCategorical",
        "List of columns to convert to categorical before modelling",
        H2OTypeConverters.toListString())

    ##
    # Getters
    ##
    def getValidationDataFrame(self):
        return self.getOrDefault(self.validationDataFrame)

    def getSplitRatio(self):
        return self.getOrDefault(self.splitRatio)

    def getColumnsToCategorical(self):
        return self.getOrDefault(self.columnsToCategorical)

    ##
    # Setters
    ##
    def setValidationDataFrame(self, value):
        return self._set(validationDataFrame=value)

    def setSplitRatio(self, value):
        return self._set(splitRatio=value)

    def setColumnsToCategorical(self, value, *args):
        """Set ``columnsToCategorical`` from one name or a list of names.

        :param value: a single column name (``str``) or a list of column names;
            checked with ``assert_is_type(value, [str], str)``.
        :param args: additional column names appended after ``value``.
        :return: the result of ``self._set(...)``.
        """
        assert_is_type(value, [str], str)

        if isinstance(value, str):
            prepared_array = [value]
        else:
            # Copy so that appending the extra names never mutates the
            # caller's list.
            prepared_array = list(value)

        prepared_array.extend(args)

        # Pass the assembled list; passing `value` here would drop the extra
        # names (and leave a bare string unwrapped) in the single-name case.
        return self._set(columnsToCategorical=prepared_array)

    # Setters for parameters which are defined on MOJO as well
    def setPredictionCol(self, value):
        return self._set(predictionCol=value)

    def setDetailedPredictionCol(self, value):
        return self._set(detailedPredictionCol=value)

    def setWithDetailedPredictionCol(self, value):
        # Deprecated no-op: only emits a warning; `value` is ignored and no
        # param is changed.
        warnings.warn(
            "The method will be removed without a replacement in the version 3.34."
            "Detailed prediction columns is enabled by default.",
            DeprecationWarning)
        return self

    def setFeaturesCols(self, value):
        return self._set(featuresCols=value)

    def setConvertUnknownCategoricalLevelsToNa(self, value):
        return self._set(convertUnknownCategoricalLevelsToNa=value)

    def setConvertInvalidNumbersToNa(self, value):
        return self._set(convertInvalidNumbersToNa=value)

    def setNamedMojoOutputColumns(self, value):
        return self._set(namedMojoOutputColumns=value)

    def setWithContributions(self, value):
        return self._set(withContributions=value)

    def setWithLeafNodeAssignments(self, value):
        return self._set(withLeafNodeAssignments=value)

    def setWithStageResults(self, value):
        return self._set(withStageResults=value)
Пример #4
0
class H2OCommonParams(H2OMOJOAlgoSharedParams):
    """Declares the ``foldCol``, ``weightCol``, ``splitRatio``, ``seed``,
    ``nfolds``, ``allStringColumnsToCategorical`` and ``columnsToCategorical``
    params with their getters and setters, and adds setters for params whose
    names are not declared in this class (the in-file comment says they are
    defined on MOJO as well).
    """
    foldCol = Param(Params._dummy(), "foldCol", "Fold column name",
                    H2OTypeConverters.toNullableString())

    weightCol = Param(Params._dummy(), "weightCol", "Weight column name",
                      H2OTypeConverters.toNullableString())

    splitRatio = Param(
        Params._dummy(), "splitRatio",
        "Accepts values in range [0, 1.0] which determine how large part of dataset is used for training"
        " and for validation. For example, 0.8 -> 80% training 20% validation.",
        H2OTypeConverters.toFloat())

    seed = Param(Params._dummy(), "seed",
                 "Used to specify seed to reproduce the model run",
                 H2OTypeConverters.toInt())

    nfolds = Param(Params._dummy(), "nfolds", "Number of fold columns",
                   H2OTypeConverters.toInt())

    allStringColumnsToCategorical = Param(
        Params._dummy(), "allStringColumnsToCategorical",
        "Transform all strings columns to categorical",
        H2OTypeConverters.toBoolean())

    columnsToCategorical = Param(
        Params._dummy(), "columnsToCategorical",
        "List of columns to convert to categorical before modelling",
        H2OTypeConverters.toListString())

    ##
    # Getters
    ##
    def getFoldCol(self):
        return self.getOrDefault(self.foldCol)

    def getWeightCol(self):
        return self.getOrDefault(self.weightCol)

    def getSplitRatio(self):
        return self.getOrDefault(self.splitRatio)

    def getSeed(self):
        return self.getOrDefault(self.seed)

    def getNfolds(self):
        return self.getOrDefault(self.nfolds)

    def getAllStringColumnsToCategorical(self):
        return self.getOrDefault(self.allStringColumnsToCategorical)

    def getColumnsToCategorical(self):
        return self.getOrDefault(self.columnsToCategorical)

    ##
    # Setters
    ##
    def setFoldCol(self, value):
        return self._set(foldCol=value)

    def setWeightCol(self, value):
        return self._set(weightCol=value)

    def setSplitRatio(self, value):
        return self._set(splitRatio=value)

    def setSeed(self, value):
        return self._set(seed=value)

    def setNfolds(self, value):
        return self._set(nfolds=value)

    def setAllStringColumnsToCategorical(self, value):
        return self._set(allStringColumnsToCategorical=value)

    def setColumnsToCategorical(self, value, *args):
        """Set ``columnsToCategorical`` from one name or a list of names.

        :param value: a single column name (``str``) or a list of column names;
            checked with ``assert_is_type(value, [str], str)``.
        :param args: additional column names appended after ``value``.
        :return: the result of ``self._set(...)``.
        """
        assert_is_type(value, [str], str)

        if isinstance(value, str):
            prepared_array = [value]
        else:
            # Copy so that appending the extra names never mutates the
            # caller's list.
            prepared_array = list(value)

        prepared_array.extend(args)

        # Pass the assembled list; passing `value` here would drop the extra
        # names (and leave a bare string unwrapped) in the single-name case.
        return self._set(columnsToCategorical=prepared_array)

    # Setters for parameters which are defined on MOJO as well
    def setPredictionCol(self, value):
        return self._set(predictionCol=value)

    def setDetailedPredictionCol(self, value):
        return self._set(detailedPredictionCol=value)

    def setWithDetailedPredictionCol(self, value):
        return self._set(withDetailedPredictionCol=value)

    def setFeaturesCols(self, value):
        return self._set(featuresCols=value)

    def setConvertUnknownCategoricalLevelsToNa(self, value):
        return self._set(convertUnknownCategoricalLevelsToNa=value)

    def setConvertInvalidNumbersToNa(self, value):
        return self._set(convertInvalidNumbersToNa=value)

    def setNamedMojoOutputColumns(self, value):
        return self._set(namedMojoOutputColumns=value)
class H2OMOJOAlgoSharedParams(Params):
    """Param declarations for prediction output and input handling, with one
    getter per param.

    Only getters are defined here; each returns ``self.getOrDefault(<param>)``.
    """
    # Each Param is declared with a dummy parent, its name, a help text and
    # an H2OTypeConverters converter.
    predictionCol = Param(
        Params._dummy(),
        "predictionCol",
        "Prediction column name",
        H2OTypeConverters.toString())

    detailedPredictionCol = Param(
        Params._dummy(),
        "detailedPredictionCol",
        "Column containing additional prediction details, its content depends on the model type.",
        H2OTypeConverters.toString())

    withDetailedPredictionCol = Param(
        Params._dummy(),
        "withDetailedPredictionCol",
        "Enables or disables generating additional prediction column, but with more details",
        H2OTypeConverters.toBoolean())

    featuresCols = Param(
        Params._dummy(),
        "featuresCols",
        "Name of feature columns",
        H2OTypeConverters.toListString())

    convertUnknownCategoricalLevelsToNa = Param(
        Params._dummy(),
        "convertUnknownCategoricalLevelsToNa",
        "If set to 'true', the model converts unknown categorical levels to NA during making predictions.",
        H2OTypeConverters.toBoolean())

    convertInvalidNumbersToNa = Param(
        Params._dummy(),
        "convertInvalidNumbersToNa",
        "If set to 'true', the model converts invalid numbers to NA during making predictions.",
        H2OTypeConverters.toBoolean())

    namedMojoOutputColumns = Param(
        Params._dummy(),
        "namedMojoOutputColumns",
        "Mojo Output is not stored in the array but in the properly named columns",
        H2OTypeConverters.toBoolean())

    ##
    # Getters
    ##
    def getPredictionCol(self):
        return self.getOrDefault(self.predictionCol)

    def getDetailedPredictionCol(self):
        return self.getOrDefault(self.detailedPredictionCol)

    def getWithDetailedPredictionCol(self):
        return self.getOrDefault(self.withDetailedPredictionCol)

    def getFeaturesCols(self):
        return self.getOrDefault(self.featuresCols)

    def getConvertUnknownCategoricalLevelsToNa(self):
        return self.getOrDefault(self.convertUnknownCategoricalLevelsToNa)

    def getConvertInvalidNumbersToNa(self):
        return self.getOrDefault(self.convertInvalidNumbersToNa)

    def getNamedMojoOutputColumns(self):
        return self.getOrDefault(self.namedMojoOutputColumns)
class H2OTargetEncoderParams(Params):
    """Param declarations for the target encoder, with one getter per param.

    All getters return ``self.getOrDefault(<param>)`` except
    ``getOutputCols``, which derives names from the input columns when no
    output columns are available.
    """

    ##
    # Param definitions
    ##
    foldCol = Param(
        Params._dummy(),
        "foldCol",
        "Fold column name",
        H2OTypeConverters.toNullableString())

    labelCol = Param(
        Params._dummy(),
        "labelCol",
        "Label column name",
        H2OTypeConverters.toString())

    inputCols = Param(
        Params._dummy(),
        "inputCols",
        "Names of columns that will be transformed",
        H2OTypeConverters.toListString())

    outputCols = Param(
        Params._dummy(),
        "outputCols",
        "Names of columns representing the result of target encoding",
        H2OTypeConverters.toListString())

    holdoutStrategy = Param(
        Params._dummy(),
        "holdoutStrategy",
        """A strategy deciding what records will be excluded when calculating the target average on the training dataset.
           Options:
            None        - All rows are considered for the calculation
            LeaveOneOut - All rows except the row the calculation is made for
            KFold       - Only out-of-fold data is considered (The option requires foldCol to be set.""",
        H2OTypeConverters.toEnumString(
            "ai.h2o.targetencoding.TargetEncoder$DataLeakageHandlingStrategy"))

    blendedAvgEnabled = Param(
        Params._dummy(),
        "blendedAvgEnabled",
        "If set, the target average becomes a weighted average of the posterior average for a given "
        "categorical level and the prior average of the target. The weight is determined by the size "
        "of the given group that the row belongs to. By default, the blended average is disabled.",
        H2OTypeConverters.toBoolean())

    blendedAvgInflectionPoint = Param(
        Params._dummy(),
        "blendedAvgInflectionPoint",
        "A parameter of the blended average. The bigger number is set, the groups relatively bigger to the "
        "overall data set size will consider the global target value as a component in the weighted average. "
        "The default value is 10.",
        H2OTypeConverters.toFloat())

    blendedAvgSmoothing = Param(
        Params._dummy(),
        "blendedAvgSmoothing",
        "A parameter of blended average. Controls the rate of transition between a group target value "
        "and a global target value. The default value is 20.",
        H2OTypeConverters.toFloat())

    noise = Param(
        Params._dummy(),
        "noise",
        "Amount of random noise added to output values. The default value is 0.01",
        H2OTypeConverters.toFloat())

    noiseSeed = Param(
        Params._dummy(),
        "noiseSeed",
        "A seed of the generator producing the random noise",
        H2OTypeConverters.toInt())

    ##
    # Getters
    ##
    def getFoldCol(self):
        return self.getOrDefault(self.foldCol)

    def getLabelCol(self):
        return self.getOrDefault(self.labelCol)

    def getInputCols(self):
        return self.getOrDefault(self.inputCols)

    def getOutputCols(self):
        """Return the output column names.

        When the ``outputCols`` value is falsy (e.g. empty), each input column
        name suffixed with ``"_te"`` is returned instead.
        """
        configured = self.getOrDefault(self.outputCols)
        if configured:
            return configured
        return [name + "_te" for name in self.getInputCols()]

    def getHoldoutStrategy(self):
        return self.getOrDefault(self.holdoutStrategy)

    def getBlendedAvgEnabled(self):
        return self.getOrDefault(self.blendedAvgEnabled)

    def getBlendedAvgInflectionPoint(self):
        return self.getOrDefault(self.blendedAvgInflectionPoint)

    def getBlendedAvgSmoothing(self):
        return self.getOrDefault(self.blendedAvgSmoothing)

    def getNoise(self):
        return self.getOrDefault(self.noise)

    def getNoiseSeed(self):
        return self.getOrDefault(self.noiseSeed)
Пример #7
0
class H2OBaseMOJOParams(Params):
    """Param declarations for prediction output and input handling, with one
    getter per param.

    Getters return ``self.getOrDefault(<param>)``, except the deprecated
    ``getWithDetailedPredictionCol`` which always returns ``True``.
    """
    # Each Param is declared with a dummy parent, its name, a help text and
    # an H2OTypeConverters converter.
    predictionCol = Param(
        Params._dummy(),
        "predictionCol",
        "Prediction column name",
        H2OTypeConverters.toString())

    detailedPredictionCol = Param(
        Params._dummy(),
        "detailedPredictionCol",
        "Column containing additional prediction details, its content depends on the model type.",
        H2OTypeConverters.toString())

    withDetailedPredictionCol = Param(
        Params._dummy(),
        "withDetailedPredictionCol",
        "Enables or disables generating additional prediction column, but with more details",
        H2OTypeConverters.toBoolean())

    withContributions = Param(
        Params._dummy(),
        "withContributions",
        "Enables or disables generating a sub-column of detailedPredictionCol containing Shapley values.",
        H2OTypeConverters.toBoolean())

    featuresCols = Param(
        Params._dummy(),
        "featuresCols",
        "Name of feature columns",
        H2OTypeConverters.toListString())

    convertUnknownCategoricalLevelsToNa = Param(
        Params._dummy(),
        "convertUnknownCategoricalLevelsToNa",
        "If set to 'true', the model converts unknown categorical levels to NA during making predictions.",
        H2OTypeConverters.toBoolean())

    convertInvalidNumbersToNa = Param(
        Params._dummy(),
        "convertInvalidNumbersToNa",
        "If set to 'true', the model converts invalid numbers to NA during making predictions.",
        H2OTypeConverters.toBoolean())

    namedMojoOutputColumns = Param(
        Params._dummy(),
        "namedMojoOutputColumns",
        "Mojo Output is not stored in the array but in the properly named columns",
        H2OTypeConverters.toBoolean())

    withLeafNodeAssignments = Param(
        Params._dummy(),
        "withLeafNodeAssignments",
        "Enables or disables computation of leaf node assignments.",
        H2OTypeConverters.toBoolean())

    withStageResults = Param(
        Params._dummy(),
        "withStageResults",
        "Enables or disables computation of stage results.",
        H2OTypeConverters.toBoolean())

    ##
    # Getters
    ##
    def getPredictionCol(self):
        return self.getOrDefault(self.predictionCol)

    def getDetailedPredictionCol(self):
        return self.getOrDefault(self.detailedPredictionCol)

    def getWithDetailedPredictionCol(self):
        # Deprecated: emits a DeprecationWarning and always returns True; the
        # withDetailedPredictionCol param value is not read here.
        warnings.warn("The method will be removed without a replacement in the version 3.34."
                      "Detailed prediction columns is always enabled.", DeprecationWarning)
        return True

    def getWithContributions(self):
        return self.getOrDefault(self.withContributions)

    def getFeaturesCols(self):
        return self.getOrDefault(self.featuresCols)

    def getConvertUnknownCategoricalLevelsToNa(self):
        return self.getOrDefault(self.convertUnknownCategoricalLevelsToNa)

    def getConvertInvalidNumbersToNa(self):
        return self.getOrDefault(self.convertInvalidNumbersToNa)

    def getNamedMojoOutputColumns(self):
        return self.getOrDefault(self.namedMojoOutputColumns)

    def getWithLeafNodeAssignments(self):
        return self.getOrDefault(self.withLeafNodeAssignments)

    def getWithStageResults(self):
        return self.getOrDefault(self.withStageResults)
Пример #8
0
class H2OCommonParams(H2OBaseMOJOParams):
    """Declares the ``splitRatio`` and ``columnsToCategorical`` params with
    their getters and setters, and adds setters for params whose names are not
    declared in this class (the in-file comment says they are defined on MOJO
    as well).
    """

    ##
    # Param definitions
    ##
    splitRatio = Param(
        Params._dummy(), "splitRatio",
        "Accepts values in range [0, 1.0] which determine how large part of dataset is used for training"
        " and for validation. For example, 0.8 -> 80% training 20% validation.",
        H2OTypeConverters.toFloat())

    columnsToCategorical = Param(
        Params._dummy(), "columnsToCategorical",
        "List of columns to convert to categorical before modelling",
        H2OTypeConverters.toListString())

    ##
    # Getters
    ##
    def getSplitRatio(self):
        return self.getOrDefault(self.splitRatio)

    def getColumnsToCategorical(self):
        return self.getOrDefault(self.columnsToCategorical)

    ##
    # Setters
    ##
    def setSplitRatio(self, value):
        return self._set(splitRatio=value)

    def setColumnsToCategorical(self, value, *args):
        """Set ``columnsToCategorical`` from one name or a list of names.

        :param value: a single column name (``str``) or a list of column names;
            checked with ``assert_is_type(value, [str], str)``.
        :param args: additional column names appended after ``value``.
        :return: the result of ``self._set(...)``.
        """
        assert_is_type(value, [str], str)

        if isinstance(value, str):
            prepared_array = [value]
        else:
            # Copy so that appending the extra names never mutates the
            # caller's list.
            prepared_array = list(value)

        prepared_array.extend(args)

        # Pass the assembled list; passing `value` here would drop the extra
        # names (and leave a bare string unwrapped) in the single-name case.
        return self._set(columnsToCategorical=prepared_array)

    # Setters for parameters which are defined on MOJO as well
    def setPredictionCol(self, value):
        return self._set(predictionCol=value)

    def setDetailedPredictionCol(self, value):
        return self._set(detailedPredictionCol=value)

    def setWithDetailedPredictionCol(self, value):
        return self._set(withDetailedPredictionCol=value)

    def setFeaturesCols(self, value):
        return self._set(featuresCols=value)

    def setConvertUnknownCategoricalLevelsToNa(self, value):
        return self._set(convertUnknownCategoricalLevelsToNa=value)

    def setConvertInvalidNumbersToNa(self, value):
        return self._set(convertInvalidNumbersToNa=value)

    def setNamedMojoOutputColumns(self, value):
        return self._set(namedMojoOutputColumns=value)

    def setWithContributions(self, value):
        return self._set(withContributions=value)