Exemplo n.º 1
0
    class Foo(object):  # must be a new-style class for this to work on Py2
        """Sample class with one real property (`bar`) and three deprecated attribute names."""

        def __init__(self, bar=1):
            """Store the initial value of `bar` (defaults to 1)."""
            self._bar = bar

        @property
        def bar(self):
            """Current value kept in the private `_bar` attribute."""
            return self._bar

        @bar.setter
        def bar(self, value):
            self._bar = value

        # Deprecated name declared with a replacement: the `bar` property above.
        Bar = deprecated_property('Bar', replaced_by=bar)
        # Deprecated name declared without any replacement.
        Baz = deprecated_property('Baz')
        # Deprecated name declared with a custom message.
        Biz = deprecated_property('Biz', message="Biz custom message")
Exemplo n.º 2
0
class H2OTargetEncoderEstimator(H2OEstimator):
    """
    TargetEncoder

    """

    algo = "targetencoder"
    param_names = {"model_id", "training_frame", "fold_column", "response_column", "ignored_columns",
                   "keep_original_categorical_columns", "blending", "inflection_point", "smoothing",
                   "data_leakage_handling", "noise", "seed"}

    def __init__(self, **kwargs):
        """
        Create the estimator from keyword parameters.

        ``model_id`` is stored directly (in ``self._id`` and in ``self._parms``). Every other accepted name,
        whether listed in ``_deprecated_params_`` or in ``param_names``, is assigned with ``setattr`` so that
        the corresponding property runs.

        :param kwargs: parameter names mapped to their values.
        :raises H2OValueError: if a name is not ``model_id``, not a deprecated parameter and not in ``param_names``.
        """
        super(H2OTargetEncoderEstimator, self).__init__()
        self._parms = {}
        for name, value in kwargs.items():
            if name == 'model_id':
                self._id = value
                self._parms["model_id"] = value
                continue
            if name not in self._deprecated_params_ and name not in self.param_names:
                raise H2OValueError("Unknown parameter %s = %r" % (name, value))
            # setattr(...) goes through the property: it type-checks regular parameters and lets the
            # deprecated ones be redirected by their property.
            setattr(self, name, value)

    @property
    def training_frame(self):
        """
        Id of the training data frame.

        Returns ``None`` when the parameter has not been set.

        Type: ``H2OFrame``.

        :examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic["survived"] = titanic["survived"].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(inflection_point=35,
        ...                                        smoothing=25,
        ...                                        blending=True)
        >>> titanic_te.train(x=predictors,
        ...                  y=response,
        ...                  training_frame=titanic)
        >>> titanic_te
        """
        return self._parms.get("training_frame")

    @training_frame.setter
    def training_frame(self, training_frame):
        # The stored value is whatever H2OFrame._validate returns.
        # NOTE(review): presumably it checks/normalizes the frame argument - confirm in H2OFrame.
        self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')


    @property
    def fold_column(self):
        """
        Column with cross-validation fold index assignment per observation.

        Returns ``None`` when the parameter has not been set.

        Type: ``str``.

        :examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic["survived"] = titanic["survived"].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(inflection_point=35,
        ...                                        smoothing=25,
        ...                                        blending=True)
        >>> titanic_te.train(x=predictors,
        ...                  y=response,
        ...                  training_frame=titanic)
        >>> titanic_te
        """
        return self._parms.get("fold_column")

    @fold_column.setter
    def fold_column(self, fold_column):
        # Accepts None or a string; the value is type-checked before being stored.
        assert_is_type(fold_column, None, str)
        self._parms["fold_column"] = fold_column


    @property
    def response_column(self):
        """
        Response variable column.

        Returns ``None`` when the parameter has not been set.

        Type: ``str``.
        """
        return self._parms.get("response_column")

    @response_column.setter
    def response_column(self, response_column):
        # Accepts None or a string; the value is type-checked before being stored.
        assert_is_type(response_column, None, str)
        self._parms["response_column"] = response_column


    @property
    def ignored_columns(self):
        """
        Names of columns to ignore for training.

        Returns ``None`` when the parameter has not been set.

        Type: ``List[str]``.
        """
        return self._parms.get("ignored_columns")

    @ignored_columns.setter
    def ignored_columns(self, ignored_columns):
        # Accepts None or a list of strings; the value is type-checked before being stored.
        assert_is_type(ignored_columns, None, [str])
        self._parms["ignored_columns"] = ignored_columns


    @property
    def keep_original_categorical_columns(self):
        """
        If true, the original non-encoded categorical features will remain in the result frame.

        Returns ``None`` when the parameter has not been set on this object.

        Type: ``bool``  (default: ``True``).
        """
        return self._parms.get("keep_original_categorical_columns")

    @keep_original_categorical_columns.setter
    def keep_original_categorical_columns(self, keep_original_categorical_columns):
        # Accepts None or a bool; the value is type-checked before being stored.
        assert_is_type(keep_original_categorical_columns, None, bool)
        self._parms["keep_original_categorical_columns"] = keep_original_categorical_columns


    @property
    def blending(self):
        """
        If true, enables blending of posterior probabilities (computed for a given categorical value) with prior
        probabilities (computed on the entire set). This helps to mitigate the effect of categorical values with
        small cardinality. The blending effect can be tuned using the `inflection_point` and `smoothing` parameters.

        Returns ``None`` when the parameter has not been set on this object.

        Type: ``bool``  (default: ``False``).

        :examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic["survived"] = titanic["survived"].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(inflection_point=35,
        ...                                        smoothing=25,
        ...                                        blending=True)
        >>> titanic_te.train(x=predictors,
        ...                  y=response,
        ...                  training_frame=titanic)
        >>> titanic_te
        """
        return self._parms.get("blending")

    @blending.setter
    def blending(self, blending):
        # Accepts None or a bool; the value is type-checked before being stored.
        assert_is_type(blending, None, bool)
        self._parms["blending"] = blending


    @property
    def inflection_point(self):
        """
        Inflection point of the sigmoid used to blend probabilities (see `blending` parameter). For a given categorical
        value, if it appears fewer than `inflection_point` times in a data sample, then the influence of the posterior
        probability will be smaller than the prior.

        Returns ``None`` when the parameter has not been set on this object.

        Type: ``float``  (default: ``10``).

        :examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic["survived"] = titanic["survived"].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(inflection_point=35,
        ...                                        smoothing=25,
        ...                                        blending=True)
        >>> titanic_te.train(x=predictors,
        ...                  y=response,
        ...                  training_frame=titanic)
        >>> titanic_te
        """
        return self._parms.get("inflection_point")

    @inflection_point.setter
    def inflection_point(self, inflection_point):
        # Accepts None or a numeric value; the value is type-checked before being stored.
        assert_is_type(inflection_point, None, numeric)
        self._parms["inflection_point"] = inflection_point


    @property
    def smoothing(self):
        """
        Smoothing factor corresponds to the inverse of the slope at the inflection point on the sigmoid used to blend
        probabilities (see `blending` parameter). If smoothing tends towards 0, then the sigmoid used for blending turns
        into a Heaviside step function.

        Returns ``None`` when the parameter has not been set on this object.

        Type: ``float``  (default: ``20``).

        :examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic["survived"] = titanic["survived"].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(inflection_point=35,
        ...                                        smoothing=25,
        ...                                        blending=True)
        >>> titanic_te.train(x=predictors,
        ...                  y=response,
        ...                  training_frame=titanic)
        >>> titanic_te
        """
        return self._parms.get("smoothing")

    @smoothing.setter
    def smoothing(self, smoothing):
        # Accepts None or a numeric value; the value is type-checked before being stored.
        assert_is_type(smoothing, None, numeric)
        self._parms["smoothing"] = smoothing


    @property
    def data_leakage_handling(self):
        """
        Data leakage handling strategy used to generate the encoding. Supported options are: 1) "none" (default) - no
        holdout, using the entire training frame. 2) "leave_one_out" - current row's response value is subtracted from
        the per-level frequencies pre-calculated on the entire training frame. 3) "k_fold" - encodings for a fold are
        generated based on out-of-fold data.

        Returns ``None`` when the parameter has not been set on this object.

        One of: ``"leave_one_out"``, ``"k_fold"``, ``"none"``  (default: ``"none"``).

        :examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic["survived"] = titanic["survived"].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(inflection_point=35,
        ...                                        smoothing=25,
        ...                                        data_leakage_handling="k_fold",
        ...                                        blending=True)
        >>> titanic_te.train(x=predictors,
        ...                  y=response,
        ...                  training_frame=titanic)
        >>> titanic_te
        """
        return self._parms.get("data_leakage_handling")

    @data_leakage_handling.setter
    def data_leakage_handling(self, data_leakage_handling):
        # Accepts None or one of the three enumerated strategy names; checked before being stored.
        assert_is_type(data_leakage_handling, None, Enum("leave_one_out", "k_fold", "none"))
        self._parms["data_leakage_handling"] = data_leakage_handling


    @property
    def noise(self):
        """
        The amount of noise to add to the encoded column. Use 0 to disable noise, and -1 (=AUTO) to let the algorithm
        determine a reasonable amount of noise.

        Returns ``None`` when the parameter has not been set on this object.

        Type: ``float``  (default: ``0.01``).
        """
        return self._parms.get("noise")

    @noise.setter
    def noise(self, noise):
        # Accepts None or a numeric value; the value is type-checked before being stored.
        assert_is_type(noise, None, numeric)
        self._parms["noise"] = noise


    @property
    def seed(self):
        """
        Seed used to generate the noise. By default, the seed is chosen randomly.

        Returns ``None`` when the parameter has not been set on this object.

        Type: ``int``  (default: ``-1``).
        """
        return self._parms.get("seed")

    @seed.setter
    def seed(self, seed):
        # Accepts None or an int; the value is type-checked before being stored.
        assert_is_type(seed, None, int)
        self._parms["seed"] = seed


    # Legacy parameter names still accepted by the constructor (it checks membership in this list).
    # Each legacy name is declared with deprecated_property(...), passing the current property as the
    # second argument.
    _deprecated_params_ = ['k', 'f', 'noise_level']
    k = deprecated_property('k', inflection_point)  # legacy name paired with `inflection_point`
    f = deprecated_property('f', smoothing)  # legacy name paired with `smoothing`
    noise_level = deprecated_property('noise_level', noise)  # legacy name paired with `noise`

    def transform(self, frame, blending=None, inflection_point=None, smoothing=None, noise=None, as_training=False, **kwargs):
        """
        Apply the target encoding learned during `train()` to the given frame.

        :param H2OFrame frame: the frame on which to apply the target encoding transformations.
        :param boolean blending: If provided, this overrides the `blending` parameter on the model.
        :param float inflection_point: If provided, this overrides the `inflection_point` parameter on the model.
        :param float smoothing: If provided, this overrides the `smoothing` parameter on the model.
        :param float noise: If provided, this overrides the amount of random noise added to the target encoding defined
            on the model, this helps prevent overfitting.
        :param boolean as_training: Must be set to True when encoding the training frame. Defaults to False.
        :param kwargs: only the deprecated `seed` and `data_leakage_handling` keywords are tolerated; each one emits
            an ``H2ODeprecationWarning``. A `data_leakage_handling` value other than None/"none" additionally
            forces ``as_training=True``.
        :returns: the frame retrieved with ``h2o.get_frame`` for the name returned by the transform endpoint.
        :raises TypeError: if any other keyword argument is passed.

        :example:
        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic[response] = titanic[response].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out",
        ...                                        inflection_point=35,
        ...                                        smoothing=25,
        ...                                        blending=True,
        ...                                        seed=1234)
        >>> titanic_te.train(x=predictors,
        ...                  y=response,
        ...                  training_frame=titanic)
        >>> transformed = titanic_te.transform(frame=titanic)
        """
        # Reject unknown keywords; warn about the deprecated ones that are still tolerated.
        for key in kwargs:
            if key not in ('seed', 'data_leakage_handling'):
                raise TypeError("transform() got an unexpected keyword argument '%s'" % key)
            warnings.warn(
                "`%s` is deprecated in `transform` method and will be ignored. "
                "Instead, please ensure that it was set before training on the H2OTargetEncoderEstimator model." % key,
                H2ODeprecationWarning
            )

        # Backward compatibility: any real leakage-handling strategy maps onto `as_training=True`.
        if 'data_leakage_handling' in kwargs:
            dlh = kwargs['data_leakage_handling']
            assert_is_type(dlh, None, Enum("leave_one_out", "k_fold", "none"))
            if not (dlh is None or dlh.lower() == "none"):
                warnings.warn(
                    "Deprecated `data_leakage_handling=%s` is replaced by `as_training=True`. "
                    "Please update your code." % dlh,
                    H2ODeprecationWarning
                )
                as_training = True

        request = {
            "model": self.model_id,
            "frame": frame.key,
            # blending is always sent, since an unset value can't be represented: fall back to the model's setting
            "blending": self.blending if blending is None else blending,
            "inflection_point": inflection_point,
            "smoothing": smoothing,
            "noise": noise,
            "as_training": as_training,
        }

        response = h2o.api("GET /3/TargetEncoderTransform", data=request)
        return h2o.get_frame(response["name"])
Exemplo n.º 3
0
class H2ORuleFitEstimator(H2OEstimator):
    """
    RuleFit

    Builds a RuleFit on a parsed dataset, for regression or 
    classification. 
    """

    algo = "rulefit"
    supervised_learning = True

    @deprecated_params({'Lambda': 'lambda_'})
    def __init__(self,
                 model_id=None,  # type: Optional[Union[None, str, H2OEstimator]]
                 training_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 validation_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
                 seed=-1,  # type: int
                 response_column=None,  # type: Optional[str]
                 ignored_columns=None,  # type: Optional[List[str]]
                 algorithm="auto",  # type: Literal["auto", "drf", "gbm"]
                 min_rule_length=3,  # type: int
                 max_rule_length=3,  # type: int
                 max_num_rules=-1,  # type: int
                 model_type="rules_and_linear",  # type: Literal["rules_and_linear", "rules", "linear"]
                 weights_column=None,  # type: Optional[str]
                 distribution="auto",  # type: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"]
                 rule_generation_ntrees=50,  # type: int
                 auc_type="auto",  # type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
                 remove_duplicates=True,  # type: bool
                 lambda_=None,  # type: Optional[List[float]]
                 max_categorical_levels=10,  # type: int
                 ):
        """
        Create a RuleFit estimator; every argument except ``model_id`` is assigned through its property.

        :param model_id: Destination id for this model; auto-generated if not specified.
               Defaults to ``None``.
        :type model_id: Union[None, str, H2OEstimator], optional
        :param training_frame: Id of the training data frame.
               Defaults to ``None``.
        :type training_frame: Union[None, str, H2OFrame], optional
        :param validation_frame: Id of the validation data frame.
               Defaults to ``None``.
        :type validation_frame: Union[None, str, H2OFrame], optional
        :param seed: Seed for pseudo random number generator (if applicable).
               Defaults to ``-1``.
        :type seed: int
        :param response_column: Response variable column.
               Defaults to ``None``.
        :type response_column: str, optional
        :param ignored_columns: Names of columns to ignore for training.
               Defaults to ``None``.
        :type ignored_columns: List[str], optional
        :param algorithm: The algorithm to use to generate rules.
               Defaults to ``"auto"``.
        :type algorithm: Literal["auto", "drf", "gbm"]
        :param min_rule_length: Minimum length of rules.
               Defaults to ``3``.
        :type min_rule_length: int
        :param max_rule_length: Maximum length of rules.
               Defaults to ``3``.
        :type max_rule_length: int
        :param max_num_rules: The maximum number of rules to return. The value -1 means the number of rules is
               selected by diminishing returns in model deviance.
               Defaults to ``-1``.
        :type max_num_rules: int
        :param model_type: Specifies type of base learners in the ensemble.
               Defaults to ``"rules_and_linear"``.
        :type model_type: Literal["rules_and_linear", "rules", "linear"]
        :param weights_column: Column with observation weights. Giving some observation a weight of zero is equivalent
               to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating
               that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do
               not increase the size of the data frame. This is typically the number of times a row is repeated, but
               non-integer values are supported as well. During training, rows with higher weights matter more, due to
               the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at
               that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0.
               Defaults to ``None``.
        :type weights_column: str, optional
        :param distribution: Distribution function.
               Defaults to ``"auto"``.
        :type distribution: Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace",
               "quantile", "huber"]
        :param rule_generation_ntrees: Specifies the number of trees to build in the tree model.
               Defaults to ``50``.
        :type rule_generation_ntrees: int
        :param auc_type: Set default multinomial AUC type.
               Defaults to ``"auto"``.
        :type auc_type: Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]
        :param remove_duplicates: Whether to remove rules which are identical to an earlier rule.
               Defaults to ``True``.
        :type remove_duplicates: bool
        :param lambda_: Lambda for LASSO regressor.
               Defaults to ``None``.
        :type lambda_: List[float], optional
        :param max_categorical_levels: For every categorical feature, only use this many most frequent categorical
               levels for model training. Only used for categorical_encoding == EnumLimited.
               Defaults to ``10``.
        :type max_categorical_levels: int
        """
        super(H2ORuleFitEstimator, self).__init__()
        self._parms = {}
        self._id = model_id
        self._parms['model_id'] = model_id
        # Assign the remaining arguments through their properties, in signature order, so each setter
        # validates its value before it lands in self._parms.
        for attr_name, attr_value in (
                ("training_frame", training_frame),
                ("validation_frame", validation_frame),
                ("seed", seed),
                ("response_column", response_column),
                ("ignored_columns", ignored_columns),
                ("algorithm", algorithm),
                ("min_rule_length", min_rule_length),
                ("max_rule_length", max_rule_length),
                ("max_num_rules", max_num_rules),
                ("model_type", model_type),
                ("weights_column", weights_column),
                ("distribution", distribution),
                ("rule_generation_ntrees", rule_generation_ntrees),
                ("auc_type", auc_type),
                ("remove_duplicates", remove_duplicates),
                ("lambda_", lambda_),
                ("max_categorical_levels", max_categorical_levels),
        ):
            setattr(self, attr_name, attr_value)

    @property
    def training_frame(self):
        """
        Id of the training data frame.

        Returns ``None`` when no value is stored.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("training_frame")

    @training_frame.setter
    def training_frame(self, training_frame):
        # The stored value is whatever H2OFrame._validate returns.
        # NOTE(review): presumably it checks/normalizes the frame argument - confirm in H2OFrame.
        self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame')

    @property
    def validation_frame(self):
        """
        Id of the validation data frame.

        Returns ``None`` when no value is stored.

        Type: ``Union[None, str, H2OFrame]``.
        """
        return self._parms.get("validation_frame")

    @validation_frame.setter
    def validation_frame(self, validation_frame):
        # The stored value is whatever H2OFrame._validate returns.
        # NOTE(review): presumably it checks/normalizes the frame argument - confirm in H2OFrame.
        self._parms["validation_frame"] = H2OFrame._validate(validation_frame, 'validation_frame')

    @property
    def seed(self):
        """
        Seed for pseudo random number generator (if applicable).

        Type: ``int``, defaults to ``-1``.
        """
        return self._parms.get("seed")

    @seed.setter
    def seed(self, seed):
        # Accepts None or an int; the value is type-checked before being stored.
        assert_is_type(seed, None, int)
        self._parms["seed"] = seed

    @property
    def response_column(self):
        """
        Response variable column.

        Type: ``str``.
        """
        return self._parms.get("response_column")

    @response_column.setter
    def response_column(self, response_column):
        # Accepts None or a string; the value is type-checked before being stored.
        assert_is_type(response_column, None, str)
        self._parms["response_column"] = response_column

    @property
    def ignored_columns(self):
        """
        Names of columns to ignore for training.

        Type: ``List[str]``.
        """
        return self._parms.get("ignored_columns")

    @ignored_columns.setter
    def ignored_columns(self, ignored_columns):
        # Accepts None or a list of strings; the value is type-checked before being stored.
        assert_is_type(ignored_columns, None, [str])
        self._parms["ignored_columns"] = ignored_columns

    @property
    def algorithm(self):
        """
        The algorithm to use to generate rules.

        Type: ``Literal["auto", "drf", "gbm"]``, defaults to ``"auto"``.
        """
        return self._parms.get("algorithm")

    @algorithm.setter
    def algorithm(self, algorithm):
        # Accepts None or one of the enumerated algorithm names; checked before being stored.
        assert_is_type(algorithm, None, Enum("auto", "drf", "gbm"))
        self._parms["algorithm"] = algorithm

    @property
    def min_rule_length(self):
        """
        Minimum length of rules.

        Type: ``int``, defaults to ``3``.
        """
        return self._parms.get("min_rule_length")

    @min_rule_length.setter
    def min_rule_length(self, min_rule_length):
        # Accepts None or an int; the value is type-checked before being stored.
        assert_is_type(min_rule_length, None, int)
        self._parms["min_rule_length"] = min_rule_length

    @property
    def max_rule_length(self):
        """
        Maximum length of rules.

        Type: ``int``, defaults to ``3``.
        """
        return self._parms.get("max_rule_length")

    @max_rule_length.setter
    def max_rule_length(self, max_rule_length):
        # Accepts None or an int; the value is type-checked before being stored.
        assert_is_type(max_rule_length, None, int)
        self._parms["max_rule_length"] = max_rule_length

    @property
    def max_num_rules(self):
        """
        The maximum number of rules to return. The value -1 means the number of rules is selected
        by diminishing returns in model deviance.

        Type: ``int``, defaults to ``-1``.
        """
        return self._parms.get("max_num_rules")

    @max_num_rules.setter
    def max_num_rules(self, max_num_rules):
        # Accepts None or an int; the value is type-checked before being stored.
        assert_is_type(max_num_rules, None, int)
        self._parms["max_num_rules"] = max_num_rules

    @property
    def model_type(self):
        """
        Specifies type of base learners in the ensemble.

        Type: ``Literal["rules_and_linear", "rules", "linear"]``, defaults to ``"rules_and_linear"``.
        """
        return self._parms.get("model_type")

    @model_type.setter
    def model_type(self, model_type):
        # Accepts None or one of the enumerated model type names; checked before being stored.
        assert_is_type(model_type, None, Enum("rules_and_linear", "rules", "linear"))
        self._parms["model_type"] = model_type

    @property
    def weights_column(self):
        """
        Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the
        dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
        weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data
        frame. This is typically the number of times a row is repeated, but non-integer values are supported as well.
        During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set
        weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an
        accurate prediction, remove all rows with weight == 0.

        Type: ``str``.
        """
        return self._parms.get("weights_column")

    @weights_column.setter
    def weights_column(self, weights_column):
        # Accepts None or a string (the column name); the value is type-checked before being stored.
        assert_is_type(weights_column, None, str)
        self._parms["weights_column"] = weights_column

    @property
    def distribution(self):
        """
        Distribution function.

        Type: ``Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace",
        "quantile", "huber"]``, defaults to ``"auto"``.
        """
        return self._parms.get("distribution")

    @distribution.setter
    def distribution(self, distribution):
        # Accepts None or one of the enumerated distribution names; checked before being stored.
        assert_is_type(distribution, None, Enum("auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace", "quantile", "huber"))
        self._parms["distribution"] = distribution

    @property
    def rule_generation_ntrees(self):
        """
        Specifies the number of trees to build in the tree model.

        Type: ``int``, defaults to ``50``.
        """
        return self._parms.get("rule_generation_ntrees")

    @rule_generation_ntrees.setter
    def rule_generation_ntrees(self, rule_generation_ntrees):
        # Accepts None or an int; the value is type-checked before being stored.
        assert_is_type(rule_generation_ntrees, None, int)
        self._parms["rule_generation_ntrees"] = rule_generation_ntrees

    @property
    def auc_type(self):
        """
        Set default multinomial AUC type.

        Type: ``Literal["auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"]``, defaults to
        ``"auto"``.
        """
        return self._parms.get("auc_type")

    @auc_type.setter
    def auc_type(self, auc_type):
        # Accepts None or one of the enumerated AUC type names; checked before being stored.
        assert_is_type(auc_type, None, Enum("auto", "none", "macro_ovr", "weighted_ovr", "macro_ovo", "weighted_ovo"))
        self._parms["auc_type"] = auc_type

    @property
    def remove_duplicates(self):
        """
        Whether to remove rules which are identical to an earlier rule.

        Type: ``bool``, defaults to ``True``.
        """
        return self._parms.get("remove_duplicates")

    @remove_duplicates.setter
    def remove_duplicates(self, remove_duplicates):
        # Accepts None or a bool; the value is type-checked before being stored.
        assert_is_type(remove_duplicates, None, bool)
        self._parms["remove_duplicates"] = remove_duplicates

    @property
    def lambda_(self):
        """
        Lambda for LASSO regressor.

        The setter accepts ``None``, a single numeric value, or a list of numeric values.

        Type: ``List[float]``.
        """
        # The attribute is named `lambda_` but the value lives under the "lambda" key of self._parms
        # (`lambda` itself is a reserved word in Python and cannot be used as an attribute name here).
        return self._parms.get("lambda")

    @lambda_.setter
    def lambda_(self, lambda_):
        # A scalar is stored as given; it is not wrapped into a list here.
        assert_is_type(lambda_, None, numeric, [numeric])
        self._parms["lambda"] = lambda_

    @property
    def max_categorical_levels(self):
        """
        For every categorical feature, only use this many most frequent categorical levels for model training. Only used
        for categorical_encoding == EnumLimited.

        Type: ``int``, defaults to ``10``.
        """
        return self._parms.get("max_categorical_levels")

    @max_categorical_levels.setter
    def max_categorical_levels(self, max_categorical_levels):
        # Accepts None or an int; the value is type-checked before being stored.
        assert_is_type(max_categorical_levels, None, int)
        self._parms["max_categorical_levels"] = max_categorical_levels

    # Legacy attribute name `Lambda`, declared as deprecated and paired with the `lambda_` property.
    Lambda = deprecated_property('Lambda', lambda_)

    def rule_importance(self):
        """
        Retrieve rule importances for a Rulefit model.

        Posts this model's id to the ``/3/SignificantRules`` endpoint and returns the
        ``significant_rules_table`` entry of the response.

        :return: H2OTwoDimTable
        :raises H2OValueError: if the model's ``algo`` is not ``"rulefit"``.
        """
        if self._model_json["algo"] != "rulefit":
            raise H2OValueError("This function is available for Rulefit models only")

        payload = {"model_id": self.model_id}
        response = h2o.api("POST /3/SignificantRules", data=payload)
        return response['significant_rules_table']

    def predict_rules(self, frame, rule_ids):
        """
        Evaluate the validity of the given rules on the given data.

        :param frame: H2OFrame on which rule validity is to be evaluated; any other type is rejected.
        :param rule_ids: string array of rule ids to be evaluated against the frame
        :return: H2OFrame with one column per input rule id, flagging whether the rule applies to each observation.
        """
        from h2o.frame import H2OFrame
        from h2o.utils.typechecks import assert_is_type
        from h2o.expr import ExprNode

        assert_is_type(frame, H2OFrame)
        # Build the expression node first, then wrap it in a frame.
        rules_expr = ExprNode("rulefit.predict.rules", self, frame, rule_ids)
        return H2OFrame._expr(expr=rules_expr)
Exemplo n.º 4
0
class H2OTargetEncoderEstimator(H2OEstimator):
    """
    TargetEncoder

    Estimator for the ``targetencoder`` algorithm. It is configured through the parameters accepted by
    ``__init__`` (stored in ``_parms``, most of them through validating property setters), and ``transform``
    applies the encoding to a frame through the ``/3/TargetEncoderTransform`` endpoint.
    """

    # Algorithm identifier of this estimator.
    algo = "targetencoder"
    # Marks the estimator as supervised. NOTE(review): presumably consumed by H2OEstimator — confirm.
    supervised_learning = True

    @deprecated_params({
        'k': 'inflection_point',
        'f': 'smoothing',
        'noise_level': 'noise'
    })
    def __init__(
            self,
            model_id=None,  # type: Optional[Union[None, str, H2OEstimator]]
            training_frame=None,  # type: Optional[Union[None, str, H2OFrame]]
            fold_column=None,  # type: Optional[str]
            response_column=None,  # type: Optional[str]
            ignored_columns=None,  # type: Optional[List[str]]
            columns_to_encode=None,  # type: Optional[List[List[str]]]
            keep_original_categorical_columns=True,  # type: bool
            blending=False,  # type: bool
            inflection_point=10.0,  # type: float
            smoothing=20.0,  # type: float
            data_leakage_handling="none",  # type: Literal["leave_one_out", "k_fold", "none"]
            noise=0.01,  # type: float
            seed=-1,  # type: int
    ):
        """
        :param model_id: Destination id for this model; it is auto-generated when not specified.
               Defaults to ``None``.
        :type model_id: Union[None, str, H2OEstimator], optional
        :param training_frame: Id of the training data frame.
               Defaults to ``None``.
        :type training_frame: Union[None, str, H2OFrame], optional
        :param fold_column: Column holding the cross-validation fold index assignment of each observation.
               Defaults to ``None``.
        :type fold_column: str, optional
        :param response_column: Response variable column.
               Defaults to ``None``.
        :type response_column: str, optional
        :param ignored_columns: Names of the columns to ignore for training.
               Defaults to ``None``.
        :type ignored_columns: List[str], optional
        :param columns_to_encode: List of categorical columns, or of groups of categorical columns, to encode. When
               groups of columns are given, each group is encoded as a single column (interactions are created
               internally).
               Defaults to ``None``.
        :type columns_to_encode: List[List[str]], optional
        :param keep_original_categorical_columns: If true, the original non-encoded categorical features are kept in
               the result frame.
               Defaults to ``True``.
        :type keep_original_categorical_columns: bool
        :param blending: If true, posterior probabilities (computed for a given categorical value) are blended with
               prior probabilities (computed on the entire set). This mitigates the effect of categorical values
               with small cardinality. The blending effect is tuned with the `inflection_point` and `smoothing`
               parameters.
               Defaults to ``False``.
        :type blending: bool
        :param inflection_point: Inflection point of the sigmoid used to blend probabilities (see the `blending`
               parameter). If a given categorical value appears less than `inflection_point` in a data sample, the
               influence of the posterior probability will be smaller than the prior.
               Defaults to ``10.0``.
        :type inflection_point: float
        :param smoothing: Smoothing factor; it corresponds to the inverse of the slope at the inflection point on
               the sigmoid used to blend probabilities (see the `blending` parameter). As smoothing tends towards
               0, the sigmoid used for blending turns into a Heaviside step function.
               Defaults to ``20.0``.
        :type smoothing: float
        :param data_leakage_handling: Data leakage handling strategy used to generate the encoding. Supported
               options are:
               1) "none" (default) - no holdout, using the entire training frame.
               2) "leave_one_out" - current row's response value is subtracted from the per-level frequencies
               pre-calculated on the entire training frame.
               3) "k_fold" - encodings for a fold are generated based on out-of-fold data.

               Defaults to ``"none"``.
        :type data_leakage_handling: Literal["leave_one_out", "k_fold", "none"]
        :param noise: Amount of noise added to the encoded column. Use 0 to disable noise, and -1 (=AUTO) to let
               the algorithm determine a reasonable amount of noise.
               Defaults to ``0.01``.
        :type noise: float
        :param seed: Seed used to generate the noise. By default, the seed is chosen randomly.
               Defaults to ``-1``.
        :type seed: int
        """
        super(H2OTargetEncoderEstimator, self).__init__()
        self._parms = {}
        # The model id is kept both on the instance and in the parameter map.
        self._id = model_id
        self._parms['model_id'] = model_id
        # Every other argument goes through its property setter, in signature order, so that the
        # setter's checks run on the constructor values as well.
        for name, value in (
                ("training_frame", training_frame),
                ("fold_column", fold_column),
                ("response_column", response_column),
                ("ignored_columns", ignored_columns),
                ("columns_to_encode", columns_to_encode),
                ("keep_original_categorical_columns", keep_original_categorical_columns),
                ("blending", blending),
                ("inflection_point", inflection_point),
                ("smoothing", smoothing),
                ("data_leakage_handling", data_leakage_handling),
                ("noise", noise),
                ("seed", seed),
        ):
            setattr(self, name, value)

    @property
    def training_frame(self):
        """
        Id of the data frame used for training.

        Type: ``Union[None, str, H2OFrame]``.

        :examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic["survived"] = titanic["survived"].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(inflection_point=35, smoothing=25, blending=True)
        >>> titanic_te.train(x=predictors, y=response, training_frame=titanic)
        >>> titanic_te
        """
        params = self._parms
        return params.get("training_frame")

    @training_frame.setter
    def training_frame(self, training_frame):
        """
        Store the training frame after passing it through ``H2OFrame._validate``.

        :param training_frame: the new value for the ``training_frame`` parameter.
        """
        validated = H2OFrame._validate(training_frame, 'training_frame')
        self._parms["training_frame"] = validated

    @property
    def fold_column(self):
        """
        Column holding the cross-validation fold index assignment of each observation.

        Type: ``str``.

        :examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic["survived"] = titanic["survived"].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(inflection_point=35, smoothing=25, blending=True)
        >>> titanic_te.train(x=predictors, y=response, training_frame=titanic)
        >>> titanic_te
        """
        params = self._parms
        return params.get("fold_column")

    @fold_column.setter
    def fold_column(self, fold_column):
        """
        Type-check and store the ``fold_column`` parameter.

        :param fold_column: ``None`` or a ``str``.
        """
        assert_is_type(fold_column, None, str)
        params = self._parms
        params["fold_column"] = fold_column

    @property
    def response_column(self):
        """
        Column that holds the response variable.

        Type: ``str``.
        """
        params = self._parms
        return params.get("response_column")

    @response_column.setter
    def response_column(self, response_column):
        """
        Type-check and store the ``response_column`` parameter.

        :param response_column: ``None`` or a ``str``.
        """
        assert_is_type(response_column, None, str)
        params = self._parms
        params["response_column"] = response_column

    @property
    def ignored_columns(self):
        """
        Names of the columns that are ignored for training.

        Type: ``List[str]``.
        """
        params = self._parms
        return params.get("ignored_columns")

    @ignored_columns.setter
    def ignored_columns(self, ignored_columns):
        """
        Type-check and store the ``ignored_columns`` parameter.

        :param ignored_columns: ``None`` or a list of ``str``.
        """
        assert_is_type(ignored_columns, None, [str])
        params = self._parms
        params["ignored_columns"] = ignored_columns

    @property
    def columns_to_encode(self):
        """
        Categorical columns, or groups of categorical columns, to encode. A group of columns is encoded as a single
        column (interactions are created internally).

        Type: ``List[List[str]]``.
        """
        params = self._parms
        return params.get("columns_to_encode")

    @columns_to_encode.setter
    def columns_to_encode(self, columns_to_encode):
        """
        Type-check the ``columns_to_encode`` parameter and store it in nested-list form.

        :param columns_to_encode: ``None`` or a list whose items are either a ``str`` or a list of ``str``.
            A bare ``str`` item is wrapped into a one-element list before storing; a falsy value (``None`` or an
            empty list) is stored unchanged.
        """
        assert_is_type(columns_to_encode, None, [U(str, [str])])
        normalized = columns_to_encode
        if columns_to_encode:
            # standardize as a nested list: every entry becomes a group of column names
            normalized = []
            for group in columns_to_encode:
                if isinstance(group, str):
                    normalized.append([group])
                else:
                    normalized.append(group)
        self._parms["columns_to_encode"] = normalized

    @property
    def keep_original_categorical_columns(self):
        """
        If true, the original non-encoded categorical features are kept in the result frame.

        Type: ``bool``, defaults to ``True``.
        """
        params = self._parms
        return params.get("keep_original_categorical_columns")

    @keep_original_categorical_columns.setter
    def keep_original_categorical_columns(self, keep_original_categorical_columns):
        """
        Type-check and store the ``keep_original_categorical_columns`` parameter.

        :param keep_original_categorical_columns: ``None`` or a ``bool``.
        """
        assert_is_type(keep_original_categorical_columns, None, bool)
        params = self._parms
        params["keep_original_categorical_columns"] = keep_original_categorical_columns

    @property
    def blending(self):
        """
        If true, posterior probabilities (computed for a given categorical value) are blended with prior
        probabilities (computed on the entire set). This mitigates the effect of categorical values with small
        cardinality. The blending effect is tuned with the `inflection_point` and `smoothing` parameters.

        Type: ``bool``, defaults to ``False``.

        :examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic["survived"] = titanic["survived"].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(inflection_point=35, smoothing=25, blending=True)
        >>> titanic_te.train(x=predictors, y=response, training_frame=titanic)
        >>> titanic_te
        """
        params = self._parms
        return params.get("blending")

    @blending.setter
    def blending(self, blending):
        """
        Type-check and store the ``blending`` parameter.

        :param blending: ``None`` or a ``bool``.
        """
        assert_is_type(blending, None, bool)
        params = self._parms
        params["blending"] = blending

    @property
    def inflection_point(self):
        """
        Inflection point of the sigmoid used to blend probabilities (see the `blending` parameter). If a given
        categorical value appears less than `inflection_point` in a data sample, the influence of the posterior
        probability will be smaller than the prior.

        Type: ``float``, defaults to ``10.0``.

        :examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic["survived"] = titanic["survived"].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(inflection_point=35, smoothing=25, blending=True)
        >>> titanic_te.train(x=predictors, y=response, training_frame=titanic)
        >>> titanic_te
        """
        params = self._parms
        return params.get("inflection_point")

    @inflection_point.setter
    def inflection_point(self, inflection_point):
        """
        Type-check and store the ``inflection_point`` parameter.

        :param inflection_point: ``None`` or a number.
        """
        assert_is_type(inflection_point, None, numeric)
        params = self._parms
        params["inflection_point"] = inflection_point

    @property
    def smoothing(self):
        """
        Smoothing factor; it corresponds to the inverse of the slope at the inflection point on the sigmoid used to
        blend probabilities (see the `blending` parameter). As smoothing tends towards 0, the sigmoid used for
        blending turns into a Heaviside step function.

        Type: ``float``, defaults to ``20.0``.

        :examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic["survived"] = titanic["survived"].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(inflection_point=35, smoothing=25, blending=True)
        >>> titanic_te.train(x=predictors, y=response, training_frame=titanic)
        >>> titanic_te
        """
        params = self._parms
        return params.get("smoothing")

    @smoothing.setter
    def smoothing(self, smoothing):
        """
        Type-check and store the ``smoothing`` parameter.

        :param smoothing: ``None`` or a number.
        """
        assert_is_type(smoothing, None, numeric)
        params = self._parms
        params["smoothing"] = smoothing

    @property
    def data_leakage_handling(self):
        """
        Strategy used to handle data leakage while generating the encoding. Supported options are:
        1) "none" (default) - no holdout, using the entire training frame.
        2) "leave_one_out" - current row's response value is subtracted from the per-level frequencies
        pre-calculated on the entire training frame.
        3) "k_fold" - encodings for a fold are generated based on out-of-fold data.

        Type: ``Literal["leave_one_out", "k_fold", "none"]``, defaults to ``"none"``.

        :examples:

        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic["survived"] = titanic["survived"].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(inflection_point=35, smoothing=25,
        ...                                        data_leakage_handling="k_fold", blending=True)
        >>> titanic_te.train(x=predictors, y=response, training_frame=titanic)
        >>> titanic_te
        """
        params = self._parms
        return params.get("data_leakage_handling")

    @data_leakage_handling.setter
    def data_leakage_handling(self, data_leakage_handling):
        """
        Type-check and store the ``data_leakage_handling`` parameter.

        :param data_leakage_handling: ``None`` or one of "leave_one_out", "k_fold", "none".
        """
        allowed = Enum("leave_one_out", "k_fold", "none")
        assert_is_type(data_leakage_handling, None, allowed)
        self._parms["data_leakage_handling"] = data_leakage_handling

    @property
    def noise(self):
        """
        Amount of noise added to the encoded column. Use 0 to disable noise, and -1 (=AUTO) to let the algorithm
        determine a reasonable amount of noise.

        Type: ``float``, defaults to ``0.01``.
        """
        params = self._parms
        return params.get("noise")

    @noise.setter
    def noise(self, noise):
        """
        Type-check and store the ``noise`` parameter.

        :param noise: ``None`` or a number.
        """
        assert_is_type(noise, None, numeric)
        params = self._parms
        params["noise"] = noise

    @property
    def seed(self):
        """
        Seed for the noise generation. By default, the seed is chosen randomly.

        Type: ``int``, defaults to ``-1``.
        """
        params = self._parms
        return params.get("seed")

    @seed.setter
    def seed(self, seed):
        """
        Type-check and store the ``seed`` parameter.

        :param seed: ``None`` or an ``int``.
        """
        assert_is_type(seed, None, int)
        params = self._parms
        params["seed"] = seed

    # Deprecated aliases, each mapped to the property that replaces it.
    k = deprecated_property('k', replaced_by=inflection_point)
    f = deprecated_property('f', replaced_by=smoothing)
    noise_level = deprecated_property('noise_level', replaced_by=noise)

    def transform(self,
                  frame,
                  blending=None,
                  inflection_point=None,
                  smoothing=None,
                  noise=None,
                  as_training=False,
                  **kwargs):
        """
        Encode a frame using the encoding maps generated during the `train()` method call.

        :param H2OFrame frame: the frame on which to apply the target encoding transformations.
        :param boolean blending: If provided, this overrides the `blending` parameter on the model.
        :param float inflection_point: If provided, this overrides the `inflection_point` parameter on the model.
        :param float smoothing: If provided, this overrides the `smoothing` parameter on the model.
        :param float noise: If provided, this overrides the amount of random noise added to the target encoding
            defined on the model, this helps prevent overfitting.
        :param boolean as_training: Must be set to True when encoding the training frame. Defaults to False.
        :param kwargs: only the deprecated `seed` and `data_leakage_handling` keywords are accepted; each of them
            triggers an H2ODeprecationWarning, and a `data_leakage_handling` other than None/"none" switches
            `as_training` to True. Any other keyword raises a TypeError.
        :return: the frame obtained from `h2o.get_frame` for the name reported by the transform endpoint.

        :example:
        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic[response] = titanic[response].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out",
        ...                                        inflection_point=35, smoothing=25,
        ...                                        blending=True, seed=1234)
        >>> titanic_te.train(x=predictors, y=response, training_frame=titanic)
        >>> transformed = titanic_te.transform(frame=titanic)
        """
        deprecated_names = ('seed', 'data_leakage_handling')
        for name in kwargs:
            # Anything that is not a known deprecated keyword is rejected like a regular unknown argument.
            if name not in deprecated_names:
                raise TypeError(
                    "transform() got an unexpected keyword argument '%s'" % name)
            warnings.warn(
                "`%s` is deprecated in `transform` method and will be ignored. "
                "Instead, please ensure that it was set before training on the H2OTargetEncoderEstimator model."
                % name, H2ODeprecationWarning)

        if 'data_leakage_handling' in kwargs:
            legacy_dlh = kwargs['data_leakage_handling']
            assert_is_type(legacy_dlh, None, Enum("leave_one_out", "k_fold", "none"))
            if not (legacy_dlh is None or legacy_dlh.lower() == "none"):
                # A real leakage strategy on the legacy keyword maps onto `as_training=True`.
                warnings.warn(
                    "Deprecated `data_leakage_handling=%s` is replaced by `as_training=True`. "
                    "Please update your code." % legacy_dlh, H2ODeprecationWarning)
                as_training = True

        params = {
            "model": self.model_id,
            "frame": frame.key,
            # blending is always sent: an unset value cannot be represented, so fall back to the model's setting
            "blending": self.blending if blending is None else blending,
            "inflection_point": inflection_point,
            "smoothing": smoothing,
            "noise": noise,
            "as_training": as_training,
        }

        output = h2o.api("GET /3/TargetEncoderTransform", data=params)
        return h2o.get_frame(output["name"])