예제 #1
0
    def binary_encoder(self, df, configger):
        """
        Fit a BinaryEncoder on the training data and return the encoded result.

        ``self.get_Xy(df, configger)`` supplies the X and y passed to
        ``fit_transform`` as well as the columns to encode. The encoder is always
        built with ``verbose=1`` and ``return_df=True``; neither is looked up in
        ``configger`` by this method.

        :param df: the train dataset.
        :param configger: the json str of configger setting. The options below are
            looked up with ``set_default_vale`` using the stated fallback:
            drop_invariant: bool (fallback False)
                boolean for whether or not to drop columns with 0 variance.
            handle_missing: str (fallback 'value')
                options are 'error', 'return_nan', 'value', and 'indicator'. Warning: if indicator is used,
                an extra column will be added in if the transform matrix has nan values.  This can cause
                unexpected changes in dimension in some cases.
            handle_unknown: str (fallback 'value')
                options are 'error', 'return_nan', 'value', and 'indicator'. Warning: if indicator is used,
                an extra column will be added in if the transform matrix has unknown categories.  This can cause
                unexpected changes in dimension in some cases.
        :return: the transform result
        """
        features, target, columns = self.get_Xy(df, configger)

        # Options are looked up in the same order as they are listed here.
        options = {
            "drop_invariant": set_default_vale("drop_invariant", configger, False, is_bool=True),
            "handle_missing": set_default_vale("handle_missing", configger, "value"),
            "handle_unknown": set_default_vale("handle_unknown", configger, "value"),
        }

        binary = BinaryEncoder(verbose=1, cols=columns, return_df=True, **options)
        return binary.fit_transform(features, target)
예제 #2
0
    def hash_encoder(self, df, configger):
        """
        Fit a HashingEncoder on the training data and return the encoded result.

        ``self.get_Xy(df, configger)`` supplies the X and y passed to
        ``fit_transform`` as well as the columns to encode. The encoder is always
        built with ``verbose=1`` and ``return_df=True``; neither is looked up in
        ``configger`` by this method.

        :param df: the train dataset.
        :param configger: the json str of configger setting. The options below are
            looked up with ``set_default_vale`` using the stated fallback:
            drop_invariant: bool (fallback False)
                boolean for whether or not to drop columns with 0 variance.
            max_process: int (fallback 0)
                how many processes to use in transform(). Limited in range(1, 64).
                By default, it uses half of the logical CPUs.
                For example, 4C4T makes max_process=2, 4C8T makes max_process=4.
                Set it larger if you have a strong CPU.
                It is not recommended to set it larger than the count of the
                logical CPUs as it will actually slow down the encoding.
            max_sample: int (fallback 0)
                how many samples to encode by each process at a time.
                This setting is useful on low memory machines.
                By default, max_sample=(all samples num)/(max_process).
                For example, 4C8T CPU with 100,000 samples makes max_sample=25,000,
                6C12T CPU with 100,000 samples makes max_sample=16,666.
                It is not recommended to set it larger than the default value.
            n_components: int (fallback 8)
                forwarded to HashingEncoder as ``n_components``.
            hash_method: str (fallback 'md5')
                which hashing method to use. Any method from hashlib works.

        :return: the transform result
        """
        features, target, columns = self.get_Xy(df, configger)

        # Options are looked up in the same order as they are listed here.
        options = {
            "drop_invariant": set_default_vale("drop_invariant", configger, False, is_bool=True),
            "max_process": set_default_vale("max_process", configger, 0),
            "max_sample": set_default_vale("max_sample", configger, 0),
            "n_components": set_default_vale("n_components", configger, 8),
            "hash_method": set_default_vale("hash_method", configger, "md5"),
        }

        hasher = HashingEncoder(verbose=1, cols=columns, return_df=True, **options)
        return hasher.fit_transform(features, target)
예제 #3
0
    def one_hot_encoder(self, df, configger):
        """
        Fit a OneHotEncoder on the training data and return the encoded result.

        ``self.get_Xy(df, configger)`` supplies the X and y passed to
        ``fit_transform`` as well as the columns to encode. The encoder is always
        built with ``verbose=1`` and ``return_df=True``; neither is looked up in
        ``configger`` by this method.

        :param df: the train dataset.
        :param configger: the json str of configger setting. The options below are
            looked up with ``set_default_vale`` using the stated fallback:
            drop_invariant: bool (fallback False)
                boolean for whether or not to drop columns with 0 variance.
            handle_missing: str (fallback 'value')
                options are 'error', 'return_nan', 'value', and 'indicator'. Warning: if indicator is used,
                an extra column will be added in if the transform matrix has nan values.  This can cause
                unexpected changes in dimension in some cases.
            handle_unknown: str (fallback 'value')
                options are 'error', 'return_nan', 'value', and 'indicator'. Warning: if indicator is used,
                an extra column will be added in if the transform matrix has unknown categories.  This can cause
                unexpected changes in dimension in some cases.
            use_cat_names: bool (fallback False)
                if True, category values will be included in the encoded column names. Since this can result
                in duplicate column names, duplicates are suffixed with '#' symbol until a unique name is generated.
                If False, category indices will be used instead of the category values.

        :return: the transform result
        """
        features, target, columns = self.get_Xy(df, configger)

        # Options are looked up in the same order as they are listed here.
        options = {
            "drop_invariant": set_default_vale("drop_invariant", configger, False, is_bool=True),
            "handle_missing": set_default_vale("handle_missing", configger, "value"),
            "handle_unknown": set_default_vale("handle_unknown", configger, "value"),
            "use_cat_names": set_default_vale("use_cat_names", configger, False, is_bool=True),
        }

        one_hot = OneHotEncoder(verbose=1, cols=columns, return_df=True, **options)
        return one_hot.fit_transform(features, target)
예제 #4
0
    def target_encoder(self, df, configger):
        """
        Fit a TargetEncoder on the training data and return the encoded result.

        ``self.get_Xy(df, configger)`` supplies the X and y passed to
        ``fit_transform`` as well as the columns to encode. The encoder is always
        built with ``verbose=1`` and ``return_df=True``; neither is looked up in
        ``configger`` by this method.

        :param df: the train dataset.
        :param configger: the json str of configger setting. The options below are
            looked up with ``set_default_vale`` using the stated fallback:
            drop_invariant: bool (fallback False)
                boolean for whether or not to drop columns with 0 variance.
            handle_missing: str (fallback 'value')
                options are 'error', 'return_nan'  and 'value'; 'value' returns the target mean.
            handle_unknown: str (fallback 'value')
                options are 'error', 'return_nan' and 'value'; 'value' returns the target mean.
            min_samples_leaf: int (fallback 1)
                minimum samples to take category average into account.
            smoothing: float (fallback 1.0)
                smoothing effect to balance categorical average vs prior. Higher value means stronger regularization.
                The value must be strictly bigger than 0.

        :return: the transform result
        """
        features, target, columns = self.get_Xy(df, configger)

        # Options are looked up in the same order as they are listed here.
        options = {
            "drop_invariant": set_default_vale("drop_invariant", configger, False, is_bool=True),
            "handle_missing": set_default_vale("handle_missing", configger, "value"),
            "handle_unknown": set_default_vale("handle_unknown", configger, "value"),
            "min_samples_leaf": set_default_vale("min_samples_leaf", configger, 1),
            "smoothing": set_default_vale("smoothing", configger, 1.0),
        }

        target_enc = TargetEncoder(verbose=1, cols=columns, return_df=True, **options)
        return target_enc.fit_transform(features, target)
예제 #5
0
    def sum_encoder(self, df, configger):
        """
        Fit a SumEncoder on the training data and return the encoded result.

        ``self.get_Xy(df, configger)`` supplies the X and y passed to
        ``fit_transform`` as well as the columns to encode. The encoder is always
        built with ``verbose=1`` and ``return_df=True``; neither is looked up in
        ``configger`` by this method. No ``mapping`` option is read or forwarded
        to the encoder here.

        :param df: the train dataset.
        :param configger: the json str of configger setting. The options below are
            looked up with ``set_default_vale`` using the stated fallback:
            drop_invariant: bool (fallback False)
                boolean for whether or not to drop columns with 0 variance.
            handle_missing: str (fallback 'value')
                forwarded to SumEncoder as ``handle_missing``; see the SumEncoder
                documentation for the accepted options.
            handle_unknown: str (fallback 'value')
                forwarded to SumEncoder as ``handle_unknown``; see the SumEncoder
                documentation for the accepted options.

        :return: the transform result
        """
        features, target, columns = self.get_Xy(df, configger)

        # Options are looked up in the same order as they are listed here.
        options = {
            "drop_invariant": set_default_vale("drop_invariant", configger, False, is_bool=True),
            "handle_missing": set_default_vale("handle_missing", configger, "value"),
            "handle_unknown": set_default_vale("handle_unknown", configger, "value"),
        }

        sum_enc = SumEncoder(verbose=1, cols=columns, return_df=True, **options)
        return sum_enc.fit_transform(features, target)
예제 #6
0
    def leaveone_encoder(self, df, configger):
        """
        Fit a LeaveOneOutEncoder on the training data and return the encoded result.

        ``self.get_Xy(df, configger)`` supplies the X and y passed to
        ``fit_transform`` as well as the columns to encode. The encoder is always
        built with ``verbose=1`` and ``return_df=True``; neither is looked up in
        ``configger`` by this method.

        :param df: the train dataset.
        :param configger: the json str of configger setting. The options below are
            looked up with ``set_default_vale`` using the stated fallback:
            drop_invariant: bool (fallback False)
                boolean for whether or not to drop columns with 0 variance.
            handle_missing: str (fallback 'value')
                options are 'error', 'return_nan'  and 'value'; 'value' returns the target mean.
            handle_unknown: str (fallback 'value')
                options are 'error', 'return_nan' and 'value'; 'value' returns the target mean.
            random_state: (fallback None)
                forwarded to LeaveOneOutEncoder as ``random_state``.
            sigma: float (fallback None)
                adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing
                data are untouched). Sigma gives the standard deviation (spread or "width") of the normal distribution.
                The optimal value is commonly between 0.05 and 0.6. The default is to not add noise, but that leads
                to significantly suboptimal results.
        :return: the transform result
        """
        features, target, columns = self.get_Xy(df, configger)

        # Options are looked up in the same order as they are listed here.
        options = {
            "drop_invariant": set_default_vale("drop_invariant", configger, False, is_bool=True),
            "handle_missing": set_default_vale("handle_missing", configger, "value"),
            "handle_unknown": set_default_vale("handle_unknown", configger, "value"),
            "random_state": set_default_vale("random_state", configger, None),
            "sigma": set_default_vale("sigma", configger, None),
        }

        leave_one_out = LeaveOneOutEncoder(verbose=1, cols=columns, return_df=True, **options)
        return leave_one_out.fit_transform(features, target)
예제 #7
0
    def weight_of_evidence_encoder(self, df, configger):
        """
        Fit a WOEEncoder on the training data and return the encoded result.

        ``self.get_Xy(df, configger)`` supplies the X and y passed to
        ``fit_transform`` as well as the columns to encode. The encoder is always
        built with ``verbose=1`` and ``return_df=True``; neither is looked up in
        ``configger`` by this method.

        :param df: the train dataset.
        :param configger: the json str of configger setting. The options below are
            looked up with ``set_default_vale`` using the stated fallback:
            drop_invariant: bool (fallback False)
                boolean for whether or not to drop columns with 0 variance.
            handle_missing: str (fallback 'value')
                options are 'return_nan', 'error' and 'value'; 'value' will assume WOE=0.
            handle_unknown: str (fallback 'value')
                options are 'return_nan', 'error' and 'value'; 'value' will assume WOE=0.
            random_state: (fallback None)
                forwarded to WOEEncoder as ``random_state``.
            randomized: bool (fallback False)
                adds normal (Gaussian) distribution noise into training data in order to decrease overfitting
                (testing data are untouched).
            sigma: float (fallback 0.05)
                standard deviation (spread or "width") of the normal distribution.
            regularization: float (fallback 1.0)
                the purpose of regularization is mostly to prevent division by zero.
                When regularization is 0, you may encounter division by zero.

        :return: the transform result
        """
        features, target, columns = self.get_Xy(df, configger)

        # Options are looked up in the same order as they are listed here.
        options = {
            "drop_invariant": set_default_vale("drop_invariant", configger, False, is_bool=True),
            "handle_missing": set_default_vale("handle_missing", configger, "value"),
            "handle_unknown": set_default_vale("handle_unknown", configger, "value"),
            "random_state": set_default_vale("random_state", configger, None),
            "randomized": set_default_vale("randomized", configger, False, is_bool=True),
            "sigma": set_default_vale("sigma", configger, 0.05),
            "regularization": set_default_vale("regularization", configger, 1.0),
        }

        woe = WOEEncoder(verbose=1, cols=columns, return_df=True, **options)
        return woe.fit_transform(features, target)
예제 #8
0
    def mestimate_encoder(self, df, configger):
        """
        Fit a MEstimateEncoder on the training data and return the encoded result.

        ``self.get_Xy(df, configger)`` supplies the X and y passed to
        ``fit_transform`` as well as the columns to encode. The encoder is always
        built with ``verbose=1`` and ``return_df=True``; neither is looked up in
        ``configger`` by this method.

        :param df: the train dataset.
        :param configger: the json str of configger setting. The options below are
            looked up with ``set_default_vale`` using the stated fallback:
            drop_invariant: bool (fallback False)
                boolean for whether or not to drop encoded columns with 0 variance.
            handle_missing: str (fallback 'value')
                options are 'return_nan', 'error' and 'value'; 'value' returns the prior probability.
            handle_unknown: str (fallback 'value')
                options are 'return_nan', 'error' and 'value'; 'value' returns the prior probability.
            random_state: (fallback None)
                forwarded to MEstimateEncoder as ``random_state``.
            randomized: bool (fallback False)
                adds normal (Gaussian) distribution noise into training data in order to decrease overfitting
                (testing data are untouched).
            sigma: float (fallback 0.05)
                standard deviation (spread or "width") of the normal distribution.
            m: float (fallback 1.0)
                this is the "m" in the m-probability estimate. Higher value of m results into stronger shrinking.
                M is non-negative.

        :return: the transform result
        """
        features, target, columns = self.get_Xy(df, configger)

        # Options are looked up in the same order as they are listed here.
        options = {
            "drop_invariant": set_default_vale("drop_invariant", configger, False, is_bool=True),
            "handle_missing": set_default_vale("handle_missing", configger, "value"),
            "handle_unknown": set_default_vale("handle_unknown", configger, "value"),
            "random_state": set_default_vale("random_state", configger, None),
            "randomized": set_default_vale("randomized", configger, False, is_bool=True),
            "sigma": set_default_vale("sigma", configger, 0.05),
            "m": set_default_vale("m", configger, 1.0),
        }

        m_estimate = MEstimateEncoder(verbose=1, cols=columns, return_df=True, **options)
        return m_estimate.fit_transform(features, target)
예제 #9
0
    def catboost_encoder(self, df, configger):
        """
        Fit a CatBoostEncoder on the training data and return the encoded result.

        ``self.get_Xy(df, configger)`` supplies the X and y passed to
        ``fit_transform`` as well as the columns to encode. The encoder is always
        built with ``verbose=1`` and ``return_df=True``; neither is looked up in
        ``configger`` by this method.

        :param df: the train dataset.
        :param configger: the json str of configger setting. The options below are
            looked up with ``set_default_vale`` using the stated fallback:
            drop_invariant: bool (fallback False)
                boolean for whether or not to drop columns with 0 variance.
            handle_missing: str (fallback 'value')
                options are 'error', 'return_nan'  and 'value'; 'value' returns the target mean.
            handle_unknown: str (fallback 'value')
                options are 'error', 'return_nan' and 'value'; 'value' returns the target mean.
            random_state: (fallback None)
                forwarded to CatBoostEncoder as ``random_state``.
            sigma: float (fallback None)
                adds normal (Gaussian) distribution noise into training data in order to decrease overfitting
                (testing data are untouched).
                sigma gives the standard deviation (spread or "width") of the normal distribution.
            a: float (fallback 1)
                additive smoothing (it is the same variable as "m" in m-probability estimate).

        :return: the transform result
        """
        features, target, columns = self.get_Xy(df, configger)

        # Options are looked up in the same order as they are listed here.
        options = {
            "drop_invariant": set_default_vale("drop_invariant", configger, False, is_bool=True),
            "handle_missing": set_default_vale("handle_missing", configger, "value"),
            "handle_unknown": set_default_vale("handle_unknown", configger, "value"),
            "random_state": set_default_vale("random_state", configger, None),
            "sigma": set_default_vale("sigma", configger, None),
            "a": set_default_vale("a", configger, 1),
        }

        catboost = CatBoostEncoder(verbose=1, cols=columns, return_df=True, **options)
        return catboost.fit_transform(features, target)