# requires: from sklearn.base import BaseEstimator, TransformerMixin,
# plus numpy as np, pandas as pd, and category_encoders' WOEEncoder
class CustomEncoder(BaseEstimator, TransformerMixin):
    """Route high-cardinality categoricals to WOE, the rest to dummies."""

    def __init__(self):
        self.woe = None
        self.woe_cols = None
        self.dummy_cols = None
        self.thresh = None

    @staticmethod
    def thresh_func(n):
        # cardinality threshold that grows slowly with the number of rows n
        return n // 50 + np.ceil(np.log2(n))

    def fit(self, X, y):
        self.woe_cols = []
        self.dummy_cols = []
        self.thresh = self.thresh_func(len(X))
        cat_cols = X.dtypes[X.dtypes == "object"].index
        for col in cat_cols:
            # high-cardinality columns get WOE; the rest get dummy variables
            if len(X[col].unique()) > self.thresh:
                self.woe_cols.append(col)
            else:
                self.dummy_cols.append(col)
        self.woe = WOEEncoder(drop_invariant=True,
                              random_state=1234,
                              cols=self.woe_cols)
        self.woe.fit(X, y)
        return self

    def transform(self, X, y=None):
        X = self.woe.transform(X, y)
        X = pd.get_dummies(X, columns=self.dummy_cols)
        return X
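A minimal usage sketch for CustomEncoder (the imports and toy column names below are assumptions, not part of the original snippet):

import numpy as np
import pandas as pd
from category_encoders import WOEEncoder

rng = np.random.default_rng(0)
X = pd.DataFrame({
    "city": [f"c{i % 40}" for i in range(200)],   # 40 levels -> above threshold, WOE-encoded
    "color": ["red", "blue"] * 100,               # 2 levels -> below threshold, dummy-encoded
})
y = pd.Series(rng.integers(0, 2, 200))

enc = CustomEncoder().fit(X, y)                   # thresh_func(200) == 12
X_enc = enc.transform(X)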
Example #2
class DFWOEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = WOEEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols], y)

        return self

    def transform(self, X):
        if self.transform_cols is None:
            # NotFittedError is sklearn.exceptions.NotFittedError
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )

        new_X = X.drop(columns=self.transform_cols)
        new_X = pd.concat(
            [new_X, self.model.transform(X[self.transform_cols])], axis=1)

        return new_X

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X)
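A minimal usage sketch for DFWOEEncoder (toy data; assumes pandas is imported as pd):

X = pd.DataFrame({"cat": ["a", "b", "a", "b"], "num": [1.0, 2.0, 3.0, 4.0]})
y = pd.Series([1, 0, 1, 0])

enc = DFWOEEncoder(columns=["cat"])   # only "cat" is WOE-encoded
X_enc = enc.fit_transform(X, y)       # "num" passes through untouched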
Example #4
def get_signals(X_train, y_train, X_test, threshold):
    """
    Used to predict buy and sell signals. The function itself has no awareness
    what it is predicting. It is just a helper function used by 
    get_possible_trades().

    Target is the column that contains the target. The other columns are
    considered to be features to be used for training and prediction.

    The function uses a balanced weight of evidence scorecard to predict the 
    signals. It returns the signals array.

    Note that the function uses 70% for training and 30% for testing. The 
    date where the split happens is dependent on how much data the hist
    dataframe contains. So, the caller will not see a single split date for
    all tickers. 
    """

    log(f"- Building model with features: {X_train.columns}")

    scaler    = StandardScaler()
    encoder   = WOEEncoder()
    binner    = KBinsDiscretizer(n_bins=5, encode='ordinal')
    objectify = FunctionTransformer(func=stringify, check_inverse=False, validate=False)
    imputer   = SimpleImputer(strategy='constant', fill_value=0.0)
    clf       = LogisticRegression(class_weight='balanced', random_state=42)

    pipe = make_pipeline(scaler, binner, objectify, encoder, imputer, clf)
    pipe.fit(X_train, y_train.values)

    test_signals = (pipe.predict_proba(X_test) > threshold).astype(int)[:, 1]
    return y_train.values, test_signals.copy()
Example #5
    def fit_transform(self, df, colname, targetname):
        '''
        Fit the encoder, transform the column in df, and save the attributes
        needed by transform()/inverse_transform().

        The variable is encoded with a little added noise to reduce the risk
        of overfitting.

        Parameters
        ----------
        df : pd.DataFrame
            Data containing the colname to transform.
        colname : str
            Column name in df to be transformed.
        targetname : str
            Column name used to extract the mean target value for each
            colname category.

        Returns
        -------
        transformed_df : pd.DataFrame
            Data with the column transformed.
        '''

        assert_fit_transform_args(df, colname)
        assert_binary_target(df, targetname)
        encoded_df = df.copy()
        self._colname = colname
        from category_encoders import WOEEncoder
        generic_encoder = WOEEncoder(**self._params)
        encoded_column = generic_encoder.fit_transform(df[colname],
                                                       df[targetname])
        self.__generic_encoder = generic_encoder
        encoded_df[self._colname] = encoded_column

        # save the inverse_transform pattern for the test set (without noise)
        woe_vals_no_noise = self.transform(df)[self._colname].unique()
        original_vals = df[self._colname].unique()
        self.__pattern = dict(zip(woe_vals_no_noise, original_vals))

        return encoded_df
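assert_fit_transform_args and assert_binary_target are project-local helpers not shown in the snippet; a plausible minimal sketch (an assumption, not the original implementation):

import pandas as pd

def assert_fit_transform_args(df, colname):
    # basic input validation for fit_transform
    assert isinstance(df, pd.DataFrame), "df must be a pandas DataFrame"
    assert colname in df.columns, f"{colname} not found in df"

def assert_binary_target(df, targetname):
    # WOE requires a binary (0/1) target
    assert set(df[targetname].dropna().unique()) <= {0, 1}, \
        "target must be binary (0/1) for WOE encoding"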
Example #6
 def WOE_Encoding(self,
                  regularization: float = 1.0,
                  sigma: float = 0.05,
                  randomized: bool = False):
     """
     WOE encoding.

     :param regularization: additive smoothing used in the WOE statistics
     :param sigma: std of the Gaussian noise added when randomized is True
     :param randomized: if True, add noise during fit_transform to reduce
         target leakage
     :return: None; the configured encoder is stored in self.encoder
     """
     self.encoder = WOEEncoder(cols=self.cols,
                               regularization=regularization,
                               randomized=randomized,
                               sigma=sigma)
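For reference, a small standalone sketch of what these parameters control in category_encoders' WOEEncoder (toy data, illustrative only):

import pandas as pd
from category_encoders import WOEEncoder

X = pd.DataFrame({"cat": list("aabbccaabb")})
y = pd.Series([1, 0, 1, 1, 0, 0, 1, 0, 1, 0])

# randomized=True adds Gaussian noise with std sigma during fit_transform
# to reduce target leakage; transform on new data stays deterministic.
enc = WOEEncoder(cols=["cat"], regularization=1.0, randomized=True, sigma=0.05)
X_train_enc = enc.fit_transform(X, y)
X_test_enc = enc.transform(X)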
Example #7
def CatEncoder(X, cat_cols, tags, estimator_name, objective_type, trial, n_classes, random_state):
    if not tags["handles categorical"]:
        large_threshold = 6
        #TODO: handle numpy arrays with categorical?
        #TODO: handle multiclass / Regression
        if isinstance(X, pd.DataFrame) and isinstance(cat_cols[0], str):
            large_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() > large_threshold]
            small_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() <= large_threshold]
        elif isinstance(X, pd.DataFrame):
            large_cardinal_cats = [col for col in cat_cols if len(np.unique(X.iloc[:,col])) > large_threshold]
            small_cardinal_cats = [col for col in cat_cols if len(np.unique(X.iloc[:,col])) <= large_threshold]
        else:
            large_cardinal_cats = [col for col in cat_cols if len(np.unique(X[:,col])) > large_threshold]
            small_cardinal_cats = [col for col in cat_cols if len(np.unique(X[:,col])) <= large_threshold]

        enc_pipe = None
        cat_enc_types = ["target", "binary", "catboost"]

        if len(small_cardinal_cats) > 0:
            enc_pipe = add_to_pipe(enc_pipe, "ohe", OneHotEncoder(cols=small_cardinal_cats, drop_invariant=True))

        if len(large_cardinal_cats) > 0:
            if objective_type == "classification" and n_classes == 2:
                # WOEEncoder supports binary targets only
                cat_enc_types.append("woe")

            cat_enc_type = trial.suggest_categorical(estimator_name + " cat_enc_type", cat_enc_types)

            if cat_enc_type == "binary":
                # mapping = get_mapping(X, large_cardinal_cats)
                enc = BinaryEncoder(cols=large_cardinal_cats,
                                    # mapping=mapping
                                    )

            elif cat_enc_type == "woe":
                enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)

            elif cat_enc_type == "target":
                min_samples_leaf = 6  # TODO: calculate percentage or something else
                enc = TargetEncoder(min_samples_leaf=min_samples_leaf,
                                    cols=large_cardinal_cats)

            else: # catboost
                enc = CatBoostEncoder(cols=large_cardinal_cats,
                                      random_state=random_state)  # TODO: replace SEED
                # TODO: permute to the dataset beforehand

            enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)
    return enc_pipe
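add_to_pipe is a project-local helper that is not shown; a minimal sketch of a plausible implementation (an assumption, not the original code):

from sklearn.pipeline import Pipeline

def add_to_pipe(pipe, name, step):
    # append a named step, creating the Pipeline on first use
    if pipe is None:
        return Pipeline([(name, step)])
    pipe.steps.append((name, step))
    return pipe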
Example #8
 def __init__(self, kind, **kwargs):
     self.kind = kind
     if kind not in ('OHE', 'TE', 'LOOE', 'WOE', 'LE'):
         raise ValueError(
             "Encoder type not supported, choose one of "
             "('OHE', 'TE', 'LOOE', 'WOE', 'LE')")
     if kind == 'OHE':
         self.encoder = OneHotEncoder(**kwargs)
     elif kind == 'TE':
         self.encoder = TargetEncoder(**kwargs)
     elif kind == 'LOOE':
         self.encoder = LeaveOneOutEncoder(**kwargs)
     elif kind == 'WOE':
         self.encoder = WOEEncoder(**kwargs)
     else:  # 'LE'
         self.encoder = MultiColumnTransformer(LabelEncoder)
Example #9
def CatEncoder(X, cat_cols, tags, objective_type, trial, n_classes, random_state):
    if not tags["handles categorical"]:
        large_threshold = 6
        #TODO: handle numpy arrays with categorical?
        large_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() > large_threshold]
        small_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() <= large_threshold]

        enc_pipe = None
        cat_enc_types = ["binary", "catboost", "woe", "target"]

        if len(small_cardinal_cats) > 0:
            enc_pipe = add_to_pipe(enc_pipe, "ohe", OneHotEncoder(cols=small_cardinal_cats, drop_invariant=True))

        if len(large_cardinal_cats) > 0:
            if (objective_type == "classification" and n_classes > 2): #multiclass
                cat_enc_types = ["binary"]

            cat_enc_type = trial.suggest_categorical("cat_enc_type", cat_enc_types)

            if cat_enc_type == "binary":
                # mapping = get_mapping(X, large_cardinal_cats)
                enc = BinaryEncoder(cols=large_cardinal_cats,
                                    drop_invariant=True,
                                    # mapping=mapping
                                    )

            elif cat_enc_type == "woe":
                enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)

            elif cat_enc_type == "target":
                min_samples_leaf = 10  # TODO: calculate percentage or something else
                enc = TargetEncoder(min_samples_leaf=min_samples_leaf,
                                    cols=large_cardinal_cats,
                                    drop_invariant=True)

            else: # catboost
                enc = CatBoostEncoder(cols=large_cardinal_cats,
                                      drop_invariant=True,
                                      random_state=random_state)  # TODO: replace SEED
                # TODO: permute to the dataset beforehand

            enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)
    return enc_pipe
Example #10
def get_signals(hist, target, threshold):
    """
    Used to predict buy and sell signals. The function itself has no awareness
    what it is predicting. It is just a helper function used by 
    get_possible_trades().

    Target is the column that contains the target. The other columns are
    considered to be features to be used for training and prediction.

    The function uses a balanced weight of evidence scorecard to predict the 
    signals. It returns the signals array.

    Note that the function uses 70% for training and 30% for testing. The 
    date where the split happens is dependent on how much data the hist
    dataframe contains. So, the caller will not see a single split date for
    all tickers. 
    """
    # NB: we do not include smooth in data!
    data = hist[['Close', 'Open', 'Low', 'High']]
    data = features(data, hist, target)

    used_cols = [c for c in data.columns.tolist() if c not in [target]]
    X, y, X_train, X_test, y_train, y_test = split_data(
        data, used_cols, target, 0.7)

    encoder = WOEEncoder()
    binner = KBinsDiscretizer(n_bins=5, encode='ordinal')
    objectify = FunctionTransformer(func=stringify,
                                    check_inverse=False,
                                    validate=False)
    imputer = SimpleImputer(strategy='constant', fill_value=0.0)
    clf = LogisticRegression(class_weight='balanced', random_state=42)

    pipe = make_pipeline(binner, objectify, encoder, imputer, clf)
    pipe.fit(X_train, y_train.values)

    test_signals = (pipe.predict_proba(X_test) > threshold).astype(int)[:, 1]
    return y_train.values, test_signals
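split_data is project-local and not shown; a plausible sketch of a chronological 70/30 split consistent with the docstring (an assumption, not the original implementation):

def split_data(data, used_cols, target, train_frac):
    # chronological split: the first train_frac of rows train, the rest test
    split = int(len(data) * train_frac)
    X, y = data[used_cols], data[target]
    return X, y, X.iloc[:split], X.iloc[split:], y.iloc[:split], y.iloc[split:]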
Example #11
def doCleanupEncode(X,
                    y=None,
                    cat=None,
                    oh=None,
                    binary=None,
                    loo=None,
                    woe=None,
                    lp_cols=None,
                    NoData=True):
    from enrich import replaceCVs
    from enrich import one_hot_encode
    from category_encoders import BinaryEncoder
    from category_encoders import OneHotEncoder
    from category_encoders import WOEEncoder
    from category_encoders import LeaveOneOutEncoder

    if NoData is False:
        # `or`, not bitwise `|`: `|` binds tighter than `is not` and raises here
        if cat is not None or oh is not None:
            # translate associated columns' null, NaN, blank and 9 values to zero
            X = replaceCVs(X, (cat or []) + (oh or []), [np.nan, 9, "", " "], 0)

    if oh is not None:
        if NoData:
            ec = OneHotEncoder(cols=oh,
                               use_cat_names=True,
                               return_df=True,
                               handle_unknown='indicator',
                               handle_missing='indicator')
            X = ec.fit_transform(X)  # fit once via fit_transform
            # dropping these columns did not help performance
            # for o in oh:
            #    stem = o.split("_")[1]
            #    d1 = "L_" + stem + "_-1"
            #    d2 = "L_" + stem + "_nan"
            #    print("DROPPING ", d1, " ", d2, "\n")
            #    X.drop(d1, axis=1, errors='ignore', inplace=True)
            #    X.drop(d2, axis=1, errors='ignore', inplace=True)
        else:
            # one-hot encode, then drop 0 if created
            for oh_c in oh:
                X = one_hot_encode(X, oh_c, False)
                X.drop(0, axis=1, errors='ignore', inplace=True)

    if binary is not None:
        # binary encode binary columns
        if NoData:
            enc = BinaryEncoder(cols=binary,
                                drop_invariant=True,
                                return_df=True,
                                handle_unknown='indicator').fit(X)
            X = enc.transform(X)
        else:
            enc = BinaryEncoder(cols=binary,
                                drop_invariant=True,
                                return_df=True).fit(X)
            X = enc.transform(X)

    if woe is not None:
        # use weight of evidence on woe columns
        for w in woe:
            X[w] = X[w].fillna('NoData')

        wenc = WOEEncoder(cols=woe).fit(X, y)
        X = wenc.transform(X).round(2)

    if loo is not None:
        # use leave one out on loo columns
        for l in loo:
            X[l] = X[l].fillna('NoData')

        lenc = LeaveOneOutEncoder(cols=loo, return_df=True).fit(X, y)
        X = lenc.transform(X).round(2)

    # Cast all to int64
    # X = X.astype("int64")

    if lp_cols is not None:
        # drop least predictive
        X.drop(lp_cols, axis=1, errors="ignore", inplace=True)

    X.reset_index(drop=True, inplace=True)
    return X
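A hedged usage sketch for doCleanupEncode (all column names below are illustrative assumptions):

X_clean = doCleanupEncode(X, y=y,
                          oh=["state"],        # one-hot encoded
                          binary=["flag_a"],   # binary encoded
                          woe=["merchant"],    # weight-of-evidence encoded
                          loo=["zip3"],        # leave-one-out encoded
                          lp_cols=["noise_col"],
                          NoData=True)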
Example #12
def Convert_to_numeric(df):

    #Ordinal features
    map_ord = {
        'Novice': 0,
        'Contributor': 1,
        'Expert': 2,
        'Master': 3,
        'Grandmaster': 4,
        'Freezing': 0,
        'Cold': 1,
        'Warm': 2,
        'Hot': 3,
        'Boiling Hot': 4,
        'Lava Hot': 5
    }

    ascii_letters_list = list(string.ascii_letters)
    map_ord_hex = dict(zip(ascii_letters_list,
                           range(len(ascii_letters_list))))

    df['ord_0'] = df['ord_0']  # ord_0 is already numeric in this dataset
    df['ord_1'] = df['ord_1'].replace(map_ord)
    df['ord_2'] = df['ord_2'].replace(map_ord)
    df['ord_3'] = df['ord_3'].replace(map_ord_hex)
    df['ord_4'] = df['ord_4'].replace(map_ord_hex)

    df[features_ord] = df[features_ord].fillna(df[features_ord].mean())

    StandardScaler_Encoder = preprocessing.StandardScaler()
    df[features_ord] = StandardScaler_Encoder.fit_transform(
        df[features_ord].astype(float))

    #Binary, Low nominal and time features WOE encoder.
    n_splits = 5
    WOE_features = features_bin + features_low_nom + features_cyc
    # for col in WOE_features:
    #     df[f'{col}_Encode']=0
    #     for tr_idx, tst_idx in StratifiedKFold(n_splits=n_splits, random_state=2020, shuffle=True).split(df[:600000], y_train):
    #         WOE_encoder = WOEEncoder(cols=col)
    #         WOE_encoder.fit(df[:600000].iloc[tr_idx, :], y_train.iloc[tr_idx])
    #         col_df=WOE_encoder.transform(df)[col]/n_splits
    #         df[f'{col}_Encode']= df[f'{col}_Encode']+col_df
    WOE_features_encode = [w + '_Encode' for w in WOE_features]
    for c in WOE_features_encode:
        df[c] = 0
    for tr_idx, tst_idx in StratifiedKFold(n_splits=n_splits,
                                           random_state=2020,
                                           shuffle=True).split(
                                               df[:600000], y_train):
        WOE_encoder = WOEEncoder(cols=WOE_features)
        # tr_idx holds positional indices, so index with iloc rather than loc
        WOE_encoder.fit(df[:600000].iloc[tr_idx][WOE_features],
                        y_train.iloc[tr_idx])
        col_df = WOE_encoder.transform(df[WOE_features]) / n_splits
        # use .values so pandas does not align on the (differing) column names
        df.loc[:, WOE_features_encode] += col_df.values
    df = df.drop(WOE_features, axis=1)

    #High Nominal Features Label encoder.
    Label_col = features_hi_nom + features_hi_ord
    for col in Label_col:
        Label_Encoder = preprocessing.LabelEncoder()
        df[col] = Label_Encoder.fit_transform(
            df[col].fillna("-1").astype(str).values)

    return df
Example #14
    # Feature hashing for word features
    def hash_features(word_list, m):
        output = [0] * m
        for word in word_list:
            index = hash_fcn(word) % m
            output[index] += 1
        return output

    # Signed feature hashing (the sign bit makes collisions cancel on average);
    # renamed so it no longer shadows the unsigned version above
    def hash_features_signed(word_list, m):
        output = [0] * m
        for word in word_list:
            index = hash_fcn(word) % m
            sign_bit = sign_hash(word) % 2
            if sign_bit == 0:
                output[index] -= 1
            else:
                output[index] += 1
        return output
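    # hash_fcn and sign_hash are left undefined in the snippet; a minimal
    # sketch using hashlib (an assumption; any pair of independent hash
    # functions would do):
    import hashlib

    def hash_fcn(word):
        # stable hash of a word to a large integer
        return int(hashlib.md5(word.encode("utf-8")).hexdigest(), 16)

    def sign_hash(word):
        # independent second hash used only for the sign bit
        return int(hashlib.sha1(word.encode("utf-8")).hexdigest(), 16)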

    h = FeatureHasher(n_features=m, input_type="string")
    f = h.transform(df["feat"])

    enc = TargetEncoder(cols=['Name_of_col', 'Another_name'])
    training_set = enc.fit_transform(X_train, y_train)

    enc = LeaveOneOutEncoder(cols=['Name_of_col', 'Another_name'])
    training_set = enc.fit_transform(X_train, y_train)

    enc = WOEEncoder(cols=['Name_of_col', 'Another_name'])
    training_set = enc.fit_transform(X_train, y_train)
Example #15
from sklearn.preprocessing import OrdinalEncoder as SklOrdinalEncoder
from category_encoders import WOEEncoder, OrdinalEncoder
from skl2onnx import update_registered_converter, to_onnx, get_model_alias
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.common.utils import check_input_and_output_numbers
from skl2onnx.algebra.onnx_ops import OnnxCast
from skl2onnx.algebra.onnx_operator import OnnxSubEstimator
from skl2onnx.sklapi import WOETransformer
import skl2onnx.sklapi.register  # noqa

data = load_iris()
X, y = data.data, data.target
X = X.astype(np.int64)[:, :2]
y = (y == 2).astype(np.int64)

woe = WOEEncoder(cols=[0]).fit(X, y)
print(woe.transform(X[:5]))

########################################
# Let's look into the trained parameters of the model.
# It appears that WOEEncoder uses an OrdinalEncoder,
# but not the one from scikit-learn. We need to add a
# converter for this model too.

print("encoder", type(woe.ordinal_encoder), woe.ordinal_encoder)
print("mapping", woe.mapping)
print("encoder.mapping", woe.ordinal_encoder.mapping)
print("encoder.cols", woe.ordinal_encoder.cols)

######################################
# Custom converter for OrdinalEncoder
 def add_woe_encoding(self):
     self.pipeline.append(("WOEncoder", WOEEncoder()))