Example No. 1
def apply_weight_of_evidence_encoding(df, categorical_columns, label='y'):
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    X = df.drop([label], axis=1)
    encoder = ce.WOEEncoder(cols=categorical_columns).fit(X, df[label])
    # transform the same label-free frame the encoder was fit on
    X_transformed = encoder.transform(X)
    return X_transformed
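A minimal usage sketch for the helper above, on made-up toy data (it assumes pandas is imported as pd and DataFrameTypeError is defined in the surrounding module):

import pandas as pd

df = pd.DataFrame({
    'color': ['red', 'red', 'blue', 'blue'],
    'y': [1, 0, 0, 0],
})
encoded = apply_weight_of_evidence_encoding(df, ['color'])
print(encoded)  # 'color' replaced by its WoE values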
Example No. 2
def cal_woe(df_tr, col):
    enc = ce.WOEEncoder(cols=[col]).fit(df_tr.loc[:, [col]],
                                        df_tr.loc[:, 'isDefault'])
    tmp = pd.DataFrame({
        f'{col}': df_tr.loc[:, col],
        f'woe_{col}': enc.transform(df_tr.loc[:, [col]],
                                    df_tr.loc[:, 'isDefault'])[col]
    })
    return tmp.groupby([col])[f'woe_{col}'].mean(), f'woe_{col}'
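A usage sketch on toy data; the snippet assumes ce and pd are already imported and that df_tr carries the 'isDefault' target used above:

import pandas as pd
import category_encoders as ce

df_tr = pd.DataFrame({
    'grade': ['A', 'A', 'B', 'B'],
    'isDefault': [1, 0, 0, 0],
})
mapping, woe_col = cal_woe(df_tr, 'grade')
print(woe_col)   # 'woe_grade'
print(mapping)   # mean WoE per grade value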
Example No. 3
    def test_HandleUnknownValue_HaveUnknown_ExpectEncodedWithZero(self):
        X = ['a', 'a', 'b', 'b']
        y = [1, 0, 0, 0]
        test = ['a', 'c']
        enc = encoders.WOEEncoder(handle_unknown='value')

        enc.fit(X, y)
        result = enc.transform(test)

        expected = pd.Series([0.5108256237659906, 0], name=0)
        pd.testing.assert_series_equal(expected, result[0])
Example No. 4
    def __init__(self, encoder_type, columns_name=None):
        """
        :param encoder_type: str, name of the encoding scheme to use
        :param columns_name: list of feature (column) names to encode
        """
        if encoder_type == "BackwardDe":  # backward difference encoding
            self.encoder = ce.BackwardDifferenceEncoder(cols=columns_name)

        elif encoder_type == "BaseN":  # base-N encoding
            self.encoder = ce.BaseNEncoder(cols=columns_name)

        elif encoder_type == "Binary":  # binary encoding
            self.encoder = ce.BinaryEncoder(cols=columns_name)

        elif encoder_type == "Catboost":
            self.encoder = ce.CatBoostEncoder(cols=columns_name)

        elif encoder_type == "Hash":
            self.encoder = ce.HashingEncoder(cols=columns_name)

        elif encoder_type == "Helmert":
            self.encoder = ce.HelmertEncoder(cols=columns_name)

        elif encoder_type == "JamesStein":
            self.encoder = ce.JamesSteinEncoder(cols=columns_name)

        elif encoder_type == "LOO":  # leave-one-out encoding
            self.encoder = ce.LeaveOneOutEncoder(cols=columns_name)

        elif encoder_type == "ME":  # M-estimate encoding
            self.encoder = ce.MEstimateEncoder(cols=columns_name)

        elif encoder_type == "OneHot":
            self.encoder = ce.OneHotEncoder(cols=columns_name)

        elif encoder_type == "Ordinal":  # ordinal encoding
            self.encoder = ce.OrdinalEncoder(cols=columns_name)

        elif encoder_type == "Sum":  # sum encoding
            self.encoder = ce.SumEncoder(cols=columns_name)

        elif encoder_type == "Polynomial":  # polynomial encoding
            self.encoder = ce.PolynomialEncoder(cols=columns_name)

        elif encoder_type == "Target":  # target encoding
            self.encoder = ce.TargetEncoder(cols=columns_name)

        elif encoder_type == "WOE":  # weight-of-evidence encoding
            self.encoder = ce.WOEEncoder(cols=columns_name)

        else:
            raise ValueError("Please choose a valid encoder type")
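The if/elif chain above is one way to dispatch; a dictionary registry expresses the same mapping more compactly. A sketch of that alternative (the class name EncoderWrapper is made up here):

import category_encoders as ce

# Registry equivalent to the if/elif chain above.
_ENCODERS = {
    "BackwardDe": ce.BackwardDifferenceEncoder,
    "BaseN": ce.BaseNEncoder,
    "Binary": ce.BinaryEncoder,
    "Catboost": ce.CatBoostEncoder,
    "Hash": ce.HashingEncoder,
    "Helmert": ce.HelmertEncoder,
    "JamesStein": ce.JamesSteinEncoder,
    "LOO": ce.LeaveOneOutEncoder,
    "ME": ce.MEstimateEncoder,
    "OneHot": ce.OneHotEncoder,
    "Ordinal": ce.OrdinalEncoder,
    "Sum": ce.SumEncoder,
    "Polynomial": ce.PolynomialEncoder,
    "Target": ce.TargetEncoder,
    "WOE": ce.WOEEncoder,
}

class EncoderWrapper:
    def __init__(self, encoder_type, columns_name=None):
        try:
            self.encoder = _ENCODERS[encoder_type](cols=columns_name)
        except KeyError:
            raise ValueError("Please choose a valid encoder type")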
Example No. 5
    def test_HaveArrays_ExpectCalculatedProperly(self):
        X = ['a', 'a', 'b', 'b']
        y = [1, 0, 0, 0]
        enc = encoders.WOEEncoder()

        result = enc.fit_transform(X, y)

        expected = pd.Series([
            0.5108256237659906, 0.5108256237659906, -0.587786664902119,
            -0.587786664902119
        ],
                             name=0)
        pd.testing.assert_series_equal(expected, result[0])
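The expected constants in this test can be reproduced by hand. WOEEncoder regularizes both class distributions (regularization defaults to 1.0), so for X = ['a', 'a', 'b', 'b'] and y = [1, 0, 0, 0] the values appear to come out as below; a minimal check:

import numpy as np

# WoE = ln(P(category | event) / P(category | non-event)), with +reg added
# to each count and +2*reg to each total, as category_encoders appears to do.
reg = 1.0
total_events, total_non_events = 1, 3  # y = [1, 0, 0, 0]

def woe(n_events, n_non_events):
    p_event = (n_events + reg) / (total_events + 2 * reg)
    p_non_event = (n_non_events + reg) / (total_non_events + 2 * reg)
    return np.log(p_event / p_non_event)

print(woe(1, 1))  # 'a': ln(5/3)  ~  0.5108256237659906
print(woe(0, 2))  # 'b': ln(5/9)  ~ -0.587786664902119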
Example No. 6
    def test_HandleMissingValue_HaveMissingInTrain_ExpectEncoded(self):
        X = ['a', 'a', np.nan, np.nan]
        y = [1, 0, 0, 0]
        enc = encoders.WOEEncoder(handle_missing='value')

        result = enc.fit_transform(X, y)

        expected = pd.Series([
            0.5108256237659906, 0.5108256237659906, -0.587786664902119,
            -0.587786664902119
        ],
                             name=0)
        pd.testing.assert_series_equal(expected, result[0])
Example No. 7
def woe_encoding(X_fit, y_fit, cols, X_test=None, sigma=0):
    """
    只针对binomial target
    X_fit: 用来计算encoding的df, 包含cols
    y_fit: encoding的target
    X_test: 需要transform的对象
    cols: 需要encoding的列
    sigma: 添加噪声的标准差,防止过拟合
    """
    if X_test is None:
        X_test = X_fit
    encoder = ce.WOEEncoder(cols=cols, sigma=sigma)
    encoder.fit(X_fit, y_fit)
    result = encoder.transform(X_test)
    return result
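A quick usage sketch on made-up toy data:

import pandas as pd
import category_encoders as ce

df = pd.DataFrame({'grade': ['A', 'A', 'B', 'B', 'C', 'C']})
target = pd.Series([1, 0, 0, 0, 1, 1])

print(woe_encoding(df, target, cols=['grade']))

One caveat worth noting: in category_encoders the sigma noise appears to be applied only when randomized=True is also set (and only on transforms that receive y), so as written the sigma argument has no visible effect.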
Example No. 8
def woe_discrete(df, discrete_variable_name, target):
    """Generates woe transformation of discrete variables

    Args:
        df (pd.Dataframe): dataframe containing discrete variables to be trnasformed
        discrete_variable_name (list): list of discrete variables to be transformed
        target (str): target variable name

    Returns:
        pd.Dataframe: dataframe with woe transformed discrete variables added as columns to the original dataframe
    """

    woe_encoder = ce.WOEEncoder(cols=discrete_variable_name)
    woe_of_discrete_variables = woe_encoder.fit_transform(
        df[discrete_variable_name], df[target]).add_suffix('_woe')
    df = df.join(woe_of_discrete_variables)
    return df
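A usage sketch on made-up toy data:

import pandas as pd
import category_encoders as ce

df = pd.DataFrame({
    'colour': ['red', 'red', 'blue', 'blue'],
    'target': [1, 0, 0, 0],
})
out = woe_discrete(df, ['colour'], 'target')
print(out.columns.tolist())  # ['colour', 'target', 'colour_woe']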
Example No. 9
    def woe_encode(self, train, test, feature):
        '''
        Weight of Evidence Encoding (WOE)
        FYI the warning can be ignored; it refers to this issue:
        https://github.com/scikit-learn-contrib/category_encoders/issues/281
        '''
        train = train.copy()
        test = test.copy()
        encoder = category_encoders.WOEEncoder()

        train[f'{feature}_WOE'] = encoder.fit_transform(
            train[feature].astype("category"), train["purchased"]
        )[feature].values

        test[f'{feature}_WOE'] = encoder.transform(
            test[feature].astype("category")
        )[feature].values
        return train, test
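The method boils down to fit on train, transform test; a standalone equivalent on made-up toy data (the 'channel' and 'purchased' names are illustrative):

import pandas as pd
import category_encoders

train = pd.DataFrame({'channel': ['web', 'web', 'store', 'store'],
                      'purchased': [1, 0, 0, 0]})
test = pd.DataFrame({'channel': ['web', 'store']})

encoder = category_encoders.WOEEncoder()
train['channel_WOE'] = encoder.fit_transform(
    train['channel'].astype('category'), train['purchased'])['channel'].values
test['channel_WOE'] = encoder.transform(
    test['channel'].astype('category'))['channel'].values
print(train[['channel', 'channel_WOE']])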
Example No. 10
def define_lr_pipeline(df: pd.DataFrame, target_col: str, n_jobs: int,
                       random_state: int) -> Pipeline:
    woe = ce.WOEEncoder()
    sc = StandardScaler()

    lr = LogisticRegression(n_jobs=n_jobs, random_state=random_state)

    # from sklearn.tree import DecisionTreeClassifier
    # dt = DecisionTreeClassifier(max_depth=5, random_state=random_state)

    cat_features = (df.drop(
        columns=target_col).select_dtypes('object').columns)
    num_features = (df.drop(
        columns=target_col).select_dtypes('number').columns)

    transformer = ColumnTransformer([('woe', woe, cat_features),
                                     ('sc', sc, num_features)])

    pipeline = Pipeline([('transformer', transformer), ('clf', lr)])

    return pipeline
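A usage sketch with a made-up frame, assuming the imports used by define_lr_pipeline above are in scope; the object-dtype column lands in the WOE branch and the numeric one in the scaler:

import pandas as pd

df = pd.DataFrame({
    'job': ['admin', 'technician', 'admin', 'services'] * 25,
    'age': [25, 40, 33, 51] * 25,
    'y': [0, 1, 0, 1] * 25,
})
pipe = define_lr_pipeline(df, target_col='y', n_jobs=1, random_state=0)
pipe.fit(df.drop(columns='y'), df['y'])
print(pipe.score(df.drop(columns='y'), df['y']))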
Example No. 11
def fit(X, y, output_dir, **kwargs):
    """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data
    DataRobot runs this hook when the task is being trained inside a blueprint.
    As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data.
    The input parameters are passed by DataRobot based on project and blueprint configuration.

    Parameters
    -------
    X: pd.DataFrame
        Training data that DataRobot passes when this task is being trained.
    y: pd.Series
        Project's target column (None is passed for unsupervised projects).
    output_dir: str
        A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform().

    Returns
    -------
    None
        fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir
        so that the trained object can be used during scoring inside transform()
    """

    # Encode categorical columns numerically using Weight of Evidence
    encoder_woe = ce.WOEEncoder(cols=X.columns,
                                randomized=True,
                                handle_missing='value',
                                handle_unknown='value')
    encoder_woe.fit(X, y)

    # dump the trained object
    # into an artifact [in this example - woe.pkl]
    # and save it into output_dir so that it can be used later to transform new data
    output_dir_path = Path(output_dir)
    if output_dir_path.exists() and output_dir_path.is_dir():
        with open("{}/woe.pkl".format(output_dir), "wb") as fp:
            pickle.dump(encoder_woe, fp)
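A minimal sketch of the matching scoring-side logic, assuming only that the artifact saved above gets reloaded before transforming (load_woe_encoder is a name made up here, not part of DataRobot's API):

import pickle
from pathlib import Path

def load_woe_encoder(artifact_dir):
    # reload the encoder that fit() pickled into output_dir
    with open(Path(artifact_dir) / "woe.pkl", "rb") as fp:
        return pickle.load(fp)

def apply_woe(encoder_woe, X):
    # X must carry the same columns the encoder was fit on
    return encoder_woe.transform(X)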
Example No. 12
    'victimIsDucked',
    'victimIsDucking',
    'victimIsDefusing',
    'victimIsScoped',
    'victimHasHelmet',
    'hitgroup',
]
features = [
    "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12",
    "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22",
    "x23", "x24", "x25", "x26", "x27"
]

X0 = df.drop(['ct_wins', 't_wins'], axis=1)

encoder = ce.WOEEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)

X = X.rename(index=str,
             columns={
                 "attackerHealth": "x1",
                 "attackerXPosition": "x2",
                 "attackerYPosition": "x3",
                 "attackerZPosition": "x4",
                 "weapon": "x5",
                 "attackerSpotted": "x6",
                 "attackerSide": "x7",
                 "attackerIsScoped": "x8",
                 "attackerIsDucked": "x9",
                 "attackerIsDucking": "x10",
                 "attackerHasHelmet": "x11",
    'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff', 'soybean.arff',
    'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff',
    'vote.arff', 'vowel.arff'
]

# We ignore encoders {BackwardDifferenceEncoder, HelmertEncoder, PolynomialEncoder and SumEncoder} because of:
#   https://github.com/scikit-learn-contrib/categorical-encoding/issues/91
encoders = [
    category_encoders.BaseNEncoder(),
    category_encoders.OneHotEncoder(),
    category_encoders.BinaryEncoder(),
    category_encoders.HashingEncoder(),
    category_encoders.OrdinalEncoder(),
    category_encoders.TargetEncoder(),
    category_encoders.LeaveOneOutEncoder(),
    category_encoders.WOEEncoder()
]

# Initialization
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Ok...
warnings.filterwarnings('ignore')

# Loop over datasets, then over encoders, and finally, over the models
for dataset_name in datasets:
    X, y, fold_count = arff_loader.load(dataset_name)
    non_numeric = list(X.select_dtypes(exclude=[np.number]).columns.values)
    for encoder in encoders:
        print("Encoding:", dataset_name, y.name, encoder.__class__.__name__)
Example No. 14
# We painstakingly initialize each encoder here because that gives us the freedom to initialize the
# encoders with any setting we want.
encoders = [ #category_encoders.BackwardDifferenceEncoder(),
             category_encoders.BaseNEncoder(),
             category_encoders.BinaryEncoder(),
             category_encoders.HashingEncoder(),
             # category_encoders.HelmertEncoder(),
             category_encoders.JamesSteinEncoder(),
             category_encoders.LeaveOneOutEncoder(),
             category_encoders.MEstimateEncoder(),
             category_encoders.OneHotEncoder(),
             category_encoders.OrdinalEncoder(),
             # category_encoders.PolynomialEncoder(),
             # category_encoders.SumEncoder(),
             category_encoders.TargetEncoder(),
             category_encoders.WOEEncoder()]

encoders = [ #category_encoders.BackwardDifferenceEncoder(),
             category_encoders.BaseNEncoder(handle_missing='value'),
             category_encoders.BaseNEncoder(handle_missing='indicator'),
             category_encoders.BinaryEncoder(handle_missing='value'),
             category_encoders.BinaryEncoder(handle_missing='indicator'),
             # category_encoders.HashingEncoder(handle_missing='value'),
             # category_encoders.HashingEncoder(handle_missing='indicator'),
             # category_encoders.HelmertEncoder(),
             category_encoders.JamesSteinEncoder(handle_missing='value'),
             category_encoders.JamesSteinEncoder(handle_missing='indicator'),
             category_encoders.LeaveOneOutEncoder(handle_missing='value'),
             category_encoders.LeaveOneOutEncoder(handle_missing='indicator'),
             category_encoders.MEstimateEncoder(handle_missing='value'),
             category_encoders.MEstimateEncoder(handle_missing='indicator'),
Example No. 15
    def get_encoder(self) -> BaseEstimator:
        return ce.WOEEncoder(cols=self.target_columns)
Example No. 16
#target encoding
start_time = time.time()
target_encoder = ce.TargetEncoder(cols=cat_cols_bank, smoothing=1)
mean_target_transformed = target_encoder.fit_transform(df_bank[cat_cols_bank],
                                                       df_bank['y'])
print('computation time of target:', time.time() - start_time)
print(
    'Memory usage after encoding: ',
    round(
        mean_target_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB,
        3))

#WoE
start_time = time.time()
woe_encoder = ce.WOEEncoder(cols=cat_cols_bank)
woe_encoder_transformed = woe_encoder.fit_transform(df_bank[cat_cols_bank],
                                                    df_bank['y'])
print('computation time of WOE :', time.time() - start_time)
print(
    'Memory usage after encoding: ',
    round(
        woe_encoder_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB,
        3))

#embeddings = [('one hot encoding',df_bank_one_hot_transformed), ('label encoding',df_bank_label_transformed),
#              ('hash encoding',hash_transformed), ('target encoding',mean_target_transformed), ('WOE encoding',woe_encoder_transformed)]

#%% Train-Test split
num_fold = 5
X = label_transformed.drop(['y'],
Example No. 17
                      min_split_gain=0.0,
                      missing=-999,
                      n_estimators=500,
                      n_jobs=1,
                      num_leaves=31,
                      objective=None,
                      random_state=64,
                      reg_alpha=0.0,
                      reg_lambda=0.0,
                      silent=1,
                      subsample=0.8,
                      subsample_for_bin=200000,
                      subsample_freq=0)

pipe = Pipeline([('transformer', FeatureSelector()),
                 ('encoder', ce.WOEEncoder()), ('scaler', MinMaxScaler()),
                 ('classifier', lgbm)])

pipe.fit(train, y)

cloudpickle.dump(pipe,
                 open('titanicModel.pkl', 'wb'),
                 protocol=pickle.HIGHEST_PROTOCOL)

model = pickle.load(open('titanicModel.pkl', 'rb'))

col_dict = {i: [] for i in train.columns}

col_dict['Pclass'].append(3)
col_dict['Name'].append('asda, Mr. Ram')
col_dict['Sex'].append('male')
Example No. 18
def get_model(PARAMS):
    """return model for provided params

    :param PARAMS: dictionary with model params
    :type PARAMS: dict
    :return: model pipeline
    :rtype: sklearn pipeline
    """

    try:
        te_dict = {
            'CatBoostEncoder': ce.CatBoostEncoder(),
            'HashingEncoder': ce.HashingEncoder(),
            'HelmertEncoder': ce.HelmertEncoder(),
            'LeaveOneOutEncoder': ce.LeaveOneOutEncoder(),
            'OneHotEncoder': ce.OneHotEncoder(),
            'TargetEncoder': ce.TargetEncoder(),
            'WOEEncoder': ce.WOEEncoder(),
            'BackwardDifferenceEncoder': ce.BackwardDifferenceEncoder(),
            'BaseNEncoder': ce.BaseNEncoder(),
            'BinaryEncoder': ce.BinaryEncoder(),
            'CountEncoder': ce.CountEncoder(),
            'JamesSteinEncoder': ce.JamesSteinEncoder(),
            'MEstimateEncoder': ce.MEstimateEncoder(),
            'PolynomialEncoder': ce.PolynomialEncoder(),
            'SumEncoder': ce.SumEncoder()
        }

        pipe = make_pipeline(
            helpers.PrepareData(extraxt_year=True, unicode_text=True),
            ColumnTransformer([
                ('num', helpers.PassThroughOrReplace(), [
                    'flat_size', 'rooms', 'floor', 'number_of_floors',
                    'year_of_building', 'GC_latitude', 'GC_longitude'
                ]),
                ('te_producer', te_dict.get(PARAMS['te_producer']),
                 'producer_name'),
                ('te_road', te_dict.get(PARAMS['te_road']), 'GC_addr_road'),
                ('te_neighbourhood', te_dict.get(PARAMS['te_neighbourhood']),
                 'GC_addr_neighbourhood'),
                ('te_suburb', te_dict.get(PARAMS['te_suburb']),
                 'GC_addr_suburb'),
                ('te_postcode', te_dict.get(PARAMS['te_postcode']),
                 'GC_addr_postcode'),
                ('txt_name',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1,
                                              PARAMS['txt_name__ngram_range']),
                                 max_features=PARAMS['txt_name__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_name__binary'],
                                 use_idf=PARAMS['txt_name__use_idf']), 'name'),
                ('txt_dscr',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1,
                                              PARAMS['txt_dscr__ngram_range']),
                                 max_features=PARAMS['txt_dscr__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_dscr__binary'],
                                 use_idf=PARAMS['txt_dscr__use_idf']),
                 'description'),
            ]),
            TransformedTargetRegressor(
                regressor=lgb.LGBMRegressor(**PARAMS, random_state=seed),
                func=np.log1p,
                inverse_func=np.expm1))

        return pipe

    except BaseException as e:
        LOG.error(e)
        return None
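A hedged usage sketch; the keys below are the ones get_model() reads above, and every value is illustrative only (it also assumes the helpers module, seed, and imports from the surrounding file are in scope):

PARAMS = {
    'te_producer': 'TargetEncoder',
    'te_road': 'CatBoostEncoder',
    'te_neighbourhood': 'JamesSteinEncoder',
    'te_suburb': 'MEstimateEncoder',
    'te_postcode': 'CountEncoder',
    'txt_name__ngram_range': 2,
    'txt_name__max_features': 500,
    'txt_name__binary': False,
    'txt_name__use_idf': True,
    'txt_dscr__ngram_range': 2,
    'txt_dscr__max_features': 500,
    'txt_dscr__binary': False,
    'txt_dscr__use_idf': True,
    'n_estimators': 300,
}

pipe = get_model(PARAMS)  # returns None (and logs the error) if construction fails

One design caveat: WOEEncoder expects a binary target, so inside this regression pipeline (TransformedTargetRegressor around LGBMRegressor) mean-style encoders such as TargetEncoder or CatBoostEncoder are the safer picks for the te_* slots.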
Example No. 19
    def test_woe(self):
        cols = [
            'unique_str', 'underscore', 'extra', 'none', 'invariant', 321,
            'categorical', 'na_categorical', 'categorical_int'
        ]

        # balanced label with balanced features
        X_balanced = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'],
                                  columns=['col1'])
        y_balanced = [True, False, True, False, True, False]
        enc = encoders.WOEEncoder()
        enc.fit(X_balanced, y_balanced)
        X1 = enc.transform(X_balanced)
        self.assertTrue(
            all(X1.sum() < 0.001),
            "When the class label is balanced, WoE should sum to 0 in each transformed column"
        )

        enc = encoders.WOEEncoder(cols=cols)
        enc.fit(X, np_y)
        X1 = enc.transform(X_t)
        th.verify_numeric(X1[cols])
        self.assertTrue(
            np.isfinite(X1[cols].values).all(),
            'There must not be any NaN, inf or -inf in the transformed columns'
        )
        self.assertEqual(len(list(X_t)), len(list(X1)),
                         'The count of attributes must not change')
        self.assertEqual(len(X_t), len(X1),
                         'The count of rows must not change')
        X2 = enc.transform(X_t, np_y_t)
        th.verify_numeric(X2)
        self.assertTrue(
            np.isfinite(X2[cols].values).all(),
            'There must not be any NaN, inf or -inf in the transformed columns'
        )
        self.assertEqual(len(list(X_t)), len(list(X2)),
                         'The count of attributes must not change')
        self.assertEqual(len(X_t), len(X2),
                         'The count of rows must not change')
        X3 = enc.transform(X, np_y)
        th.verify_numeric(X3)
        self.assertTrue(
            np.isfinite(X3[cols].values).all(),
            'There must not be any NaN, inf or -inf in the transformed columns'
        )
        self.assertEqual(len(list(X)), len(list(X3)),
                         'The count of attributes must not change')
        self.assertEqual(len(X), len(X3), 'The count of rows must not change')
        self.assertTrue(
            X3['unique_str'].var() < 0.001,
            'The unique string column must not be predictive of the label')
        X4 = enc.fit_transform(X, np_y)
        th.verify_numeric(X4)
        self.assertTrue(
            np.isfinite(X4[cols].values).all(),
            'There must not be any NaN, inf or -inf in the transformed columns'
        )
        self.assertEqual(len(list(X)), len(list(X4)),
                         'The count of attributes must not change')
        self.assertEqual(len(X), len(X4), 'The count of rows must not change')
        self.assertTrue(
            X4['unique_str'].var() < 0.001,
            'The unique string column must not be predictive of the label')

        enc = encoders.WOEEncoder()
        enc.fit(X, np_y)
        X1 = enc.transform(X_t)
        self.assertEqual(len(list(X_t)), len(list(X1)),
                         'The count of attributes must not change')
        self.assertEqual(len(X_t), len(X1),
                         'The count of rows must not change')
        th.verify_numeric(X1)
        X2 = enc.transform(X_t, np_y_t)
        th.verify_numeric(X2)
        self.assertEqual(len(list(X_t)), len(list(X2)),
                         'The count of attributes must not change')
        self.assertEqual(len(X_t), len(X2),
                         'The count of rows must not change')

        # seed
        enc = encoders.WOEEncoder(cols=cols,
                                  random_state=2001,
                                  randomized=True)
        enc.fit(X, np_y)
        X1 = enc.transform(X_t, np_y_t)
        X2 = enc.transform(X_t, np_y_t)
        self.assertTrue(
            X1.equals(X2),
            "When the seed is given, the results must be identical")
        th.verify_numeric(X1)
        th.verify_numeric(X2)

        # invariant target
        y_invariant = [True, True, True, True, True, True]
        enc = encoders.WOEEncoder()
        with self.assertRaises(ValueError):
            enc.fit(X_balanced, y_invariant)

        # branch coverage unit tests - no cols
        enc = encoders.WOEEncoder(cols=[])
        enc.fit(X, np_y)
        self.assertTrue(enc.transform(X_t).equals(X_t))

        # missing values in the target
        y_missing = [True, True, None, True, True, True]
        enc = encoders.WOEEncoder()
        with self.assertRaises(ValueError):
            enc.fit(X_balanced, y_missing)

        # impute missing
        enc = encoders.WOEEncoder(handle_missing='return_nan')
        enc.fit(X, np_y)
        X1 = enc.transform(X_t)
        th.verify_numeric(X1)
        self.assertTrue(X1.isnull().values.any())
        self.assertEqual(len(list(X_t)), len(list(X1)),
                         'The count of attributes must not change')
        self.assertEqual(len(X_t), len(X1),
                         'The count of rows must not change')

        X2 = enc.transform(X_t, np_y_t)
        th.verify_numeric(X2)
        self.assertTrue(X2.isnull().values.any())
        self.assertEqual(len(list(X_t)), len(list(X2)),
                         'The count of attributes must not change')
        self.assertEqual(len(X_t), len(X2),
                         'The count of rows must not change')
Example No. 20
train, test = train_test_split(data, test_size=0.15, random_state=42)

# %%
x = train.drop(columns=['y']).reset_index(drop=True)
y = train['y'].reset_index(drop=True)

x_test = test.drop(columns=['y']).reset_index(drop=True)
y_test = test['y'].reset_index(drop=True)

# %%
encoders = {
    "one-hot":
    ce.OneHotEncoder(drop_invariant=True, return_df=True, use_cat_names=True),
    "woe":
    ce.WOEEncoder(drop_invariant=True, return_df=True),
    "binary":
    ce.BinaryEncoder(drop_invariant=True, return_df=True),
}


def objective(trial: opt.Trial):
    # only test dropping socio-economic factors
    drop_sozioeco = trial.suggest_categorical("drop_eco", [True, False])
    # rest of preprocessing keeps default values

    # categorical encoding; try identical encoders for all columns (for now)
    enc_name = trial.suggest_categorical("encoder",
                                         ["one-hot", "woe", "binary"])
    enc = encoders[enc_name]
Example No. 21
# We painstakingly initialize each encoder here because that gives us the freedom to initialize the
# encoders with any setting we want.
encoders = [ #category_encoders.BackwardDifferenceEncoder(),
             category_encoders.BaseNEncoder(),
             category_encoders.BinaryEncoder(),
             category_encoders.HashingEncoder(),
             # category_encoders.HelmertEncoder(),
             category_encoders.JamesSteinEncoder(),
             category_encoders.LeaveOneOutEncoder(),
             category_encoders.MEstimateEncoder(),
             category_encoders.OneHotEncoder(),
             category_encoders.OrdinalEncoder(),
             # category_encoders.PolynomialEncoder(),
             # category_encoders.SumEncoder(),
             category_encoders.TargetEncoder(),
             category_encoders.WOEEncoder()]

encoders = [category_encoders.TargetEncoder(), category_encoders.JamesSteinEncoder(), category_encoders.WOEEncoder()]

# Initialization
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Loop over datasets, then over encoders
for dataset_name in datasets:
    # X, y, fold_count = arff_loader.load(dataset_name)
    X, y, fold_count, nominal_columns = csv_loader.load(dataset_name)

    # Get indexes (not names) of categorical features
    categorical_indexes = []
    for col in X.select_dtypes(exclude=[np.number]).columns.values:
Example No. 22
df = job.sample(frac=1, random_state=12)
#%% different embeddings
# one-hot encoding
one_hot_encoder = ce.OneHotEncoder(cols=['Job'])
df_one_hot_transformed = one_hot_encoder.fit_transform(df)
print(df_one_hot_transformed.iloc[0:7, ])

# label encoding
label_encoder = ce.OrdinalEncoder(cols=['Job'])
df_label_transformed = label_encoder.fit_transform(df)
print(df_label_transformed.iloc[0:7, ])

# hash encoding with md5 hash function
hash_encoder = ce.HashingEncoder(cols=['Job'], n_components=7)
hash_transformed = hash_encoder.fit_transform(df)
print(hash_transformed.iloc[0:7, ])

# target encoding
target_encoder = ce.TargetEncoder(cols=['Job'], smoothing=1)
mean_target_transformed = target_encoder.fit_transform(df['Job'], df['Target'])
print(mean_target_transformed.iloc[0:7, ])

# WoE
woe_encoder = ce.WOEEncoder(cols=['Job'])
woe_encoder_transformed = woe_encoder.fit_transform(df['Job'], df['Target'])
print(woe_encoder_transformed.iloc[0:7, ])
y = df[df['Job'] == 'student']

Example No. 23
print('X train shape: {}'.format(X_train.shape))
print('y train shape: {}'.format(y_train.shape))
print('X test shape: {}'.format(X_test.shape))
print('y test shape: {}'.format(y_test.shape))

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

print('reindexed X train and y train for WOE embeddings')
print('reindexed X test and y test for WOE embeddings')

# In[480]:

# handle_missing replaces the deprecated impute_missing argument
encoding = ce.WOEEncoder(cols=['category_embed'], handle_missing='value')
encoding.fit(X_train[['category_embed']], X_train['paren_match'])
X_train['category_embed'] = encoding.transform(X_train[['category_embed']])
print('Category embeddings created for training data')

# Create a new column to embed categories (testing data)
X_test['category_embed'] = encoding.transform(X_test[['category_embed']])
print('Category embeddings created for testing data')

# In[481]:
"""
Add new feature: topics (topic modeling)
We have 4 categories in the dataset. SO lets use 4 topics
"""
# Preprocessing the text first
clean_questions = [
Example No. 24
def blight_model():


    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import re
    import traceback
    import string
    from sklearn.base import BaseEstimator
    from category_encoders.ordinal import OrdinalEncoder
    import category_encoders.utils as util
    from sklearn.utils.random import check_random_state
    from feature_engine import categorical_encoders as ce
    import xgboost as xgb
    from sklearn.datasets import load_boston
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error
    from xgboost import plot_importance
    import xgboost
    from matplotlib import pyplot
    import category_encoders as ces
    import seaborn as sns
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression, SGDClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.kernel_approximation import RBFSampler
    from xgboost import XGBClassifier

    from category_encoders.cat_boost import CatBoostEncoder

    from sklearn.metrics import confusion_matrix, roc_curve, auc, plot_roc_curve, accuracy_score
    from sklearn.model_selection import cross_val_score, GridSearchCV
    from sklearn.linear_model import Ridge
    from sklearn.metrics import roc_auc_score


    train = pd.read_csv('train.csv', encoding = 'ISO-8859-1')
    test = pd.read_csv('test.csv')



    #train_no_null['compliance_detail'].unique()
    #a =train_no_null[train_no_null['compliance_detail'] == 'non-compliant by no payment']
    #a['payment_status'].unique()
    #a['compliance'].unique()

    ############################################################
    ######### DATA CLEANING & DATA LEAKAGE PREVENTION ##########
    ############################################################

    train_no_null = train.loc[train.compliance.notnull()]


    ## identifying indices which do not satisfy the conditions
    badValuesTrain = []
    for index, row in train_no_null.iterrows():
        if (train_no_null['payment_status'].loc[index] == 'PAID IN FULL') and (train_no_null['compliance'].loc[index] == 0) \
                or (train_no_null['compliance_detail'].loc[index] == 'non-compliant by late payment more than 1 month') and (train_no_null['compliance'].loc[index] == 1) \
                or (train_no_null['payment_status'].loc[index] == 'NO PAYMENT APPLIED') and (train_no_null['compliance'].loc[index] == 1) \
                or (train_no_null['payment_status'].loc[index] == 'PARTIAL PAYMENT APPLIED') and (train_no_null['compliance'].loc[index] == 1) \
                or (train_no_null['payment_status'].loc[index] == 'NO PAYMENT APPLIED') and (train_no_null['compliance_detail'].loc[index] == 'compliant by no fine') and (train_no_null['compliance'].loc[index] == 1):
            badValuesTrain.append(index)


    # remove obtained indexes from the initial DF using QUERY
    train_no_null = train_no_null.query('index not in @badValuesTrain')

    # how many NaNs per column in TRAIN DATA

    train_no_null = train_no_null.query("state == state")
    train_no_null = train_no_null.query("zip_code == zip_code")
    train_no_null = train_no_null.query("mailing_address_str_number == mailing_address_str_number")
    train_no_null = train_no_null.query("mailing_address_str_name == mailing_address_str_name")

    #test = test.query("state == state")
    #test = test.query("zip_code == zip_code")
    #test = test.query("city == city")
    #test = test.query("violator_name == violator_name")
    #test = test.query("mailing_address_str_number == mailing_address_str_number")
    #test = test.query("mailing_address_str_name == mailing_address_str_name")


    #train_no_null.isnull().sum(axis = 0)
    #test.isnull().sum(axis = 0)

    train_no_null['hearing_date'].fillna(train_no_null['hearing_date'].value_counts().index[0], inplace=True)
    test['hearing_date'].fillna(test['hearing_date'].value_counts().index[0], inplace=True)
    test['state'].fillna(test['state'].value_counts().index[0], inplace=True)
    test['zip_code'].fillna(test['zip_code'].value_counts().index[0], inplace=True)
    test['mailing_address_str_number'].fillna(test['mailing_address_str_number'].value_counts().index[0], inplace=True)
    test['mailing_address_str_name'].fillna(test['mailing_address_str_name'].value_counts().index[0], inplace=True)


    # remove the columns from TRAINING data which do not correspond to TEST data
    # getting a list of common columns between TRAIN and TEST
    common_cols = list(set(train_no_null.columns).intersection(test.columns))
    train_upd = train_no_null[common_cols]
    removedColumnsTrain = train_no_null.drop([col for col in train_no_null.columns if col in train_no_null.columns and col in test.columns], axis=1)
    y_train = removedColumnsTrain['compliance']

    # remove columns with lots of NaNs for both TRAIN and TEST DS
    train_upd = train_upd.drop(['non_us_str_code'], axis=1)
    test = test.drop(['non_us_str_code'], axis=1)
    train_upd = train_upd.drop(['violation_zip_code'], axis=1)
    test = test.drop(['violation_zip_code'], axis=1)
    train_upd = train_upd.drop(['grafitti_status'], axis=1)
    test = test.drop(['grafitti_status'], axis=1)



    #####################################################################
    ##################### PLOTTING/CLEANING #############################
    #####################################################################
    #train_upd.plot(subplots=True, layout=(4,3))
    #test.plot(subplots=True, layout=(4,3))
    #plt.close('figure')
    # since "state_fee", "clean_up_cost", "admin_fee" have no impact factor, they are constant, we remove them
    train_upd = train_upd.drop(['state_fee'], axis=1)
    test = test.drop(['state_fee'], axis=1)
    train_upd = train_upd.drop(['clean_up_cost'], axis=1)
    test = test.drop(['clean_up_cost'], axis=1)
    train_upd = train_upd.drop(['admin_fee'], axis=1)
    test = test.drop(['admin_fee'], axis=1)

    ################# EXTRA PLOTING FEATURES ###############################

    def plot_Comp_train_test(train, test, plotVar, titleName, plotShowNumsorted=30, plotkind='bar', figsize=(18, 3.2)):
        plt.subplots(1, 2, figsize=(18, 5))

        plt.subplot(1, 2, 1)
        yvalue = train[plotVar].value_counts()
        (yvalue[:plotShowNumsorted] / train.shape[0]).plot(kind="bar", alpha=0.6, color='slateblue')
        plt.title(titleName + ' (training set)')

        plt.subplot(1, 2, 2)
        yvalue = test[plotVar].value_counts()
        (yvalue[:plotShowNumsorted] / test.shape[0]).plot(kind="bar", alpha=0.6, color='teal')
        plt.title(titleName + ' (test set)')

        return plt


    # plot_Comp_train_test(train_upd, test, 'zip_code', 'zip_code', plotShowNumsorted=55, figsize=(20,3.2));
    # plot_Comp_train_test(train_upd, test, 'violation_code', 'violation_code', plotShowNumsorted=55, figsize=(20,3.2));

    ##################################################################
    ############# FEATURES PREPROCESSING: REGEX ######################
    ##################################################################

    ################# CREATING DATE & TIME FEATURES ##################
    ##################################################################

    train_upd['ticket_issued_date'] = pd.to_datetime(train_upd.ticket_issued_date, format='%Y-%m-%d %H:%M:%S')
    train_upd['hearing_date'] = pd.to_datetime(train_upd.hearing_date, format='%Y-%m-%d %H:%M:%S')
    test['ticket_issued_date'] = pd.to_datetime(test.ticket_issued_date, format='%Y-%m-%d %H:%M:%S')
    test['hearing_date'] = pd.to_datetime(test.hearing_date, format='%Y-%m-%d %H:%M:%S')

    periods = ['day', 'month', 'year', 'hour', 'minute', 'weekday', 'week']

    for period in periods:
        if period != 'week':
            train_upd['Issued_' + period] = getattr(train_upd.ticket_issued_date.dt, period)
            test['Issued_' + period] = getattr(test.ticket_issued_date.dt, period)
            train_upd['Hearing_' + period] = getattr(train_upd.hearing_date.dt, period)
            test['Hearing_' + period] = getattr(test.hearing_date.dt, period)
        else:
            train_upd['Issued_' + period] = getattr(train_upd.ticket_issued_date.dt.isocalendar(), period)
            test['Issued_' + period] = getattr(test.ticket_issued_date.dt.isocalendar(), period)
            train_upd['Hearing_' + period] = getattr(train_upd.hearing_date.dt.isocalendar(), period)
            test['Hearing_' + period] = getattr(test.hearing_date.dt.isocalendar(), period)

    # removing columns with DataTime
    train_upd = train_upd.drop(['ticket_issued_date'], axis=1)
    train_upd = train_upd.drop(['hearing_date'], axis=1)
    test = test.drop(['ticket_issued_date'], axis=1)
    test = test.drop(['hearing_date'], axis=1)

    #train_upd.isnull().sum(axis=0)

    ### cleaning mailing_address_str_number column ####
    for i, row in list(test.iterrows()):

        # coerce to string so the substring checks below are safe for ints/floats
        c = str(row['mailing_address_str_number'])

        if ('p' in c.lower()) or ('*' in c) \
                or ('.' in c) or ('O' in c) \
                or ('o' in c) or ('G' in c) \
                or ('# 143' in c) or ('XX' in c) \
                or ('22A' in c) or ('NE' in c) \
                or ('12 1ST' in c) or ('11111A' in c):
            test.at[i, 'mailing_address_str_number'] = 11111
            #print(i, test.at[i,'mailing_address_str_number'])

    test.mailing_address_str_number = test.mailing_address_str_number.replace(' ','',regex=True).replace(',','',regex=True)
    test.mailing_address_str_number = test.mailing_address_str_number.replace(to_replace='[A-Z-a-z][0-9]*', value = '11111', regex=True).replace('-','',regex=True).replace('`','',regex=True).replace('#','11111',regex=True)



    ### converting mailing address for both TRAIN and TEST into numbers instead of strings
    for i, row in list(train_upd.iterrows()):
        if not isinstance(train_upd.at[i, 'mailing_address_str_number'], float):
            train_upd.at[i, 'mailing_address_str_number'] = float(train_upd.at[i, 'mailing_address_str_number'])

    for i, row in list(test.iterrows()):
        if not isinstance(test.at[i, 'mailing_address_str_number'], float):
            test.at[i, 'mailing_address_str_number'] = float(test.at[i, 'mailing_address_str_number'])



    ######### categorical encoding Weight of Evidence #########
    ###########################################################
    ####### Weight of Evidence transformation of text values into categories ##############
    cat_columns = ['country', 'city', 'state', 'agency_name', 'disposition', 'zip_code', 'mailing_address_str_name',  'violator_name', 'violation_street_name', 'violation_code',
                   'violation_description', 'inspector_name']


    woe_encoder = ces.WOEEncoder(cols=cat_columns)
    # fit the encoder
    woe_encoded_train = woe_encoder.fit_transform(train_upd, y_train)
    # transform
    XTrain_transformed = woe_encoder.transform(train_upd)
    XTest_transformed = woe_encoder.transform(test)


   # CBE_encoder = CatBoostEncoder()
   # train_encoded = CBE_encoder.fit_transform(train_upd[cat_columns], y_train)
   # test_encoded = CBE_encoder.transform(test[cat_columns])
   # t = train_upd
   # t = t.drop(['country', 'city', 'state', 'agency_name', 'disposition', 'zip_code', 'mailing_address_str_name', 'violator_name', 'violation_street_name', 'violation_code', 'violation_description', 'inspector_name'], axis=1, inplace=True)
   # tt = test
   # tt = tt.drop(['country', 'city', 'state', 'agency_name', 'disposition', 'zip_code', 'mailing_address_str_name', 'violator_name', 'violation_street_name', 'violation_code', 'violation_description', 'inspector_name'], axis=1, inplace=True)

   # XTrain_transformed = pd.concat([train_upd, train_encoded], axis=1, sort=False)
   # XTest_transformed = pd.concat([test, test_encoded], axis=1, sort=False)


    ##########################################################
    ############# Correlation map for features ###############
    ##########################################################

    correlation = XTrain_transformed.corr().round(1)

    fig, ax = plt.subplots(1, 1, figsize=(8, 6.5))
    sns.heatmap(data=correlation, annot=True, cmap="YlGn")
    ax.set_title("Correlation matrix for taken variables");

    #plt.savefig('plots/correlationMap.pdf')
    ##########################################################
    ################## saving new data #######################
    ##########################################################

    #XTrain_transformed.to_csv(r'/Users/kreozotica/PycharmProjects/current/ML_Coursera/processed_train.csv', index=False)
    #XTest_transformed.to_csv(r'/Users/kreozotica/PycharmProjects/current/ML_Coursera/XTest_transformed.csv', index=False)


    #############################################################################################################################
    # Further, since we don't have prediction data, we keep TEST data as prediction and SPLIT TRAIN data into new TRAIN and TEST
    #############################################################################################################################

    X_train, X_test, y_train, y_test = train_test_split(XTrain_transformed, y_train, random_state=0, test_size=0.75)
    #### scaling
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    # transform (not fit_transform) the held-out sets to avoid leakage
    X_test_scaled = scaler.transform(X_test)
    XTest_transformed_scaled = scaler.transform(XTest_transformed)
    XTest_transformed_scaled = pd.DataFrame(XTest_transformed_scaled, columns = XTest_transformed.columns)


    #plot_importance(model_XGB)
    #pyplot.show()
    # featureImportance = pd.DataFrame(regressor.feature_importances_.reshape(1, -1), columns=TrainTest_noLabel.columns)


    ##########################################################
    ################## MODELLING APPROACH ####################
    ##########################################################

    ##### universal model's function
    def modelFit(X_train, X_test, y_train, y_test, clf, cv=5):

        clf = clf.fit(X_train, y_train)

        cv = cross_val_score(clf, X_test, y_test, cv=cv, scoring = 'roc_auc')
        cv_mean = round(cv.mean(), 3)
        cv_std = round(cv.std(), 3)
        print('Cross-validation (AUC)', cv, ', mean =', cv_mean, ', std =', cv_std)

        #y_pred =clf.predict(X_test)
        #confusion = confusion_matrix(y_test, y_pred)
        #print(confusion)

        return cv_mean, cv_std

    ##### XGBoost
    clf_XGB = XGBClassifier()
    auc_mean_XGB, auc_std_XGB = modelFit(X_train_scaled, X_test_scaled, y_train, y_test, clf_XGB, cv=20)


    ##### Gradient-boosted Decision Trees

    clf_GBC = GradientBoostingClassifier(learning_rate=0.05)
    # tree ensembles don't really need scaling, which is an advantage
    auc_mean_GBC, auc_std_GBC = modelFit(X_train_scaled, X_test_scaled, y_train, y_test, clf_GBC, cv=20)

    ##### SVM

    clf_SVM = SVC(kernel='rbf', C=1, random_state=0)
    auc_mean_SVM, auc_std_SVM = modelFit(X_train_scaled, X_test_scaled, y_train, y_test, clf_SVM, cv=20)

    #### LogReg

    grid_values = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
    LogReg = LogisticRegression()
    grid_rbf_recall = GridSearchCV(LogReg, param_grid = grid_values, scoring='recall')
    auc_mean_LR, auc_std_LR = modelFit(X_train_scaled, X_test_scaled, y_train, y_test, grid_rbf_recall, cv=20)

    #### RidgeReg
    #RdgReg_clf = Ridge()
    #auc_mean_RG, auc_std_RG = modelFit(X_train_scaled, X_test_scaled, y_train, y_test, RdgReg_clf, cv=20)

    ### NaiveBayes

    NB_clf = GaussianNB()
    auc_mean_NB, auc_std_NB = modelFit(X_train_scaled, X_test_scaled, y_train, y_test, NB_clf, cv=20)

    ################## ROC vis ##################
    def roCurves(clfList, X_test, y_test):

        roCurveList = []
        plt.subplots(1, 1, figsize=(5, 5))
        styleList = ['solid', 'solid', 'dashed', 'dashed', 'dotted', 'dashed']

        for clf, sty in zip(clfList, styleList):
            ax = plt.gca()
            roc = plot_roc_curve(clf, X_test, y_test, ax=ax, alpha=0.85, lw=2, linestyle=sty)
            roCurveList.append(roc)
        plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='dotted')
        plt.title('ROC')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')

        return roCurveList


    exps = [clf_XGB, clf_GBC, clf_SVM, grid_rbf_recall, NB_clf]

    roCurves(exps, X_test_scaled, y_test)

    # Save the figure and show
    #plt.tight_layout()
    #plt.savefig('plots/ROCs.png')
    #plt.show()



    ##### Predict probabilities for the best model - XGBoost
    y_proba = clf_XGB.predict_proba(XTest_transformed_scaled)[:,1]
    # Integrate with reloaded test data
    test['compliance'] = y_proba




    return  test.compliance
Example No. 25
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        orig_cols = list(X.names)
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

        min_count = np.min(np.unique(y, return_counts=True)[1])
        if min_count < 9:
            self.params['cv_search'] = False
        if min_count < 3:
            self.params['grid_search_iterations'] = False
            self.params['cv_search'] = False

        # save pre-datatable-imputed X
        X_dt = X

        # Apply OOB imputation
        self.oob_imputer = OOBImpute(self._impute_num_type,
                                     self._impute_int_type,
                                     self._impute_bool_type,
                                     self._impute_cat_type, self._oob_bool,
                                     self._oob_cat)
        X = self.oob_imputer.fit_transform(X)

        # convert to pandas for sklearn
        X = X.to_pandas()
        X_orig_cols_names = list(X.columns)
        if self._kaggle_features:
            self.features = make_features()
            X = self.features.fit_transform(X)
        else:
            self.features = None
        # print("LR: pandas dtypes: %s" % (str(list(X.dtypes))))

        # FEATURE GROUPS

        # Choose which features are numeric or categorical
        cat_features = [
            x for x in X_orig_cols_names
            if CatOriginalTransformer.is_me_transformed(x)
        ]
        catlabel_features = [
            x for x in X_orig_cols_names if CatTransformer.is_me_transformed(x)
        ]
        # can add explicit column name list to below force_cats
        force_cats = cat_features + catlabel_features

        # choose if numeric is treated as categorical
        if not self._num_as_cat:
            numerical_features = (X.dtypes == 'float') | (
                X.dtypes == 'float32') | (X.dtypes == 'float64')
        else:
            numerical_features = X.dtypes == 'invalid'
            # force oob imputation for numerics
            self.oob_imputer = OOBImpute('oob', 'oob', 'oob',
                                         self._impute_cat_type, self._oob_bool,
                                         self._oob_cat)
            X = self.oob_imputer.fit_transform(X_dt)
            X = X.to_pandas()
            X = self.features.fit_transform(X)
        if self._kaggle_features:
            numerical_features = self.features.update_numerical_features(
                numerical_features)

        categorical_features = ~numerical_features
        # below can lead to overlap between what is numeric and what is categorical
        more_cats = (pd.Series([
            True if x in force_cats else False
            for x in list(categorical_features.index)
        ],
                               index=categorical_features.index))
        categorical_features = (categorical_features) | (more_cats)
        if self._kaggle_features:
            categorical_features = self.features.update_categorical_features(
                categorical_features)

        if self._debug:
            import uuid
            struuid = str(uuid.uuid4())
            Xy = X.copy()
            Xy.loc[:, 'target'] = y
            Xy.to_csv("munged_%s.csv" % struuid)

        cat_X = X.loc[:, categorical_features]
        num_X = X.loc[:, numerical_features]
        if self._debug:
            print("LR: Cat names: %s" % str(list(cat_X.columns)))
            print("LR: Num names: %s" % str(list(num_X.columns)))

        # TRANSFORMERS
        lr_params = copy.deepcopy(self.params)
        lr_params.pop('grid_search_by_iterations', None)
        lr_params.pop('cv_search', None)
        grid_search = False  # WIP

        full_features_list = []
        transformers = []
        if self._use_numerics and any(numerical_features.values):
            impute_params = {}
            impute_params['strategy'] = lr_params.pop('strategy', 'mean')
            full_features_list.extend(list(num_X.columns))
            transformers.append(
                (make_pipeline(SimpleImputer(**impute_params),
                               StandardScaler()), numerical_features))
        # http://contrib.scikit-learn.org/categorical-encoding/
        if self._use_ordinal_encoding and any(categorical_features.values):
            ord_params = dict(handle_missing='value', handle_unknown='value')
            full_features_list.extend(list(cat_X.columns))
        # Note: OrdinalEncoder doesn't handle unseen features, while CategoricalEncoder used to
            import category_encoders as ce
            transformers.append(
                (ce.OrdinalEncoder(**ord_params), categorical_features))
        if self._use_catboost_encoding and any(categorical_features.values):
            cb_params = dict(handle_missing='value', handle_unknown='value')
            cb_params['sigma'] = lr_params.pop('sigma')
            full_features_list.extend(list(cat_X.columns))
            import category_encoders as ce
            transformers.append(
                (ce.CatBoostEncoder(**cb_params), categorical_features))
        if self._use_woe_encoding and any(categorical_features.values):
            woe_params = dict(handle_missing='value', handle_unknown='value')
            woe_params['randomized'] = lr_params.pop('randomized')
            woe_params['sigma'] = lr_params.pop('sigma_woe')
            woe_params['regularization'] = lr_params.pop('regularization')
            full_features_list.extend(list(cat_X.columns))
            import category_encoders as ce
            transformers.append(
                (ce.WOEEncoder(**woe_params), categorical_features))
        if self._use_target_encoding and any(categorical_features.values):
            te_params = dict(handle_missing='value', handle_unknown='value')
            te_params['min_samples_leaf'] = lr_params.pop('min_samples_leaf')
            te_params['smoothing'] = lr_params.pop('smoothing')
            full_features_list.extend(list(cat_X.columns))
            import category_encoders as ce
            transformers.append(
                (ce.TargetEncoder(**te_params), categorical_features))
        if self._use_target_encoding_other and any(
                categorical_features.values):
            full_features_list.extend(list(cat_X.columns))
            len_uniques = []
            cat_X_copy = cat_X.copy()
            for c in cat_X.columns:
                le = LabelEncoder()
                le.fit(cat_X[c])
                cat_X_copy[c] = le.transform(cat_X_copy[c])
                len_uniques.append(len(le.classes_))
            if self._debug:
                uniques_series = pd.Series(len_uniques,
                                           index=list(cat_X.columns))
                print("uniques_series: %s" % uniques_series)
            ALPHA = 75
            MAX_UNIQUE = max(len_uniques)
            # FEATURES_COUNT = cat_X.shape[1]
            cv = StratifiedKFold(n_splits=5,
                                 shuffle=True,
                                 random_state=self.params['random_state'])
            split_cv = [cv]
            # split_cv = [3, 3]
            from target_encoding import TargetEncoder
            transformers.append(
                (TargetEncoder(alpha=ALPHA,
                               max_unique=MAX_UNIQUE,
                               split_in=split_cv), categorical_features))
        if self._use_ohe_encoding and any(categorical_features.values):
            transformers.append(
                (OneHotEncoder(handle_unknown='ignore',
                               sparse=True), categorical_features))
        assert len(transformers) > 0, "should have some features"

        preprocess = make_column_transformer(*transformers)

        # ESTIMATOR
        lr_defaults = dict(penalty='l2',
                           dual=False,
                           tol=1e-4,
                           C=1.0,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None,
                           random_state=None,
                           solver='warn',
                           max_iter=100,
                           multi_class='warn',
                           verbose=0,
                           warm_start=False,
                           n_jobs=None,
                           l1_ratio=None)
        allowed_lr_kwargs_keys = lr_defaults.keys()
        lr_params_copy = copy.deepcopy(lr_params)
        for k, v in lr_params_copy.items():
            if k not in allowed_lr_kwargs_keys:
                lr_params.pop(k, None)
        del lr_params_copy

        can_score = self.num_classes == 2 and 'AUC' in self.params_base[
            'score_f_name'].upper()
        # print("LR: can_score: %s" % str(can_score))
        if can_score:
            scorer = make_scorer(roc_auc_score,
                                 greater_is_better=True,
                                 needs_proba=True)
        else:
            scorer = None

        if not ('C' in lr_params or 'l1_ratios' in lr_params):
            # override
            self.params['cv_search'] = False

        if not self.params['cv_search']:
            estimator = LogisticRegression(**lr_params)
            estimator_name = 'logisticregression'
        else:
            lr_params_cv = copy.deepcopy(lr_params)
            if 'C' in lr_params:
                lr_params_cv['Cs'] = self.get_param_range(
                    self.params['C'],
                    self.params['fit_count'],
                    func_type='log')
                # print("LR: CV: Cs: %s" % str(lr_params_cv['Cs']))
            if 'l1_ratios' in lr_params:
                lr_params_cv['l1_ratios'] = self.get_param_range(
                    self.params['l1_ratio'],
                    self.params['fit_count'],
                    func_type='linear')
                # print("LR: CV: l1_ratios: %s" % str(lr_params_cv['l1_ratios']))
            lr_params_cv.pop('n_jobs', None)
            lr_params_cv.pop('C', None)
            lr_params_cv.pop('l1_ratio', None)
            if lr_params_cv['penalty'] == 'none':
                lr_params_cv['penalty'] = 'l2'
            estimator = LogisticRegressionCV(n_jobs=self.params['n_jobs'],
                                             cv=3,
                                             refit=True,
                                             scoring=scorer,
                                             **lr_params_cv)
            estimator_name = 'logisticregressioncv'

        # PIPELINE
        model = make_pipeline(preprocess, estimator)

        # FIT
        if self.params['grid_search_iterations'] and can_score:
            # WIP FIXME for multiclass and other scorers
            from sklearn.model_selection import GridSearchCV

            max_iter_range = self.get_param_range(
                self.params['max_iter'],
                self.params['fit_count'],
                range_limit=self._overfit_limit_iteration_step,
                func_type='log')
            # print("LR: max_iter_range: %s" % str(max_iter_range))
            param_grid = {
                '%s__max_iter' % estimator_name: max_iter_range,
            }
            grid_clf = GridSearchCV(model,
                                    param_grid,
                                    n_jobs=self.params['n_jobs'],
                                    cv=3,
                                    iid=True,
                                    refit=True,
                                    scoring=scorer)
            grid_clf.fit(X, y)
            model = grid_clf.best_estimator_
            # print("LR: best_index=%d best_score: %g best_params: %s" % (
            #    grid_clf.best_index_, grid_clf.best_score_, str(grid_clf.best_params_)))
        elif grid_search:
            # WIP
            from sklearn.model_selection import GridSearchCV

            param_grid = {
                'columntransformer__pipeline__simpleimputer__strategy':
                ['mean', 'median'],
                '%s__C' % estimator_name: [0.1, 0.5, 1.0],
            }
            grid_clf = GridSearchCV(model, param_grid, cv=10, iid=False)
            grid_clf.fit(X, y)
            model = grid_clf.best_estimator_
            # self.best_params = grid_clf.best_params_
        else:
            model.fit(X, y)

        # get actual LR model
        lr_model = model.named_steps[estimator_name]

        if self._debug and False:
            import uuid
            struuid = str(uuid.uuid4())
            save_obj(
                model.named_steps['columntransformer'].fit_transform(X, y),
                "columns_csr_%s.pkl" % struuid)

        # average importances over classes
        importances = np.average(np.array(lr_model.coef_), axis=0)
        # average iterations over classes (can't take max_iter per class)
        iterations = np.average(lr_model.n_iter_)
        # print("LR: iterations: %d" % iterations)

        # reduce OHE features to original names
        ohe_features_short = []
        if self._use_ohe_encoding and any(categorical_features.values):
            input_features = [x + self._ohe_postfix for x in cat_X.columns]
            ohe_features = pd.Series(
                model.named_steps['columntransformer'].
                named_transformers_['onehotencoder'].get_feature_names(
                    input_features=input_features))

            def f(x):
                return '_'.join(x.split(self._ohe_postfix + '_')[:-1])

            # identify OHE features
            ohe_features_short = ohe_features.apply(f)
            full_features_list.extend(list(ohe_features_short))

        # aggregate our own features
        if self._kaggle_features:
            self.features.aggregate(full_features_list, importances)

        msg = "LR: num=%d cat=%d : ohe=%d : imp=%d full=%d" % (
            len(num_X.columns), len(cat_X.columns), len(ohe_features_short),
            len(importances), len(full_features_list))
        if self._debug:
            print(msg)
        assert len(importances) == len(full_features_list), msg

        # aggregate importances by dai feature name
        importances = pd.Series(
            np.abs(importances),
            index=full_features_list).groupby(level=0).mean()
        assert len(importances) == len(
            X_orig_cols_names), "%d %d %s : %s %s" % (
                len(importances), len(X_orig_cols_names), msg,
                str(list(X.columns)), str(list(X.dtypes)))

        # save hyper parameter searched results for next search
        self.params['max_iter'] = iterations
        if self.params['cv_search']:
            self.params['C'] = np.average(lr_model.C_, axis=0)
        if 'l1_ratios' in lr_params and self.params['cv_search']:
            self.params['l1_ratio'] = np.average(lr_model.l1_ratio_, axis=0)
        if 'fit_count' in self.params:
            self.params['fit_count'] += 1
        else:
            self.params['fit_count'] = 0

        self.set_model_properties(model=(model, self.features),
                                  features=orig_cols,
                                  importances=importances.tolist(),
                                  iterations=iterations)
        self.features = None
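Since each categorical column expands into several OHE columns, the code above maps coefficients back to original feature names by indexing a Series with the (repeated) original names and grouping on that index. A minimal sketch of the same aggregation on toy data (names here are illustrative, not from the source):

import numpy as np
import pandas as pd

# three OHE columns derived from 'color', one numeric column 'age'
full_features_list = ['color', 'color', 'color', 'age']
importances = np.array([0.2, -0.4, 0.1, 0.9])

# index |coef| by original feature name, then average within each name
agg = pd.Series(np.abs(importances),
                index=full_features_list).groupby(level=0).mean()
print(agg)  # age: 0.9, color: ~0.233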
Example No. 26
0
import sys
sys.path.append('../encoders/')
import category_encoders as ce
from ceng import CENGEncoder
from pattern_preserving import SimplePPEncoder, AgingPPEncoder, GeneticPPEncoder
from entity_embedding import EntityEmbeddingEncoder
from cesamo import CESAMOEncoder

Encoders = {
    'Ordinal': ce.OrdinalEncoder(),
    'Polynomial': ce.PolynomialEncoder(),
    'OneHot': ce.OneHotEncoder(),
    'BackwardDifference': ce.BackwardDifferenceEncoder(),
    'Helmert': ce.HelmertEncoder(),
    'EntityEmbedding': EntityEmbeddingEncoder(),
    'TargetEnc': ce.TargetEncoder(),
    'WOE': ce.WOEEncoder(),
    'CENG': CENGEncoder(verbose=0),
    'GeneticPP': GeneticPPEncoder(estimator_name='LinearRegression', num_predictors=2),
    'AgingPP': AgingPPEncoder(estimator_name='LinearRegression', num_predictors=2),
    'SimplePP': SimplePPEncoder(estimator_name='LinearRegression', num_predictors=2),
    'CESAMOEncoder': CESAMOEncoder()
}

if target_flag == 0:
    del Encoders['EntityEmbedding']
    del Encoders['TargetEnc']
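EntityEmbedding and TargetEnc are dropped here because they are supervised encoders: their fit requires the target. A quick illustration with category_encoders (toy data, names illustrative):

import pandas as pd
import category_encoders as ce

X = pd.DataFrame({'c': ['a', 'b', 'a', 'b']})
y = pd.Series([1, 0, 0, 0])

ce.OrdinalEncoder(cols=['c']).fit(X)      # unsupervised: y not required
ce.TargetEncoder(cols=['c']).fit(X, y)    # supervised: y is required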
Example No. 27
0
    def fit(self, X, y, column):
        self.col_name = column
        self.real_encoder = ce.WOEEncoder()
        self.real_encoder.fit(X[column], y)
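The fragment above only defines fit; a hedged sketch of a complete wrapper (the class name and the transform behavior are assumptions, not from the source):

import category_encoders as ce

class WOEColumnWrapper:  # hypothetical name for the fragment's enclosing class
    def fit(self, X, y, column):
        self.col_name = column
        self.real_encoder = ce.WOEEncoder()
        self.real_encoder.fit(X[column], y)
        return self

    def transform(self, X):
        # replace only the stored column with its WOE encoding
        X = X.copy()
        X[self.col_name] = self.real_encoder.transform(
            X[self.col_name])[self.col_name]
        return X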
import sys
sys.path.append('../encoders/')
from ceng import CENGEncoder
from pattern_preserving import SimplePPEncoder, AgingPPEncoder, GeneticPPEncoder
from entity_embedding import EntityEmbeddingEncoder
from cesamo import CESAMOEncoder

Encoders = {
    'Ordinal': ce.OrdinalEncoder(),
    'Polynomial': ce.PolynomialEncoder(),
    'OneHot': ce.OneHotEncoder(),
    'BackwardDifference': ce.BackwardDifferenceEncoder(),
    'Helmert': ce.HelmertEncoder(),
    'EntityEmbedding': EntityEmbeddingEncoder(),
    'TargetEnc': ce.TargetEncoder(),
    'WOE': ce.WOEEncoder(),
    'CENG': CENGEncoder(verbose=0),
    'GeneticPP': GeneticPPEncoder(num_predictors=2),
    'AgingPP': AgingPPEncoder(num_predictors=2),
    'SimplePP': SimplePPEncoder(num_predictors=2),
    'CESAMOEncoder': CESAMOEncoder()
}

if target_flag == 0:
    del Encoders['EntityEmbedding']
    del Encoders['TargetEnc']
    del Encoders['WOE']
"""END: Import encoders"""

import time
Example No. 29
0
def encode_all(df, dfv, dfk, encoder_to_use, handle_missing='return_nan'):
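    # df: training frame (encoders are fit on it); dfv, dfk: additional frames
    # (presumably validation / holdout) transformed with the training-fitted
    # encoder. encoder_to_use maps column name -> encoder name; 'set_clicked'
    # is the binary target used to fit the supervised encoders.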
    
    encoders_used = {}
    
    for col in encoder_to_use:

        if encoder_to_use[col] == 'ColumnDropper':
            df = df.drop(columns = col)
            dfv = dfv.drop(columns = col)
            dfk = dfk.drop(columns = col)
            encoders_used[col] = 'ColumnDropper'    
                
        if encoder_to_use[col]=='BackwardDifferenceEncoder':
            encoder=ce.BackwardDifferenceEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='BaseNEncoder':
            encoder=ce.BaseNEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,base=3) 
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='BinaryEncoder':
            encoder=ce.BinaryEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='CatBoostEncoder':
            encoder=ce.CatBoostEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,sigma=None,a=2)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

    #     if encoder_to_use[col]=='HashingEncoder':
    #         encoder=ce.HashingEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
    #         encoder.fit(X=df,y=df['set_clicked'])
    #         df=encoder.transform(df)
    #         encoders_used[col]=encoder

        if encoder_to_use[col]=='HelmertEncoder':
            encoder=ce.HelmertEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='JamesSteinEncoder':
            encoder=ce.JamesSteinEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing, model='binary')
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='LeaveOneOutEncoder':
            encoder=ce.LeaveOneOutEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,sigma=None)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='MEstimateEncoder':
            encoder=ce.MEstimateEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,randomized=True,sigma=None,m=2)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='OneHotEncoder':
            encoder=ce.OneHotEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,use_cat_names=True)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='OrdinalEncoder':
            encoder=ce.OrdinalEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='SumEncoder':
            encoder=ce.SumEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='PolynomialEncoder':
            encoder=ce.PolynomialEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder

        if encoder_to_use[col]=='TargetEncoder':
            encoder=ce.TargetEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,min_samples_leaf=10, smoothing=5)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder


        if encoder_to_use[col]=='WOEEncoder':
            encoder=ce.WOEEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,randomized=True,sigma=None)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            dfv=encoder.transform(dfv)
            dfk=encoder.transform(dfk)
            encoders_used[col]=encoder
            
#         print("Encoding done for - ",col)
    
    print("Completed encoder - ",datetime.datetime.now())
    
    return df, dfv, dfk, encoders_used
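A sketch of how encode_all might be driven (column names and frames are hypothetical; the frames are assumed to contain the 'set_clicked' target):

encoder_to_use = {
    'user_id': 'CatBoostEncoder',
    'country': 'OneHotEncoder',
    'session_id': 'ColumnDropper',
}
train_enc, valid_enc, test_enc, used = encode_all(
    df_train, df_valid, df_test, encoder_to_use)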
Example No. 30
0
test['Score difference'] = diff

################################################################

# Target feature paren_match (training data). It is 1 if answer and wiki page match. 0 otherwise
train['paren_match'] = (train['Answer'] == train['Wiki page']).astype(int)

#################################################################

# WOE encoding

encoding = ce.WOEEncoder(cols=['category', 'Wiki page'])
encoding.fit(train, train['paren_match'])
train_df = encoding.transform(train)
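# Note: to score held-out data without leakage, reuse this fitted encoder,
# e.g. test_df = encoding.transform(test) (assuming a matching `test` frame).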

features = [
    'Wiki page', 'Quest len', 'Page score', 'category', 'Score difference'
]
target = ['paren_match']

scaler = StandardScaler()

scaler.fit(train_df[features].values)

train_df[features] = scaler.transform(train_df[features].values)

train_df.head()
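From here, a quick sanity-check model could be fit on the scaled features (a sketch under the assumption that scikit-learn and the frames above are available; not part of the source):

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(train_df[features], train_df[target].values.ravel())
print('train accuracy: %.3f'
      % clf.score(train_df[features], train_df[target].values.ravel()))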