# Select medium-cardinality columns (3..200 unique values) for target encoding.
# NOTE(review): assumes df_stat has 'unique_values' and 'index' columns,
# presumably produced by something like df.nunique().reset_index() — confirm upstream.
target_encoded_features = list(
    df_stat[(df_stat.unique_values > 2)
            & (df_stat.unique_values <= 200)]['index'])

# In[12]:

# Bare expression: notebook cell that displays the selected column names.
target_encoded_features

# In[19]:

import category_encoders as ce

# In[20]:

# Target (mean) encoder restricted to the medium-cardinality columns.
target_encoder = ce.TargetEncoder(cols=target_encoded_features)

# In[21]:

# Fit against loan_status as the target; presumably a binary label — TODO confirm.
target_encoder.fit(df, y=df.loan_status)

# In[22]:

# Replace each categorical value with its smoothed target mean.
encoded_df = target_encoder.transform(df)

# In[23]:

# Notebook cell: preview the encoded frame.
encoded_df.head()

# In[24]:
Exemplo n.º 2
0
import category_encoders as ce
import sys
sys.path.append('../encoders/')
from ceng import CENGEncoder
from cesamo import CESAMOEncoder
from entity_embedding import EntityEmbeddingEncoder
from pattern_preserving import SimplePPEncoder, AgingPPEncoder, GeneticPPEncoder

# Registry mapping a human-readable name to a ready-to-use encoder instance.
# Entries prefixed with ce. come from category_encoders; the rest
# (EntityEmbedding, CENG, PP variants, CESAMO) are the local implementations
# imported above from ../encoders/.
Encoders = {
    'Ordinal': ce.OrdinalEncoder(),
    'Polynomial': ce.PolynomialEncoder(),
    'OneHot': ce.OneHotEncoder(),
    'BackwardDifference': ce.BackwardDifferenceEncoder(),
    'Helmert': ce.HelmertEncoder(),
    'EntityEmbedding': EntityEmbeddingEncoder(),
    'TargetEnc': ce.TargetEncoder(),
    'CENG': CENGEncoder(verbose=0),
    'GeneticPP': GeneticPPEncoder(),
    'AgingPP': AgingPPEncoder(),
    'SimplePP': SimplePPEncoder(),
    'CESAMOEncoder': CESAMOEncoder()
}
"""END: Import encoders"""
"""START: Import models"""
try:
    import sklearn.linear_model as lm
    import sklearn.svm as svm
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.neural_network import MLPRegressor
    from sklearn.gaussian_process.kernels import RBF
Exemplo n.º 3
0
# We painstakingly initialize each encoder here because that gives us the freedom to initialize the
# encoders with any setting we want.
# Commented-out entries (BackwardDifference, Helmert, Polynomial, Sum) are
# deliberately excluded from this benchmark run.
encoders = [  #category_encoders.BackwardDifferenceEncoder(),
    category_encoders.BaseNEncoder(),
    category_encoders.BinaryEncoder(),
    category_encoders.HashingEncoder(),
    # category_encoders.HelmertEncoder(),
    category_encoders.JamesSteinEncoder(),
    category_encoders.LeaveOneOutEncoder(),
    category_encoders.MEstimateEncoder(),
    category_encoders.OneHotEncoder(),
    category_encoders.OrdinalEncoder(),
    # category_encoders.PolynomialEncoder(),
    # category_encoders.SumEncoder(),
    category_encoders.TargetEncoder(),
    category_encoders.WOEEncoder()
]

# Initialization
# Start from a clean result file so appended rows from a previous run
# do not contaminate this run's output.
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Loop over datasets, then over encoders, and finally, over the models
for dataset_name in datasets:
    X, y, fold_count = arff_loader.load(dataset_name)
    non_numeric = list(X.select_dtypes(exclude=[np.number]).columns.values)
    for encoder in encoders:
        print("Encoding:", dataset_name, y.name, encoder.__class__.__name__)
        folds, fit_encoder_time, score_encoder_time = train_encoder(
            X, y, fold_count, encoder)
Exemplo n.º 4
0
# Sanitize string columns: collapse runs of non-word characters into '_'.
# NOTE(review): axis=1 applies the lambda per row, not per column, even though
# the lambda is named 'col' — verify this is intended and not axis=0.
text_cols = dta.select_dtypes('object')
text_cols = text_cols.apply(lambda col: col.str.replace('\\W+', '_'), axis = 1)

# Recombine numeric and sanitized text columns, preserving the row index.
dta = pd.concat([numeric_cols, text_cols], axis = 1, ignore_index = False)

# Missingness indicator flags (1 where the value is null) — kept as features
# so the imputer's fill does not erase the "was missing" signal.
dta['msg_construction_year'] = dta.construction_year.isnull().astype(int)
dta['msg_population'] = dta.population.isnull().astype(int)


#%% separate data again
# Re-split the combined frame back into train/test by id membership.
train = dta.loc[dta.id.isin(train.id)]
test = dta.loc[dta.id.isin(test.id)]

#%% categorical encoding and imputation

# cols=None lets TargetEncoder auto-detect the categorical columns.
targ_enc = ce.TargetEncoder(cols = None)
targ_enc.fit(train, train_labels['status_group'])
train = targ_enc.transform(train)


# max_value=2013 caps imputed values — presumably because construction_year
# cannot exceed the data's collection year; TODO confirm.
imp = IterativeImputer(max_iter=10, max_value = 2013)
imp.fit(train)

# Imputer returns an ndarray; rebuild the DataFrame with original column names.
train = pd.DataFrame(imp.transform(train), columns = train.columns)
train['construction_year'] = train['construction_year'].round(0)
train['permit'] = train['permit'].round(0)


#%% rf

rf = RandomForestClassifier(n_estimators = 250,
Exemplo n.º 5
0
# Benchmark several encoders on df_bank: wall-clock time and post-encoding
# memory footprint (BYTES_TO_MB and cat_cols_bank defined earlier, out of view).
label_transformed = label_encoder.fit_transform(df_bank)
print('computation time of label:', time.time() - start_time)
print('Memory usage after encoding: ',
      round(label_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB, 3))

#hash encoding  with md5 hash function
start_time = time.time()
# n_components=9 fixes the number of hash output columns.
hash_encoder = ce.HashingEncoder(cols=cat_cols_bank, n_components=9)
hash_transformed = hash_encoder.fit_transform(df_bank)
print('computation time of hash:', time.time() - start_time)
print('Memory usage after encoding: ',
      round(hash_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB, 3))

#target encoding
start_time = time.time()
# smoothing=1 blends category mean with the global prior.
target_encoder = ce.TargetEncoder(cols=cat_cols_bank, smoothing=1)
mean_target_transformed = target_encoder.fit_transform(df_bank[cat_cols_bank],
                                                       df_bank['y'])
print('computation time of target:', time.time() - start_time)
print(
    'Memory usage after encoding: ',
    round(
        mean_target_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB,
        3))

#WoE
# Weight-of-evidence encoding; requires a binary target ('y') — TODO confirm it is 0/1.
start_time = time.time()
woe_encoder = ce.WOEEncoder(cols=cat_cols_bank)
woe_encoder_transformed = woe_encoder.fit_transform(df_bank[cat_cols_bank],
                                                    df_bank['y'])
print('computation time of WOE :', time.time() - start_time)
Exemplo n.º 6
0
# Out-of-fold target encoding: each fold is encoded by an encoder fitted on
# the OTHER folds, which prevents target leakage on the training set.
train_y = train['target']
test_id = test['id']
train.drop(['target', 'id'], axis=1, inplace=True)
test.drop('id', axis=1, inplace=True)
from sklearn.metrics import roc_auc_score

cat_feat_to_encode = train.columns.tolist()
smoothing = 0.20
import category_encoders as ce

oof = pd.DataFrame([])
from sklearn.model_selection import StratifiedKFold
for tr_idx, oof_idx in StratifiedKFold(n_splits=5,
                                       random_state=2020,
                                       shuffle=True).split(train, train_y):
    # Fit on the in-fold rows only, then encode the held-out rows.
    ce_target_encoder = ce.TargetEncoder(cols=cat_feat_to_encode,
                                         smoothing=smoothing)
    ce_target_encoder.fit(train.iloc[tr_idx, :], train_y.iloc[tr_idx])
    # NOTE(review): DataFrame.append was removed in pandas 2.0 — replace with
    # pd.concat if upgrading.
    oof = oof.append(ce_target_encoder.transform(train.iloc[oof_idx, :]),
                     ignore_index=False)
# Final encoder fitted on ALL training rows is used only for the test set.
ce_target_encoder = ce.TargetEncoder(cols=cat_feat_to_encode,
                                     smoothing=smoothing)
ce_target_encoder.fit(train, train_y)
# Restore the original row order of the out-of-fold pieces.
train = oof.sort_index()
test = ce_target_encoder.transform(test)
from sklearn import linear_model

glm = linear_model.LogisticRegression(random_state=1,
                                      solver='lbfgs',
                                      max_iter=2020,
                                      fit_intercept=True,
                                      penalty='none',
Exemplo n.º 7
0
def target_encode(X, y, categorical_columns):
    """Target-encode the given categorical columns of X against y.

    Returns a tuple of (encoded X, fitted encoder) so the caller can reuse
    the same encoder on held-out data. min_samples_leaf=10 requires at least
    10 samples per category before trusting the category mean.
    """
    encoder = ce.TargetEncoder(cols=categorical_columns,
                               min_samples_leaf=10)
    encoder.fit(X, y)
    return encoder.transform(X), encoder
Exemplo n.º 8
0
        'color_72', 'color_73', 'color_74', 'color_75', 'color_76', 'color_77',
        'color_78', 'color_79', 'color_80', 'color_81', 'color_82', 'color_83',
        'color_84', 'color_85', 'color_86', 'color_87', 'color_88', 'color_89',
        'color_90', 'color_91', 'color_92', 'color_93', 'color_94', 'color_95',
        'color_96', 'color_97', 'color_98'
    ]
]
# Compare encoders on the same raw frame X0: encode, score accuracy, then run
# permutation-importance tests on the 'diameter' and 'color' features.
doPermutationTests(X, y, features, 'sum')

encoder = ce.LeaveOneOutEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'leaveoneout'))
features = ['diameter', 'color']
doPermutationTests(X, y, features, 'leaveoneout')

encoder = ce.TargetEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'target'))
features = ['diameter', 'color']
doPermutationTests(X, y, features, 'target')

encoder = ce.OrdinalEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'ordinal'))
features = ['diameter', 'color']
doPermutationTests(X, y, features, 'ordinal')

# WOE requires a binary target — presumably y is 0/1 here; TODO confirm.
encoder = ce.WOEEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
results.append(doAccuracyTests(X, y, 'woe'))
features = ['diameter', 'color']
Exemplo n.º 9
0
    def test_check_preprocessing_1(self):
        """
        Test check preprocessing on multiple preprocessing

        Builds a chain of category_encoders transformers (OneHot -> Binary ->
        Ordinal -> BaseN -> Target), several mapping dicts, and a sklearn
        ColumnTransformer, then asserts check_preprocessing accepts all the
        supported forms and rejects an unsupported raw sklearn encoder.
        """
        # Two categories per encoder column; 'other' includes a NaN to cover
        # missing-value handling.
        train = pd.DataFrame({'Onehot1': ['A', 'B', 'A', 'B'], 'Onehot2': ['C', 'D', 'C', 'D'],
                              'Binary1': ['E', 'F', 'E', 'F'], 'Binary2': ['G', 'H', 'G', 'H'],
                              'Ordinal1': ['I', 'J', 'I', 'J'], 'Ordinal2': ['K', 'L', 'K', 'L'],
                              'BaseN1': ['M', 'N', 'M', 'N'], 'BaseN2': ['O', 'P', 'O', 'P'],
                              'Target1': ['Q', 'R', 'Q', 'R'], 'Target2': ['S', 'T', 'S', 'T'],
                              'other': ['other', np.nan, 'other', 'other']})

        y = pd.DataFrame(data=[0, 1, 0, 0], columns=['y'])

        # Each encoder is fitted on the OUTPUT of the previous one, mirroring
        # a real sequential preprocessing pipeline.
        enc_onehot = ce.OneHotEncoder(cols=['Onehot1', 'Onehot2']).fit(train)
        train_onehot = enc_onehot.transform(train)
        enc_binary = ce.BinaryEncoder(cols=['Binary1', 'Binary2']).fit(train_onehot)
        train_binary = enc_binary.transform(train_onehot)
        enc_ordinal = ce.OrdinalEncoder(cols=['Ordinal1', 'Ordinal2']).fit(train_binary)
        train_ordinal = enc_ordinal.transform(train_binary)
        enc_basen = ce.BaseNEncoder(cols=['BaseN1', 'BaseN2']).fit(train_ordinal)
        train_basen = enc_basen.transform(train_ordinal)
        enc_target = ce.TargetEncoder(cols=['Target1', 'Target2']).fit(train_basen, y)

        # Hand-written mapping dicts: {col, mapping Series, data_type}.
        input_dict1 = dict()
        input_dict1['col'] = 'Onehot2'
        input_dict1['mapping'] = pd.Series(data=['C', 'D', np.nan], index=['C', 'D', 'missing'])
        input_dict1['data_type'] = 'object'

        input_dict2 = dict()
        input_dict2['col'] = 'Binary2'
        input_dict2['mapping'] = pd.Series(data=['G', 'H', np.nan], index=['G', 'H', 'missing'])
        input_dict2['data_type'] = 'object'

        input_dict = dict()
        input_dict['col'] = 'state'
        input_dict['mapping'] = pd.Series(data=['US', 'FR-1', 'FR-2'], index=['US', 'FR', 'FR'])
        input_dict['data_type'] = 'object'

        input_dict3 = dict()
        input_dict3['col'] = 'Ordinal2'
        input_dict3['mapping'] = pd.Series(data=['K', 'L', np.nan], index=['K', 'L', 'missing'])
        input_dict3['data_type'] = 'object'
        list_dict = [input_dict2, input_dict3]

        y = pd.DataFrame(data=[0, 1], columns=['y'])

        # A sklearn ColumnTransformer — also a supported preprocessing form.
        train = pd.DataFrame({'city': ['chicago', 'paris'],
                              'state': ['US', 'FR'],
                              'other': ['A', 'B']})
        enc = ColumnTransformer(
            transformers=[
                ('onehot', skp.OneHotEncoder(), ['city', 'state'])
            ],
            remainder='drop')
        enc.fit(train, y)

        # A bare sklearn encoder outside a ColumnTransformer is NOT supported.
        wrong_prepro = skp.OneHotEncoder().fit(train, y)

        # Supported forms: a mixed list, individual encoders, a dict, a
        # ColumnTransformer, and None — none of these should raise.
        check_preprocessing([enc_onehot, enc_binary, enc_ordinal, enc_basen, enc_target, input_dict1,
                                           list_dict])
        for preprocessing in [enc_onehot, enc_binary, enc_ordinal, enc_basen, enc_target]:
            check_preprocessing(preprocessing)

        check_preprocessing(input_dict2)
        check_preprocessing(enc)
        check_preprocessing(None)

        # The unsupported form must raise.
        with self.assertRaises(Exception):
            check_preprocessing(wrong_prepro)
Exemplo n.º 10
0
# %% execution={"iopub.execute_input": "2020-10-08T14:24:52.214085Z", "iopub.status.busy": "2020-10-08T14:24:52.213088Z", "iopub.status.idle": "2020-10-08T14:24:52.385628Z", "shell.execute_reply": "2020-10-08T14:24:52.384657Z", "shell.execute_reply.started": "2020-10-08T14:24:52.213088Z"} id="kcMLnvJxZIuD"
# One hot encoding for low cardinality feature + Brand
col_to_encode = ['Location', 'Fuel_Type', 'Brand']
# use_cat_names=True keeps readable column names like Fuel_Type_Diesel.
oh_encoder = ce.OneHotEncoder(cols=col_to_encode,
                              use_cat_names=True)
oh_encoder.fit(X_train)

# Encoding train set
X_train = oh_encoder.transform(X_train)
# Encoding test set
X_test = oh_encoder.transform(X_test)

# %%
# Target encoding for high cardinality feature
# Whatever is still object-typed after one-hot encoding gets target-encoded.
col_to_encode = X_train.select_dtypes("object").columns
encoder = ce.TargetEncoder(cols=col_to_encode)
encoder.fit(X_train, y_train)

# Encoding train set
X_train = encoder.transform(X_train)
# Encoding test set
X_test = encoder.transform(X_test)

# %% [markdown] id="bw10NXJkIuLs"
# ## Feature Selection

# %% id="7u-fc0svIuLt"
# Keep only features whose absolute correlation with Price exceeds 0.20.
corr_price = X_train.join(y_train).corr()['Price']
index = corr_price[(corr_price < -0.20) | (corr_price > 0.20)].index
# Shuffle all rows deterministically before comparing encodings.
df=job.sample(frac=1, random_state=12)
#%% different embedding 
# one-hot encoding 
one_hot_encoder=ce.OneHotEncoder(cols=['Job']) 
df_one_hot_transformed=one_hot_encoder.fit_transform(df)
print(df_one_hot_transformed.iloc[0:7,])

# label encode
label_encoder=ce.OrdinalEncoder(cols=['Job']) 
df_label_transformed=label_encoder.fit_transform(df)
print(df_label_transformed.iloc[0:7,])

#hash encoding  with md5 hash function

hash_encoder=ce.HashingEncoder(cols=['Job'],n_components=7)
hash_transformed=hash_encoder.fit_transform(df)
print(hash_transformed.iloc[0:7,])


#target encoding 
# Supervised: uses the 'Target' column; smoothing=1 blends with the global mean.
target_encoder=ce.TargetEncoder(cols='Job',smoothing=1)
mean_target_transformed=target_encoder.fit_transform(df['Job'],df['Target'])
print(mean_target_transformed.iloc[0:7,])

#WoE
# Weight of evidence — presumably 'Target' is binary; TODO confirm.
woe_encoder=ce.WOEEncoder(cols='Job')
woe_encoder_transformed=woe_encoder.fit_transform(df['Job'],df['Target'])
print(woe_encoder_transformed.iloc[0:7,])
# Subset of rows whose Job is 'student' (note: rebinds y).
y=df[df['Job']=='student']

# Build a preprocessing + logistic-regression pipeline for the car dataset.
y = df_car[target_column].values.ravel()
X = df_car.drop(columns_to_drop, axis=1)

# Create Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=123)

########################################################################################################################
# Make Pipeline
########################################################################################################################

# Instantiate Transformers
# RobustScaler is outlier-resistant (median/IQR based).
scaler = RobustScaler()
encoder = ce.TargetEncoder(cols=cat_columns)

# Add Transformers to Pipeline
num_transformer = make_pipeline(scaler)
cat_transformer = make_pipeline(encoder)

# Create Preprocessing Pipeline
# Numeric columns are scaled; categorical columns are target-encoded.
preprocessor = ColumnTransformer(
    transformers=[("num", num_transformer,
                   num_columns), ("cat", cat_transformer, cat_columns)])

# Model
model_lr = LogisticRegression(class_weight='balanced',
                              solver='lbfgs',
                              random_state=123,
                              max_iter=10_000)
Exemplo n.º 13
0
def transform_data(X, y=None, test=False):
    """
    Preparing final dataset with all features.

    Arguments
    ---
    X - dataframe with preprocessed features and target variable
    y - target series; required on the training pass when log_target or
        target encoding is configured
    test - boolean; if false, it means X is the training set
           If true, it means X is the test set

    Returns
    ---
    X when test=True, otherwise the tuple (X, y).

    Side effects: on the training pass this binds module-level globals
    robust_scaler and target_encoder so the test pass can reuse them.
    """
    config = load_yaml("./config.yaml")

    columns = list(X.columns)

    log_cols = config["transform"]["log_cols"]
    log1p_cols = config["transform"]["log1p_cols"]
    boxcox1p_cols = config["transform"]["boxcox1p_cols"]
    onehot_cols = config["transform"]["onehot_cols"]
    targetencode_cols = config["transform"]["targetencode_cols"]
    log_target = config["transform"]["log_target"]

    # generate time features (only relevant for time series)
    # TODO: make datetime column identifiable from config file
    if "timestamp" in columns:
        X.timestamp = pd.to_datetime(X.timestamp, format="%Y-%m-%d %H:%M:%S")
        # adjust the desirable format accordingly
        X["hour"] = X.timestamp.dt.hour
        X["weekday"] = X.timestamp.dt.weekday

        if not test:
            X.sort_values("timestamp", inplace=True)
            X.reset_index(drop=True, inplace=True)

    # TODO: make cols identified from config file

    if log_cols:
        for col in log_cols:
            # this will replace the columns with their log values
            X[col] = np.log(X[col])

    if log1p_cols:
        for col in log1p_cols:
            # this will replace the columns with their log1p values
            X[col] = np.log1p(X[col])

    if boxcox1p_cols:
        for col in boxcox1p_cols:
            if col in columns:
                print("taking the log of " + str(col))
                # this will replace the columns with their boxcox1p values
                X[col] = boxcox1p(X[col], 0.15)

    # robust scaler
    numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
    if not test:
        global robust_scaler
        robust_scaler = RobustScaler()
        # BUG FIX: the scaled values were previously computed and then
        # discarded, so scaling never took effect; assign them back.
        X[numeric_cols] = robust_scaler.fit_transform(X[numeric_cols])
    else:
        # Reuse the scaler fitted on the training pass.
        X[numeric_cols] = robust_scaler.transform(X[numeric_cols])

    # transforming target
    if log_target and not test:
        y = np.log1p(y)

    # target encoding
    if targetencode_cols:
        if not test:
            global target_encoder
            target_encoder = ce.TargetEncoder(cols=targetencode_cols)
            X = target_encoder.fit_transform(X, y)
        else:
            X = target_encoder.transform(X)

    if test:
        return X
    else:
        return X, y
    def test_inverse_transform_26(self):
        """
        Test multiple dict encoding

        Fits a chain of encoders (OneHot -> Binary -> Ordinal -> BaseN ->
        Target) on a training frame, transforms a test frame containing
        unseen 'ZZ' categories, and checks that inverse_transform recovers
        the expected original values (unseen categories map to 'missing',
        NaN, or 'NaN' depending on the encoder).
        """
        train = pd.DataFrame({
            'Onehot1': ['A', 'B', 'A', 'B'],
            'Onehot2': ['C', 'D', 'C', 'D'],
            'Binary1': ['E', 'F', 'E', 'F'],
            'Binary2': ['G', 'H', 'G', 'H'],
            'Ordinal1': ['I', 'J', 'I', 'J'],
            'Ordinal2': ['K', 'L', 'K', 'L'],
            'BaseN1': ['M', 'N', 'M', 'N'],
            'BaseN2': ['O', 'P', 'O', 'P'],
            'Target1': ['Q', 'R', 'Q', 'R'],
            'Target2': ['S', 'T', 'S', 'T'],
            'other': ['other', np.nan, 'other', 'other']
        })

        # 'ZZ' values are categories never seen during fit.
        test = pd.DataFrame(
            {
                'Onehot1': ['A', 'B', 'A'],
                'Onehot2': ['C', 'D', 'ZZ'],
                'Binary1': ['E', 'F', 'F'],
                'Binary2': ['G', 'H', 'ZZ'],
                'Ordinal1': ['I', 'J', 'J'],
                'Ordinal2': ['K', 'L', 'ZZ'],
                'BaseN1': ['M', 'N', 'N'],
                'BaseN2': ['O', 'P', 'ZZ'],
                'Target1': ['Q', 'R', 'R'],
                'Target2': ['S', 'T', 'ZZ'],
                'other': ['other', '123', np.nan]
            },
            index=['index1', 'index2', 'index3'])

        # Expected round-trip: note the per-encoder handling of unseen values
        # ('missing' string, np.nan for BaseN2, literal 'NaN' for Target2).
        expected = pd.DataFrame(
            {
                'Onehot1': ['A', 'B', 'A'],
                'Onehot2': ['C', 'D', 'missing'],
                'Binary1': ['E', 'F', 'F'],
                'Binary2': ['G', 'H', 'missing'],
                'Ordinal1': ['I', 'J', 'J'],
                'Ordinal2': ['K', 'L', 'missing'],
                'BaseN1': ['M', 'N', 'N'],
                'BaseN2': ['O', 'P', np.nan],
                'Target1': ['Q', 'R', 'R'],
                'Target2': ['S', 'T', 'NaN'],
                'other': ['other', '123', np.nan]
            },
            index=['index1', 'index2', 'index3'])

        y = pd.DataFrame(data=[0, 1, 0, 0], columns=['y'])

        # Sequential fit: each encoder is fitted on the previous one's output.
        enc_onehot = ce.OneHotEncoder(cols=['Onehot1', 'Onehot2']).fit(train)
        train_onehot = enc_onehot.transform(train)
        enc_binary = ce.BinaryEncoder(
            cols=['Binary1', 'Binary2']).fit(train_onehot)
        train_binary = enc_binary.transform(train_onehot)
        enc_ordinal = ce.OrdinalEncoder(
            cols=['Ordinal1', 'Ordinal2']).fit(train_binary)
        train_ordinal = enc_ordinal.transform(train_binary)
        enc_basen = ce.BaseNEncoder(
            cols=['BaseN1', 'BaseN2']).fit(train_ordinal)
        train_basen = enc_basen.transform(train_ordinal)
        enc_target = ce.TargetEncoder(cols=['Target1', 'Target2']).fit(
            train_basen, y)

        # Mapping dicts in the {col, mapping Series, data_type} form.
        input_dict1 = dict()
        input_dict1['col'] = 'Onehot2'
        input_dict1['mapping'] = pd.Series(data=['C', 'D', np.nan],
                                           index=['C', 'D', 'missing'])
        input_dict1['data_type'] = 'object'

        input_dict2 = dict()
        input_dict2['col'] = 'Binary2'
        input_dict2['mapping'] = pd.Series(data=['G', 'H', np.nan],
                                           index=['G', 'H', 'missing'])
        input_dict2['data_type'] = 'object'

        input_dict3 = dict()
        input_dict3['col'] = 'Ordinal2'
        input_dict3['mapping'] = pd.Series(data=['K', 'L', np.nan],
                                           index=['K', 'L', 'missing'])
        input_dict3['data_type'] = 'object'
        list_dict = [input_dict2, input_dict3]

        # Forward pass through the whole chain.
        result1 = enc_onehot.transform(test)
        result2 = enc_binary.transform(result1)
        result3 = enc_ordinal.transform(result2)
        result4 = enc_basen.transform(result3)
        result5 = enc_target.transform(result4)

        # Inverse pass: encoders plus mapping dicts, in chain order.
        original = inverse_transform(result5, [
            enc_onehot, enc_binary, enc_ordinal, enc_basen, enc_target,
            input_dict1, list_dict
        ])

        pd.testing.assert_frame_equal(expected, original)
Exemplo n.º 15
0
# Registry of encoder instances keyed by display name. The PP (pattern
# preserving) variants are configured with a LinearRegression estimator and
# two predictors; ce.* entries come from category_encoders.
Encoders = {
    'Ordinal':
    ce.OrdinalEncoder(),
    'Polynomial':
    ce.PolynomialEncoder(),
    'OneHot':
    ce.OneHotEncoder(),
    'BackwardDifference':
    ce.BackwardDifferenceEncoder(),
    'Helmert':
    ce.HelmertEncoder(),
    'EntityEmbedding':
    EntityEmbeddingEncoder(),
    'TargetEnc':
    ce.TargetEncoder(),
    'WOE':
    ce.WOEEncoder(),
    'CENG':
    CENGEncoder(verbose=0),
    'GeneticPP':
    GeneticPPEncoder(estimator_name='LinearRegression', num_predictors=2),
    'AgingPP':
    AgingPPEncoder(estimator_name='LinearRegression', num_predictors=2),
    'SimplePP':
    SimplePPEncoder(estimator_name='LinearRegression', num_predictors=2),
    'CESAMOEncoder':
    CESAMOEncoder()
}

if target_flag == 0:
Exemplo n.º 16
0
def target_encode(df, target):
    """Target-encode every column of df against target.

    Fits a TargetEncoder over all columns (list(df) yields the column
    names) and returns the transformed frame; df itself is not modified.
    """
    enc = category_encoders.TargetEncoder(cols=list(df))
    enc.fit(df, target)
    return enc.transform(df)
Exemplo n.º 17
0
# Temporal split: test rows fall inside the [start, end) cutoff-date window.
X_test = df[(Fecha_inicial_test <= df['Fc.Corte'])
            & (df['Fc.Corte'] < Fecha_final_test)]
y_train = df[list_targets[0]][X_train.index]
y_test = df[list_targets[0]][X_test.index]
# The final field-test data comes from year 2020.
X_prueba_campo = df[(Fecha_inicial_prueba <= df['Fc.Corte'])
                    & (df['Fc.Corte'] < Fecha_final_prueba)]
y_prueba_campo = df[list_targets[0]][X_prueba_campo.index]
X = df[list_predictors]
X_muestras = Data_Muestras[list_predictors]
y_muestras = Data_Muestras[list_targets[0]]
# Separate the categorical variables.
train = X_train[predictores_categoricos]
test = X_test[predictores_categoricos]
# Apply the target encoder to them.
# NOTE(review): handle_unknown='ignore' may not be a valid option in newer
# category_encoders releases ('value'/'error' are documented) — verify version.
encoder_mean = ce.TargetEncoder(cols=predictores_categoricos,
                                handle_unknown='ignore')
OH_cols_train = encoder_mean.fit_transform(train[predictores_categoricos],
                                           y_train)
OH_cols_test = encoder_mean.transform(test[predictores_categoricos], y_test)
# Drop the categorical columns from the data and replace them with the
# encoded ones.
num_X_train = X_train[predictores_numericos]
num_X_test = X_test[predictores_numericos]
# Concatenate numeric + encoded-categorical, column-wise.
X_train_escalable = pd.concat([num_X_train, OH_cols_train], axis=1)
X_test_escalable = pd.concat([num_X_test, OH_cols_test], axis=1)
# Scale the numeric data (scaler fitted on train only).
scaler = StandardScaler()
Num_X_Scaler_train = pd.DataFrame(scaler.fit_transform(X_train_escalable))
Num_X_Scaler_test = pd.DataFrame(scaler.transform(X_test_escalable))
# Scaling drops the indices, so restore them.
Num_X_Scaler_train.index = X_train.index
fig.update_yaxes(title_text='Percent')

fig.update_layout(height=500,
                  width=700,
                  title_text='Exited Percentage by Gender',
                  yaxis={'ticksuffix': '%'})
fig.show()

# %% [markdown] id="rI5u37Z0mPYj"
# Female customers have a higher churn rate (25%) than male customers (16.5%).

# %% [markdown] id="7S_v5Qi1nD32"
# ### Heatmap Correlation

# %% execution={"iopub.execute_input": "2020-10-08T14:24:52.214085Z", "iopub.status.busy": "2020-10-08T14:24:52.213088Z", "iopub.status.idle": "2020-10-08T14:24:52.385628Z", "shell.execute_reply": "2020-10-08T14:24:52.384657Z", "shell.execute_reply.started": "2020-10-08T14:24:52.213088Z"} id="kcMLnvJxZIuD"
# Target-encode all categoricals against 'Exited' so every column is numeric
# and can appear in the correlation matrix.
encoder = ce.TargetEncoder()
df_temp = encoder.fit_transform(df.drop(columns='Exited'), df['Exited'])
df_corr = df_temp.join(df['Exited']).corr()

# Annotated heatmap of the full correlation matrix.
fig = ff.create_annotated_heatmap(z=df_corr.values,
                                  x=list(df_corr.columns),
                                  y=list(df_corr.index),
                                  annotation_text=df_corr.round(2).values,
                                  showscale=True,
                                  colorscale='Viridis')
fig.update_layout(height=600, width=800, title_text='Feature Correlation')
fig.update_xaxes(side='bottom')
fig.show()

# %% [markdown] id="rOzxBfS7nLJp"
# The highest correlation to Target is the Surname feature (0.36), and the second is the Age feature with a value of 0.29.
Exemplo n.º 19
0
plot.tick_params(labelsize=7)
plt.show()

# Cabin is dropped — presumably too sparse to be useful; TODO confirm rationale.
merge_data.drop(['Cabin'], axis=1, inplace=True)

# # get Family name
# merge_data['Name'] = merge_data['Name'].apply(lambda x: x.split(', ')[1])
# Extract the second whitespace-separated token of Name as the family name.
merge_data['familyname'] = merge_data['Name'].apply(lambda x: x.split(' ')[1])
# Mean survival rate per family name, sorted descending, for the bar plot.
familyname_ratio = merge_data.groupby('familyname').mean()[[
    'Survived'
]].sort_values(by='Survived', ascending=False)
sns.barplot(familyname_ratio.index, familyname_ratio.Survived)
plt.show()  # --> have huge difference

# drop Name, familyname and add survival ratio by familyname -->  Target encoding instead of familyname
# BUG FIX: the encoder was never fitted before transform() was called, which
# raises a not-fitted error in category_encoders. fit_transform fits on the
# family names against the Survived target and encodes in a single step.
target_encoder = ce.TargetEncoder()
merge_data['survival_ratio'] = target_encoder.fit_transform(
    merge_data.familyname, merge_data.Survived).values

merge_data.drop(['Name', 'familyname'], axis=1, inplace=True)

# # Age: categorical, Embarked: Categorical
# # Age: [,16] [17,40][41,60][61,]
# merge_data['Age'] = pd.cut(merge_data.Age,bins=[0,16,40,60,120],labels=['kid','youth','middle-aged','elderly'])
# # Age labeling---------/

# Label-encode the remaining categoricals via the project's labeling() helper;
# commented-out calls (Ticket, Cabin) are deliberately excluded.
merge_data = labeling(merge_data, ['Pclass'])
merge_data = labeling(merge_data, ['Sex'])
# merge_data = labeling(merge_data, ['Ticket'])
merge_data = labeling(merge_data, ['Ticket_alpha'])
# merge_data = labeling(merge_data, ['Cabin'])
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

data.drop('id',axis=1,inplace=True)
test.drop('id',axis=1,inplace=True)

target = data['target']
train = data.drop('target',axis=1)

# Force every feature to string so all columns are treated as categorical.
train = train.applymap(str)
test = test.applymap(str)

encoding_cols = train.columns
encoded = pd.DataFrame([])
# Out-of-fold target encoding: each fold is encoded by an encoder fitted on
# the remaining folds, avoiding target leakage.
# NOTE(review): DataFrame.append was removed in pandas 2.0 — use pd.concat
# when upgrading.
for tr_in,fold_in in StratifiedKFold(n_splits=12, shuffle=True).split(train, target):
    encoder = ce.TargetEncoder(cols = encoding_cols, smoothing=0.2)
    encoder.fit(train.iloc[tr_in,:],target.iloc[tr_in])
    encoded = encoded.append(encoder.transform(train.iloc[fold_in,:]),ignore_index=False)
# The test set is encoded by an encoder fitted on ALL training rows.
encoder = ce.TargetEncoder(cols = encoding_cols,smoothing=0.2)
encoder.fit(train,target)
test = encoder.transform(test)
# Restore the original row order of the out-of-fold pieces.
train = encoded.sort_index()

# Pre-tuned CatBoost hyperparameters — presumably from an earlier search;
# origin not visible here.
best_params = {'bagging_temperature': 0.8,
               'depth': 5,
               'iterations': 1000,
               'l2_leaf_reg': 30,
               'learning_rate': 0.05,
               'random_strength': 0.8}

n_splits = 12
# Drop the trailing column, then remove rows with any missing values.
df = df.iloc[:, :-1]
df = df.dropna()

# encode the customer response column
# IMPROVED: vectorized replacement of the element-wise loop that wrote
# through .values[i] (slow, and fragile if df is a view/copy).
# 'Won' -> 1, anything else -> 0 — same mapping as the original loop.
df['Customer Response'] = (df['Customer Response'] == 'Won').astype(int)

# Normalize the double-spaced column name.
df = df.rename(columns={'Contract  Status': 'Contract Status'})

# Categorical Encoding --------------------------------------------------
import category_encoders as ce
cat_features = ['Contract Status']
target_enc = ce.TargetEncoder(cols=cat_features)

features = [
    'Contract Status', 'Product Family', 'FTS Rate', 'Forecast',
    'Market Share', 'Number of Competitors', 'WAC Price'
]

y = df['Customer Response'].astype('int64')
X = df[features]

from sklearn.model_selection import train_test_split
# split data into training and validation data, for both features and target
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Fit the encoder using the categorical features and target
# (fit on train only, so validation rows cannot leak into the encoding).
target_enc.fit(train_X[cat_features], train_y)
Exemplo n.º 22
0

    train_columns = np.concatenate([train.columns.values, train_scaled.columns.values], axis= 0)
    train = pd.DataFrame(np.hstack([train, train_scaled]),columns=train_columns)

    y = train.loc[:, ['Income in EUR']].astype('int')
    X = train.copy()
    X.drop(['Instance','Income in EUR'],axis=1, inplace=True)



"""
    Encoding
"""

    encoder = ce.TargetEncoder(cols=['Country','Year of Record','Gender','University Degree','PopulationCatg','Profession'])
    encoder.fit(X[['Country','Year of Record','Gender','University Degree','PopulationCatg','Profession']], y)
    X_cleaned = encoder.transform(X[['Country','Year of Record','Gender','University Degree','PopulationCatg','Profession']], y)
    X.drop(['Country','Year of Record','Gender','University Degree','PopulationCatg','Profession'], axis =1, inplace = True)
    X_columns = np.concatenate([X.columns.values, X_cleaned.columns.values], axis= 0)
    X = pd.DataFrame(np.hstack([X, X_cleaned]),columns=X_columns)


   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.37, random_state=1)
#-----------------------------------------------------------------------------------------------
   
#-----------------------------------------------------------------------------------------------    

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
def feat_eng(rossman_featured,
             train_data=True,
             te_store=None,
             te_week=None,
             te_day=None):
    """Feature-engineer a Rossmann sales dataframe.

    - Derives ``Year``, ``Day`` (of month) and ``WeekofYear`` from the
      ``Date`` column.
    - One-hot encodes several low-cardinality categorical columns.
    - Target-encodes ``Store``, ``WeekofYear`` and ``Day`` against
      ``Sales`` — fitting fresh encoders when ``train_data`` is True,
      otherwise reusing the fitted encoders passed in.
    - Drops raw/intermediate columns that are no longer needed.

    Parameters
    ----------
    rossman_featured : pandas.core.frame.DataFrame
        Input data. Must contain a datetime ``Date`` column, the columns
        listed for one-hot encoding, and — when ``train_data`` is True —
        a ``Sales`` column.
    train_data : bool, default True
        True: fit new target encoders on this data.
        False: apply the encoders supplied via ``te_store``/``te_week``/``te_day``.
    te_store, te_week, te_day : fitted TargetEncoder or None
        Encoders to reuse when ``train_data`` is False.

    Returns
    -------
    tuple
        ``(engineered_dataframe, te_store, te_week, te_day)``.

    Raises
    ------
    ValueError
        If ``train_data`` is False and any of the three encoders is missing
        (previously this surfaced as an obscure ``AttributeError`` on None).
    """
    # Date-derived features.
    rossman_featured['Year'] = rossman_featured['Date'].dt.year
    rossman_featured['Day'] = rossman_featured['Date'].dt.day  # day of the month
    rossman_featured['WeekofYear'] = rossman_featured[
        'Date'].dt.isocalendar().week  # week of the year
    # isocalendar() yields a UInt32 series; downstream code expects plain int.
    rossman_featured['WeekofYear'] = rossman_featured['WeekofYear'].astype(int)

    # One-hot encode the low-cardinality categoricals.
    dummy_cols = [
        'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
        'PromoInterval', 'DayOfWeek'
    ]
    rossman_featured = pd.get_dummies(data=rossman_featured,
                                      columns=dummy_cols,
                                      prefix=dummy_cols,
                                      prefix_sep='_')

    if train_data:
        # Fit one TargetEncoder per high-cardinality column against Sales.
        te_store = ce.TargetEncoder(cols=['Store'])
        rossman_featured['Store_target'] = te_store.fit_transform(
            rossman_featured['Store'], rossman_featured['Sales'])

        te_week = ce.TargetEncoder(cols=['WeekofYear'])
        rossman_featured['WeekofYear_target'] = te_week.fit_transform(
            rossman_featured['WeekofYear'], rossman_featured['Sales'])

        te_day = ce.TargetEncoder(cols=['Day'])
        rossman_featured['Day_target'] = te_day.fit_transform(
            rossman_featured['Day'], rossman_featured['Sales'])
    else:
        # Inference path: the encoders fitted on the training data are
        # mandatory — fail fast with a clear message instead of an
        # AttributeError on None.
        if te_store is None or te_week is None or te_day is None:
            raise ValueError(
                "te_store, te_week and te_day must all be provided when "
                "train_data is False")
        rossman_featured['Store_target'] = te_store.transform(
            rossman_featured['Store'])
        rossman_featured['WeekofYear_target'] = te_week.transform(
            rossman_featured['WeekofYear'])
        rossman_featured['Day_target'] = te_day.transform(
            rossman_featured['Day'])

    # Drop the raw/intermediate columns that have been encoded or are unused.
    rossman_featured = rossman_featured.drop([
        'Date', 'Store', 'Year', 'WeekofYear', 'Day', 'Customers',
        'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
        'Promo2SinceWeek', 'Promo2SinceYear'
    ],
                                             axis=1)
    return rossman_featured, te_store, te_week, te_day
Exemplo n.º 24
0
# Separate the target: ``pop`` removes 'salary' from ``df`` in place.
y = df.pop('salary')

# df.head()
# df.shape

#%% Preprocessor functions
# One-hot encoder for the categorical columns; handle_missing='return_nan'
# leaves NaNs in the output, so downstream code must fill them (see the
# inline reminder).
ohe = ce.OneHotEncoder(
    drop_invariant=True,
    return_df=True,
    use_cat_names=True,
    handle_missing='return_nan')  # Remember replace(np.nan, 0)

# Target encoder alternative; smoothing knobs were tried and left disabled.
# handle_missing='value' — see category_encoders docs for exact imputation.
tge = ce.TargetEncoder(
    drop_invariant=True,
    return_df=True,
    handle_missing='value',
    # min_samples_leaf=3,
    # smoothing=0.4,
)

# Numeric (percentage-score) columns, passed through unencoded.
num_cols = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
# Raw categorical columns fed to the encoders.
cat_cols = [
    'gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation'
]
# Column names produced when ``cat_cols`` is one-hot encoded with
# use_cat_names=True (category value appended to the column name).
new_cat_cols = [
    'gender_M', 'gender_F', 'ssc_b_Others', 'ssc_b_Central', 'hsc_b_Others',
    'hsc_b_Central', 'hsc_s_Commerce', 'hsc_s_Science', 'hsc_s_Arts',
    'degree_t_Sci&Tech', 'degree_t_Comm&Mgmt', 'degree_t_Others', 'workex_No',
    'workex_Yes', 'specialisation_Mkt&HR', 'specialisation_Mkt&Fin'
]
    # Validation split for the current fold (``df``, ``FOLD``, ``train_df``
    # come from the enclosing scope, which is outside this view).
    valid_df = df[df.kfold==FOLD].reset_index(drop=True)

    ytrain = train_df.target
    yvalid = valid_df.target

    train_df = train_df.drop(["id", "target", "kfold"], axis=1)
    valid_df = valid_df.drop(["id", "target", "kfold"], axis=1)

    # Align validation columns to the training column order.
    valid_df = valid_df[train_df.columns]

    cols_to_enc = train_df.columns.tolist()

    # Target-encode every column, keeping each fitted encoder for serving.
    # NOTE(review): the encoders are fitted on train+valid data *including
    # the validation targets* — this leaks validation labels into the
    # features and inflates the reported AUC. Confirm this is intentional.
    target_encoders = {}
    for c in train_df.columns:
        if c in cols_to_enc:
            print(c)
            tge = ce.TargetEncoder(cols=[c])
            tge.fit(pd.concat([train_df.loc[:,c],valid_df.loc[:,c]],axis=0), pd.concat([ytrain,yvalid],axis=0))
            train_df.loc[:,c] = tge.transform(train_df.loc[:,c])
            valid_df.loc[:,c] = tge.transform(valid_df.loc[:,c])
            target_encoders[c] = tge
    
    # data is ready to train
    clf = dispatcher.MODELS[MODEL]
    clf.fit(train_df, ytrain)
    preds = clf.predict_proba(valid_df)[:, 1]
    print(metrics.roc_auc_score(yvalid, preds))

    # Persist encoders, model and column order for the inference side.
    joblib.dump(target_encoders, f"models/{MODEL}_{FOLD}_target_encoder.pkl")
    joblib.dump(clf, f"models/{MODEL}_{FOLD}.pkl")
    joblib.dump(train_df.columns, f"models/{MODEL}_{FOLD}_columns.pkl")
Exemplo n.º 26
0
def run_i_experiments():
    """Benchmark several categorical encoders across several regression
    models on the insurance dataset and write the results to CSV.

    For every (model, encoder) pair, ``cv_regression`` is run and its
    R2 / std / training-time / sparsity / dimensionality metrics are
    collected into ``insurance_experiments.csv``; the file is then
    uploaded on a best-effort basis.
    """
    print("Loading Data")
    df = load_data()
    # Column groups for the insurance data.
    continuous = ['age', 'bmi']
    categorical = ['sex', 'children', 'smoker', 'region']

    X = df[continuous + categorical]
    y = df[['charges']]

    # Prior mean / std of the target, used to initialise the GIG encoder.
    u_0 = np.mean(y)[0]
    v = np.std(y)[0]

    models = [
        Ridge(),
        RandomForestRegressor(n_estimators=100),
        GradientBoostingRegressor(),
        MLPRegressor()
    ]

    # (banner printed before the run, label stored in the results row,
    #  factory building a *fresh* encoder per run, extra cv kwargs).
    # Factories rather than instances: encoders are stateful once fitted,
    # so each (model, encoder) run must get its own instance.
    encoder_specs = [
        ('TargetEncoder Results:', 'TargetEncoder',
         lambda: ce.TargetEncoder(return_df=False), {}),
        ('OrdinalEncoder Results:', 'OrdinalEncoder',
         lambda: ce.OrdinalEncoder(return_df=False), {}),
        ('BinaryEncoder Results:', 'BinaryEncoder',
         lambda: ce.BinaryEncoder(return_df=False), {}),
        ('HashingEncoder Results:', 'HashingEncoder',
         lambda: ce.HashingEncoder(return_df=False), {}),
        ('OneHotEncoder Results:', 'OneHotEncoder',
         lambda: OneHotEncoder(handle_unknown='ignore', sparse=False), {}),
        ('GIG Encoder (mean) Results:', 'GIGEncoder (m)',
         lambda: GIGEncoder(u_0=u_0, v=v), {}),
        ('GIG Encoder (mean and variance) Results:', 'GIGEncoder (mv)',
         lambda: GIGEncoder(u_0=u_0, v=v), {'moments': 'mv'}),
    ]

    results = [[
        'model', 'Encoder', 'R2', 'STD', 'Training Time', 'Sparsity',
        'Dimensions'
    ]]

    for model in models:
        print("")
        print("----------------------")
        print("Testing Algorithm: ")
        print(type(model))
        print("----------------------")

        for banner, label, make_encoder, extra in encoder_specs:
            print(banner)
            r2, std, time, sparsity, dimensions = cv_regression(
                model, X, y, continuous, categorical,
                encoder=make_encoder(), **extra)
            results.append(
                [type(model), label, r2, std, time, sparsity, dimensions])

    file = 'insurance_experiments.csv'
    with open(file, "w") as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(results)
    try:
        upload_file(file)
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; upload remains best-effort.
        print("File Not Uploaded")
#from sklearn.ensemble import RandomForestClassifier

#%% read data

train = pd.read_csv("training_set_features.csv")
test = pd.read_csv("test_set_features.csv")
train_labels = pd.read_csv("training_set_labels.csv")

# Columns excluded from the feature matrix: the row id plus behavioural
# indicators left out of modelling.
drop_vars = [
    'respondent_id', 'child_under_6_months', 'behavioral_wash_hands',
    'behavioral_face_mask', 'behavioral_antiviral_meds'
]

#%% encoding

# One target encoder per label, each fitted against its own vaccine target.
h1n1_targ_enc = ce.TargetEncoder()
h1n1_targ_enc.fit(train.drop(drop_vars, axis=1), train_labels['h1n1_vaccine'])
h1n1_train = h1n1_targ_enc.transform(train.drop(drop_vars, axis=1))

seasonal_targ_enc = ce.TargetEncoder()
seasonal_targ_enc.fit(train.drop(drop_vars, axis=1),
                      train_labels['seasonal_vaccine'])
# Bug fix: this previously used ``h1n1_targ_enc``, so the seasonal features
# were encoded against the H1N1 target and ``seasonal_targ_enc`` was fitted
# but never used.
seasonal_train = seasonal_targ_enc.transform(train.drop(drop_vars, axis=1))

#%% imputation

# Iterative (chained-equation) imputation; min_value=0 keeps imputed
# values non-negative to match the original feature domain.
h1n1_imp = IterativeImputer(max_iter=20, min_value=0)
h1n1_imp.fit(h1n1_train)
h1n1_train = pd.DataFrame(h1n1_imp.transform(h1n1_train),
                          columns=h1n1_train.columns)
Exemplo n.º 28
0
def norm_data(X_train,
              X_test,
              y_train,
              y_test,
              real=None,
              categ=None,
              all=True):
    '''Preprocess features: scale the real-valued columns and encode the
    categorical ones.

    Columns 0, 1 and 81 of the input matrices are treated as categorical;
    every other column is treated as real-valued.

    Parameters
    ----------
    X_train, X_test : numpy.ndarray
        2-D feature matrices with at least 83 columns.
    y_train, y_test : numpy.ndarray
        Targets, used only to fit supervised categorical encoders.
    real : {'standart', 'normal', None}
        'standart' -> StandardScaler, 'normal' -> MinMaxScaler,
        anything else -> real columns passed through unchanged.
        (The spelling 'standart' is part of the existing API.)
    categ : {'target', 'onehot', 'helmert', 'hash', None}
        Which category_encoders encoder to apply to the categorical
        columns; anything else passes them through unchanged.
    all : bool
        When True, fit the categorical encoder on train+test combined.
        NOTE(review): with supervised encoders ('target') this leaks
        test targets into the features — confirm it is intentional.
        (The name shadows the builtin ``all``; kept for compatibility.)

    Returns
    -------
    tuple of numpy.ndarray
        (X_train_ready, X_test_ready), categorical columns first.
    '''
    # ---- split into categorical (cols 0, 1, 81) and real parts ----
    X_train_categ = np.hstack((X_train[:, :2], X_train[:, 81:82]))
    X_test_categ = np.hstack((X_test[:, :2], X_test[:, 81:82]))

    X_train_real = np.hstack((X_train[:, 2:81], X_train[:, 82:]))
    X_test_real = np.hstack((X_test[:, 2:81], X_test[:, 82:]))

    # ---- choose the data the categorical encoder is fitted on ----
    if all:
        X_all_categ = np.append(X_train_categ, X_test_categ, axis=0)
        y_all = np.append(y_train, y_test, axis=0)
    else:
        X_all_categ = X_train_categ
        y_all = y_train

    # ---- scale the real-valued columns ----
    if real == 'standart':
        scaler = StandardScaler()
        X_train_real_res = scaler.fit_transform(X_train_real)
        X_test_real_res = scaler.transform(X_test_real)
    elif real == 'normal':
        scaler = preprocessing.MinMaxScaler()
        X_train_real_res = scaler.fit_transform(X_train_real)
        X_test_real_res = scaler.transform(X_test_real)
    else:
        X_train_real_res = X_train_real
        X_test_real_res = X_test_real

    # ---- encode the categorical columns ----
    # All four encoder branches shared the same fit/transform sequence,
    # so dispatch on the encoder class name instead of copy-pasting it.
    encoder_names = {
        'target': 'TargetEncoder',
        'onehot': 'OneHotEncoder',
        'helmert': 'HelmertEncoder',
        'hash': 'HashingEncoder',
    }
    if categ in encoder_names:
        # Resolved lazily so ``ce`` is only touched when an encoder is used.
        encoder = getattr(ce, encoder_names[categ])(cols=[0, 1, 2],
                                                    return_df=False)
        encoder.fit(X_all_categ, y_all)
        X_train_categ_res = encoder.transform(X_train_categ)
        X_test_categ_res = encoder.transform(X_test_categ)
    else:
        X_train_categ_res = X_train_categ
        X_test_categ_res = X_test_categ

    # ---- join the categorical and real parts back together ----
    X_train_ready = np.hstack((X_train_categ_res, X_train_real_res))
    X_test_ready = np.hstack((X_test_categ_res, X_test_real_res))

    return X_train_ready, X_test_ready
Exemplo n.º 29
0
        'pref_month_m6_5', 'pref_month_y1_1', 'pref_month_y1_2', 'pref_month_y1_3', 'pref_month_y1_4',
        'pref_month_y1_5', 'pref_month_y2_1', 'pref_month_y2_2', 'pref_month_y2_3', 'pref_month_y2_4',
        'pref_month_y2_5', 'pref_month_y3_1', 'pref_month_y3_2', 'pref_month_y3_3', 'pref_month_y3_4',
        'pref_month_y3_5', 'recent_flt_day', 'pit_add_chnl_m3', 'pit_add_chnl_m6', 'pit_add_chnl_y1',
        'pit_add_chnl_y2', 'pit_add_chnl_y3', 'pref_orig_city_m3', 'pref_orig_city_m6',
        'pref_orig_city_y1',
        'pref_orig_city_y2', 'pref_orig_city_y3', 'pref_dest_city_m3', 'pref_dest_city_m6',
        'pref_dest_city_y1',
        'pref_dest_city_y2', 'pref_dest_city_y3'
        , 'seg_dep_time_month', 'seg_dep_time_year', 'seg_dep_time_is_workday'
    ]
    # Continuous features are whatever is not in the discrete list.
    # NOTE: set() difference does not preserve order, so continue_list
    # ordering is non-deterministic across runs — confirm downstream code
    # is order-agnostic.
    continue_list = list(set(feature_list) - set(discrete_list))
    print('特征列表长度为{0},离散特征长度{1},连续特征长度{2}'.format(len(feature_list), len(discrete_list), len(continue_list)))

    # Target-encode the discrete features.
    # NOTE(review): the encoder is fitted on the full dataset before the
    # train/test split below, which leaks test targets into the features.
    encoder = ce.TargetEncoder(cols=discrete_list, drop_invariant=False).fit(data_train_feature,
                                                                             data_target)
    data_train_feature = encoder.transform(data_train_feature).to_numpy()
    train_test_split = getTrainTest(data_train_feature, data_target)

    # Particle-swarm feature selection, one run per train/test split.
    for train_index, test_index in train_test_split:
        np.random.seed(0)
        obj = Data(data_train_feature, data_target, train_index)
        obj2 = Test_Data(data_train_feature, data_target, train_index, test_index)
        pso = PSO(iterations=100, obj=obj, beta=0.2, alpha=0.4)
        pso.run()
        print('得到的特征子集序列为', pso.best.getPBest())
        print('特征子集长度为', len(set(pso.best.getPBest())))
        print('训练集准确率(适应度)为', pso.best.getCostPBest())
        print('得到的测试集准确率为', obj2.getTestAccuracy(pso.best.getPBest()))
        print('得到的测试集F1值为', obj2.getTestF1(pso.best.getPBest()))
Exemplo n.º 30
0
def run_bs_experiments():
    """Benchmark several categorical encoders across several regression
    models on the bike-sharing dataset and write the results to CSV.

    For every (model, encoder) pair, ``cv_regression`` is run and its
    R2 / std / training-time / sparsity / dimensionality metrics are
    collected into ``bike_sharing_experiments.csv``; the file is then
    uploaded on a best-effort basis.
    """
    print("Loading Data")
    df = load_data()
    # Column groups for the bike-sharing data.
    continuous = ['temp', 'atemp', 'hum', 'windspeed']
    categorical = [
        'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
        'workingday', 'weathersit'
    ]

    X = df[continuous + categorical]
    y = df[['cnt']]

    models = [
        Ridge(),
        RandomForestRegressor(n_estimators=100),
        GradientBoostingRegressor(),
        MLPRegressor()
    ]

    # (banner printed before the run, label stored in the results row,
    #  factory building a *fresh* encoder per run, extra cv kwargs).
    # Factories rather than instances: encoders are stateful once fitted,
    # so each (model, encoder) run must get its own instance.
    encoder_specs = [
        ('TargetEncoder Results:', 'TargetEncoder',
         lambda: ce.TargetEncoder(return_df=False), {}),
        ('OrdinalEncoder Results:', 'OrdinalEncoder',
         lambda: ce.OrdinalEncoder(return_df=False), {}),
        ('BinaryEncoder Results:', 'BinaryEncoder',
         lambda: ce.BinaryEncoder(return_df=False), {}),
        ('HashingEncoder Results:', 'HashingEncoder',
         lambda: ce.HashingEncoder(return_df=False), {}),
        ('OneHotEncoder Results:', 'OneHotEncoder',
         lambda: OneHotEncoder(handle_unknown='ignore', sparse=False), {}),
        ('GIG Encoder (mean) Results:', 'GIGEncoder (m)',
         lambda: GIGEncoder(), {}),
        # Banner fixed: the original string was missing its closing paren.
        ('GIG Encoder (mean and variance) Results:', 'GIGEncoder (mv)',
         lambda: GIGEncoder(), {'moments': 'mv'}),
    ]

    results = [[
        'model', 'Encoder', 'R2', 'STD', 'Training Time', 'Sparsity',
        'Dimensions'
    ]]

    for model in models:
        print("")
        print("----------------------")
        print("Testing Algorithm: ")
        print(type(model))
        print("----------------------")

        for banner, label, make_encoder, extra in encoder_specs:
            print(banner)
            r2, std, time, sparsity, dimensions = cv_regression(
                model, X, y, continuous, categorical,
                encoder=make_encoder(), **extra)
            results.append(
                [type(model), label, r2, std, time, sparsity, dimensions])

    file = 'bike_sharing_experiments.csv'
    with open(file, "w") as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(results)
    try:
        upload_file(file)
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; upload remains best-effort.
        print("File Not Uploaded")