def encode_high_cardinality_categorical_df(dataframe, fit=False):
   """
    Encode high cardinality categorical features using Binary Encoding and dropping invariant features
    In Binary Encoding, features are converted to a binary representation and binary digits are used as new
    features.
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), high card. categorical features only
        fit: boolean
            Indicates if we should train or load an encoder
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # Train or load an encoder    
    if fit:
        encoder = BinaryEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)
        
        pickle_obj(encoder, 'high_card_categorical_encoder')
    else:
        encoder = unpickle_obj('high_card_categorical_encoder')

    # transform data
    return encoder.transform(dataframe)
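
For reference, a minimal sketch of what the encoder above produces (toy data, independent of the pickle helpers): with n distinct values, Binary Encoding yields roughly ceil(log2(n + 1)) digit columns instead of the n columns one-hot encoding would create.

import pandas as pd
from category_encoders import BinaryEncoder

# Toy frame standing in for the pre-processed high-cardinality features.
demo = pd.DataFrame({"city": ["NY", "LA", "SF", "CHI", "BOS", "SEA"]})

# 6 distinct values -> ordinal codes 1..6 -> 3 binary digit columns (2^3 >= 6).
encoded = BinaryEncoder(cols=["city"], drop_invariant=True).fit_transform(demo)
print(encoded.columns.tolist())  # e.g. ['city_0', 'city_1', 'city_2']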
Example #2
def do_cat_bin(X, X_test, cols):
    be = BinaryEncoder(cols=cols).fit(X[cols])
    X_tr = be.transform(X[cols])
    X_te = be.transform(X_test[cols])
    new_cols = list(X_tr.columns)
    print(f'do_cat_bin: Done. Added {len(new_cols)} new columns.')
    return X_tr, X_te, new_cols
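
A quick sketch of how do_cat_bin behaves on a toy train/test pair (made-up data; since the encoder is fitted on X only, categories first appearing in X_test fall back to BinaryEncoder's handle_unknown default):

import pandas as pd
from category_encoders import BinaryEncoder

X = pd.DataFrame({"cat": ["a", "b", "c", "a"]})
X_test = pd.DataFrame({"cat": ["b", "d"]})  # 'd' was never seen during fit

X_tr, X_te, new_cols = do_cat_bin(X, X_test, ["cat"])
print(new_cols)  # e.g. ['cat_0', 'cat_1']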
Example #3
def fit_binary(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """
    Creates the binary encoder by fitting it through the given DataFrame.
    NaN values, and any special value passed as `na_value`, will be encoded as an unseen value.
    Args:
        input_df: DataFrame used to fit the encoder
        cols: List of categorical columns to be encoded
        na_value: Default null value for DataFrame

    Returns:
        result_df: encoded input_df DataFrame
        model : encoder model to be passed to `transform_binary` method
    """
    df = input_df.copy()

    if na_value is not None:
        for col in cols:
            df[col] = df[col].replace({na_value: np.nan})

    encoder = BinaryEncoder(cols=cols, drop_invariant=True)
    encoder = encoder.fit(df)
    for idx in range(len(encoder.base_n_encoder.ordinal_encoder.mapping)):
        encoder.base_n_encoder.ordinal_encoder.mapping[idx]["mapping"].loc[
            np.nan] = -2

    result_df = encoder.transform(df)

    model = {"encoder": encoder, "cols": cols, "na_value": na_value}
    return result_df, model
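
The docstring above refers to a transform_binary counterpart; below is a minimal sketch of what such a function could look like, assuming only the model dict returned by fit_binary (an illustration, not the project's actual code):

def transform_binary(input_df: pd.DataFrame, model: dict) -> pd.DataFrame:
    """Applies a fitted binary-encoding model (as returned by fit_binary) to new data."""
    df = input_df.copy()

    # Re-map the null sentinel to NaN so it hits the mapping row added during fit.
    if model["na_value"] is not None:
        for col in model["cols"]:
            df[col] = df[col].replace({model["na_value"]: np.nan})

    return model["encoder"].transform(df)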
Example #4
def to_categorical(
    training_data: pd.DataFrame, test_data: pd.DataFrame
) -> dict:

    categorical_columns_list = list(training_data.columns[training_data.dtypes == object])
    # 'impute' is the older category_encoders spelling; 2.x versions use handle_unknown='value'
    ce_be = BinaryEncoder(cols=categorical_columns_list, handle_unknown="impute")
    training_data_ce_binary = ce_be.fit_transform(training_data)
    test_data_ce_binary = ce_be.transform(test_data)

    return dict(train_data_categorical=training_data_ce_binary,
                test_data_categorical=test_data_ce_binary)
def categoryEncode(df, cols=None, mode="binary"):
    if(mode == "ordinal"):
        encoder = OrdinalEncoder(cols=cols, handle_missing="return_nan", handle_unknown="return_nan")
    elif(mode == "binary"):
        encoder = BinaryEncoder(cols=cols)
    df_new = encoder.fit_transform(df)
    return df_new
Example #6
class DFBinaryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = BinaryEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y=None):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])

        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        new_X = self.model.transform(X[self.transform_cols])
        new_X[new_X.columns] = new_X[new_X.columns].astype('int8')

        new_X = pd.concat([X, new_X], axis=1)
        new_X.drop(columns=self.transform_cols, inplace=True)

        return new_X

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        columns = [
            x for x in X.columns
            if any([y for y in self.transform_cols if x.startswith(f'{y}_')])
        ]
        new_X = self.model.inverse_transform(X[columns])

        new_X = pd.concat([X, new_X], axis=1)
        new_X.drop(columns=columns, inplace=True)

        return new_X
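
A short round trip with DFBinaryEncoder (toy data; assumes the underlying category_encoders model supports inverse_transform for these settings):

import pandas as pd

df = pd.DataFrame({"color": ["red", "green", "blue", "red"], "size": [1, 2, 3, 4]})

enc = DFBinaryEncoder(columns=["color"])
encoded = enc.fit_transform(df)            # 'color' replaced by int8 digit columns, e.g. color_0, color_1
restored = enc.inverse_transform(encoded)  # digit columns collapsed back into 'color'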
Example #7
class df_BinaryEncoder(TransformerMixin):
    """
    Use for encoding nominal features.

    Parameters
    ----------
    handle_unknown : str, default='ignore'
        Passed through to category_encoders.BinaryEncoder.
    """
    def __init__(self, handle_unknown='ignore'):
        self.handle_unknown = handle_unknown
        
    def fit(self, X, y=None):
        self.enc = BinaryEncoder(handle_unknown=self.handle_unknown)
        self.enc.fit(X)
        return self
    
    def transform(self, X):
        return self.enc.transform(X)
Example #8
    def read_feature(self, one_hot=False, create_not_existing_features=True):
        """
        Reads a feature from disk and returns it.
        If one_hot is False, it is returned as it was saved.
        If one_hot is True, the categorical columns are one-hot encoded according to self.columns_to_onehot.
        """
        path = 'dataset/preprocessed/{}/{}/feature/{}/features.csv'.format(
            self.cluster, self.mode, self.name)
        if not os.path.exists(path):

            if create_not_existing_features:
                choice = 'y'
                print('Missing feature: creating')
            else:
                choice = yesno_choice(
                    'feature \'{}\' does not exist. want to create?'.format(
                        self.name))
            if choice == 'y':
                self.save_feature()
            else:
                return

        index_col = 0 if self.save_index else None
        df = pd.read_csv(path, index_col=index_col)
        #df = df.drop('Unnamed: 0', axis=1)

        print('{} feature read'.format(self.name))

        # then proceed with one hot
        if one_hot:
            for t in self.columns_to_onehot:
                col = df[t[0]]
                one_hot_prefix = t[2] if len(t) == 3 else t[0]
                if t[1] == 'single':
                    oh = pd.get_dummies(col, prefix=one_hot_prefix)
                elif t[1] == 'binary':
                    ce = BinaryEncoder(cols=t[0])
                    oh = ce.fit_transform(col)
                else:
                    mid = col.apply(lambda x: x.split('|')
                                    if isinstance(x, str) else x)
                    mid.fillna(value='', inplace=True)
                    mlb = MultiLabelBinarizer()
                    oh = mlb.fit_transform(mid)
                    oh = pd.DataFrame(oh, columns=mlb.classes_)
                    oh = oh.astype(np.uint8)
                    oh = oh.add_prefix(one_hot_prefix)

                df = df.drop([t[0]], axis=1)
                df = pd.concat([df, oh], axis=1)

            print('{} onehot completed'.format(self.name))

        df = self.post_loading(df)
        return df
Example #9
def CatEncoder(X, cat_cols, tags, estimator_name, objective_type, trial, n_classes, random_state):
    if tags["handles categorical"] == False:
        large_threshold = 6
        #TODO: handle numpy arrays with categorical?
        #TODO: handle multiclass / Regression
        if isinstance(X, pd.DataFrame) and isinstance(cat_cols[0], str):
            large_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() > large_threshold]
            small_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() <= large_threshold]
        elif isinstance(X, pd.DataFrame):
            large_cardinal_cats = [col for col in cat_cols if len(np.unique(X.iloc[:,col])) > large_threshold]
            small_cardinal_cats = [col for col in cat_cols if len(np.unique(X.iloc[:,col])) <= large_threshold]
        else:
            large_cardinal_cats = [col for col in cat_cols if len(np.unique(X[:,col])) > large_threshold]
            small_cardinal_cats = [col for col in cat_cols if len(np.unique(X[:,col])) <= large_threshold]

        enc_pipe = None
        cat_enc_types = ["target", "binary", "catboost"]

        if len(small_cardinal_cats) > 0:
            enc_pipe = add_to_pipe(enc_pipe, "ohe", OneHotEncoder(cols=small_cardinal_cats, drop_invariant=True))

        if len(large_cardinal_cats) > 0:
            if (objective_type == "classification" and n_classes == 1):
                cat_enc_types.append("woe")

            cat_enc_type = trial.suggest_categorical(estimator_name + " cat_enc_type", cat_enc_types)

            if cat_enc_type == "binary":
                # mapping = get_mapping(X, large_cardinal_cats)
                enc = BinaryEncoder(cols=large_cardinal_cats,
                                    # mapping=mapping
                                    )

            elif cat_enc_type == "woe":
                enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)

            elif cat_enc_type == "target":
                min_samples_leaf = 6  # TODO: calculate percentage or something else
                enc = TargetEncoder(min_samples_leaf=min_samples_leaf,
                                    cols=large_cardinal_cats)

            else: # catboost
                enc = CatBoostEncoder(cols=large_cardinal_cats,
                                      random_state=random_state)  # TODO: replace SEED
                # TODO: permute to the dataset beforehand

            enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)
    return enc_pipe
Example #10
def CatEncoder(X, cat_cols, tags, objective_type, trial, n_classes, random_state):
    if tags["handles categorical"] == False:
        large_threshold = 6
        #TODO: handle numpy arrays with categorical?
        large_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() > large_threshold]
        small_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() <= large_threshold]

        enc_pipe = None
        cat_enc_types = ["binary", "catboost", "woe", "target"]

        if len(small_cardinal_cats) > 0:
            enc_pipe = add_to_pipe(enc_pipe, "ohe", OneHotEncoder(cols=small_cardinal_cats, drop_invariant=True))

        if len(large_cardinal_cats) > 0:
            if (objective_type == "classification" and n_classes > 2): #multiclass
                cat_enc_types = ["binary"]

            cat_enc_type = trial.suggest_categorical("cat_enc_type", cat_enc_types)

            if cat_enc_type == "binary":
                # mapping = get_mapping(X, large_cardinal_cats)
                enc = BinaryEncoder(cols=large_cardinal_cats,
                                    drop_invariant=True,
                                    # mapping=mapping
                                    )

            elif cat_enc_type == "woe":
                enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)

            elif cat_enc_type == "target":
                min_samples_leaf = 10  # TODO: calculate percentage or something else
                enc = TargetEncoder(min_samples_leaf=min_samples_leaf,
                                    cols=large_cardinal_cats,
                                    drop_invariant=True)

            else: # catboost
                enc = CatBoostEncoder(cols=large_cardinal_cats,
                                      drop_invariant=True,
                                      random_state=random_state)  # TODO: replace SEED
                # TODO: permute to the dataset beforehand

            enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)
    return enc_pipe
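
Both CatEncoder variants lean on an add_to_pipe helper that is not shown in these snippets; here is a plausible minimal sketch, assuming it creates or extends a scikit-learn Pipeline of named steps:

from sklearn.pipeline import Pipeline

def add_to_pipe(pipe, name, estimator):
    # Hypothetical helper: start a Pipeline on first use, otherwise append a named step.
    if pipe is None:
        return Pipeline([(name, estimator)])
    pipe.steps.append((name, estimator))
    return pipe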
Example #11
#ordinal encoding
from sklearn.preprocessing import OrdinalEncoder
ord1 = OrdinalEncoder()
df["ord_2"] = ord1.fit_transform(df[["ord_2"]])
df.head(10)
dnew = df.copy()
#ordinal encoding through mapping
temp_dict = {'Cold': 1, 'Warm': 2, 'Hot': 3}
dnew['Ord_2_encod'] = dnew.ord_2.map(temp_dict)
dnew = dnew.drop(['ord_2'], axis=1)

#Binary encoding
from category_encoders import BinaryEncoder
encoder = BinaryEncoder(cols=['ord_2'])
newdata = encoder.fit_transform(df['ord_2'])
df = pd.concat([df, newdata], axis=1)
df = df.drop(['ord_2'], axis=1)
df.head(10)

#Hash encoding
from sklearn.feature_extraction import FeatureHasher
h = FeatureHasher(n_features=3, input_type='string')
hashed_Feature = h.fit_transform(df['nom_0'])
hashed_Feature = hashed_Feature.toarray()
df = pd.concat([df, pd.DataFrame(hashed_Feature)], axis=1)
df.head(10)

df.insert(6, "Target", [0, 1, 1, 0, 0, 1, 0, 0, 0, 1], True)
Example #12
def predict():
    '''
    For rendering results on HTML GUI
    '''
    features = [x for x in request.form.values()]
    #final_features = [np.array(int_features)]
    #prediction = model.predict(final_features)

    #output = round(prediction[0], 2)

    features = np.array(features)
    features = features.reshape(1, 6)
    features = pd.DataFrame(data=features,
                            columns=[
                                'Name', 'Genre', 'Comments', 'Likes',
                                'Popularity', 'Followers'
                            ])
    df = pd.read_csv('data.csv')
    cv = {'Comments': int, 'Likes': int, 'Popularity': int, 'Followers': int}
    df = df.astype(cv)
    features = features.astype(cv)
    #x=df[df['Views']==0].index

    df.drop(index=df[df['Views'] < df['Likes']].index, inplace=True)
    df.drop(index=df[df['Views'] < df['Comments']].index, inplace=True)
    df.drop(index=df[df['Views'] < df['Popularity']].index, inplace=True)

    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    # drop rows with extreme outliers (beyond 3 * IQR)
    df = df[~((df < (Q1 - 3 * IQR)) | (df > (Q3 + 3 * IQR))).any(axis=1)]

    df = df.drop(
        columns=['Unique_ID', 'Country', 'Song_Name', 'Timestamp', 'index'])

    y = df['Views']
    df = df.drop(columns=['Views'])

    be = BinaryEncoder()
    df = be.fit_transform(df)
    f = be.transform(features)

    X = df.iloc[:, :]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)

    rg1 = AdaBoostRegressor()
    rg1.fit(X_train, y_train)
    #ypred=rg1.predict(X_test)
    #sqrt(mean_squared_error(y_test,ypred))

    rg2 = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1)
    # para={'n_estimators':[250,300],'learning_rate':[1,0.1,0.01]}
    # grid=GridSearchCV(estimator=rg8,param_grid=para,verbose=1,cv=10,n_jobs=-1)
    rg2.fit(X_train, y_train)
    #ypred=rg2.predict(X_test)
    #sqrt(mean_squared_error(y_test,ypred))

    rg3 = RandomForestRegressor(random_state=0, n_estimators=20, max_depth=15)
    # para={'n_estimators':[5,10,30,20],'max_depth':[5,8,20,17]}
    # grid=GridSearchCV(estimator=rg9,param_grid=para,cv=10,verbose=1,n_jobs=-1)
    rg3.fit(X_train, y_train)
    #ypred=rg3.predict(X_test)
    #sqrt(mean_squared_error(y_test,ypred))

    rg6 = StackingRegressor([rg1, rg2], meta_regressor=rg3)
    rg6.fit(X_train, y_train)
    #ypred=rg6.predict(X_test)
    #sqrt(mean_squared_error(y_test,ypred))
    f = f.iloc[:, :]
    y_pred = rg6.predict(f)

    y_pred = y_pred.astype(int)

    return render_template(
        'index.html', prediction_text='Number of Views is {}'.format(y_pred))
Example #13
def main():
    import psutil

    # import matplotlib.pyplot as plt
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import RandomForestClassifier
    # from sklearn.preprocessing import StandardScaler
    # from sklearn.naive_bayes import GaussianNB
    # Imputer and sklearn.externals.joblib were removed from scikit-learn;
    # the modern equivalents are sklearn.impute.SimpleImputer and the joblib package
    from sklearn.impute import SimpleImputer
    import joblib
    # from sklearn import metrics
    from category_encoders import BinaryEncoder
    from datetime import datetime

    from sklearn.model_selection import TimeSeriesSplit
    import os
    import numpy as np

    import sys
    sys.path.append("../")
    import serial_preprocess_data as preprocess
    import utils

    cpu_count = int(psutil.cpu_count() / 4) - 2
    print("Trying to use {} number of cpu".format(cpu_count))
    data_dir = "../../data/"
    hdf_files = sorted([data_dir + file for file in os.listdir(data_dir)
                        if '.h5' in file])

    columns = ['Year',
               'Cancelled',
               'Distance',
               'Diverted',
               'ArrTime',
               'Dest',
               'FlightNum',
               # 'DepDelay',  ## not using DepDelay
               'ActualElapsedTime',
               'ArrDelay',
               'DayofMonth',
               'UniqueCarrier',
               'Month',
               'DepTime',
               'Origin',
               'DayOfWeek'
               ]
    scoring = 'roc_auc'
    no_of_files = 12

    df = preprocess.readFilesToDf("h5", file_list=hdf_files[:no_of_files],
                                  cols=columns)

    print("Size of file read in is {0:.2f} GB".format(
          utils.getFileSizeInGB(hdf_files[:no_of_files])))
    print("Reading in {0} selected columns only".format(len(columns)))
    print("Columns are:", columns)
    print("Memory usage of the data frame is {0:.2f} GB".format(
          np.sum(df.memory_usage()) / 1e9))

    # preprocess data check the percentage of nans
    _ = preprocess.find_cardinality_of_categorical_variables(df)

    ix = preprocess.clean_data_minimally(df)
    # apply cleaning of the data
    df = df.iloc[ix].reset_index(drop=True)
    df = df.sort_values(by=['DayofMonth', 'Month', 'Year', 'DepTime'])

    feature_cols = list(df.columns)
    feature_cols.remove('ArrDelay')
    feature_cols.remove('Cancelled')

    df['delayCat'] = df.ArrDelay.apply(
        preprocess.convert_delay_into_multiple_categories)
    df['delayBinaryCat'] = df.ArrDelay.apply(
        preprocess.convert_delay_into_two_categories)
    X = df[feature_cols]
    y = df['delayBinaryCat']

    encoder = BinaryEncoder()
    encoder.fit(X)
    transformed_X = encoder.transform(X)

    print("Transformed columns are ", transformed_X.columns)

    df_gpby = df.groupby('delayCat')
    delay_percentage_breakdown = df_gpby.ArrDelay.count() / df.shape[0] * 100
    delay_percentage_breakdown.index = ['very early',
                                        'early',
                                        'on time',
                                        'late',
                                        'very late'
                                        ]
    print("Percentage breakdown of different categories " +
          "of the target variable is: \n",
          delay_percentage_breakdown)

    # the breakdown of delay is pretty balanced.
    # Although a careful study will also look at the correlation with other
    # other features

    tscv = TimeSeriesSplit()
    # cv_ixes = [(train_ix, test_ix)
    #            for train_ix, test_ix in tscv.split(transformed_X)]

    # only put grid search steps into pipeline
    rf_pipeline_steps = [
        # impute missing feature values with median values
        ("imputer", Imputer(strategy="median")),
        ('rf', RandomForestClassifier(n_jobs=cpu_count, oob_score=True)),
    ]

    gridsearch_parameters = dict([
        ("rf__n_estimators", [800]),
        ("rf__max_features", [None]),  # not many featuers to subset from
    ])

    rf_pipeline = Pipeline(rf_pipeline_steps)

    est = GridSearchCV(rf_pipeline,
                       param_grid=gridsearch_parameters,
                       n_jobs=1,
                       scoring=scoring,
                       cv=tscv.split(X),  # time-series-aware cross-validation splits
                       )
    print("Fitting the values")
    print("Columns in the training data are ", X.columns)
    est.fit(transformed_X.values, y.values)
    print("Saving the model")
    print("Best score" + scoring + "is", est.best_score_)
    print("Best parameters are ", est.best_params_)

    datetime_stamp = datetime.now().strftime(
        "%D_%X").replace("/", "_").replace(":", "_")
    joblib.dump(est.best_estimator_,
                "./RF_CV_pipeline_" + datetime_stamp + ".pkl")
Example #14
class BinaryEncoder():
    """Maps each categorical value to several columns using binary encoding.

    Parameters:
        cols: [str]
            list of column names to encode.
    """
    name = 'binary'

    def __init__(self, cols=None):
        self.encoder = Binary(cols=cols)

    def fit(self, X, features, y=None):
        """Fits encoder to data table.
        returns self.
        """
        self.encoder.fit(X, y)
        self.features = self.encode_features_list(X, features)
        return self

    def transform(self, X):
        """Encodes matrix and updates features accordingly.
        returns encoded matrix (dataframe).
        """
        X_new = self.encoder.transform(X)
        feature_names = []
        for feature in self.features:
            for fname in feature.get_feature_names():
                feature_names.append(fname)
        X_new.columns = feature_names

        return X_new

    def fit_transform(self, X, features, y=None):
        """First fits, then transforms matrix.
        returns encoded matrix (dataframe).
        """
        return self.fit(X, features, y).transform(X)

    def get_mapping(self, category):
        """Gets the mapping for the binary encoder and underlying ordinal encoder.
        returns tuple (binary_encoder_mapping, ordinal_encoder_mapping).
        """
        def mapping_helper(method, category):
            if isinstance(category, str):
                for map in method.mapping:
                    if map['col'] == category:
                        return map['mapping']
            return method.mapping[category]['mapping']

        return mapping_helper(self.encoder.base_n_encoder, category), \
            mapping_helper(self.encoder.base_n_encoder.ordinal_encoder, category)

    def encode_features_list(self, X, features):
        feature_list = []
        index = 0
        for f in features:
            if f.get_name() in self.encoder.base_n_encoder.cols:
                f = ft.Feature([f], primitive=BinaryEnc(self, index))
                index += 1
            feature_list.append(f)
        return feature_list

    def get_features(self):
        return self.features

    def get_name(self):
        return self.name
Example #16
def nominal(name: str):
    return (name,
            Pipeline([(name + '.select', DataFrameSelector([name])),
                      (name + '.encode', BinaryEncoder())]))
Example #17
from typing import Callable

import numpy as np
import pandas as pd
from tqdm import tqdm
#import dask.dataframe as dd
import pickle
from argparse import ArgumentParser

from category_encoders import BinaryEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

DataFrame = pd.DataFrame
Series = pd.Series
Array = np.ndarray
Imputer = Callable[[DataFrame], DataFrame]
nan = np.nan

df = pd.read_csv('data.csv').drop(['name'], axis=1)
X_ = df.drop('status_group', axis=1)
y = df.status_group

be = BinaryEncoder()
FEATS = 70

pca = PCA(n_components=FEATS)

vals = pca.fit_transform(StandardScaler().fit_transform(be.fit_transform(X_)))

X = pd.DataFrame(vals,
                 columns=[f"pc{k+1}" for k in range(FEATS)],
                 index=y.index).assign(y=y)


def mcar_goblin(dat: DataFrame, ratio: float) -> DataFrame:
    ''' Simulate MCAR with bernoulli '''
    def ident_or_nan(x: float) -> float:
        ''' if heads, replace value with nan. if tails, identity '''
        # assumed completion of the truncated body: Bernoulli(ratio) coin flip per cell
        return nan if np.random.random() < ratio else x

    return dat.applymap(ident_or_nan)
Example #20
from category_encoders import MEstimateEncoder

# Create the encoder instance. Choose m to control noise.
encoder = MEstimateEncoder(cols=["Zipcode"], m=5.0)

# Fit the encoder on the encoding split.
encoder.fit(X_encode, y_encode)

# Encode the Zipcode column to create the final training data
X_train = encoder.transform(X_pretrain)
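
For intuition, m blends each category's target mean with the global prior; the sketch below reproduces the m-estimate by hand for comparison (same X_encode/y_encode as above; this mirrors the encoder's smoothing formula, not its exact internals):

m = 5.0
prior = y_encode.mean()

# Per-category sum and count of the target, shrunk toward the prior by m pseudo-observations.
stats = y_encode.groupby(X_encode["Zipcode"]).agg(["sum", "count"])
smoothed = (stats["sum"] + m * prior) / (stats["count"] + m)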


## Preprocessing
mode_binary = Pipeline([
    ('encoder', SimpleImputer(strategy = 'most_frequent')),
    ('binary', BinaryEncoder())])

transformer = ColumnTransformer([
    ('one hot', OneHotEncoder(handle_unknown = 'ignore'), [ 'hotel', 'meal', 'market_segment', 'distribution_channel', 'reserved_room_type', 'deposit_type', 'customer_type']),
    ('mode binary', mode_binary, ['country']),
    ('impute mode', SimpleImputer(strategy = 'most_frequent'), ['children'])], remainder = 'passthrough')

#https://scikit-learn.org/stable/modules/compose.html#columntransformer-for-heterogeneous-data

X = df.drop('is_canceled', axis = 1)
y = df['is_canceled']

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = 0.2, random_state = 1515)

# Feature Scaling after split
Example #21
classifier = LogisticRegression(multi_class = "multinomial")

mapper = DataFrameMapper([
	(cat_cols, [OrdinalEncoder(), OneHotEncoder()]),
	(cont_cols, None)
])

build_audit(mapper, classifier, "OrdinalEncoderAudit")

mapper = DataFrameMapper([
	(cat_cols, BaseNEncoder(base = 2, drop_invariant = True)),
	(cont_cols, None)
])

build_audit(mapper, classifier, "Base2EncoderAudit")

mapper = DataFrameMapper([
	(cat_cols, BaseNEncoder(base = 3, drop_invariant = True)),
	(cont_cols, None)
])

build_audit(mapper, classifier, "Base3EncoderAudit")

classifier = RandomForestClassifier(n_estimators = 31, random_state = 13)

mapper = DataFrameMapper([
	(cat_cols, BinaryEncoder()),
	(cont_cols, None)
])

build_audit(mapper, classifier, "BinaryEncoderAudit", compact = False)
Example #22
build_audit(mapper, classifier, "OrdinalEncoderAudit")

mapper = DataFrameMapper([(cat_cols, BaseNEncoder(base=2,
                                                  drop_invariant=True)),
                          (cont_cols, None)])

build_audit(mapper, classifier, "Base2EncoderAudit")

mapper = DataFrameMapper([
    (cat_cols, [BaseNEncoder(base=3, drop_invariant=True),
                OneHotEncoder()]), (cont_cols, None)
])

build_audit(mapper, classifier, "Base3EncoderAudit")

classifier = XGBClassifier(objective="binary:logistic",
                           n_estimators=31,
                           max_depth=7,
                           random_state=13)

mapper = DataFrameMapper([(cat_cols, BaseNEncoder(base=4,
                                                  drop_invariant=True)),
                          (cont_cols, None)])

build_audit(mapper, classifier, "Base4EncoderAudit", compact=False)

classifier = RandomForestClassifier(n_estimators=31, random_state=13)

mapper = DataFrameMapper([(cat_cols, BinaryEncoder()), (cont_cols, None)])

build_audit(mapper, classifier, "BinaryEncoderAudit", compact=False)
Example #23
def binaryEncoding(df, column):
    from category_encoders import BinaryEncoder
    encoder = BinaryEncoder(cols=[column])
    df = encoder.fit_transform(df)
    return df
Example #24
File: manip.py  Project: krashr-ds/DS
def doCleanupEncode(X,
                    y=None,
                    cat=None,
                    oh=None,
                    binary=None,
                    loo=None,
                    woe=None,
                    lp_cols=None,
                    NoData=True):
    from enrich import replaceCVs
    from enrich import one_hot_encode
    from category_encoders import BinaryEncoder
    from category_encoders import OneHotEncoder
    from category_encoders import WOEEncoder
    from category_encoders import LeaveOneOutEncoder

    if NoData is False:
        if cat is not None or oh is not None:
            # translate associated columns' null, NaN, blank and 9 values to zero
            X = replaceCVs(X, cat + oh, [np.nan, 9, "", " "], 0)

    if oh is not None:
        if NoData:
            ec = OneHotEncoder(cols=oh,
                               use_cat_names=True,
                               return_df=True,
                               handle_unknown='indicator',
                               handle_missing='indicator')
            X = ec.fit_transform(X)
            # dropping these columns did not help performance
            # for o in oh:
            #    stem = o.split("_")[1]
            #    d1 = "L_" + stem + "_-1"
            #    d2 = "L_" + stem + "_nan"
            #    print("DROPPING ", d1, " ", d2, "\n")
            #    X.drop(d1, axis=1, errors='ignore', inplace=True)
            #    X.drop(d2, axis=1, errors='ignore', inplace=True)
        else:
            # one-hot encode, then drop 0 if created
            for oh_c in oh:
                X = one_hot_encode(X, oh_c, False)
                X.drop(0, axis=1, errors='ignore', inplace=True)

    if binary is not None:
        # binary encode binary columns
        if NoData:
            enc = BinaryEncoder(cols=binary,
                                drop_invariant=True,
                                return_df=True,
                                handle_unknown='indicator').fit(X)
            X = enc.transform(X)
        else:
            enc = BinaryEncoder(cols=binary,
                                drop_invariant=True,
                                return_df=True).fit(X)
            X = enc.transform(X)

    if woe is not None:
        # use weight of evidence on woe columns
        for w in woe:
            X[w] = X[w].fillna('NoData')

        wenc = WOEEncoder(cols=woe).fit(X, y)
        X = wenc.transform(X).round(2)

    if loo is not None:
        # use leave one out on loo columns
        for l in loo:
            X[l] = X[l].fillna('NoData')

        lenc = LeaveOneOutEncoder(cols=loo, return_df=True).fit(X, y)
        X = lenc.transform(X).round(2)

    # Cast all to int64
    # X = X.astype("int64")

    if lp_cols is not None:
        # drop least predictive
        X.drop(lp_cols, axis=1, errors="ignore", inplace=True)

    X.reset_index(drop=True, inplace=True)
    return X
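
A hypothetical call, to make the argument contract concrete (all column names invented; X and y are the feature frame and target used elsewhere):

X_clean = doCleanupEncode(X, y=y,
                          oh=["ethnicity"],       # one-hot with indicator handling
                          binary=["county"],      # binary encoded, invariants dropped
                          woe=["provider"],       # weight-of-evidence against y
                          loo=["plan"],           # leave-one-out against y
                          lp_cols=["noise_col"],  # least-predictive columns to drop
                          NoData=True)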
Example #25
def train_pipeline(X, y):
    """
    Builds and trains a machine learning pipeline
    """

    numerical_col = [
        'Num nights', 'Adults', 'Children', 'Session duration', 'Sessions',
        'Avg. session length (sec)', 'Avg. pageviews per session', 'Pageviews',
        'Hits', 'Created to arrival'
    ]
    categorical_col = [
        'Language', 'Website', 'Enquiry type', 'Enquiry status',
        'Client budget', 'Country code', 'GA source', 'GA medium', 'Device',
        'Created month'
    ]

    binary_col = [
        'Flights booked', 'User agent', 'User repeat', 'User referral'
    ]
    text_col = ['Click path', 'GA keyword']
    target = ['is booking']

    # Numerical pipeline

    numerical_pipeline = make_pipeline(ColumnSelector(cols=numerical_col),
                                       SimpleImputer(strategy="median"),
                                       StandardScaler())

    # Categorical pipeline

    categorical_pipeline = make_pipeline(
        ColumnSelector(cols=categorical_col),
        SimpleImputer(strategy="constant", fill_value='None'), OneHotEncoder())

    # Binary pipeline

    binary_pipeline = make_pipeline(ColumnSelector(cols=binary_col),
                                    SimpleImputer(strategy="most_frequent"),
                                    BinaryEncoder())

    # Text pipelines

    text_pipeline_1 = make_pipeline(
        ColumnSelector(cols=['Click path']),
        SimpleImputer(strategy='constant', fill_value=''),
        ReshapeTransformer(), HashingVectorizer(n_features=2**11),
        DenseTransformer())

    text_pipeline_2 = make_pipeline(
        ColumnSelector(cols=['GA keyword']),
        SimpleImputer(strategy='constant', fill_value=''),
        ReshapeTransformer(), TfidfVectorizer(), DenseTransformer())

    # Pipeline union

    processing_pipeline = make_union(numerical_pipeline, categorical_pipeline,
                                     binary_pipeline, text_pipeline_1,
                                     text_pipeline_2)

    estimator = BalancedRandomForestClassifier(bootstrap=False,
                                               class_weight=None,
                                               criterion='gini',
                                               max_depth=60,
                                               max_features='sqrt',
                                               max_leaf_nodes=None,
                                               min_impurity_decrease=0.0,
                                               min_samples_leaf=1,
                                               min_samples_split=5,
                                               min_weight_fraction_leaf=0.0,
                                               n_estimators=472,
                                               n_jobs=1,
                                               oob_score=False,
                                               random_state=None,
                                               replacement=False,
                                               sampling_strategy='auto',
                                               verbose=0,
                                               warm_start=False)

    predictive_pipeline = make_pipeline(processing_pipeline, estimator)

    predictive_pipeline.fit(X, y)

    return predictive_pipeline
Example #26
    # opening of this truncated snippet reconstructed to mirror the test.csv read below
    train = pd.read_csv(os.path.join(config["input_path"], "train.csv"),
                        na_values=-1,
                        nrows=500)
    test = pd.read_csv(os.path.join(config["input_path"], "test.csv"),
                       na_values=-1,
                       nrows=500)

    train_feature, train_label = train.iloc[:,
                                            2:].copy(), train.iloc[:,
                                                                   1].copy()
    test_feature = test.iloc[:, 1:].copy()
    del train, test

    train_feature = train_feature[[
        col for col in train_feature.columns if not col.startswith("ps_calc_")
    ]]
    test_feature = test_feature[train_feature.columns]

    ncs = [
        col for col in train_feature.columns
        if not col.endswith(("_bin", "_cat"))
    ]
    ccs = [
        col for col in train_feature.columns if col.endswith(("_bin", "_cat"))
    ]

    eet = EntityEmbeddingTree(numeric_columns=ncs, categorical_columns=ccs)
    eet.fit(X=train_feature, y=train_label)

    encoder = BinaryEncoder()
    print(encoder.fit_transform(eet.transform(X=train_feature)).shape)
    print(encoder.transform(eet.transform(X=test_feature)).shape)