    def test_hashing(self):
        """
        Creates a dataset and encodes it with the hashing trick.

        :return:
        """

        cols = ['C1', 'D', 'E', 'F']
        X = self.create_dataset(n_rows=1000)
        X_t = self.create_dataset(n_rows=100)

        enc = encoders.HashingEncoder(verbose=1, n_components=128, cols=cols)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

        enc = encoders.HashingEncoder(verbose=1, n_components=32)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

        enc = encoders.HashingEncoder(verbose=1,
                                      n_components=32,
                                      drop_invariant=True)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))

        enc = encoders.HashingEncoder(verbose=1,
                                      n_components=32,
                                      return_df=False)
        enc.fit(X, None)
        self.assertTrue(isinstance(enc.transform(X_t), np.ndarray))
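For reference, here is a minimal standalone sketch of the behaviour these tests exercise, assuming only pandas and category_encoders are installed (the toy frame and column name are illustrative):

import pandas as pd
import category_encoders as ce

# One categorical column hashed into 8 output columns (col_0 ... col_7),
# which is HashingEncoder's default n_components.
toy = pd.DataFrame({'city': ['london', 'paris', 'london', 'tokyo']})
enc = ce.HashingEncoder(cols=['city'], n_components=8)
out = enc.fit_transform(toy)
print(out.columns.tolist())  # ['col_0', 'col_1', ..., 'col_7']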
Example #2
def hashing_encoding(df, cols, handle_nan=True):
    # HashingEncoder hashes NaN like any other value, so when handle_nan is
    # set, replace missing values with an explicit placeholder first.
    # (The 'MISSING' token is an editorial assumption; the original
    # if/else branches were identical.)
    if handle_nan:
        df = df.copy()
        df[cols] = df[cols].fillna('MISSING')
    encoder = ce.HashingEncoder(cols=cols)
    df_new = encoder.fit_transform(df)
    return df_new
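A short usage sketch for the helper above; the toy frame is an assumption, and the NaN placeholder comes from the hedged fix in the body:

import numpy as np
import pandas as pd

df = pd.DataFrame({'color': ['red', np.nan, 'blue', 'red']})
encoded = hashing_encoding(df, cols=['color'])
print(encoded.shape)  # (4, 8): the column is hashed into 8 components by default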
Example #3
    def set_pipeline(self):
        dist = self.kwargs.get("distance_type", "haversine")
        feateng_steps = self.kwargs.get("feateng",
                                        ["distance", "time_features"])

        # Define feature engineering pipeline blocks here
        pipe_time_features = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'))
        pipe_distance = make_pipeline(DistanceTransformer(distance_type=dist),
                                      StandardScaler())
        pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
        # Add new feature engineering pipes above
        #pipe_direction =
        #pipe_distance_to_center =

        # Define default feature engineering blocks
        feateng_blocks = [
            ('distance', pipe_distance, list(DIST_ARGS.values())),
            ('time_features', pipe_time_features, ['pickup_datetime']),
            ('geohash', pipe_geohash, list(DIST_ARGS.values())),
            #('direction', pipe_direction, list(DIST_ARGS.values())),
            #('distance_to_center', pipe_distance_to_center, list(DIST_ARGS.values())),
        ]
        # Filter out blocks according to input parameters; build a new list
        # instead of removing items while iterating (which skips elements)
        feateng_blocks = [bloc for bloc in feateng_blocks
                          if bloc[0] in feateng_steps]

        features_encoder = ColumnTransformer(feateng_blocks,
                                             n_jobs=None,
                                             remainder="drop")

        self.pipeline = Pipeline(steps=[('features', features_encoder),
                                        ('rgs', self.get_estimator())], )
Example #4
    def encode_variables(self, one_hot_list=[], boolean_list=[], many_list=[],
                         standard_scaler=[], dim_reduc=False):

        ## Boolean features: ordinal-encode
        if boolean_list:
            input_cols = list(self.X.columns)
            bool_enc = ce.OrdinalEncoder(cols=boolean_list, drop_invariant=True).fit(self.X)
            self.X = bool_enc.transform(self.X)
            self.models.append({'model_type': 'boolean', 'model_file': bool_enc,
                                'input_cols': input_cols, 'model_variables': boolean_list})

        ## One-hot features
        if one_hot_list:
            input_cols = list(self.X.columns)
            onehot_enc = ce.OneHotEncoder(cols=one_hot_list, drop_invariant=True,
                                          use_cat_names=True).fit(self.X)
            self.X = onehot_enc.transform(self.X)
            self.models.append({'model_type': 'one_hot', 'model_file': onehot_enc,
                                'input_cols': input_cols, 'model_variables': one_hot_list})

        ## High-cardinality features: hashing trick
        if many_list:
            input_cols = list(self.X.columns)
            hash_enc = ce.HashingEncoder(cols=many_list, n_components=15,
                                         drop_invariant=True).fit(self.X)
            self.X = hash_enc.transform(self.X)
            self.models.append({'model_type': 'hash_trick', 'model_file': hash_enc,
                                'input_cols': input_cols, 'model_variables': many_list})

        ## Scaling
        if standard_scaler:
            input_cols = list(self.X.columns)
            df_standard = self.X[standard_scaler]
            scaler = StandardScaler()
            self.X[standard_scaler] = scaler.fit_transform(df_standard)
            self.models.append({'model_type': 'standard_scaler', 'model_file': scaler,
                                'model_variables': standard_scaler})

        ## SVD dimensionality reduction
        if dim_reduc == 'SVD':
            self.dimensional_reduction(method='SVD', n_components=10)
Example #5
def hash_encoder(df, cols, no_new_cols_per):
    print("<hash> df rows: %ld" % df.shape[0])
    for col in cols:
        print("hashing col %s" % col)
        ce_hash = ce.HashingEncoder(cols=[col], n_components=no_new_cols_per)
        X = df[col]
        new_cols_df = ce_hash.fit_transform(X)
        print("new cols df rows: %ld" % new_cols_df.shape[0])
        df = df.drop(col, axis=1)
        for i in range(0, no_new_cols_per):
            placeholder_name = "col_%d" % i
            new_col_name = "%s_%d" % (col, i)
            #print("new_cols_df before rename:")
            #print(new_cols_df.head(n=1))
            new_cols_df = new_cols_df.rename(
                columns={placeholder_name: new_col_name})
            #print("new_cols_df after rename:")
            #print(new_cols_df.head(n=1))

        # append the new columns to the dataframe
        print("BEFORE concatting for col %s" % col)
        print("<hash> df rows: %ld" % df.shape[0])
        print("<hash> new cols rows: %ld" % new_cols_df.shape[0])
        df.reset_index(drop=True, inplace=True)
        new_cols_df.reset_index(drop=True, inplace=True)
        df = pd.concat([df, new_cols_df], axis=1)
        print("concatting for col %s" % col)
        print("<hash> df rows: %ld" % df.shape[0])

    return df
Example #6
def prep_fight_card(df_fight_card, fighter_data_en):
    # Encode weight class with the hashing trick (8 components by default)
    enc = ce.HashingEncoder(cols=['weight_class'])
    fight_card_final = enc.fit_transform(df_fight_card)
    fight_card_final.rename(
        columns={f'col_{i}': f'weight_class_col_{i}' for i in range(8)},
        inplace=True)

    stat_cols = ['height', 'weight', 'reach', 'wins', 'losses', 'draws',
                 'SLpM', 'Str_Acc', 'SApM', 'Str_Dep', 'TD_Avg', 'TD_Acc',
                 'TD_Def', 'Sub_Avg']

    # Get data for fighter 1
    fight_card_final = fight_card_final.merge(
        fighter_data_en, left_on=['fighter1'], right_on=['full_name'],
        how='left').drop(columns=['full_name', 'fighter1'])
    fight_card_final.rename(
        columns={**{f'col_{i}': f'stance_col_{i}_fighter1' for i in range(8)},
                 **{c: f'{c}_fighter1' for c in stat_cols}},
        inplace=True)

    # Get data for fighter 2
    fight_card_final = fight_card_final.merge(
        fighter_data_en, left_on=['fighter2'], right_on=['full_name'],
        how='left').drop(columns=['full_name', 'fighter2'])
    fight_card_final.rename(
        columns={**{f'col_{i}': f'stance_col_{i}_fighter2' for i in range(8)},
                 **{c: f'{c}_fighter2' for c in stat_cols}},
        inplace=True)

    return fight_card_final
Example #7
    def set_pipeline(self):
        memory = self.kwargs.get("pipeline_memory", None)
        dist = self.kwargs.get("distance_type", "euclidian")
        feateng_steps = self.kwargs.get("feateng", ["distance", "time_features", 'direction', 'distance_to_center'])
        if memory:
            memory = mkdtemp()

        # Define feature engineering pipeline blocks here
        pipe_time_features = make_pipeline(TimeFeaturesEncoder(time_column='pickup_datetime'),
                                           OneHotEncoder(handle_unknown='ignore'))
        pipe_distance = make_pipeline(DistanceTransformer(distance_type=dist, **DIST_ARGS), RobustScaler())
        pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
        pipe_direction = make_pipeline(Direction(), RobustScaler())
        pipe_distance_to_center = make_pipeline(DistanceToCenter(), RobustScaler())

        # Define default feature engineering blocks
        feateng_blocks = [
            ('distance', pipe_distance, list(DIST_ARGS.values())),
            ('time_features', pipe_time_features, ['pickup_datetime']),
            ('geohash', pipe_geohash, list(DIST_ARGS.values())),
            ('direction', pipe_direction, list(DIST_ARGS.values())),
            ('distance_to_center', pipe_distance_to_center, list(DIST_ARGS.values())),
        ]
        # Filter out blocks according to input parameters; build a new list
        # instead of removing items while iterating (which skips elements)
        feateng_blocks = [bloc for bloc in feateng_blocks
                          if bloc[0] in feateng_steps]

        features_encoder = ColumnTransformer(feateng_blocks, n_jobs=None, remainder="drop")

        self.pipeline = Pipeline(steps=[
            ('features', features_encoder),
            ('rgs', self.get_estimator())], memory=memory)
Example #8
    def _fit_hash(self, df, target):
        hash_encoder = ce.HashingEncoder()
        hash_encoder.fit(df[target].map(to_str))
        name = [
            'continuous_' + remove_continuous_discrete_prefix(x) + '_hash'
            for x in hash_encoder.get_feature_names()
        ]
        self.trans_ls.append(('hash', name, target, hash_encoder))
Example #9
def hashing():
    X, _, _ = get_mushroom_data()
    print(X.info())
    enc = ce.HashingEncoder()
    enc.fit(X, None)
    out = enc.transform(X)
    print(out.info())
    del enc, _, X, out
Example #10
def hash_encode1(df2):
    df = df2.copy()
    categorical_features = df.select_dtypes(
        include=['category']).columns.values
    hashing_encoder = ce.HashingEncoder(n_components=len(categorical_features),
                                        cols=categorical_features.tolist())
    # use .values so the hashed columns (col_0, col_1, ...) are assigned
    # positionally rather than aligned by column name
    df[categorical_features] = hashing_encoder.fit_transform(
        df[categorical_features]).values
    return df
Example #11
    def create_features(self, df_train, df_test):
        encoder = ce.HashingEncoder(cols=self.columns)
        encoder.fit(df_train[self.columns],
                    df_train[self.target_column].values.tolist())
        encoded_train = encoder.transform(df_train[self.columns])
        encoded_test = encoder.transform(df_test[self.columns])
        for column in encoded_train.columns:
            self.train[column + '_HashingEncoder'] = encoded_train[column]
            self.test[column + '_HashingEncoder'] = encoded_test[column]
Example #12
def pd_colcat_encoder_generic(df, col, pars):
    """
       https://pypi.org/project/category-encoders/
       encoder = ce.BackwardDifferenceEncoder(cols=[...])
encoder = ce.BaseNEncoder(cols=[...])
encoder = ce.BinaryEncoder(cols=[...])
encoder = ce.CatBoostEncoder(cols=[...])
encoder = ce.CountEncoder(cols=[...])
encoder = ce.GLMMEncoder(cols=[...])
encoder = ce.HashingEncoder(cols=[...])
encoder = ce.HelmertEncoder(cols=[...])
encoder = ce.JamesSteinEncoder(cols=[...])
encoder = ce.LeaveOneOutEncoder(cols=[...])
encoder = ce.MEstimateEncoder(cols=[...])
encoder = ce.OneHotEncoder(cols=[...])
encoder = ce.OrdinalEncoder(cols=[...])
encoder = ce.SumEncoder(cols=[...])
encoder = ce.PolynomialEncoder(cols=[...])
encoder = ce.TargetEncoder(cols=[...])
encoder = ce.WOEEncoder(cols=[...])


    """
    colcat = col
    import category_encoders as ce
    pars_encoder = pars
    pars_encoder['cols'] = col
    if 'path_pipeline_export' in pars:
        try:
            pars_encoder = load(pars['path_pipeline_export'] +
                                '/colcat_encoder_pars.pkl')
        except Exception:
            pass

    encoder = ce.HashingEncoder(**pars_encoder)
    dfcat_bin = encoder.fit_transform(df[col])

    colcat_encoder = list(dfcat_bin.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_bin, 'dfcat_encoder', pars['path_features_store'])
        save(encoder,
             pars['path_pipeline_export'] + "/colcat_encoder_model.pkl")
        save(pars_encoder,
             pars['path_pipeline_export'] + "/colcat_encoder_pars.pkl")
        save(colcat_encoder,
             pars['path_pipeline_export'] + "/colcat_encoder.pkl")

    col_pars = {}
    col_pars['col_encode_model'] = encoder
    col_pars['cols_new'] = {
        'colcat_encoder': colcat_encoder  ### list
    }
    return dfcat_bin, col_pars
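A hedged usage sketch for pd_colcat_encoder_generic, assuming pars carries only HashingEncoder keyword arguments so that none of the save/load paths are exercised:

import pandas as pd

df = pd.DataFrame({'cat1': list('abcab'), 'cat2': list('xyzxy')})
pars = {'n_components': 8}  # forwarded to ce.HashingEncoder
dfcat_bin, col_pars = pd_colcat_encoder_generic(df, ['cat1', 'cat2'], pars)
print(col_pars['cols_new']['colcat_encoder'])  # ['col_0', ..., 'col_7']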
Example #13
def encode_categorical(df, encoder_name='binary'):
    encoder_dic = {"one_hot": ce.OneHotEncoder(),
                   "feature_hashing": ce.HashingEncoder(n_components=32),
                   "binary": ce.BinaryEncoder(),
                   "ordinal": ce.OrdinalEncoder(),
                   "polynomial": ce.PolynomialEncoder()}
    # look up the encoder; unknown names raise a KeyError immediately
    encoder = encoder_dic[encoder_name]
    # verbosity is a constructor argument for these encoders, not a fit() one
    encoder.fit(df)
    df = encoder.transform(df)
    return df
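A brief usage sketch of the dispatcher above (the toy frame is an assumption); with 'feature_hashing', every categorical column is hashed into the same 32-component space:

import pandas as pd
import category_encoders as ce

df = pd.DataFrame({'color': ['red', 'green', 'red'],
                   'size': ['S', 'M', 'L']})
encoded = encode_categorical(df, encoder_name='feature_hashing')
print(encoded.shape)  # (3, 32): both columns hashed into 32 components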
Example #14
def hash_encode(df,
                col_names=[],
                label_col='HasDetections',
                accumulate=True,
                return_df=True,
                dynamic_n_component=False,
                drop_invariant=True,
                debug=True,
                save_intermediate_results=False):
    col_names = list(col_names)
    if len(col_names) == 0:
        col_names = list(df.columns)

    if label_col in col_names:
        col_names.remove(label_col)

    print('hash encoding: {}'.format(col_names), flush=True)
    _frames = []
    _cols_arr = []
    for c in col_names:
        if debug:
            print('processing: {}'.format(c), flush=True)

        _model = df[[c]]

        _val_ct = df[c].value_counts().count()

        _n_comp = 8 * int(
            np.log(_val_ct)) if dynamic_n_component and _val_ct > 10 else 8

        hash_enc = ce.HashingEncoder(cols=[c],
                                     return_df=return_df,
                                     n_components=_n_comp,
                                     drop_invariant=drop_invariant)
        print('hash_enc: {}'.format(hash_enc), flush=True)
        hash_enc.fit(_model, df[label_col])

        _df = hash_enc.transform(_model)

        if return_df:
            _df.columns = '{}_'.format(c) + _df.columns
        else:
            _cols_arr.append(c)

        if accumulate:
            _frames.append(_df)

        if save_intermediate_results:
            _dict = {'col': c, 'hash_enc': hash_enc, 'data': _df}
            _fl_name = 'input/{}_df_train.pkl'.format(
                c) if return_df else 'input/{}_np_train.pkl'.format(c)
            joblib.dump(_dict, _fl_name)

    return pd.concat(_frames, axis=1) if return_df else (_frames, _cols_arr)
Example #15
def prepare_data(data, path, name, max_sample):

    from sklearn import preprocessing
    import category_encoders as ce
    import pandas as pd
    print("start of prepare_data")

    cat_final = ['professionid', 'birthplace', 'residencezipcode', \
    'companyzipcode', 'legalzipcode', 'education', 'maritalstatus']
    quant = ['numofdependence', \
         'monthlyfixedincome', 'monthlyvariableincome', 'spouseincome', \
         'avg_income', 'std_income', 'avg_income_cnt', 'avg_income_nation', 'std_income_nation',
       'avg_income_nation_cnt', 'avg_income_area', 'std_icnome_area',
       'avg_income_area_cnt', 'avg_sale_house_price_5000',
       'std_sale_house_price_5000', 'sale_house_cnt_5000',
       'avg_sale_apartment_price_5000', 'std_sale_apartment_price_5000',
       'sale_apartment_cnt_5000', 'avg_rent_house_price_5000',
       'std_rent_house_price_5000', 'rent_house_cnt_5000',
       'avg_rent_apartment_price_5000', 'std_rent_apartment_price_5000',
       'rent_apartment_cnt_5000', 'avg_sale_house_price_10000',
       'std_sale_house_price_10000', 'sale_house_cnt_10000',
       'avg_sale_apartment_price_10000', 'std_sale_apartment_price_10000',
       'sale_apartment_cnt_10000', 'avg_rent_house_price_10000',
       'std_rent_house_price_10000', 'rent_house_cnt_10000',
       'avg_rent_apartment_price_10000', 'std_rent_apartment_price_10000',
       'rent_apartment_cnt_10000', 'previous_converted']
    # resample_df = pd.read_csv(path+name)
    resample_df = data
    final_df = resample_df[cat_final]
    final_df['index'] = resample_df.index
    # label encoder on ordinal features
    le = preprocessing.LabelEncoder()
    final_df['professionid'] = le.fit_transform(resample_df.professionid)
    final_df['education'] = le.fit_transform(resample_df.education)

    non_ordinal = set(cat_final) - {'professionid', 'education'}
    # Hashing Encoding for large scale categorical data
    HE = ce.HashingEncoder(cols=non_ordinal,
                           return_df=True,
                           max_sample=max_sample)
    # encode the categorical variables
    data = HE.fit_transform(final_df)

    # categorical features: fill NaNs with the column mode
    # (x.mode() returns a Series, so take its first value)
    data = data.apply(lambda x: x.fillna(x.mode().iloc[0]), axis=0)
    # quantitative features: fill NaNs with the column mean
    quant_data = resample_df[quant].apply(lambda x: x.fillna(x.mean()), axis=0)

    X = quant_data.reset_index().merge(data, on='index').drop('index', axis=1)
    y = resample_df['defaulted']

    print("Shape of X, y: ", X.shape, y.shape)
    return X, y
Example #16
def hash_encoding(trainData, predictionData):

    oEncoder = ce.HashingEncoder(cols=[
        'housing_situation', 'satisfaction', 'gender', 'hair_color', 'country',
        'profession', 'degree'
    ])

    oEncoder.fit(trainData)
    trainDataFrame = oEncoder.transform(trainData)
    predictionDataFrame = oEncoder.transform(predictionData)

    return trainDataFrame, predictionDataFrame
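A quick sketch exercising hash_encoding above with toy frames (the frames and values are assumptions). Fitting on the training frame and reusing the encoder keeps both outputs in the same hashed space:

import pandas as pd

cols = ['housing_situation', 'satisfaction', 'gender', 'hair_color',
        'country', 'profession', 'degree']
train = pd.DataFrame({c: ['a', 'b', 'a'] for c in cols})
pred = pd.DataFrame({c: ['b', 'a'] for c in cols})
train_enc, pred_enc = hash_encoding(train, pred)
print(train_enc.shape, pred_enc.shape)  # (3, 8) (2, 8)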
Example #17
    def __init__(self, encoder_type, columns_name=None):
        """
        :param encoder_type:
        :param columns_name: list of feature (column) names
        """
        if encoder_type == "BackwardDe":  # backward difference encoding
            self.encoder = ce.BackwardDifferenceEncoder(cols=columns_name)

        elif encoder_type == "BaseN":  # BaseN encoding
            self.encoder = ce.BaseNEncoder(cols=columns_name)

        elif encoder_type == "Binary":  # binary encoding
            self.encoder = ce.BinaryEncoder(cols=columns_name)

        elif encoder_type == "Catboost":
            self.encoder = ce.CatBoostEncoder(cols=columns_name)

        elif encoder_type == "Hash":
            self.encoder = ce.HashingEncoder(cols=columns_name)

        elif encoder_type == "Helmert":
            self.encoder = ce.HelmertEncoder(cols=columns_name)

        elif encoder_type == "JamesStein":
            self.encoder = ce.JamesSteinEncoder(cols=columns_name)

        elif encoder_type == "LOO":  # leave-one-out encoding
            self.encoder = ce.LeaveOneOutEncoder(cols=columns_name)

        elif encoder_type == "ME":  # M-estimate encoding
            self.encoder = ce.MEstimateEncoder(cols=columns_name)

        elif encoder_type == "OneHot":
            self.encoder = ce.OneHotEncoder(cols=columns_name)

        elif encoder_type == "OrdinalEncoder":  # ordinal encoding
            self.encoder = ce.OrdinalEncoder(cols=columns_name)

        elif encoder_type == "Sum":  # sum encoding
            self.encoder = ce.SumEncoder(cols=columns_name)

        elif encoder_type == "Polynomial":  # polynomial encoding
            self.encoder = ce.PolynomialEncoder(cols=columns_name)

        elif encoder_type == "Target":  # target encoding
            self.encoder = ce.TargetEncoder(cols=columns_name)

        elif encoder_type == "WOE":  # weight-of-evidence encoding
            self.encoder = ce.WOEEncoder(cols=columns_name)

        else:
            raise ValueError("Please choose a valid encoding type")
Example #18
    def test_hashing_np(self):
        """
        Creates a dataset and encodes it with the hashing trick.

        :return:
        """

        X = self.create_array(n_rows=1000)
        X_t = self.create_array(n_rows=100)

        enc = encoders.HashingEncoder(verbose=1, n_components=128)
        enc.fit(X, None)
        self.verify_numeric(enc.transform(X_t))
Example #19
    def test_must_not_reset_index(self):
        columns = ['column1', 'column2', 'column3', 'column4']
        df = pd.DataFrame([[i, i, i, i] for i in range(10)], columns=columns)
        df = df.iloc[2:8, :]
        target_columns = ['column1', 'column2', 'column3']

        single_process_encoder = encoders.HashingEncoder(max_process=1,
                                                         cols=target_columns)
        single_process_encoder.fit(df, None)
        df_encoded_single_process = single_process_encoder.transform(df)
        assert_index_equal(df.index, df_encoded_single_process.index)
        assert df.shape[0] == pd.concat([df, df_encoded_single_process],
                                        axis=1).shape[0]

        multi_process_encoder = encoders.HashingEncoder(cols=target_columns)
        multi_process_encoder.fit(df, None)
        df_encoded_multi_process = multi_process_encoder.transform(df)
        assert_index_equal(df.index, df_encoded_multi_process.index)
        assert df.shape[0] == pd.concat([df, df_encoded_multi_process],
                                        axis=1).shape[0]

        assert_frame_equal(df_encoded_single_process, df_encoded_multi_process)
Example #20
def get_encoder_dict():
    encoder_dict = {
        'OneHotEncoder': ce.OneHotEncoder(),
        'BinaryEncoder': ce.BinaryEncoder(),
        'HashingEncoder': ce.HashingEncoder(),
        'LabelEncoder': le.MultiColumnLabelEncoder(),
        'FrequencyEncoder': fe.FrequencyEncoder(),
        'TargetEncoder': ce.TargetEncoder(),
        'HelmertEncoder': ce.HelmertEncoder(),
        'JamesSteinEncoder': ce.JamesSteinEncoder(),
        'BaseNEncoder': ce.BaseNEncoder(),
        'SumEncoder': ce.SumEncoder(),
    }
    return encoder_dict
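A usage sketch for the registry above; note that le and fe are this project's own wrapper modules, so instantiating the full dict assumes they are importable (the toy frame is an assumption):

import pandas as pd

df = pd.DataFrame({'city': ['london', 'paris', 'tokyo', 'paris']})
enc = get_encoder_dict()['HashingEncoder']
print(enc.fit_transform(df).head())  # hash columns col_0 ... col_7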
Example #21
def feature_encoding(df):
    # categorical data encoding
    df = df[df['workclass'] != '?']
    df = df[df['occupation'] != '?']
    df = df[df['native.country'] != '?']
    one_hot_list = ['sex', 'race']
    hash_list = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'native.country']
    ohe = ce.OneHotEncoder(cols=one_hot_list)
    le = ce.OrdinalEncoder()
    he = ce.HashingEncoder(cols=hash_list, drop_invariant=True, n_components=6)
    df['income'] = le.fit_transform(df['income'])
    df = ohe.fit_transform(df)
    df = he.fit_transform(df)
    return df
Example #22
    def create_pipeline(self):

        # create pipeline
        distance_arguments = dict(start_lat="pickup_latitude",
                                  start_lon="pickup_longitude",
                                  end_lat="dropoff_latitude",
                                  end_lon="dropoff_longitude")

        distance_columns = list(distance_arguments.values())

        time_columns = ["pickup_datetime"]

        # getting params
        distance_params = self.pipeline.get('distance', dict())
        dt_params = {**distance_params, **distance_arguments}

        time_params = self.pipeline.get('time', dict())

        pipe_distance = make_pipeline(DistanceTransformer(**dt_params),
                                      RobustScaler())

        pipe_geohash = make_pipeline(GeohashTransformer(), ce.HashingEncoder())

        pipe_direction = make_pipeline(DirectionTransformer(), RobustScaler())

        pipe_distance_to_center = make_pipeline(DistanceToCenterTransformer(),
                                                RobustScaler())

        pipe_time = make_pipeline(
            TimeTransformer(time_column='pickup_datetime', **time_params),
            OneHotEncoder(handle_unknown='ignore'))

        transformers = [
            ('distance', pipe_distance, distance_columns),
            # ('geohash', pipe_geohash, distance_columns),  # bug
            ('direction', pipe_direction, distance_columns),
            ('distance_to_center', pipe_distance_to_center, distance_columns),
            ('time', pipe_time, time_columns),
        ]

        preprocessor = ColumnTransformer(transformers)

        estimator = self.create_estimator()

        steps = [('preprocessor', preprocessor), ('regressor', estimator)]

        pipeline = Pipeline(steps=steps)

        return pipeline
Example #23
    def __init__(self, is_testing=False):

        self._num_list = ['Pclass', 'SibSp', 'Parch', 'Fare']

        if is_testing:
            self._cat_list = ['Ticket', 'Sex']
        else:
            self._cat_list = ['Ticket', 'Sex', 'Survived']

        self._embarked_pipeline = Pipeline([
            (
                'imputer',
                SimpleImputer(strategy="most_frequent", missing_values=np.nan)
            ),  ## gets most frequently used value and replaces nan's with that value
            ('one_hot', OneHotEncoder()),  ## one hot encodes this feature
        ])

        self._cat_pipeline = Pipeline([
            (
                'imputer',
                SimpleImputer(strategy="most_frequent", missing_values=np.nan)
            ),  ## gets most frequently used value and replaces nan's with that value
            ('ordinal_encoder', OrdinalEncoder()
             ),  ## Replaces each string with an integer [0,n_categories-1]
            ('feature_scaler', MinMaxScaler())
        ])

        self._num_pipeline = Pipeline([
            (
                'imputer', SimpleImputer(strategy="mean",
                                         missing_values=np.nan)
            ),  ## replaces NaNs with the column mean
            ('feature_scaler', MinMaxScaler()
             ),  ## scales each feature into [0, 1]
        ])

        self._cabin_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy="constant", fill_value='U')),
            ('hash', ce.HashingEncoder(n_components=8))
        ])

        self._preprocessor = ColumnTransformer([
            ("numerical", self._num_pipeline, self._num_list),
            ("embarked", self._embarked_pipeline, ['Embarked']),
            ("name", NameSplitter.NameSplitter(), ['Name']),
            ("age", AgeSplitter.AgeSplitter(), ['Age']),
            ("cabin", self._cabin_pipeline, ['Cabin']),
            ("cat", self._cat_pipeline, self._cat_list),
        ])
Example #24
    def set_pipeline(self):

        time_features = make_pipeline(TimeFeaturesEncoder(time_column='pickup_datetime'),
                                      OneHotEncoder(handle_unknown='ignore'))

        pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())

        features_encoder = ColumnTransformer([
            ('distance', DistanceTransformer(**DIST_ARGS), list(DIST_ARGS.values())),
            ('time_features', time_features, ['pickup_datetime']),
            ('geohash', pipe_geohash, list(DIST_ARGS.values()))
        ])

        self.pipeline = Pipeline(steps=[
            ('features', features_encoder),
            ('rgs', self.get_estimator())])
Example #25
    def set_pipeline(self):
        memory = self.kwargs.get('pipeline_memory', None)
        dist = self.kwargs.get('distance_type', 'haversine')
        feateng_steps = self.kwargs.get('feateng',
                                        ['distance', 'time_features'])

        if memory:
            memory = mkdtemp()

        # Define feature engineering pipeline blocks here
        pipe_time_features = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'))
        pipe_distance = make_pipeline(
            DistanceTransformer(distance_type=dist, **DIST_ARGS),
            StandardScaler())
        pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
        pipe_direction = make_pipeline(Direction(), StandardScaler())
        pipe_distance_to_center = make_pipeline(DistanceToCenter(),
                                                StandardScaler())

        # Combine pipes
        feateng_blocks = [
            ('distance', pipe_distance, list(DIST_ARGS.values())),
            ('time_features', pipe_time_features, ['pickup_datetime']),
            #('geohash', pipe_geohash, list(DIST_ARGS.values())),
            ('direction', pipe_direction, list(DIST_ARGS.values())),
            ('distance_to_center', pipe_distance_to_center,
             list(DIST_ARGS.values())),
        ]

        # Build a new list instead of removing items while iterating
        feateng_blocks = [bloc for bloc in feateng_blocks
                          if bloc[0] in feateng_steps]

        features_encoder = ColumnTransformer(feateng_blocks,
                                             n_jobs=None,
                                             remainder="drop")

        self.pipeline = Pipeline(steps=[('features', features_encoder),
                                        ('rgs', self.get_estimator())],
                                 memory=memory)

        if self.optimize:
            self.pipeline.steps.insert(
                -1, ('optimize_size', OptimizeSize(verbose=False)))
Example #26
    def _build_features(self, df):
        """Build features."""

        with Timer("Building timestamp features"):
            df = self._build_features_datetime(df)

        with Timer("Building str features"):
            df = self._build_features_str(df)

        label_cols = list(set(df.columns).intersection(set(LABEL_COLS)))
        df.loc[:, label_cols] = df.loc[:, label_cols].astype(str)

        # # Gives memory error
        # with Timer("Encoding with BackwardDifferenceEncoder"):
        #     backward_diff_cols = list(
        #         set(label_cols).intersection(
        #             set(HIGH_NUM_CAT + MEDIUM_NUM_CAT)))
        #     bd_encoder = ce.backward_difference.BackwardDifferenceEncoder(
        #         cols=backward_diff_cols, verbose=1)
        #     dftmp = bd_encoder.fit_transform(df)

        if self.hash_encode:
            with Timer("Encoding with HashingEncoder"):
                for col in ['RESOURCE_ID', 'RUE', 'VILLE']:
                    hash_cols = list(set(label_cols).intersection(set([col])))
                    hash_encoder = ce.HashingEncoder(
                        cols=hash_cols, n_components=8, verbose=1)
                    dftmp = hash_encoder.fit_transform(df)
                    newcols = dftmp.columns.difference(df.columns)
                    dftmp = dftmp[newcols]
                    dftmp.columns = 'hash_{}_'.format(col) + dftmp.columns
                    df = pd.concat([df, dftmp], axis=1)

        if self.label_encode:
            # Remaining non-numeric columns at the end: simple label encoding
            with Timer("Encoding remaining ones with LabelEncoder"):
                other_cols = df.columns.difference(
                    df._get_numeric_data().columns).tolist()
                le = preprocessing.LabelEncoder()
                for col in other_cols:
                    df.loc[:, col] = le.fit_transform(df[col])

        to_drop = list(set(df.columns).intersection(set(TIMESTAMP_COLS)))
        df = df.drop(columns=to_drop)

        return df
Example #27
def feature_encoding(X_train, y_train, X_test, method='ordinal'):
    columns = list(X_train.select_dtypes(include=['object']).columns)
    if method == "ordinal":
        ce_binary = ce.OrdinalEncoder(cols=columns)
    elif method == "binary":
        ce_binary = ce.BinaryEncoder(cols=columns)
    elif method == "onehot":
        ce_binary = ce.OneHotEncoder(cols=columns)
    elif method == "basen":
        ce_binary = ce.BaseNEncoder(cols=columns)
    elif method == "hashing":
        ce_binary = ce.HashingEncoder(cols=columns)
    else:
        raise Exception("Wrong Method Choosen!")

    X_train = ce_binary.fit_transform(X_train, y_train)
    X_test = ce_binary.transform(X_test)
    return X_train.values, y_train, X_test.values
Example #28
    def test_hashing(self):
        """
        Creates a dataset and encodes it with the hashing trick.

        :return:
        """

        cols = ['C1', 'D', 'E', 'F']
        enc = encoders.HashingEncoder(verbose=1, n_components=128, cols=cols)
        X = self.create_dataset(n_rows=1000)

        X_test = enc.fit_transform(X, None)

        for dt in X_test.dtypes:
            numeric = False
            if dt == int or dt == float:
                numeric = True
            self.assertTrue(numeric)
Example #29
def main():
    data_dir = '../data'
    # Load fighter data
    with open(f'{data_dir}/fighter_data_en.pkl', 'rb') as pklfile:
        fighter_data_en = pickle.load(pklfile)

    # Load fight card for prediction
    df_fight_card = pd.read_excel('../data/fight_card.xlsx',
                                  sheet_name='Sheet1')

    # Encode weight class with the hashing trick (8 components by default)
    enc = ce.HashingEncoder(cols=['weight_class'])
    fight_card_final = enc.fit_transform(df_fight_card)
    fight_card_final.rename(
        columns={f'col_{i}': f'weight_class_col_{i}' for i in range(8)},
        inplace=True)

    stat_cols = ['height', 'weight', 'reach', 'wins', 'losses', 'draws',
                 'SLpM', 'Str_Acc', 'SApM', 'Str_Dep', 'TD_Avg', 'TD_Acc',
                 'TD_Def', 'Sub_Avg']

    # Get data for fighter 1
    fight_card_final = fight_card_final.merge(
        fighter_data_en, left_on=['fighter1'], right_on=['full_name'],
        how='left').drop(columns=['full_name', 'fighter1'])
    fight_card_final.rename(
        columns={**{f'col_{i}': f'stance_col_{i}_fighter1' for i in range(8)},
                 **{c: f'{c}_fighter1' for c in stat_cols}},
        inplace=True)

    # Get data for fighter 2
    fight_card_final = fight_card_final.merge(
        fighter_data_en, left_on=['fighter2'], right_on=['full_name'],
        how='left').drop(columns=['full_name', 'fighter2'])
    fight_card_final.rename(
        columns={**{f'col_{i}': f'stance_col_{i}_fighter2' for i in range(8)},
                 **{c: f'{c}_fighter2' for c in stat_cols}},
        inplace=True)

    fight_card_final.to_csv(f'{data_dir}/fight_card_final.csv')
    with open(f'{data_dir}/fight_card_final.pkl', 'wb') as pklfile:
        pickle.dump(fight_card_final, pklfile)
Example #30
    def _get_model(self, X, y=None):

        params = dict(
            cols=self.columns_to_encode,
            return_df=self.desired_output_type == DataTypes.DataFrame)

        if self.encoding_type == "dummy":
            return category_encoders.OneHotEncoder(use_cat_names=True,
                                                   **params)

        elif self.encoding_type == "binary":
            return category_encoders.BinaryEncoder(**params)

        elif self.encoding_type == "basen":
            return category_encoders.BaseNEncoder(base=self.basen_base,
                                                  **params)

        elif self.encoding_type == "hashing":
            return category_encoders.HashingEncoder(
                n_components=self.hashing_n_components, **params)

        #        elif self.encoding_type == "helmer":
        #            return category_encoders.HelmertEncoder(**params)
        #
        #        elif self.encoding_type == "polynomial":
        #            return category_encoders.PolynomialEncoder(**params)
        #
        #        elif self.encoding_type == "sum_coding":
        #            return category_encoders.SumEncoder(**params)
        #
        #        elif self.encoding_type == "backward_coding":
        #            return category_encoders.BackwardDifferenceEncoder(**params)
        # Those don't work well either => they sometimes change the output size...

        # Remark: other categorical encoders not included:
        # * Target Encoder
        # * Leave One Out
        # Moreover there is a bug in those encoders: fit_transform doesn't work correctly.
        # They use the target, and I'd rather know exactly what I'm doing, using tailor-made classes.

        else:
            raise ValueError("Unknown 'encoding_type' : %s" %
                             self.encoding_type)