def test_hashing(self):
    """Creates a dataset and encodes it with the hashing trick."""
    cols = ['C1', 'D', 'E', 'F']
    X = self.create_dataset(n_rows=1000)
    X_t = self.create_dataset(n_rows=100)

    enc = encoders.HashingEncoder(verbose=1, n_components=128, cols=cols)
    enc.fit(X, None)
    self.verify_numeric(enc.transform(X_t))

    enc = encoders.HashingEncoder(verbose=1, n_components=32)
    enc.fit(X, None)
    self.verify_numeric(enc.transform(X_t))

    enc = encoders.HashingEncoder(verbose=1, n_components=32, drop_invariant=True)
    enc.fit(X, None)
    self.verify_numeric(enc.transform(X_t))

    enc = encoders.HashingEncoder(verbose=1, n_components=32, return_df=False)
    enc.fit(X, None)
    self.assertTrue(isinstance(enc.transform(X_t), np.ndarray))
def hashing_encoding(df, cols, handle_nan=True):
    # Note: both branches construct the same encoder. HashingEncoder hashes
    # NaN like any other category, so the handle_nan flag has no effect here.
    if handle_nan:
        encoder = ce.HashingEncoder(cols=cols)
    else:
        encoder = ce.HashingEncoder(cols=cols)
    df_new = encoder.fit_transform(df)
    return df_new
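# A minimal sketch (toy data assumed, not from the source) confirming the note
# above: both branches of hashing_encoding build identical encoders, so the
# handle_nan flag does not change the output.
import numpy as np
import pandas as pd
import category_encoders as ce

_demo = pd.DataFrame({'city': ['ny', 'sf', np.nan, 'ny']})
assert hashing_encoding(_demo, ['city'], handle_nan=True).equals(
    hashing_encoding(_demo, ['city'], handle_nan=False))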
def set_pipeline(self):
    dist = self.kwargs.get("distance_type", "haversine")
    feateng_steps = self.kwargs.get("feateng", ["distance", "time_features"])

    # Define feature engineering pipeline blocks here
    pipe_time_features = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'))
    pipe_distance = make_pipeline(DistanceTransformer(distance_type=dist),
                                  StandardScaler())
    pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
    # Add new feature engineering blocks above
    # pipe_direction =
    # pipe_distance_to_center =

    # Define default feature engineering blocks
    feateng_blocks = [
        ('distance', pipe_distance, list(DIST_ARGS.values())),
        ('time_features', pipe_time_features, ['pickup_datetime']),
        ('geohash', pipe_geohash, list(DIST_ARGS.values())),
        # ('direction', pipe_direction, list(DIST_ARGS.values())),
        # ('distance_to_center', pipe_distance_to_center, list(DIST_ARGS.values())),
    ]

    # Filter out some blocks according to input parameters; iterate over a
    # copy, since removing items from a list while iterating over it skips
    # elements.
    for bloc in list(feateng_blocks):
        if bloc[0] not in feateng_steps:
            feateng_blocks.remove(bloc)

    features_encoder = ColumnTransformer(feateng_blocks, n_jobs=None,
                                         remainder="drop")

    self.pipeline = Pipeline(steps=[('features', features_encoder),
                                    ('rgs', self.get_estimator())])
def encode_variables(self, one_hot_list=[], boolean_list=[], many_list=[],
                     standard_scaler=[], dim_reduc=False):
    # Boolean
    if boolean_list:
        input_cols = list(self.X.columns)
        bool_enc = ce.OrdinalEncoder(cols=boolean_list, drop_invariant=True).fit(self.X)
        self.X = bool_enc.transform(self.X)
        self.models.append({'model_type': 'boolean', 'model_file': bool_enc,
                            'input_cols': input_cols, 'model_variables': boolean_list})

    # One-hot
    if one_hot_list:
        input_cols = list(self.X.columns)
        onehot_enc = ce.OneHotEncoder(cols=one_hot_list, drop_invariant=True,
                                      use_cat_names=True).fit(self.X)
        self.X = onehot_enc.transform(self.X)
        self.models.append({'model_type': 'one_hot', 'model_file': onehot_enc,
                            'input_cols': input_cols, 'model_variables': one_hot_list})

    # High-cardinality columns: hashing trick
    if many_list:
        input_cols = list(self.X.columns)
        hash_enc = ce.HashingEncoder(cols=many_list, n_components=15,
                                     drop_invariant=True).fit(self.X)
        self.X = hash_enc.transform(self.X)
        self.models.append({'model_type': 'hash_trick', 'model_file': hash_enc,
                            'input_cols': input_cols, 'model_variables': many_list})

    # Scaling
    if standard_scaler:
        input_cols = list(self.X.columns)
        df_standard = self.X[standard_scaler]
        scaler = StandardScaler()
        self.X[standard_scaler] = scaler.fit_transform(df_standard)
        self.models.append({'model_type': 'standard_scaler', 'model_file': scaler,
                            'model_variables': standard_scaler})

    # SVD
    if dim_reduc == 'SVD':
        self.dimensional_reduction(method='SVD', n_components=10)
def hash_encoder(df, cols, no_new_cols_per):
    print("<hash> df rows: %d" % df.shape[0])
    for col in cols:
        print("hashing col %s" % col)
        ce_hash = ce.HashingEncoder(cols=[col], n_components=no_new_cols_per)
        X = df[col]
        new_cols_df = ce_hash.fit_transform(X)
        print("new cols df rows: %d" % new_cols_df.shape[0])
        df = df.drop(col, axis=1)
        # Rename the generic col_0..col_{n-1} outputs to <col>_0..<col>_{n-1}
        for i in range(no_new_cols_per):
            placeholder_name = "col_%d" % i
            new_col_name = "%s_%d" % (col, i)
            # print("new_cols_df before rename:")
            # print(new_cols_df.head(n=1))
            new_cols_df = new_cols_df.rename(
                columns={placeholder_name: new_col_name})
            # print("new_cols_df after rename:")
            # print(new_cols_df.head(n=1))
        # Append the new columns to the dataframe
        print("BEFORE concatting for col %s" % col)
        print("<hash> df rows: %d" % df.shape[0])
        print("<hash> new cols rows: %d" % new_cols_df.shape[0])
        df.reset_index(drop=True, inplace=True)
        new_cols_df.reset_index(drop=True, inplace=True)
        df = pd.concat([df, new_cols_df], axis=1)
        print("concatting for col %s" % col)
        print("<hash> df rows: %d" % df.shape[0])
    return df
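# Hedged usage sketch for hash_encoder above (toy frame assumed): each listed
# column is replaced by <col>_0 .. <col>_{n-1} hash features.
import pandas as pd

_toy = pd.DataFrame({'color': ['red', 'blue', 'red'],
                     'size': ['S', 'M', 'L'],
                     'price': [1.0, 2.0, 3.0]})
_encoded = hash_encoder(_toy, ['color', 'size'], no_new_cols_per=4)
print(list(_encoded.columns))  # price, color_0..color_3, size_0..size_3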
def prep_fight_card(df_fight_card, fighter_data_en):
    # Encode weight class: HashingEncoder emits col_0..col_7 by default
    enc = ce.HashingEncoder(cols=['weight_class'])
    fight_card_final = enc.fit_transform(df_fight_card)
    fight_card_final.rename(
        columns={'col_{}'.format(i): 'weight_class_col_{}'.format(i)
                 for i in range(8)},
        inplace=True)

    # Stats columns shared by both fighters
    stat_cols = ['height', 'weight', 'reach', 'wins', 'losses', 'draws',
                 'SLpM', 'Str_Acc', 'SApM', 'Str_Dep', 'TD_Avg', 'TD_Acc',
                 'TD_Def', 'Sub_Avg']

    # Get data for fighter1
    fight_card_final = fight_card_final.merge(
        fighter_data_en, left_on=['fighter1'], right_on=['full_name'],
        how='left').drop(columns=['full_name', 'fighter1'])
    renames = {'col_{}'.format(i): 'stance_col_{}_fighter1'.format(i)
               for i in range(8)}
    renames.update({c: c + '_fighter1' for c in stat_cols})
    fight_card_final.rename(columns=renames, inplace=True)

    # Get data for fighter2
    fight_card_final = fight_card_final.merge(
        fighter_data_en, left_on=['fighter2'], right_on=['full_name'],
        how='left').drop(columns=['full_name', 'fighter2'])
    renames = {'col_{}'.format(i): 'stance_col_{}_fighter2'.format(i)
               for i in range(8)}
    renames.update({c: c + '_fighter2' for c in stat_cols})
    fight_card_final.rename(columns=renames, inplace=True)

    return fight_card_final
def set_pipeline(self):
    memory = self.kwargs.get("pipeline_memory", None)
    dist = self.kwargs.get("distance_type", "euclidian")
    feateng_steps = self.kwargs.get("feateng",
                                    ["distance", "time_features",
                                     "direction", "distance_to_center"])
    if memory:
        memory = mkdtemp()

    # Define feature engineering pipeline blocks here
    pipe_time_features = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'))
    pipe_distance = make_pipeline(
        DistanceTransformer(distance_type=dist, **DIST_ARGS), RobustScaler())
    pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
    pipe_direction = make_pipeline(Direction(), RobustScaler())
    pipe_distance_to_center = make_pipeline(DistanceToCenter(), RobustScaler())

    # Define default feature engineering blocks
    feateng_blocks = [
        ('distance', pipe_distance, list(DIST_ARGS.values())),
        ('time_features', pipe_time_features, ['pickup_datetime']),
        ('geohash', pipe_geohash, list(DIST_ARGS.values())),
        ('direction', pipe_direction, list(DIST_ARGS.values())),
        ('distance_to_center', pipe_distance_to_center, list(DIST_ARGS.values())),
    ]

    # Filter out some blocks according to input parameters; iterate over a
    # copy so that removing items does not skip elements.
    for bloc in list(feateng_blocks):
        if bloc[0] not in feateng_steps:
            feateng_blocks.remove(bloc)

    features_encoder = ColumnTransformer(feateng_blocks, n_jobs=None,
                                         remainder="drop")

    self.pipeline = Pipeline(steps=[('features', features_encoder),
                                    ('rgs', self.get_estimator())],
                             memory=memory)
def _fit_hash(self, df, target):
    hash_encoder = ce.HashingEncoder()
    hash_encoder.fit(df[target].map(to_str))
    name = ['continuous_' + remove_continuous_discrete_prefix(x) + '_hash'
            for x in hash_encoder.get_feature_names()]
    self.trans_ls.append(('hash', name, target, hash_encoder))
def hashing():
    X, _, _ = get_mushroom_data()
    print(X.info())
    enc = ce.HashingEncoder()
    enc.fit(X, None)
    out = enc.transform(X)
    print(out.info())
    del enc, _, X, out
def hash_encode1(df2):
    df = df2.copy()
    categorical_features = df.select_dtypes(include=['category']).columns.values
    hashing_encoder = ce.HashingEncoder(n_components=len(categorical_features),
                                        cols=categorical_features.tolist())
    # Assign by position: the encoder's output columns are named col_0..col_N,
    # so assigning the DataFrame directly would align on column names and fill
    # the target columns with NaN.
    df[categorical_features] = hashing_encoder.fit_transform(
        df[categorical_features]).to_numpy()
    return df
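# Sketch (toy data assumed): hash_encode1 only picks up columns with pandas
# dtype 'category', so the input must be cast accordingly.
import pandas as pd

_toy = pd.DataFrame({'cat': pd.Series(['x', 'y', 'x'], dtype='category'),
                     'num': [1, 2, 3]})
print(hash_encode1(_toy).head())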
def create_features(self, df_train, df_test):
    encoder = ce.HashingEncoder(cols=self.columns)
    encoder.fit(df_train[self.columns],
                df_train[self.target_column].values.tolist())
    encoded_train = encoder.transform(df_train[self.columns])
    encoded_test = encoder.transform(df_test[self.columns])
    for column in encoded_train.columns:
        self.train[column + '_HashingEncoder'] = encoded_train[column]
        self.test[column + '_HashingEncoder'] = encoded_test[column]
def pd_colcat_encoder_generic(df, col, pars):
    """Generic categorical encoder wrapper.

    https://pypi.org/project/category-encoders/
    encoder = ce.BackwardDifferenceEncoder(cols=[...])
    encoder = ce.BaseNEncoder(cols=[...])
    encoder = ce.BinaryEncoder(cols=[...])
    encoder = ce.CatBoostEncoder(cols=[...])
    encoder = ce.CountEncoder(cols=[...])
    encoder = ce.GLMMEncoder(cols=[...])
    encoder = ce.HashingEncoder(cols=[...])
    encoder = ce.HelmertEncoder(cols=[...])
    encoder = ce.JamesSteinEncoder(cols=[...])
    encoder = ce.LeaveOneOutEncoder(cols=[...])
    encoder = ce.MEstimateEncoder(cols=[...])
    encoder = ce.OneHotEncoder(cols=[...])
    encoder = ce.OrdinalEncoder(cols=[...])
    encoder = ce.SumEncoder(cols=[...])
    encoder = ce.PolynomialEncoder(cols=[...])
    encoder = ce.TargetEncoder(cols=[...])
    encoder = ce.WOEEncoder(cols=[...])
    """
    colcat = col
    import category_encoders as ce

    pars_encoder = pars
    pars_encoder['cols'] = col
    if 'path_pipeline_export' in pars:
        try:
            pars_encoder = load(pars['path_pipeline_export'] + '/colcat_encoder_pars.pkl')
        except Exception:
            pass

    encoder = ce.HashingEncoder(**pars_encoder)
    dfcat_bin = encoder.fit_transform(df[col])
    colcat_encoder = list(dfcat_bin.columns)

    ###################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfcat_bin, 'dfcat_encoder', pars['path_features_store'])
        save(encoder, pars['path_pipeline_export'] + "/colcat_encoder_model.pkl")
        save(pars_encoder, pars['path_pipeline_export'] + "/colcat_encoder_pars.pkl")
        save(colcat_encoder, pars['path_pipeline_export'] + "/colcat_encoder.pkl")

    col_pars = {}
    col_pars['col_encode_model'] = encoder
    col_pars['cols_new'] = {
        'colcat_encoder': colcat_encoder  # list of encoded column names
    }
    return dfcat_bin, col_pars
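# Hedged usage sketch: with no path_* keys in pars, the wrapper just fits a
# HashingEncoder on the given columns (n_components is a real HashingEncoder
# parameter; the toy frame is assumed).
import pandas as pd

_toy = pd.DataFrame({'brand': ['a', 'b', 'c', 'a']})
_df_bin, _col_pars = pd_colcat_encoder_generic(_toy, col=['brand'],
                                               pars={'n_components': 8})
print(_col_pars['cols_new'])  # {'colcat_encoder': ['col_0', ..., 'col_7']}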
def encode_categorical(df, encoder_name='binary'):
    encoder_dic = {"one_hot": ce.OneHotEncoder(),
                   "feature_hashing": ce.HashingEncoder(n_components=32),
                   "binary": ce.BinaryEncoder(),
                   "ordinal": ce.OrdinalEncoder(),
                   "polynomial": ce.PolynomialEncoder()}
    encoder = encoder_dic.get(encoder_name)
    # Note: verbose is a constructor argument for these encoders; passing it
    # to fit() is silently ignored.
    encoder.fit(df)
    df = encoder.transform(df)
    return df
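# Usage sketch on an assumed toy frame; encoder names as defined in the dict
# above. With 'feature_hashing', the single object column is replaced by 32
# hash features.
import pandas as pd

_toy = pd.DataFrame({'fruit': ['apple', 'pear', 'apple', 'plum']})
print(encode_categorical(_toy, encoder_name='feature_hashing').shape)  # (4, 32)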
def hash_encode(df, col_names=[], label_col='HasDetections', accumulate=True,
                return_df=True, dynamic_n_component=False, drop_invariant=True,
                debug=True, save_intermediate_results=False):
    col_names = list(col_names)
    if len(col_names) == 0:
        col_names = list(df.columns)
    if label_col in col_names:
        col_names.remove(label_col)
    print('hash encoding: {}'.format(col_names), flush=True)

    _frames = []
    _cols_arr = []
    for c in col_names:
        if debug:
            print('processing: {}'.format(c), flush=True)
        _model = df[[c]]
        _val_ct = df[c].value_counts().count()
        # Optionally scale the number of hash components with the log of the
        # column's cardinality; otherwise use 8 components.
        _n_comp = 8 * int(np.log(_val_ct)) if dynamic_n_component and _val_ct > 10 else 8
        hash_enc = ce.HashingEncoder(cols=[c], return_df=return_df,
                                     n_components=_n_comp,
                                     drop_invariant=drop_invariant)
        print('hash_enc: {}'.format(hash_enc), flush=True)
        hash_enc.fit(_model, df[label_col])
        _df = hash_enc.transform(_model)
        if return_df:
            _df.columns = '{}_'.format(c) + _df.columns
        else:
            _cols_arr.append(c)
        if accumulate:
            _frames.append(_df)
        if save_intermediate_results:
            _dict = {'col': c, 'hash_enc': hash_enc, 'data': _df}
            _fl_name = ('input/{}_df_train.pkl'.format(c) if return_df
                        else 'input/{}_np_train.pkl'.format(c))
            joblib.dump(_dict, _fl_name)
    return pd.concat(_frames, axis=1) if return_df else (_frames, _cols_arr)
def prepare_data(data, path, name, max_sample):
    from sklearn import preprocessing
    import category_encoders as ce
    import pandas as pd

    print("start of prepare_data")
    cat_final = ['professionid', 'birthplace', 'residencezipcode',
                 'companyzipcode', 'legalzipcode', 'education', 'maritalstatus']
    quant = ['numofdependence',
             'monthlyfixedincome', 'monthlyvariableincome', 'spouseincome',
             'avg_income', 'std_income', 'avg_income_cnt',
             'avg_income_nation', 'std_income_nation', 'avg_income_nation_cnt',
             'avg_income_area', 'std_icnome_area', 'avg_income_area_cnt',
             'avg_sale_house_price_5000', 'std_sale_house_price_5000', 'sale_house_cnt_5000',
             'avg_sale_apartment_price_5000', 'std_sale_apartment_price_5000', 'sale_apartment_cnt_5000',
             'avg_rent_house_price_5000', 'std_rent_house_price_5000', 'rent_house_cnt_5000',
             'avg_rent_apartment_price_5000', 'std_rent_apartment_price_5000', 'rent_apartment_cnt_5000',
             'avg_sale_house_price_10000', 'std_sale_house_price_10000', 'sale_house_cnt_10000',
             'avg_sale_apartment_price_10000', 'std_sale_apartment_price_10000', 'sale_apartment_cnt_10000',
             'avg_rent_house_price_10000', 'std_rent_house_price_10000', 'rent_house_cnt_10000',
             'avg_rent_apartment_price_10000', 'std_rent_apartment_price_10000', 'rent_apartment_cnt_10000',
             'previous_converted']

    # resample_df = pd.read_csv(path + name)
    resample_df = data
    final_df = resample_df[cat_final]
    final_df['index'] = resample_df.index

    # Label-encode the ordinal features
    le = preprocessing.LabelEncoder()
    final_df['professionid'] = le.fit_transform(resample_df.professionid)
    final_df['education'] = le.fit_transform(resample_df.education)

    # The remaining categorical columns are non-ordinal
    non_ordinal = [c for c in cat_final if c not in ('professionid', 'education')]

    # Hashing encoding for large-scale categorical data
    HE = ce.HashingEncoder(cols=non_ordinal, return_df=True, max_sample=max_sample)
    # Encode the categorical variables
    data = HE.fit_transform(final_df)

    # Categorical features: fill NaN with the mode (mode() returns a Series,
    # so take its first element)
    data = data.apply(lambda x: x.fillna(x.mode()[0]), axis=0)
    # Quantitative features: fill NaN with the mean
    quant_data = resample_df[quant].apply(lambda x: x.fillna(x.mean()), axis=0)

    X = quant_data.reset_index().merge(data, on='index').drop('index', axis=1)
    y = resample_df['defaulted']
    print("Shape of X, y: ", X.shape, y.shape)
    return X, y
def hash_encoding(trainData, predictionData):
    oEncoder = ce.HashingEncoder(cols=[
        'housing_situation', 'satisfaction', 'gender', 'hair_color',
        'country', 'profession', 'degree'
    ])
    oEncoder.fit(trainData)
    trainDataFrame = oEncoder.transform(trainData)
    predictionDataFrame = oEncoder.transform(predictionData)
    return trainDataFrame, predictionDataFrame
def __init__(self, encoder_type, columns_name=None):
    """
    :param encoder_type: name of the encoding scheme to use
    :param columns_name: list of feature (column) names to encode
    """
    if encoder_type == "BackwardDe":  # backward difference encoding
        self.encoder = ce.BackwardDifferenceEncoder(cols=columns_name)
    elif encoder_type == "BaseN":  # base-N encoding
        self.encoder = ce.BaseNEncoder(cols=columns_name)
    elif encoder_type == "Binary":  # binary encoding
        self.encoder = ce.BinaryEncoder(cols=columns_name)
    elif encoder_type == "Catboost":
        self.encoder = ce.CatBoostEncoder(cols=columns_name)
    elif encoder_type == "Hash":
        self.encoder = ce.HashingEncoder(cols=columns_name)
    elif encoder_type == "Helmert":
        self.encoder = ce.HelmertEncoder(cols=columns_name)
    elif encoder_type == "JamesStein":
        self.encoder = ce.JamesSteinEncoder(cols=columns_name)
    elif encoder_type == "LOO":  # leave-one-out encoding
        self.encoder = ce.LeaveOneOutEncoder(cols=columns_name)
    elif encoder_type == "ME":  # M-estimate encoder
        self.encoder = ce.MEstimateEncoder(cols=columns_name)
    elif encoder_type == "OneHot":
        self.encoder = ce.OneHotEncoder(cols=columns_name)
    elif encoder_type == "OridinalEncoder":  # ordinal encoding
        self.encoder = ce.OrdinalEncoder(cols=columns_name)
    elif encoder_type == "Sum":  # sum encoding
        self.encoder = ce.SumEncoder(cols=columns_name)
    elif encoder_type == "Polynomial":  # polynomial encoding
        self.encoder = ce.PolynomialEncoder(cols=columns_name)
    elif encoder_type == "Target":  # target encoding
        self.encoder = ce.TargetEncoder(cols=columns_name)
    elif encoder_type == "WOE":  # weight-of-evidence encoder
        self.encoder = ce.WOEEncoder(cols=columns_name)
    else:
        raise ValueError("Please choose a valid encoding type")
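# Hypothetical usage; the enclosing class is not shown above, so the name
# "EncoderWrapper" is assumed purely for illustration.
import pandas as pd

_toy = pd.DataFrame({'city': ['ny', 'sf', 'ny', 'la']})
_wrapper = EncoderWrapper("Hash", columns_name=['city'])
print(_wrapper.encoder.fit_transform(_toy).head())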
def test_hashing_np(self):
    """Creates a numpy dataset and encodes it with the hashing trick."""
    X = self.create_array(n_rows=1000)
    X_t = self.create_array(n_rows=100)
    enc = encoders.HashingEncoder(verbose=1, n_components=128)
    enc.fit(X, None)
    self.verify_numeric(enc.transform(X_t))
def test_must_not_reset_index(self):
    columns = ['column1', 'column2', 'column3', 'column4']
    df = pd.DataFrame([[i, i, i, i] for i in range(10)], columns=columns)
    df = df.iloc[2:8, :]
    target_columns = ['column1', 'column2', 'column3']

    single_process_encoder = encoders.HashingEncoder(max_process=1, cols=target_columns)
    single_process_encoder.fit(df, None)
    df_encoded_single_process = single_process_encoder.transform(df)
    assert_index_equal(df.index, df_encoded_single_process.index)
    assert df.shape[0] == pd.concat([df, df_encoded_single_process], axis=1).shape[0]

    multi_process_encoder = encoders.HashingEncoder(cols=target_columns)
    multi_process_encoder.fit(df, None)
    df_encoded_multi_process = multi_process_encoder.transform(df)
    assert_index_equal(df.index, df_encoded_multi_process.index)
    assert df.shape[0] == pd.concat([df, df_encoded_multi_process], axis=1).shape[0]

    assert_frame_equal(df_encoded_single_process, df_encoded_multi_process)
def get_encoder_dict():
    encoder_dict = {
        'OneHotEncoder': ce.OneHotEncoder(),
        'BinaryEncoder': ce.BinaryEncoder(),
        'HashingEncoder': ce.HashingEncoder(),
        'LabelEncoder': le.MultiColumnLabelEncoder(),
        'FrequencyEncoder': fe.FrequencyEncoder(),
        'TargetEncoder': ce.TargetEncoder(),
        'HelmertEncoder': ce.HelmertEncoder(),
        'JamesSteinEncoder': ce.JamesSteinEncoder(),
        'BaseNEncoder': ce.BaseNEncoder(),
        'SumEncoder': ce.SumEncoder(),
    }
    return encoder_dict
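# Sketch: pick an encoder by name from the dict above (toy data assumed).
# Only the category_encoders entries are exercised here; le/fe are
# project-local modules that must be importable for get_encoder_dict to run.
import pandas as pd

_toy = pd.DataFrame({'cat': ['a', 'b', 'a', 'c']})
_enc = get_encoder_dict()['HashingEncoder']
print(_enc.fit_transform(_toy).head())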
def feature_encoding(df):
    # Categorical data encoding: drop rows with missing ('?') values first
    df = df[df['workclass'] != '?']
    df = df[df['occupation'] != '?']
    df = df[df['native.country'] != '?']

    one_hot_list = ['sex', 'race']
    hash_list = ['workclass', 'education', 'marital.status', 'occupation',
                 'relationship', 'native.country']

    ohe = ce.OneHotEncoder(cols=one_hot_list)
    le = ce.OrdinalEncoder()
    he = ce.HashingEncoder(cols=hash_list, drop_invariant=True, n_components=6)

    df['income'] = le.fit_transform(df['income'])
    df = ohe.fit_transform(df)
    df = he.fit_transform(df)
    return df
def create_pipeline(self):
    # Create pipeline
    distance_arguments = dict(start_lat="pickup_latitude",
                              start_lon="pickup_longitude",
                              end_lat="dropoff_latitude",
                              end_lon="dropoff_longitude")
    distance_columns = list(distance_arguments.values())
    time_columns = ["pickup_datetime"]

    # Getting params
    distance_params = self.pipeline.get('distance', dict())
    dt_params = {**distance_params, **distance_arguments}
    time_params = self.pipeline.get('time', dict())

    pipe_distance = make_pipeline(DistanceTransformer(**dt_params), RobustScaler())
    pipe_geohash = make_pipeline(GeohashTransformer(), ce.HashingEncoder())
    pipe_direction = make_pipeline(DirectionTransformer(), RobustScaler())
    pipe_distance_to_center = make_pipeline(DistanceToCenterTransformer(), RobustScaler())
    pipe_time = make_pipeline(
        TimeTransformer(time_column='pickup_datetime', **time_params),
        OneHotEncoder(handle_unknown='ignore'))

    transformers = [
        ('distance', pipe_distance, distance_columns),
        # ('geohash', pipe_geohash, distance_columns),  # bug
        ('direction', pipe_direction, distance_columns),
        ('distance_to_center', pipe_distance_to_center, distance_columns),
        ('time', pipe_time, time_columns),
    ]
    preprocessor = ColumnTransformer(transformers)

    estimator = self.create_estimator()
    steps = [('preprocessor', preprocessor), ('regressor', estimator)]
    pipeline = Pipeline(steps=steps)
    return pipeline
def __init__(self, is_testing=False):
    self._num_list = ['Pclass', 'SibSp', 'Parch', 'Fare']
    if is_testing:
        self._cat_list = ['Ticket', 'Sex']
    else:
        self._cat_list = ['Ticket', 'Sex', 'Survived']

    self._embarked_pipeline = Pipeline([
        # Replaces NaNs with the most frequently used value
        ('imputer', SimpleImputer(strategy="most_frequent", missing_values=np.nan)),
        # One-hot encodes this feature
        ('one_hot', OneHotEncoder()),
    ])
    self._cat_pipeline = Pipeline([
        # Replaces NaNs with the most frequently used value
        ('imputer', SimpleImputer(strategy="most_frequent", missing_values=np.nan)),
        # Replaces each string with an integer in [0, n_categories - 1]
        ('ordinal_encoder', OrdinalEncoder()),
        ('feature_scaler', MinMaxScaler()),
    ])
    self._num_pipeline = Pipeline([
        # Replaces NaNs with the column mean
        ('imputer', SimpleImputer(strategy="mean", missing_values=np.nan)),
        ('feature_scaler', MinMaxScaler()),
    ])
    self._cabin_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="constant", fill_value='U')),
        ('hash', ce.HashingEncoder(n_components=8)),
    ])
    self._preprocessor = ColumnTransformer([
        ("numerical", self._num_pipeline, self._num_list),
        ("embarked", self._embarked_pipeline, ['Embarked']),
        ("name", NameSplitter.NameSplitter(), ['Name']),
        ("age", AgeSplitter.AgeSplitter(), ['Age']),
        ("cabin", self._cabin_pipeline, ['Cabin']),
        ("cat", self._cat_pipeline, self._cat_list),
    ])
def set_pipeline(self):
    time_features = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'))
    pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
    features_encoder = ColumnTransformer([
        ('distance', DistanceTransformer(**DIST_ARGS), list(DIST_ARGS.values())),
        ('time_features', time_features, ['pickup_datetime']),
        ('geohash', pipe_geohash, list(DIST_ARGS.values())),
    ])
    self.pipeline = Pipeline(steps=[
        ('features', features_encoder),
        ('rgs', self.get_estimator())])
def set_pipeline(self):
    memory = self.kwargs.get('pipeline_memory', None)
    dist = self.kwargs.get('distance_type', 'haversine')
    feateng_steps = self.kwargs.get('feateng', ['distance', 'time_features'])
    if memory:
        memory = mkdtemp()

    # Define feature engineering pipeline blocks here
    pipe_time_features = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'))
    pipe_distance = make_pipeline(
        DistanceTransformer(distance_type=dist, **DIST_ARGS), StandardScaler())
    pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
    pipe_direction = make_pipeline(Direction(), StandardScaler())
    pipe_distance_to_center = make_pipeline(DistanceToCenter(), StandardScaler())

    # Combine pipes
    feateng_blocks = [
        ('distance', pipe_distance, list(DIST_ARGS.values())),
        ('time_features', pipe_time_features, ['pickup_datetime']),
        # ('geohash', pipe_geohash, list(DIST_ARGS.values())),
        ('direction', pipe_direction, list(DIST_ARGS.values())),
        ('distance_to_center', pipe_distance_to_center, list(DIST_ARGS.values())),
    ]

    # Iterate over a copy: removing from a list while iterating skips elements
    for bloc in list(feateng_blocks):
        if bloc[0] not in feateng_steps:
            feateng_blocks.remove(bloc)

    features_encoder = ColumnTransformer(feateng_blocks, n_jobs=None,
                                         remainder="drop")

    self.pipeline = Pipeline(steps=[('features', features_encoder),
                                    ('rgs', self.get_estimator())],
                             memory=memory)

    if self.optimize:
        self.pipeline.steps.insert(
            -1, ['optimize_size', OptimizeSize(verbose=False)])
def _build_features(self, df):
    """Build features."""
    with Timer("Building timestamp features"):
        df = self._build_features_datetime(df)

    with Timer("Building str features"):
        df = self._build_features_str(df)

    label_cols = list(set(df.columns).intersection(set(LABEL_COLS)))
    df.loc[:, label_cols] = df.loc[:, label_cols].astype(str)

    # # Gives memory error
    # with Timer("Encoding with BackwardDifferenceEncoder"):
    #     backward_diff_cols = list(
    #         set(label_cols).intersection(
    #             set(HIGH_NUM_CAT + MEDIUM_NUM_CAT)))
    #     bd_encoder = ce.backward_difference.BackwardDifferenceEncoder(
    #         cols=backward_diff_cols, verbose=1)
    #     dftmp = bd_encoder.fit_transform(df)

    if self.hash_encode:
        with Timer("Encoding with HashingEncoder"):
            for col in ['RESOURCE_ID', 'RUE', 'VILLE']:
                hash_cols = list(set(label_cols).intersection(set([col])))
                hash_encoder = ce.HashingEncoder(
                    cols=hash_cols, n_components=8, verbose=1)
                dftmp = hash_encoder.fit_transform(df)
                newcols = dftmp.columns.difference(df.columns)
                dftmp = dftmp[newcols]
                dftmp.columns = 'hash_{}_'.format(col) + dftmp.columns
                df = pd.concat([df, dftmp], axis=1)

    if self.label_encode:
        # Any columns left over at the end get a simple label encoding:
        with Timer("Encoding remaining ones with LabelEncoder"):
            other_cols = df.columns.difference(
                df._get_numeric_data().columns).tolist()
            le = preprocessing.LabelEncoder()
            for col in other_cols:
                df.loc[:, col] = le.fit_transform(df[col])

    to_drop = list(set(df.columns).intersection(set(TIMESTAMP_COLS)))
    df = df.drop(columns=to_drop)
    return df
def feature_encoding(X_train, y_train, X_test, method='ordinal'):
    columns = list(X_train.select_dtypes(include=['object']).columns)
    if method == "ordinal":
        ce_binary = ce.OrdinalEncoder(cols=columns)
    elif method == "binary":
        ce_binary = ce.BinaryEncoder(cols=columns)
    elif method == "onehot":
        ce_binary = ce.OneHotEncoder(cols=columns)
    elif method == "basen":
        ce_binary = ce.BaseNEncoder(cols=columns)
    elif method == "hashing":
        ce_binary = ce.HashingEncoder(cols=columns)
    else:
        raise Exception("Wrong method chosen!")
    X_train = ce_binary.fit_transform(X_train, y_train)
    X_test = ce_binary.transform(X_test)
    return X_train.values, y_train, X_test.values
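# Minimal sketch exercising feature_encoding with the 'hashing' method
# (toy train/test frames assumed; only object-dtype columns are encoded).
import pandas as pd

_X_tr = pd.DataFrame({'cat': ['a', 'b', 'c', 'a'], 'num': [1, 2, 3, 4]})
_X_te = pd.DataFrame({'cat': ['b', 'c'], 'num': [5, 6]})
_y_tr = [0, 1, 0, 1]
_X_tr_np, _y_tr, _X_te_np = feature_encoding(_X_tr, _y_tr, _X_te, method='hashing')
print(_X_tr_np.shape)  # 'cat' replaced by 8 hash features plus 'num'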
def test_hashing(self):
    """Creates a dataset and encodes it with the hashing trick."""
    cols = ['C1', 'D', 'E', 'F']
    enc = encoders.HashingEncoder(verbose=1, n_components=128, cols=cols)
    X = self.create_dataset(n_rows=1000)
    X_test = enc.fit_transform(X, None)
    for dt in X_test.dtypes:
        numeric = False
        if dt == int or dt == float:
            numeric = True
        self.assertTrue(numeric)
def main():
    data_dir = '../data'

    # Load fighter data
    with open(f'{data_dir}/fighter_data_en.pkl', 'rb') as pklfile:
        fighter_data_en = pickle.load(pklfile)

    # Load fight card for prediction
    df_fight_card = pd.read_excel('../data/fight_card.xlsx', sheet_name='Sheet1')

    # Encode weight class: HashingEncoder emits col_0..col_7 by default
    enc = ce.HashingEncoder(cols=['weight_class'])
    fight_card_final = enc.fit_transform(df_fight_card)
    fight_card_final.rename(
        columns={'col_{}'.format(i): 'weight_class_col_{}'.format(i)
                 for i in range(8)},
        inplace=True)

    # Stats columns shared by both fighters
    stat_cols = ['height', 'weight', 'reach', 'wins', 'losses', 'draws',
                 'SLpM', 'Str_Acc', 'SApM', 'Str_Dep', 'TD_Avg', 'TD_Acc',
                 'TD_Def', 'Sub_Avg']

    # Get data for fighter1
    fight_card_final = fight_card_final.merge(
        fighter_data_en, left_on=['fighter1'], right_on=['full_name'],
        how='left').drop(columns=['full_name', 'fighter1'])
    renames = {'col_{}'.format(i): 'stance_col_{}_fighter1'.format(i)
               for i in range(8)}
    renames.update({c: c + '_fighter1' for c in stat_cols})
    fight_card_final.rename(columns=renames, inplace=True)

    # Get data for fighter2
    fight_card_final = fight_card_final.merge(
        fighter_data_en, left_on=['fighter2'], right_on=['full_name'],
        how='left').drop(columns=['full_name', 'fighter2'])
    renames = {'col_{}'.format(i): 'stance_col_{}_fighter2'.format(i)
               for i in range(8)}
    renames.update({c: c + '_fighter2' for c in stat_cols})
    fight_card_final.rename(columns=renames, inplace=True)

    # Persist results
    fight_card_final.to_csv(f'{data_dir}/fight_card_final.csv')
    with open(f'{data_dir}/fight_card_final.pkl', 'wb') as pklfile:
        pickle.dump(fight_card_final, pklfile)
def _get_model(self, X, y=None):
    params = dict(cols=self.columns_to_encode,
                  return_df=self.desired_output_type == DataTypes.DataFrame)

    if self.encoding_type == "dummy":
        return category_encoders.OneHotEncoder(use_cat_names=True, **params)

    elif self.encoding_type == "binary":
        return category_encoders.BinaryEncoder(**params)

    elif self.encoding_type == "basen":
        return category_encoders.BaseNEncoder(base=self.basen_base, **params)

    elif self.encoding_type == "hashing":
        return category_encoders.HashingEncoder(
            n_components=self.hashing_n_components, **params)

    # elif self.encoding_type == "helmer":
    #     return category_encoders.HelmertEncoder(**params)
    #
    # elif self.encoding_type == "polynomial":
    #     return category_encoders.PolynomialEncoder(**params)
    #
    # elif self.encoding_type == "sum_coding":
    #     return category_encoders.SumEncoder(**params)
    #
    # elif self.encoding_type == "backward_coding":
    #     return category_encoders.BackwardDifferenceEncoder(**params)
    # These also do not work well => they sometimes change the output size...

    # Remark: other categorical encoders not included:
    # * Target Encoder
    # * Leave One Out
    # Moreover, those encoders have a bug: fit_transform doesn't work
    # correctly. They use the target, and I'd rather know exactly what I'm
    # doing by using tailor-made classes.

    else:
        raise ValueError("Unknown 'encoding_type' : %s" % self.encoding_type)