class df_OrdinalEncoder(TransformerMixin):
    """Ordinal encoder wrapper whose ``transform`` returns a DataFrame.

    The wrapped ``OrdinalEncoder`` is created and fitted inside ``fit``.
    ``transform`` re-wraps the encoded values in a ``pd.DataFrame`` that
    reuses the index and column labels of the frame it was given.

    Parameters:
        handle_unknown: forwarded unchanged to ``OrdinalEncoder`` at fit time.
    """

    def __init__(self, handle_unknown='ignore'):
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Build a new encoder and fit it on ``X``.

        ``y`` is accepted but not used. Returns ``self``.
        """
        self.enc = OrdinalEncoder(handle_unknown=self.handle_unknown)
        self.enc.fit(X)
        return self

    def transform(self, X):
        """Encode ``X`` and return the result as a DataFrame.

        ``X`` must expose ``.index`` and ``.columns``; both are copied onto
        the returned frame.
        """
        return pd.DataFrame(
            data=self.enc.transform(X),
            index=X.index,
            columns=X.columns,
        )
def xgb_class(X_train, y_train, X_test, y_test):
    """
    Baseline XGB Classifier that prints out ROC score for Train and Test sets
    provided.

    The encoder and imputer are fitted on the training features only and are
    then applied, unchanged, to the test features so both sets share the same
    category codes and imputation statistics.

    Args:
        X_train: training features.
        y_train: training target.
        X_test: test features.
        y_test: test target.

    Returns:
        None. The class distribution of the combined target and the
        train/test ROC AUC scores are printed.
    """
    class_index = 1  # column of predict_proba used as the positive class

    encoder = OrdinalEncoder()
    imputer = SimpleImputer()

    # Fit the preprocessing on the training data only.
    encoder.fit(X_train)
    X_train_encoded = encoder.transform(X_train)
    X_train_imputed = imputer.fit_transform(X_train_encoded)

    # Re-use the fitted preprocessing for the test data. Re-fitting here would
    # assign different integer codes to the same categories and leak test
    # statistics into the imputer.
    # NOTE(review): how categories unseen in X_train are handled depends on
    # the encoder's unknown-value setting; confirm it suits this data.
    X_test_encoded = encoder.transform(X_test)
    X_test_imputed = imputer.transform(X_test_encoded)

    model = XGBClassifier(n_estimators=100, n_jobs=-1, max_depth=10)
    model.fit(X_train_imputed, y_train, eval_metric='auc')

    # Getting the predicted probabilities
    y_pred_proba_train = model.predict_proba(X_train_imputed)[:, class_index]
    y_pred_proba_test = model.predict_proba(X_test_imputed)[:, class_index]

    train_roc = roc_auc_score(y_train, y_pred_proba_train)
    test_roc = roc_auc_score(y_test, y_pred_proba_test)

    # Making a new Series for mean baseline print.
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    all_targets = pd.concat([pd.Series(y_train), pd.Series(y_test)])

    print('Mean Baseline of Target')
    print(all_targets.value_counts(normalize=True))
    print()
    print(f'Train ROC AUC for class: {train_roc} \n')
    print(f'Test ROC AUC for class: {test_roc}')
def fit_label(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """
    Fit an ordinal (label) encoder on the given DataFrame and encode it.

    NaN values, and any value equal to `na_value`, end up encoded as 0 in the
    returned frame, the same code used for the encoder's -1 output.

    Args:
        input_df: DataFrame used to fit the encoder (not modified; a copy is used)
        cols: List of categorical columns to be encoded
        na_value: Special value to treat as null; skipped when None

    Returns:
        result_df: encoded copy of input_df, `cols` cast to int
        model    : dict with keys "encoder", "cols" and "na_value", to be
                   passed to the `transform_label` method
    """
    working_df = input_df.copy()

    # Normalise the caller's custom null marker to NaN before fitting.
    if na_value is not None:
        for name in cols:
            working_df[name] = working_df[name].replace({na_value: np.nan})

    encoder = OrdinalEncoder(cols=cols).fit(working_df)

    # Force NaN to a dedicated code (-2) in every per-column mapping.
    for column_entry in encoder.mapping:
        column_entry["mapping"].loc[np.nan] = -2

    result_df = encoder.transform(working_df)

    # Collapse both special codes (-1 and -2) to 0 and store as integers.
    for name in cols:
        result_df[name] = result_df[name].replace({-1: 0, -2: 0}).astype(int)

    model = {"encoder": encoder, "cols": cols, "na_value": na_value}
    return result_df, model
def create_label(df, test_df, topic):
    """Build title features and per-column ordinal labels for one topic.

    Titles are passed through ``text_process``, vectorised with a
    ``TfidfVectorizer`` fitted on the training titles, and reduced with a
    500-component ``TruncatedSVD`` fitted on the training TF-IDF matrix.
    Every column of ``df`` other than ``itemid``, ``title`` and
    ``image_path`` is treated as a label and gets its own ``OrdinalEncoder``.

    Note: ``test_df['title']`` is overwritten in place with the processed
    titles; ``df`` is left untouched.

    Args:
        df: training frame with ``itemid``, ``title``, ``image_path`` and
            label columns.
        test_df: test frame with at least ``itemid`` and ``title``.
        topic: value interpolated into every key of the returned dict.

    Returns:
        dict keyed by ``itemid_train_<topic>``, ``itemid_test_<topic>``,
        ``X_train_<topic>``, ``X_encoder_<topic>``, ``X_decomposer_<topic>``,
        ``X_test_<topic>`` and, for each label column,
        ``Y_train_<topic>_<column>``, ``Y_encoder_<topic>_<column>``,
        ``Y_colname_<topic>_<column>``.
    """
    # --- training titles -> TF-IDF -> SVD -------------------------------
    train_titles = df[['title']].copy()
    label_df = df.drop(columns=['itemid', 'title', 'image_path']).copy()

    train_titles['title'] = train_titles['title'].apply(
        lambda title: text_process(title)
    )
    train_corpus = train_titles['title'].values.tolist()

    feature_encoder = TfidfVectorizer()
    feature_encoder.fit(train_corpus)
    train_tfidf = feature_encoder.transform(train_corpus)

    feature_decomposer = TruncatedSVD(500)
    feature_decomposer.fit(train_tfidf)
    feature_attr = feature_decomposer.transform(train_tfidf)

    # --- test titles, using the transformers fitted above ----------------
    test_df['title'] = test_df['title'].apply(lambda title: text_process(title))
    test_corpus = test_df['title'].values.tolist()
    test_attr = feature_decomposer.transform(
        feature_encoder.transform(test_corpus)
    )

    result_dict = {
        f'itemid_train_{topic}': df['itemid'],
        f'itemid_test_{topic}': test_df['itemid'],
        f'X_train_{topic}': feature_attr,
        f'X_encoder_{topic}': feature_encoder,
        f'X_decomposer_{topic}': feature_decomposer,
        f'X_test_{topic}': test_attr,
    }

    # --- one ordinal encoder per label column ----------------------------
    for column in label_df.columns:
        label_encoder = OrdinalEncoder(cols=[column], handle_unknown='impute')
        label_encoder.fit(label_df[[column]])
        label_attr = label_encoder.transform(label_df[[column]])
        result_dict.update({
            f'Y_train_{topic}_{column}': label_attr,
            f'Y_encoder_{topic}_{column}': label_encoder,
            f'Y_colname_{topic}_{column}': label_attr.columns,
        })

    return result_dict
def encode_ordinal_df(dataframe, fit=False):
    """
    Encode ordinal features, preserving the notion of order and dropping
    invariant features

    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features),
            ordinal features only
        fit: boolean
            When true, a new encoder is fitted on `dataframe` and pickled
            under the name 'ordinal_encoder'; otherwise the encoder stored
            under that name is loaded
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # Inference path: reuse the previously stored encoder.
    if not fit:
        stored_encoder = unpickle_obj('ordinal_encoder')
        return stored_encoder.transform(dataframe)

    # Training path: fit on every column, persist, then encode.
    new_encoder = OrdinalEncoder(
        cols=dataframe.columns.values,
        drop_invariant=True,
    )
    new_encoder.fit(dataframe)
    pickle_obj(new_encoder, 'ordinal_encoder')
    return new_encoder.transform(dataframe)
from skimpute import MissForest

# Load the data and start timing the imputation stage.
df = pd.read_csv("../exmaple/train_classification.csv")
start = time()

# Remove columns that are not used as features, then split off the target.
for unused_column in ("Name", "Ticket", "PassengerId"):
    df.pop(unused_column)
y = df.pop("Survived").values

# Single shuffled 75/25 train/test split with a fixed seed.
cv = ShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_ix, test_ix = next(cv.split(df, y))
train_X, test_X = df.iloc[train_ix, :], df.iloc[test_ix, :]
train_y, test_y = y[train_ix], y[test_ix]

# The imputer is fitted on the full frame and applied to each split.
imputer = MissForest()
imputer.fit(df)
train_X = imputer.transform(train_X)
test_X = imputer.transform(test_X)
print(train_X)
print(train_X.dtypes)
print(time() - start)

# The encoder is likewise fitted on the full frame before encoding the splits.
encoder = OrdinalEncoder()
encoder.fit(df)
train_X = encoder.transform(train_X)
test_X = encoder.transform(test_X)

# Fit a random forest and report accuracy on the held-out split.
rf = RandomForestClassifier(random_state=42)
rf.fit(train_X, train_y)
score = rf.score(test_X, test_y)
print(score)  # 0.8295964125560538
class OrdinalEncoder:
    """Maps each categorical value to one column using ordinal encoding.

    Thin wrapper around an ``Ordinal`` encoder that also keeps a list of
    feature objects in step with the encoded columns.

    Parameters:
        cols: [str]
            list of column names to encode.
    """
    name = 'ordinal'

    def __init__(self, cols=None):
        self.encoder = Ordinal(cols=cols)

    def fit(self, X, features, y=None):
        """Fits encoder to data table.

        ``y`` is accepted for signature compatibility; ``None`` is always
        forwarded to the wrapped encoder.

        returns self
        """
        self.encoder.fit(X, y=None)
        self.features = self.encode_features_list(X, features)
        return self

    def transform(self, X):
        """Encodes matrix and updates features accordingly.

        Column labels of the encoded frame are replaced by the names that
        the stored features report, in feature order.

        returns encoded matrix (dataframe)
        """
        X_new = self.encoder.transform(X)
        X_new.columns = [
            fname
            for feature in self.features
            for fname in feature.get_feature_names()
        ]
        return X_new

    def fit_transform(self, X, features, y=None):
        """First fits, then transforms matrix.

        returns encoded matrix (dataframe)
        """
        fitted = self.fit(X, features, y)
        return fitted.transform(X)

    def get_mapping(self, category):
        """Gets the mapping of the ordinal encoder for one column.

        ``category`` may be a column name (matched against each entry's
        ``'col'``) or a position in the encoder's mapping list. A name with
        no matching entry falls through to the positional lookup.

        returns mapping (dict)
        """
        if isinstance(category, str):
            for entry in self.encoder.mapping:
                if entry['col'] == category:
                    return entry['mapping']
        return self.encoder.mapping[category]['mapping']

    def encode_features_list(self, X, features):
        """Wrap every feature whose name is an encoded column.

        Matching features are replaced by ``ft.Feature`` built on an
        ``OrdinalEnc`` primitive carrying a running index; the rest are
        passed through unchanged. ``X`` is not used.

        returns list of features
        """
        encoded_features = []
        position = 0
        for feature in features:
            if feature.get_name() in self.encoder.cols:
                feature = ft.Feature(
                    [feature], primitive=OrdinalEnc(self, position)
                )
                position += 1
            encoded_features.append(feature)
        return encoded_features

    def get_features(self):
        """Returns the feature list stored by ``fit``."""
        return self.features

    def get_name(self):
        """Returns the encoder's short name."""
        return self.name
def fit(self, input_df):
    """Fit the ordinal encoder on ``self.whole_df`` and encode ``input_df``.

    Missing values in the ``self.cols`` columns of ``self.whole_df`` are
    replaced by the string ``'NAN'`` before fitting, so that missingness is
    learned as its own category. The fitted encoder is stored on
    ``self.oe``.

    Args:
        input_df: frame handed to ``self.transform`` once fitting is done.

    Returns:
        Whatever ``self.transform(input_df)`` returns.
    """
    # ``self.whole_df[self.cols]`` is a copy, so ``fillna(..., inplace=True)``
    # on it is discarded. Keep the filled frame and fit on that instead.
    filled_df = self.whole_df[self.cols].fillna('NAN')

    # 'impute' (previously misspelled 'inpute') is the unknown-value option
    # intended here.
    oe = OrdinalEncoder(cols=self.cols, handle_unknown='impute')
    oe.fit(filled_df)
    self.oe = oe
    return self.transform(input_df)
cat = OnnxSubEstimator(skl_ord, X, op_version=opv, output_names=operator.outputs[:1]) cat.add_to(scope, container) update_registered_converter(OrdinalEncoder, "CategoricalEncoderOrdinalEncoder", ordinal_encoder_shape_calculator, ordinal_encoder_converter) ################################### # Let's compute the output on a short example. enc = OrdinalEncoder(cols=[0, 1]) enc.fit(X) print(enc.transform(X[:5])) ################################### # Let's check the ONNX conversion produces the same results. ord_onx = to_onnx(enc, X[:1], target_opset=14) sess = InferenceSession(ord_onx.SerializeToString()) print(sess.run(None, {'X': X[:5]})[0]) ###################################### # That works. # # Custom converter for WOEEncoder # +++++++++++++++++++++++++++++++ #