class df_OrdinalEncoder(TransformerMixin):
    """Ordinal encoder wrapper whose ``transform`` returns a DataFrame.

    Encoding is delegated to an ``OrdinalEncoder``; the encoded values are
    then wrapped in a ``pd.DataFrame`` that reuses the index and column
    labels of the input.

    Parameters:
        handle_unknown: forwarded unchanged to ``OrdinalEncoder``.
    """

    def __init__(self, handle_unknown='ignore'):
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Create a fresh underlying encoder and fit it on ``X``.

        ``y`` is accepted but not used. Returns ``self``.
        """
        # Store the encoder before fitting so ``self.enc`` exists even if
        # fitting raises.
        self.enc = encoder = OrdinalEncoder(handle_unknown=self.handle_unknown)
        encoder.fit(X)
        return self

    def transform(self, X):
        """Encode ``X`` and return a DataFrame labelled like ``X``.

        ``X`` must expose ``.index`` and ``.columns``.
        """
        return pd.DataFrame(
            data=self.enc.transform(X),
            index=X.index,
            columns=X.columns,
        )
示例#2
0
    def xgb_class(X_train, y_train, X_test, y_test):
        """
        Baseline XGB Classifier that prints out the ROC AUC score for the
        Train and Test sets provided.

        The ordinal encoder and the imputer are fitted on the training set
        only and then applied to the test set, so both sets share the same
        category codes and imputation values.

        Args:
            X_train: training features.
            y_train: training target.
            X_test: test features.
            y_test: test target.

        Returns:
            None. Results are printed: the normalized value counts of the
            combined target, followed by the train and test ROC AUC.
        """
        # Column of ``predict_proba`` whose probabilities are scored.
        class_index = 1

        encoder = OrdinalEncoder()
        imputer = SimpleImputer()

        # Fit the preprocessing on the training data only.
        encoder.fit(X_train)
        X_train_encoded = encoder.transform(X_train)
        X_train_imputed = imputer.fit_transform(X_train_encoded)

        # Re-use the fitted transformers for the test data; re-fitting here
        # would encode the test set differently from the training set.
        X_test_encoded = encoder.transform(X_test)
        X_test_imputed = imputer.transform(X_test_encoded)

        model = XGBClassifier(n_estimators=100, n_jobs=-1, max_depth=10)

        model.fit(X_train_imputed, y_train, eval_metric='auc')

        # Getting the predicted probabilities
        y_pred_proba_train = model.predict_proba(X_train_imputed)[:, class_index]
        y_pred_proba_test = model.predict_proba(X_test_imputed)[:, class_index]

        train_roc = roc_auc_score(y_train, y_pred_proba_train)
        test_roc = roc_auc_score(y_test, y_pred_proba_test)

        # Combine both targets into one Series for the mean baseline print.
        # pd.concat is used because Series.append no longer exists in
        # pandas 2.x.
        combined_target = pd.concat([pd.Series(y_train), pd.Series(y_test)])

        print('Mean Baseline of Target')
        print(combined_target.value_counts(normalize=True))
        print()
        print(f'Train ROC AUC for class: {train_roc} \n')
        print(f'Test ROC AUC for class: {test_roc}')

        return
示例#3
0
def fit_label(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """
    Fit an ordinal (label) encoder on the given columns and encode the frame.

    The input is copied before any change is made. When `na_value` is given,
    every occurrence of it in `cols` is first turned into NaN. After fitting,
    the NaN entry of each column's mapping is set to -2, and the codes -1 and
    -2 are both collapsed to 0 in the encoded result (so, presumably, missing
    and unseen values share the code 0 -- confirm against the encoder's
    unknown-value code).

    Args:
        input_df: DataFrame used to fit the encoder
        cols: List of categorical columns to be encoded
        na_value: Special value to be treated like NaN; ignored when None

    Returns:
        result_df: encoded copy of input_df, with `cols` cast to int
        model : dict with keys "encoder", "cols" and "na_value", to be passed
            to the `transform_label` method
    """
    df = input_df.copy()

    # Normalise the caller's sentinel to NaN so that both are encoded alike.
    if na_value is not None:
        for col in cols:
            df[col] = df[col].replace({na_value: np.nan})

    encoder = OrdinalEncoder(cols=cols).fit(df)

    # Give NaN a dedicated code in every fitted column mapping.
    for column_mapping in encoder.mapping:
        column_mapping["mapping"].loc[np.nan] = -2

    result_df = encoder.transform(df)

    # Fold the two special codes into 0 and store the columns as integers.
    for col in cols:
        result_df[col] = result_df[col].replace({-1: 0, -2: 0}).astype(int)

    return result_df, {"encoder": encoder, "cols": cols, "na_value": na_value}
示例#4
0
def create_label(df, test_df, topic):
    """Build title features and per-column encoded labels for one topic.

    Titles are passed through ``text_process``, vectorised with a
    ``TfidfVectorizer`` and reduced with ``TruncatedSVD(500)``; both are
    fitted on the training titles only and then applied to the test titles.
    Every column of ``df`` other than ``itemid``, ``title`` and
    ``image_path`` is treated as a label and encoded with its own
    ``OrdinalEncoder``.

    Note that ``test_df['title']`` is overwritten in place with the processed
    titles, whereas ``df`` is left untouched.

    Args:
        df: training frame with ``itemid``, ``title``, ``image_path`` and the
            label columns.
        test_df: test frame with at least ``itemid`` and ``title``.
        topic: value interpolated into every key of the result.

    Returns:
        dict holding the train/test item ids, the train/test feature
        matrices, the fitted vectoriser and decomposer and, for each label
        column, the encoded labels, the fitted encoder and the encoded column
        names.
    """
    train_titles = df[['title']].copy()
    label_df = df.drop(columns=['itemid', 'title', 'image_path']).copy()

    # Training titles: clean, vectorise, then reduce dimensionality.
    train_titles['title'] = train_titles['title'].apply(
        lambda title: text_process(title))
    train_corpus = train_titles['title'].values.tolist()

    feature_encoder = TfidfVectorizer()
    feature_encoder.fit(train_corpus)
    train_tfidf = feature_encoder.transform(train_corpus)

    feature_decomposer = TruncatedSVD(500)
    feature_decomposer.fit(train_tfidf)
    train_attr = feature_decomposer.transform(train_tfidf)

    # Test titles: same cleaning, then the already-fitted transformers.
    test_df['title'] = test_df['title'].apply(
        lambda title: text_process(title))
    test_corpus = test_df['title'].values.tolist()
    test_attr = feature_decomposer.transform(
        feature_encoder.transform(test_corpus))

    result_dict = {
        f'itemid_train_{topic}': df['itemid'],
        f'itemid_test_{topic}': test_df['itemid'],
        f'X_train_{topic}': train_attr,
        f'X_encoder_{topic}': feature_encoder,
        f'X_decomposer_{topic}': feature_decomposer,
        f'X_test_{topic}': test_attr,
    }

    # One independent encoder per label column.
    for column in label_df.columns:
        label_encoder = OrdinalEncoder(cols=[column], handle_unknown='impute')
        label_encoder.fit(label_df[[column]])
        label_attr = label_encoder.transform(label_df[[column]])

        result_dict[f'Y_train_{topic}_{column}'] = label_attr
        result_dict[f'Y_encoder_{topic}_{column}'] = label_encoder
        result_dict[f'Y_colname_{topic}_{column}'] = label_attr.columns

    return result_dict
def encode_ordinal_df(dataframe, fit=False):
    """
    Encode ordinal features, preserving the notion of order and dropping invariant features
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), ordinal features only
        fit: boolean
            Indicates if we should train or load an encoder.
            When True, a new encoder is fitted on every column of `dataframe`
            and stored under the name 'ordinal_encoder'; when False, the
            encoder previously stored under that name is loaded instead.
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # Train or load an encoder
    if fit:
        encoder = OrdinalEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)

        # Persist the fitted encoder so later calls with fit=False reuse it.
        pickle_obj(encoder, 'ordinal_encoder')
    else:
        # NOTE(review): presumably backed by pickle -- only load encoder
        # files from a trusted location.
        encoder = unpickle_obj('ordinal_encoder')

    # transform data
    return encoder.transform(dataframe)
示例#6
0
from skimpute import MissForest

# End-to-end example: impute missing values with MissForest, ordinal-encode
# the result and score a random forest on a single hold-out split.
df = pd.read_csv("../exmaple/train_classification.csv")
start = time()  # start of the timed section (reported after imputation)
# pop() removes each column from df in place; the returned Series is discarded.
df.pop("Name")
df.pop("Ticket")
df.pop("PassengerId")
# "Survived" is the classification target; keep it as a plain array.
y = df.pop("Survived").values
# A single shuffled split with 25% of the rows held out for testing.
cv = ShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_ix, test_ix = next(cv.split(df, y))
train_X = (df.iloc[train_ix, :])
train_y = y[train_ix]
test_X = (df.iloc[test_ix, :])
test_y = y[test_ix]
imputer = MissForest()
# NOTE(review): the imputer is fitted on the full frame, i.e. on the test
# rows as well as the training rows.
imputer.fit(df)
train_X = imputer.transform(train_X)
test_X = imputer.transform(test_X)
print(train_X)
print(train_X.dtypes)
print(time() - start)  # seconds elapsed since `start`
encoder = OrdinalEncoder()
# NOTE(review): the encoder is fitted on the original (un-imputed) frame,
# then applied to the imputed train/test data.
encoder.fit(df)
train_X = encoder.transform(train_X)
test_X = encoder.transform(test_X)
rf = RandomForestClassifier(random_state=42)
rf.fit(train_X, train_y)
score = rf.score(test_X, test_y)
print(score)  # 0.8295964125560538
class OrdinalEncoder:
    """Maps each categorical value to one column using ordinal encoding.

    Wraps an ``Ordinal`` encoder and keeps a list of features whose names
    are used to label the encoded columns.

    Parameters:
        cols: [str]
            list of column names to encode.
    """
    name = 'ordinal'

    def __init__(self, cols=None):
        self.encoder = Ordinal(cols=cols)

    def fit(self, X, features, y=None):
        """Fits the wrapped encoder to the data table and records features.

        ``y`` is accepted but the wrapped encoder is always fitted without it.
        returns self
        """
        self.encoder.fit(X, y=None)
        self.features = self.encode_features_list(X, features)
        return self

    def transform(self, X):
        """Encodes the matrix and relabels its columns from the features.

        returns encoded matrix (dataframe)
        """
        encoded = self.encoder.transform(X)
        # Flatten the names of all recorded features, in order.
        column_names = []
        for feature in self.features:
            column_names.extend(feature.get_feature_names())
        encoded.columns = column_names
        return encoded

    def fit_transform(self, X, features, y=None):
        """Fits on ``X`` and ``features``, then transforms ``X``.

        returns encoded matrix (dataframe)
        """
        fitted = self.fit(X, features, y)
        return fitted.transform(X)

    def get_mapping(self, category):
        """Looks up the mapping of one encoded column.

        ``category`` is either a column name (str), matched against the
        'col' entry of each mapping, or a position in the encoder's list of
        mappings. A name with no match falls through to positional lookup.
        returns mapping (dict)
        """
        mappings = self.encoder.mapping
        if isinstance(category, str):
            for entry in mappings:
                if entry['col'] == category:
                    return entry['mapping']
        return mappings[category]['mapping']

    def encode_features_list(self, X, features):
        """Wraps every feature named in the encoder's ``cols``.

        Matching features are replaced by a new feature built on the
        ``OrdinalEnc`` primitive and numbered in order of appearance; all
        other features are passed through unchanged. ``X`` is not used.
        returns list of features
        """
        encoded_features = []
        next_index = 0
        for feature in features:
            if feature.get_name() in self.encoder.cols:
                feature = ft.Feature(
                    [feature], primitive=OrdinalEnc(self, next_index))
                next_index += 1
            encoded_features.append(feature)
        return encoded_features

    def get_features(self):
        """returns the feature list recorded by the last fit"""
        return self.features

    def get_name(self):
        """returns the encoder name ('ordinal')"""
        return self.name
 def fit(self, input_df):
     """Fit the ordinal encoder on ``self.whole_df`` and encode ``input_df``.

     Missing values in the encoded columns of ``self.whole_df`` are replaced
     by the string 'NAN' before fitting. The fitted encoder is stored on
     ``self.oe``.

     Args:
         input_df: frame handed to ``self.transform`` once the encoder is
             fitted.

     Returns:
         Whatever ``self.transform(input_df)`` returns.
     """
     # Assign the filled columns back: fillna(inplace=True) on the
     # ``whole_df[cols]`` selection only fills a temporary copy and leaves
     # ``whole_df`` unchanged.
     self.whole_df[self.cols] = self.whole_df[self.cols].fillna('NAN')
     # 'impute' is the intended option name (the previous 'inpute' matched no
     # option) -- TODO confirm against the installed encoder version.
     oe = OrdinalEncoder(cols=self.cols, handle_unknown='impute')
     oe.fit(self.whole_df[self.cols])
     self.oe = oe
     return self.transform(input_df)
示例#9
0
    cat = OnnxSubEstimator(skl_ord,
                           X,
                           op_version=opv,
                           output_names=operator.outputs[:1])
    cat.add_to(scope, container)


# Associate OrdinalEncoder with its shape calculator and converter under the
# alias "CategoricalEncoderOrdinalEncoder".
update_registered_converter(OrdinalEncoder, "CategoricalEncoderOrdinalEncoder",
                            ordinal_encoder_shape_calculator,
                            ordinal_encoder_converter)

###################################
# Let's compute the output on a short example.

# Encode columns 0 and 1, then show the encoded first five rows.
enc = OrdinalEncoder(cols=[0, 1])
enc.fit(X)
print(enc.transform(X[:5]))

###################################
# Let's check the ONNX conversion produces the same results.

# Convert using the first row of X as sample input, then run the same five
# rows through the ONNX model; the first output is printed for comparison.
ord_onx = to_onnx(enc, X[:1], target_opset=14)
sess = InferenceSession(ord_onx.SerializeToString())
print(sess.run(None, {'X': X[:5]})[0])

######################################
# That works.
#
# Custom converter for WOEEncoder
# +++++++++++++++++++++++++++++++
#