def test_custom_ordinal_time_comparison(X=None, iterations=10, verbose=1):
    """Benchmark CustomOrdinalFeatureEncoder against sklearn's OrdinalEncoder.

    Args:
        X: 2-D array-like of categorical data; a small built-in sample is
            used when None.
        iterations: number of timed fit/transform/inverse_transform rounds.
        verbose: print the mean timings when truthy.

    Returns:
        Tuple ``(custom_mean_seconds, ordinal_mean_seconds)``.
    """
    # BUG FIX: `if not X` raises "truth value of an array is ambiguous" for a
    # non-empty numpy array argument; test identity against None instead.
    if X is None:
        X = np.array([
            ["P", "+"],
            ["P2", "-"],
            ["P3", "-"],
        ])
    custom_encoder = CustomOrdinalFeatureEncoder()
    ordinal_encoder = OrdinalEncoder()
    ordinal_encoder_time = []
    custom_encoder_time = []
    for _ in range(iterations):
        # Time a full fit/transform/inverse round trip for each encoder.
        ts = time()
        custom_encoder.fit(X)
        transformed = custom_encoder.transform(X)
        custom_encoder.inverse_transform(transformed)
        custom_encoder_time.append(time() - ts)

        ts = time()
        ordinal_encoder.fit(X)
        transformed = ordinal_encoder.transform(X)
        ordinal_encoder.inverse_transform(transformed)
        ordinal_encoder_time.append(time() - ts)
    custom_encoder_time = np.mean(custom_encoder_time)
    ordinal_encoder_time = np.mean(ordinal_encoder_time)
    if verbose:
        print(f"CustomEncoder -> Time: {custom_encoder_time}")
        print(f"OrdinalEncoder -> Time: {ordinal_encoder_time}")
    return custom_encoder_time, ordinal_encoder_time
def test_ordinal_encoder_inverse():
    """Round-trip encode/decode and validate the bad-shape error path."""
    data = [['abc', 2, 55], ['def', 1, 55]]
    encoder = OrdinalEncoder()
    encoded = encoder.fit_transform(data)
    expected = np.array(data, dtype=object)
    assert_array_equal(encoder.inverse_transform(encoded), expected)

    # A transformed array with the wrong number of columns must raise.
    wrong_shape = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    expected_msg = re.escape('Shape of the passed X data is not correct')
    with pytest.raises(ValueError, match=expected_msg):
        encoder.inverse_transform(wrong_shape)
def make_prediction():
    """Train LogisticRegression on Train.csv, predict `netgain` for Test.csv,
    and write id/netgain pairs to Results.csv.
    """
    feature_encoder = OrdinalEncoder()
    target_encoder = OrdinalEncoder()

    train_data = pd.read_csv("Train.csv")
    test_data = pd.read_csv("Test.csv")

    train_cols = [col for col in train_data.columns if col != "netgain"]
    X = feature_encoder.fit_transform(train_data[train_cols]).astype("int")
    Y = target_encoder.fit_transform(
        train_data["netgain"].values.reshape(-1, 1)).astype("int")

    model = LogisticRegression()
    # fitting model with prediction data and telling it my target
    model.fit(X, Y)

    # BUG FIX: the test features must be transformed with the SAME encoder
    # fitted on the training data; fitting a separate encoder on the test set
    # (as before) assigns inconsistent integer codes to the categories.
    # NOTE(review): assumes Test.csv contains exactly the training feature
    # columns -- confirm.
    test_X = feature_encoder.transform(test_data[train_cols]).astype("int")

    test_data["netgain"] = target_encoder.inverse_transform(
        model.predict(test_X).astype("int").reshape(-1, 1))
    # index=False prevents the row-number column from being written, so the
    # old re-read-and-drop-first-column step is no longer needed.
    test_data[["id", "netgain"]].to_csv("Results.csv", index=False)
def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
    """Check ordinal encoder is compatible with pandas categorical columns."""
    if pd_nan_type == 'pd.NA':
        # pd.NA only exists from pandas 1.0 onwards.
        pd = pytest.importorskip('pandas', minversion="1.0")
        missing = pd.NA
    else:
        pd = pytest.importorskip('pandas')
        missing = np.nan

    frame = pd.DataFrame({
        'col1': pd.Series(['c', 'a', missing, 'b', 'a'], dtype='category'),
    })

    encoder = OrdinalEncoder().fit(frame)
    assert len(encoder.categories_) == 1
    assert_array_equal(encoder.categories_[0][:3], ['a', 'b', 'c'])
    # The missing marker is stored as the last learned category.
    assert np.isnan(encoder.categories_[0][-1])

    transformed = encoder.transform(frame)
    assert_allclose(transformed, [[2.0], [0.0], [np.nan], [1.0], [0.0]])

    recovered = encoder.inverse_transform(transformed)
    assert recovered.shape == (5, 1)
    assert_array_equal(recovered[:2, 0], ['c', 'a'])
    assert_array_equal(recovered[3:, 0], ['b', 'a'])
    assert np.isnan(recovered[2, 0])
def set_miss_values(df, complete_index):
    """Fill NaNs of one column by predicting them from the other columns.

    Args:
        df: DataFrame modified in place (also returned).
        complete_index: list of column names; ``complete_index[0]`` is the
            column whose missing values are imputed, the remaining entries
            are the feature columns.

    Returns:
        df with ``df[complete_index[0]]``'s NaNs replaced by predictions.
    """
    enc_label = OrdinalEncoder()
    enc_fea = OrdinalEncoder()
    missing_index = complete_index[0]
    # Take out the existing numerical data (no NaN) and throw them in Random Forest Regressor
    train_df = df[complete_index]
    # known & unknow values
    known_values = np.array(train_df[train_df[missing_index].notnull()])
    unknow_values = np.array(train_df[train_df[missing_index].isnull()])
    # y is the know missing_index
    y = known_values[:, 0].reshape(-1, 1)
    enc_label.fit(y)
    y = enc_label.transform(y)
    # X are the features
    X = known_values[:, 1:]
    test_X = unknow_values[:, 1:]
    # Fit the feature encoder on the union of known and unknown rows so
    # categories seen only in the to-be-imputed rows don't raise on transform.
    all_X = np.row_stack((X, test_X))
    enc_fea.fit(all_X)
    X = enc_fea.transform(X)
    # fit
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y.ravel())
    # predict
    predicted_values = rfr.predict(enc_fea.transform(unknow_values[:, 1:]))
    # NOTE(review): the regressor outputs continuous values, while the label
    # encoder learned integer codes -- confirm that inverse_transform maps
    # non-integer predictions to the intended category.
    predicted_values = enc_label.inverse_transform(predicted_values.reshape(-1, 1))
    # fill in with predicted values
    df.loc[(df[missing_index].isnull()), missing_index] = predicted_values
    return df
def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
    """Check ordinal encoder is compatible with pandas."""
    pd = pytest.importorskip("pandas")
    missing = pd.NA if pd_nan_type == "pd.NA" else np.nan

    frame = pd.DataFrame(
        {
            "col1": pd.Series(["c", "a", missing, "b", "a"], dtype="category"),
        }
    )

    encoder = OrdinalEncoder().fit(frame)
    assert len(encoder.categories_) == 1
    assert_array_equal(encoder.categories_[0][:3], ["a", "b", "c"])
    # The missing marker is kept as the last learned category.
    assert np.isnan(encoder.categories_[0][-1])

    transformed = encoder.transform(frame)
    assert_allclose(transformed, [[2.0], [0.0], [np.nan], [1.0], [0.0]])

    recovered = encoder.inverse_transform(transformed)
    assert recovered.shape == (5, 1)
    assert_array_equal(recovered[:2, 0], ["c", "a"])
    assert_array_equal(recovered[3:, 0], ["b", "a"])
    assert np.isnan(recovered[2, 0])
class SklearnEncoder(object):
    """Unify LabelEncoder / OneHotEncoder / OrdinalEncoder behind one API.

    LabelEncoder takes its data as ``y`` while the 2-D encoders take ``X``;
    every method below hides that difference from callers.
    """

    def __init__(self, encoder_type):
        self.encoder_type = encoder_type
        if self.encoder_type == "Label":
            self.encoder_module = LabelEncoder()
        elif self.encoder_type == "OneHot":
            self.encoder_module = OneHotEncoder()
        elif self.encoder_type == "Ordinal":  # ordinal encoding
            self.encoder_module = OrdinalEncoder()
        else:
            # Previously an unknown type left encoder_module unset, deferring
            # the failure to the first _fit/_transform call; fail fast instead.
            raise ValueError(f"Unknown encoder_type: {encoder_type!r}")

    def _fit(self, x, y=None):
        if self.encoder_type == "Label":
            self.encoder_module.fit(y=x)
        else:
            self.encoder_module.fit(X=x, y=y)

    def _transform(self, x):
        if self.encoder_type == "Label":
            return self.encoder_module.transform(y=x)
        return self.encoder_module.transform(X=x)

    def _fit_transform(self, x, y=None):
        if self.encoder_type == "Label":
            return self.encoder_module.fit_transform(y=x)
        return self.encoder_module.fit_transform(X=x, y=y)

    def _reversal(self, x):
        # Exact inverse of _transform.
        # BUG FIX: LabelEncoder.inverse_transform takes its argument as ``y``,
        # not ``X``; the old unconditional ``X=x`` call raised TypeError for
        # the "Label" encoder type.
        if self.encoder_type == "Label":
            return self.encoder_module.inverse_transform(y=x)
        return self.encoder_module.inverse_transform(X=x)
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """
    A Transformer to one hot-encode a given category of data
    """

    def __init__(self):
        self.categories_ = None
        self._one_hot_encoder = OneHotEncoder(sparse=False, categories='auto')
        self._ordinal_encoder = OrdinalEncoder()

    def fit(self, X, y=None):  # pylint: disable=invalid-name,unused-argument
        """No-op; present only so the class satisfies the sklearn estimator API.

        :param X:
        :param y:
        :return: self
        """
        return self

    def transform(self, X):  # pylint: disable=invalid-name,no-self-use
        """Encode every column of X as one-hot lists.

        The ordinal pass learns the per-column categories (exposed through
        ``categories_``); the one-hot pass then expands the integer codes.

        :param X:
        :return: one-hot encoded array
        """
        codes = self._ordinal_encoder.fit_transform(X)
        self.categories_ = self._ordinal_encoder.categories_
        return self._one_hot_encoder.fit_transform(codes)

    def inverse_transform(self, X):  # pylint: disable=invalid-name,no-self-use
        """Map one-hot encoded rows back to the original category values."""
        codes = self._one_hot_encoder.inverse_transform(X)
        return self._ordinal_encoder.inverse_transform(codes)
def impute_cat_column(y: pd.Series, X: pd.DataFrame) -> pd.Series:
    """Impute missing values of a categorical pandas Series using a catboost
    classifier.

    Missing values of categorical features in X are imputed using their mode.

    Args:
        y (pd.Series): Series for which to impute missing values
        X (pd.DataFrame): Features to use for imputation

    Returns:
        pd.Series: y with missing values imputed
    """
    # BUG FIX: work on a copy -- the original wrote the mode-filled columns
    # back into the caller's DataFrame as a side effect.
    X = X.copy()
    cat_features = X.select_dtypes('object').columns
    X[cat_features] = X[cat_features].fillna(X[cat_features].mode().iloc[0])

    idx_valid = y.notnull()
    y_valid = y[idx_valid]
    is_object_dtype = y.dtype == 'O'
    if is_object_dtype:
        # Object-dtype targets are integer-encoded for the classifier and
        # decoded back after prediction.
        enc = OrdinalEncoder()
        y_valid = enc.fit_transform(y_valid.values.reshape(-1, 1))

    model = cb.CatBoostClassifier()
    model.fit(X[idx_valid], y_valid, cat_features=cat_features, verbose=0)
    y_pred = model.predict(X[~idx_valid])
    if is_object_dtype:
        y_pred = enc.inverse_transform(y_pred)

    # Silence the chained-assignment warning only for this write;
    # option_context restores the previous setting even if it raises.
    with pd.option_context('mode.chained_assignment', None):
        y.loc[~idx_valid] = y_pred.reshape(-1)
    return y
def test_ordinal_encoder_sparse():
    """Check that we raise proper error with sparse input in OrdinalEncoder.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19878
    """
    dense = np.array([[3, 2, 1], [0, 1, 1]])
    as_sparse = sparse.csr_matrix(dense)
    encoder = OrdinalEncoder()

    expected_msg = "A sparse matrix was passed, but dense data is required"
    # Both fitting entry points reject sparse input the same way.
    for fit_method in (encoder.fit, encoder.fit_transform):
        with pytest.raises(TypeError, match=expected_msg):
            fit_method(as_sparse)

    transformed = encoder.fit_transform(dense)
    with pytest.raises(TypeError, match=expected_msg):
        encoder.inverse_transform(sparse.csr_matrix(transformed))
def test_ordinal_encoder_inverse():
    # Round-trip: inverse_transform must recover the exact input values.
    data = [['abc', 2, 55], ['def', 1, 55]]
    encoder = OrdinalEncoder()
    encoded = encoder.fit_transform(data)
    expected = np.array(data, dtype=object)
    assert_array_equal(encoder.inverse_transform(encoded), expected)

    # A transformed array with the wrong number of columns must raise.
    wrong_shape = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    expected_msg = re.escape('Shape of the passed X data is not correct')
    assert_raises_regex(ValueError, expected_msg,
                        encoder.inverse_transform, wrong_shape)
def test_ordinal_encoder_handle_unknowns_string():
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                             unknown_value=-2)
    X_train = np.array([['a', 'x'], ['b', 'y'], ['c', 'z']], dtype=object)
    X_unknown = np.array([['c', 'xy'], ['bla', 'y'], ['a', 'x']], dtype=object)
    encoder.fit(X_train)

    # Unknown categories are mapped to the sentinel -2.
    encoded = encoder.transform(X_unknown)
    expected = np.array([[2, -2], [-2, 1], [0, 0]], dtype='int64')
    assert_array_equal(encoded, expected)

    # The sentinel inverts to None: the original value cannot be recovered.
    recovered = encoder.inverse_transform(encoded)
    expected_inv = np.array([['c', None], [None, 'y'], ['a', 'x']],
                            dtype=object)
    assert_array_equal(recovered, expected_inv)
def test_ordinal_encoder_handle_unknowns_numeric(dtype):
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value",
                             unknown_value=-999)
    X_train = np.array([[1, 7], [2, 8], [3, 9]], dtype=dtype)
    X_unknown = np.array([[3, 12], [23, 8], [1, 7]], dtype=dtype)
    encoder.fit(X_train)

    # Unknown categories are mapped to the sentinel -999.
    encoded = encoder.transform(X_unknown)
    expected = np.array([[2, -999], [-999, 1], [0, 0]], dtype="int64")
    assert_array_equal(encoded, expected)

    # The sentinel inverts to None: the original value cannot be recovered.
    recovered = encoder.inverse_transform(encoded)
    expected_inv = np.array([[3, None], [None, 8], [1, 7]], dtype=object)
    assert_array_equal(recovered, expected_inv)
def test_ordinal_encoder_handle_unknowns_string():
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value",
                             unknown_value=-2)
    X_train = np.array([["a", "x"], ["b", "y"], ["c", "z"]], dtype=object)
    X_unknown = np.array([["c", "xy"], ["bla", "y"], ["a", "x"]], dtype=object)
    encoder.fit(X_train)

    # Categories unseen during fit are encoded with the sentinel -2.
    encoded = encoder.transform(X_unknown)
    expected = np.array([[2, -2], [-2, 1], [0, 0]], dtype="int64")
    assert_array_equal(encoded, expected)

    # The sentinel round-trips to None because the true value is unknown.
    recovered = encoder.inverse_transform(encoded)
    expected_inv = np.array([["c", None], [None, "y"], ["a", "x"]],
                            dtype=object)
    assert_array_equal(recovered, expected_inv)
def test_ordinal_encoder_passthrough_missing_values_float():
    """Test ordinal encoder with nan on float dtypes."""
    data = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T
    encoder = OrdinalEncoder().fit(data)

    assert len(encoder.categories_) == 1
    # np.nan is learned as its own (last) category.
    assert_allclose(encoder.categories_[0], [1.0, 3.0, np.nan])

    encoded = encoder.transform(data)
    assert_allclose(encoded, [[np.nan], [1.0], [0.0], [1.0]])

    # nan passes through the full round trip untouched.
    assert_allclose(encoder.inverse_transform(encoded), data)
class DataEncoder(object):
    """Wrapper supporting three sklearn encodings: one-hot, label and ordinal."""

    def __init__(self, encoder_type):
        assert encoder_type in {"one_hot", "label", "Ordinal"}
        self.encoder_type = encoder_type
        if self.encoder_type == "one_hot":
            # One-hot (dummy) encoding of categories.
            self.encodermodule = OneHotEncoder(categories='auto', drop=None,
                                               sparse=True, dtype=np.float64,
                                               handle_unknown='error')
            # categories: "auto" or an explicit list of categories.
            # drop: {'first', 'if_binary'}, None, or array[i] to drop the i-th
            #   category; 'first' drops the first category of every feature
            #   (useful for binary features).
            # sparse: return a sparse matrix, otherwise a dense array.
            # handle_unknown: {'error', 'ignore'}, default='error'.
        elif self.encoder_type == "label":
            self.encodermodule = LabelEncoder()
        elif self.encoder_type == "Ordinal":
            # Ordinal (integer) encoding; `categories` behaves as in OneHot.
            self.encodermodule = OrdinalEncoder(categories="auto",
                                                dtype=np.float64)
        else:
            raise ValueError("please select a correct encoder_type")

    def fit_transform(self, data):
        return self.encodermodule.fit_transform(data)

    def fit(self, data):
        self.encodermodule.fit(data)

    def transform(self, data):
        # BUG FIX: the transformed result was computed but never returned,
        # so this method always yielded None.
        return self.encodermodule.transform(data)

    def set_params(self, params):
        self.encodermodule.set_params(**params)

    def get_params(self):
        return self.encodermodule.get_params(deep=True)

    def inverse_transform(self, data):
        return self.encodermodule.inverse_transform(data)

    def get_classes(self):
        assert self.encoder_type in {"label"}
        return self.encodermodule.classes_

    def get_category(self):
        assert self.encoder_type in {"one_hot", "Ordinal"}
        return self.encodermodule.categories_  # list of arrays, one per column

    def get_feature_names(self, output_feature):
        # Names of the expanded output feature columns (one-hot only).
        assert self.encoder_type in {"one_hot"}
        return self.encodermodule.get_feature_names(output_feature)
class DecoderXGBoost(BaseEstimator, TransformerMixin):
    """Decode class-probability rows back into the original profile labels."""

    def __init__(self):
        # Encoder mapping the class names to the numeric values that the
        # classifier interprets; fitted on the fixed label vocabulary.
        self.enc = OrdinalEncoder()
        self.enc.fit([['EXCELENTE'], ['MUITO_BOM'], ['HUMANAS'], ['EXATAS'],
                      ['DIFICULDADE']])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Pick the best-scoring class index for every row, shaped as a
        # column vector, then map the indices back to the original
        # (object dtype) PERFIL labels.
        winners = np.asarray([[np.argmax(row) for row in X]]).transpose()
        return self.enc.inverse_transform(winners)
class CatSklearnAttacker(PrivacyAttackerModel):
    """Base class for categorical attacker based on sklearn models.

    Attributes:
        key_type (CategoricalType): Required key attribute type (class_num or
            one_hot) by the learner.
        sensitive_type (CategoricalType): Required sensitive attribute type
            (class_num or one_hot) by the learner.
        skl_learner (Class): A (wrapped) sklearn classifier class that can be
            called with no arguments.
    """
    # Subclasses override these three class attributes.
    KEY_TYPE = None
    SENSITIVE_TYPE = None
    SKL_LEARNER = None

    def __init__(self):
        self.predictor = self.SKL_LEARNER()
        # CLASS_NUM attributes use integer codes (OrdinalEncoder); anything
        # else is one-hot encoded.
        self.key_processor = OrdinalEncoder() if self.KEY_TYPE == CategoricalType.CLASS_NUM \
            else OneHotEncoder()
        self.sensitive_processor = OrdinalEncoder() if \
            self.SENSITIVE_TYPE == CategoricalType.CLASS_NUM else OneHotEncoder()

    def fit(self, synthetic_data, key, sensitive):
        """Fit the encoders and the predictor on the synthetic table.

        NOTE(review): `allow_nan` presumably substitutes NaNs with a sentinel
        so the encoders accept the data -- confirm against its definition.
        """
        key_table = allow_nan(synthetic_data[key])
        sensitive_table = allow_nan(synthetic_data[sensitive])
        self.key_processor.fit(key_table)
        self.sensitive_processor.fit(sensitive_table)
        key_train = self.key_processor.transform(key_table)
        sensitive_train = self.sensitive_processor.transform(sensitive_table)
        self.predictor.fit(key_train, sensitive_train)

    def predict(self, key_data):
        """Predict the sensitive attributes for one record of key attributes,
        returning them in original format, or None for unseen key values."""
        keys = allow_nan_array(key_data)  # de-nan key attributes
        try:
            # key attributes in ML ready format
            keys_transform = self.key_processor.transform([keys])
        except ValueError:
            # Some attributes of the input haven't appeared in synthetic tables
            return None
        sensitive_pred = self.predictor.predict(keys_transform)
        if len(np.array(sensitive_pred).shape) == 1:
            # Normalize a 1-D prediction to a single row so that
            # inverse_transform always receives a 2-D input.
            sensitive_pred = [sensitive_pred]
        # predicted sensitive attributes in original format
        sensitives = self.sensitive_processor.inverse_transform(sensitive_pred)
        return tuple(sensitives[0])
def _fit_resample(self, X, y):
    """Oversample categorical data by synthesizing new samples per class.

    Encodes the categorical features as integers, measures sample distances
    with the Value Difference Metric, and generates synthetic neighbors for
    every class listed in ``self.sampling_strategy_``.
    """
    # FIXME: to be removed in 0.12
    if self.n_jobs is not None:
        warnings.warn(
            "The parameter `n_jobs` has been deprecated in 0.10 and will be "
            "removed in 0.12. You can pass an nearest neighbors estimator where "
            "`n_jobs` is already set instead.",
            FutureWarning,
        )
    self._validate_estimator()

    # Start from copies of the original data; synthetic batches are appended.
    X_resampled = [X.copy()]
    y_resampled = [y.copy()]

    # Integer-encode the categories so ValueDifferenceMetric can work on codes.
    encoder = OrdinalEncoder(dtype=np.int32)
    X_encoded = encoder.fit_transform(X)
    vdm = ValueDifferenceMetric(
        n_categories=[len(cat) for cat in encoder.categories_]).fit(X_encoded, y)

    for class_sample, n_samples in self.sampling_strategy_.items():
        if n_samples == 0:
            continue
        target_class_indices = np.flatnonzero(y == class_sample)
        X_class = _safe_indexing(X_encoded, target_class_indices)

        X_class_dist = vdm.pairwise(X_class)
        self.nn_k_.fit(X_class_dist)
        # the kneigbors search will include the sample itself which is
        # expected from the original algorithm
        nn_indices = self.nn_k_.kneighbors(X_class_dist, return_distance=False)
        X_new, y_new = self._make_samples(X_class, class_sample, y.dtype,
                                          nn_indices, n_samples)

        # Map the synthetic integer codes back to the original categories.
        X_new = encoder.inverse_transform(X_new)
        X_resampled.append(X_new)
        y_resampled.append(y_new)

    X_resampled = np.vstack(X_resampled)
    y_resampled = np.hstack(y_resampled)

    return X_resampled, y_resampled
def best_features(train, test, perc):
    """Keep only the top `perc` percentile of features by ANOVA F-score.

    Args:
        train: training DataFrame containing a 'target' column; modified in
            place (low-scoring columns dropped).
        test: test DataFrame; the same columns are dropped in place.
        perc: percentile of features to keep (passed to SelectPercentile).

    Returns:
        The (train, test) pair after dropping the rejected columns.
    """
    # Temporarily integer-encode the string columns so f_classif can score them.
    temp_trans = OrdinalEncoder(dtype='int')
    train[['protocol_type', 'service', 'flag', 'target']] = temp_trans.fit_transform(
        train[['protocol_type', 'service', 'flag', 'target']])
    trans = SelectPercentile(f_classif, percentile=perc)
    trans.fit(train.drop('target', axis='columns'), train['target'])
    # Restore the original string values after scoring.
    train[['protocol_type', 'service', 'flag', 'target']] = temp_trans.inverse_transform(
        train[['protocol_type', 'service', 'flag', 'target']])
    eliminated_columns = trans.get_support()
    bad_features = []
    # NOTE(review): get_support() indexes the columns of the frame WITHOUT
    # 'target' (it was dropped before fit), while train.columns[i] includes
    # it -- the two are aligned only if 'target' is the last column. Confirm.
    for i in range(len(eliminated_columns)):
        if not eliminated_columns[i]:
            bad_features.append(train.columns[i])
    train.drop(bad_features, axis='columns', inplace=True)
    test.drop(bad_features, axis='columns', inplace=True)
    return train, test
def _fit_resample(self, X, y):
    """Oversample categorical data by synthesizing new samples per class.

    Encodes the categorical features as integers, measures sample distances
    with the Value Difference Metric, and generates synthetic neighbors for
    every class listed in ``self.sampling_strategy_``.
    """
    self._validate_estimator()

    # Start from copies of the original data; synthetic batches are appended.
    X_resampled = [X.copy()]
    y_resampled = [y.copy()]

    # Integer-encode the categories so ValueDifferenceMetric can work on codes.
    encoder = OrdinalEncoder(dtype=np.int32)
    X_encoded = encoder.fit_transform(X)
    vdm = ValueDifferenceMetric(
        n_categories=[len(cat) for cat in encoder.categories_]
    ).fit(X_encoded, y)

    for class_sample, n_samples in self.sampling_strategy_.items():
        if n_samples == 0:
            continue
        target_class_indices = np.flatnonzero(y == class_sample)
        X_class = _safe_indexing(X_encoded, target_class_indices)

        X_class_dist = vdm.pairwise(X_class)
        self.nn_k_.fit(X_class_dist)
        # the kneigbors search will include the sample itself which is
        # expected from the original algorithm
        nn_indices = self.nn_k_.kneighbors(X_class_dist, return_distance=False)
        X_new, y_new = self._make_samples(
            X_class, class_sample, y.dtype, nn_indices, n_samples
        )

        # Map the synthetic integer codes back to the original categories.
        X_new = encoder.inverse_transform(X_new)
        X_resampled.append(X_new)
        y_resampled.append(y_new)

    X_resampled = np.vstack(X_resampled)
    y_resampled = np.hstack(y_resampled)

    return X_resampled, y_resampled
def run_unsupervised_simulation(self, N=1000):
    """Run (or return cached) Sobol sensitivity analysis over the network.

    Args:
        N: base sample size for the sampler; with calc_second_order=True the
            resulting matrix has N * (2D + 2) rows.

    Returns:
        Tuple (S1T, S2): DataFrame of first/total-order indices and the
        second-order index matrix.
    """
    self.N = N
    try:
        # Return cached results when a previous run already populated them.
        return self.S1T, self.S2
    except AttributeError:
        # BUG FIX: a missing instance attribute raises AttributeError, not
        # NameError -- the old `except NameError` never caught it, so the
        # first call crashed instead of computing the indices.
        pass

    encoder = OrdinalEncoder().fit(self.df)
    sample = self.sampler.sample(problem=self.problem, N=N)
    sample = pd.DataFrame(encoder.inverse_transform(np.rint(sample)),
                          columns=self.df.columns)
    # WARNING: hard coded response column and positive label!
    # NOTE(review): `predictions` is computed but never used below -- confirm
    # whether bn.predict has required side effects.
    predictions = self.bn.predict(
        sample, 'response').applymap(lambda x: 1 if x == 'pCR' else 0)

    # BUG FIX: use self.problem; the bare name `problem` relied on an
    # accidental module-level global.
    si = self.analyzer.analyze(self.problem, sample)
    self.S1T = pd.DataFrame({
        'S1': si['S1'],
        'ST': si['ST']
    }, index=self.df.columns)
    self.S2 = pd.DataFrame(si['S2'], index=self.df.columns,
                           columns=self.df.columns)
    return self.S1T, self.S2
class NumericTransformer(object):
    """General purpose numeric conversion for pandas dataframes.

    All categorical data and levels must be passed to .fit().
    If new categorical series or levels are present in .transform() it won't work!

    Currently datetimes cannot be inverse_transformed back to datetime

    Args:
        na_strings (list): list of strings to replace as pd.NA
        categorical_fillna (str): how to fill NaN for categorical variables (numeric NaN are unaltered)
            "ffill" - uses forward and backward filling to supply na values
            "indicator" or anything else currently results in all missing replaced with str "missing_value"
        handle_unknown (str): passed through to scikit-learn OrdinalEncoder
        verbose (int): greater than 0 to print some messages
    """

    def __init__(
        self,
        na_strings: list = ['', ' '],  # 'NULL', 'NA', 'NaN', 'na', 'nan'
        categorical_fillna: str = "ffill",
        handle_unknown: str = 'use_encoded_value',
        verbose: int = 0,
    ):
        self.na_strings = na_strings
        self.verbose = verbose
        self.categorical_fillna = categorical_fillna
        self.handle_unknown = handle_unknown
        # flipped to True in _fit when at least one non-numeric column exists
        self.categorical_flag = False
        # flipped to False in _fit when the input is already all-numeric
        self.needs_transformation = True

    def _fit(self, df):
        """Fit categorical to numeric."""
        # test if any columns aren't numeric
        if not isinstance(df, pd.DataFrame):
            # basically just Series inputs
            df = pd.DataFrame(df)
        if df.shape[1] == df.select_dtypes(include=np.number).shape[1]:
            self.needs_transformation = False
            if self.verbose > 2:
                print("All data is numeric, skipping NumericTransformer")
        if self.needs_transformation:
            # replace some common nan datatypes from strings to nan
            # NOTE(review): inplace replace mutates the caller's DataFrame.
            df.replace(self.na_strings, np.nan, inplace=True)  # pd.NA in future
            # convert series to numeric which can be readily converted.
            df = df.apply(pd.to_numeric, errors='ignore')
            # record which columns are which dtypes
            self.column_order = df.columns
            self.numeric_features = df.select_dtypes(
                include=[np.number]
            ).columns.tolist()
            self.categorical_features = list(
                set(df.columns.tolist()) - set(self.numeric_features)
            )
            if len(self.categorical_features) > 0:
                self.categorical_flag = True
            if self.categorical_flag:
                from sklearn.preprocessing import OrdinalEncoder

                df_enc = df[self.categorical_features]
                if self.categorical_fillna == "ffill":
                    df_enc = df_enc.fillna(method='ffill').fillna(method='bfill')
                # any NaN remaining after the fill strategy becomes a level
                df_enc = df_enc.fillna('missing_value')
                self.cat_transformer = OrdinalEncoder(
                    handle_unknown=self.handle_unknown, unknown_value=np.nan
                )
                # the + 1 makes it compatible with remove_leading_zeroes
                df_enc = self.cat_transformer.fit_transform(df_enc) + 1
                # df_enc = self.cat_transformer.transform(df_enc) + 1
                # per-column bounds used by inverse_transform to clip codes
                self.cat_max = df_enc.max(axis=0)
                self.cat_min = df_enc.min(axis=0)
                if self.verbose > 0:
                    print("Categorical features converted to numeric")
                # reassemble numeric + encoded categorical in original order
                df = pd.concat(
                    [
                        pd.DataFrame(
                            df[self.numeric_features], columns=self.numeric_features
                        ),
                        pd.DataFrame(
                            df_enc, columns=self.categorical_features, index=df.index
                        ),
                    ],
                    axis=1,
                )[self.column_order]
        return df.astype(float)

    def fit(self, df):
        """Learn behavior of data to change.

        Args:
            df (pandas.DataFrame): input dataframe
        """
        self._fit(df)
        return self

    def fit_transform(self, df):
        """Fits and Returns *Magical* DataFrame.

        Args:
            df (pandas.DataFrame): input dataframe
        """
        return self._fit(df)

    def transform(self, df):
        """Convert categorical dataset to numeric."""
        if self.needs_transformation:
            if not isinstance(df, pd.DataFrame):
                df = pd.DataFrame(df)
            # NOTE(review): inplace replace mutates the caller's DataFrame.
            df.replace(self.na_strings, np.nan, inplace=True)
            df = df.apply(pd.to_numeric, errors='ignore')
            if self.categorical_flag:
                # same fill strategy as _fit before applying the fitted encoder
                df_enc = (df[self.categorical_features]).fillna(method='ffill')
                df_enc = df_enc.fillna(method='bfill').fillna('missing_value')
                df_enc = self.cat_transformer.transform(df_enc) + 1
                df = pd.concat(
                    [
                        pd.DataFrame(
                            df[self.numeric_features], columns=self.numeric_features
                        ),
                        pd.DataFrame(
                            df_enc, columns=self.categorical_features, index=df.index
                        ),
                    ],
                    axis=1,
                )[self.column_order]
        try:
            df = df.astype(float)
        except ValueError as e:
            raise ValueError(
                f"NumericTransformer.transform() could not convert data to float. {str(e)}."
            )
        return df

    def inverse_transform(self, df, convert_dtypes: bool = False):
        """Convert numeric back to categorical.

        Args:
            df (pandas.DataFrame): df
            convert_dtypes (bool): whether to use pd.convert_dtypes after inverse
        """
        if self.categorical_flag:
            if not isinstance(df, pd.DataFrame):
                # basically just Series inputs
                df = pd.DataFrame(df)
            # clip to the fitted code range, then undo the +1 offset from fit
            df_enc = (
                df[self.categorical_features].clip(
                    upper=self.cat_max, lower=self.cat_min, axis=1
                )
                - 1
            )
            df_enc = self.cat_transformer.inverse_transform(df_enc)
            df = pd.concat(
                [
                    pd.DataFrame(
                        df[self.numeric_features], columns=self.numeric_features
                    ),
                    pd.DataFrame(
                        df_enc, columns=self.categorical_features, index=df.index
                    ),
                ],
                axis=1,
            )[self.column_order]
        if convert_dtypes:
            df = df.convert_dtypes()
        return df
data_.iloc[:,1:-1] = enc.fit_transform(data_.iloc[:,1:-1]) # 一步到位 # In[]: from sklearn.preprocessing import OneHotEncoder X = data_.iloc[:,1:-1] enc = OneHotEncoder(categories='auto').fit(X) result = enc.transform(X).toarray() #依然可以直接一步到位,但为了给大家展示模型属性,所以还是写成了三步 OneHotEncoder(categories='auto').fit_transform(X).toarray() #依然可以还原 pd.DataFrame(enc.inverse_transform(result)) print(enc.get_feature_names()) # 返回每一个经过哑变量后生成稀疏矩阵列的名字 # axis=1,表将两表左右相连,如果是axis=0,就是将量表上下相连 newdata = pd.concat([data,pd.DataFrame(result)],axis=1) newdata.drop(["Sex","Embarked"],axis=1,inplace=True) newdata.columns = ["Age","Survived","Female","Male","Embarked_C","Embarked_Q","Embarked_S"] # In[]: # 5、连续变量转换: # 将年龄二值化 data_2 = data.copy()
class GeneralizeCategorical(GeneralizeContinuous):
    """Differentially-private generalization of categorical columns.

    Values are ordinal-encoded, binned by the continuous generalizer, and
    inverse-transformed by sampling from DP-noised marginal counts.
    """

    def __init__(self, epsilon=1.0, n_bins=5, strategy='uniform', max_cardinality=10):
        super().__init__(n_bins=n_bins, strategy=strategy)
        self.epsilon = epsilon
        self.max_cardinality = max_cardinality

    def fit(self, X, y=None):
        """
        Steps:
        1. Transform categorical to continuous
        2. Store DP marginal counts for optional inverse transform
        3. Run super().fit() to get groups
        """
        self._ordinalencoder = OrdinalEncoder().fit(X)
        # todo: turn into numpy -> df needed for marginal distribution
        X_enc = self._ordinalencoder.transform(X)
        X_enc = pd.DataFrame(X_enc, columns=X.columns)

        # get dp marginal of encoded feature
        # todo turn into list of arrays
        # privacy budget split evenly across the columns
        local_epsilon = self.epsilon / X.shape[1]
        self.marginals_ = []
        for jj, c in enumerate(X.columns):
            self.marginals_.append(
                dp_marginal_distribution(X_enc.loc[:, c], local_epsilon).values)
        return super().fit(X_enc, y)

    def transform(self, X):
        """Equivalent to continuous transform but we still need to encode the data beforehand"""
        X_enc = self._ordinalencoder.transform(X)
        return super().transform(X_enc)

    def inverse_transform(self, Xt):
        """Map binned values back to categories by sampling each record's
        category from the DP marginal probabilities of its bin."""
        assert set(Xt.columns) == set(
            self._header), "input contains different columns than seen in fit"
        X_enc = check_array(Xt, copy=True, dtype=FLOAT_DTYPES,
                            force_all_finite='allow-nan')
        # Xinv = Xt.copy()
        n_records, n_features = X_enc.shape
        # NOTE(review): this compares X_enc.shape[1] with itself and can never
        # fire; it presumably should compare against the number of features
        # seen during fit -- confirm.
        if X_enc.shape[1] != n_features:
            raise ValueError("Incorrect number of features. Expecting {}, "
                             "received {}.".format(n_features, X_enc.shape[1]))
        self._marginal_group_alloc = []
        for jj, c in enumerate(Xt.columns):
            bin_edges = self.bin_edges_[jj]
            marginals = self.marginals_[jj]
            marginals_idx = np.arange(len(marginals))

            # Values which are close to a bin edge are susceptible to numeric
            # instability; shift by eps before digitizing (see numpy.isclose
            # docs for ``rtol`` and ``atol``).
            rtol = 1.e-5
            atol = 1.e-8
            eps = atol + rtol * np.abs(marginals)
            # assign every encoded category to a bin (group)
            marginal_group_alloc = np.digitize(marginals_idx + eps, bin_edges[1:])
            np.clip(marginal_group_alloc, 0, self.n_bins_[jj] - 1,
                    out=marginal_group_alloc)
            self._marginal_group_alloc.append(marginal_group_alloc)
            # lower_bounds = np.int_(bin_edges[np.int_(X_enc[:, jj])])
            # upper_bounds = np.int_(bin_edges[np.int_(X_enc[:, jj]) + 1])
            for i in range(n_records):
                # candidate categories = those allocated to this record's bin
                # np.where returns 1d tuple, thus index 0
                marginal_candidate_idx = np.where(
                    X_enc[i, jj] == marginal_group_alloc)[0]
                marginal_candidate_probs = marginals[marginal_candidate_idx]
                marginal_candidate_probs_normalized = dp_normalize(
                    marginal_candidate_probs)
                # sample encoded (numerical) value based on marginal probabilities
                X_enc[i, jj] = np.random.choice(
                    marginal_candidate_idx,
                    p=marginal_candidate_probs_normalized)

        # inverse transform numerical value to original categorical
        X_inv = self._ordinalencoder.inverse_transform(X_enc)
        return pd.DataFrame(X_inv, columns=self._header)
class BinaryEncoder(BaseEstimator, TransformerMixin):
    """
    First the categories are encoded as ordinal, then the resulting integers
    are converted into the binary code, then the digits from the binary string
    are split into separate columns.

    Decoding is done with O(n) complexity by selecting the value of parameter,
    binary representation of which is the closest to one of the existing
    categories in terms of Euclidean distance.

    Choices object should be hashable.

    NOTE(review): uses ``pd.np``, which was deprecated in pandas 1.0 --
    confirm the pinned pandas version still provides it.
    """

    def __init__(self, categories: Union[str, List[object]] = 'auto'):
        self.__categories = categories
        # underlying ordinal step that assigns each category an integer code
        self.__transformer = OrdinalEncoder(categories=self.__categories,
                                            dtype=pd.np.int64)
        # per column: ordinal code -> tuple of binary digits
        self.__encode_mapping = {}
        # per column: tuple of binary digits -> ordinal code
        self.__decode_mapping = {}
        # per column: how many bits are needed to encode its categories
        self.__n_bits = {}
        self._enc_suffix = f"_{self.__class__.__name__}"

    def fit(self, df: pd.DataFrame, y=None):
        """Fit the ordinal step and precompute both binary mappings."""
        self.__transformer.fit(X=df, y=y)
        self.__n_bits = {
            c_name: len(format(len(c_cats), 'b'))
            for c_name, c_cats in enumerate(self.__transformer.categories_)
        }
        # __n_bits reflects how many bits it is needed to encode categories of
        # corresponding (by index) column
        # precompute binary encodings
        for idx, column_categories in enumerate(
                self.__transformer.categories_):
            self.__encode_mapping[idx] = dict()
            self.__decode_mapping[idx] = dict()
            for cat_idx, category in enumerate(column_categories):
                # zero-padded fixed-width binary digits as a float tuple
                encoding = tuple(
                    float(x)
                    for x in format(cat_idx, f'0{self.__n_bits[idx]}b'))
                self.__encode_mapping[idx][cat_idx] = encoding
                self.__decode_mapping[idx][encoding] = cat_idx
        return self

    def transform(self, df: pd.DataFrame) -> pd.np.ndarray:
        """Return the binary-encoded array (one column per bit)."""
        if len(df.keys()) != len(self.__n_bits):
            raise TypeError(
                f"Transformer was fit to data with {self.__n_bits} columns, "
                f"but given data with {len(df.keys())} columns.")

        # Convert to OrdinalEncoding
        pre_transformed = self.__transformer.transform(
            X=df)  # In OrdinalEncoding

        # Convert to BinaryEncoding
        n_out_columns = sum(self.__n_bits.values())
        n_out_rows = len(pre_transformed)
        transformed = pd.np.empty(shape=(n_out_rows, n_out_columns),
                                  dtype=pd.np.int64)
        for row_idx, p_row in enumerate(pre_transformed):
            # concatenate the per-column bit tuples into one flat output row
            row = []
            for idx, cat_idx in enumerate(p_row):
                row.extend(self.__encode_mapping[idx][cat_idx])
            transformed[row_idx] = row
        return transformed

    def inverse_transform(self, df: pd.DataFrame) -> pd.np.ndarray:
        """Recover the original categories from binary-encoded columns."""
        # convert back from Binary to OrdinalEncoding
        ordinal_encoded = pd.DataFrame(columns=self.__encode_mapping.keys())
        # decode per original column
        left_pointer = 0
        for column in self.__encode_mapping.keys():
            # slice out this column's bit-columns from the flat layout
            columns_idxs = slice(left_pointer,
                                 left_pointer + self.__n_bits[column])
            left_pointer += self.__n_bits[column]
            bin_columns_raw = df.iloc[:, columns_idxs].to_numpy()
            # snap each (possibly noisy) bit vector to the nearest valid one
            bin_columns_real = pd.np.apply_along_axis(
                self._closest_euclidean,
                axis=1,
                arr=bin_columns_raw,
                vectors=self.__decode_mapping[column].keys())
            ord_column = pd.np.apply_along_axis(
                lambda enc: self.__decode_mapping[column][tuple(enc)],
                axis=1,
                arr=bin_columns_real)
            ordinal_encoded[column] = ord_column
        # convert back from OrdinalEncoding to original one
        decoded = self.__transformer.inverse_transform(ordinal_encoded)
        return decoded

    @staticmethod
    def _closest_euclidean(vector: pd.np.ndarray,
                           vectors: List[pd.np.ndarray]) -> pd.np.ndarray:
        """
        finds closest vector from a list of provided vectors by minimizing
        Euclidean distance

        :param vector:
        :param vectors:
        :return:
        """
        min_found_distance = float('inf')
        closest_vector = None
        for existing_vector in vectors:
            dist = sum(((x - y)**2 for x, y in zip(vector, existing_vector)))
            if dist == 0:
                # Found exact match
                closest_vector = existing_vector
                break
            elif dist < min_found_distance:
                closest_vector = existing_vector
                min_found_distance = dist
        return closest_vector
class NameClassifier(object):
    '''ML model that classifies names' nationality, based on Naive Bayes.

    Attributes:
        model: MultinomialNB classifier used for decision making
        vec: CountVectorizer used to vectorize names (set in train())
        label_encoder: OrdinalEncoder mapping label strings <-> integers
            (set in load_data())

    Methods:
        load_data
        train
        evaluate
        predict
        get_word_dict
        get_label_str
        plot_confusion
        save_model
        load_model
    '''

    def __init__(self):
        # Declare the classifier; the vectorizer and label encoder are
        # created lazily in train()/load_data() once data is available.
        self.model = MultinomialNB()

    ### Some utility functions for data preprocess etc

    def load_data(self, file_names, test_size=0.3):
        '''Load the data, encode the labels, and split into train and test set.

        Params:
            file_names(string/list): file path(s) to the csv file(s)
            test_size(float): ratio of testing set, between 0 & 1

        Return:
            x_train, x_test(as pandas series of names),
            y_train, y_test(as numpy arr of encoded labels),
            returned in that order.
        '''
        # If several files are given (e.g. japanese plus foreign), collapse
        # every non-japanese code into 'fr' for binary classification.
        if isinstance(file_names, list):
            df = pd.concat([pd.read_csv(f) for f in file_names])
            df.loc[df.code != 'jp_JP', 'code'] = 'fr'
        else:
            df = pd.read_csv(file_names)
        labels = df['code'].values.reshape(-1, 1)
        self.label_encoder = OrdinalEncoder().fit(labels)
        labels = self.label_encoder.transform(labels)
        return train_test_split(df['name'],
                                labels.ravel(),
                                test_size=test_size,
                                shuffle=True)

    def train(self, X_train, y_train):
        '''Fit the vectorizer (bag of words) and train the naive bayes model.

        Param:
            X_train(Pandas Series): training name dataset
            y_train(ndarray): training labels dataset
        '''
        print('Fitting the vectorizer and training the model...')
        self.vec = CountVectorizer().fit(X_train)
        self.word_vec = self.vec.transform(X_train)
        self.model.fit(self.word_vec, y_train)
        print('training completed!')

    def predict(self, names, label_str=False):
        '''Predict names' origins.

        Returns encoded labels by default; label strings when label_str=True.

        Param:
            names(ndarray/Pandas Series/list): containing names
            label_str(bool): default False, return label integers;
                set True to return label strings

        Return:
            array: containing label integers or strings.
        '''
        name_vector = self.vec.transform(names)
        pred = self.model.predict(name_vector)
        if not label_str:
            return pred
        else:
            return self.label_encoder.inverse_transform(
                pred.reshape(-1, 1)).ravel()

    def evaluate(self, names, labels):
        '''Make predictions and evaluate the model.

        Returns accuracy plus per-class precision and recall vectors
        (take their mean for model-wide figures).

        Params:
            names(list/Pandas Series/ndarray): names data
            labels(ndarray): ground truth

        Return:
            dict: {'accuracy': float, 'precision': ndarray, 'recall': ndarray}
        '''
        prediction = self.predict(names)
        cm = confusion_matrix(labels, prediction)
        # Per-class recall: correct / all true members of the class (rows).
        recall = np.diag(cm) / np.sum(cm, axis=1)
        # Per-class precision: correct / all predicted as the class (cols).
        precision = np.diag(cm) / np.sum(cm, axis=0)
        acc = (prediction == labels).mean()
        return {'accuracy': acc, 'precision': precision, 'recall': recall}

    def get_word_dict(self, corpus=None):
        '''Return a word frequency dictionary.

        Built from the training data of the model, or from the given corpus
        if any.

        Params:
            corpus(list/Series): names to count. Defaults to None, in which
                case the frequency dictionary is built from the data the
                model was trained on.

        Returns:
            dictionary: words as keys, their frequencies as values.
        '''
        freq_dic = {}
        if corpus is None:
            vector = self.vec
            bag_words = self.word_vec
        else:
            vector = CountVectorizer().fit(corpus)
            bag_words = vector.transform(corpus)
        # COMPAT: get_feature_names() was removed in scikit-learn 1.2 in
        # favor of get_feature_names_out(); support both versions.
        if hasattr(vector, 'get_feature_names_out'):
            feature = vector.get_feature_names_out()
        else:
            feature = vector.get_feature_names()
        sum_words = bag_words.sum(axis=0).tolist()[0]  # list within list
        for i, word in enumerate(feature):
            freq_dic[word] = sum_words[i]
        return freq_dic

    def get_label_str(self, labels):
        '''Convert numerically encoded labels to their label strings.

        param:
            labels(ndarray): ndarray containing numerical labels

        returns:
            ndarray: containing label strings
        '''
        return self.label_encoder.inverse_transform(
            labels.reshape(-1, 1)).ravel()

    def plot_confusion(self, yt, prediction_test):
        '''Plot a confusion matrix for the given labels and predictions.

        Param:
            yt(ndarray): array of ground truth labels
            prediction_test(ndarray): predicted labels
        '''
        self.cm = confusion_matrix(yt, prediction_test)
        fig = plt.figure(figsize=(10, 8))
        plt.imshow(self.cm, interpolation='nearest')
        plt.colorbar()
        axis_font = {'size': 13, 'color': 'black'}
        self.cat = self.label_encoder.categories_[0]
        num_class = len(self.cat)
        classNames = [self.cat[i] for i in range(num_class)]
        plt.title("Confusion Matrix by class", fontdict=axis_font)
        plt.ylabel("True Label", fontdict=axis_font)
        plt.xlabel("Predicted Label", fontdict=axis_font)
        tick_marks = np.arange(len(classNames))
        plt.xticks(tick_marks, classNames, rotation=45)
        plt.yticks(tick_marks, classNames)
        fdic = {'size': 10, 'color': 'white', 'weight': 'heavy'}
        # Overlay each cell's count on the heatmap.
        for i in range(num_class):
            for j in range(num_class):
                plt.text(j, i, str(self.cm[i, j]),
                         fontdict=fdic,
                         horizontalalignment='center',
                         verticalalignment='center')
        plt.show()

    @classmethod
    def load_model(cls, file_name):
        '''Load a saved model obj for use.

        Param:
            file_name(string): path to the model file(pickle).

        Return:
            NameClassifier: the loaded class obj for use.
        '''
        # https://stackoverflow.com/questions/2709800/how-to-pickle-yourself
        # NOTE: pickle.load on untrusted files can execute arbitrary code;
        # only load model files you created yourself.
        print('loading the model')
        # BUGFIX: close the file handle deterministically (was a bare
        # pickle.load(open(...)) that leaked the descriptor).
        with open(file_name, 'rb') as f:
            return pickle.load(f)

    def save_model(self, file_name):
        '''Save a trained model obj for future use.

        Param:
            file_name(string): path to the model file(pickle).
        '''
        # The whole instance (model + vectorizer + encoder) is pickled.
        # BUGFIX: use a context manager so the file is flushed and closed.
        with open(file_name, 'wb') as f:
            pickle.dump(self, f)
class NumericTransformer(object):
    """Convert a mixed-type DataFrame to all-numeric values and back.

    Columns that parse cleanly as numbers are coerced with ``pd.to_numeric``;
    the remaining (categorical) columns are ordinal-encoded and shifted by +1
    so codes stay strictly positive.  ``inverse_transform`` reverses the
    encoding, clipping out-of-range codes to the closest value seen in fit.
    """

    def __init__(
            self,
            na_strings: list = None,
            categorical_impute_strategy: str = 'constant',
            verbose: int = 0):
        """
        Params:
            na_strings: strings treated as missing values; defaults to
                ['', ' ', 'NULL', 'NA', 'NaN', 'na', 'nan']
            categorical_impute_strategy: kept for API compatibility
                (the fill strategy is currently fixed: ffill/bfill/constant)
            verbose: > 0 prints progress messages
        """
        # BUGFIX: avoid a mutable default argument; None selects defaults.
        if na_strings is None:
            na_strings = ['', ' ', 'NULL', 'NA', 'NaN', 'na', 'nan']
        self.na_strings = na_strings
        self.categorical_impute_strategy = categorical_impute_strategy
        self.verbose = verbose
        self.categorical_flag = False

    @staticmethod
    def _coerce_numeric(series):
        """pd.to_numeric with errors='ignore' semantics.

        That keyword value is deprecated (pandas >= 2.2), so emulate it:
        return the converted series, or the original when conversion fails.
        """
        try:
            return pd.to_numeric(series)
        except (ValueError, TypeError):
            return series

    def _clean(self, df):
        """Replace NA-like strings with np.nan and coerce numeric columns.

        Works on a copy: BUGFIX — the previous version used inplace=True
        and silently mutated the caller's DataFrame.
        """
        df = df.replace(self.na_strings, np.nan)
        return df.apply(self._coerce_numeric)

    def _impute_categorical(self, df):
        """Forward/backward fill, then constant-fill, categorical columns."""
        df_enc = df[self.categorical_features].ffill().bfill()
        return df_enc.fillna('missing_value')

    def fit(self, df):
        """Learn which columns are numeric and fit the categorical encoder.

        Params:
            df(DataFrame): training data (not modified)

        Return:
            self
        """
        df = self._clean(df)
        # Record column order and the numeric/categorical split.
        self.column_order = df.columns
        self.numeric_features = (df.select_dtypes(
            include=[np.number]).columns.tolist())
        self.categorical_features = list(
            set(df.columns.tolist()) - set(self.numeric_features))
        self.categorical_flag = len(self.categorical_features) > 0
        if self.categorical_flag:
            from sklearn.preprocessing import OrdinalEncoder
            df_enc = self._impute_categorical(df)
            self.cat_transformer = OrdinalEncoder()
            self.cat_transformer.fit(df_enc)
            # the + 1 makes it compatible with remove_leading_zeroes
            df_enc = self.cat_transformer.transform(df_enc) + 1
            self.cat_max = df_enc.max(axis=0)
            self.cat_min = df_enc.min(axis=0)
            # BUGFIX: was `>= 0`, which printed even at the silent default
            # verbose=0.
            if self.verbose > 0:
                print("Categorical features converted to numeric")
        return self

    def transform(self, df):
        """Convert a dataset to all-float values, encoding categoricals.

        Params:
            df(DataFrame): data with the same columns seen in fit
                (not modified)

        Return:
            DataFrame: float-typed frame in the fitted column order
        """
        df = self._clean(df)
        if self.categorical_flag:
            df_enc = self.cat_transformer.transform(
                self._impute_categorical(df)) + 1
            df = pd.concat([
                pd.DataFrame(df[self.numeric_features],
                             columns=self.numeric_features),
                pd.DataFrame(df_enc,
                             columns=self.categorical_features,
                             index=df.index)
            ], axis=1)[self.column_order]
        return df.astype(float)

    def inverse_transform(self, df):
        """Convert encoded numeric data back to the original categories.

        Params:
            df(DataFrame): output of transform (or compatible predictions)

        Return:
            DataFrame: frame with categorical columns decoded
        """
        if self.categorical_flag:
            # Clip unseen codes into the fitted range, then undo the +1
            # shift before decoding.
            df_enc = df[self.categorical_features].clip(
                upper=self.cat_max, lower=self.cat_min, axis=1) - 1
            df_enc = self.cat_transformer.inverse_transform(df_enc)
            df = pd.concat([
                pd.DataFrame(df[self.numeric_features],
                             columns=self.numeric_features),
                pd.DataFrame(df_enc,
                             columns=self.categorical_features,
                             index=df.index)
            ], axis=1)[self.column_order]
        return df
        # NOTE(review): tail of a unittest assertion block — the enclosing
        # test method starts before this chunk; indentation reconstructed.
        # Expected categories for the second column after fitting.
        self.assertListEqual(enc_df.categories_[1].tolist(),
                             ["Unknown", "Big", "Small", "Other"])
        # Transform must yield these codes, and inverse must round-trip.
        self.assertTrue((X_df_tran == np.array([[2, 1], [1, 1], [1, 2],
                                                [0, 3]])).all())
        self.assertTrue((X_df_invtran == X).all())


if __name__ == "__main__":
    unittest.main()

# Ad-hoc demo: round-trip a small matrix through OrdinalEncoder and print
# the learned categories, the codes, and the decoded values.
labelenc = OrdinalEncoder()
X = np.array([['Male', 1], ['Female', 1], ['Female', 2]])
labelenc.fit(X)
X_tran = labelenc.transform(X)
X_invtran = labelenc.inverse_transform(X_tran)
print(labelenc.categories_)
print(X_tran)
print(X_invtran)
print("========")

# Same round-trip demo with the project's FrequencyEncoder.
enc = FrequencyEncoder()
X = np.array([['Male', 1], ['Female', 2], ['Female', 2]])
enc.fit(X)
X_tran = enc.transform(X)
X_invtran = enc.inverse_transform(X_tran)
print(enc.categories_)
print(X_tran)
print(X_invtran)
# > Una posible opcion para tratar con nans, es reconocerlo como tal y asignarle su propia categoria # Aca vemos como pasar el nan al texto "nan" y por lo tanto es un "nuevo" color df['eye_color'].astype(str).unique() # Convertimos nulos a string 'nan', es decir un valor posible mas para que no explote df[['eye_color_encoded', 'gender_encoded']] = oe.fit_transform(df[columns_to_encode].astype(str)) df[['eye_color', 'eye_color_encoded', 'gender', 'gender_encoded']] # > Una funcionalidad MUY interesante de muchas de las clases de sklearn que ayudan en la transformacion de # es que tienen la transformacion INVERSA! oe.inverse_transform(df[['eye_color_encoded', 'gender_encoded']]) # **Pregunta del millon**: # - Esta todo bien con esta trasnformacion?? # - Puedo usar las columnas 'eye_color_encoded' y 'gender_encoded' ?? # #### Label Encoder # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html # Es exactamente la misma idea pero esperando una sola variable ya que se usa para encodear la variable target de un modelo predictivo le = LabelEncoder() # Convertimos nulos a string 'nan', es decir un valor posible mas df['alignment_encoded'] = le.fit_transform(df['alignment'].astype(str)) df[['alignment', 'alignment_encoded']]
# # 方法二 使用sklearn中LabelEncoder()类进行映射 # class_le = LabelEncoder() # class_le.fit(df["classlabel"]) # print(class_le.classes_) # df["classlabel"]= class_le.fit_transform(df["classlabel"]) # print(df) # print(class_le.inverse_transform(df["classlabel"])) # 方法三 使用sklearn中的OrdinalEncoder来进行编码 size_oe = OrdinalEncoder() size_oe.fit(df['size'].values.reshape(-1,1)) size_new = size_oe.fit_transform(df['size'].values.reshape(-1,1)) print(size_oe.categories_) print(size_new) print(size_oe.inverse_transform(size_new)) # # 方法四 One-hot方法 # # 为什么要使用One-hot方法? # # 对于color特征项,如果使用上述两种方法转换为数值型,会引入由不同的数值大小造成的特征不平等问题 # # 1) 使用pandas中的get_dummies()方法(哑变量)来进行处理 # pf = pd.get_dummies(df["color"]) # df = pd.concat([df,pf], axis=1) # df.drop(["color"], axis = 1,inplace = True) # print(df) # 2) 使用sklearn中的OneHotEncoder来进行处理 color_ohe = OneHotEncoder(sparse=False) # sparse = False意味着输出的是numpy.ndarray, True输出的是scipy.sparse.csr.csr_matrix color_ohe.fit(df["color"].values.reshape(-1,1)) color_New = color_ohe.fit_transform(df["color"].values.reshape(-1,1))