class DataEncoder(object):
    """Thin wrapper around three scikit-learn categorical encoders.

    Supported encoder types:
      - "one_hot": sklearn.preprocessing.OneHotEncoder
      - "label":   sklearn.preprocessing.LabelEncoder
      - "Ordinal": sklearn.preprocessing.OrdinalEncoder
    """

    def __init__(self, encoder_type):
        # Validate with an explicit exception: the original used `assert`
        # (stripped under python -O) which made its trailing
        # `else: raise ValueError(...)` branch unreachable dead code.
        if encoder_type not in {"one_hot", "label", "Ordinal"}:
            raise ValueError("please select a correct encoder_type")
        self.encoder_type = encoder_type
        if self.encoder_type == "one_hot":
            # One-hot encoding of categories.
            # categories: "auto" or an explicit list of category values
            # drop: {'first', 'if_binary'}, None, or array[i] to drop the
            #       i-th category; 'first' drops the first level of every feature
            # sparse: return a sparse matrix instead of a dense array
            # handle_unknown: {'error', 'ignore'}, default 'error'
            self.encodermodule = OneHotEncoder(categories='auto', drop=None, sparse=True,
                                               dtype=np.float64, handle_unknown='error')
        elif self.encoder_type == "label":
            self.encodermodule = LabelEncoder()
        else:
            # "Ordinal": integer ordinal codes; `categories` behaves like
            # OneHotEncoder's parameter of the same name.
            self.encodermodule = OrdinalEncoder(categories="auto", dtype=np.float64)

    def fit_transform(self, data):
        """Fit the underlying encoder to *data* and return the transformed data."""
        return self.encodermodule.fit_transform(data)

    def fit(self, data):
        """Fit the underlying encoder. Returns self (sklearn convention)."""
        self.encodermodule.fit(data)
        return self

    def transform(self, data):
        """Transform *data* with the fitted encoder.

        Bug fix: the original had no `return`, so the encoded result was
        silently discarded and the method always returned None.
        """
        return self.encodermodule.transform(data)

    def set_params(self, params):
        """Forward *params* (a dict) to the underlying encoder's set_params."""
        self.encodermodule.set_params(**params)

    def get_params(self):
        """Return the underlying encoder's parameters (deep=True)."""
        return self.encodermodule.get_params(deep=True)

    def inverse_transform(self, data):
        """Map encoded values back to the original categories."""
        return self.encodermodule.inverse_transform(data)

    def get_classes(self):
        # Only LabelEncoder exposes `classes_`.
        assert self.encoder_type in {"label"}
        return self.encodermodule.classes_

    def get_category(self):
        # OneHotEncoder / OrdinalEncoder expose `categories_` (a list of arrays).
        assert self.encoder_type in {"one_hot", "Ordinal"}
        return self.encodermodule.categories_

    def get_feature_names(self, output_feature):
        # Names of the generated output features (one-hot only).
        assert self.encoder_type in {"one_hot"}
        return self.encodermodule.get_feature_names(output_feature)
class SibSpBinner(BaseEstimator, TransformerMixin):
    """Bin the SibSp feature: every value above `thresh` collapses into a single
    '>thresh' bucket, after which the string categories can optionally be
    one-hot ('ohe') or ordinal ('ord') encoded.
    """

    def __init__(self, thresh=2, encode='ord', sparse=False):
        if thresh > 7:
            raise ValueError('Specify a value less than 7')
        self.thresh = thresh
        self.cat = self.get_cat(thresh)
        self.encode = encode
        self.sparse = sparse
        # Categories are fixed up front, so the inner encoder is fully
        # determined at construction time.
        if encode == 'ohe':
            # Drop the last ('>thresh') level to avoid redundancy.
            self.enc = OneHotEncoder([self.cat], drop=[self.cat[-1]], sparse=sparse)
        elif encode == 'ord':
            self.enc = OrdinalEncoder([self.cat])

    def fit(self, X, y=None):
        """Fit the inner encoder. Because the category list was supplied
        explicitly in __init__, fitting on a one-row dummy frame suffices."""
        if self.encode in ['ohe', 'ord']:
            # Series have a .name; fall back to 'SibSp' for anything that doesn't.
            self.name = getattr(X, 'name', 'SibSp')
            self.enc.fit(pd.DataFrame({self.name: ['0']}))
        return self

    @staticmethod
    def get_cat(thresh):
        """Category labels: '0'..'thresh' plus the overflow bucket '>thresh'."""
        labels = [str(i) for i in range(thresh + 1)]
        labels.append(f'>{thresh}')
        return labels

    def transform(self, X):
        """Collapse values above the threshold, stringify, then encode if requested."""
        binned = X.copy()
        binned[binned > self.thresh] = f'>{self.thresh}'
        binned = pd.DataFrame(binned).astype(str)
        if self.encode not in ['ohe', 'ord']:
            return binned
        return self.enc.transform(binned)

    def get_feature_names(self, input_features=None):
        """Delegate to the inner encoder; returns None when no encoding is used."""
        if self.encode in ['ohe', 'ord']:
            return self.enc.get_feature_names(input_features)
# ``` # %% from sklearn.preprocessing import OneHotEncoder encoder = OneHotEncoder(sparse=False) education_encoded = encoder.fit_transform(education_column) education_encoded # %% [markdown] # We see that encoding a single feature will give a NumPy array full of zeros # and ones. We can get a better understanding using the associated feature # names resulting from the transformation. # %% feature_names = encoder.get_feature_names(input_features=["education"]) education_encoded = pd.DataFrame(education_encoded, columns=feature_names) education_encoded # %% [markdown] # As we can see, each category (unique value) became a column; the encoding # returned, for each sample, a 1 to specify which category it belongs to. # # Let's apply this encoding on the full dataset. # %% print(f"The dataset is composed of {data_categorical.shape[1]} features") data_categorical.head() # %% data_encoded = encoder.fit_transform(data_categorical)
# In[]: from sklearn.preprocessing import OneHotEncoder X = data_.iloc[:,1:-1] enc = OneHotEncoder(categories='auto').fit(X) result = enc.transform(X).toarray() #依然可以直接一步到位,但为了给大家展示模型属性,所以还是写成了三步 OneHotEncoder(categories='auto').fit_transform(X).toarray() #依然可以还原 pd.DataFrame(enc.inverse_transform(result)) print(enc.get_feature_names()) # 返回每一个经过哑变量后生成稀疏矩阵列的名字 # axis=1,表将两表左右相连,如果是axis=0,就是将量表上下相连 newdata = pd.concat([data,pd.DataFrame(result)],axis=1) newdata.drop(["Sex","Embarked"],axis=1,inplace=True) newdata.columns = ["Age","Survived","Female","Male","Embarked_C","Embarked_Q","Embarked_S"] # In[]: # 5、连续变量转换: # 将年龄二值化 data_2 = data.copy() from sklearn.preprocessing import Binarizer X = data_2.iloc[:,0].values.reshape(-1,1) #类为特征专用,所以不能使用一维数组
print(f"The dataset is composed of {data_categorical.shape[1]} features")
data_categorical.head()

# %%
from sklearn.preprocessing import OneHotEncoder

# NOTE(review): `sparse=False` yields a dense array; newer scikit-learn renames
# this parameter to `sparse_output` — confirm the pinned sklearn version.
encoder = OneHotEncoder(sparse=False)
data_encoded = encoder.fit_transform(data_categorical)
print(f"The dataset encoded contains {data_encoded.shape[1]} features")
data_encoded

# %% [markdown]
# Let's wrap this numpy array in a dataframe with informative column names as provided by the encoder object:

# %%
columns_encoded = encoder.get_feature_names(data_categorical.columns)
pd.DataFrame(data_encoded, columns=columns_encoded).head()

# %% [markdown]
# Look at how the workclass variable of the first 3 records has been encoded and compare this to the original string representation.
#
# The number of features after the encoding is more than 10 times larger than in the
# original data because some variables such as `occupation` and `native-country`
# have many possible categories.
#
# We can now integrate this encoder inside a machine learning pipeline as in the
# case with numerical data: let's train a linear classifier on
# the encoded data and check the performance of this machine learning pipeline
# using cross-validation.

# %%
# avoid obtaining a sparse matrix, which is less efficient but easier to
# inspect results for didactic purposes.
# ```

# %%
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
education_encoded = encoder.fit_transform(education_column)

# %% [markdown]
# As in the previous section, we will visually check the encoding.

# %%
df = pd.DataFrame(education_encoded[:10],
                  columns=encoder.get_feature_names(education_column.columns))
ax = sns.heatmap(df, annot=True, cmap="RdBu", cbar=False)
ax.set_ylabel("Sample index")
# Bug fix: the encoder built above is a OneHotEncoder (and the markdown below
# describes one-hot semantics), so the plot title must say "One-hot",
# not "Ordinal".
_ = ax.set_title("One-hot encoding of the 'education' column")

# %% [markdown]
# So we observed that each category in education becomes a column and the
# data resulting from the encoding indicates whether or not the sample
# belongs to this category.
#
# Let's apply this encoding on the full dataset.

# %%
print(f"The dataset is composed of {data_categorical.shape[1]} features")
data_categorical.head()