예제 #1
0
class DataEncoder(object):
    """Thin wrapper around three scikit-learn categorical encoders.

    `encoder_type` selects the backing encoder:
      * "one_hot" -> OneHotEncoder  (one binary column per category)
      * "label"   -> LabelEncoder   (integer codes for a 1-D target array)
      * "Ordinal" -> OrdinalEncoder (integer codes for 2-D feature arrays)

    Raises
    ------
    ValueError
        If `encoder_type` is not one of the three supported strings.
    """

    def __init__(self, encoder_type):
        self.encoder_type = encoder_type
        if self.encoder_type == "one_hot":  # categorical one-hot encoding
            # categories: "auto" or an explicit list of category lists
            # drop: {'first', 'if_binary'}, None, or array[i] dropping the
            #   i-th category of each feature ('first' drops the first one,
            #   'if_binary' only drops for binary features)
            # sparse: return a scipy sparse matrix instead of a dense array
            # handle_unknown: {'error', 'ignore'}, default 'error'
            self.encodermodule = OneHotEncoder(categories='auto', drop=None, sparse=True,
                                               dtype=np.float64, handle_unknown='error')
        elif self.encoder_type == "label":
            self.encodermodule = LabelEncoder()
        elif self.encoder_type == "Ordinal":  # ordinal (integer-code) encoding
            # categories behaves like OneHotEncoder's parameter
            self.encodermodule = OrdinalEncoder(categories="auto", dtype=np.float64)
        else:
            # Validate with a real exception rather than an assert: asserts
            # are stripped under `python -O`, and the original assert made
            # this branch unreachable anyway.
            raise ValueError("please select a correct encoder_type")

    def fit_transform(self, data):
        """Fit the underlying encoder on `data` and return the encoded result."""
        return self.encodermodule.fit_transform(data)

    def fit(self, data):
        """Fit the underlying encoder; returns self for sklearn-style chaining."""
        self.encodermodule.fit(data)
        return self

    def transform(self, data):
        """Encode `data` with the already-fitted encoder and return the result."""
        # BUG FIX: the original dropped the result (no `return`), so
        # transform() always yielded None.
        return self.encodermodule.transform(data)

    def set_params(self, params):
        """Forward a dict of keyword parameters to the underlying encoder."""
        self.encodermodule.set_params(**params)

    def get_params(self):
        """Return the underlying encoder's parameters (including nested ones)."""
        return self.encodermodule.get_params(deep=True)

    def inverse_transform(self, data):
        """Map encoded values back to the original categories."""
        return self.encodermodule.inverse_transform(data)

    def get_classes(self):
        """Return the classes seen by a fitted LabelEncoder."""
        assert self.encoder_type in {"label"}
        return self.encodermodule.classes_

    def get_category(self):
        """Return the category arrays of a fitted one-hot/ordinal encoder."""
        assert self.encoder_type in {"one_hot", "Ordinal"}
        return self.encodermodule.categories_  # list of arrays, one per feature

    def get_feature_names(self, output_feature):
        """Return the output feature names of the one-hot encoding.

        NOTE(review): `get_feature_names` was removed in scikit-learn 1.2 in
        favour of `get_feature_names_out` -- confirm the pinned version.
        """
        assert self.encoder_type in {"one_hot"}
        return self.encodermodule.get_feature_names(output_feature)
예제 #2
0
class SibSpBinner(BaseEstimator, TransformerMixin):
    """Bin an integer column (e.g. Titanic 'SibSp'): values above `thresh`
    collapse into a single '>thresh' category, then optionally encode.

    Parameters
    ----------
    thresh : int, default=2
        Values greater than this are replaced by the string f'>{thresh}'.
        Must not exceed 7.
    encode : str, default='ord'
        'ohe' one-hot encodes (dropping the last category), 'ord' assigns
        ordinal integer codes; any other value leaves string categories
        untouched (passthrough).
    sparse : bool, default=False
        Passed through to OneHotEncoder when encode == 'ohe'.
    """

    def __init__(self, thresh=2, encode='ord', sparse=False):

        if thresh > 7:
            raise ValueError('Specify a value less than 7')

        self.thresh = thresh
        self.cat = self.get_cat(thresh)
        self.encode = encode
        self.sparse = sparse

        # Categories are fixed up front, so the encoders never have to
        # learn them from data (see the dummy fit below).
        if encode == 'ohe':
            self.enc = OneHotEncoder([self.cat],
                                     drop=[self.cat[-1]],
                                     sparse=sparse)
        elif encode == 'ord':
            self.enc = OrdinalEncoder([self.cat])

    def fit(self, X, y=None):
        """Fit the inner encoder; returns self (sklearn convention)."""
        if self.encode in ['ohe', 'ord']:
            try:
                self.name = X.name  # a pandas Series carries its column name
            except AttributeError:
                self.name = 'SibSp'

            # Any value works here: the encoder only needs a fit() call
            # because its categories were supplied explicitly in __init__.
            dummy_df = pd.DataFrame({self.name: ['0']})
            self.enc.fit(dummy_df)

        return self

    @staticmethod
    def get_cat(thresh):
        """Return the category labels: '0' .. str(thresh), plus '>thresh'."""
        return [str(x) for x in range(thresh + 1)] + [f'>{thresh}']

    def transform(self, X):
        """Collapse values above the threshold, stringify, then encode."""
        X = X.copy()
        X[X > self.thresh] = f'>{self.thresh}'

        X = pd.DataFrame(X).astype(str)

        if self.encode in ['ohe', 'ord']:
            X = self.enc.transform(X)

        return X

    def get_feature_names(self, input_features=None):
        """Return the output feature names produced by transform()."""
        if self.encode == 'ohe':
            return self.enc.get_feature_names(input_features)
        if self.encode == 'ord':
            # BUG FIX: OrdinalEncoder has no get_feature_names method, so the
            # original delegation raised AttributeError. Ordinal encoding
            # keeps one output column per input column, so names pass through.
            if input_features is not None:
                return list(input_features)
            return [getattr(self, 'name', 'SibSp')]
예제 #3
0
# ```

# %%
from sklearn.preprocessing import OneHotEncoder

# NOTE(review): OneHotEncoder's `sparse` flag was renamed to `sparse_output`
# in scikit-learn 1.2 and removed in 1.4 -- confirm the pinned version.
encoder = OneHotEncoder(sparse=False)
education_encoded = encoder.fit_transform(education_column)
# bare expression: displayed as the notebook cell's output
education_encoded

# %% [markdown]
# We see that encoding a single feature will give a NumPy array full of zeros
# and ones. We can get a better understanding using the associated feature
# names resulting from the transformation.

# %%
# NOTE(review): `get_feature_names` was replaced by `get_feature_names_out`
# in scikit-learn 1.0 and removed in 1.2 -- confirm the pinned version.
feature_names = encoder.get_feature_names(input_features=["education"])
education_encoded = pd.DataFrame(education_encoded, columns=feature_names)
education_encoded

# %% [markdown]
# As we can see, each category (unique value) became a column; the encoding
# returned, for each sample, a 1 to specify which category it belongs to.
#
# Let's apply this encoding on the full dataset.

# %%
print(f"The dataset is composed of {data_categorical.shape[1]} features")
data_categorical.head()

# %%
data_encoded = encoder.fit_transform(data_categorical)
예제 #4
0
# In[]:
from sklearn.preprocessing import OneHotEncoder

X = data_.iloc[:,1:-1]

enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()

# fit_transform could do this in a single step, but it is written as
# separate fit/transform calls so the fitted attributes can be inspected
OneHotEncoder(categories='auto').fit_transform(X).toarray()

# the one-hot encoding can still be inverted back to the original labels
pd.DataFrame(enc.inverse_transform(result))

print(enc.get_feature_names()) # column names of the generated dummy-variable matrix

# axis=1 joins the two frames side by side; axis=0 would stack them vertically
newdata = pd.concat([data,pd.DataFrame(result)],axis=1)
newdata.drop(["Sex","Embarked"],axis=1,inplace=True)
newdata.columns = ["Age","Survived","Female","Male","Embarked_C","Embarked_Q","Embarked_S"]



# In[]:
# 5. Transforming a continuous variable:
# binarize the Age column
data_2 = data.copy()

from sklearn.preprocessing import Binarizer
X = data_2.iloc[:,0].values.reshape(-1,1)               # the transformer expects 2-D feature input, so a 1-D array will not work
# NOTE(review): the next two lines reference `data_categorical`, which is not
# defined in this example -- they look like a paste artifact from the cells below.
print(f"The dataset is composed of {data_categorical.shape[1]} features")
data_categorical.head()

# %%
from sklearn.preprocessing import OneHotEncoder

# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2
# and removed in 1.4 -- confirm the pinned version.
encoder = OneHotEncoder(sparse=False)
data_encoded = encoder.fit_transform(data_categorical)
print(f"The dataset encoded contains {data_encoded.shape[1]} features")
data_encoded

# %% [markdown]
# Let's wrap this numpy array in a dataframe with informative column names as provided by the encoder object:

# %%
columns_encoded = encoder.get_feature_names(data_categorical.columns)
pd.DataFrame(data_encoded, columns=columns_encoded).head()

# %% [markdown]
# Look at how the workclass variable of the first 3 records has been encoded and compare this to the original string representation.
#
# The number of features after the encoding is more than 10 times larger than in the
# original data because some variables such as `occupation` and `native-country`
# have many possible categories.
#
# We can now integrate this encoder inside a machine learning pipeline as in the
# case with numerical data: let's train a linear classifier on
# the encoded data and check the performance of this machine learning pipeline
# using cross-validation.

# %%
# avoid obtaining a sparse matrix, which is less efficient but easier to
# inspect results for didactic purposes.
# ```

# %%
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
education_encoded = encoder.fit_transform(education_column)

# %% [markdown]
# As in the previous section, we will visually check the encoding.

# %%
df = pd.DataFrame(education_encoded[:10],
                  columns=encoder.get_feature_names(education_column.columns))
ax = sns.heatmap(df, annot=True, cmap="RdBu", cbar=False)
ax.set_ylabel("Sample index")
# BUG FIX: the plot shows a one-hot encoding, but the title said "Ordinal
# encoding" -- corrected to match the OneHotEncoder used above.
_ = ax.set_title("One-hot encoding of 'education' column")

# %% [markdown]
# So we observed that each category in education becomes a column and the
# data resulting from the encoding indicates whether or not the sample
# belongs to this category.
#
# Let's apply this encoding on the full dataset.

# %%
print(f"The dataset is composed of {data_categorical.shape[1]} features")
data_categorical.head()