class ImputerWrapper:
    """Wrapper around ``Imputer`` that adds a ``'zero'`` fill strategy.

    With ``strategy='zero'`` no ``Imputer`` is created and every NaN in the
    input is overwritten with ``0.0``.  For any other strategy the work is
    delegated to an ``Imputer`` built from the constructor arguments; before
    fitting, columns that consist only of NaN are filled with ``0.0`` so that
    ``Imputer`` does not remove them.

    ``X`` is expected to be a 2-D numeric ``numpy`` array.  The zero fill and
    ``prepare`` modify ``X`` in place.
    """

    def __init__(self, missing_values='NaN', strategy='zero', axis=0,
                 verbose=0, copy=False):
        self.strategy = strategy
        # Only non-zero strategies need a real Imputer.
        self.imputer = None
        if strategy != 'zero':
            self.imputer = Imputer(missing_values, strategy, axis, verbose,
                                   copy)

    @staticmethod
    def _fill_nan_with_zero(X):
        """Overwrite every NaN in ``X`` with ``0.0`` in place and return ``X``."""
        X[numpy.isnan(X)] = 0.0
        return X

    def prepare(self, X):
        """Fill columns of ``X`` that are entirely NaN with ``0.0`` in place.

        One info message is logged per affected column.
        """
        all_nan_columns = numpy.flatnonzero(numpy.isnan(X).all(axis=0))
        for j in all_nan_columns:
            logging.info('column %d all nan, filling with 0', j)
        X[:, all_nan_columns] = 0.0

    def fit(self, X, y=None):
        """Fit the wrapped imputer; a no-op for the ``'zero'`` strategy.

        Returns ``self``.
        """
        if self.strategy == 'zero':
            return self
        self.prepare(X)
        self.imputer.fit(X, y)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit and impute ``X`` in a single call.

        For ``'zero'`` the same array object is returned with its NaNs
        replaced by ``0.0``; otherwise the result of the wrapped imputer's
        ``fit_transform`` is returned.
        """
        if self.strategy == 'zero':
            return self._fill_nan_with_zero(X)
        self.prepare(X)
        return self.imputer.fit_transform(X, y, **fit_params)

    def get_params(self, deep=True):
        """Return the estimator parameters as a dict.

        For ``'zero'`` there is no wrapped imputer, so only the strategy is
        reported.  A dict (instead of ``None``) is returned so the result can
        be iterated or passed back to the constructor, as callers of
        ``get_params`` conventionally do.
        """
        if self.strategy == 'zero':
            return {'strategy': self.strategy}
        return self.imputer.get_params(deep)

    def set_params(self, **params):
        """Forward ``params`` to the wrapped imputer; ignored for ``'zero'``.

        Returns ``self``.
        """
        if self.strategy == 'zero':
            return self
        self.imputer.set_params(**params)
        return self

    def transform(self, X):
        """Impute ``X`` with zeros or with the already fitted imputer."""
        if self.strategy == 'zero':
            return self._fill_nan_with_zero(X)
        return self.imputer.transform(X)
# Maps the strategy names accepted by ``treat_nulls`` to the ``strategy``
# values understood by ``Imputer``.
_IMPUTER_STRATEGIES = {
    'average': 'mean',
    'median': 'median',
    'mode': 'most_frequent',
}


def treat_nulls(data_train, data_test, strategy=None):
    """Handle missing values feature by feature in the train and test sets.

    Args:
        data_train: Object whose ``features`` attribute holds the training
            table (assumed to be a pandas DataFrame, since ``dropna`` with
            ``subset``/``inplace`` and column indexing are used on it).
        data_test: Same kind of object for the test table.
        strategy: Mapping ``feature name -> treatment``.  ``'remove'`` drops
            the rows where the feature is missing; ``'average'``,
            ``'median'`` and ``'mode'`` impute with the mean, median or most
            frequent value learned from the training column.  Any other
            value falls back to ``Imputer``'s own default strategy.  ``None``
            (the default) means no feature is treated.

    Returns:
        The tuple ``(data_train, data_test)``; both objects are updated in
        place.
    """
    if strategy is None:
        strategy = {}

    for feature, method in strategy.items():
        if method == 'remove':
            data_train.features.dropna(subset=[feature], inplace=True)
            data_test.features.dropna(subset=[feature], inplace=True)
            continue

        imp = Imputer(missing_values='NaN', axis=0)
        if method in _IMPUTER_STRATEGIES:
            imp.set_params(strategy=_IMPUTER_STRATEGIES[method])

        # Imputer works on 2-D input, so select the feature as a one-column
        # table and flatten the result back into a column.
        imp.fit(data_train.features[[feature]])
        data_train.features[feature] = imp.transform(
            data_train.features[[feature]]).ravel()
        # The test set is imputed with statistics learned on the train set.
        data_test.features[feature] = imp.transform(
            data_test.features[[feature]]).ravel()

    return data_train, data_test
# In[ ]: print(data1.isnull().sum(), data2.isnull().sum()) # We can now use Imputer for Imputing Missing Data # In[ ]: from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder le = LabelEncoder() x_train['Embarked'] = x_train['Embarked'].fillna('$') x_train['Embarked'] = le.fit_transform(x_train['Embarked']) x_train['Cabin'] = le.fit_transform(x_train['Cabin']) imr = Imputer(missing_values=8, strategy='median', axis=0, copy=False) x_train[['Cabin']] = imr.fit_transform(x_train[['Cabin']]) imr.set_params(missing_values=np.nan, strategy='mean') x_train[['Age']] = imr.fit_transform(x_train[['Age']]) imr.set_params(missing_values=3, strategy='most_frequent') x_train[['Embarked']] = imr.fit_transform(x_train[['Embarked']]) ohe = OneHotEncoder(categorical_features=[1]) x_train['Sex'] = le.fit_transform(x_train['Sex']) print(x_train.head()) # In[ ]: fig, ax1 = plt.subplots(figsize=(10, 10)) sns.heatmap(data=x_train.corr(), annot=True, fmt='.1f', linewidths=.1) # In[ ]:
# 补齐缺失的参数 from sklearn.preprocessing import Imputer a = np.array([[0, 2, 4], [3, 2, 5], [10, 2, 4]]) np.vstack((a, [None] * 3)) im = Imputer() im.fit_transform(a) # 参数 class sklearn.preprocessing.Imputer(missing_values='NaN', # strategy='mean', axis=0, verbose=0, copy=True) im.set_params(strategy="mean") # 均值 im.set_params(strategy="median") # 中位数 im.set_params(strategy="most_frequent") # 最频繁 im.fit_transform(a) # missing_values : integer or “NaN”, optional (default=”NaN”) # The placeholder for the missing values. All occurrences of # missing_values will be imputed. For missing values encoded as np.nan, # use the string value “NaN”. # 行还是列推断 im.set_params(axis=1) a = a.T im.fit_transform(a) # verbose,控制详细程度