def indicate_missing(train_df, test_df):
    """Append a binary ``is_missing_<feature>`` column for each feature with NAs.

    Each indicator is fit on the concatenation of train and test so both
    frames end up with the same indicator columns.

    NOTE(review): ``cont_missing_features`` and ``cat_missing_features`` are
    module-level lists defined elsewhere in this file.
    """
    for feature in cont_missing_features + cat_missing_features:
        indicator = MissingIndicator(missing_values=np.nan)
        # fit on the union so transform is consistent across both frames
        combined = pd.concat([train_df, test_df])[[feature]]
        indicator.fit(combined)
        flag_column = "is_missing_" + feature
        train_df[flag_column] = indicator.transform(train_df[[feature]])
        test_df[flag_column] = indicator.transform(test_df[[feature]])
    return train_df, test_df
def data_missing_indicator(data_train, var_type_dict, data_test=None):
    """Derive missing-value indicator features.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Training set to transform.
    var_type_dict : dict
        Variable-type bookkeeping dict with keys ``numeric_var`` / ``category_var``.
    data_test : pandas.DataFrame, optional
        Test set; transformed only when provided.

    Returns
    -------
    data_train_completed, var_type_dict[, data_test_completed]
        The augmented training set, the updated type dict, and (when
        ``data_test`` is given) the augmented test set.
    """
    numeric_feature = var_type_dict.get('numeric_var', [])
    category_feature = var_type_dict.get('category_var', [])
    print('开始进行特缺失值标记变量衍生'.center(50, '='))
    # one indicator column per source feature
    source_features = numeric_feature + category_feature
    is_miss_feature = ['is_' + name + '_missing' for name in source_features]
    print('原始数据维度:', data_train.shape)
    print('新增数据维度:', len(is_miss_feature))
    check_unique(numeric_feature + is_miss_feature)
    # features='all' keeps one indicator per input column regardless of NAs
    miss_indicator = MissingIndicator(features='all')
    train_flags = miss_indicator.fit_transform(data_train[source_features])
    data_train_completed = pd.concat(
        [data_train, pd.DataFrame(train_flags, columns=is_miss_feature)], axis=1)
    print('变量衍生完成:', data_train_completed.shape)
    # the new boolean indicators are recorded as numeric variables
    var_type_dict['numeric_var'] = numeric_feature + is_miss_feature
    if data_test is not None:
        # apply the already-fitted indicator to the test set as well
        test_flags = miss_indicator.transform(data_test[source_features])
        data_test_completed = pd.concat(
            [data_test, pd.DataFrame(test_flags, columns=is_miss_feature)], axis=1)
        return data_train_completed, var_type_dict, data_test_completed
    return data_train_completed, var_type_dict
def test_missing_indicator_sparse_param(arr_type, missing_values, param_sparse):
    # Verify the output container format for each value of the `sparse` param.
    X_fit = arr_type(np.array([[missing_values, missing_values, 1],
                               [4, missing_values, 2]])).astype(np.float64)
    X_trans = arr_type(np.array([[missing_values, missing_values, 1],
                                 [4, 12, 10]])).astype(np.float64)

    indicator = MissingIndicator(missing_values=missing_values,
                                 sparse=param_sparse)
    mask_fit = indicator.fit_transform(X_fit)
    mask_trans = indicator.transform(X_trans)

    def _assert_csc(*masks):
        for mask in masks:
            assert mask.format == 'csc'

    def _assert_dense(*masks):
        for mask in masks:
            assert isinstance(mask, np.ndarray)

    if param_sparse is True:
        _assert_csc(mask_fit, mask_trans)
    elif param_sparse is False or (param_sparse == 'auto' and missing_values == 0):
        # explicit dense, or 'auto' forced dense by missing_values == 0
        _assert_dense(mask_fit, mask_trans)
    else:
        # 'auto' with NaN markers: output format follows the input format
        if sparse.issparse(X_fit):
            _assert_csc(mask_fit, mask_trans)
        else:
            _assert_dense(mask_fit, mask_trans)
class MissingIndicatorImpl:
    """Thin wrapper that builds the underlying sklearn estimator lazily at fit time."""

    def __init__(
        self,
        missing_values="nan",
        features="missing-only",
        sparse="auto",
        error_on_new=True,
    ):
        # Only record hyperparameters here; the wrapped model is created in fit().
        self._hyperparams = dict(
            missing_values=missing_values,
            features=features,
            sparse=sparse,
            error_on_new=error_on_new,
        )

    def fit(self, X, y=None):
        """Instantiate and fit the wrapped estimator; returns self."""
        self._wrapped_model = SKLModel(**self._hyperparams)
        # Forward y only when supplied (the wrapped estimator ignores it anyway).
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Delegate transform to the fitted wrapped estimator."""
        return self._wrapped_model.transform(X)
class MIAImputer(BaseEstimator, TransformerMixin):
    """Missingness-In-Attributes (MIA) imputation strategy.

    Duplicates the feature matrix, replacing each np.nan once with a large
    positive constant and once with a large negative constant, so downstream
    tree models can route missing values to either side of a split.

    Parameters
    ----------
    add_indicator : bool, default False
        When True, append binary missing-value indicator columns.
    fill_value : number, default 10**5
        Magnitude of the constant used for the +/- imputations.
        Bug fix: this parameter was previously accepted but ignored — both
        imputers hard-coded ``10**5`` regardless of the value passed.
    """

    def __init__(self, add_indicator=False, fill_value=10**5):
        self.add_indicator = add_indicator
        self.fill_value = fill_value
        # Use the supplied fill_value (previously ignored in favor of 10**5).
        self.simple_imputer_max = SimpleImputer(strategy='constant',
                                                fill_value=fill_value)
        self.simple_imputer_min = SimpleImputer(strategy='constant',
                                                fill_value=-fill_value)

    def fit(self, X, y=None):
        """Fit both constant imputers and, optionally, the missing indicator."""
        self.simple_imputer_max.fit(X, y)
        self.simple_imputer_min.fit(X, y)
        if self.add_indicator:
            # error_on_new=False: columns that gain missing values only at
            # transform time must not raise.
            self.indicator_ = MissingIndicator(missing_values=np.nan,
                                               error_on_new=False)
            self.indicator_.fit(X)
        return self

    def transform(self, X):
        """Return ``[X filled +fill_value | X filled -fill_value (| indicators)]``."""
        if self.add_indicator:
            # Compute indicators before X is rebound to the stacked result.
            X_trans_indicator = self.indicator_.transform(X)
        X_max = self.simple_imputer_max.transform(X)
        X_min = self.simple_imputer_min.transform(X)
        X = np.hstack((X_max, X_min))
        if self.add_indicator:
            X = np.hstack((X, X_trans_indicator))
        return X
def test_missing_indicator_raise_on_sparse_with_missing_0(arr_type):
    # Sparse input combined with missing_values == 0 must raise ValueError.
    missing_values = 0
    dense_fit = np.array([[missing_values, missing_values, 1],
                          [4, missing_values, 2]])
    dense_trans = np.array([[missing_values, missing_values, 1],
                            [4, 12, 10]])

    # Convert the dense arrays to the parametrized sparse container.
    sparse_fit = arr_type(dense_fit)
    sparse_trans = arr_type(dense_trans)

    indicator = MissingIndicator(missing_values=missing_values)

    with pytest.raises(ValueError, match="Sparse input with missing_values=0"):
        indicator.fit_transform(sparse_fit)

    # Dense fit succeeds; only the subsequent sparse transform must fail.
    indicator.fit_transform(dense_fit)
    with pytest.raises(ValueError, match="Sparse input with missing_values=0"):
        indicator.transform(sparse_trans)
def test_missing_indicator_new(missing_values, arr_type, dtype, param_features,
                               n_features, features_indices):
    X_fit = np.array([[missing_values, missing_values, 1],
                      [4, missing_values, 2]])
    X_trans = np.array([[missing_values, missing_values, 1],
                        [4, 12, 10]])
    X_fit_expected = np.array([[1, 1, 0], [0, 1, 0]])
    X_trans_expected = np.array([[1, 1, 0], [0, 0, 0]])

    # Coerce inputs and expectations to the parametrized container and dtype.
    X_fit = arr_type(X_fit).astype(dtype)
    X_trans = arr_type(X_trans).astype(dtype)
    X_fit_expected = X_fit_expected.astype(dtype)
    X_trans_expected = X_trans_expected.astype(dtype)

    indicator = MissingIndicator(missing_values=missing_values,
                                 features=param_features, sparse=False)
    mask_fit = indicator.fit_transform(X_fit)
    mask_trans = indicator.transform(X_trans)

    # Dense output: check shape, selected features, values, dtype, container.
    assert mask_fit.shape[1] == n_features
    assert mask_trans.shape[1] == n_features
    assert_array_equal(indicator.features_, features_indices)
    assert_allclose(mask_fit, X_fit_expected[:, features_indices])
    assert_allclose(mask_trans, X_trans_expected[:, features_indices])
    assert mask_fit.dtype == bool
    assert mask_trans.dtype == bool
    assert isinstance(mask_fit, np.ndarray)
    assert isinstance(mask_trans, np.ndarray)

    # Sparse output must be CSC and agree with the dense masks.
    indicator.set_params(sparse=True)
    mask_fit_sp = indicator.fit_transform(X_fit)
    mask_trans_sp = indicator.transform(X_trans)
    assert mask_fit_sp.dtype == bool
    assert mask_trans_sp.dtype == bool
    assert mask_fit_sp.format == 'csc'
    assert mask_trans_sp.format == 'csc'
    assert_allclose(mask_fit_sp.toarray(), mask_fit)
    assert_allclose(mask_trans_sp.toarray(), mask_trans)
def test_missing_indicator_new(missing_values, arr_type, dtype, param_features,
                               n_features, features_indices):
    # Fixture data: second row has the missing marker in the last column.
    fit_data = np.array([[missing_values, missing_values, 1],
                         [4, 2, missing_values]])
    trans_data = np.array([[missing_values, missing_values, 1],
                           [4, 12, 10]])
    expected_fit = np.array([[1, 1, 0], [0, 0, 1]])
    expected_trans = np.array([[1, 1, 0], [0, 0, 0]])

    # Convert to the parametrized array container and dtype.
    fit_data = arr_type(fit_data).astype(dtype)
    trans_data = arr_type(trans_data).astype(dtype)
    expected_fit = expected_fit.astype(dtype)
    expected_trans = expected_trans.astype(dtype)

    indicator = MissingIndicator(missing_values=missing_values,
                                 features=param_features, sparse=False)
    fit_mask = indicator.fit_transform(fit_data)
    trans_mask = indicator.transform(trans_data)

    # Dense path: shapes, selected feature indices, values, dtype, container.
    assert fit_mask.shape[1] == n_features
    assert trans_mask.shape[1] == n_features
    assert_array_equal(indicator.features_, features_indices)
    assert_allclose(fit_mask, expected_fit[:, features_indices])
    assert_allclose(trans_mask, expected_trans[:, features_indices])
    assert fit_mask.dtype == bool
    assert trans_mask.dtype == bool
    assert isinstance(fit_mask, np.ndarray)
    assert isinstance(trans_mask, np.ndarray)

    # Sparse path: CSC output that round-trips to the dense masks.
    indicator.set_params(sparse=True)
    fit_mask_sparse = indicator.fit_transform(fit_data)
    trans_mask_sparse = indicator.transform(trans_data)
    assert fit_mask_sparse.dtype == bool
    assert trans_mask_sparse.dtype == bool
    assert fit_mask_sparse.format == 'csc'
    assert trans_mask_sparse.format == 'csc'
    assert_allclose(fit_mask_sparse.toarray(), fit_mask)
    assert_allclose(trans_mask_sparse.toarray(), trans_mask)
class _MissingIndicatorImpl:
    """Delegating wrapper: constructs an ``Op`` estimator eagerly and forwards calls."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        # Unlike lazy wrappers, the wrapped model is built at construction time.
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped estimator, forwarding y only when provided."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        """Forward transform to the wrapped estimator."""
        return self._wrapped_model.transform(X)
class MissingIndicatorComponent(AutoSklearnPreprocessingAlgorithm):
    """auto-sklearn preprocessing component wrapping sklearn's MissingIndicator."""

    def __init__(self, missing_values=np.nan, features: str = "missing-only",
                 random_state=None):
        super().__init__()
        self.features = features
        self.missing_values = missing_values
        self.random_state = random_state

    def fit(self, X, Y=None):
        """Build and fit the wrapped MissingIndicator; returns self."""
        # Imported locally to keep sklearn off the module import path.
        from sklearn.impute import MissingIndicator
        self.preprocessor = MissingIndicator(
            missing_values=self.missing_values, features=self.features)
        self.preprocessor.fit(X, Y)
        return self

    def transform(self, X):
        """Transform X; raises if fit() has not been called."""
        if self.preprocessor is None:
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Static capability description consumed by the auto-sklearn framework."""
        return {
            'shortname': 'MissingIndicator',
            'name': 'Missing Indicator',
            'handles_regression': True,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'handles_multioutput': True,
            'is_deterministic': True,
            'input': (DENSE, UNSIGNED_DATA),
            'output': (INPUT,),
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Search space: only the `features` hyperparameter is tunable."""
        config_space = ConfigurationSpace()
        config_space.add_hyperparameters([
            CategoricalHyperparameter("features", ["missing-only", "all"],
                                      default_value="missing-only"),
        ])
        return config_space
class MissingIndicatorImpl():
    """Hyperparameter-holding wrapper; the sklearn model is instantiated in fit()."""

    def __init__(self, missing_values='nan', features='missing-only',
                 sparse='auto', error_on_new=True):
        # Keep the constructor side-effect free: just remember the settings.
        self._hyperparams = {
            'missing_values': missing_values,
            'features': features,
            'sparse': sparse,
            'error_on_new': error_on_new,
        }

    def fit(self, X, y=None):
        """Create the wrapped model from stored hyperparameters and fit it."""
        self._wrapped_model = SKLModel(**self._hyperparams)
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        """Delegate to the fitted wrapped model."""
        return self._wrapped_model.transform(X)
class SimulationDataScaler(object):
    """Winsorize, standard-scale, and constant-impute simulation data,
    additionally producing a missing-value indicator matrix.

    Bug fix: the original used ``raise "message"`` — in Python 3 raising a
    string is itself a TypeError ("exceptions must derive from BaseException"),
    so the intended message was never shown. Replaced with ``RuntimeError``
    carrying the same message text.

    Parameters
    ----------
    l, u : float
        Lower/upper winsorization quantiles (ignored when winsorize=False).
    fill_value : float
        Constant used to impute missing values after scaling.
    winsorize : bool
        When False, self.l/self.u are reset to 0.0/1.0 (no clipping).
    return_df : bool
        When True, transform() returns pandas DataFrames instead of arrays.
    """

    def __init__(self, l=0.0, u=1.0, fill_value=0.0, winsorize=False,
                 return_df=True):
        self.l = l
        self.u = u
        self.columns = []
        self.is_fit = False
        self.return_df = return_df
        if not winsorize:
            # Disable clipping by collapsing the stored bounds.
            # NOTE(review): the Winsorizer below still receives the original
            # l/u — preserved as-is; confirm whether that is intentional.
            self.l = 0.0
            self.u = 1.0
        self.winsorize = winsorize
        self.winsorizor = Winsorizer(l=l, u=u)
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='constant',
                                     fill_value=fill_value,
                                     add_indicator=False)
        self.indicator_imputer = MissingIndicator(features="all")

    def fit(self, X):
        """Fit winsorizer, indicator, scaler, and imputer on X.

        NOTE: mutates X in place — non-finite entries are replaced with NaN.
        """
        X[~np.isfinite(X)] = np.nan
        self.is_fit = True
        if type(X) is pd.DataFrame:
            self.columns = list(X.columns)
        X_w = self.winsorizor.fit_transform(X)
        self.indicator_imputer.fit(X)
        # Scaler and imputer are fit on the winsorized data.
        self.scaler.fit(X_w)
        self.imputer.fit(X_w)
        return self

    def transform(self, X):
        """Return (scaled+imputed data, missing indicators); mutates X in place."""
        if not self.is_fit:
            # Fixed: string raise was a TypeError in Python 3.
            raise RuntimeError("Please fit the before running")
        X[~np.isfinite(X)] = np.nan
        X_w = self.winsorizor.transform(X)
        X_imp_ind = self.indicator_imputer.transform(X)
        X_w_s = self.scaler.transform(X_w)
        X_w_s_i = self.imputer.transform(X_w_s)
        if self.return_df:
            return pd.DataFrame(X_w_s_i, columns=self.columns), pd.DataFrame(
                X_imp_ind, columns=self.columns)
        else:
            return X_w_s_i, X_imp_ind

    def fit_transform(self, X):
        """Convenience: fit on X, then transform the same X."""
        self.fit(X)
        return self.transform(X)

    def inverse_transform(self, X_w_s_i):
        """Undo only the scaling step (winsorization/imputation are lossy)."""
        if not self.is_fit:
            # Fixed: string raise was a TypeError in Python 3.
            raise RuntimeError("Please fit the before running")
        X_w_i = self.scaler.inverse_transform(X_w_s_i)
        if self.return_df:
            return pd.DataFrame(X_w_i, columns=self.columns)
        else:
            return X_w_i
# Exploratory script: inspect the data, then derive *_na_ind indicator
# columns for every column containing NAs, for both train and test.
# NOTE(review): `train`, `test`, and `data_info` are defined earlier in the
# file / notebook — not visible here.
data_info(train)
data_info(test)

###checking event rate
from collections import Counter
Counter(train.target)
train.target.value_counts(normalize=True)

##creating NA indicator for all the columns containing NAs
# error_on_new=False: columns that only have NAs in test must not raise.
mindicator = MissingIndicator(missing_values=np.nan, error_on_new=False)
z = mindicator.fit_transform(train.drop('target', axis=1))
# mindicator.features_ indexes the columns (of train minus target) that had NAs
cols_na_ind = [x + '_na_ind' for x in train.columns[mindicator.features_]]
# 1*z converts the boolean mask to 0/1 integers before concatenation
train = pd.concat([train, pd.DataFrame(1 * z, columns=cols_na_ind)], axis=1)
train.head(1)

# Same indicator applied to test.
# NOTE(review): features_ are positional indices into the fit-time frame
# (train without 'target'); indexing test.columns with them assumes test has
# the same column order and no 'target' column — confirm.
z = mindicator.transform(test)
cols_na_ind = [x + '_na_ind' for x in test.columns[mindicator.features_]]
test = pd.concat([test, pd.DataFrame(1 * z, columns=cols_na_ind)], axis=1)
test.head(1)

## Treating Null values
# Per-variable NA exploration: value counts and target cross-tabs.
var = 'gender'
#f'count of NULLs in {var} : {train[[var]].isna().sum()[0]}'
train[var].value_counts(dropna=False, normalize=True)
pd.crosstab(index=train[var].fillna('Nan'),
            columns=train.target, margins=True, normalize='index',)

var = 'enrolled_university'
train[var].value_counts(dropna=False, normalize=True)
pd.crosstab(index=train[var].fillna('Nan'),
            columns=train.target, margins=True, normalize='index',)

var = 'education_level'
# Demo script: SimpleImputer mean imputation, MissingIndicator in both
# 'missing-only' (default) and 'all' modes, then a Binarizer example.
# Bare expressions produce output in a notebook / REPL context.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_data = [[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]
imp_mean.fit(imp_data)
# per-column means learned during fit
imp_mean.statistics_
X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
imp_mean.get_params()
imp_mean.transform(X)

from sklearn.impute import MissingIndicator
X1 = np.array([[np.nan, 1, 3], [4, 0, np.nan], [8, 1, 0]])
X2 = np.array([[5, 1, np.nan], [np.nan, 2, 3], [2, 4, 0]])
# default features='missing-only': indicator columns only for features
# that contained NaN at fit time
indicator = MissingIndicator()
indicator.fit(X1)
indicator.features_
X1
indicator.transform(X1)
X2
indicator.transform(X2)

# features='all': one indicator column per input feature
indicator_all = MissingIndicator(features='all')
indicator_all.fit_transform(X1)
indicator_all.fit_transform(X2)
indicator_all.features_

from sklearn.preprocessing import Binarizer
X = [[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]]
# default threshold=0.0: values > 0 map to 1, the rest to 0
transformer = Binarizer()
type(transformer)
transformer.fit(X)
transformer.transform(X)
class RobustMissingIndicator(BaseEstimator, TransformerMixin):
    """Binary indicators for missing values.

    Note that this component typically should not be used in a vanilla
    :class:`sklearn.pipeline.Pipeline` consisting of transformers and a
    classifier, but rather could be added using a
    :class:`sklearn.pipeline.FeatureUnion` or
    :class:`sklearn.compose.ColumnTransformer`.

    Similar to sklearn.impute.MissingIndicator with added functionality -
    RobustMissingIndicator uses a custom mask_function to determine the
    boolean mask. The default mask_function is
    sagemaker_sklearn_extension.impute.is_finite_numeric which checks
    whether or not a value can be converted into a float.

    Parameters
    ----------
    features : str, optional (default="all")
        Whether the imputer mask should represent all or a subset of features.
        - If "missing-only", the imputer mask will only represent features
          containing missing values during fit time.
        - If "all" (default), the imputer mask will represent all features.

    error_on_new : boolean, optional (default=True)
        If True (default), transform will raise an error when there are
        features with missing values in transform that have no missing values
        in fit. This is applicable only when ``features="missing-only"``.

    mask_function : callable -> np.array, dtype('bool') (default=None)
        A vectorized python function, accepts np.array, returns np.array
        with dtype('bool'). For each value, if mask_function(val) == False,
        that value will be imputed. mask_function is used to create a boolean
        mask that determines which values in the input to impute. Use
        np.vectorize to vectorize singular python functions. By default,
        mask_function will be sagemaker_sklearn_extension.impute.is_finite_numeric

    Notes
    -----
    only accepts 2D, non-sparse inputs
    """

    def __init__(self, features="all", error_on_new=True, mask_function=None):
        self.features = features
        self.error_on_new = error_on_new
        self.mask_function = mask_function

    def _validate_input(self, X):
        # Reject complex-valued arrays explicitly; check_array would not.
        if hasattr(X, "dtype") and X.dtype is not None and hasattr(
                X.dtype, "kind") and X.dtype.kind == "c":
            raise ValueError("Complex data not supported\n{}\n".format(X))
        # dtype=object + force_all_finite=False: accept mixed/str data and
        # NaN/inf; copy=True so later in-place masking never touches the caller's array.
        return check_array(X, dtype=np.dtype("O"), copy=True,
                           force_all_finite=False, ensure_2d=True)

    def fit(self, X, y=None):
        """Fit the transformer on X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        self : RobustMissingIndicator
        """
        X = self._validate_input(X)
        # Default mask: values not convertible to a finite float are "missing".
        self.vectorized_mask_function_ = self.mask_function or is_finite_numeric
        X = _apply_mask(X, _get_mask(X, self.vectorized_mask_function_))
        self.missing_indicator_ = MissingIndicator(
            features=self.features, error_on_new=self.error_on_new)
        self.missing_indicator_.fit(X)
        return self

    def transform(self, X):
        """Generate missing values indicator for X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        Xt : {ndarray}, shape (n_samples, n_features)
            The missing indicator for input data. The data type of ``Xt``
            will be boolean.
        """
        check_is_fitted(self, ["missing_indicator_",
                               "vectorized_mask_function_"])
        X = self._validate_input(X)
        # Apply the same masking as fit so the wrapped indicator sees
        # consistent missing-value markers.
        X = _apply_mask(X, _get_mask(X, self.vectorized_mask_function_))
        return self.missing_indicator_.transform(X)

    def _more_tags(self):
        # Tell sklearn's estimator checks that NaN inputs are acceptable.
        return {"allow_nan": True}
#print(indicator) indicator = pd.DataFrame( indicator, columns=['m1', 'm3']) # The only two columns in which missing values are print(indicator) # MissingIndicator - more in depth import numpy as np from sklearn.impute import MissingIndicator X1 = np.array([[np.nan, 1, 3], [4, 0, np.nan], [8, 1, 0]]) X2 = np.array([[5, 1, np.nan], [np.nan, 2, 3], [2, 4, 0]]) indicator = MissingIndicator() indicator.fit(X1) # Creates the possible indicator columns (i.e., not all) X2_tr = indicator.transform(X2) X1_tr = indicator.transform(X1) print('X2_tr') print(X2_tr) print('X1_tr') print(X1_tr) ##### # Inputation ##### from sklearn.impute import SimpleImputer imp = SimpleImputer(missing_values=np.nan, strategy='mean') Y = imp.fit_transform(X) print(Y)