def transform(self, X):
    """Perform imputation using interpolation.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_timestamps)
        Data with missing values.

    Returns
    -------
    X_new : array-like, shape = (n_samples, n_timestamps)
        Data without missing values.

    """
    missing_values, force_all_finite = self._check_params()
    X = check_array(X, dtype='float64', force_all_finite=force_all_finite)
    n_samples, n_timestamps = X.shape

    # Boolean mask of the observed (non-missing) entries, one row per sample.
    indicator = MissingIndicator(
        missing_values=missing_values, features='all', sparse=False,
    )
    observed_mask = ~indicator.fit_transform(X)

    # Interpolate each sample over the full timestamp grid.
    timestamps = np.arange(n_timestamps)
    imputed_rows = [
        self._impute_one_sample(row, row_mask, timestamps)
        for row, row_mask in zip(X, observed_mask)
    ]
    return np.asarray(imputed_rows)
def test_missing_indicator_float_inputs_isnan_false_tvm(self):
    """TVM conversion of MissingIndicator on float32 data with missing_values=0."""
    data = np.array([[1, 2], [0, 3], [7, 6]], dtype=np.float32)
    for feature_mode in ["all", "missing-only"]:
        model = MissingIndicator(features=feature_mode, missing_values=0)
        model.fit(data)
        self._test_sklearn_missing_indic(model, data, "tvm")
def impute_dynamic_dataframe(self, df_dynamic):
    """Impute the dynamic feature columns of ``df_dynamic``.

    Runs of missing values shorter than ``self.missing_gap_thresh`` time
    steps are filled by interpolation (plus a backfill for leading NaNs);
    longer gaps are left missing and later zero-filled.  A binary
    ``if_<feat>`` column is added per feature, marking where the feature
    was actually measured (1) vs. missing (0).  Columns are finally
    reordered to ['index', 'pid', 'ts'] + features + indicator flags.
    """
    # interpolate if gap is less than 10 timestep
    # NOTE(review): the threshold actually applied is
    # self.missing_gap_thresh, not the literal 10 in the comment above.
    mask = df_dynamic[self.feat_name].copy()
    df_dynamic_imp = df_dynamic.copy()
    for column in self.feat_name:
        df = pd.DataFrame(df_dynamic_imp[column])
        # 'new' increments at every boundary between NaN and non-NaN, so
        # each consecutive run of (non-)missing values gets its own id.
        df['new'] = ((df.notnull() != df.shift().notnull()).cumsum())
        df['ones'] = 1
        # Keep a position if its run is shorter than the gap threshold,
        # or if the value was observed in the first place.
        mask[column] = (
            df.groupby('new')['ones'].transform('count') <
            self.missing_gap_thresh) | df_dynamic_imp[column].notnull()
    # Interpolate + backfill everywhere, then keep only mask-allowed
    # positions: long gaps revert to NaN.
    df_dynamic_imp[self.feat_name] = df_dynamic_imp[
        self.feat_name].interpolate().bfill()[mask]
    # add dummy variables: if_<feat> = 1 where the value was measured
    indicator = MissingIndicator(missing_values=np.nan, features='all')
    X = df_dynamic_imp[self.feat_name].values
    if_missing = indicator.fit_transform(X)
    if_measured = 1 - if_missing.astype(int)
    dummy_names = []
    for ind, feat in enumerate(self.feat_name):
        dummy_name = 'if_' + feat
        df_dynamic_imp[dummy_name] = if_measured[:, ind]
        dummy_names.append(dummy_name)
    # impute missing invasive variables with 0 and add column "index"
    df_dynamic_imp = df_dynamic_imp.fillna(value=0)
    df_dynamic_imp = df_dynamic_imp.reindex(['index', 'pid', 'ts'] +
                                            self.feat_name + dummy_names,
                                            axis=1)
    return df_dynamic_imp
class MIAImputer(BaseEstimator, TransformerMixin):
    """MIA (Missing Incorporated in Attributes) imputation strategy.

    Each column is duplicated: missing entries are replaced once by a large
    positive constant (+fill_value) and once by a large negative constant
    (-fill_value), so a tree-based learner can route missing values to
    either side of a split.  Optionally appends MissingIndicator columns.

    Parameters
    ----------
    add_indicator : bool, default=False
        If True, append the boolean missingness mask to the output.
    fill_value : int or float, default=10**5
        Magnitude of the constant used for the +/- imputations.
    """

    def __init__(self, add_indicator=False, fill_value=10**5):
        self.add_indicator = add_indicator
        # FIX: the original accepted `fill_value` but ignored it and
        # hard-coded 10**5 in both imputers.
        self.fill_value = fill_value
        self.simple_imputer_max = SimpleImputer(strategy='constant',
                                                fill_value=fill_value)
        self.simple_imputer_min = SimpleImputer(strategy='constant',
                                                fill_value=-fill_value)

    def fit(self, X, y=None):
        """Fit both constant imputers and, if requested, the indicator."""
        self.simple_imputer_max.fit(X, y)
        self.simple_imputer_min.fit(X, y)
        if self.add_indicator:
            self.indicator_ = MissingIndicator(missing_values=np.nan,
                                               error_on_new=False)
            self.indicator_.fit(X)
        return self

    def transform(self, X):
        """Return [X_max | X_min] (plus the indicator mask when enabled)."""
        if self.add_indicator:
            X_trans_indicator = self.indicator_.transform(X)
        X_max = self.simple_imputer_max.transform(X)
        X_min = self.simple_imputer_min.transform(X)
        X = np.hstack((X_max, X_min))
        if self.add_indicator:
            X = np.hstack((X, X_trans_indicator))
        return X
def data_missing_indicator(data_train,var_type_dict,data_test=None):
    '''
    Derive missing-value indicator variables.

    data_train: training set to transform
    var_type_dict: dict of variable metadata ('numeric_var' /
        'category_var' lists)
    data_test: optional test set; when omitted, no test transform is done

    return:
        data_train_completed: training set with the derived columns
        var_type_dict: updated variable-metadata dict
        data_test_completed: transformed test set (only when data_test
        is given)
    '''
    numeric_feature = var_type_dict.get('numeric_var',[])
    category_feature = var_type_dict.get('category_var',[])
    print('开始进行特缺失值标记变量衍生'.center(50, '='))
    ## build one 'is_<feat>_missing' name per numeric/categorical feature
    is_miss_feature = ['is_'+i+'_missing' for i in numeric_feature+category_feature]
    print('原始数据维度:',data_train.shape)
    print('新增数据维度:',len(is_miss_feature))
    check_unique(numeric_feature+is_miss_feature)
    ## flag the missingness of every numeric and categorical column
    miss_indicator = MissingIndicator(features='all')
    data_train_completed = miss_indicator.fit_transform(data_train[numeric_feature+category_feature])
    data_train_completed = pd.concat([data_train,pd.DataFrame(data_train_completed,columns=is_miss_feature)],axis=1)
    print('变量衍生完成:',data_train_completed.shape)
    ## update var_type_dict: all indicator columns go into numeric_var
    var_type_dict['numeric_var'] = numeric_feature+is_miss_feature
    ## if test data was supplied, transform it too and return all three
    if data_test is not None:
        data_test_completed = miss_indicator.transform(data_test[numeric_feature+category_feature])
        data_test_completed = pd.concat([data_test,pd.DataFrame(data_test_completed,columns=is_miss_feature)],axis=1)
        return data_train_completed,var_type_dict,data_test_completed
    return data_train_completed,var_type_dict
def test_missing_indicator_sparse_param(arr_type, missing_values, param_sparse):
    """Output container type must follow `sparse` (and input type for 'auto')."""
    X_fit = arr_type(np.array([[missing_values, missing_values, 1],
                               [4, missing_values, 2]])).astype(np.float64)
    X_trans = arr_type(np.array([[missing_values, missing_values, 1],
                                 [4, 12, 10]])).astype(np.float64)

    indicator = MissingIndicator(missing_values=missing_values,
                                 sparse=param_sparse)
    masks = [indicator.fit_transform(X_fit), indicator.transform(X_trans)]

    if param_sparse is True:
        for mask in masks:
            assert mask.format == 'csc'
    elif param_sparse is False or (param_sparse == 'auto'
                                   and missing_values == 0):
        # Explicit False, and 'auto' with missing_values=0, are both dense.
        for mask in masks:
            assert isinstance(mask, np.ndarray)
    else:
        # 'auto' with non-zero missing values mirrors the input container.
        if sparse.issparse(X_fit):
            for mask in masks:
                assert mask.format == 'csc'
        else:
            for mask in masks:
                assert isinstance(mask, np.ndarray)
class MissingIndicatorImpl:
    """Wrapper that defers constructing the sklearn MissingIndicator to fit()."""

    def __init__(
        self,
        missing_values="nan",
        features="missing-only",
        sparse="auto",
        error_on_new=True,
    ):
        # Hyperparameters are only stored here; the model is built in fit().
        self._hyperparams = dict(
            missing_values=missing_values,
            features=features,
            sparse=sparse,
            error_on_new=error_on_new,
        )

    def fit(self, X, y=None):
        """Instantiate the wrapped model from the hyperparameters and fit it."""
        self._wrapped_model = SKLModel(**self._hyperparams)
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        """Delegate to the fitted wrapped model."""
        return self._wrapped_model.transform(X)
def impute_data(X, feature_name_in):
    """Impute numeric data.

    String columns containing 'na' are converted to NaN, a binary
    '<feature>_miss' indicator column is added for every feature that has
    at least one missing value, and remaining NaNs are mean-imputed.

    Parameters
    ----------
    X : pandas DataFrame with columns ``feature_name_in``.
    feature_name_in : list of str, names of the feature columns.

    Returns
    -------
    (X_out, feature_name_out) : imputed array with the surviving indicator
        columns appended, and the correspondingly extended name list.
    """
    to_replace_dict = {'na': np.nan}
    for i in feature_name_in:
        na_cnt = 0
        if pd.api.types.is_string_dtype(X[i]):
            na_cnt = X[i].str.contains('na').sum()
        if na_cnt > 0:
            # FIX: replace on the single column.  The original assigned the
            # result of replacing on the WHOLE DataFrame to one column
            # (`X[i] = X.replace(...)`), which pandas rejects.
            X[i] = X[i].replace(to_replace=to_replace_dict, value=None)
    indicator = MissingIndicator(error_on_new=True, features='all',
                                 missing_values=np.nan, sparse=False)
    X_binary_miss = indicator.fit_transform(X).astype(int)
    X_binary_miss_sum = np.sum(X_binary_miss, axis=0)
    feature_name_out = feature_name_in.copy()
    to_del = []
    # Keep an indicator column only for features that actually have misses.
    for i in range(0, len(X_binary_miss_sum)):
        if X_binary_miss_sum[i] > 0:
            feature_name_out.append(feature_name_in[i] + "_miss")
        else:
            to_del.append(i)
    X_binary_miss = np.delete(X_binary_miss, to_del, axis=1)
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit(X)
    X_tr = imp.transform(X)
    X_out = np.concatenate((X_tr, X_binary_miss), axis=1)
    return X_out, feature_name_out
def fit(self, X, y=None):
    """Fit both constant imputers and, when enabled, the missing indicator."""
    for imputer in (self.simple_imputer_max, self.simple_imputer_min):
        imputer.fit(X, y)
    if self.add_indicator:
        # MissingIndicator.fit returns the indicator itself.
        self.indicator_ = MissingIndicator(
            missing_values=np.nan, error_on_new=False).fit(X)
    return self
def test_missing_indicator_float_inputs(self):
    """torch / torch.jit conversion of MissingIndicator on float data with NaNs."""
    data = np.array([[1, 2], [np.nan, 3], [7, 6]], dtype=np.float32)
    for feature_mode in ["all", "missing-only"]:
        model = MissingIndicator(features=feature_mode)
        model.fit(data)
        for backend in ["torch", "torch.jit"]:
            self._test_sklearn_missing_indic(model, data, backend)
def test_missing_indicator_no_missing():
    """With features='missing-only' and no missing values, every column is
    dropped from the output (#13491)."""
    X = np.ones((2, 2), dtype=int)
    indicator = MissingIndicator(features='missing-only', missing_values=-1)
    mask = indicator.fit_transform(X)
    assert mask.shape[1] == 0
def test_missing_indicator_sparse_no_explicit_zeros():
    """Non-missing entries must not appear as explicit zeros in the sparse
    mask produced for sparse X (#13491): nnz equals the number of Trues."""
    X = sparse.csr_matrix([[0, 1, 2],
                           [1, 2, 0],
                           [2, 0, 1]])
    mask = MissingIndicator(features='all', missing_values=1).fit_transform(X)
    assert mask.sum() == mask.getnnz()
def get_indicators(data):
    """Return a DataFrame of NaN-missingness flags for all but the last
    column of `data`, with 1-based, 'ind_'-prefixed column names."""
    flags = MissingIndicator(missing_values=np.nan,
                             features='all').fit_transform(data.iloc[:, :-1])
    indicators = pd.DataFrame(flags)
    # Shift to 1-based column numbering before prefixing.
    indicators.columns = indicators.columns + 1
    return indicators.add_prefix('ind_')
def get_results(dataset):
    """Compare regression scores under four missing-data strategies.

    Returns a 4-tuple of (mean, std) pairs of cross-validated
    neg-mean-squared-error for: full data, zero imputation, mean
    imputation + indicator, and chained imputation + indicator.
    Relies on the module-level `rng` for reproducible corruption.
    """
    X_full, y_full = dataset.data, dataset.target
    n_samples = X_full.shape[0]
    n_features = X_full.shape[1]

    # Estimate the score on the entire dataset, with no missing values
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    full_scores = cross_val_score(estimator, X_full, y_full,
                                  scoring='neg_mean_squared_error')

    # Add missing values in 75% of the lines
    missing_rate = 0.75
    n_missing_samples = int(np.floor(n_samples * missing_rate))
    # FIX: np.bool was deprecated and removed in NumPy 1.24; the builtin
    # bool is the correct dtype alias.
    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                          dtype=bool),
                                 np.ones(n_missing_samples,
                                         dtype=bool)))
    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)

    # Estimate the score after replacing missing values by 0
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    zero_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after imputation (mean strategy) of the missing values
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()
    estimator = make_pipeline(
        make_union(SimpleImputer(missing_values=0, strategy="mean"),
                   MissingIndicator(missing_values=0)),
        RandomForestRegressor(random_state=0, n_estimators=100))
    mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after chained imputation of the missing values
    estimator = make_pipeline(
        make_union(ChainedImputer(missing_values=0, random_state=0),
                   MissingIndicator(missing_values=0)),
        RandomForestRegressor(random_state=0, n_estimators=100))
    chained_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                            scoring='neg_mean_squared_error')

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (chained_impute_scores.mean(), chained_impute_scores.std()))
class _MissingIndicatorImpl:
    """Thin wrapper building an Op from arbitrary hyperparameters at init."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped operator, forwarding y only when supplied."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Delegate transformation to the wrapped operator."""
        return self._wrapped_model.transform(X)
def encode_with_labels_and_impute(df, strategy='mean') -> pd.DataFrame:
    """ Encode with simple labels and impute mean (of labels) in NaNs"""
    # Remember where the original NaNs were before any imputation.
    original_nan_mask = MissingIndicator(features='all').fit_transform(df)
    df = impute(df, strategy='constant')  # impute dummy str NaNs
    df = encode_with_labels(df)
    # Restore the real np.nan at every originally-missing position.
    for col_idx in range(df.shape[1]):
        df.iloc[original_nan_mask[:, col_idx], col_idx] = np.nan
    return impute(df, strategy=strategy)
def numeric_feature_pipeline(X, numeric_features, binarize=True,
                             binarize_cutoff=0.5):
    """
    Define a numeric feature processing pipeline.

    Parameters
    ------------
    X: {array-like} pd.DataFrame or numpy.ndarray. Shape {observations} x {features}
    numeric_features: {list of str} column names in X representing
        continuously-valued features
    binarize: {bool} should high-missingness features be binarized with
        MissingIndicator?
    binarize_cutoff: {float in [0, 1]} threshold above which we create a
        binary variable for each feature with a missingness rate >
        binarize_cutoff

    Returns
    ------------
    sklearn.pipeline object
    """
    # Base processor: median imputation followed by standardization.
    median_imputer = ColumnTransformer(
        [('impute_num', SimpleImputer(strategy='median'), numeric_features)])
    pipeline = make_pipeline(median_imputer, StandardScaler())

    if binarize:
        # Binarize only the features whose missingness exceeds the cutoff.
        missing_rates = rank_missingness(X[numeric_features])
        high_missing = missing_rates[
            missing_rates > binarize_cutoff].index.tolist()
        binarizer = ColumnTransformer(
            [('missing_num_binarizer', MissingIndicator(), high_missing)])
        # Column-concatenate the scaled features with the missingness flags.
        pipeline = FeatureUnion([('num_pipeline', pipeline),
                                 ('missing_num_binarizer', binarizer)])

    return pipeline
def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp):
    """A union of SimpleImputer and MissingIndicator must stack the imputed
    values next to the missingness mask."""
    union = make_union(
        SimpleImputer(missing_values=missing_values,
                      strategy='most_frequent'),
        MissingIndicator(missing_values=missing_values),
    )
    assert_array_equal(union.fit_transform(X), X_trans_exp)
def example():
    """Demonstrate SimpleImputer alone, then a FeatureUnion of imputer +
    MissingIndicator feeding a DecisionTreeClassifier on corrupted iris."""
    import numpy as np
    from sklearn.impute import SimpleImputer

    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit([[1, 2], [np.nan, 3], [7, 6]])
    X = [[np.nan, 2], [6, np.nan], [7, 6]]
    print(imp.transform(X))

    ######################################
    from sklearn.datasets import load_iris
    from sklearn.impute import SimpleImputer, MissingIndicator
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import FeatureUnion, make_pipeline
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)
    # FIX: np.bool was deprecated and removed in NumPy 1.24; use the
    # builtin bool dtype instead.
    mask = np.random.randint(0, 2, size=X.shape).astype(bool)
    X[mask] = np.nan
    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=100,
                                                   random_state=0)
    # Impute and append the missingness mask as extra features.
    transformer = FeatureUnion(
        transformer_list=[('features', SimpleImputer(strategy='mean')),
                          ('indicators', MissingIndicator())])
    transformer = transformer.fit(X_train, y_train)
    results = transformer.transform(X_test)
    print(results.shape)
    clf = make_pipeline(transformer, DecisionTreeClassifier())
    clf = clf.fit(X_train, y_train)
    results = clf.predict(X_test)
    print(results.shape)
def fit_transform_missing_indicator(input_data: pd.DataFrame, db_name: str,
                                    sql: None) -> pd.DataFrame:
    """Append 0/1 missingness columns to `input_data` and record the new
    feature names in the `features` table of the SQLite database.

    Parameters
    ----------
    input_data : frame whose missingness is flagged.
    db_name : path of the SQLite database to record feature names in.
    sql : unused; kept for interface compatibility.
        NOTE(review): the `: None` annotation looks wrong -- presumably
        this should be an optional SQL string; confirm against callers.
    """
    indicator = MissingIndicator()
    mask = indicator.fit_transform(input_data)
    missing_features = [
        f"missing_{input_data.columns[ii]}" for ii in list(indicator.features_)
    ]
    missing_indicator_df = pd.DataFrame(mask, columns=missing_features)
    # FIX: the original called .replace({True: 1, False: 0}) and discarded
    # the result (a no-op).  Convert the boolean flags to integers and keep
    # the result.
    missing_indicator_df[missing_features] = missing_indicator_df[
        missing_features].astype(int)
    with sqlite3.connect(db_name) as conn:
        query = "INSERT INTO features VALUES (?,?)"
        conn.execute(query, ("missing", cloudpickle.dumps(missing_features)))
    return input_data.merge(missing_indicator_df, left_index=True,
                            right_index=True)
def prep_dat(df,ylabels=None):
    """Prepare the dataframe: drop rows missing GENDER/ETHNICITY, treat
    HS_GPA == 0 as missing, constant-impute with 0, and append integer
    missingness-indicator columns.

    Returns (imputed_df, ylabels) when ylabels is truthy, else imputed_df.
    """
    df = df.drop(columns = ['UNIQUE_ID'])
    # Indices of rows with missing gender or ethnicity; these rows are
    # removed from df (and from ylabels below).
    i_one = df.loc[pd.isnull(df['GENDER'])].index
    i_two = df.loc[pd.isnull(df['ETHNICITY'])].index
    labels_update = list(np.append(i_one,i_two))
    print(labels_update)
    df = df.drop(labels=labels_update)
    df.reset_index(inplace=True,drop=True)
    #including only the values without missing gender and ethnicity
    df = df[pd.notnull(df['GENDER'])]
    df = df[pd.notnull(df['ETHNICITY'])]
    col_names = df.columns
    # A GPA of exactly 0 is treated as "not reported".
    df['HS_GPA'] = df['HS_GPA'].replace(0,np.nan)
    imputer_transformer = FeatureUnion(
        transformer_list = [
            ('features',SimpleImputer(strategy = 'constant',fill_value = 0)),
            ('indicators',MissingIndicator())])
    out_df = imputer_transformer.fit_transform(df)
    #imputed dfcreation
    # NOTE(review): the five hard-coded indicator names assume that
    # MissingIndicator (default features='missing-only') finds missing
    # values in exactly these five columns, in this order -- confirm
    # against the actual data.
    col_names = np.append(col_names,['Ind_Mothers','Ind_Fathers','Ind_HS_GPA','Ind_SATRead','Ind_SATMath'])
    imputed_df = pd.DataFrame(out_df,columns = col_names)
    imputed_df[['Ind_Mothers','Ind_Fathers','Ind_HS_GPA','Ind_SATRead','Ind_SATMath']] = imputed_df[['Ind_Mothers','Ind_Fathers','Ind_HS_GPA','Ind_SATRead','Ind_SATMath']].astype(int)
    if np.any(ylabels):
        # Keep ylabels aligned with the rows dropped above.
        ylabels = ylabels.drop(labels=labels_update)
        ylabels.reset_index(inplace=True,drop=True)
        return imputed_df,ylabels
    else:
        return imputed_df
def build_null_mapper(df, cols_with_na):
    """Build a DataFrameMapper that imputes each NA-bearing column and adds
    a '<col>_is_na' indicator for it."""
    steps = [([column], SimpleImputer()) for column in cols_with_na]
    steps += [
        ([column], MissingIndicator(), {'alias': f'{column}_is_na'})
        for column in cols_with_na
    ]
    return DataFrameMapper(steps, df_out=True)
def transform(self, X, y=None):
    """Impute domain-specific missing values, scale, and append the
    missingness mask as extra columns.

    Returns a numpy array: scaled features followed by the boolean mask.
    """
    # FIX: work on a copy so the caller's DataFrame is not mutated by the
    # in-place column assignments below.
    X = X.copy()
    # A user rating of 0 means "no rating": convert to NaN first.
    null_rating_mask = X['h_user_rating'] == 0
    null_indices = X[null_rating_mask].index
    X.loc[null_indices, 'h_user_rating'] = np.nan
    # Ensure number of MissingIndicator features stays constant by specifying
    # features='all'. Variability in the number of columns causes a mismatch in
    # feature names and actual features.
    imputer_mask = MissingIndicator(features='all',
                                    sparse=False).fit_transform(X)
    # Impute the values manually because each value is handled differently
    mean_horse_weight = X['c_horse_weight'].mean()
    X['c_horse_weight'] = X['c_horse_weight'].fillna(mean_horse_weight)
    mean_horse_weight_diff = X['c_horse_weight_diff'].mean()
    X['c_horse_weight_diff'] = X['c_horse_weight_diff'].fillna(
        mean_horse_weight_diff)
    X['c_previous_order_of_finish'] = X[
        'c_previous_order_of_finish'].fillna(0)
    X['h_user_rating'] = X['h_user_rating'].fillna(0)
    # NOTE(review): the scaler is re-fit on every transform call, so train
    # and test are scaled with different statistics -- confirm intended.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return np.c_[X_scaled, imputer_mask]
def fit(self, X, y=None):
    """Instantiate the wrapped sklearn model from stored hyperparameters
    and fit it, forwarding y only when supplied."""
    self._wrapped_model = SKLModel(**self._hyperparams)
    fit_args = (X,) if y is None else (X, y)
    self._wrapped_model.fit(*fit_args)
    return self
def missing_indicator(self):
    """ Returns the output of sklearn.impute.MissingIndicator as a pandas DataFrame """
    flag_names = [name + '_flag' for name in self.input_data.columns]
    mask = MissingIndicator(features='all').fit_transform(self.input_data)
    return pd.DataFrame(mask, index=self.input_data.index,
                        columns=flag_names)
class MissingIndicatorComponent(AutoSklearnPreprocessingAlgorithm):
    """Auto-sklearn preprocessing component wrapping sklearn's MissingIndicator."""

    def __init__(self, missing_values=np.nan, features: str = "missing-only",
                 random_state=None):
        super().__init__()
        self.features = features
        self.missing_values = missing_values
        self.random_state = random_state

    def fit(self, X, Y=None):
        """Create and fit the underlying MissingIndicator."""
        from sklearn.impute import MissingIndicator
        self.preprocessor = MissingIndicator(
            missing_values=self.missing_values, features=self.features)
        self.preprocessor.fit(X, Y)
        return self

    def transform(self, X):
        """Apply the fitted indicator; fail if fit() was never called."""
        if self.preprocessor is None:
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        # Static capability metadata consumed by auto-sklearn.
        return {
            'shortname': 'MissingIndicator',
            'name': 'Missing Indicator',
            'handles_regression': True,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'handles_multioutput': True,
            'is_deterministic': True,
            'input': (DENSE, UNSIGNED_DATA),
            'output': (INPUT, ),
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        # Only the `features` mode is exposed to the optimizer.
        cs = ConfigurationSpace()
        features = CategoricalHyperparameter(
            "features", ["missing-only", "all"],
            default_value="missing-only")
        cs.add_hyperparameters([features])
        return cs
def test_missing_indicator_feature_names_out():
    """Check that missing indicator return the feature names with a prefix."""
    pd = pytest.importorskip("pandas")
    nan = np.nan
    frame = pd.DataFrame(
        [[nan, nan, 1, nan],
         [4, nan, 2, 10]],
        columns=["a", "b", "c", "d"],
    )
    # Column "c" has no missing values, so it is excluded by default.
    names = MissingIndicator(missing_values=nan).fit(
        frame).get_feature_names_out()
    expected_names = ["missingindicator_a", "missingindicator_b",
                      "missingindicator_d"]
    assert_array_equal(expected_names, names)
def get_scores_for_imputer(imputer, X_missing, y_missing):
    """Cross-validated neg-MSE scores for a pipeline that unions the given
    imputer with a MissingIndicator and feeds the module-level REGRESSOR."""
    pipeline = make_pipeline(
        make_union(imputer, MissingIndicator(missing_values=0)),
        REGRESSOR,
    )
    return cross_val_score(pipeline, X_missing, y_missing,
                           scoring='neg_mean_squared_error', cv=N_SPLITS)
def test_missing_indicator():
    """Converted MissingIndicator must match sklearn's output on iris, for
    NaN and value-based missing markers, in both feature modes."""
    X, y = load_iris(return_X_y=True)
    # The candidate markers are taken from the pristine X loaded above.
    for missing_values in [np.nan, X[0][0], X[-1][1]]:
        X, y = load_iris(return_X_y=True)
        if np.isnan(missing_values):
            # Sprinkle 20 NaNs at random positions.
            X.ravel()[np.random.choice(X.size, 20, replace=False)] = np.nan
        X_as_list = X.tolist()
        for feature_mode in ["missing-only", "all"]:
            indicator = MissingIndicator(
                features=feature_mode,
                missing_values=missing_values,
                error_on_new=False,
            )
            indicator.fit(X)
            converted = convert_estimator(indicator)
            expected = indicator.transform(X)
            actual = converted.transform(X_as_list)
            assert np.allclose(expected.shape, shape(actual))
            assert np.allclose(expected, actual)
def run(datai, missing_rate):
    """Train an SVC on NaN-corrupted data using iterative imputation plus
    missingness flags; return accuracy on equally-corrupted test data."""
    train_data, train_target, test_data, test_target = datai
    feature_maker = FeatureUnion([
        ('1', IterativeImputer(ExtraTreesRegressor())),
        ('2', MissingIndicator(features='all')),
    ])
    pipeline = make_pipeline(StandardScaler(), feature_maker, SVC())
    pipeline.fit(random_replace_with_nan(train_data, missing_rate),
                 train_target)
    predictions = pipeline.predict(
        random_replace_with_nan(test_data, missing_rate))
    return accuracy_score(test_target, predictions)
def test_missing_indicator_new(missing_values, arr_type, dtype, param_features,
                               n_features, features_indices):
    """Check mask values, selected feature indices, bool dtype, and the
    dense/sparse output switch of MissingIndicator."""
    X_fit = np.array([[missing_values, missing_values, 1],
                      [4, missing_values, 2]])
    X_trans = np.array([[missing_values, missing_values, 1],
                        [4, 12, 10]])
    # Full (features='all') expected masks; column subsets are selected
    # below via features_indices.
    X_fit_expected = np.array([[1, 1, 0], [0, 1, 0]])
    X_trans_expected = np.array([[1, 1, 0], [0, 0, 0]])

    # convert the input to the right array format and right dtype
    X_fit = arr_type(X_fit).astype(dtype)
    X_trans = arr_type(X_trans).astype(dtype)
    X_fit_expected = X_fit_expected.astype(dtype)
    X_trans_expected = X_trans_expected.astype(dtype)

    indicator = MissingIndicator(missing_values=missing_values,
                                 features=param_features,
                                 sparse=False)
    X_fit_mask = indicator.fit_transform(X_fit)
    X_trans_mask = indicator.transform(X_trans)

    # The number of retained columns depends on the features mode.
    assert X_fit_mask.shape[1] == n_features
    assert X_trans_mask.shape[1] == n_features

    assert_array_equal(indicator.features_, features_indices)
    assert_allclose(X_fit_mask, X_fit_expected[:, features_indices])
    assert_allclose(X_trans_mask, X_trans_expected[:, features_indices])

    assert X_fit_mask.dtype == bool
    assert X_trans_mask.dtype == bool
    assert isinstance(X_fit_mask, np.ndarray)
    assert isinstance(X_trans_mask, np.ndarray)

    # Switching to sparse output must give the same mask in CSC format.
    indicator.set_params(sparse=True)
    X_fit_mask_sparse = indicator.fit_transform(X_fit)
    X_trans_mask_sparse = indicator.transform(X_trans)

    assert X_fit_mask_sparse.dtype == bool
    assert X_trans_mask_sparse.dtype == bool
    assert X_fit_mask_sparse.format == 'csc'
    assert X_trans_mask_sparse.format == 'csc'
    assert_allclose(X_fit_mask_sparse.toarray(), X_fit_mask)
    assert_allclose(X_trans_mask_sparse.toarray(), X_trans_mask)
def test_missing_indicator_raise_on_sparse_with_missing_0(arr_type):
    """Sparse input combined with missing_values=0 must raise, while the
    equivalent dense input is accepted."""
    missing_values = 0
    X_fit = np.array([[missing_values, missing_values, 1],
                      [4, missing_values, 2]])
    X_trans = np.array([[missing_values, missing_values, 1],
                        [4, 12, 10]])

    indicator = MissingIndicator(missing_values=missing_values)

    with pytest.raises(ValueError, match="Sparse input with missing_values=0"):
        indicator.fit_transform(arr_type(X_fit))
    # Dense input works and leaves the indicator fitted for the next check.
    indicator.fit_transform(X_fit)
    with pytest.raises(ValueError, match="Sparse input with missing_values=0"):
        indicator.transform(arr_type(X_trans))
def test_missing_indicator_error(X_fit, X_trans, params, msg_err):
    """Invalid parameter combinations must raise ValueError with the
    expected message during fit/transform."""
    indicator = MissingIndicator(missing_values=-1)
    indicator.set_params(**params)
    with pytest.raises(ValueError, match=msg_err):
        fitted = indicator.fit(X_fit)
        fitted.transform(X_trans)
def test_missing_indicator_string():
    """String data: features='all' must flag every cell equal to the marker."""
    X = np.array([['a', 'b', 'c'],
                  ['b', 'c', 'a']], dtype=object)
    mask = MissingIndicator(missing_values='a', features='all').fit_transform(X)
    expected = np.array([[True, False, False],
                         [False, False, True]])
    assert_array_equal(mask, expected)