def transform(self, X):
    """Impute X with the categorical and numerical imputers set by ``fit``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The data to complete. It is first converted with
        ``process_dataframe``.

    Returns
    -------
    X_ : object returned by ``process_dataframe``
        The converted data, with the columns at ``self.cat_idx`` and
        ``self.num_idx`` overwritten by the output of the matching imputer.
        A column group whose imputer is ``None`` is left untouched.

    Raises
    ------
    ValueError
        If the number of columns differs from ``self.cols``.
    """
    X_ = process_dataframe(X)

    # Validate explicitly: an ``assert`` is stripped when Python runs with
    # ``-O``, which would let a mis-shaped input reach the imputers.
    if X_.shape[1] != self.cols:
        raise ValueError("Incompatible dimension between the fitted "
                         "dataset and the one to be transformed.")

    # Categorical columns first, then numerical ones.
    for imputer, idx in ((self.cat_imputer, self.cat_idx),
                         (self.num_imputer, self.num_idx)):
        if imputer is not None:
            X_.iloc[:, idx] = imputer.transform(X_.iloc[:, idx])
    return X_
def transform(self, X):
    """Impute all missing values in X.

    Parameters
    ----------
    X : {array-like}, shape = [n_samples, n_features]
        The input data to complete.

    Returns
    -------
    X : {array-like}, shape = [n_samples, n_features]
        The imputed dataset. When X contains no missing value, the
        original X is returned unchanged. When at least one column is
        entirely missing, the result of a freshly fitted
        ``AdaptiveSimpleImputer`` is returned instead.

    Raises
    ------
    ValueError
        If the number of columns does not equal
        ``len(self.num_idx) + len(self.cat_idx)``.
    """
    # fit() must have been called first.
    check_is_fitted(self, ["num_idx", "cat_idx", "statistics_"])

    frame = process_dataframe(X)
    original_dtypes = frame.dtypes
    n_rows, n_cols = frame.shape

    # A column with every row missing cannot be handled here; delegate.
    missing_mask = _get_mask(frame, self.missing_values)
    missing_per_column = missing_mask.sum(axis=0)
    if np.any(missing_per_column >= n_rows):
        logger.warning(
            "One or more columns have all rows missing. "
            "Using AdaptiveSimpleImputer to do imputing."
        )
        from skimpute.adaptive import AdaptiveSimpleImputer
        fallback = AdaptiveSimpleImputer(
            consider_ordinal_as_cat=self.consider_ordinal_as_cat)
        return fallback.fit_transform(X)

    # The column count must match what fit() saw.
    expected_cols = len(self.num_idx) + len(self.cat_idx)
    if n_cols != expected_cols:
        raise ValueError("Incompatible dimension between the fitted "
                         "dataset and the one to be transformed.")

    total_missing = missing_mask.sum()
    if not total_missing > 0:
        logger.warning("No missing value located; returning original "
                       "dataset.")
        return X

    # Ordinal-encode the categorical columns; the per-column modes are
    # passed through the encoder as additional data.
    most_frequent = SimpleImputer(strategy="most_frequent").fit(frame)
    col_modes = most_frequent.statistics_
    idx2encoder, frame, encoded_extras = build_encoder(
        frame, None, self.cat_idx, OrdinalEncoder(), [col_modes], "float32")
    self.encoded_col_modes = encoded_extras[0].astype('float32')

    # Keep the labels so the imputed array can be wrapped back in a frame.
    out_columns = frame.columns
    out_index = frame.index
    imputed = self._miss_forest(frame, missing_mask)

    result = pd.DataFrame(imputed, columns=out_columns, index=out_index)
    result = decode_data(result, idx2encoder)
    return result.astype(original_dtypes)
def fit(self, X, y=None):
    """Fit the imputer on X.

    Parameters
    ----------
    X : {array-like}, shape (n_samples, n_features)
        Input data, where ``n_samples`` is the number of samples and
        ``n_features`` is the number of features. ``X.dtypes`` is read
        directly, so X must expose that attribute.
    y : optional
        Passed to ``build_encoder`` together with a ``TargetEncoder``.

    Returns
    -------
    self : object
        Returns self.
    """
    self.dtypes = X.dtypes

    frame = process_dataframe(X)
    self.num_idx, self.cat_idx = parse_cat_col(
        frame, self.consider_ordinal_as_cat)
    self.idx2encoder, frame, _ = build_encoder(
        frame, y, self.cat_idx, TargetEncoder(), [], "float32")

    missing_mask = _get_mask(frame, self.missing_values)
    self.do_simple_imputing = False

    # Too many missing values in some column -> flag the simple fallback.
    col_missing_limit = frame.shape[0] * self.col_max_missing
    if np.any(missing_mask.sum(axis=0) > col_missing_limit):
        self.do_simple_imputing = True
        logger.warning(
            "Some column(s) have more than {}% missing values".format(
                self.col_max_missing * 100))

    # Column means over the observed (non-masked) entries, computed before
    # any row is dropped below.
    col_means = np.ma.array(frame, mask=missing_mask).mean(axis=0).data

    # Rows with too many missing values are excluded from the fitted data.
    row_missing_limit = missing_mask.shape[1] * self.row_max_missing
    bad_rows = missing_mask.sum(axis=1) > row_missing_limit
    if np.any(bad_rows):
        logger.warning(
            "There are rows with more than {0}% missing values. These "
            "rows are not included as donor neighbors.".format(
                self.row_max_missing * 100))
        frame = frame.iloc[~bad_rows, :]

    # Fewer remaining samples than neighbors -> flag the simple fallback.
    n_samples = frame.shape[0]
    if n_samples < self.n_neighbors:
        self.do_simple_imputing = True
        logger.warning("There are only %d samples, but n_neighbors=%d." %
                       (n_samples, self.n_neighbors))

    self.fitted_X_ = frame.values
    self.statistics_ = col_means
    return self
def fit(self, X, y=None, **kwargs):
    """Fit one ``SimpleImputer`` per column group (numerical, categorical).

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data; converted with ``process_dataframe`` first.
    y : ignored
        Not used by this method.
    **kwargs
        Accepted but not used by this method.

    Returns
    -------
    self : object
        Returns self, with ``cols``, ``num_idx``, ``cat_idx``,
        ``num_imputer`` and ``cat_imputer`` set. An imputer is ``None``
        when its column group is empty.
    """
    frame = process_dataframe(X)
    self.cols = frame.shape[1]
    self.num_idx, self.cat_idx = parse_cat_col(
        frame, self.consider_ordinal_as_cat)

    # Numerical columns: fitted only when at least one such column exists.
    self.num_imputer = (
        SimpleImputer(strategy=self.num_strategy).fit(
            frame.iloc[:, self.num_idx])
        if self.num_idx.size
        else None
    )

    # Categorical columns: same rule, with the configured fill value.
    self.cat_imputer = (
        SimpleImputer(strategy=self.cat_strategy,
                      fill_value=self.fill_value).fit(
            frame.iloc[:, self.cat_idx])
        if self.cat_idx.size
        else None
    )
    return self
def inverse_transform(self, X):
    """Map encoded columns back to their original category labels.

    For every column named in ``self.mapping``, each value is matched to
    the closest entry (smallest absolute difference) among the mapping's
    values, ignoring the mapping's last two entries. The index label of
    that entry is then translated through the inverted ordinal map obtained
    from ``self.get_ordinal_map``. Columns absent from ``self.mapping`` are
    passed through unchanged.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Encoded data; converted with ``process_dataframe`` first.

    Returns
    -------
    ndarray, shape (n_samples, n_features)
        The per-column results concatenated along axis 1.
    """
    X_ = process_dataframe(X)

    # Loop-invariant lookups, hoisted out of the column loop.
    values = X_.values
    ordinal_map_list = self.ordinal_encoder.mapping

    result_list = []
    for i, column in enumerate(X_.columns):
        if column in self.mapping:
            col_mapping = self.mapping[column]
            # Drop the last two mapping entries before matching. No copy is
            # needed: the array is only read, never written.
            map_val = col_mapping.values[:-2].reshape(1, -1)
            # Distance of every sample to every candidate encoded value.
            sub = np.abs(values[:, i].reshape(-1, 1) - map_val)
            indexes = np.argmin(sub, axis=1)
            enc_indexes = col_mapping.index[indexes]
            ordinal_map = self.get_ordinal_map(ordinal_map_list, column)
            inv_dict = {v: k for k, v in ordinal_map.to_dict().items()}
            result = enc_indexes.map(inv_dict).values.reshape(-1, 1)
        else:
            # Read from the processed data, like the mapped branch does.
            # Indexing the raw input with ``X[:, i]`` fails when X is a
            # DataFrame.
            result = values[:, i].reshape(-1, 1)
        result_list.append(result)
    # TODO: refactor to return a DataFrame?
    return np.concatenate(result_list, axis=1)
def fit(self, X, y=None):
    """Fit the imputer on X.

    Parameters
    ----------
    X : {array-like}, shape (n_samples, n_features)
        Input data, where ``n_samples`` is the number of samples and
        ``n_features`` is the number of features.
    y : ignored
        Not used by this method.

    Returns
    -------
    self : object
        Returns self, with ``cols``, ``num_idx``, ``cat_idx`` and
        ``statistics_`` set. ``statistics_`` is a dict holding
        ``"col_means"`` (NaN-aware means of the numerical columns, or
        ``None`` when there are none) and ``"col_modes"`` (most frequent
        value of every column).

    Raises
    ------
    ValueError
        If at least one column has all rows missing.
    """
    X_ = process_dataframe(X)
    self.cols = X_.shape[1]
    self.num_idx, self.cat_idx = parse_cat_col(
        X_, self.consider_ordinal_as_cat)

    # Check if any column has all missing
    mask = _get_mask(X_, self.missing_values)
    if np.any(mask.sum(axis=0) >= (X_.shape[0])):
        raise ValueError("One or more columns have all rows missing.")

    # First replace missing values with NaN if it is something else.
    if self.missing_values not in ['NaN', np.nan]:
        # ``DataFrame.mask`` replaces exactly the cells where the condition
        # holds. Assigning through ``X_.iloc[np.where(...)]`` is wrong here:
        # ``iloc`` treats the row and column position arrays orthogonally,
        # so it would blank the full cross-product of matched rows and
        # matched columns, destroying observed values.
        X_ = X_.mask(X_ == self.missing_values, other=np.nan)

    # Now, make initial guess for missing values
    col_means = (np.nanmean(X_.iloc[:, self.num_idx], axis=0)
                 if self.num_idx.size else None)
    col_modes = SimpleImputer(strategy="most_frequent").fit(X_).statistics_
    self.statistics_ = {"col_means": col_means, "col_modes": col_modes}
    return self
def transform(self, X):
    """Impute all missing values in X.

    Parameters
    ----------
    X : {array-like}, shape = [n_samples, n_features]
        The input data to complete. ``X.columns`` and ``X.index`` are read
        directly, so X must expose those attributes.

    Returns
    -------
    X : {array-like}, shape = [n_samples, n_features]
        The imputed dataset, decoded and cast to the dtypes recorded in
        ``self.dtypes``. When X has no missing value, X itself is returned.
        When ``self.do_simple_imputing`` is set, the result of a freshly
        fitted ``AdaptiveSimpleImputer`` is returned instead.

    Raises
    ------
    ValueError
        If the number of columns differs from that of ``self.fitted_X_``.
    """
    check_is_fitted(self, [
        "fitted_X_", "statistics_", "do_simple_imputing", "idx2encoder",
        "dtypes"
    ])
    if self.do_simple_imputing:
        logger.debug("KNNImputer is doing adaptive simple imputation.")
        from skimpute import AdaptiveSimpleImputer
        return AdaptiveSimpleImputer(
            consider_ordinal_as_cat=self.consider_ordinal_as_cat
        ).fit_transform(X)

    X_ = process_dataframe(X)
    columns = X.columns
    index = X.index
    X_ = encode_data(X_, self.idx2encoder, "float32")

    # Get fitted data and ensure correct dimension
    n_cols_fit_X = self.fitted_X_.shape[1]
    n_rows_X, n_cols_X = X_.shape
    if n_cols_X != n_cols_fit_X:
        raise ValueError("Incompatible dimension between the fitted "
                         "dataset and the one to be transformed.")

    mask = _get_mask(X_, self.missing_values)
    row_total_missing = mask.sum(axis=1)
    if not np.any(row_total_missing):
        # Nothing to impute: hand back the caller's data, not the
        # internally encoded working copy.
        return X

    # Check for excessive missingness in rows
    bad_rows = row_total_missing > (mask.shape[1] * self.row_max_missing)
    has_bad_rows = np.any(bad_rows)
    if has_bad_rows:
        logger.warning(
            "There are rows with more than {0}% missing values. The "
            "missing features in these rows are imputed with column means."
            .format(self.row_max_missing * 100))
        # Keep the bad rows as an ndarray copy: they are filled below with
        # a NumPy-style (rows, cols) index tuple, which DataFrame ``[]``
        # assignment does not support.
        X_bad = X_.iloc[bad_rows, :].values.copy()
        X_ = X_.iloc[~bad_rows, :]
        mask = mask[~bad_rows]
        row_total_missing = mask.sum(axis=1)

    # Builtin ``bool``: the ``np.bool`` alias was deprecated in NumPy 1.20
    # and removed in 1.24.
    row_has_missing = row_total_missing.astype(bool)
    if np.any(row_has_missing):
        # Mask for fitted_X
        mask_fx = _get_mask(self.fitted_X_, self.missing_values)

        # Pairwise distances between receivers and fitted samples; only
        # the rows that actually have missing entries are filled in.
        dist = np.empty((len(X_), len(self.fitted_X_)))
        X_miss = X_.iloc[row_has_missing].values
        dist[row_has_missing] = pairwise_distances(
            X_miss,
            self.fitted_X_,
            metric=self.metric,
            squared=False,
            missing_values=self.missing_values)

        # Find and impute missing
        X_ = self._impute(dist, X_.values, self.fitted_X_, mask, mask_fx)

    # Merge bad rows to X and mean impute their missing values
    if has_bad_rows:
        bad_missing_index = np.where(_get_mask(X_bad, self.missing_values))
        # Column position of each missing cell selects its statistic.
        X_bad[bad_missing_index] = np.take(self.statistics_,
                                           bad_missing_index[1])
        X_merged = np.empty((n_rows_X, n_cols_X))
        X_merged[bad_rows, :] = X_bad
        X_merged[~bad_rows, :] = X_
        X_ = X_merged

    X = pd.DataFrame(X_, columns=columns, index=index)
    X = decode_data(X, self.idx2encoder)
    X = X.astype(self.dtypes)
    return X