示例#1
0
 def transform(self, X):
     """Fill missing values using the imputers prepared by ``fit``.

     ``X`` is first passed through ``process_dataframe``. The categorical
     and the numerical column groups of the result are then overwritten
     with the output of their respective fitted imputers. A group whose
     imputer is ``None`` is left untouched.

     Parameters
     ----------
     X : object accepted by ``process_dataframe``
         Data to impute; it must yield ``self.cols`` columns.

     Returns
     -------
     The processed frame with the imputed column groups written back.

     Raises
     ------
     AssertionError
         If the number of columns differs from ``self.cols``.
     """
     frame = process_dataframe(X)
     # NOTE: this is an ``assert``, so the check vanishes under ``python -O``.
     assert frame.shape[1] == self.cols
     # Categorical columns are handled first, numerical columns second.
     column_groups = (
         (self.cat_imputer, self.cat_idx),
         (self.num_imputer, self.num_idx),
     )
     for imputer, positions in column_groups:
         if imputer is None:
             continue
         selected = frame.iloc[:, positions]
         frame.iloc[:, positions] = imputer.transform(selected)
     return frame
示例#2
0
    def transform(self, X):
        """Fill in the missing entries of X with the miss-forest routine.

        Parameters
        ----------
        X : {array-like}, shape = [n_samples, n_features]
            The data whose missing values should be completed.

        Returns
        -------
        X : {array-like}, shape = [n_samples, n_features]
            The completed data. When nothing is missing the input object
            itself is returned. When a column is entirely missing, the
            result of a freshly fitted ``AdaptiveSimpleImputer`` is
            returned instead.

        Raises
        ------
        ValueError
            If the number of columns differs from the number seen in fit.
        """
        # fit() must have populated these attributes already.
        check_is_fitted(self, ["num_idx", "cat_idx", "statistics_"])
        frame = process_dataframe(X)
        original_dtypes = frame.dtypes

        # A column without a single observed value cannot be modelled here,
        # so hand the whole job over to the simple imputer.
        missing = _get_mask(frame, self.missing_values)
        missing_per_column = missing.sum(axis=0)
        if np.any(missing_per_column >= frame.shape[0]):
            logger.warning(
                "One or more columns have all rows missing. "
                "Using AdaptiveSimpleImputer to do imputing."
            )
            from skimpute.adaptive import AdaptiveSimpleImputer
            fallback = AdaptiveSimpleImputer(
                consider_ordinal_as_cat=self.consider_ordinal_as_cat)
            return fallback.fit_transform(X)

        # The column count must match what fit() recorded.
        expected_cols = len(self.num_idx) + len(self.cat_idx)
        actual_cols = frame.shape[1]
        if actual_cols != expected_cols:
            raise ValueError("Incompatible dimension between the fitted "
                             "dataset and the one to be transformed.")

        has_missing = missing.sum() > 0
        if not has_missing:
            logger.warning("No missing value located; returning original "
                           "dataset.")
            return X

        # Encode the categorical columns; the per-column modes are passed
        # along so that build_encoder hands them back as additional data.
        mode_imputer = SimpleImputer(strategy="most_frequent").fit(frame)
        col_modes = mode_imputer.statistics_
        idx2encoder, frame, extras = build_encoder(
            frame, None, self.cat_idx, OrdinalEncoder(), [col_modes],
            "float32")
        self.encoded_col_modes = extras[0].astype('float32')

        # Run the imputation, then rebuild a frame carrying the same labels.
        labels = frame.columns
        row_labels = frame.index
        imputed = self._miss_forest(frame, missing)
        completed = pd.DataFrame(imputed, columns=labels, index=row_labels)
        completed = decode_data(completed, idx2encoder)
        # Restore the dtypes observed right after process_dataframe.
        return completed.astype(original_dtypes)
示例#3
0
    def fit(self, X, y=None):
        """Learn the donor samples and column means used for imputation.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Training data. Its ``dtypes`` attribute is stored so that
            ``transform`` can restore the column types.
        y : optional
            Forwarded to ``build_encoder`` together with a
            ``TargetEncoder`` for the categorical columns.

        Returns
        -------
        self : object
            The fitted instance. ``do_simple_imputing`` is set to True when
            a column exceeds ``col_max_missing`` or when fewer than
            ``n_neighbors`` usable rows remain.
        """
        self.dtypes = X.dtypes

        frame = process_dataframe(X)
        self.num_idx, self.cat_idx = parse_cat_col(
            frame, self.consider_ordinal_as_cat)
        target_encoder = TargetEncoder()
        self.idx2encoder, frame, _ = build_encoder(
            frame, y, self.cat_idx, target_encoder, [], "float32")

        missing = _get_mask(frame, self.missing_values)

        # Column check: too many missing values in any column switches the
        # estimator over to simple imputation.
        self.do_simple_imputing = False
        column_limit = frame.shape[0] * self.col_max_missing
        if np.any(missing.sum(axis=0) > column_limit):
            self.do_simple_imputing = True
            percent = self.col_max_missing * 100
            logger.warning(
                "Some column(s) have more than {}% missing values".format(
                    percent))

        # Column means over the observed entries only; computed before any
        # row is dropped below.
        column_means = np.ma.array(frame, mask=missing).mean(axis=0).data

        # Row check: rows that are mostly empty are excluded from the donors.
        row_limit = missing.shape[1] * self.row_max_missing
        sparse_rows = missing.sum(axis=1) > row_limit
        if np.any(sparse_rows):
            logger.warning(
                "There are rows with more than {0}% missing values. These "
                "rows are not included as donor neighbors.".format(
                    self.row_max_missing * 100))
            frame = frame.iloc[~sparse_rows, :]

        # With fewer donors than neighbours, fall back to simple imputation.
        n_donors = frame.shape[0]
        if n_donors < self.n_neighbors:
            self.do_simple_imputing = True
            logger.warning("There are only %d samples, but n_neighbors=%d." %
                           (n_donors, self.n_neighbors))

        self.fitted_X_ = frame.values
        self.statistics_ = column_means
        return self
示例#4
0
 def fit(self, X, y=None, **kwargs):
     """Fit one ``SimpleImputer`` per column group.

     The processed input is split into numerical and categorical column
     positions by ``parse_cat_col``. Each non-empty group gets its own
     fitted ``SimpleImputer``; an empty group gets ``None``.

     Parameters
     ----------
     X : object accepted by ``process_dataframe``
         Training data.
     y : ignored
     **kwargs : ignored

     Returns
     -------
     self : object
         The fitted instance.
     """
     frame = process_dataframe(X)
     self.cols = frame.shape[1]
     self.num_idx, self.cat_idx = parse_cat_col(
         frame, self.consider_ordinal_as_cat)

     # Numerical columns.
     fitted_num = None
     if self.num_idx.size:
         num_imputer = SimpleImputer(strategy=self.num_strategy)
         fitted_num = num_imputer.fit(frame.iloc[:, self.num_idx])
     self.num_imputer = fitted_num

     # Categorical columns; ``fill_value`` is forwarded to the imputer.
     fitted_cat = None
     if self.cat_idx.size:
         cat_imputer = SimpleImputer(strategy=self.cat_strategy,
                                     fill_value=self.fill_value)
         fitted_cat = cat_imputer.fit(frame.iloc[:, self.cat_idx])
     self.cat_imputer = fitted_cat

     return self
示例#5
0
 def inverse_transform(self, X):
     """Map encoded column values back to their original categories.

     For every column that has an entry in ``self.mapping``, each value is
     matched to the closest value in that column's mapping, and the label
     of that entry is translated back through the inverted ordinal map
     for the column. Columns without a mapping are passed through
     unchanged.

     Parameters
     ----------
     X : object accepted by ``process_dataframe``
         Encoded data, one column per feature.

     Returns
     -------
     numpy.ndarray, shape (n_samples, n_features)
         Decoded columns concatenated in their original order.
     """
     X_ = process_dataframe(X)
     # Convert to an array once instead of once per column.
     values = X_.values
     result_list = []
     for i, column in enumerate(X_.columns):
         # Always read from the processed frame. The original read the
         # pass-through columns from the raw argument with ``X[:, i]``,
         # which fails when ``X`` is a DataFrame.
         column_values = values[:, i].reshape(-1, 1)
         if column in self.mapping:
             encoded = self.mapping[column]
             # The last two mapping entries are skipped -- presumably the
             # encoder's placeholders for unknown/missing categories
             # (TODO confirm against the encoder in use).
             candidates = encoded.values[:-2].reshape(1, -1)
             # Index of the nearest candidate for every row.
             nearest = np.argmin(np.abs(column_values - candidates), axis=1)
             enc_indexes = encoded.index[nearest]
             ordinal_map = self.get_ordinal_map(
                 self.ordinal_encoder.mapping, column)
             # Invert the ordinal map so its values look up its keys.
             inv_dict = {v: k for k, v in ordinal_map.to_dict().items()}
             result = enc_indexes.map(inv_dict).values.reshape(-1, 1)
         else:
             result = column_values
         result_list.append(result)
     # TODO: refactor to return a DataFrame?
     return np.concatenate(result_list, axis=1)
示例#6
0
    def fit(self, X, y=None):
        """Fit the imputer on X.

        Records the column layout and the initial-guess statistics
        (numerical column means and per-column modes).

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.
        y : ignored

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If at least one column has no observed value at all.
        """

        X_ = process_dataframe(X)
        self.cols = X_.shape[1]
        self.num_idx, self.cat_idx = parse_cat_col(
            X_, self.consider_ordinal_as_cat)

        # Check if any column has all missing
        mask = _get_mask(X_, self.missing_values)
        if np.any(mask.sum(axis=0) >= (X_.shape[0])):
            raise ValueError("One or more columns have all rows missing.")

        # First replace missing values with NaN if it is something else.
        # This must be element-wise: ``iloc`` with the (rows, cols) tuple
        # from ``np.where`` selects the outer product of those rows and
        # columns and would blank cells that do not hold the placeholder.
        if self.missing_values not in ['NaN', np.nan]:
            X_ = X_.mask(X_ == self.missing_values)

        # Now, make initial guess for missing values
        col_means = np.nanmean(X_.iloc[:, self.num_idx],
                               axis=0) if self.num_idx.size else None
        col_modes = SimpleImputer(strategy="most_frequent").fit(X_).statistics_

        self.statistics_ = {"col_means": col_means, "col_modes": col_modes}

        return self
示例#7
0
    def transform(self, X):
        """Impute all missing values in X.

        Rows whose share of missing features exceeds ``row_max_missing``
        are filled with the column means stored by ``fit``. All other rows
        with missing values are imputed by ``self._impute`` from the
        distances to the fitted samples. When ``fit`` flagged
        ``do_simple_imputing``, a freshly fitted ``AdaptiveSimpleImputer``
        handles the whole input instead.

        Parameters
        ----------
        X : {array-like}, shape = [n_samples, n_features]
            The input data to complete. Its ``columns`` and ``index`` are
            reused for the returned frame.

        Returns
        -------
        X : {array-like}, shape = [n_samples, n_features]
            The imputed dataset, decoded and cast to the dtypes recorded
            by ``fit``.

        Raises
        ------
        ValueError
            If the number of columns differs from the fitted data.
        """

        check_is_fitted(self, [
            "fitted_X_", "statistics_", "do_simple_imputing", "idx2encoder",
            "dtypes"
        ])
        if self.do_simple_imputing:
            logger.debug("KNNImputer is doing adaptive simple imputation.")
            from skimpute import AdaptiveSimpleImputer
            return AdaptiveSimpleImputer(
                consider_ordinal_as_cat=self.consider_ordinal_as_cat
            ).fit_transform(X)
        X_ = process_dataframe(X)
        columns = X.columns
        index = X.index
        X_ = encode_data(X_, self.idx2encoder, "float32")

        def _restore(values):
            # Rebuild a labelled frame, undo the encoding and restore the
            # dtypes recorded by fit().
            frame = pd.DataFrame(values, columns=columns, index=index)
            frame = decode_data(frame, self.idx2encoder)
            return frame.astype(self.dtypes)

        # Get fitted data and ensure correct dimension
        n_cols_fit_X = self.fitted_X_.shape[1]
        n_rows_X, n_cols_X = X_.shape

        if n_cols_X != n_cols_fit_X:
            raise ValueError("Incompatible dimension between the fitted "
                             "dataset and the one to be transformed.")
        mask = _get_mask(X_, self.missing_values)

        row_total_missing = mask.sum(axis=1)
        if not np.any(row_total_missing):
            # Nothing to impute. Still decode, so the caller gets the same
            # representation as on the imputing path rather than the
            # internally encoded frame. The raw values are passed so the
            # frame is relabelled, not reindexed.
            return _restore(np.asarray(X_))

        # Check for excessive missingness in rows
        bad_rows = row_total_missing > (mask.shape[1] * self.row_max_missing)
        has_bad_rows = np.any(bad_rows)
        if has_bad_rows:
            logger.warning(
                "There are rows with more than {0}% missing values. The "
                "missing features in these rows are imputed with column means."
                .format(self.row_max_missing * 100))
            # Copy to a plain ndarray: the (rows, cols) assignment below is
            # element-wise on an array but not on a DataFrame.
            X_bad = np.array(X_.iloc[bad_rows, :])
            X_ = X_.iloc[~bad_rows, :]
            mask = mask[~bad_rows]
            row_total_missing = mask.sum(axis=1)
        # Builtin ``bool``: the ``np.bool`` alias was removed from NumPy.
        row_has_missing = row_total_missing.astype(bool)

        if np.any(row_has_missing):
            # Mask for fitted_X
            mask_fx = _get_mask(self.fitted_X_, self.missing_values)

            # Pairwise distances between receivers and fitted samples.
            # Only the rows that need imputing are filled in; the rest of
            # ``dist`` stays uninitialised.
            dist = np.empty((len(X_), len(self.fitted_X_)))
            X_miss = X_.iloc[row_has_missing].values
            fitted_X = self.fitted_X_

            dist[row_has_missing] = pairwise_distances(
                X_miss,
                fitted_X,
                metric=self.metric,
                squared=False,
                missing_values=self.missing_values)

            # Find and impute missing
            X_ = self._impute(dist, X_.values, self.fitted_X_, mask, mask_fx)

        # Merge bad rows to X and mean impute their missing values
        if has_bad_rows:
            bad_missing_index = np.where(_get_mask(X_bad, self.missing_values))
            # Column index of every missing cell selects its column mean.
            X_bad[bad_missing_index] = np.take(self.statistics_,
                                               bad_missing_index[1])
            X_merged = np.empty((n_rows_X, n_cols_X))
            X_merged[bad_rows, :] = X_bad
            X_merged[~bad_rows, :] = X_
            X_ = X_merged
        return _restore(X_)