# Example #1
def test_knn_n_neighbors():
    """Check KNNImputer results for different ``n_neighbors`` settings.

    The same 7x2 matrix is imputed with 1 neighbor and with 6 neighbors and
    compared against hand-written expected arrays.  In both cases the fitted
    ``statistics_`` attribute is compared with the column-wise
    ``np.nanmean`` of the input.  Finally, the 6-neighbor result is compared
    with the result of an imputer configured with one more neighbor.
    """

    X = np.array([
        [0,       0],
        [np.nan,  2],
        [4,       3],
        [5,       np.nan],
        [7,       7],
        [np.nan,  8],
        [14,      13],
    ])
    statistics_mean = np.nanmean(X, axis=0)

    # Test with 1 neighbor
    X_imputed_1NN = np.array([
        [0,      0],
        [4,      2],
        [4,      3],
        [5,      3],
        [7,      7],
        [7,      8],
        [14,     13],
    ])

    n_neighbors = 1
    imputer = KNNImputer(n_neighbors=n_neighbors)

    assert_array_equal(imputer.fit_transform(X), X_imputed_1NN)
    assert_array_equal(imputer.statistics_, statistics_mean)

    # Test with 6 neighbors
    X = np.array([
        [0,      0],
        [np.nan, 2],
        [4,      3],
        [5,      np.nan],
        [7,      7],
        [np.nan, 8],
        [14,      13],
    ])

    X_imputed_6NN = np.array([
        [0,      0],
        [6,      2],
        [4,      3],
        [5,      5.5],
        [7,      7],
        [6,      8],
        [14,     13],
    ])

    n_neighbors = 6
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1)

    assert_array_equal(imputer.fit_transform(X), X_imputed_6NN)
    assert_array_equal(imputer.statistics_, statistics_mean)
    # NOTE(review): the original statement was cut off after
    # ``imputer_plus1.fit(``; ``X).transform(X)`` is the presumed
    # continuation -- confirm against the upstream test.
    assert_array_equal(imputer.fit_transform(X),
                       imputer_plus1.fit(X).transform(X))
class KNN_impute:
    """KNN imputer that first fills missing markers with per-label medians.

    ``fit`` replaces every occurrence of ``missing_val_rep`` in the training
    frame with the median of its column within the sample's label group, and
    then fits the wrapped ``KNNImputer``.  ``transform`` delegates to the
    wrapped imputer.
    """

    def __init__(self, missing_val_rep=0.0, k=10, copy=False):
        """Store the missing-value marker and build the wrapped imputer.

        Args:
            missing_val_rep: value that marks a missing entry.
            k: number of neighbors handed to ``KNNImputer``.
            copy: forwarded to ``KNNImputer``.
        """
        self.missing_val_rep = missing_val_rep
        # NOTE(review): the original call was truncated after its first
        # argument.  ``k`` and ``copy`` are used nowhere else in the class,
        # so forwarding them by keyword is the presumed intent -- confirm.
        self.imputer = KNNImputer(missing_val_rep, n_neighbors=k, copy=copy)

    def add_medians(self, X, y):
        """Replace missing markers in ``X`` with per-label medians, in place.

        A temporary ``'labels'`` column is added to ``X`` for grouping and
        dropped again before returning.

        Args:
            X: DataFrame of features; modified in place.
            y: labels assigned to ``X['labels']``.
        """
        X['labels'] = y
        # ``remove_rows`` is defined outside this class; presumably it drops
        # rows that would distort the medians -- confirm.
        label_meds = remove_rows(X).groupby(by='labels').median()
        for label in tqdm(label_meds.index):
            rows = X['labels'] == label
            # The dict maps each column name to that column's median for
            # this label, so every column gets its own replacement value.
            X[rows] = X[rows].replace(
                self.missing_val_rep, label_meds.loc[label, :].to_dict())

        X.drop(columns=['labels'], inplace=True)

    def fit(self, X, y):
        """Median-fill ``X`` in place, then fit the wrapped imputer on it.

        Args:
            X: DataFrame of training features; modified in place.
            y: labels used to group rows for the median fill.

        Returns:
            None.
        """
        self.add_medians(X, y)
        print('INSIDE IMPUTER: Beginning the fit')
        # NOTE(review): the original had nothing between the two progress
        # messages, which left the imputer unfitted when ``transform`` is
        # called; fitting here is the presumed intent -- confirm.
        self.imputer.fit(X)
        print('INSIDE IMPUTER: Completed the fit')
        return None

    def transform(self, X):
        """Return ``X`` imputed by the wrapped ``KNNImputer``."""
        return self.imputer.transform(X)
# Example #3
class StandardImputation(BaseEstimator, DataPreprocessorMixin):
    """Standard imputation method for static data.

    Reference 1: https://pypi.org/project/missingpy/
    Reference 2: https://s3.amazonaws.com/assets.datacamp.com/production/course_17404/slides/chapter4.pdf

    Attributes:
      - imputation_model_name: 'mice', 'missforest', 'knn'
      - data_type: 'static'
    """

    def __init__(self, imputation_model_name, data_type):
        # Only allow for certain options
        assert data_type == 'static'
        assert imputation_model_name in ['mice', 'missforest', 'knn']

        self.imputation_model_name = imputation_model_name
        self.data_type = data_type
        # The imputation model is created lazily in fit()
        self.imputation_model = None

    def fit(self, dataset):
        """Train standard imputation model.

        Nothing happens when ``dataset.static_feature`` is None.

        Args:
          - dataset: incomplete dataset
        """
        if dataset.static_feature is not None:
            # MICE
            if self.imputation_model_name == 'mice':
                self.imputation_model = IterativeImputer()
            # MissForest
            elif self.imputation_model_name == 'missforest':
                self.imputation_model = MissForest()
            # KNN
            elif self.imputation_model_name == 'knn':
                self.imputation_model = KNNImputer()

            # NOTE(review): the original showed no fit call, so transform()
            # would have run on an unfitted model; fitting on the static
            # features is the presumed intent -- confirm.
            self.imputation_model.fit(dataset.static_feature)

    def transform(self, dataset):
        """Return imputed dataset by standard imputation.

        Args:
          - dataset: incomplete dataset

        Returns:
          - dataset: imputed dataset by standard imputation.
        """
        assert self.imputation_model is not None

        if dataset.static_feature is not None:
            # Standard imputation
            # NOTE(review): both calls below were truncated in the original;
            # their arguments are reconstructed from the surrounding names.
            data_imputed = self.imputation_model.transform(
                dataset.static_feature)
            # Rounding
            dataset.static_feature = rounding(dataset.static_feature,
                                              data_imputed)

        return dataset

    def fit_transform(self, dataset):
        """Fit and transform. Return imputed data

        Args:
          - dataset: incomplete dataset

        Returns:
          - dataset: imputed dataset by standard imputation.
        """
        # Without this call the model is still None and transform() asserts.
        self.fit(dataset)
        return self.transform(dataset)
# Example #4
def test_knn_imputation_default():
    """Check KNNImputer with default parameters on four scenarios.

    Each scenario compares the imputed output with a hand-written expected
    array and compares the fitted ``statistics_`` attribute with the
    column-wise ``np.nanmean`` of the data passed to ``fit``.
    """
    # Test imputation with default parameter values

    # Test with an imputable matrix
    X = np.array([
        [1,      0,      0,      1],
        [2,      1,      2,      np.nan],
        [3,      2,      3,      np.nan],
        [np.nan, 4,      5,      5],
        [6,      np.nan, 6,      7],
        [8,      8,      8,      8],
        [16,     15,     18,    19],
    ])
    statistics_mean = np.nanmean(X, axis=0)

    X_imputed = np.array([
        [1,      0,      0,      1],
        [2,      1,      2,      8],
        [3,      2,      3,      8],
        [4,      4,      5,      5],
        [6,      3,      6,      7],
        [8,      8,      8,      8],
        [16,     15,     18,    19],
    ])

    imputer = KNNImputer()
    assert_array_equal(imputer.fit_transform(X), X_imputed)
    assert_array_equal(imputer.statistics_, statistics_mean)

    # Test with % missing in row > row_max_missing
    X = np.array([
        [1,      0,      0,      1],
        [2,      1,      2,      np.nan],
        [3,      2,      3,      np.nan],
        [np.nan, 4,      5,      5],
        [6,      np.nan, 6,      7],
        [8,      8,      8,      8],
        [19,     19,     19,     19],
        [np.nan, np.nan, np.nan, 19],
    ])
    statistics_mean = np.nanmean(X, axis=0)
    # The last row is expected to receive the column means in its three
    # missing positions.
    r7c0, r7c1, r7c2, _ = statistics_mean

    X_imputed = np.array([
        [1,      0,      0,      1],
        [2,      1,      2,      8],
        [3,      2,      3,      8],
        [4,      4,      5,      5],
        [6,      3,      6,      7],
        [8,      8,      8,      8],
        [19,     19,     19,     19],
        [r7c0,   r7c1,   r7c2,   19],
    ])

    imputer = KNNImputer()
    assert_array_almost_equal(imputer.fit_transform(X), X_imputed, decimal=6)
    assert_array_almost_equal(imputer.statistics_, statistics_mean, decimal=6)

    # Test with all neighboring donors also having missing feature values
    X = np.array([
        [1, 0, 0, np.nan],
        [2, 1, 2, np.nan],
        [3, 2, 3, np.nan],
        [4, 4, 5, np.nan],
        [6, 7, 6, np.nan],
        [8, 8, 8, np.nan],
        [20, 20, 20, 20],
        [22, 22, 22, 22],
    ])
    statistics_mean = np.nanmean(X, axis=0)

    X_imputed = np.array([
        [1, 0, 0, 21],
        [2, 1, 2, 21],
        [3, 2, 3, 21],
        [4, 4, 5, 21],
        [6, 7, 6, 21],
        [8, 8, 8, 21],
        [20, 20, 20, 20],
        [22, 22, 22, 22],
    ])

    imputer = KNNImputer()
    assert_array_equal(imputer.fit_transform(X), X_imputed)
    assert_array_equal(imputer.statistics_, statistics_mean)

    # Test when data in fit() and transform() are different
    X = np.array([
        [0,      0],
        [np.nan, 2],
        [4,      3],
        [5,      6],
        [7,      7],
        [9,      8],
        [11,     16],
    ])
    statistics_mean = np.nanmean(X, axis=0)

    Y = np.array([
        [1,      0],
        [3,      2],
        [4,      np.nan],
    ])

    Y_imputed = np.array([
        [1,      0],
        [3,      2],
        [4,      4.8],
    ])

    imputer = KNNImputer()
    assert_array_equal(imputer.fit(X).transform(Y), Y_imputed)
    assert_array_equal(imputer.statistics_, statistics_mean)
# Example #5
class Imputer(object):
    """Module for feature imputation."""
    def __init__(self, missing_values='nan', strategy='mean', n_neighbors=5):
        """
            Imputation of feature values using either sklearn, missingpy or
            (WIP) fancyimpute approaches.

            missing_values : number, string, np.nan (default) or None
                The placeholder for the missing values. All occurrences of
                `missing_values` will be imputed.
                NOTE(review): the signature default is the string 'nan',
                not np.nan -- confirm which one is intended.

            strategy : string, optional (default="mean")
                The imputation strategy.

                Supported using sklearn:
                - If "mean", then replace missing values using the mean along
                  each column. Can only be used with numeric data.
                - If "median", then replace missing values using the median along
                  each column. Can only be used with numeric data.
                - If "most_frequent", then replace missing using the most frequent
                  value along each column. Can be used with strings or numeric data.
                - If "constant", then replace missing values with fill_value. Can be
                  used with strings or numeric data.
                  NOTE(review): this constructor takes no fill_value
                  argument -- confirm how it is supplied.

                Supported using missingpy:
                - If 'knn', then use a nearest neighbor search. Can be
                  used with strings or numeric data.

                WIP: More strategies using fancyimpute

            n_neighbors : int, optional (default = 5)
                Number of neighboring samples to use for imputation if method
                is knn.

        """

        # Set parameters to objects
        self.missing_values = missing_values
        self.strategy = strategy
        self.n_neighbors = n_neighbors

        # Depending on the imputation strategy, use a specific toolbox
        if strategy in ['mean', 'median', 'most_frequent', 'constant']:
            # NOTE(review): the right-hand side of this assignment is missing
            # from this copy of the file (the continuation line was lost);
            # restore it from the upstream source.
            self.Imputer =\
        elif strategy == 'knn':
            if missing_values == 'nan':
                # Slightly different API for missingpy
                self.missing_values = 'NaN'
            # NOTE(review): this call is truncated after its first keyword
            # argument; the remaining arguments and the closing parenthesis
            # are missing from this copy of the file.
            self.Imputer = KNNImputer(missing_values=self.missing_values,

    def fit(self, X, y=None):
        """Fit the wrapped imputer; ``X`` and ``y`` are passed through."""
        self.Imputer.fit(X, y)

    def transform(self, X):
        """Return the result of the wrapped imputer's ``transform(X)``."""
        return self.Imputer.transform(X)