class KNN_impute:
    """KNN-based imputer that pre-fills missing entries with label medians.

    ``fit`` first replaces every occurrence of the missing-value placeholder
    in ``X`` with the median of the matching column, computed per label, and
    then fits the wrapped ``KNNImputer`` on the result.
    """

    def __init__(self, missing_val_rep=0.0, k=10, copy=False):
        """Build the wrapped ``KNNImputer``.

        Args:
            missing_val_rep: placeholder value that marks a missing entry.
            k: passed to ``KNNImputer`` as its second positional argument
                (presumably the neighbour count -- confirm against the
                imported ``KNNImputer``).
            copy: forwarded to ``KNNImputer`` as ``copy``.
        """
        self.missing_val_rep = missing_val_rep
        # Both max-missing thresholds are set to 1.0.
        self.imputer = KNNImputer(missing_val_rep,
                                  k,
                                  copy=copy,
                                  col_max_missing=1.0,
                                  row_max_missing=1.0)

    def add_medians(self, X, y):
        """Replace placeholders in ``X`` with per-label medians, in place.

        A temporary ``'labels'`` column holding ``y`` is added to ``X`` so
        rows can be grouped by label; it is always removed again, even when
        the replacement step raises.

        Args:
            X: DataFrame of features; modified in place.
            y: labels assigned to ``X['labels']`` (one per row of ``X``).
        """
        X['labels'] = y
        try:
            # Medians are taken over the rows kept by remove_rows().
            label_meds = remove_rows(X).groupby(by='labels').median()
            for l in tqdm(label_meds.index):
                in_group = X['labels'] == l
                # The dict maps column name -> median, so each column gets
                # its own replacement value; 'labels' is the group index and
                # therefore not part of the dict.
                X[in_group] = X[in_group].replace(
                    self.missing_val_rep, label_meds.loc[l, :].to_dict())
        finally:
            # Never leak the helper column into the caller's frame.
            X.drop(columns=['labels'], inplace=True)

    def fit(self, X, y):
        """Median-fill ``X`` in place, then fit the wrapped imputer on it.

        Args:
            X: DataFrame of features; mutated by ``add_medians``.
            y: labels used for the per-label medians.

        Returns:
            None.
        """
        self.add_medians(X, y)
        print('INSIDE IMPUTER: Beginning the fit')
        self.imputer.fit(X)
        print('INSIDE IMPUTER: Completed the fit')
        return None

    def transform(self, X):
        """Return ``X`` imputed by the wrapped (already fitted) imputer."""
        return self.imputer.transform(X)
示例#2
0
def Impute_Data_KNN(X_train, y_train, X_test, y_test, vals_mask, cols, data,
                    var, min_vals, max_vals):
    """Impute train/test features and labels jointly with a 5-NN imputer.

    The label vector is appended as an extra last column, the imputer is
    fitted on the training matrix and applied to both matrices, and the
    result is split back into features and labels.

    Args:
        X_train, X_test: 2-D feature arrays.
        y_train, y_test: label arrays (reshaped to a single column).
        vals_mask, cols: not used by this function.
        data: only ``data.shape[1]`` is read; it gives the number of feature
            columns, i.e. the position of the label column.
        var: indexed as ``var.iloc[j]['type']``; the value ``'cat'`` marks
            column ``j`` as categorical.
        min_vals, max_vals: per-column clipping bounds for categorical
            columns.

    Returns:
        Tuple ``(X_train_imp, y_train_imp, X_test_imp, y_test_imp,
        y_train_imp_orig, y_test_imp_orig)``. The ``*_orig`` labels are the
        imputed label column cast to int16; the other labels are the
        ``>= 5`` indicator of that column as int16.
    """
    # Stack the labels next to the features so they are imputed together.
    train_stack = np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)
    test_stack = np.concatenate((X_test, y_test.reshape(-1, 1)), axis=1)

    knn = KNNImputer(n_neighbors=5)
    train_filled = knn.fit_transform(train_stack)
    test_filled = knn.transform(test_stack)

    label_col = data.shape[1]

    # Training split.
    X_train_imp = train_filled[:, 0:label_col]
    train_labels = train_filled[:, label_col]
    y_train_imp_orig = np.array(train_labels, dtype="int16")
    y_train_imp = np.array(train_labels >= 5, dtype="int16")

    # Test split.
    X_test_imp = test_filled[:, 0:label_col]
    test_labels = test_filled[:, label_col]
    y_test_imp = np.array(test_labels >= 5, dtype="int16")
    y_test_imp_orig = np.array(test_labels, dtype="int16")

    # Post-process column by column: categorical columns are rounded to the
    # nearest integer and clipped to their bounds, all others are rounded
    # to one decimal.
    for col in range(X_train_imp.shape[1]):
        is_categorical = var.iloc[col]['type'] == 'cat'
        for features in (X_train_imp, X_test_imp):
            if is_categorical:
                features[:, col] = np.clip(np.round(features[:, col]),
                                           min_vals[col], max_vals[col])
            else:
                features[:, col] = np.round(features[:, col], decimals=1)

    return (X_train_imp, y_train_imp, X_test_imp, y_test_imp,
            y_train_imp_orig, y_test_imp_orig)
示例#3
0
class StandardImputation(BaseEstimator, DataPreprocessorMixin):
    """Standard imputation method for static data.

    Reference 1: https://pypi.org/project/missingpy/
    Reference 2: https://s3.amazonaws.com/assets.datacamp.com/production/course_17404/slides/chapter4.pdf

    Attributes:
      - imputation_model_name: 'mice', 'missforest', 'knn'
      - data_type: 'static'
    """
    def __init__(self, imputation_model_name, data_type):
        # Only allow for certain options
        assert data_type == 'static'
        assert imputation_model_name in ['mice', 'missforest', 'knn']

        self.imputation_model_name = imputation_model_name
        self.data_type = data_type
        # The model is created lazily in fit(); None means "not fitted".
        self.imputation_model = None

    def fit(self, dataset):
        """Train standard imputation model.

        Nothing is trained when ``dataset.static_feature`` is None.

        Args:
          - dataset: incomplete dataset
        """
        if dataset.static_feature is not None:
            # MICE
            if self.imputation_model_name == 'mice':
                self.imputation_model = IterativeImputer()
            # MissForest
            elif self.imputation_model_name == 'missforest':
                self.imputation_model = MissForest()
            # KNN
            elif self.imputation_model_name == 'knn':
                self.imputation_model = KNNImputer()

            self.imputation_model.fit(dataset.static_feature)

        return

    def transform(self, dataset):
        """Return imputed dataset by standard imputation.

        A dataset without static features is returned unchanged; a fitted
        model is only required when there is something to impute.

        Args:
          - dataset: incomplete dataset

        Returns:
          - dataset: imputed dataset by standard imputation.
        """
        if dataset.static_feature is not None:
            # Checked here rather than up front so that fit_transform() on a
            # dataset with no static features (where fit() builds no model)
            # is a no-op instead of an AssertionError.
            assert self.imputation_model is not None

            # Standard imputation
            data_imputed = self.imputation_model.transform(
                dataset.static_feature)
            # Rounding
            dataset.static_feature = rounding(dataset.static_feature,
                                              data_imputed)

        return dataset

    def fit_transform(self, dataset):
        """Fit and transform. Return imputed data

        Args:
          - dataset: incomplete dataset
        """
        self.fit(dataset)
        return self.transform(dataset)
示例#4
0
class Imputer(object):
    """Module for feature imputation."""
    def __init__(self, missing_values='nan', strategy='mean', n_neighbors=5):
        '''
            Imputation of feature values using either sklearn, missingpy or
            (WIP) fancyimpute approaches.

            Parameters
            ----------
            missing_values : number, string, np.nan (default) or None
                The placeholder for the missing values. All occurrences of
                `missing_values` will be imputed. The string 'nan' (any
                capitalisation) is the default and stands for NaN.


            strategy : string, optional (default="mean")
                The imputation strategy.

                Supported using sklearn:
                - If "mean", then replace missing values using the mean along
                  each column. Can only be used with numeric data.
                - If "median", then replace missing values using the median along
                  each column. Can only be used with numeric data.
                - If "most_frequent", then replace missing using the most frequent
                  value along each column. Can be used with strings or numeric data.
                - If "constant", then replace missing values with fill_value. Can be
                  used with strings or numeric data.

                Supported using missingpy:
                - If 'knn', then use a nearest neighbor search. Can be
                  used with strings or numeric data.

                WIP: More strategies using fancyimpute

            n_neighbors : int, optional (default = 5)
                Number of neighboring samples to use for imputation if method
                is knn.

            '''

        # Set parameters to objects
        self.missing_values = missing_values
        self.strategy = strategy
        self.n_neighbors = n_neighbors

        # Depending on the imputations strategy, use a specific toolbox.
        # NOTE: any other strategy leaves self.Imputer unset.
        if strategy in ['mean', 'median', 'most_frequent', 'constant']:
            # Hand sklearn a real NaN instead of the 'nan' string; the
            # stored self.missing_values attribute is left as given.
            self.Imputer =\
             SimpleImputer(
                 missing_values=self._as_sklearn_missing(self.missing_values),
                 strategy=self.strategy)
        elif strategy == 'knn':
            if missing_values == 'nan':
                # Slightly different API for missingpy
                self.missing_values = 'NaN'
            self.Imputer = KNNImputer(missing_values=self.missing_values,
                                      n_neighbors=self.n_neighbors)

    @staticmethod
    def _as_sklearn_missing(missing_values):
        '''Return the marker to hand to sklearn: float NaN for the string
        'nan' (case-insensitive), otherwise `missing_values` unchanged.'''
        if isinstance(missing_values, str) and missing_values.lower() == 'nan':
            return float('nan')
        return missing_values

    def fit(self, X, y=None):
        '''Fit the selected backend imputer on X (y is passed through).'''
        self.Imputer.fit(X, y)

    def transform(self, X):
        '''Return X with missing values imputed by the fitted backend.'''
        return self.Imputer.transform(X)