class KNN_impute:
    """KNN-based imputer that first fills missing entries with per-label medians.

    ``fit`` replaces every occurrence of ``missing_val_rep`` in the training
    frame with the median of the corresponding column within the sample's
    label group, and then fits the wrapped ``KNNImputer`` on that frame.
    ``transform`` delegates directly to the wrapped imputer.

    Parameters
    ----------
    missing_val_rep : scalar, default 0.0
        Placeholder value that marks a missing entry.
    k : int, default 10
        Forwarded positionally as the second argument of ``KNNImputer``
        (presumably the number of neighbours -- confirm against the imported
        ``KNNImputer``).
    copy : bool, default False
        Forwarded to ``KNNImputer`` as ``copy``.
    """

    def __init__(self, missing_val_rep=0.0, k=10, copy=False):
        self.missing_val_rep = missing_val_rep
        # Both "max missing" thresholds are set to 1.0 here.
        # NOTE(review): presumably this disables the imputer's own
        # row/column rejection -- confirm against the KNNImputer in use.
        self.imputer = KNNImputer(missing_val_rep, k, copy=copy,
                                  col_max_missing=1.0, row_max_missing=1.0)

    def add_medians(self, X, y):
        """Replace missing entries of ``X`` in place with per-label medians.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame; modified in place. A temporary ``'labels'`` column
            is added for the duration of the call (an existing column with
            that name would be overwritten and then dropped).
        y : array-like
            One label per row of ``X``.
        """
        X['labels'] = y
        try:
            # Medians are computed on the output of the ``remove_rows``
            # helper (defined elsewhere), grouped by label.
            label_meds = remove_rows(X).groupby(by='labels').median()
            for l in tqdm(label_meds.index):
                in_group = X['labels'] == l
                # The dict maps each feature column to its group median, so
                # the placeholder is replaced column by column; 'labels' is
                # the group key and therefore not part of the dict.
                X[in_group] = X[in_group].replace(
                    self.missing_val_rep, label_meds.loc[l, :].to_dict())
        finally:
            # Always remove the helper column, even when the median step
            # fails, so the caller's frame never keeps a stray 'labels'.
            X.drop(columns=['labels'], inplace=True)

    def fit(self, X, y):
        """Fill ``X`` with label medians (in place) and fit the KNN imputer.

        Returns
        -------
        None
        """
        self.add_medians(X, y)
        print('INSIDE IMPUTER: Beginning the fit')
        self.imputer.fit(X)
        print('INSIDE IMPUTER: Completed the fit')
        return None

    def transform(self, X):
        """Return ``X`` as imputed by the fitted KNN imputer."""
        return self.imputer.transform(X)
def Impute_Data_KNN(X_train, y_train, X_test, y_test, vals_mask, cols, data,
                    var, min_vals, max_vals):
    """Impute train/test features and labels jointly with a 5-neighbour KNN.

    The label is appended to the feature matrix as an extra column, the
    imputer is fitted on the training block and applied to both blocks, and
    the result is split back into features and label at column
    ``data.shape[1]``.

    Args:
        X_train, X_test: 2-D feature arrays.
        y_train, y_test: label arrays (reshaped to a single column).
        vals_mask, cols: not used by this function.
        data: only ``data.shape[1]`` is read; it gives the number of feature
            columns.
        var: frame whose row ``j`` has a ``'type'`` entry; ``'cat'`` marks
            column ``j`` as categorical.
        min_vals, max_vals: per-column clipping bounds for categorical
            columns.

    Returns:
        Tuple ``(X_train_imp, y_train_imp, X_test_imp, y_test_imp,
        y_train_imp_orig, y_test_imp_orig)``. The ``*_imp`` labels are the
        int16 indicator ``label >= 5``; the ``*_orig`` labels are the imputed
        label column cast to int16.
    """
    def _with_label(features, labels):
        # Append the labels as one extra trailing column.
        return np.concatenate((features, labels.reshape(-1, 1)), axis=1)

    n_features = data.shape[1]

    knn = KNNImputer(n_neighbors=5)
    train_filled = knn.fit_transform(_with_label(X_train, y_train))
    test_filled = knn.transform(_with_label(X_test, y_test))

    # Split the completed blocks back into features and label column.
    X_train_imp = train_filled[:, 0:n_features]
    X_test_imp = test_filled[:, 0:n_features]
    train_label_col = train_filled[:, n_features]
    test_label_col = test_filled[:, n_features]

    y_train_imp_orig = np.array(train_label_col, dtype="int16")
    y_train_imp = np.array(train_label_col >= 5, dtype="int16")
    y_test_imp = np.array(test_label_col >= 5, dtype="int16")
    y_test_imp_orig = np.array(test_label_col, dtype="int16")

    # Categorical columns: round to whole numbers and clip to the allowed
    # range. All other columns: round to one decimal.
    for col in range(X_train_imp.shape[1]):
        is_categorical = var.iloc[col]['type'] == 'cat'
        for matrix in (X_train_imp, X_test_imp):
            if is_categorical:
                matrix[:, col] = np.clip(np.round(matrix[:, col]),
                                         min_vals[col], max_vals[col])
            else:
                matrix[:, col] = np.round(matrix[:, col], decimals=1)

    return (X_train_imp, y_train_imp, X_test_imp, y_test_imp,
            y_train_imp_orig, y_test_imp_orig)
class StandardImputation(BaseEstimator, DataPreprocessorMixin):
    """Standard imputation method for static data.

    Reference 1: https://pypi.org/project/missingpy/
    Reference 2: https://s3.amazonaws.com/assets.datacamp.com/production/course_17404/slides/chapter4.pdf

    Attributes:
        - imputation_model_name: 'mice', 'missforest', 'knn'
        - data_type: 'static'
    """

    def __init__(self, imputation_model_name, data_type):
        # Only allow for certain options
        assert data_type == 'static'
        assert imputation_model_name in ['mice', 'missforest', 'knn']

        self.imputation_model_name = imputation_model_name
        self.data_type = data_type
        # The imputation model is created lazily in fit()
        self.imputation_model = None

    def fit(self, dataset):
        """Train standard imputation model.

        Nothing is fitted when the dataset has no static features.

        Args:
            - dataset: incomplete dataset
        """
        if dataset.static_feature is None:
            return

        # MICE
        if self.imputation_model_name == 'mice':
            self.imputation_model = IterativeImputer()
        # MissForest
        elif self.imputation_model_name == 'missforest':
            self.imputation_model = MissForest()
        # KNN
        elif self.imputation_model_name == 'knn':
            self.imputation_model = KNNImputer()

        self.imputation_model.fit(dataset.static_feature)
        return

    def transform(self, dataset):
        """Return imputed dataset by standard imputation.

        A dataset without static features is returned unchanged, whether or
        not a model has been fitted. This keeps fit_transform() usable on
        such datasets, for which fit() deliberately creates no model.

        Args:
            - dataset: incomplete dataset

        Returns:
            - dataset: imputed dataset by standard imputation.
        """
        if dataset.static_feature is None:
            return dataset

        # A model is only required once there is something to impute
        assert self.imputation_model is not None

        # Standard imputation
        data_imputed = self.imputation_model.transform(dataset.static_feature)
        # Rounding: post-process the imputed values with the rounding()
        # helper, which receives both the original and the imputed features
        dataset.static_feature = rounding(dataset.static_feature,
                                          data_imputed)
        return dataset

    def fit_transform(self, dataset):
        """Fit and transform. Return imputed data

        Args:
            - dataset: incomplete dataset
        """
        self.fit(dataset)
        return self.transform(dataset)
class Imputer(object):
    """Module for feature imputation."""

    # Strategies that are delegated to sklearn's SimpleImputer.
    _SKLEARN_STRATEGIES = ('mean', 'median', 'most_frequent', 'constant')

    def __init__(self, missing_values='nan', strategy='mean',
                 n_neighbors=5):
        '''
        Imputation of feature values using either sklearn, missingpy or
        (WIP) fancyimpute approaches.

        Parameters
        ----------
        missing_values : number, string, np.nan or None (default='nan')
            The placeholder for the missing values. All occurrences of
            `missing_values` will be imputed. The strings 'nan' / 'NaN'
            (any capitalisation) are interpreted as "not a number": they are
            translated to a float NaN for the sklearn strategies and to the
            string 'NaN' for the missingpy 'knn' strategy.

        strategy : string, optional (default="mean")
            The imputation strategy.

            Supported using sklearn:
            - If "mean", then replace missing values using the mean along
              each column. Can only be used with numeric data.
            - If "median", then replace missing values using the median along
              each column. Can only be used with numeric data.
            - If "most_frequent", then replace missing using the most frequent
              value along each column. Can be used with strings or numeric data.
            - If "constant", then replace missing values with fill_value. Can be
              used with strings or numeric data.

            Supported using missingpy:
            - If 'knn', then use a nearest neighbor search. Can be
              used with strings or numeric data.

            WIP: More strategies using fancyimpute

        n_neighbors : int, optional (default = 5)
            Number of neighboring samples to use for imputation if method
            is knn.

        Raises
        ------
        ValueError
            If `strategy` is not one of the supported strategies.
        '''
        # Set parameters to objects
        self.missing_values = missing_values
        self.strategy = strategy
        self.n_neighbors = n_neighbors

        nan_requested = self._is_nan_string(missing_values)

        # Depending on the imputations strategy, use a specific toolbox
        if strategy in self._SKLEARN_STRATEGIES:
            if nan_requested:
                # SimpleImputer treats a string as a literal value to match;
                # it needs a real NaN to detect missing numeric entries.
                self.missing_values = float('nan')
            self.Imputer = SimpleImputer(missing_values=self.missing_values,
                                         strategy=self.strategy)
        elif strategy == 'knn':
            if nan_requested:
                # Slightly different API for missingpy
                self.missing_values = 'NaN'
            self.Imputer = KNNImputer(missing_values=self.missing_values,
                                      n_neighbors=self.n_neighbors)
        else:
            # Fail early instead of leaving self.Imputer undefined, which
            # would only surface later as an AttributeError in fit().
            supported = list(self._SKLEARN_STRATEGIES) + ['knn']
            raise ValueError(
                'Unsupported imputation strategy %r; expected one of %s.'
                % (strategy, supported))

    @staticmethod
    def _is_nan_string(value):
        """Return True if `value` is the string 'nan' in any capitalisation."""
        return isinstance(value, str) and value.lower() == 'nan'

    def fit(self, X, y=None):
        """Fit the underlying imputer on X; y is passed through unchanged."""
        self.Imputer.fit(X, y)

    def transform(self, X):
        """Return X with missing values filled in by the fitted imputer."""
        return self.Imputer.transform(X)