def test_callable_metric():
    """Check that KNNImputer accepts a user-supplied callable as ``metric``."""

    def custom_callable(x, y, missing_values="NaN", squared=False):
        # L1 distance between two rows. NaN entries are masked before the
        # subtraction, and nansum skips whatever is left undefined.
        masked_x = np.ma.array(x, mask=np.isnan(x))
        masked_y = np.ma.array(y, mask=np.isnan(y))
        return np.nansum(np.abs(masked_x - masked_y))

    incomplete = np.array([
        [4, 3, 3, np.nan],
        [6, 9, 6, 9],
        [4, 8, 6, 9],
        [np.nan, 9, 11, 10.],
    ])
    expected = np.array([
        [4, 3, 3, 9],
        [6, 9, 6, 9],
        [4, 8, 6, 9],
        [5, 9, 11, 10.],
    ])

    knn = KNNImputer(n_neighbors=2, metric=custom_callable)
    assert_array_equal(knn.fit_transform(incomplete), expected)
def test_complete_features():
    """Compare default KNNImputer output against hand-computed column means."""
    # Test with use_complete=True
    X = np.array([
        [0, np.nan, 0, np.nan],
        [1, 1, 1, np.nan],
        [2, 2, np.nan, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [np.nan, 7, 7, 7],
    ])

    # Expected fill values, named <row><col> after the cell they replace.
    # Rows 0 and 1 share the same donor rows for the last column.
    last_col_fill = np.mean(X[2:-1, -1])
    fill = {
        "r0c1": np.mean(X[1:6, 1]),
        "r0c3": last_col_fill,
        "r1c3": last_col_fill,
        "r2c2": np.nanmean(X[:6, 2]),
        "r7c0": np.mean(X[2:-1, 0]),
    }

    expected = np.array([
        [0, fill["r0c1"], 0, fill["r0c3"]],
        [1, 1, 1, fill["r1c3"]],
        [2, 2, fill["r2c2"], 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [fill["r7c0"], 7, 7, 7],
    ])

    result = KNNImputer().fit_transform(X)
    assert_array_almost_equal(result, expected)
def impute_values(df: pd.DataFrame, method: str = 'mean', **kwargs):
    """
    Fill missing entries (np.nan or None) of a DataFrame.
    ------------------------
    Args:
        * df: pd.DataFrame of (samples x features)
        * method: name of the imputation strategy
            ** 'mean': replace each gap with its column mean
            ** 'knn': K-NN imputation (KNNImputer)
            ** 'rf': random forest imputation (MissForest)
        * **kwargs: forwarded to the imputer constructor for 'knn' / 'rf';
          not used by 'mean'

    Returns:
        * pd.DataFrame: imputed values (samples x features), same index and
          columns as the input
    """
    assert method in ('mean','knn','rf'), '{} not yet implemented.'.format(method)

    if method == 'mean':
        column_means = df.mean(0)
        return df.fillna(column_means)

    # 'knn' and 'rf' share the same fit/transform/re-wrap steps; only the
    # estimator class differs.
    matrix = df.values
    estimator_cls = KNNImputer if method == 'knn' else MissForest
    completed = estimator_cls(**kwargs).fit_transform(matrix)
    return pd.DataFrame(completed, index=df.index, columns=df.columns)
def pre_processing(data_route):
    """Load a CSV and fill its missing numeric values with K-NN imputation.

    The "cluster", "date" and "country" columns are set aside, every other
    column is made non-negative, +/-inf is treated as missing, and the gaps
    are filled by a default ``KNNImputer``.

    Args:
        data_route: path (or anything ``pd.read_csv`` accepts) of the CSV.

    Returns:
        pd.DataFrame with the same columns, in the same order, as the CSV.
    """
    data_frame = pd.read_csv(data_route)
    real_columns = data_frame.columns
    id_columns = ["cluster", "date", "country"]

    numeric = data_frame.drop(id_columns, axis=1)
    # Negative values are sign-flipped; abs() does this in one vectorised
    # step instead of a per-row Python loop.
    numeric = numeric.abs()
    numeric = numeric.replace([np.inf, -np.inf], np.nan)

    imputed = KNNImputer().fit_transform(numeric)

    # Keep the real labels and index on the imputed block. Relabelling the
    # result positionally with the CSV header would mislabel columns unless
    # the file happens to start with country, cluster, date.
    processed = pd.DataFrame(imputed, index=data_frame.index,
                             columns=numeric.columns)
    for name in id_columns:
        processed[name] = data_frame[name]
    return processed[real_columns]
def test_default_with_invalid_input():
    """Default KNNImputer must reject several kinds of invalid input."""
    default_knn = KNNImputer()

    # Case 1: % missing in a column > col_max_missing
    mostly_missing_col = np.array([
        [np.nan, 0, 0, 0, 5],
        [np.nan, 1, 0, np.nan, 3],
        [np.nan, 2, 0, 0, 0],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [np.nan, 8, 0, 8, 9],
    ])
    expected_msg = "Some column(s) have more than {}% missing values".format(
        default_knn.col_max_missing * 100)
    assert_raise_message(ValueError, expected_msg,
                         default_knn.fit, mostly_missing_col)

    # Case 2: insufficient number of neighbors
    too_few_rows = np.array([
        [1, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [6, 6, 2, 5, 13],
    ])
    expected_msg = "There are only %d samples, but n_neighbors=%d." % \
        (too_few_rows.shape[0], default_knn.n_neighbors)
    assert_raise_message(ValueError, expected_msg,
                         default_knn.fit, too_few_rows)

    # Case 3: inf present at fit time
    with_inf = np.array([
        [np.inf, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    expected_msg = "+/- inf values are not allowed."
    assert_raise_message(ValueError, expected_msg, KNNImputer().fit, with_inf)

    # Case 4: inf present in the matrix passed to transform()
    with_inf = np.array([
        [np.inf, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    clean_fit_data = np.array([
        [0, 1, 1, 2, np.nan],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    expected_msg = "+/- inf values are not allowed in data to be transformed."
    fitted = KNNImputer().fit(clean_fit_data)
    assert_raise_message(ValueError, expected_msg, fitted.transform, with_inf)
def test_knn_n_neighbors():
    """Check imputed values and statistics_ for 1 and for 6 neighbours."""
    incomplete = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, np.nan],
        [7, 7],
        [np.nan, 8],
        [14, 13],
    ])
    column_means = np.nanmean(incomplete, axis=0)

    # --- 1 neighbor ---
    expected_1nn = np.array([
        [0, 0],
        [4, 2],
        [4, 3],
        [5, 3],
        [7, 7],
        [7, 8],
        [14, 13],
    ])
    knn_1 = KNNImputer(n_neighbors=1)
    assert_array_equal(knn_1.fit_transform(incomplete), expected_1nn)
    assert_array_equal(knn_1.statistics_, column_means)

    # --- 6 neighbors ---
    incomplete = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, np.nan],
        [7, 7],
        [np.nan, 8],
        [14, 13],
    ])
    expected_6nn = np.array([
        [0, 0],
        [6, 2],
        [4, 3],
        [5, 5.5],
        [7, 7],
        [6, 8],
        [14, 13],
    ])
    k = 6
    knn_k = KNNImputer(n_neighbors=k)
    knn_k_plus_1 = KNNImputer(n_neighbors=k + 1)
    assert_array_equal(knn_k.fit_transform(incomplete), expected_6nn)
    assert_array_equal(knn_k.statistics_, column_means)
    # Asking for one more neighbour must give the same result here.
    assert_array_equal(knn_k.fit_transform(incomplete),
                       knn_k_plus_1.fit(incomplete).transform(incomplete))
def __init__(self, missing_val_rep=0.0, k=10, copy=False):
    """Store the missing-value marker and build the wrapped KNNImputer.

    Args:
        missing_val_rep: value that marks a missing entry; passed as the
            imputer's first positional argument.
        k: passed as the imputer's second positional argument
            (presumably the neighbour count - confirm against KNNImputer).
        copy: forwarded to the imputer unchanged.
    """
    self.missing_val_rep = missing_val_rep
    # Both missing-fraction limits are set to 1.0.
    limits = dict(col_max_missing=1.0, row_max_missing=1.0)
    self.imputer = KNNImputer(missing_val_rep, k, copy=copy, **limits)
def imputate_using_knn(dataset, k):
    """Impute `dataset` with a uniform-weight KNNImputer using `k` neighbours.

    Returns a new DataFrame built from the imputed matrix that carries the
    column labels of `dataset`; its index is the default one created by
    ``pd.DataFrame``.
    """
    column_labels = dataset.columns
    imputer = KNNImputer(weights="uniform", n_neighbors=k)
    filled = pd.DataFrame(imputer.fit_transform(dataset))
    filled.columns = column_labels
    return filled
def knn_impute(data, n_neighbors=3):
    """Impute NaN entries of `data` with a distance-weighted KNNImputer.

    Returns a new DataFrame built from the imputed matrix, labelled with the
    columns of `data`.
    """
    knn = KNNImputer(
        missing_values=np.nan,
        weights='distance',
        n_neighbors=n_neighbors,
    )
    completed = knn.fit_transform(data)
    result = pd.DataFrame(completed)
    result.columns = data.columns
    return result
def __init__(self, missing_values='nan', strategy='mean', n_neighbors=5):
    '''
    Imputation of feature values using either sklearn, missingpy or
    (WIP) fancyimpute approaches.

    Parameters
    ----------
    missing_values : number, string or None (default = 'nan')
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. For the 'knn' strategy the
        string 'nan' is rewritten to 'NaN' before it is handed on.

    strategy : string, optional (default="mean")
        The imputation strategy.

        Handled by SimpleImputer:
        - "mean": replace missing values using the mean along each
          column. Numeric data only.
        - "median": replace missing values using the median along each
          column. Numeric data only.
        - "most_frequent": replace missing values using the most
          frequent value along each column. Strings or numeric data.
        - "constant": replace missing values with fill_value.
          Strings or numeric data.

        Handled by KNNImputer:
        - "knn": use a nearest neighbor search.

        Any other value leaves `self.Imputer` unset.

        WIP: More strategies using fancyimpute

    n_neighbors : int, optional (default = 5)
        Number of neighboring samples to use when strategy is "knn".
    '''
    self.missing_values = missing_values
    self.strategy = strategy
    self.n_neighbors = n_neighbors

    # Pick the backing toolbox from the requested strategy.
    simple_strategies = ['mean', 'median', 'most_frequent', 'constant']
    if strategy in simple_strategies:
        self.Imputer = SimpleImputer(strategy=self.strategy,
                                     missing_values=self.missing_values)
        return

    if strategy != 'knn':
        return

    if missing_values == 'nan':
        # Slightly different API for missingpy
        self.missing_values = 'NaN'
    self.Imputer = KNNImputer(n_neighbors=self.n_neighbors,
                              missing_values=self.missing_values)
def outlier_treatment(train_data_frame):
    """Blank out outlier rows and refill them with K-NN imputation.

    Works on the column slice "expenses" through "volume". A row is treated
    as an outlier when at least one of those columns has |z-score| >= 3; the
    whole row of that slice is then set to NaN and imputed by a default
    ``KNNImputer``.

    Args:
        train_data_frame: DataFrame with "country", "cluster", "date" and
            the "expenses".."volume" column range.

    Returns:
        pd.DataFrame with columns country, cluster, date followed by the
        imputed numeric columns, on the index of the input frame.
    """
    numeric = train_data_frame.loc[:, "expenses":"volume"]
    cleaned = numeric.copy()
    cleaned[~(np.abs(stats.zscore(cleaned)) < 3).all(axis=1)] = np.nan

    imputed = KNNImputer().fit_transform(cleaned)

    # Keep the real labels and index on the imputed block. With a default
    # RangeIndex the re-inserted id columns would misalign for any other
    # input index, and positional relabelling depends on the input order.
    cdp = pd.DataFrame(imputed, index=train_data_frame.index,
                       columns=numeric.columns)
    for name in ("date", "cluster", "country"):
        cdp.insert(0, column=name, value=train_data_frame[name])
    return cdp
def test_knn_imputation_shape():
    """The imputed matrix keeps the input shape for every weights/k combo."""
    expected_shape = (10, 2)
    X = np.random.rand(*expected_shape)
    X[0, 0] = np.nan

    for weights in ['uniform', 'distance']:
        for n_neighbors in range(1, 6):
            knn = KNNImputer(n_neighbors=n_neighbors, weights=weights)
            imputed = knn.fit_transform(X)
            assert_equal(imputed.shape, expected_shape)
def do_impute(self, matrix_to_impute):
    """Impute `matrix_to_impute` with the configured impute mode.

    The matrix is transposed before it is handed to the imputer and the
    result is transposed back, so the returned array has the orientation of
    the input.

    Args:
        matrix_to_impute: 2-D array containing missing values.

    Returns:
        The imputed array.

    Raises:
        ValueError: if ``self.parameters.impute_mode`` is neither the
            random-forest nor the KNN mode constant. Previously this case
            failed with an UnboundLocalError on the return statement.
    """
    parameter_set = self.get_parameter_set()
    constants = parameter_set.constants
    # NOTE(review): this dumps every input matrix to the working directory;
    # it looks like a debugging leftover - confirm before removing.
    np.savetxt('test_cur_matrix_missing.csv', matrix_to_impute)

    mode = self.parameters.impute_mode
    transposed = np.transpose(matrix_to_impute)

    if mode == constants.v_unsupervised_parameters_impute_mode_randomforest:
        return np.transpose(self.rfimpute.miss_forest_imputation(transposed))

    if mode == constants.v_unsupervised_parameters_impute_mode_knn:
        imputer = KNNImputer(n_neighbors=2,
                             row_max_missing=1,
                             col_max_missing=1)
        return np.transpose(imputer.fit_transform(transposed))

    raise ValueError('Unsupported impute mode: {!r}'.format(mode))
def test_knn_imputation_zero():
    """Error paths when 0 (not NaN) is the missing-value marker."""
    knn = KNNImputer(missing_values=0, n_neighbors=2, weights="uniform")

    # NaN present although missing_values=0
    with_nan = np.array([
        [np.nan, 0, 0, 0, 5],
        [np.nan, 1, 0, np.nan, 3],
        [np.nan, 2, 0, 0, 0],
        [np.nan, 6, 0, 5, 13],
    ])
    expected_msg = (
        "Input contains NaN, infinity or a value too large for %r."
        % with_nan.dtype
    )
    assert_raise_message(ValueError, expected_msg, knn.fit, with_nan)

    # % zeros in a column > col_max_missing
    zero_heavy = np.array([
        [1, 0, 0, 0, 5],
        [2, 1, 0, 2, 3],
        [3, 2, 0, 0, 0],
        [4, 6, 0, 5, 13],
    ])
    expected_msg = "Some column(s) have more than {}% missing values".format(
        knn.col_max_missing * 100)
    assert_raise_message(ValueError, expected_msg, knn.fit, zero_heavy)
def test_knn_imputation_zero_p2():
    """Imputing with marker 0 must match imputing the same data with NaN."""
    zero_marked = np.array([
        [1, 0, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 0],
        [6, 6, 0, 6, 6],
    ])
    nan_marked = np.array([
        [1, np.nan, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, np.nan],
        [6, 6, np.nan, 6, 6],
    ])
    column_means = np.nanmean(nan_marked, axis=0)

    expected = np.array([
        [1, 2.5, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 1.5],
        [6, 6, 2.5, 6, 6],
    ])

    shared = dict(n_neighbors=2, weights="uniform")
    knn_zero = KNNImputer(missing_values=0, **shared)
    knn_nan = KNNImputer(missing_values="NaN", **shared)

    assert_array_equal(knn_zero.fit_transform(zero_marked), expected)
    assert_array_equal(knn_zero.statistics_, column_means)
    assert_array_equal(knn_zero.fit_transform(zero_marked),
                       knn_nan.fit_transform(nan_marked))
class KNN_impute:
    """KNNImputer wrapper that pre-fills gaps with per-label medians."""

    def __init__(self, missing_val_rep=0.0, k=10, copy=False):
        """Remember the missing-value marker and build the inner imputer."""
        self.missing_val_rep = missing_val_rep
        self.imputer = KNNImputer(missing_val_rep, k, copy=copy,
                                  row_max_missing=1.0,
                                  col_max_missing=1.0)

    def add_medians(self, X, y):
        """Replace the missing marker in `X` with per-label column medians.

        `X` is modified in place: a temporary 'labels' column holding `y`
        is added, each label group has its marker values replaced by that
        group's medians, and the temporary column is dropped again. The
        medians are computed on ``remove_rows(X)`` (helper defined
        elsewhere; presumably it filters out unusable rows - confirm).
        """
        X['labels'] = y
        group_medians = remove_rows(X).groupby(by='labels').median()
        for label in tqdm(group_medians.index):
            in_group = X['labels'] == label
            replacements = group_medians.loc[label, :].to_dict()
            X[in_group] = X[in_group].replace(self.missing_val_rep,
                                              replacements)
        X.drop(columns=['labels'], inplace=True)

    def fit(self, X, y):
        """Median-fill `X` in place, then fit the inner imputer on it."""
        self.add_medians(X, y)
        print('INSIDE IMPUTER: Beginning the fit')
        self.imputer.fit(X)
        print('INSIDE IMPUTER: Completed the fit')
        return None

    def transform(self, X):
        """Delegate to the fitted inner imputer."""
        return self.imputer.transform(X)
def test_complete_features_weighted():
    """Distance-weighted imputation must match hand-computed averages."""
    # Test with use_complete=True
    X = np.array([
        [0, 0, 0, np.nan],
        [1, 1, 1, np.nan],
        [2, 2, np.nan, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [np.nan, 7, 7, 7],
    ])

    distances = pairwise_distances(X, metric="masked_euclidean",
                                   squared=False)

    # Donor rows used for each missing cell.
    donors_r2 = (0, 1, 3, 4, 5)

    # Inverse-distance weights of the donors, per receiving row
    w_r0c3 = 1.0 / distances[0, 2:-1]
    w_r1c3 = 1.0 / distances[1, 2:-1]
    w_r2c2 = 1.0 / distances[2, donors_r2]
    w_r7c0 = 1.0 / distances[7, 2:7]

    # Weighted averages of the donor values
    r0c3 = np.average(X[2:-1, -1], weights=w_r0c3)
    r1c3 = np.average(X[2:-1, -1], weights=w_r1c3)
    r2c2 = np.average(X[donors_r2, 2], weights=w_r2c2)
    r7c0 = np.average(X[2:7, 0], weights=w_r7c0)

    expected = np.array([
        [0, 0, 0, r0c3],
        [1, 1, 1, r1c3],
        [2, 2, r2c2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [r7c0, 7, 7, 7],
    ])

    weighted_knn = KNNImputer(weights="distance")
    assert_array_almost_equal(weighted_knn.fit_transform(X), expected)
def impute_missing_for_dataframe(dataframe, target='job_performance'):
    """
    K-NN impute every column except `target`, then re-attach `target`.

    The imputer function should be used on a dataframe that has already
    been numerically encoded.

    Returns a new DataFrame: the imputed feature columns followed by the
    untouched target values.
    """
    from missingpy import KNNImputer  # , MissForest

    is_feature = dataframe.columns != target
    features = dataframe.loc[:, is_feature].values
    target_values = dataframe[target].values

    imputer = KNNImputer(
        n_neighbors=5,
        weights="uniform",
        metric="masked_euclidean",
        row_max_missing=0.8,
        col_max_missing=0.8,
        copy=True,
    )
    completed = imputer.fit_transform(features)

    result = pd.DataFrame(completed, columns=dataframe.columns[is_feature])
    result[target] = pd.Series(target_values)
    return result
def fit(self, dataset):
    """Train standard imputation model.

    Does nothing when the dataset has no static features. Otherwise the
    model named by `self.imputation_model_name` is created and fitted on
    `dataset.static_feature`.

    Args:
        - dataset: incomplete dataset
    """
    static = dataset.static_feature
    if static is None:
        return

    model_name = self.imputation_model_name
    if model_name == 'mice':
        # MICE
        self.imputation_model = IterativeImputer()
    elif model_name == 'missforest':
        # MissForest
        self.imputation_model = MissForest()
    elif model_name == 'knn':
        # KNN
        self.imputation_model = KNNImputer()

    self.imputation_model.fit(dataset.static_feature)
    return
def impute_times(final, times_open, times_closed, columns, imputation_method="mean"):
    """
    Impute open work items times with different methods

    A value of 0 in `times_open` marks a time that still has to be imputed.
    `times_open` is updated in place and also returned.

    :param final: Complete preprocessed dataframe (only read by 'KNN' / 'forest')
    :param times_open: Dataframe of work items that are not closed
    :param times_closed: Dataframe of work items that are closed
    :param columns: Columns to impute
    :param imputation_method: Choose between 'mean', 'KNN', 'forest';
        any other value leaves `times_open` unchanged
    :return: Dataframe of open work items with imputed values
    """
    if imputation_method == "mean":
        for col in columns:
            closed_mean = times_closed[col].mean()
            needs_value = times_open[col] == 0
            # Assign the masked column back to the frame. An inplace mask on
            # the selected column is not guaranteed to reach the frame
            # (it is a no-op under pandas Copy-on-Write).
            times_open[col] = times_open[col].mask(needs_value, closed_mean)
        return times_open

    if imputation_method not in ("KNN", "forest"):
        return times_open

    if imputation_method == "KNN":
        imputer = KNNImputer(missing_values=0, col_max_missing=0.9)
    else:
        imputer = MissForest(missing_values=0)

    for col in columns:
        # Two attempts per column: the configured imputer first, then a more
        # permissive KNN fallback. Previously the fallback was created but
        # the failing column was never retried.
        for _attempt in range(2):
            try:
                values = imputer.fit_transform(pd.DataFrame(final[col]))[:, 0]
                imputed = pd.Series(values, index=final.index)
                needs_value = times_open[col] == 0
                # mask() aligns `imputed` on the index of times_open.
                times_open[col] = times_open[col].mask(needs_value, imputed)
                break
            except ValueError:
                imputer = KNNImputer(missing_values=0, col_max_missing=0.99)
    return times_open
def test_weight_uniform():
    """'uniform' weights and a callable returning None must agree."""
    incomplete = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10],
    ])
    expected = np.array([
        [0, 0],
        [5, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10],
    ])

    def no_weight(dist=None):
        # Callable weight that opts out of weighting.
        return None

    # First the "uniform" (unweighted) setting, then the callable.
    for weights in ("uniform", no_weight):
        knn = KNNImputer(weights=weights)
        assert_array_equal(knn.fit_transform(incomplete), expected)
def create_2d_velocity_field(self, radii, v_rot, n_interp_r=150, n_interp_theta=150):
    '''
    uses tilted ring model parameters to calculate velocity field
    using eqn 1-3 of 1709.02049 and v_rot from mass model

    it is easier to loop through polar coordinates and then map the v_los
    to the nearest x,y point

    radii, v_rot: sample points of the rotation curve, linearly
        interpolated between min(radii) and max(radii)
    n_interp_r, n_interp_theta: number of radial / angular sample points

    returns 2d velocity field array of shape (image_ydim, image_xdim),
    rotated by np.rot90(..., 3) to match the fits data field
    '''
    v_field = np.empty(shape=(self.image_ydim, self.image_xdim))
    v_field[:] = np.nan

    v_rot_interp = interp1d(radii, v_rot)
    radii_interp = np.linspace(np.min(radii), np.max(radii), n_interp_r)
    # The angular grid is the same for every radius, so build it once.
    thetas = np.linspace(0, 2. * np.pi, n_interp_theta)

    for r in radii_interp:
        v = v_rot_interp(r)
        for theta in thetas:
            x, y, v_los = self._calc_v_los_at_r_theta(v, r, theta)
            inside = (self.image_xdim - 1 > x > 0
                      and self.image_ydim - 1 > y > 0)
            if not inside:
                continue
            arr_x, arr_y = int(np.round(x, 0)), int(np.round(y, 0))
            try:
                v_field[arr_y, arr_x] = v_los
            except (IndexError, ValueError):
                # Report the offending pixel and keep going. Narrowed from a
                # bare except so Ctrl-C / SystemExit are no longer swallowed.
                print(arr_x, arr_y, v_los)

    near_neighbors_mask = create_blurred_mask(v_field)

    # Pixels outside the mask are set to 0 before imputation and turned back
    # into NaN afterwards; NaN pixels inside the mask get imputed.
    imputer = KNNImputer(n_neighbors=3, weights="distance")
    v_field = imputer.fit_transform(
        np.where(near_neighbors_mask == 1, v_field, 0.))
    v_field[v_field == 0] = np.nan

    # rotate to match the fits data field
    v_field = np.rot90(v_field, 3)
    return v_field
def Impute_Data_KNN(X_train, y_train, X_test, y_test, vals_mask, cols, data, var, min_vals, max_vals):
    """K-NN impute features and labels jointly, then post-process features.

    Labels are appended as an extra column so they take part in the
    imputation. The imputer (5 neighbours) is fitted on the training block
    and applied to the test block. Feature columns whose ``var`` row has
    type 'cat' are rounded and clipped to [min_vals[j], max_vals[j]]; all
    other columns are rounded to one decimal.

    `vals_mask` and `cols` are accepted but not used here. The number of
    feature columns is taken from ``data.shape[1]``.

    Returns:
        (X_train_imp, y_train_imp, X_test_imp, y_test_imp,
         y_train_imp_orig, y_test_imp_orig) where the ``*_imp`` labels are
        the int16 indicator "label >= 5" and the ``*_orig`` labels are the
        imputed labels cast to int16.
    """
    n_features = data.shape[1]

    train_block = np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)
    test_block = np.concatenate((X_test, y_test.reshape(-1, 1)), axis=1)

    knn = KNNImputer(n_neighbors=5)
    train_filled = knn.fit_transform(train_block)
    test_filled = knn.transform(test_block)

    X_train_imp = train_filled[:, 0:n_features]
    train_labels = train_filled[:, n_features]
    y_train_imp_orig = np.array(train_labels, dtype="int16")
    y_train_imp = np.array(train_labels >= 5, dtype="int16")

    X_test_imp = test_filled[:, 0:n_features]
    test_labels = test_filled[:, n_features]
    y_test_imp = np.array(test_labels >= 5, dtype="int16")
    y_test_imp_orig = np.array(test_labels, dtype="int16")

    for j in range(X_train_imp.shape[1]):
        categorical = var.iloc[j]['type'] == 'cat'
        for features in (X_train_imp, X_test_imp):
            if categorical:
                features[:, j] = np.clip(np.round(features[:, j]),
                                         min_vals[j], max_vals[j])
            else:
                features[:, j] = np.round(features[:, j], decimals=1)

    return (X_train_imp, y_train_imp, X_test_imp, y_test_imp,
            y_train_imp_orig, y_test_imp_orig)
def clean_dragon(save=False):
    """Load the Dragon descriptor table, then scale and impute it.

    Steps: read "cids-smiles-dragon.txt" from DATA_DIR indexed by "CID",
    drop the first remaining column (SMILES), keep columns with fewer than
    500 nulls, standardise with StandardScaler, and fill what is still
    missing with KNNImputer.

    Args:
        save: when true, also write the result to
            "cids-smiles-dragon-scaled-imputed.txt" in DATA_DIR.

    Returns:
        pd.DataFrame of scaled, imputed features indexed by CID.
    """
    source_path = os.path.join(DATA_DIR, "cids-smiles-dragon.txt")
    table = pd.read_csv(source_path).set_index("CID")
    table = table.iloc[:, 1:]  # Drop SMILES column

    # Scale to mean 0, variance 1 (sparse columns are discarded first)
    scaler = StandardScaler()
    null_counts = table.isnull().sum()
    table = table[table.columns[null_counts < 500]]
    standardised = scaler.fit_transform(table.astype("float"))
    table = pd.DataFrame(standardised, index=table.index,
                         columns=table.columns)

    # Impute missing values
    # NOTE(review): `k` is passed through unchanged - confirm the imported
    # KNNImputer accepts this keyword.
    imputer = KNNImputer(k=5)
    completed = imputer.fit_transform(table.values)
    table = pd.DataFrame(completed, index=table.index, columns=table.columns)

    # Optionally save to disk
    if save:
        table.to_csv(
            os.path.join(DATA_DIR, "cids-smiles-dragon-scaled-imputed.txt"))
    return table
def test_metric_type():
    """A metric without NaN support must be rejected at fit time."""
    incomplete = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10],
    ])

    # "euclidean" is a metric type without NaN support
    knn = KNNImputer(metric="euclidean")
    assert_raise_message(
        ValueError,
        "The selected metric does not support NaN values.",
        knn.fit,
        incomplete,
    )
def fit(self, dataset):
    """Train standard imputation model.

    Does nothing when the dataset has no static features. Otherwise the
    model named by `self.imputation_model_name` is created and fitted on
    `dataset.static_feature`.

    Args:
        - dataset: incomplete dataset

    Raises:
        NotImplementedError: when the model name is "mice".
    """
    if dataset.static_feature is None:
        return

    model_name = self.imputation_model_name
    if model_name == "mice":
        # MICE
        # TODO: Resolve the below:
        raise NotImplementedError(
            "IterativeImputer not implemented due to versioning issues with fancyimpute"
        )
        # self.imputation_model = IterativeImputer()
    if model_name == "missforest":
        # MissForest
        self.imputation_model = MissForest()
    elif model_name == "knn":
        # KNN
        self.imputation_model = KNNImputer()

    self.imputation_model.fit(dataset.static_feature)
    return
def imputeMatrix(dataM):
    """Return `dataM` imputed by a KNNImputer with 10 neighbours."""
    return KNNImputer(n_neighbors=10).fit_transform(dataM)
out[index1, index2] = np.nan Mask[index1, index2] = 0 Missing = Image.fromarray(rgbArray) plt.imshow(Missing) plt.show() return out SelectedImage = showImagesRandomImages( 3) #select and image randomly from MNSIT dataset missingPercentage = 0.2 # missing rate percentage missingImage = generateMissingFig( SelectedImage, missingPercentage) #inserting missing values to the original image imputer = KNNImputer(n_neighbors=2, weights="uniform") imputed_by_KNN = imputer.fit_transform(missingImage) KNNImputed_RMSE = mean_squared_error(SelectedImage, imputed_by_KNN) #plt.imshow(imputed_by_KNN, cmap='gray', vmin=0, vmax=1) #plt.show() imputer = MissForest() MissForest_imputed = imputer.fit_transform(missingImage) MissForest_RMSE = mean_squared_error(SelectedImage, MissForest_imputed) #plt.imshow(MissForest_imputed, cmap='gray', vmin=0, vmax=1) #plt.show() imputer = IterativeImputer() MICE_imputed = imputer.fit_transform(missingImage) MICE_RMSE = mean_squared_error(SelectedImage, MICE_imputed) #plt.imshow(MICE_imputed, cmap='gray', vmin=0, vmax=1)
def imputeKNN(data, **kwargs):
    """K-NN impute a DataFrame, keeping its index and column labels.

    `kwargs` are forwarded to the KNNImputer constructor.
    """
    knn = KNNImputer(**kwargs)
    completed = knn.fit_transform(data)
    return pd.DataFrame(completed, index=data.index, columns=data.columns)
def perform_knn_imputation(dfs):
    """Impute every dataset in `dfs` with its own 100-neighbour KNNImputer.

    Returns a list of new DataFrames, one per input and in the same order,
    each built from the imputed matrix.
    """
    imputed_matrices = [
        KNNImputer(n_neighbors=100, copy=True).fit_transform(frame)
        for frame in dfs
    ]
    return [pd.DataFrame(data=matrix) for matrix in imputed_matrices]