def impute(self, df):
    """Fill missing values in *df* and return a new DataFrame.

    Uses KNN imputation when ``self.knn`` is truthy, otherwise
    MICE-style iterative imputation. Column labels are preserved.
    """
    imputer = KNN() if self.knn else IterativeImputer()
    return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
def impute(self, df):
    """Return *df* with NaNs imputed.

    The ``self.knn`` flag selects KNN imputation; otherwise an
    iterative (MICE) imputer is used. Columns keep their names.
    """
    if self.knn:
        filled = KNN().fit_transform(df)
    else:
        filled = IterativeImputer().fit_transform(df)
    return pd.DataFrame(filled, columns=df.columns)
def test_iterative_imputer_with_low_rank_random_matrix():
    """IterativeImputer should reconstruct the low-rank matrix well (MAE < 0.1)."""
    completed = IterativeImputer(n_iter=50, random_state=0).fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY, completed, missing_mask, name="IterativeImputer")
    assert missing_mae < 0.1, "Error too high with IterativeImputer method!"
def multi_imp(data, m):
    """Run *m* stochastic MICE imputations over *data* and return one array.

    NOTE(review): ``list.extend`` concatenates the rows of every imputed
    matrix, so the result has shape (m * n_rows, n_cols) rather than
    (m, n_rows, n_cols) — confirm callers expect row-stacked output
    (the sibling MICE-style test uses ``append`` + mean instead).
    """
    completed = []
    for seed in range(m):
        imputer = IterativeImputer(n_iter=5, sample_posterior=True,
                                   random_state=seed)
        completed.extend(imputer.fit_transform(data))
    return np.array(completed)
def get_predict(self, flag, in_data):
    """Impute the unmeasured entries of one measurement vector.

    The vector is reshaped to (M_NUM, 1); entries where *flag* is False
    are blanked to NaN, the vector is stacked against the stored
    measurements ``self.t_measure`` and filled by MICE, and the imputed
    row is returned as an (M_NUM, 1) column.
    """
    target = in_data.copy()
    target.shape = (utils.M_NUM, 1)
    target[~flag] = np.nan
    # Rows = samples after transpose; our target vector is the last row.
    stacked = np.column_stack((self.t_measure.copy(), target)).transpose()
    filled = MICE().fit_transform(stacked)
    return np.array(filled[-1, :]).reshape(utils.M_NUM, 1)
def test_iterative_imputer_with_low_rank_random_matrix_approximate():
    """With only 5 nearest features the imputer should still reach MAE < 0.1."""
    imputer = IterativeImputer(n_iter=50, n_nearest_features=5, random_state=0)
    completed = imputer.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY, completed, missing_mask,
        name="IterativeImputer with n_nearest_features=5")
    assert missing_mae < 0.1, ("Error too high with IterativeImputer "
                               "method using n_nearest_features=5!")
def mice_imputer_wo_target(df):
    """MICE-impute the feature frame (target excluded) and relabel columns."""
    feature_columns = [
        'city', 'city_development_index', 'gender', 'relevent_experience',
        'enrolled_university', 'education_level', 'major_discipline',
        'experience', 'company_size', 'company_type', 'last_new_job',
        'training_hours',
    ]
    filled = IterativeImputer().fit_transform(df)
    return pd.DataFrame(filled, columns=feature_columns)
def test_iterative_imputer_train_test_with_low_rank_random_matrix():
    """Fit on the first 250 rows, transform the rest; MAE must stay < 0.1."""
    train_part = XY_incomplete[:250]
    test_part = XY_incomplete[250:]
    imputer = IterativeImputer(n_iter=50, random_state=0)
    imputer.fit(train_part)
    completed_test = imputer.transform(test_part)
    _, missing_mae = reconstruction_error(
        XY[250:], completed_test, missing_mask,
        name="IterativeImputer Train/Test")
    assert missing_mae < 0.1, "Error too high with IterativeImputer train/test method!"
def test_iterative_imputer_as_mice_with_low_rank_random_matrix_approximate():
    """Averaging 5 posterior-sampled imputations (MICE) must reach MAE < 0.1."""
    imputations = [
        IterativeImputer(n_iter=5, sample_posterior=True,
                         random_state=seed).fit_transform(XY_incomplete)
        for seed in range(5)
    ]
    _, missing_mae = reconstruction_error(
        XY, np.mean(imputations, axis=0), missing_mask,
        name="IterativeImputer as MICE")
    assert missing_mae < 0.1, "Error too high with IterativeImputer as MICE!"
def fit(self, X, y=None):
    """Fit ``self.multiple`` IterativeImputer models, optionally stratified.

    :param X: pandas DataFrame of features (asserted below).
    :param y: optional labels; when given, a group of imputers is fitted
        per unique label. Mutually exclusive with ``self.groupby``, which
        groups rows by a column of X instead.
    :return: self
    """
    assert isinstance(X, pd.DataFrame)
    y_present = y is not None
    groupby_present = self.groupby is not None
    self.imputers = []
    if y_present or groupby_present:
        # Stratifying by label and by column at the same time is unsupported.
        assert not (groupby_present and y_present)
        if y_present:
            classes = np.unique(y)
            gen_mask = lambda c: y == c
        if groupby_present:
            classes = X[self.groupby].unique()
            gen_mask = lambda c: X[self.groupby] == c
        self.imputers = {
            c: {
                "impute": [
                    IterativeImputer(n_iter=self.n_iter,
                                     sample_posterior=True,
                                     random_state=ix,
                                     **self.kwargs)
                    for ix in range(self.multiple)
                ],
                "mask": gen_mask(c),
            }
            for c in classes
        }
        msg = """Imputation transformer: {} imputers x {} classes""".format(
            self.multiple, len(classes))
        logger.info(msg)
        for c, d in self.imputers.items():
            for imp in d["impute"]:
                # BUG FIX: ``X[d["mask"], :]`` is invalid DataFrame indexing
                # (a tuple key raises); select rows by boolean mask via .loc.
                imp.fit(X.loc[d["mask"], :])
    else:
        for ix in range(self.multiple):
            self.imputers.append(
                IterativeImputer(n_iter=self.n_iter,
                                 sample_posterior=True,
                                 random_state=ix,
                                 **self.kwargs))
        msg = """Imputation transformer: {} imputers""".format(
            self.multiple)
        logger.info(msg)
        for ix in range(self.multiple):
            self.imputers[ix].fit(X)
    return self
def __init__(self, method, **kwargs):
    """Select an imputation backend by name.

    :param method: one of "SoftImpute", "KNN", "Naive"; "II" is declared
        but untested and deliberately rejected.
    :param kwargs: forwarded to the chosen backend's constructor.
    :raises NotImplementedError: for the untested "II" method.
    :raises ValueError: for any unrecognised method name.
    """
    self.clf = None
    self.method = method
    if method == "SoftImpute":
        self.clf = SoftImpute(**kwargs)
    elif method == "KNN":
        self.clf = KNN(**kwargs)
    elif method == "Naive":
        self.clf = SimpleFill()
    elif method == 'II':
        # BUG FIX: ``raise ('NOT TESTED')`` raised a plain string, which is
        # a TypeError in Python 3; raise a proper exception. The
        # IterativeImputer assignment that followed was unreachable.
        raise NotImplementedError('NOT TESTED')
    else:
        # BUG FIX: same string-raise problem as above.
        raise ValueError("Not Implemented method")
class DFIterativeImputer(BaseEstimator, TransformerMixin):
    """DataFrame-friendly wrapper around sklearn's IterativeImputer.

    Keeps the index and column labels of the input frame on transform.
    """

    def __init__(self, max_iter=10):
        self.imputer = None
        self.max_iter = max_iter

    def fit(self, X, y=None):
        """Fit the underlying IterativeImputer on X; y is ignored."""
        self.imputer = IterativeImputer(max_iter=self.max_iter)
        self.imputer.fit(X)
        return self

    def transform(self, X):
        """Return X with NaNs imputed, labels preserved."""
        filled = self.imputer.transform(X)
        return pd.DataFrame(filled, index=X.index, columns=X.columns)
def deal_mar(df):
    """Deal with missing data with missing at random pattern.

    Scores KNN, matrix-factorization and multiple (MICE) imputation on
    the incomplete matrix and returns the name of the best method.
    """
    Xy_incomplete = df.values
    # knn
    with NoStdStreams():
        Xy_filled_knn = KNN().fit_transform(Xy_incomplete)
    score_knn = compute_imputation_score(Xy_filled_knn)
    print("Imputation score of knn is {}".format(score_knn))
    # matrix factorization
    with NoStdStreams():
        Xy_filled_mf = MatrixFactorization().fit_transform(Xy_incomplete)
    score_mf = compute_imputation_score(Xy_filled_mf)
    # BUG FIX: this line previously printed score_knn instead of score_mf.
    print("Imputation score of matrix factorization is {}".format(score_mf))
    # multiple imputation
    with NoStdStreams():
        Xy_filled_ii = IterativeImputer().fit_transform(Xy_incomplete)
    score_ii = compute_imputation_score(Xy_filled_ii)
    print("Imputation score of multiple imputation is {}".format(score_ii))
    score_dict = {'knn': score_knn,
                  'matrix factorization': score_mf,
                  'multiple imputation': score_ii}
    recommend = max(score_dict, key=score_dict.get)
    # ("socre" typo fixed in the message below.)
    print("Imputation method with the highest score is {}".format(recommend))
    return recommend
def baseline_inpute(X_incomplete, method='mean', level=0):
    """Impute *X_incomplete* with one of several baseline methods.

    *level* (0/1/2) selects increasingly expensive hyper-parameters for
    the chosen method. Raises NotImplementedError for unknown methods.
    """
    if method == 'mean':
        return SimpleFill().fit_transform(X_incomplete)
    if method == 'knn':
        neighbors = [3, 10, 50][level]
        return KNN(k=neighbors, verbose=False).fit_transform(X_incomplete)
    if method == 'svd':
        n_features = X_incomplete.shape[1]
        rank = [np.ceil((n_features - 1) / 10),
                np.ceil((n_features - 1) / 5),
                n_features - 1][level]
        return IterativeSVD(rank=int(rank), verbose=False).fit_transform(X_incomplete)
    if method == 'mice':
        iterations = [3, 10, 50][level]
        return IterativeImputer(max_iter=iterations).fit_transform(X_incomplete)
    if method == 'spectral':
        # default value for the sparsity level is with respect to the maximum
        # singular value; this is now done in a heuristic way
        shrinkage = [0.5, None, 3][level]
        return SoftImpute(shrinkage_value=shrinkage).fit_transform(X_incomplete)
    raise NotImplementedError
def clean_missing(df, features, setting):
    """Clean missing values in the dataset.

    Parameters
    ----------
    df : DataFrame
    features : List
        List of feature names.
    setting : str
        Missingness mechanism: 'mcar', 'mar' or 'mnar' (anything else
        falls back to MAR).

    Returns
    -------
    features_new : List
        List of feature names after cleaning.
    Xy_filled : array-like or None
        Numpy array where missing values have been cleaned, or None when
        the recommended approach has no imputation implemented here.
    """
    df_preprocessed, features_new = missing_preprocess(df, features)
    if setting == 'mcar':
        recommend = deal_mcar(df_preprocessed)
    elif setting == 'mar':
        recommend = deal_mar(df_preprocessed)
    elif setting == 'mnar':
        recommend = deal_mnar(df_preprocessed)
    else:
        print("Default MAR")
        recommend = deal_mar(df_preprocessed)
    # BUG FIX: Xy_filled was referenced at the return even when no branch
    # assigned it (e.g. recommend == 'list deletion'), raising
    # UnboundLocalError; initialise it so the error path returns None.
    Xy_filled = None
    if recommend == 'mean':
        print("Applying mean imputation ...")
        Xy_filled = Imputer(missing_values=np.nan, strategy='mean').fit_transform(
            df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'mode':
        print("Applying mode imputation ...")
        Xy_filled = Imputer(missing_values=np.nan,
                            strategy='most_frequent').fit_transform(
                                df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'knn':
        print("Applying knn imputation ...")
        with NoStdStreams():
            Xy_filled = KNN().fit_transform(df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'matrix factorization':
        print("Applying matrix factorization ...")
        with NoStdStreams():
            Xy_filled = MatrixFactorization().fit_transform(
                df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'multiple imputation':
        print("Applying multiple imputation ...")
        with NoStdStreams():
            Xy_filled = IterativeImputer().fit_transform(
                df_preprocessed.values)
        print("Missing values cleaned!")
    else:
        print("Error: Approach not available!")
    return features_new, Xy_filled
class vk_sensing():
    """Matrix-sensing helper that wraps several low-rank imputers."""

    def __init__(self, method, **kwargs):
        """Select an imputation backend by name.

        :raises NotImplementedError: for the untested "II" method.
        :raises ValueError: for any unrecognised method name.
        """
        self.clf = None
        self.method = method
        if method == "SoftImpute":
            self.clf = SoftImpute(**kwargs)
        elif method == "KNN":
            self.clf = KNN(**kwargs)
        elif method == "Naive":
            self.clf = SimpleFill()
        elif method == 'II':
            # BUG FIX: ``raise ('NOT TESTED')`` raised a plain string, which
            # is a TypeError in Python 3; raise a proper exception. The
            # IterativeImputer assignment that followed was unreachable.
            raise NotImplementedError('NOT TESTED')
        else:
            # BUG FIX: same string-raise problem as above.
            raise ValueError("Not Implemented method")

    def fit_transform(self, X_train):
        """Fit the chosen imputer on X_train and return a NaN-free estimate."""
        assert (self.clf is not None)
        X_est = None
        if np.isnan(X_train).any():
            if np.isnan(X_train).all():
                # Nothing observed at all: fall back to zeros.
                X_est = np.zeros_like(X_train)
            else:
                X_est = massage_imputed_matrix(self.clf.fit_transform(X_train))
        else:
            X_est = X_train
        assert (not np.isnan(X_est).any())
        return X_est

    def CVfit(self, X, val_ratio=0.2):
        """Pick the best rank k by hold-out validation and store the imputer.

        NOTE(review): the observed entries are split so that X_train keeps
        the *sampled* ~val_ratio fraction and X_val keeps the remainder —
        this looks inverted relative to a usual train/val split; confirm
        the intent before changing it.
        """
        mask = np.invert(np.isnan(X))
        sample_mask = np.random.rand(*X.shape) < val_ratio
        X_train = X.copy()
        X_train[mask & (~sample_mask)] = np.nan
        X_val = X.copy()
        X_val[mask & (sample_mask)] = np.nan
        cur_best_err = np.inf
        cur_best_k = None
        for k in GLOB_IMPUTE_K_SWEEP:
            clf = construct_low_rank_imputer(self.method, k)
            if np.isnan(X_train).any():
                if np.isnan(X_train).all():
                    X_est = np.zeros_like(X_train)
                else:
                    X_est = massage_imputed_matrix(clf.fit_transform(X_train))
            else:
                X_est = X_train
            err = MAE(X_est, X_val)
            if err < cur_best_err:
                cur_best_err = err
                cur_best_k = k
        if cur_best_k is None:
            cur_best_k = 1
        self.clf = construct_low_rank_imputer(self.method, cur_best_k)
def deal_mcar(df):
    """Deal with missing data with missing completely at random pattern.

    Returns 'list deletion' when under 5% of rows have missing values;
    otherwise scores several imputation methods and returns the best one.
    """
    # number of instances
    num_instances = df.shape[0]
    # number of rows containing missing
    num_missing_instances = df.isnull().sum(axis=1).astype(bool).sum()
    # missing percentage
    missing_percentage = num_missing_instances / num_instances
    print("Missing percentage is {}".format(missing_percentage))
    if missing_percentage < 0.05:
        recommend = 'list deletion'
    else:
        Xy_incomplete = df.values
        # mean
        Xy_filled_mean = Imputer(missing_values=np.nan,
                                 strategy='mean').fit_transform(Xy_incomplete)
        score_mean = compute_imputation_score(Xy_filled_mean)
        print("Imputation score of mean is {}".format(score_mean))
        # mode
        Xy_filled_mode = Imputer(
            missing_values=np.nan,
            strategy='most_frequent').fit_transform(Xy_incomplete)
        score_mode = compute_imputation_score(Xy_filled_mode)
        print("Imputation score of mode is {}".format(score_mode))
        # knn
        with NoStdStreams():
            Xy_filled_knn = KNN().fit_transform(Xy_incomplete)
        score_knn = compute_imputation_score(Xy_filled_knn)
        print("Imputation score of knn is {}".format(score_knn))
        # matrix factorization
        with NoStdStreams():
            Xy_filled_mf = MatrixFactorization().fit_transform(Xy_incomplete)
        score_mf = compute_imputation_score(Xy_filled_mf)
        # BUG FIX: this line previously printed score_knn instead of score_mf.
        print(
            "Imputation score of matrix factorization is {}".format(score_mf))
        # multiple imputation
        with NoStdStreams():
            Xy_filled_ii = IterativeImputer().fit_transform(Xy_incomplete)
        score_ii = compute_imputation_score(Xy_filled_ii)
        print("Imputation score of multiple imputation is {}".format(score_ii))
        score_dict = {
            'mean': score_mean,
            'mode': score_mode,
            'knn': score_knn,
            'matrix factorization': score_mf,
            'multiple imputation': score_ii
        }
        recommend = max(score_dict, key=score_dict.get)
        # ("socre" typo fixed in the message below.)
        print("Imputation method with the highest score is {}".format(
            recommend))
    return recommend
def __init__(self):
    """Load the PTSD spreadsheet, merge the three PCL questionnaires,
    optionally impute with MICE, and produce stratified train/test splits.

    NOTE(review): relies on class attributes (``self.features``,
    ``self.ID``, ``self.target_features`` …) and on a name ``mew`` that is
    not defined in this block — presumably a module-level flag enabling
    MICE imputation; verify where it comes from.
    """
    # NOTE(review): Windows paths use unescaped backslashes; they happen
    # not to form valid escape sequences here, but raw strings would be
    # safer — TODO confirm these literals are intended as-is.
    path = "C:\PycharmProjects\PTSD\Data\PTSD.xlsx"
    df = pd.read_excel(path)
    # Keep only rows with a known target and with military experience.
    df = df[~df['PCL_Strict3'].isna()]
    df = df[df["military_exp18_t3"] > 0]
    df = df[self.features + self.ID + self.target_features]
    # Load and score the three PCL questionnaires.
    df_pcl3 = pd.read_excel("C:\PycharmProjects\PTSD\Data\questionnaire6PCL3.xlsx")
    df_pcl3 = PCL_calculator(df_pcl3)
    df_pcl2 = pd.read_excel("C:\PycharmProjects\PTSD\Data\questionnaire6PCL2.xlsx")
    df_pcl2 = PCL_calculator(df_pcl2)
    df_pcl1 = pd.read_excel("C:\PycharmProjects\PTSD\Data\questionnaire6PCL1.xlsx")
    df_pcl1 = PCL_calculator(df_pcl1)
    # Outer-merge the questionnaires onto the main frame by subject ID.
    df = df.merge(df_pcl1, on="ID", how='outer')
    df = df.merge(df_pcl2, suffixes=('_pcl1', '_pcl2'), on="ID", how='outer')
    df = df.merge(df_pcl3.drop(['PCL3_Strict', 'pcl3', 'PCL3_Broad'], axis=1),
                  on="ID", how='outer')
    # Re-filter: outer merges can reintroduce rows without a target.
    df = df[~df['PCL_Strict3'].isna()]
    #df = df[~df['tred_cutoff'].isna()]
    df.drop(self.ID, inplace=True, axis=1)
    # MICE-impute the merged frame when the (external) ``mew`` flag is set.
    if mew:
        mice = IterativeImputer()
        df = pd.DataFrame(mice.fit_transform(df), columns=df.columns)
    all_x_col = self.features + self.features_2 + self.target_features_2
    #all_x_col = self.features + self.features_2
    #y_col = ["tred_cutoff"]
    y_col = ["PCL_Strict3"]
    X = df[all_x_col]
    Y = df[y_col]
    # Two-level stratified split: 25% held-out test, then 25% of the
    # remainder as an inner test set (fixed seed for reproducibility).
    X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(
        X, Y, test_size=0.25, random_state=271828, stratify=Y)
    X_train, X_test, y_train, y_test = train_test_split(
        X_train_0, y_train_0, test_size=0.25, random_state=271828,
        stratify=y_train_0)
    # The working frame keeps only the inner training portion.
    df = pd.concat([X_train, y_train], axis=1)
    self.X_test = X_test
    self.y_test = y_test
    self.X_train_0 = X_train_0
    self.X_test_0 = X_test_0
    self.y_train_0 = y_train_0
    self.y_test_0 = y_test_0
    self.df = df
def mice_impute(data):
    """MICE-impute *data* and write the result to mice_imputed_data.csv."""
    print("imputing data using mice")
    imputer = IterativeImputer(imputation_order='random', n_iter=5,
                               sample_posterior=True)
    filled = pd.DataFrame(imputer.fit_transform(data.values))
    filled.columns = data.columns
    filled.index = data.index
    filled.to_csv('mice_imputed_data.csv')
    print("data imputed using mice")
def obj_sim(path):
    """Impute column index 4 of the wine CSV and plot the filled values."""
    frame = pd.read_csv(path, header=0, index_col=0, engine='python',
                        encoding='utf-8')
    values = frame.values
    # Single-column MICE imputation of the 5th column.
    filled = IterativeImputer().fit_transform(values[:, 4].reshape(-1, 1))
    draw(pd.DataFrame(filled), 0)
def fill_ii(df):
    '''
    Use IterativeImputer to fill null number
    ----------------------------------
    df: the pandas.dataframe going to fill missing value
    '''
    # BUG FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and
    # removed in 0.25 — use to_numpy() instead.
    df_filled_ii = pd.DataFrame(IterativeImputer().fit_transform(df.to_numpy()))
    df_filled_ii.columns = df.columns
    df_filled_ii.index = df.index
    return df_filled_ii
def construct_low_rank_imputer(method, k):
    """Build an imputer of the requested kind with rank/neighbour budget *k*.

    :param method: "SoftImpute", "KNN" or "II".
    :param k: max rank (SoftImpute) or neighbour count (KNN); unused for II.
    :raises NotImplementedError: for any other method name.
    """
    if method == "SoftImpute":
        return SoftImpute(max_rank=k, verbose=False)
    if method == "KNN":
        return KNN(k=k, verbose=False)
    if method == 'II':
        return IterativeImputer(min_value=0)
    # BUG FIX: ``raise ("Not implemented")`` raised a plain string, which is
    # a TypeError in Python 3; raise a proper exception type.
    raise NotImplementedError("Not implemented")
def main(p_miss=0.5, dataset="drive", mode="mcar", para=0.5, train=None, rand_seed=42):
    """Impute the loaded dataset with MICE and report the masked MSE."""
    np.random.seed(rand_seed)
    (n, p, xmiss, xhat_0, mask,
     data_x, data_y) = load_data(p_miss, dataset=dataset, mode=mode,
                                 para=para, train=train, rand_seed=rand_seed)
    x_filled = IterativeImputer().fit_transform(xmiss)
    mse = mse_own(x_filled, data_x, mask)
    print("MSE for MICE: " + str(mse))
    return x_filled, mse
def iterative_imputer(self, estimator, max_iter, tol, n_nearest_feature,
                      initial_strategy, imputation_order, skip_complete,
                      min_value, max_value, verbose, random_state):
    """Impute ``self.data`` in place with sklearn's IterativeImputer.

    :param estimator: name of the per-feature estimator to use; falls back
        to IterativeImputer's default (BayesianRidge) when unrecognised.
    Remaining parameters are forwarded to IterativeImputer unchanged.
    """
    print("Interative Imputer")
    print(n_nearest_feature)
    my_estimator = None
    if estimator == 'BayesianRidge':
        my_estimator = BayesianRidge()
    if estimator == 'DecisionTreeRegressor':
        my_estimator = DecisionTreeRegressor()
    if estimator == 'ExtraTreesRegressor':
        my_estimator = ExtraTreesRegressor()
    if estimator == 'KNeighborsRegressor':
        my_estimator = KNeighborsRegressor()
    if estimator == 'DecisionTreeClassifier':
        # BUG FIX: the class itself was assigned (missing parentheses),
        # handing an uninstantiated class to IterativeImputer.
        my_estimator = DecisionTreeClassifier()
    imp = IterativeImputer(
        estimator=my_estimator,
        missing_values=np.NAN,
        # sample_posterior=sample_posterior,
        max_iter=max_iter,
        tol=tol,
        n_nearest_features=n_nearest_feature,
        initial_strategy=initial_strategy,
        imputation_order=imputation_order,
        skip_complete=skip_complete,
        min_value=min_value,
        max_value=max_value,
        verbose=verbose,
        random_state=random_state,
        # add_indicator=add_indicator
    )
    print("Iterative Imputer is created")
    self.data = imp.fit_transform(self.data)
    self.data = pd.DataFrame(self.data)
    self.data.columns = self.featuresName
    self.data = self.data.infer_objects()
def Data_prep(df, flag):
    """One-hot encode and MICE-impute the loan dataframe.

    :param df: raw frame whose first column is Loan Id and (for training
        data) whose last column is Loan Status.
    :param flag: when != 1, the last column is also dropped (training mode).
    :return: imputed DataFrame with the dummy-encoded column names.
    """
    # Removing Loan Id and Loan Status for One-Hot encoding and Imputation
    # (no-op expression statements such as ``predictors[1]`` and the bare
    # ``.head()`` / ``.count()`` calls were removed — they discarded their
    # results and had no effect).
    predictors = df.columns
    predictors = np.delete(predictors, 0)
    if flag != 1:
        predictors = np.delete(predictors, -1)
        print("train module flag 0")
        flag = 1
    # One-Hot Encoding, keeping NaN indicator columns for imputation.
    df_dummy = pd.get_dummies(df[predictors], dummy_na=True)
    newcols = df_dummy.columns
    # Data Imputations
    from fancyimpute import IterativeImputer
    df_imputed = IterativeImputer().fit_transform(df_dummy)
    df_imputed = pd.DataFrame(df_imputed, columns=newcols)
    return (df_imputed)
def mice_imputation(train, test):
    """Mask flagged entries (channel 1 == 1.0) as NaN, fit MICE on the
    training slice, and return the imputed test slice (channel 0)."""
    masked_train = np.copy(train)
    masked_test = np.copy(test)
    for row in range(masked_train[:, 0, :].shape[0]):
        hidden = np.argwhere(masked_train[row, 1, :] == 1.0)
        masked_train[row, 0, :][hidden] = np.nan
    for row in range(masked_test[:, 0, :].shape[0]):
        hidden = np.argwhere(masked_test[row, 1, :] == 1.0)
        masked_test[row, 0, :][hidden] = np.nan
    imputer = IterativeImputer()
    # Columns that are entirely NaN cannot be fitted — zero-fill them first.
    for col in range(masked_train[:, 0, :].shape[1]):
        if (np.all(np.isnan(masked_train[:, 0, :][:, col]))):
            masked_train[:, 0, :][:, col] = 0.0
    imputer.fit(masked_train[:, 0, :])
    return imputer.transform(masked_test[:, 0, :])
def replace_mice(method):
    """Fill missing entries of the column named by the module-level ``var``
    using the rounded mean of 10 stochastic MICE imputations.

    NOTE(review): ``path`` and ``var`` are globals defined elsewhere, and
    the ``method`` parameter is unused — confirm against callers.
    """
    train_df = pd.read_csv(path, parse_dates=True, encoding='utf-8')
    # Drop non-numeric columns; MICE only handles numeric input here.
    del_col = train_df.select_dtypes(include=['object']).columns
    for i in del_col:
        train_df = train_df.drop([i], axis=1)
    # Locate the column index of ``var`` after the drops.
    countcolumns = 0
    for i in train_df.columns:
        if (i == var):
            inx = countcolumns
        countcolumns = countcolumns + 1
    n_imputations = 10
    XY_completed = []
    for i in range(n_imputations):
        imputer = IterativeImputer(n_iter=n_imputations,
                                   sample_posterior=True, random_state=i)
        # BUG FIX: DataFrame.as_matrix() was removed in pandas 0.25 — use .values.
        XY_completed.append(imputer.fit_transform(train_df.values))
    XY_completed = np.mean(XY_completed, 0)
    XY_completed = np.round(XY_completed)
    new_df = pd.read_csv(path, parse_dates=True, encoding='utf-8')
    data_null_len = len(new_df[new_df[var].isnull()])
    for i in range(data_null_len):
        xx = train_df[train_df[var].isnull()].index[i]
        # BUG FIX: the chained ``new_df[var].loc[xx] = ...`` assignment is
        # unreliable (SettingWithCopy; an error in modern pandas) — write
        # through a single .loc indexer instead.
        new_df.loc[xx, var] = abs(XY_completed[xx][inx])
    return new_df
def datainput(self):
    """Read ``self.file`` as CSV, interactively clean missing values, and
    return a (train, test) split.

    Numeric columns may be MICE-imputed; rows with non-digit missing
    values are dropped instead.
    """
    full_data = pd.read_csv(self.file, header=0)
    print('\nMissing values for each columns')
    print(full_data.isnull().sum())  # print # of missing values
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df_n = full_data.select_dtypes(include=numerics)
    col_names = list(df_n.columns)
    df_c = full_data.select_dtypes(exclude=numerics)
    ipt = input('\nIs there any missing values? (y/n) : ')
    if ipt == 'y':
        ct = input('Is there any missing values which is not digit? (y/n) : ')
        if ct == 'y':
            # BUG FIX: dropna() returns a new frame; the original call
            # discarded the result, so nothing was actually dropped.
            full_data = full_data.dropna()
        else:
            impute = IterativeImputer()
            df_n = impute.fit_transform(df_n)  # process missing values using imputer
            df_n = pd.DataFrame(df_n)
            df_n.columns = col_names
            full_data = pd.concat([df_n, df_c], axis=1)
    print('\nMissing values after processing')
    print(full_data.isnull().sum())  # print # of missing values
    # train/test set split, default = no shuffle
    train, test = train_test_split(full_data, test_size=0.3, shuffle=False)
    return train, test
def get_imputer(imputer_name, **add_params):
    """Look up a fancyimpute-style imputer by case-insensitive name.

    Known names: 'knn', 'nnm', 'soft', 'iterative', 'biscaler'.
    Unknown names print a hint and yield None.
    """
    name = imputer_name.lower()
    if name == 'knn':
        return KNN(**add_params)
    if name == 'nnm':
        return NuclearNormMinimization(**add_params)
    if name == 'soft':
        return SoftImpute(**add_params)
    if name == 'iterative':
        return IterativeImputer(**add_params)
    if name == 'biscaler':
        return BiScaler(**add_params)
    print('Choose one of predefined imputers')
def nan_imputing(df):
    """
    There is only one feature with nans. Donor age at diagnosis.
    We impute it using the MICE strategy on the dummy-encoded frame.
    :param df: input DataFrame (left unmodified)
    :return: copy of df with donor_age_at_diagnosis imputed as int
    """
    # BUG FIX: the original aliased df (``fancy_imputed = df``) and then
    # wrote into it, mutating the caller's frame; work on a copy instead.
    fancy_imputed = df.copy()
    dummies = pd.get_dummies(df)
    imputed = pd.DataFrame(data=IterativeImputer().fit_transform(dummies),
                           columns=dummies.columns, index=dummies.index)
    fancy_imputed.donor_age_at_diagnosis = imputed.donor_age_at_diagnosis
    # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24 —
    # use the builtin int (identical behavior).
    fancy_imputed['donor_age_at_diagnosis'] = fancy_imputed[
        'donor_age_at_diagnosis'].astype(int)
    return fancy_imputed
def _handle_na(self, columns, fillna_strategy):
    """
    Handle the missing values for Numerical Features
    :param columns: columns/features name in the dataframe
    :param fillna_strategy: NA handling strategy — one of
        'mean'/'median'/'most_frequent'/'mode', 'new', 'end_distribution',
        'mice', 'knn', 'softimpute'
    """
    if fillna_strategy in ['mean', 'median', 'most_frequent', 'mode']:
        # Change mode to most_frequent
        fillna_strategy = 'most_frequent' if fillna_strategy == 'mode' else fillna_strategy
        imp = SimpleImputer(missing_values=np.nan, strategy=fillna_strategy)
        self.output_df[columns] = imp.fit_transform(self.df[columns])
        # return self.imputers[column] = imp
    elif fillna_strategy == 'new':
        # Add a 0/1 indicator column for each column that has missing values.
        for column in columns:
            new_col_name = column + '_new'
            # BUG FIX: ``isnull().count()`` counts ALL rows (always > 0 for
            # a non-empty frame); use .any() to test for actual missing values.
            if self.output_df[column].isnull().any():
                self.output_df[new_col_name] = np.where(
                    self.output_df[column].isnull(), 1, 0)
    elif fillna_strategy == 'end_distribution':
        # Fill with mean + 3*std, i.e. the tail of the distribution.
        for column in columns:
            # BUG FIX: same always-true isnull().count() test as above.
            if self.output_df[column].isnull().any():
                extreme = self.df[column].mean(
                ) + 3 * self.df[column].std()
                self.output_df[column] = self.output_df[column].fillna(
                    extreme)
    elif fillna_strategy == 'mice':
        from fancyimpute import IterativeImputer
        imp = IterativeImputer()
        self.output_df[columns] = imp.fit_transform(
            self.output_df[columns])
        # self.imputers[columns] = imp
    elif fillna_strategy == 'knn':
        from fancyimpute import KNN
        imp = KNN()
        self.output_df[columns] = imp.fit_transform(
            self.output_df[columns])
        # self.imputers[column] = imp
    elif fillna_strategy == 'softimpute':
        from fancyimpute import SoftImpute
        imp = SoftImpute()
        self.output_df[columns] = imp.fit_transform(
            self.output_df[columns])
def fit(self, dataset):
    """Train standard imputation model.

    Args:
        - dataset: incomplete dataset
    """
    if dataset.static_feature is not None:
        name = self.imputation_model_name
        if name == 'mice':
            # MICE
            self.imputation_model = IterativeImputer()
        elif name == 'missforest':
            # MissForest
            self.imputation_model = MissForest()
        elif name == 'knn':
            # KNN
            self.imputation_model = KNNImputer()
        self.imputation_model.fit(dataset.static_feature)
    return