def impute(self, df):
    """Fill missing values in *df*.

    Uses KNN imputation when ``self.knn`` is truthy, otherwise a
    MICE-style IterativeImputer. Returns a new DataFrame carrying the
    original column labels.
    """
    imputer = KNN() if self.knn else IterativeImputer()
    return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
def test_iterative_imputer_with_low_rank_random_matrix():
    """IterativeImputer must reconstruct the low-rank matrix with MAE < 0.1."""
    imputer = IterativeImputer(n_iter=50, random_state=0)
    completed = imputer.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY, completed, missing_mask, name="IterativeImputer")
    assert missing_mae < 0.1, "Error too high with IterativeImputer method!"
def multi_imp(data, m):
    """Run *m* independent MICE-style imputations of *data*.

    Each pass uses a different random seed with posterior sampling so the
    draws differ; callers average the results (the other multiple-imputation
    helpers in this file do ``np.mean(..., axis=0)``).

    Returns an array of shape (m, n_rows, n_cols): one completed copy of
    *data* per imputation.
    """
    completed = []
    for seed in range(m):
        imputer = IterativeImputer(n_iter=5, sample_posterior=True,
                                   random_state=seed)
        # BUG FIX: the original used list.extend(), which flattened the m
        # completed matrices into one long list of rows, so np.array() gave
        # shape (m*n, d). append() keeps one matrix per imputation, matching
        # the append + np.mean(axis=0) pattern used elsewhere in this file.
        completed.append(imputer.fit_transform(data))
    return np.array(completed)
class vk_sensing():
    """Matrix-completion wrapper that selects an imputer by name.

    Supported methods: "SoftImpute", "KNN", "Naive" (SimpleFill); "II"
    (IterativeImputer) is declared but deliberately disabled as untested.
    """

    def __init__(self, method, **kwargs):
        self.clf = None
        self.method = method
        if method == "SoftImpute":
            self.clf = SoftImpute(**kwargs)
        elif method == "KNN":
            self.clf = KNN(**kwargs)
        elif method == "Naive":
            self.clf = SimpleFill()
        elif method == 'II':
            # BUG FIX: the original `raise ('NOT TESTED')` raised a plain
            # string, which itself raises "TypeError: exceptions must derive
            # from BaseException". The assignment that followed it
            # (self.clf = IterativeImputer(min_value=0)) was unreachable and
            # is kept only as documentation of the intended implementation.
            raise NotImplementedError('NOT TESTED')
        else:
            # BUG FIX: same problem — raising a plain string is illegal.
            raise ValueError("Not Implemented method")

    def fit_transform(self, X_train):
        """Impute X_train, handling the all-NaN and no-NaN corner cases."""
        # print (X_train, np.isnan(X_train).all())
        assert (self.clf is not None)
        if np.isnan(X_train).any():
            if np.isnan(X_train).all():
                # Nothing observed at all: fall back to an all-zero estimate.
                X_est = np.zeros_like(X_train)
            else:
                X_est = massage_imputed_matrix(self.clf.fit_transform(X_train))
        else:
            # Already complete — pass through untouched.
            X_est = X_train
        assert (not np.isnan(X_est).any())
        return X_est

    def CVfit(self, X, val_ratio=0.2):
        """Pick the best rank k by hold-out MAE, then rebuild self.clf.

        A random ``val_ratio`` fraction of the observed entries is hidden
        from the training copy and kept as the validation target.
        """
        mask = np.invert(np.isnan(X))
        sample_mask = np.random.rand(*X.shape) < val_ratio
        X_train = X.copy()
        X_train[mask & (~sample_mask)] = np.nan
        X_val = X.copy()
        X_val[mask & (sample_mask)] = np.nan
        cur_best_err = np.inf
        cur_best_k = None
        for k in GLOB_IMPUTE_K_SWEEP:
            clf = construct_low_rank_imputer(self.method, k)
            if np.isnan(X_train).any():
                if np.isnan(X_train).all():
                    X_est = np.zeros_like(X_train)
                else:
                    X_est = massage_imputed_matrix(clf.fit_transform(X_train))
            else:
                X_est = X_train
            err = MAE(X_est, X_val)
            # print (k, err, RMSN(X_est, X_val))
            if err < cur_best_err:
                cur_best_err = err
                cur_best_k = k
        if cur_best_k is None:
            # No candidate beat +inf (e.g. empty sweep) — default to rank 1.
            cur_best_k = 1
        self.clf = construct_low_rank_imputer(self.method, cur_best_k)
def test_iterative_imputer_with_low_rank_random_matrix_approximate():
    """Restricting each feature model to 5 nearest features still keeps MAE < 0.1."""
    imputer = IterativeImputer(n_iter=50, n_nearest_features=5, random_state=0)
    completed = imputer.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        completed,
        missing_mask,
        name="IterativeImputer with n_nearest_features=5")
    assert missing_mae < 0.1, ("Error too high with IterativeImputer "
                               "method using n_nearest_features=5!")
def get_predict(self, flag, in_data):
    """Impute the entries of *in_data* where *flag* is False.

    The measurement vector is stacked against ``self.t_measure`` and run
    through MICE so the missing entries are filled from the stored
    measurements. Returns an (utils.M_NUM, 1) column vector.
    """
    target = in_data.copy()
    target.shape = (utils.M_NUM, 1)
    # Entries not flagged as observed are marked missing for the solver.
    target[~flag] = np.nan
    stacked = np.column_stack((self.t_measure.copy(), target)).transpose()
    filled = MICE().fit_transform(stacked)
    # The last row of the transposed stack is the (now complete) target.
    return np.array(filled[-1, :]).reshape(utils.M_NUM, 1)
def mice_imputer_wo_target(df):
    """MICE-impute *df* and return a DataFrame labelled with the fixed
    feature columns (the training target is deliberately excluded)."""
    feature_columns = [
        'city', 'city_development_index', 'gender', 'relevent_experience',
        'enrolled_university', 'education_level', 'major_discipline',
        'experience', 'company_size', 'company_type', 'last_new_job',
        'training_hours'
    ]
    imputer = IterativeImputer()
    return pd.DataFrame(imputer.fit_transform(df), columns=feature_columns)
def test_iterative_imputer_as_mice_with_low_rank_random_matrix_approximate():
    """Averaging 5 posterior-sampled imputations (MICE style) keeps MAE < 0.1."""
    completed_runs = []
    for seed in range(5):
        imputer = IterativeImputer(n_iter=5, sample_posterior=True,
                                   random_state=seed)
        completed_runs.append(imputer.fit_transform(XY_incomplete))
    _, missing_mae = reconstruction_error(XY,
                                          np.mean(completed_runs, axis=0),
                                          missing_mask,
                                          name="IterativeImputer as MICE")
    assert missing_mae < 0.1, "Error too high with IterativeImputer as MICE!"
def __init__(self):
    # Load the PTSD dataset; keep only rows with a known PCL_Strict3 label
    # and positive military exposure at t3.
    path = "C:\PycharmProjects\PTSD\Data\PTSD.xlsx"
    df = pd.read_excel(path)
    df = df[~df['PCL_Strict3'].isna()]
    df = df[df["military_exp18_t3"] > 0]
    # self.features / self.ID / self.target_features are class attributes
    # defined outside this block.
    df = df[self.features + self.ID + self.target_features]
    # Merge the three PCL questionnaire waves (outer joins on ID).
    df_pcl3 = pd.read_excel("C:\PycharmProjects\PTSD\Data\questionnaire6PCL3.xlsx")
    df_pcl3 = PCL_calculator(df_pcl3)
    df_pcl2 = pd.read_excel("C:\PycharmProjects\PTSD\Data\questionnaire6PCL2.xlsx")
    df_pcl2 = PCL_calculator(df_pcl2)
    df_pcl1 = pd.read_excel("C:\PycharmProjects\PTSD\Data\questionnaire6PCL1.xlsx")
    df_pcl1 = PCL_calculator(df_pcl1)
    df = df.merge(df_pcl1, on="ID", how='outer')
    df = df.merge(df_pcl2, suffixes=('_pcl1', '_pcl2'), on="ID", how='outer')
    df = df.merge(df_pcl3.drop(['PCL3_Strict', 'pcl3', 'PCL3_Broad'], axis=1), on="ID", how='outer')
    # Re-filter: the outer merges can reintroduce unlabeled rows.
    df = df[~df['PCL_Strict3'].isna()]
    #df = df[~df['tred_cutoff'].isna()]
    df.drop(self.ID, inplace=True, axis=1)
    # NOTE(review): `mew` is not defined in this method nor a parameter —
    # presumably a module-level flag; verify it exists (NameError otherwise).
    if mew:
        # MICE-impute all remaining missing values.
        mice = IterativeImputer()
        df = pd.DataFrame(mice.fit_transform(df), columns=df.columns)
    all_x_col = self.features + self.features_2 + self.target_features_2
    #all_x_col = self.features + self.features_2
    #y_col = ["tred_cutoff"]
    y_col = ["PCL_Strict3"]
    X = df[all_x_col]
    Y = df[y_col]
    # Nested stratified split: outer 25% test set, then 25% of the
    # remainder held out again for inner evaluation.
    X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(X, Y, test_size=0.25, random_state=271828, stratify=Y)
    X_train, X_test, y_train, y_test = train_test_split(X_train_0, y_train_0, test_size=0.25, random_state=271828, stratify=y_train_0)
    # self.df holds the inner training set (features + label together).
    df = pd.concat([X_train, y_train], axis=1)
    self.X_test = X_test
    self.y_test = y_test
    self.X_train_0 = X_train_0
    self.X_test_0 = X_test_0
    self.y_train_0 = y_train_0
    self.y_test_0 = y_test_0
    self.df = df
def iterative_imputer(self, estimator, max_iter, tol, n_nearest_feature,
                      initial_strategy, imputation_order, skip_complete,
                      min_value, max_value, verbose, random_state):
    """Impute ``self.data`` in place with sklearn's IterativeImputer.

    *estimator* selects the per-feature model by name; the remaining
    parameters are forwarded to IterativeImputer unchanged. The imputed
    matrix replaces self.data as a DataFrame with columns restored from
    self.featuresName.
    """
    print("Interative Imputer")
    print(n_nearest_feature)
    # Map the estimator name to an instance; None lets IterativeImputer
    # fall back to its default (BayesianRidge).
    my_estimator = None
    if estimator == 'BayesianRidge':
        my_estimator = BayesianRidge()
    if estimator == 'DecisionTreeRegressor':
        my_estimator = DecisionTreeRegressor()
    if estimator == 'ExtraTreesRegressor':
        my_estimator = ExtraTreesRegressor()
    if estimator == 'KNeighborsRegressor':
        my_estimator = KNeighborsRegressor()
    if estimator == 'DecisionTreeClassifier':
        # BUG FIX: the original assigned the class object itself (missing
        # parentheses), so IterativeImputer would fail when trying to
        # clone/fit it. Instantiate it like the other branches.
        my_estimator = DecisionTreeClassifier()
    imp = IterativeImputer(
        estimator=my_estimator,
        missing_values=np.NAN,
        # sample_posterior=sample_posterior,
        max_iter=max_iter,
        tol=tol,
        n_nearest_features=n_nearest_feature,
        initial_strategy=initial_strategy,
        imputation_order=imputation_order,
        skip_complete=skip_complete,
        min_value=min_value,
        max_value=max_value,
        verbose=verbose,
        random_state=random_state,
        # add_indicator=add_indicator
    )
    print("Iterative Imputer is created")
    self.data = imp.fit_transform(self.data)
    self.data = pd.DataFrame(self.data)
    self.data.columns = self.featuresName
    # Restore per-column dtypes after the float imputation matrix.
    self.data = self.data.infer_objects()
def replace_mice(method):
    """Multiple-imputation (MICE) replacement for column *var* of the CSV at *path*.

    Drops non-numeric columns, runs 10 posterior-sampled imputations,
    averages and rounds them, then writes the absolute imputed values back
    into the rows of the original file where *var* was missing.
    Relies on module-level ``path`` and ``var``.
    """
    train_df = pd.read_csv(path, parse_dates=True, encoding='utf-8')
    # MICE needs a purely numeric matrix — drop object-dtype columns.
    del_col = train_df.select_dtypes(include=['object']).columns
    for i in del_col:
        train_df = train_df.drop([i], axis=1)
    # Locate the target variable's column index in the numeric frame.
    countcolumns = 0
    for i in train_df.columns:
        if (i == var):
            inx = countcolumns
        countcolumns = countcolumns + 1
    n_imputations = 10
    XY_completed = []
    for i in range(n_imputations):
        imputer = IterativeImputer(n_iter=n_imputations, sample_posterior=True, random_state=i)
        # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0;
        # to_numpy() is the supported replacement.
        XY_completed.append(imputer.fit_transform(train_df.to_numpy()))
    # Average the posterior draws and round to the original integer scale.
    XY_completed = np.mean(XY_completed, 0)
    XY_completed = np.round(XY_completed)
    new_df = pd.read_csv(path, parse_dates=True, encoding='utf-8')
    data_null_len = len(new_df[new_df[var].isnull()])
    # Hoisted loop invariant: the index of rows where var is missing.
    null_index = train_df[train_df[var].isnull()].index
    for i in range(data_null_len):
        xx = null_index[i]
        # BUG FIX: `new_df[var].loc[xx] = ...` is chained assignment and may
        # silently write to a temporary copy; .loc[row, col] assigns in place.
        new_df.loc[xx, var] = abs(XY_completed[xx][inx])
    return new_df
def datainput(self):
    """Read ``self.file``, interactively handle missing values (drop rows
    or MICE-impute the numeric columns), and return a (train, test)
    split (70/30, unshuffled)."""
    full_data = pd.read_csv(self.file, header=0)
    print('\nMissing values for each columns')
    print(full_data.isnull().sum())  # print # of missing values
    # Separate numeric columns (imputable) from the rest.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df_n = full_data.select_dtypes(include=numerics)
    col_names = list(df_n.columns)
    df_c = full_data.select_dtypes(exclude=numerics)
    ipt = input('\nIs there any missing values? (y/n) : ')
    if ipt == 'y':
        ct = input('Is there any missing values which is not digit? (y/n) : ')
        if ct == 'y':
            # BUG FIX: dropna() returns a new frame; the original discarded
            # the result, so rows with missing values were never removed.
            full_data = full_data.dropna()
        else:
            # MICE-impute the numeric block, then re-attach the
            # non-numeric columns.
            impute = IterativeImputer()
            df_n = impute.fit_transform(df_n)
            df_n = pd.DataFrame(df_n)
            df_n.columns = col_names
            full_data = pd.concat([df_n, df_c], axis=1)
    print('\nMissing values after processing')
    print(full_data.isnull().sum())  # print # of missing values
    # train/test split; shuffle disabled to keep row order.
    train, test = train_test_split(full_data, test_size=0.3, shuffle=False)
    return train, test
# (tail of a loop opened above this chunk: copy the mode-imputed column back)
all_data[col_name] = most_common_imputed[col_name]
# One-hot encode the nominal columns and splice the dummy columns in,
# then drop the originals.
nom_df = pd.get_dummies(all_data[nominal_columns], prefix=nominal_columns)
for col_name in nom_df.columns:
    all_data[col_name] = nom_df[col_name]
all_data = all_data.drop(columns= nominal_columns)
print(all_data)
# MICE-impute the whole frame, rounding so ordinal codes stay integral.
from fancyimpute import IterativeImputer
MICE_imputer = IterativeImputer()
ordinal_mice = all_data.copy(deep = True)
ordinal_mice.iloc[:,:] = np.round(MICE_imputer.fit_transform(ordinal_mice))
# Copy only the ordinal and numeric columns back into all_data.
for col_name in ordinal_columns:
    all_data[col_name] = ordinal_mice[col_name]
for col_name in numeric_columns:
    all_data[col_name] = ordinal_mice[col_name]
# Sanity check: no NaNs should survive imputation.
if all_data.isnull().values.any():
    print("Yuh artık!")
    print("GOSHHH!!!!!")
    print("Breakdown loading...")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
def __init__(self):
    """Load the dual/single dataset, split it by Type, zig-zag filter the
    source columns, MICE-impute both halves, and export the imputed
    frames to Excel."""
    super().__init__(chapter_id="PROJ01_dual_single", to_data_path="dual_single", target_field="EPSI")
    file_from_ = "dual_single.csv"
    file_to_ = "dual_single.csv.gz"
    # get data ---
    csv_path = os.path.join(self.TO_DATA_PATH, file_from_)
    if not os.path.isfile(csv_path):
        # Fetch the gzipped CSV once if it is not cached locally.
        self.DUAL_SINGLE_URL = "https://github.com/amosbaranes/ml_data/raw/master/dual_single.csv.gz"
        self.fetch_tgz_data(self.DUAL_SINGLE_URL, file_from_, "gz")
    self.load_csv_data("dual_single")
    # Columns 5:19 hold the raw source measures; 19: are pre-imputed columns.
    self.DATA_SOURCE = self.DATA.iloc[:, 5:19]
    self.DATA_IMPUTED = self.DATA.iloc[:, 19:]
    self.SINGLE_DATA = self.DATA[self.DATA["Type"] == "Single"]
    self.DUAL_DATA = self.DATA[self.DATA["Type"] == "Dual"]
    self.SINGLE_DATA_IMPUTED = self.SINGLE_DATA.iloc[:, 19:]
    self.DUAL_DATA_IMPUTED = self.DUAL_DATA.iloc[:, 19:]
    self.SINGLE_DATA_SOURCE = self.SINGLE_DATA.iloc[:, 5:19]
    self.DUAL_DATA_SOURCE = self.DUAL_DATA.iloc[:, 5:19]
    self.SINGLE_DATA_SOURCE_ZZ = zig_zag_(self.SINGLE_DATA_SOURCE, a_rows=0.25, a_col=0.85)
    self.DUAL_DATA_SOURCE_ZZ = zig_zag_(self.DUAL_DATA_SOURCE, a_rows=0.25, a_col=0.85)
    # The single/dual impute+export steps were duplicated inline in the
    # original; factored into one helper.
    self.SINGLE_DATA_SOURCE_ZZI = self._impute_and_export(
        self.SINGLE_DATA_SOURCE_ZZ, "SINGLE_DATA_IMPUTED")
    self.DUAL_DATA_SOURCE_ZZI = self._impute_and_export(
        self.DUAL_DATA_SOURCE_ZZ, "DUAL_DATA_IMPUTED")

def _impute_and_export(self, source_zz, file):
    """MICE-impute *source_zz*, restore its column labels, and write the
    result to <TO_DATA_PATH>/<file>.xlsx. Failures are printed, not raised
    (matching the original best-effort behavior). Returns the imputed frame."""
    imputed = pd.DataFrame(IterativeImputer().fit_transform(source_zz))
    try:
        imputed.columns = source_zz.columns
        ssr = os.path.join(self.TO_DATA_PATH, file + ".xlsx")
        print(ssr)
        # The context manager saves the workbook on exit; the original's
        # explicit writer.save() inside the with-block was redundant and is
        # deprecated/removed in newer pandas.
        with pd.ExcelWriter(ssr, engine='xlsxwriter') as writer:
            imputed.to_excel(writer, sheet_name="imputed")
    except Exception as e:
        print(e)
    return imputed
# Compare MICE vs. median imputation on the Pima dataset by adjusted R².
from fancyimpute import IterativeImputer

df = pd.read_csv('pima.csv')
# Visualize missingness.
msno.bar(df)
msno.heatmap(df)
# Complete-case baselines.
df_cleaned = df.dropna(subset=['Diastolic_BP', 'BMI', 'Glucose'])
df_noNa = df.dropna()
y_noNa = df_noNa['Class']
X_noNa = df_noNa.iloc[:, :-1]
lm_noNa = sm.OLS(y_noNa, X_noNa).fit()
R2 = pd.Series([lm_noNa.rsquared_adj, 0, 0], index=['noNa', 'MICE', 'medians'])
# MICE imputation of the partially-cleaned frame.
df_MICE = df_cleaned.copy(deep=True)
MICE_imputer = IterativeImputer()
df_MICE.iloc[:, :] = MICE_imputer.fit_transform(df_MICE)
y_MICE = df_MICE['Class']
X_MICE = df_MICE.iloc[:, :-1]
lm_MICE = sm.OLS(y_MICE, X_MICE).fit()
# BUG FIX: the original called `lm.summary()` on a name `lm` that is not
# defined in this chunk (NameError); the model just fitted is lm_MICE.
lm_MICE.summary()
R2['MICE'] = lm_MICE.rsquared_adj
# Median imputation for the two high-missingness columns.
df_medians = df_cleaned.copy(deep=True)
df_medians.loc[df_medians['Serum_Insulin'].isna(), 'Serum_Insulin'] = df_medians['Serum_Insulin'].median()
df_medians.loc[df_medians['Skin_Fold'].isna(), 'Skin_Fold'] = df_medians['Skin_Fold'].median()
y_medians = df_medians['Class']
X_medians = df_medians.iloc[:, :-1]
lm_medians = sm.OLS(y_medians, X_medians).fit()
    # (continuation of a feature-name list opened above this chunk)
    'q6.14_ANGER_pcl2', 'q6.15_CONC_pcl2', 'q6.16_HYPER_pcl2',
    'q6.17_STRTL_pcl2', 'intrusion_pcl2', 'avoidance_pcl2',
    'hypertention_pcl2', 'depression_pcl2', 'tred_pcl2'
]
target_features = ["PCL_Strict3", "PCL3"]
ID = ["ID"]
# Load the PTSD spreadsheet, keep labeled rows, and MICE-impute everything.
path = "C:\PycharmProjects\PTSD\Data\PTSD.xlsx"
df = pd.read_excel(path)
df = df[~df['PCL_Strict3'].isna()]
df = df[features + ID + target_features]
mice = IterativeImputer()
df = pd.DataFrame(mice.fit_transform(df), columns=df.columns)
# Optional extra questionnaire features — disabled (flag is 0).
extra_features = 0
if extra_features:
    # Merge the three PCL questionnaire waves (outer joins on ID).
    df_pcl3 = pd.read_excel(
        "C:\PycharmProjects\PTSD\Data\questionnaire6PCL3.xlsx")
    df_pcl3 = PCL_calculator(df_pcl3)
    df_pcl2 = pd.read_excel(
        "C:\PycharmProjects\PTSD\Data\questionnaire6PCL2.xlsx")
    df_pcl2 = PCL_calculator(df_pcl2)
    df_pcl1 = pd.read_excel(
        "C:\PycharmProjects\PTSD\Data\questionnaire6PCL1.xlsx")
    df_pcl1 = PCL_calculator(df_pcl1)
    df = df.merge(df_pcl1, on="ID", how='outer')
    df = df.merge(df_pcl2, suffixes=('_pcl1', '_pcl2'), on="ID", how='outer')
def __init__(self):
    # Load the PTSD dataset; keep rows with a PCL_Strict3 label and with
    # any military exposure at t2 or t3.
    path = "C:\PycharmProjects\PTSD\Data\PTSD.xlsx"
    df = pd.read_excel(path)
    df = df[~df['PCL_Strict3'].isna()]
    df = df[~((df["military_exp18_t3"] == 0) & (df["military_exp18_t2"] == 0))]
    # self.features / self.ID / self.target_features are class attributes
    # defined outside this block.
    df = df[self.features + self.ID + self.target_features]
    # Merge the three PCL questionnaire waves (outer joins on ID).
    df_pcl3 = pd.read_excel(
        "C:\PycharmProjects\PTSD\Data\questionnaire6PCL3.xlsx")
    df_pcl3 = PCL_calculator(df_pcl3)
    df_pcl2 = pd.read_excel(
        "C:\PycharmProjects\PTSD\Data\questionnaire6PCL2.xlsx")
    df_pcl2 = PCL_calculator(df_pcl2)
    df_pcl1 = pd.read_excel(
        "C:\PycharmProjects\PTSD\Data\questionnaire6PCL1.xlsx")
    df_pcl1 = PCL_calculator(df_pcl1)
    df = df.merge(df_pcl1, on="ID", how='outer')
    df = df.merge(df_pcl2, suffixes=('_pcl1', '_pcl2'), on="ID", how='outer')
    df = df.merge(df_pcl3.drop(['PCL3_Strict', 'pcl3', 'PCL3_Broad'], axis=1), on="ID", how='outer')
    # Re-filter: the outer merges can reintroduce unlabeled rows.
    df = df[~df['PCL_Strict3'].isna()]
    #df = df[~df['tred_cutoff'].isna()]
    df.drop(self.ID, inplace=True, axis=1)
    # NOTE(review): `mew` is not defined in this method nor a parameter —
    # presumably a module-level flag; verify it exists (NameError otherwise).
    if mew:
        # MICE-impute all remaining missing values.
        mice = IterativeImputer()
        df = pd.DataFrame(mice.fit_transform(df), columns=df.columns)
    all_x_col = self.features + self.features_2 + self.target_features_2
    #all_x_col = self.features + self.features_2
    #y_col = ["tred_cutoff"]
    y_col = ["PCL_Strict3"]
    X = df[all_x_col]
    Y = df[y_col]
    if mew:
        # Nested stratified split: outer 25% test set, then 25% of the
        # remainder again for inner evaluation.
        X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(
            X, Y, test_size=0.25, random_state=271828, stratify=Y)
        X_train, X_test, y_train, y_test = train_test_split(
            X_train_0, y_train_0, test_size=0.25, random_state=271828, stratify=y_train_0)
        df = pd.concat([X_train, y_train], axis=1)
        self.X_test = X_test
        self.y_test = y_test
        self.X_train_0 = X_train_0
        self.X_test_0 = X_test_0
        self.y_train_0 = y_train_0
        self.y_test_0 = y_test_0
    else:
        # Single stratified 75/25 split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.25, random_state=271828, stratify=Y)
        df = pd.concat([X_train, y_train], axis=1)
        self.X_test = X_test
        self.y_test = y_test
    self.df = df
# dialysis_mod1: 0.53 # insurance_esrd: 7.80 # Mortality_Rate_Facility: 1.30 # Hospitalization_Rate_facility: 1.09 # NEAR_DIST: 21.58 # nephcare_cat2: 36.99 # rucc_metro: 0.90 # Random forest imputation for categorical for var in ['dialysis_mod1', 'insurance_esrd', 'rucc_rural', 'nephcare_cat2']: pred = ['sex_new', 'age_cat', 'race_new', var] imputer = IterativeImputer( n_iter=1, random_state=7, predictor=RandomForestClassifier(n_estimators=10)) imputed = pd.DataFrame(imputer.fit_transform(d[pred]), columns=pred) d = d.drop(var, axis=1).join(imputed[var]) # Bayesian Ridge linear imputation for continuous for var in [ 'Hospitalization_Rate_facility', 'Mortality_Rate_Facility', 'NEAR_DIST' ]: completed = [] for i in range(5): pred = ['sex_new', 'age_cat', 'race_new', var] imputer = IterativeImputer(n_iter=5, sample_posterior=True, random_state=i) completed.append(imputer.fit_transform(d[pred])) completed_mean = np.mean(completed, axis=0) imputed = pd.DataFrame(completed_mean, columns=pred)
review.head(3) # In[35]: # Import IterativeImputer from fancyimpute from fancyimpute import IterativeImputer # Copy diabetes to diabetes_mice_imputed review_mice_imputed = review.copy(deep=True) # Initialize IterativeImputer mice_imputer = IterativeImputer() # Impute using fit_tranform on diabetes review_mice_imputed.iloc[:, :] = mice_imputer.fit_transform(review) #rounding off the imputed data review_mice_imputed.review_scores_location = round( review_mice_imputed.review_scores_location, 0) #view the data review_mice_imputed.head(3) # In[36]: #replacing the null values with the imputed value in the dataset df3_airbnb.review_scores_location = review_mice_imputed.review_scores_location.copy( ) # In[37]:
# meta_train_loss.backward() # opt.step() learner = maml.clone() x_support, x_query, y_support, y_query = train_test_split(x_train, y_train, test_size=0.25, stratify=y_train) ss = StandardScaler() x_support = ss.fit_transform(x_support) x_query = ss.transform(x_query) x_test = ss.transform(x_test) mice = IterativeImputer(max_iter=1000) x_support = mice.fit_transform(x_support) x_query = mice.fit_transform(x_query) x_test = mice.fit_transform(x_test) for _ in range(adapt_steps): # adaptation_steps support_preds = learner(torch.from_numpy(x_support).float()) support_loss = lossfn( support_preds.float(), torch.from_numpy(y_support.values.reshape(-1, 1))).float() learner.adapt(support_loss) query_preds = learner(x_query) query_loss = lossfn(query_preds, y_query) meta_train_loss += query_loss opt.zero_grad() meta_train_loss.backward()
def reconstruct(dataset, mask): print('Reconstructing using MICE...') # train_data = dataset.orig_ds['train_X'] # mask = dataset.miss_masks[config_idx]['train_X'] (datasetLen, dim) = np.shape(dataset) train_data = dataset.copy() incomplete_dataset = np.zeros((datasetLen, dim)) # IterativeImputer requires corrupted entries to be identified as NaN # Using the mask to replace in the input dataset all zero entries for NaN for i in range(datasetLen): frame = train_data.loc[i, :] ms = mask.loc[i, :] ms.values[ms.values == 0] = np.nan incomplete_dataset[i] = frame.values * ms.values incomplete_dataset = pd.DataFrame(incomplete_dataset) n_imputations = 5 reconstructed_dataset = [] # IterativeImputer replicates MICE algorithm when used for multiple imputations # by applying it repeatedly to the same dataset for i in tqdm(range(n_imputations)): imputer = IterativeImputer(n_iter=10, sample_posterior=True, random_state=i) reconstructed_dataset.append(imputer.fit_transform(incomplete_dataset)) reconstructed_dataset_mean = np.mean(reconstructed_dataset, axis=0) reconstructed_dataset_std = np.std(reconstructed_dataset, axis=0) return pd.DataFrame(reconstructed_dataset_mean) ## DEBUG TOOLS ## # import reconstruct as rc # import matplotlib.pyplot as plt # if __name__ == "__main__": # original_dataset, incomplete_dataset, mask = rc.get_dataset(mode='MCAR', n_samples=100) # # original_dataset = pd.DataFrame(original_dataset) # incomplete_dataset = pd.DataFrame(incomplete_dataset) # mask = pd.DataFrame(mask) # # reconstructed_dataset = reconstruct(incomplete_dataset, mask) # # inc = incomplete_dataset.loc[0,:] # rec = reconstructed_dataset.loc[0,:] # orig = original_dataset.loc[0,:] # # print(np.shape(inc)) # print(np.shape(rec)) # print(np.shape(orig)) # # samples = np.vstack([inc, rec, orig]) # fig = rc.plot(samples) # plt.savefig('Multiple_Impute_out1/{}.png'.format(str(0).zfill(3)), bbox_inches='tight') # plt.close(fig) # from sklearn.linear_model import LinearRegression # import 
# BUG FIX: the two lines below originally began with bare statements
# (`os` and `mi.fit(...)`) left over from comments that were split across
# lines; executed at import time they would raise NameError. They are
# re-joined into their comments here. Everything else in this block is
# the original commented-out statsmodels-MICE experiment, preserved as-is.
# import os
# import sys
# projectdir = os.path.dirname(__file__)
# app_path = os.path.join(projectdir, 'scikit-mice')
# sys.path.insert(0, app_path)
# import skmice
#
# from statsmodels.imputation import mice
# import statsmodels.api as sm
# np.set_printoptions(linewidth=115, suppress=False, precision=1, floatmode='fixed')
#
# def gendat():
#     """
#     Create a data set with missing values.
#     """
#
#     np.random.seed(34243)
#
#     n = 20
#     p = 5
#
#     exog = np.random.normal(size=(n, p))
#     exog[:, 0] = exog[:, 1] - exog[:, 2] + 2*exog[:, 4]
#     exog[:, 0] += np.random.normal(size=n)
#     exog[:, 2] = 1*(exog[:, 2] > 0)
#
#     endog = exog.sum(1) + np.random.normal(size=n)
#
#     df = pd.DataFrame(exog)
#     df.columns = ["x%d" % k for k in range(1, p+1)]
#
#     df["y"] = endog
#
#     # df.x1[0:60] = np.nan
#     # df.x2[0:40] = np.nan
#     df.x1[0:5] = np.nan
#     df.x2[15:19] = np.nan
#     df.x3[10:30:2] = np.nan
#     df.x4[20:50:3] = np.nan
#     df.x5[40:45] = np.nan
#     df.y[30:100:2] = np.nan
#
#     return df
#
# def reconstruct2(dataset, mask):
#     incomplete_dataset = np.zeros(np.shape(dataset))
#
#     # IterativeImputer requires corrupted entries to be identified as NaN
#     # Using the mask to replace in the input dataset all zero entries for NaN
#     for i in range(len(dataset)):
#         frame = dataset.loc[i, :]
#         ms = mask.loc[i, :]
#         ms.values[ms.values == 0] = np.nan
#         incomplete_dataset[i] = frame.values*ms.values
#
#     incomplete_dataset = pd.DataFrame(incomplete_dataset)
#     incomplete_dataset.columns = map(str, incomplete_dataset.columns.values)
#     # incomplete_dataset.columns = [item + ':' for item in incomplete_dataset.columns]
#     # print(incomplete_dataset.columns)
#     # sys.exit(0)
#     # print(incomplete_dataset)
#
#     reconstructed_dataset = mice.MICEData(incomplete_dataset)
#     # print(np.shape(imp_data))
#     print(np.shape(reconstructed_dataset.data))
#     print(reconstructed_dataset.data)
#
#     # mi = mice.MICE("y ~ x1 + x2 + x1:x2", sm.OLS, reconstructed_dataset)
#     mi = mice.MICE("0", sm.OLS, reconstructed_dataset)
#     results = mi.fit(n_burnin=10, n_imputations=10)
#
#     print(np.shape(reconstructed_dataset.data))
#     # sys.exit(0)
#
#     return pd.DataFrame(reconstructed_dataset)
#
# if __name__ == "__main__":
#     original_dataset, dataset, mask = rc.get_dataset(mode='MCAR', n_samples=100)
#
#     original_dataset = pd.DataFrame(original_dataset)
#     dataset = pd.DataFrame(dataset)
#     mask = pd.DataFrame(mask)
#
#     incomplete_dataset = np.zeros(np.shape(dataset))
#
#     for i in range(len(dataset)):
#         frame = dataset.loc[i, :]
#         ms = mask.loc[i, :]
#         ms.values[ms.values == 0] = np.nan
#         incomplete_dataset[i] = frame.values*ms.values
#
#     print(np.shape(incomplete_dataset))
#     # print(incomplete_dataset[0:1,:].reshape((2, -1)))
#
#     imputer = IterativeImputer(missing_values=np.nan, n_iter=2, sample_posterior=True, random_state=1)
#     # reconstructed_dataset = imputer.fit_transform(incomplete_dataset[0, :].reshape((2, -1)))
#     reconstructed_dataset = imputer.fit_transform(incomplete_dataset)
#     # reconstructed_dataset = imputer.complete(incomplete_dataset)
#
#     # print(reconstructed_dataset_mean.shape)
#     print(np.shape(reconstructed_dataset))
#     # print(reconstructed_dataset)
#
#     # fig = rc.plot([reconstructed_dataset])
#     # plt.savefig('Multiple_Impute_out1/{}.png'.format(str(0).zfill(3)), bbox_inches='tight')
#     # plt.close(fig)
#
#     # sys.exit(0)
# Import IterativeImputer from fancyimpute from fancyimpute import IterativeImputer # Copy diabetes to diabetes_mice_imputed diabetes_mice_imputed = diabetes.copy(deep=True) # Initialize IterativeImputer mice_imputer = IterativeImputer() # Impute using fit_tranform on diabetes diabetes_mice_imputed.iloc[:, :] = mice_imputer.fit_transform(diabetes)
# SoftImpute AND IterativeSVD, SimpleFill, MatrixFactorization, and import pandas as pd import numpy as np import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') import sklearn.datasets as SKD data = pd.read_csv('ai_mavan_adhd7.csv', sep=',', index_col=None) # In[ ]: # MICE IMPUTATION mice_impute = IterativeImputer() traindatafill = mice_impute.fit_transform(adhd) # In[ ]: # KNN way to impute adhd_filled_knn = KNN(k=3).fit_transform( adhd ) #use 3 nearest rows which have a feature to fill in each row’s missing features # In[ ]: # NUCLEARNOMMINIMIZATION adhd_filled_nnm = NuclearNormMinimization().fit_transform(adhd) # In[69]:
    # (tail of a plotting call opened above this chunk)
    colorbar = False, title = df_key)
# %%
# As we can see from the charts above, any of the imputation methods we tried will work well.
# The red line depicts the imputed values, and they don't follow the pattern of the data.
# We'll need to try some more advanced techniques such as:
# KNN imputation
# However, since the dataset is very large, KNN approach is not possible.
# MICE imputation
# %% markdown
# ### MICE Imputation
# %%
# Impute df_2 with MICE on a deep copy, in place.
df_2_mice = df_2.copy(deep=True)
mice_imputer = IterativeImputer()
df_2_mice.iloc[:, :] = mice_imputer.fit_transform(df_2_mice)
# %%
df_2_mice.head()
# %%
# Now we need to focus on the latitude and longitude imputations.
# %%
# Even by splitting the data in 3 distinct datasets, it's not possible to use KNN imputation for latitude and longitude
# due to the dataset size and hardware limitations.
# Furthermore, the columns have over 50% of missing data, so the best approach in this case is to drop them.
# %%
# We will evaluate which model performed best on df_2 imputations, select it, and concatenate the datasets df_1 and df_2.
# %%
# Plot density graphs of the imputed DataFrames and the complete case.
df_2['price'].plot(kind='kde', c='red', linewidth=3)
df_2_mean['price'].plot(kind='kde')
df_2_median['price'].plot(kind='kde')
# Label-encode string columns, leaving NaN untouched so the imputer can
# still recognise them as missing.
for col in label_list:
    # encode data leaving nan as they are
    label_column = pd.DataFrame(features[col].values)
    # Fit the encoder only on the observed (string) labels of this column.
    temp_labels = pd.Series(
        [i for i in label_column.iloc[:, 0].unique() if type(i) == str])
    labelencoder_X.fit(temp_labels)
    # Encode strings; pass non-strings (NaN) through unchanged.
    features[col] = features[col].map(
        lambda x: labelencoder_X.transform([x])[0] if type(x) == str else x)

# Multiple Imputation to fill nan values
from fancyimpute import IterativeImputer
import missingno as msno
import matplotlib.pyplot as plt

# Visualize missingness before imputing.
msno.bar(features, figsize=(12, 6), fontsize=12, color='steelblue')
mice = IterativeImputer()
data = pd.DataFrame(data=mice.fit_transform(features),
                    columns=features.columns,
                    index=features.index)

# Check that no null value remains (bare expression — its result is only
# visible in an interactive session).
data.isnull().values.any()

# Drop rows with Year.Built > 2019 or < 0.
data = data[~(data['Year.Built'] < 0) & ~(data['Year.Built'] > 2019)]
# Calculate age of house
data['Age_House'] = 2019 - data['Year.Built']
# Move Target variable at the end
data = data[[c for c in data if c not in ['Sale.Price']] + ['Sale.Price']]