def missing_inputation():
    """Median-impute selected columns of the credit approval dataset.

    Reads ``creditApprovalUCI.csv``, splits it 70/30 into train and test
    sets with ``A16`` as the target, fits a median imputer on the train set
    and applies it to both sets.  The function returns nothing.
    """
    columns_to_impute = ["A2", "A3", "A8", "A11", "A15"]

    # Load the dataset and separate the predictors from the target column.
    data = pd.read_csv("creditApprovalUCI.csv")
    features = data.drop("A16", axis=1)
    target = data["A16"]

    # Fixed random_state keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=0
    )

    # The imputer is fitted on the train set only and then reused for the
    # test set.
    median_imputer = MeanMedianImputer(
        imputation_method="median", variables=columns_to_impute
    )
    median_imputer.fit(X_train)

    X_train = median_imputer.transform(X_train)
    X_test = median_imputer.transform(X_test)
def test_non_fitted_error(df_na):
    """Calling ``transform`` on an imputer that was never fitted must fail."""
    unfitted_imputer = MeanMedianImputer()

    # Only the transform call is expected to raise.
    with pytest.raises(NotFittedError):
        unfitted_imputer.transform(df_na)
# Add binary indicator columns for the variables listed in vars_with_na.
missing_ind = AddMissingIndicator(variables=vars_with_na)
missing_ind.fit(X_train)

X_train = missing_ind.transform(X_train)
X_test = missing_ind.transform(X_test)

# check the binary missing indicator variables
X_train[['LotFrontage_na', 'MasVnrArea_na', 'GarageYrBlt_na']].head()

# %%
# %% Missing values -- Numerical -- replace missing values with the mean.
# NOTE: the keyword is `imputation_method`; the previous `imputer_method`
# is not a parameter of MeanMedianImputer and made the constructor fail.
mean_imputer = MeanMedianImputer(
    imputation_method='mean', variables=vars_with_na
)

# Fit on the train set only, then reuse the learned values for the test set.
mean_imputer.fit(X_train)
print(mean_imputer.imputer_dict_)

X_train = mean_imputer.transform(X_train)
X_test = mean_imputer.transform(X_test)

# %% Verify whether there are missing values left.
X_train[cat_vars_with_na].isnull().sum()
[var for var in cat_vars_with_na if X_test[var].isnull().sum() > 0]

#%% Temporal variables.
def elapsed_years(df, var):
    """Replace column ``var`` with ``YrSold`` minus its current value.

    The frame is modified in place and is also returned, so the result can
    be reassigned by the caller.
    """
    df[var] = df['YrSold'] - df[var]
    return df


for var in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    X_train = elapsed_years(X_train, var)
    X_test = elapsed_years(X_test, var)

# now we drop YrSold.