def missing_inputation():
    """Median-impute selected variables of the credit approval dataset.

    Reads ``creditApprovalUCI.csv`` (relative path), splits it into train
    and test sets with ``test_size=0.3`` and ``random_state=0``, fits a
    median ``MeanMedianImputer`` on the training set only, and applies the
    fitted imputer to both sets.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``, where both feature
        sets have been passed through the fitted imputer. Previously the
        results were computed and then discarded (implicit ``None``).
    """
    # Load dataset
    data = pd.read_csv("creditApprovalUCI.csv")

    # Separate into train and test sets; column "A16" is used as the target
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop("A16", axis=1), data["A16"], test_size=0.3, random_state=0
    )

    # Set up the imputer for the listed variables
    median_imputer = MeanMedianImputer(
        imputation_method="median", variables=["A2", "A3", "A8", "A11", "A15"]
    )
    # Fit on the training data only, so the test set does not influence
    # the learned replacement values
    median_imputer.fit(X_train)

    # Apply the same fitted imputer to both splits
    X_train = median_imputer.transform(X_train)
    X_test = median_imputer.transform(X_test)

    # Hand the results back to the caller instead of dropping them
    return X_train, X_test, y_train, y_test
# Example #2
0
# Fraction of missing values per variable in the training set
# (shown as cell output when run interactively).
X_train[vars_with_na].isna().mean()

# %% Missing values -- Numerical -- add missing indicator.
# The indicator transformer is fitted on the training set only and then
# applied to both splits.
missing_ind = AddMissingIndicator(variables=vars_with_na)
missing_ind.fit(X_train)
X_train, X_test = (
    missing_ind.transform(X_train),
    missing_ind.transform(X_test),
)

# Peek at the binary missing-indicator columns expected after the transform
X_train[
    ['LotFrontage_na', 'MasVnrArea_na', 'GarageYrBlt_na']
].head()
# %% Missing values -- Numerical -- mean imputation.
# Replace missing values in the listed variables using a MeanMedianImputer
# configured for the mean and fitted on the training set only.
mean_imputer = MeanMedianImputer(
    # The keyword is `imputation_method`; `imputer_method` is not accepted
    # by MeanMedianImputer and made this call fail.
    imputation_method='mean',
    variables=vars_with_na,
)
mean_imputer.fit(X_train)
# Show the replacement values stored on the fitted imputer
print(mean_imputer.imputer_dict_)

# Apply the same fitted imputer to both splits
X_train = mean_imputer.transform(X_train)
X_test = mean_imputer.transform(X_test)

# %% Verify whether there are missing values.
# NOTE(review): the imputation steps just above operate on `vars_with_na`,
# while this check uses `cat_vars_with_na` -- confirm this is the intended list.
# Number of missing values per variable in the training set.
X_train[cat_vars_with_na].isna().sum()
# Variables that still contain missing values in the test set.
[col for col in cat_vars_with_na if X_test[col].isna().any()]

#%% Temporal variables.
def elapsed_years(df, var):
    """Overwrite a year column with its distance to the sale year.

    Column ``var`` of ``df`` is replaced in place by
    ``df['YrSold'] - df[var]``. The same (mutated) object is returned.
    """
    sale_year = df['YrSold']
    years_elapsed = sale_year - df[var]
    df[var] = years_elapsed
    return df
# Overwrite each listed year column of X_train with YrSold minus that year
for var in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    X_train = elapsed_years(X_train, var)