def create_features(self):
    """Populate ``self.train`` and ``self.test`` with one-hot encoded frames.

    Both frames come from a single call to
    ``data.category_columns_to_one_hot_train_test`` on the module-level
    ``X_train_origin`` and ``test_origin``.
    """
    encoded_train, encoded_test = data.category_columns_to_one_hot_train_test(
        X_train_origin,
        test_origin,
    )

    # NOTE(review): presumably verifies that both frames expose the same
    # columns -- confirm against the helper's implementation.
    data.check_columns_size(encoded_train, encoded_test)

    self.train, self.test = encoded_train, encoded_test
def create_features(self):
    """Populate ``self.train`` and ``self.test`` from ``data.get_zero_percent``.

    The helper runs once on the combined ``df_all_origin``; its result is
    then cut at ``train_test_split_index``. The test part gets a fresh
    0-based index (the old index is dropped).
    """
    # NOTE(review): the helper name suggests a per-row share of zero values,
    # but its implementation is not visible here -- confirm.
    zero_ratio_all = data.get_zero_percent(df_all_origin)

    head = zero_ratio_all[:train_test_split_index]
    tail = zero_ratio_all[train_test_split_index:].reset_index(drop=True)

    data.check_columns_size(head, tail)

    self.train, self.test = head, tail
def create_features(self):
    """Populate ``self.train`` and ``self.test`` with ordinal quality encodings.

    Each of the four quality columns is passed, together with the same
    grade-to-score mapping, to ``data.ordinal_ordered_encoding``. The
    per-column results are concatenated column-wise onto an empty frame
    that carries the index of the input frame. ``X_train_origin`` is
    processed first, then ``test_origin``.
    """
    quality_columns = ["KitchenQual", "GarageQual", "ExterQual", "BsmtQual"]
    # One shared scale: every column used the identical mapping, so a single
    # literal replaces four copy-pasted ones. Missing grades map to -3.
    quality_scale = {
        "Ex": 3,
        "Gd": 2,
        "TA": 1,
        "Fa": -1,
        "Po": -2,
        np.nan: -3,
    }

    def encode_quality_columns(frame):
        # Encode every quality column of `frame`, appending one at a time so
        # the column order follows `quality_columns`.
        encoded = pd.DataFrame(index=frame.index)
        for column in quality_columns:
            encoded = pd.concat(
                [
                    encoded,
                    data.ordinal_ordered_encoding(frame, column, quality_scale),
                ],
                axis=1,
            )
        return encoded

    df_new_train = encode_quality_columns(X_train_origin)
    df_new_test = encode_quality_columns(test_origin)

    data.check_columns_size(df_new_train, df_new_test)

    self.train = df_new_train
    self.test = df_new_test
def create_features(self):
    """Populate ``self.train`` and ``self.test`` with a target encoding of ``OverallQual``.

    Train rows are filled fold by fold using the splits produced by
    ``kf.split(train_origin)``; test rows are encoded in one call that uses
    the whole of ``train_origin`` and ``Y_train_origin``. The resulting
    column is named ``OverallQual_target``.
    """
    column = "OverallQual"
    df_target_train = pd.DataFrame(
        index=train_origin.index,
        columns=["{}_target".format(column)],
    )

    for train_index, valid_index in kf.split(train_origin):
        X_tr = train_origin.iloc[train_index, :]
        X_va = train_origin.iloc[valid_index, :]
        y_va = Y_train_origin[valid_index]

        # NOTE(review): judging by the test-set call below, the arguments
        # appear to be (frame to encode, reference frame, column, reference
        # target). If so, this encodes the *training* rows of each fold from
        # the *validation* rows' targets, and rows that appear in several
        # training folds are overwritten by later folds. Conventional
        # out-of-fold encoding does the opposite (encode X_va from X_tr/y_tr
        # and write to valid_index). Behaviour is kept as-is; confirm intent.
        df_te = data.target_encoding(X_tr, X_va, column, y_va)
        df_target_train.iloc[train_index, :] = df_te.values

    df_target_test = data.target_encoding(
        test_origin, train_origin, column, Y_train_origin
    )

    data.check_columns_size(df_target_train, df_target_test)

    self.train = df_target_train
    self.test = df_target_test
def create_features(self):
    """Populate ``self.train`` and ``self.test`` with a target encoding of binned ``YearBuilt``.

    ``data.binning(df_all_origin, "YearBuilt", 10)`` runs on the combined
    frame, which is then cut at ``train_test_split_index`` (the test part
    gets a fresh 0-based index). The first column of the binned train frame
    is target-encoded: train rows fold by fold via ``kf.split``, test rows
    in one call using the whole binned train frame and ``Y_train_origin``.
    The resulting column is named ``<binned column>_target``.
    """
    # NOTE(review): the meaning of the third argument (10) depends on
    # data.binning, which is not visible here -- confirm.
    all_bin = data.binning(df_all_origin, "YearBuilt", 10)
    train_bin = all_bin[:train_test_split_index]
    test_bin = all_bin[train_test_split_index:].reset_index(drop=True)

    column = train_bin.columns[0]
    df_target_train = pd.DataFrame(
        index=train_bin.index,
        columns=["{}_target".format(column)],
    )

    for train_index, valid_index in kf.split(train_bin):
        X_tr = train_bin.iloc[train_index, :]
        X_va = train_bin.iloc[valid_index, :]
        y_va = Y_train_origin[valid_index]

        # NOTE(review): judging by the test-set call below, the arguments
        # appear to be (frame to encode, reference frame, column, reference
        # target). If so, this encodes the *training* rows of each fold from
        # the *validation* rows' targets, and rows that appear in several
        # training folds are overwritten by later folds. Conventional
        # out-of-fold encoding does the opposite (encode X_va from X_tr/y_tr
        # and write to valid_index). Behaviour is kept as-is; confirm intent.
        df_te = data.target_encoding(X_tr, X_va, column, y_va)
        df_target_train.iloc[train_index, :] = df_te.values

    df_target_test = data.target_encoding(
        test_bin, train_bin, column, Y_train_origin
    )

    data.check_columns_size(df_target_train, df_target_test)

    self.train = df_target_train
    self.test = df_target_test
def create_features(self):
    """Populate ``self.train`` and ``self.test`` from ``data.get_nan_flag``.

    The helper is called once with the module-level ``X_train_origin`` and
    ``test_origin`` and returns the train and test frames as a pair.
    """
    # NOTE(review): the helper name suggests missing-value indicator columns,
    # but its implementation is not visible here -- confirm.
    flagged_train, flagged_test = data.get_nan_flag(
        X_train_origin,
        test_origin,
    )

    data.check_columns_size(flagged_train, flagged_test)

    self.train, self.test = flagged_train, flagged_test