예제 #1
0
    def create_features(self):
        """Store the train/test pair built by the one-hot helper.

        Passes ``X_train_origin`` and ``test_origin`` to
        ``data.category_columns_to_one_hot_train_test``, hands both results
        to ``data.check_columns_size``, and then assigns them to
        ``self.train`` and ``self.test``.
        """
        encoded_pair = data.category_columns_to_one_hot_train_test(
            X_train_origin, test_origin)
        encoded_train, encoded_test = encoded_pair

        # Check the pair before exposing it on the instance.
        data.check_columns_size(encoded_train, encoded_test)
        self.train, self.test = encoded_train, encoded_test
예제 #2
0
    def create_features(self):
        """Split the output of ``data.get_zero_percent`` into train and test.

        The helper runs once on ``df_all_origin``. Rows before
        ``train_test_split_index`` become ``self.train``; the remaining rows
        become ``self.test`` after their index is reset (old index dropped).
        Both parts are passed to ``data.check_columns_size`` first.
        """
        zero_ratio_all = data.get_zero_percent(df_all_origin)

        head_part = zero_ratio_all[:train_test_split_index]
        # The tail keeps the positions it had in the full frame, so its index
        # is rebuilt from scratch.
        tail_part = zero_ratio_all[train_test_split_index:]
        tail_part = tail_part.reset_index(drop=True)

        data.check_columns_size(head_part, tail_part)
        self.train, self.test = head_part, tail_part
예제 #3
0
    def create_features(self):
        """Encode four quality columns with a shared grade-to-score mapping.

        ``KitchenQual``, ``GarageQual``, ``ExterQual`` and ``BsmtQual`` are
        each passed to ``data.ordinal_ordered_encoding`` with the mapping
        ``Ex`` 3, ``Gd`` 2, ``TA`` 1, ``Fa`` -1, ``Po`` -2, NaN -3. The
        per-column results are concatenated column-wise onto an empty frame
        that carries the source index, once for ``X_train_origin`` and once
        for ``test_origin``. Both results go through
        ``data.check_columns_size`` and are stored on ``self.train`` and
        ``self.test``.
        """
        quality_columns = ("KitchenQual", "GarageQual", "ExterQual",
                           "BsmtQual")
        grade_scores = (
            ("Ex", 3),
            ("Gd", 2),
            ("TA", 1),
            ("Fa", -1),
            ("Po", -2),
            (np.nan, -3),
        )
        # One separate dict per column, so the helper never receives a
        # mapping object that is shared between columns.
        column_mappings = [dict(grade_scores) for _ in quality_columns]

        train_encoded = pd.DataFrame(index=X_train_origin.index)
        test_encoded = pd.DataFrame(index=test_origin.index)

        def append_encoded(base, source):
            # Append one encoded column block at a time, in column order.
            combined = base
            for name, mapping in zip(quality_columns, column_mappings):
                piece = data.ordinal_ordered_encoding(source, name, mapping)
                combined = pd.concat([combined, piece], axis=1)
            return combined

        train_encoded = append_encoded(train_encoded, X_train_origin)
        test_encoded = append_encoded(test_encoded, test_origin)

        data.check_columns_size(train_encoded, test_encoded)
        self.train, self.test = train_encoded, test_encoded
예제 #4
0
    def create_features(self):
        """Build a fold-wise target-encoded feature for ``OverallQual``.

        For every ``(train_index, valid_index)`` pair yielded by
        ``kf.split(train_origin)``, the rows at ``train_index`` and the rows
        and targets at ``valid_index`` are passed to
        ``data.target_encoding``. The returned values are written into the
        ``train_index`` positions of a single-column frame named
        ``OverallQual_target``. The test feature comes from one call with
        ``test_origin``, ``train_origin`` and ``Y_train_origin``.

        Both frames are passed to ``data.check_columns_size`` and then stored
        on ``self.train`` and ``self.test``.
        """
        column = "OverallQual"
        target_column = "{}_target".format(column)

        df_target_train = pd.DataFrame(
            index=train_origin.index, columns=[target_column])
        for train_index, valid_index in kf.split(train_origin):
            X_tr = train_origin.iloc[train_index, :]
            X_va = train_origin.iloc[valid_index, :]
            y_va = Y_train_origin[valid_index]

            # NOTE(review): presumably the first frame is encoded from the
            # statistics of the second frame and its targets, i.e. the
            # training-fold rows are encoded from the held-out fold. Rows
            # appearing in several training folds are overwritten by later
            # folds - confirm this direction is intended.
            df_te = data.target_encoding(X_tr, X_va, column, y_va)
            df_target_train.iloc[train_index, :] = df_te.values

        df_target_test = data.target_encoding(test_origin, train_origin,
                                              column, Y_train_origin)

        data.check_columns_size(df_target_train, df_target_test)
        self.train = df_target_train
        self.test = df_target_test
예제 #5
0
    def create_features(self):
        """Build a fold-wise target-encoded feature from binned ``YearBuilt``.

        ``data.binning(df_all_origin, "YearBuilt", 10)`` is split at
        ``train_test_split_index`` into a train part and a test part (the
        test part gets a fresh index). The first column of the train part is
        the column to encode.

        For every ``(train_index, valid_index)`` pair yielded by
        ``kf.split(train_bin)``, the rows at ``train_index`` and the rows and
        targets at ``valid_index`` are passed to ``data.target_encoding``.
        The returned values are written into the ``train_index`` positions of
        a single-column ``<column>_target`` frame. The test feature comes
        from one call with the test part, the train part and
        ``Y_train_origin``.

        Both frames are passed to ``data.check_columns_size`` and then stored
        on ``self.train`` and ``self.test``.
        """
        all_bin = data.binning(df_all_origin, "YearBuilt", 10)
        train_bin = all_bin[:train_test_split_index]
        test_bin = all_bin[train_test_split_index:].reset_index(drop=True)
        column = train_bin.columns[0]
        target_column = "{}_target".format(column)

        df_target_train = pd.DataFrame(
            index=train_bin.index, columns=[target_column])
        for train_index, valid_index in kf.split(train_bin):
            X_tr = train_bin.iloc[train_index, :]
            X_va = train_bin.iloc[valid_index, :]
            y_va = Y_train_origin[valid_index]

            # NOTE(review): presumably the first frame is encoded from the
            # statistics of the second frame and its targets, i.e. the
            # training-fold rows are encoded from the held-out fold. Rows
            # appearing in several training folds are overwritten by later
            # folds - confirm this direction is intended.
            df_te = data.target_encoding(X_tr, X_va, column, y_va)
            df_target_train.iloc[train_index, :] = df_te.values

        df_target_test = data.target_encoding(test_bin, train_bin, column,
                                              Y_train_origin)

        data.check_columns_size(df_target_train, df_target_test)
        self.train = df_target_train
        self.test = df_target_test
예제 #6
0
    def create_features(self):
        """Store the train/test pair returned by ``data.get_nan_flag``.

        The helper is called with ``X_train_origin`` and ``test_origin``.
        Its two results are passed to ``data.check_columns_size`` and then
        assigned to ``self.train`` and ``self.test``.
        """
        flag_pair = data.get_nan_flag(X_train_origin, test_origin)
        flags_train, flags_test = flag_pair

        # Check the pair before exposing it on the instance.
        data.check_columns_size(flags_train, flags_test)
        self.train, self.test = flags_train, flags_test