def data_transform(self):
        """Prepare the installment-payment frame for feature engineering.

        Steps, in order:

        1. Pass the frame through ``cu.replace_day_outliers``.
        2. For each ``DAYS_*`` column listed below, add a ``TIME_DAYS_*``
           column: the day count converted to a timedelta and shifted by
           ``self.__start_time``.
        3. Prefix every object-dtype column with
           ``FLAG_INSTALLMENT_PAYMENT_``.
        4. One-hot encode those columns (with a NaN indicator) and store
           the result back on ``self.__installment_payment``.
        """
        frame = cu.replace_day_outliers(self.__installment_payment)
        self.__installment_payment = frame

        # NOTE(review): self.__start_time is presumably a pandas Timestamp
        # used as the reference date -- confirm against the constructor.
        day_to_time = (
            ("DAYS_INSTALMENT", "TIME_DAYS_INSTALMENT"),
            ("DAYS_ENTRY_PAYMENT", "TIME_DAYS_ENTRY_PAYMENT"),
        )
        for day_col, time_col in day_to_time:
            frame[time_col] = (
                pd.to_timedelta(frame[day_col], unit="D") + self.__start_time)

        # Prefix object columns so featuretools variable types can be
        # assigned easily later on.
        object_cols = frame.select_dtypes(include="object").columns.tolist()
        renamed = {
            name: "FLAG_INSTALLMENT_PAYMENT_" + name for name in object_cols
        }
        frame.rename(columns=renamed, inplace=True)

        categorical = frame.select_dtypes(include="object").columns.tolist()
        self.__installment_payment = pd.get_dummies(
            data=frame,
            prefix="FLAG_INSTALLMENT_PAYMENT",
            dummy_na=True,
            columns=categorical)
Пример #2
0
    def data_transform(self):
        """Clean and encode the previous-application frame.

        Steps, in order:

        1. Pass the frame through ``cu.replace_day_outliers``.
        2. Replace the strings ``'XNA'`` and ``'XAP'`` with NaN.
        3. For each ``DAYS_*`` column listed below, add a ``TIME_DAYS_*``
           column: the day count converted to a timedelta and shifted by
           ``self.__start_time``.
        4. Prefix every object-dtype column with
           ``FLAG_PREVIOUS_APPLICATION_``.
        5. One-hot encode those columns (with a NaN indicator) and store
           the result back on ``self.__previous_application``.
        """
        frame = cu.replace_day_outliers(self.__previous_application)
        frame = frame.replace(["XNA", "XAP"], np.nan)
        self.__previous_application = frame

        # NOTE(review): self.__start_time is presumably a pandas Timestamp
        # used as the reference date -- confirm against the constructor.
        day_columns = (
            "DAYS_DECISION",
            "DAYS_FIRST_DRAWING",
            "DAYS_FIRST_DUE",
            "DAYS_LAST_DUE_1ST_VERSION",
            "DAYS_LAST_DUE",
            "DAYS_TERMINATION",
        )
        for day_col in day_columns:
            frame["TIME_" + day_col] = (
                pd.to_timedelta(frame[day_col], unit="D") + self.__start_time)

        # Prefix object columns so featuretools variable types can be
        # assigned easily later on.
        object_cols = frame.select_dtypes(include="object").columns.tolist()
        renamed = {
            name: "FLAG_PREVIOUS_APPLICATION_" + name for name in object_cols
        }
        frame.rename(columns=renamed, inplace=True)

        categorical = frame.select_dtypes(include="object").columns.tolist()
        self.__previous_application = pd.get_dummies(
            data=frame,
            dummy_na=True,
            columns=categorical)
    def data_transform(self):
        """Clean, time-stamp and one-hot encode the bureau frame.

        Steps, in order:

        1. Pass the frame through ``cu.replace_day_outliers``.
        2. Set day offsets below -40000 to NaN in the three columns listed
           in ``capped_columns``.
        3. For each ``DAYS_*`` column in ``day_columns``, add a
           ``TIME_DAYS_*`` column: the day count converted to a timedelta
           and shifted by ``self.__start_time``.
        4. Fill missing values in the amount / prolongation columns with 0.
        5. Prefix every object-dtype column with ``FLAG_BUREAU_``.
        6. One-hot encode those columns (with a NaN indicator) and store
           the result back on ``self.__bureau``.

        Columns are updated by direct assignment (``frame[col] = ...``)
        rather than chained indexing or ``inplace=True`` on a column
        selection. Those forms operate on a temporary object, so pandas
        warns about them and, with copy-on-write enabled, they leave the
        frame unchanged.
        """
        self.__bureau = cu.replace_day_outliers(self.__bureau)
        bureau = self.__bureau

        # -40000 days is roughly 110 years back; anything below is blanked.
        capped_columns = (
            "DAYS_CREDIT_ENDDATE",
            "DAYS_CREDIT_UPDATE",
            "DAYS_ENDDATE_FACT",
        )
        for col in capped_columns:
            bureau[col] = bureau[col].mask(bureau[col] < -40000)

        # NOTE(review): self.__start_time is presumably a pandas Timestamp
        # used as the reference date -- confirm against the constructor.
        day_columns = (
            "DAYS_CREDIT",
            "DAYS_CREDIT_ENDDATE",
            "DAYS_ENDDATE_FACT",
            "DAYS_CREDIT_UPDATE",
        )
        for col in day_columns:
            bureau["TIME_" + col] = (
                pd.to_timedelta(bureau[col], unit="D") + self.__start_time)

        zero_filled_columns = (
            "AMT_CREDIT_SUM",
            "AMT_CREDIT_SUM_DEBT",
            "AMT_CREDIT_SUM_OVERDUE",
            "CNT_CREDIT_PROLONG",
        )
        for col in zero_filled_columns:
            bureau[col] = bureau[col].fillna(0)

        # Prefix object columns so featuretools variable types can be
        # assigned easily later on.
        object_cols = bureau.select_dtypes(include="object").columns.tolist()
        bureau.rename(
            columns={name: "FLAG_BUREAU_" + name for name in object_cols},
            inplace=True)

        self.__bureau = pd.get_dummies(
            data=bureau,
            dummy_na=True,
            columns=bureau.select_dtypes(include="object").columns.tolist())
    def data_transform(self):
        """Time-stamp and one-hot encode the credit-card balance frame.

        Steps, in order:

        1. Pass the frame through ``cu.replace_day_outliers``.
        2. Add ``TIME_MONTHS_BALANCE``: ``MONTHS_BALANCE`` converted to a
           timedelta and shifted by ``self.__start_time``.
        3. Prefix every object-dtype column with ``FLAG_CREDIT_CARD_``.
        4. One-hot encode those columns (with a NaN indicator) and store
           the result back on ``self.__credit_card``.
        """
        self.__credit_card = cu.replace_day_outliers(self.__credit_card)
        card = self.__credit_card

        # Current pandas rejects the ambiguous "M" (month) timedelta unit.
        # Older releases treated it as 1/12 of a 365.2425-day year, i.e.
        # 2,629,746 seconds; use that fixed length so results are unchanged.
        seconds_per_month = 2629746
        # NOTE(review): self.__start_time is presumably a pandas Timestamp
        # used as the reference date -- confirm against the constructor.
        card["TIME_MONTHS_BALANCE"] = (
            pd.to_timedelta(card["MONTHS_BALANCE"] * seconds_per_month,
                            unit="s")
            + self.__start_time)

        # Prefix object columns so featuretools variable types can be
        # assigned easily later on.
        object_cols = card.select_dtypes(include="object").columns.tolist()
        card.rename(
            columns={name: "FLAG_CREDIT_CARD_" + name for name in object_cols},
            inplace=True)

        self.__credit_card = pd.get_dummies(
            data=card,
            dummy_na=True,
            columns=card.select_dtypes(include="object").columns.tolist())
    def data_transform(self):
        """Time-stamp and one-hot encode the bureau-balance frame.

        Steps, in order:

        1. Pass the frame through ``cu.replace_day_outliers``.
        2. Add ``TIME_MONTHS_BALANCE``: ``MONTHS_BALANCE`` converted to a
           timedelta and shifted by ``self.__start_time``.
        3. Prefix every object-dtype column with ``FLAG_BUREAU_BALANCE_``.
        4. One-hot encode those columns (with a NaN indicator), using
           ``FLAG_BUREAU_BALANCE`` as the dummy-column prefix, and store
           the result back on ``self.__bureau_balance``.
        """
        self.__bureau_balance = cu.replace_day_outliers(self.__bureau_balance)
        balance = self.__bureau_balance

        # Current pandas rejects the ambiguous "M" (month) timedelta unit.
        # Older releases treated it as 1/12 of a 365.2425-day year, i.e.
        # 2,629,746 seconds; use that fixed length so results are unchanged.
        seconds_per_month = 2629746
        # NOTE(review): self.__start_time is presumably a pandas Timestamp
        # used as the reference date -- confirm against the constructor.
        balance["TIME_MONTHS_BALANCE"] = (
            pd.to_timedelta(balance["MONTHS_BALANCE"] * seconds_per_month,
                            unit="s")
            + self.__start_time)

        # Prefix object columns so featuretools variable types can be
        # assigned easily later on.
        object_cols = balance.select_dtypes(include="object").columns.tolist()
        balance.rename(
            columns={
                name: "FLAG_BUREAU_BALANCE_" + name for name in object_cols
            },
            inplace=True)

        # The single shared prefix is kept as-is: downstream code may rely
        # on the resulting dummy column names.
        self.__bureau_balance = pd.get_dummies(
            data=balance,
            prefix="FLAG_BUREAU_BALANCE",
            dummy_na=True,
            columns=balance.select_dtypes(include="object").columns.tolist())
Пример #6
0
    def data_transform(self):
        self.__application_test = cu.replace_day_outliers(
            self.__application_test)
        dropcolum = [
            'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
            'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8',
            'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11',
            'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14',
            'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17',
            'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
            'FLAG_DOCUMENT_21'
        ]

        self.__application_test = self.__application_test[
            self.__application_test['CODE_GENDER'] != 'XNA']
        self.__application_test = self.__application_test.drop(dropcolum,
                                                               axis=1)

        # Categorical features with Binary encode (0 or 1; two categories)
        for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
            self.__application_test[bin_feature], uniques = pd.factorize(
                self.__application_test[bin_feature])