def data_transform(self):
    """Prepare the installment-payment table for feature building.

    Steps, in order:

    1. Pass the frame through ``cu.replace_day_outliers``.
    2. Add ``TIME_DAYS_INSTALMENT`` and ``TIME_DAYS_ENTRY_PAYMENT``: the
       matching ``DAYS_*`` columns read as day-based timedeltas and shifted
       by ``self.__start_time`` (presumably a timestamp; it is set outside
       this method).
    3. Prefix every object-dtype column with ``FLAG_INSTALLMENT_PAYMENT_``
       and one-hot encode those columns, with an extra indicator for NaN.

    The result is stored back on ``self.__installment_payment``; nothing is
    returned.
    """
    self.__installment_payment = cu.replace_day_outliers(
        self.__installment_payment)

    # Day offset -> timedelta -> shifted by the start time.
    for days_col in ("DAYS_INSTALMENT", "DAYS_ENTRY_PAYMENT"):
        offset = pd.to_timedelta(self.__installment_payment[days_col], "D")
        self.__installment_payment["TIME_" + days_col] = (
            offset + self.__start_time)

    # Prefix the categorical columns so that featuretools variable types are
    # easy to specify later on.
    flag_names = {
        name: "FLAG_INSTALLMENT_PAYMENT_" + name
        for name in self.__installment_payment.select_dtypes(
            include="object").columns.tolist()
    }
    self.__installment_payment.rename(columns=flag_names, inplace=True)

    self.__installment_payment = pd.get_dummies(
        data=self.__installment_payment,
        prefix="FLAG_INSTALLMENT_PAYMENT",
        dummy_na=True,
        columns=list(flag_names.values()))
def data_transform(self):
    """Prepare the previous-application table for feature building.

    Steps, in order:

    1. Pass the frame through ``cu.replace_day_outliers``.
    2. Replace the placeholder strings ``'XNA'`` and ``'XAP'`` with NaN.
    3. For each ``DAYS_*`` column listed below, add a ``TIME_DAYS_*`` column:
       the day offset read as a timedelta and shifted by
       ``self.__start_time`` (presumably a timestamp; it is set outside this
       method).
    4. Prefix every object-dtype column with ``FLAG_PREVIOUS_APPLICATION_``
       and one-hot encode those columns, with an extra indicator for NaN.

    The result is stored back on ``self.__previous_application``; nothing is
    returned.
    """
    cleaned = cu.replace_day_outliers(self.__previous_application)
    self.__previous_application = cleaned.replace(['XNA', 'XAP'], np.nan)

    # The order here fixes the order of the new TIME_* columns in the frame.
    day_columns = (
        "DAYS_DECISION",
        "DAYS_FIRST_DRAWING",
        "DAYS_FIRST_DUE",
        "DAYS_LAST_DUE_1ST_VERSION",
        "DAYS_LAST_DUE",
        "DAYS_TERMINATION",
    )
    for days_col in day_columns:
        offset = pd.to_timedelta(self.__previous_application[days_col], "D")
        self.__previous_application["TIME_" + days_col] = (
            offset + self.__start_time)

    # Prefix the categorical columns so that featuretools variable types are
    # easy to specify later on.
    flag_names = {
        name: "FLAG_PREVIOUS_APPLICATION_" + name
        for name in self.__previous_application.select_dtypes(
            include="object").columns.tolist()
    }
    self.__previous_application.rename(columns=flag_names, inplace=True)

    self.__previous_application = pd.get_dummies(
        data=self.__previous_application,
        dummy_na=True,
        columns=list(flag_names.values()))
def data_transform(self):
    """Prepare the bureau table for feature building.

    Steps, in order:

    1. Pass the frame through ``cu.replace_day_outliers``.
    2. In ``DAYS_CREDIT_ENDDATE``, ``DAYS_CREDIT_UPDATE`` and
       ``DAYS_ENDDATE_FACT``, replace every value below -40000 with NaN.
    3. For each of the four ``DAYS_*`` columns listed below, add a
       ``TIME_DAYS_*`` column: the day offset read as a timedelta and shifted
       by ``self.__start_time`` (presumably a timestamp; it is set outside
       this method).
    4. Fill missing values with 0 in ``AMT_CREDIT_SUM``,
       ``AMT_CREDIT_SUM_DEBT``, ``AMT_CREDIT_SUM_OVERDUE`` and
       ``CNT_CREDIT_PROLONG``.
    5. Prefix every object-dtype column with ``FLAG_BUREAU_`` and one-hot
       encode those columns, with an extra indicator for NaN.

    The result is stored back on ``self.__bureau``; nothing is returned.
    """
    lower_day_bound = -40000

    self.__bureau = cu.replace_day_outliers(self.__bureau)

    # Reassign whole columns instead of writing through chained indexing
    # (``df[col][mask] = ...``): the chained form writes to a temporary under
    # pandas Copy-on-Write and silently leaves the frame untouched.
    # ``Series.mask`` fills with NaN by default and upcasts integer columns.
    for days_col in ("DAYS_CREDIT_ENDDATE", "DAYS_CREDIT_UPDATE",
                     "DAYS_ENDDATE_FACT"):
        values = self.__bureau[days_col]
        self.__bureau[days_col] = values.mask(values < lower_day_bound)

    # The order here fixes the order of the new TIME_* columns in the frame.
    for days_col in ("DAYS_CREDIT", "DAYS_CREDIT_ENDDATE",
                     "DAYS_ENDDATE_FACT", "DAYS_CREDIT_UPDATE"):
        offset = pd.to_timedelta(self.__bureau[days_col], "D")
        self.__bureau["TIME_" + days_col] = offset + self.__start_time

    # Same reasoning as above: ``df[col].fillna(..., inplace=True)`` acts on
    # a temporary Series, so assign the filled column back explicitly.
    for amount_col in ("AMT_CREDIT_SUM", "AMT_CREDIT_SUM_DEBT",
                       "AMT_CREDIT_SUM_OVERDUE", "CNT_CREDIT_PROLONG"):
        self.__bureau[amount_col] = self.__bureau[amount_col].fillna(0)

    # Prefix the categorical columns so that featuretools variable types are
    # easy to specify later on. The object columns are looked up once rather
    # than on every loop iteration.
    flag_names = {
        name: "FLAG_BUREAU_" + name
        for name in self.__bureau.select_dtypes(
            include="object").columns.tolist()
    }
    self.__bureau.rename(columns=flag_names, inplace=True)

    self.__bureau = pd.get_dummies(
        data=self.__bureau,
        dummy_na=True,
        columns=list(flag_names.values()))
def data_transform(self):
    """Prepare the credit-card table for feature building.

    Steps, in order:

    1. Pass the frame through ``cu.replace_day_outliers``.
    2. Add ``TIME_MONTHS_BALANCE``: ``MONTHS_BALANCE`` converted to a
       timedelta (one month = 2 629 746 seconds) and shifted by
       ``self.__start_time`` (presumably a timestamp; it is set outside this
       method).
    3. Prefix every object-dtype column with ``FLAG_CREDIT_CARD_`` and
       one-hot encode those columns, with an extra indicator for NaN.

    The result is stored back on ``self.__credit_card``; nothing is returned.
    """
    # One month as 1/12 of a 365.2425-day year, in seconds. Current pandas
    # rejects ``unit="M"`` in ``to_timedelta`` as ambiguous; this is the fixed
    # span legacy pandas used for that unit, so the timestamps are unchanged.
    seconds_per_month = 2629746

    self.__credit_card = cu.replace_day_outliers(self.__credit_card)

    month_offset = pd.to_timedelta(
        self.__credit_card["MONTHS_BALANCE"] * seconds_per_month, unit="s")
    self.__credit_card["TIME_MONTHS_BALANCE"] = (
        month_offset + self.__start_time)

    # Prefix the categorical columns so that featuretools variable types are
    # easy to specify later on. The object columns are looked up once rather
    # than on every loop iteration.
    flag_names = {
        name: "FLAG_CREDIT_CARD_" + name
        for name in self.__credit_card.select_dtypes(
            include="object").columns.tolist()
    }
    self.__credit_card.rename(columns=flag_names, inplace=True)

    self.__credit_card = pd.get_dummies(
        data=self.__credit_card,
        dummy_na=True,
        columns=list(flag_names.values()))
def data_transform(self):
    """Prepare the bureau-balance table for feature building.

    Steps, in order:

    1. Pass the frame through ``cu.replace_day_outliers``.
    2. Add ``TIME_MONTHS_BALANCE``: ``MONTHS_BALANCE`` converted to a
       timedelta (one month = 2 629 746 seconds) and shifted by
       ``self.__start_time`` (presumably a timestamp; it is set outside this
       method).
    3. Prefix every object-dtype column with ``FLAG_BUREAU_BALANCE_`` and
       one-hot encode those columns under the shared dummy prefix
       ``FLAG_BUREAU_BALANCE``, with an extra indicator for NaN.

    The result is stored back on ``self.__bureau_balance``; nothing is
    returned.
    """
    # One month as 1/12 of a 365.2425-day year, in seconds. Current pandas
    # rejects ``unit="M"`` in ``to_timedelta`` as ambiguous; this is the fixed
    # span legacy pandas used for that unit, so the timestamps are unchanged.
    seconds_per_month = 2629746

    self.__bureau_balance = cu.replace_day_outliers(self.__bureau_balance)

    month_offset = pd.to_timedelta(
        self.__bureau_balance["MONTHS_BALANCE"] * seconds_per_month,
        unit="s")
    self.__bureau_balance["TIME_MONTHS_BALANCE"] = (
        month_offset + self.__start_time)

    # Prefix the categorical columns so that featuretools variable types are
    # easy to specify later on. The object columns are looked up once rather
    # than on every loop iteration.
    flag_names = {
        name: "FLAG_BUREAU_BALANCE_" + name
        for name in self.__bureau_balance.select_dtypes(
            include="object").columns.tolist()
    }
    self.__bureau_balance.rename(columns=flag_names, inplace=True)

    self.__bureau_balance = pd.get_dummies(
        data=self.__bureau_balance,
        prefix="FLAG_BUREAU_BALANCE",
        dummy_na=True,
        columns=list(flag_names.values()))
def data_transform(self): self.__application_test = cu.replace_day_outliers( self.__application_test) dropcolum = [ 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21' ] self.__application_test = self.__application_test[ self.__application_test['CODE_GENDER'] != 'XNA'] self.__application_test = self.__application_test.drop(dropcolum, axis=1) # Categorical features with Binary encode (0 or 1; two categories) for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']: self.__application_test[bin_feature], uniques = pd.factorize( self.__application_test[bin_feature])