def clean_pos(pos):
    logger.info(f'''
#==============================================================================
# POS CLEANSING
#==============================================================================''')

    # Statuses that carry no balance information are dropped
    pos = pos.query(
        "NAME_CONTRACT_STATUS!='Signed' and NAME_CONTRACT_STATUS!='Approved' and NAME_CONTRACT_STATUS!='XNA'"
    )
    # A contract flagged 'Completed' while instalments remain is still active
    pos.loc[(pos.NAME_CONTRACT_STATUS == 'Completed') &
            (pos.CNT_INSTALMENT_FUTURE != 0), 'NAME_CONTRACT_STATUS'] = 'Active'

    pos_0 = pos.query('CNT_INSTALMENT_FUTURE==0').copy()
    pos_1 = pos.query('CNT_INSTALMENT_FUTURE>0')
    pos_0['NAME_CONTRACT_STATUS'] = 'Completed'
    # Keep a single record per completed contract: after the descending sort on
    # MONTHS_BALANCE, keep='last' retains the earliest month with no remaining instalments
    pos_0.sort_values(by=['SK_ID_PREV', 'MONTHS_BALANCE'],
                      ascending=[True, False], inplace=True)
    pos_0.drop_duplicates('SK_ID_PREV', keep='last', inplace=True)

    pos = pd.concat([pos_0, pos_1], ignore_index=True)
    del pos_0, pos_1
    gc.collect()

    utils.to_df_pkl(df=pos, path='../input', fname='clean_pos')
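# Toy illustration (not project code; assumes pandas is imported as pd, as it is
# elsewhere in this script) of the dedup above: after the descending sort on
# MONTHS_BALANCE, keep='last' leaves the earliest (most negative) month per contract.
_toy = pd.DataFrame({'SK_ID_PREV': [1, 1, 2], 'MONTHS_BALANCE': [-1, -3, -2]})
_toy = _toy.sort_values(by=['SK_ID_PREV', 'MONTHS_BALANCE'], ascending=[True, False])
print(_toy.drop_duplicates('SK_ID_PREV', keep='last'))  # keeps rows (1, -3) and (2, -2)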
def clean_ccb(ccb):
    amt_cols = [col for col in ccb.columns if col.count('AMT')]
    cnt_cols = [col for col in ccb.columns if col.count('CNT')]
    amt_cnt_cols = list(set(amt_cols + cnt_cols))
    for col in amt_cnt_cols:
        ccb[col].fillna(0, inplace=True)
    utils.to_df_pkl(df=ccb, path='../input', fname='clean_ccb')
def clean_prev(pre):
    logger.info(f'''
#==============================================================================
# PREV CLEANSING
#==============================================================================''')
    cash = 'Cash loans'
    revo = 'Revolving loans'
    pre = utils.read_df_pkl(path='../input/previous*.p')

    # Non-positive amounts and counts carry no information -> NaN
    pre['AMT_CREDIT'] = pre['AMT_CREDIT'].where(pre['AMT_CREDIT'] > 0, np.nan)
    pre['AMT_ANNUITY'] = pre['AMT_ANNUITY'].where(pre['AMT_ANNUITY'] > 0, np.nan)
    pre['AMT_APPLICATION'] = pre['AMT_APPLICATION'].where(pre['AMT_APPLICATION'] > 0, np.nan)
    pre['CNT_PAYMENT'] = pre['CNT_PAYMENT'].where(pre['CNT_PAYMENT'] > 0, np.nan)
    pre['AMT_DOWN_PAYMENT'] = pre['AMT_DOWN_PAYMENT'].where(pre['AMT_DOWN_PAYMENT'] > 0, np.nan)
    pre['RATE_DOWN_PAYMENT'] = pre['RATE_DOWN_PAYMENT'].where(pre['RATE_DOWN_PAYMENT'] > 0, np.nan)

    # The 365243 sentinel in the DAYS_* columns means "missing" -> NaN
    pre['DAYS_FIRST_DRAWING'] = pre['DAYS_FIRST_DRAWING'].where(pre['DAYS_FIRST_DRAWING'] < 100000, np.nan)
    pre['DAYS_FIRST_DUE'] = pre['DAYS_FIRST_DUE'].where(pre['DAYS_FIRST_DUE'] < 100000, np.nan)
    pre['DAYS_LAST_DUE_1ST_VERSION'] = pre['DAYS_LAST_DUE_1ST_VERSION'].where(pre['DAYS_LAST_DUE_1ST_VERSION'] < 100000, np.nan)
    pre['DAYS_LAST_DUE'] = pre['DAYS_LAST_DUE'].where(pre['DAYS_LAST_DUE'] < 100000, np.nan)
    pre['DAYS_TERMINATION'] = pre['DAYS_TERMINATION'].where(pre['DAYS_TERMINATION'] < 100000, np.nan)
    # pre['SELLERPLACE_AREA'] = pre['SELLERPLACE_AREA'].where(pre['SELLERPLACE_AREA'] < 200, 200)

    ignore_list = [
        'SK_ID_CURR', 'SK_ID_PREV', 'NAME_CONTRACT_TYPE', 'NAME_CONTRACT_STATUS'
    ]

    # revo: for Revolving loans, set CNT_PAYMENT and the AMT_* columns to NULL
    # for col in pre.columns:
    #     if col in ignore_list:
    #         logger.info(f'CONTINUE: {col}')
    #         continue
    #     pre[f'revo_{col}'] = pre[col].where(pre['NAME_CONTRACT_TYPE']==revo, np.nan)
    #     pre[col] = pre[col].where(pre['NAME_CONTRACT_TYPE']!=revo, np.nan)

    pre['NAME_TYPE_SUITE'].fillna('XNA', inplace=True)
    pre['PRODUCT_COMBINATION'].fillna('XNA', inplace=True)

    utils.to_df_pkl(df=pre, path='../input', fname='clean_prev')
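# Toy illustration (not project code; assumes pandas/numpy imported as pd/np):
# Series.where(cond, np.nan) keeps values where cond is True and masks the rest,
# which is how the 365243 sentinel above becomes NaN.
_s = pd.Series([-1500.0, 365243.0, -200.0])
print(_s.where(_s < 100000, np.nan))  # -> -1500.0, NaN, -200.0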
def clean_app(app):
    logger.info(f'''
#==============================================================================
# APPLICATION
#==============================================================================''')

    # Replace the rare 'XNA' gender with the majority class
    app['CODE_GENDER'].replace('XNA', 'F', inplace=True)

    cat_cols = get_categorical_features(df=app, ignore_list=[])
    for col in cat_cols:
        app[col].fillna('XNA', inplace=True)

    # revo
    # revo = 'Revolving loans'
    # amt_list = ['AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE']
    # for col in amt_list:
    #     app[f'revo_{col}'] = app[col].where(app['NAME_CONTRACT_TYPE']==revo, np.nan)
    #     app[col] = app[col].where(app['NAME_CONTRACT_TYPE']!=revo, np.nan)

    utils.to_df_pkl(df=app, path='../input', fname='clean_application_train_test')
def clean_bureau(bur):
    logger.info(f'''
#==============================================================================
# BUREAU CLEANSING
#==============================================================================''')
    bur = utils.read_df_pkl(path='../input/bureau*.p')

    # Keep only records denominated in the main currency
    bur = bur[bur['CREDIT_CURRENCY'] == 'currency 1']

    # Dates further back than roughly 100 years are treated as missing
    bur['DAYS_CREDIT_ENDDATE'] = bur['DAYS_CREDIT_ENDDATE'].where(bur['DAYS_CREDIT_ENDDATE'] > -36000, np.nan)
    bur['DAYS_ENDDATE_FACT'] = bur['DAYS_ENDDATE_FACT'].where(bur['DAYS_ENDDATE_FACT'] > -36000, np.nan)
    bur['DAYS_CREDIT_UPDATE'] = bur['DAYS_CREDIT_UPDATE'].where(bur['DAYS_CREDIT_UPDATE'] > -36000, np.nan)

    utils.to_df_pkl(df=bur, path='../input', fname='clean_bureau')
def to_pkl():
    app_train = pd.read_csv('../input/application_train.csv')
    app_test = pd.read_csv('../input/application_test.csv')
    app = pd.concat([app_train, app_test], axis=0)
    utils.to_df_pkl(df=app, path='../input', fname='application_train_test')
    app_eda = eda.df_info(app)
    app_eda.to_csv('../eda/application_eda.csv')

    bur = pd.read_csv('../input/bureau.csv')
    utils.to_df_pkl(df=bur, path='../input', fname='bureau')
    bur_eda = eda.df_info(bur)
    bur_eda.to_csv('../eda/bureau_eda.csv')

    pre = pd.read_csv('../input/previous_application.csv')
    utils.to_df_pkl(df=pre, path='../input', fname='previous_application')
    pre_eda = eda.df_info(pre)
    pre_eda.to_csv('../eda/prev_eda.csv')

    ins = pd.read_csv('../input/installments_payments.csv')
    utils.to_df_pkl(df=ins, path='../input', fname='installments_payments')
    ins_eda = eda.df_info(ins)
    ins_eda.to_csv('../eda/install_eda.csv')

    ccb = pd.read_csv('../input/credit_card_balance.csv')
    utils.to_df_pkl(df=ccb, path='../input', fname='credit_card_balance')
    ccb_eda = eda.df_info(ccb)
    ccb_eda.to_csv('../eda/credit_eda.csv')

    pos = pd.read_csv('../input/POS_CASH_balance.csv')
    utils.to_df_pkl(df=pos, path='../input', fname='POS_CASH_balance')
    pos_eda = eda.df_info(pos)
    pos_eda.to_csv('../eda/pos_eda.csv')
def clean_ins(ins):
    # AMT_INSTALMENT == 0 is unexplained and only about 290 rows are affected, so drop them
    ins = ins.query("AMT_INSTALMENT>0")
    utils.to_df_pkl(df=ins, path='../input', fname='clean_install')
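# A minimal driver sketch, not part of the original script: it assumes the
# functions above live in one module and that to_pkl() has been run first so
# the '../input/*.p' pickles exist. The exact glob patterns passed to
# utils.read_df_pkl() here are assumptions.
if __name__ == '__main__':
    to_pkl()
    clean_app(utils.read_df_pkl(path='../input/application_train_test*.p'))
    clean_pos(utils.read_df_pkl(path='../input/POS_CASH_balance*.p'))
    clean_ccb(utils.read_df_pkl(path='../input/credit_card_balance*.p'))
    clean_ins(utils.read_df_pkl(path='../input/installments_payments*.p'))
    # clean_prev and clean_bureau re-read their own pickle internally,
    # so the argument they receive is effectively ignored
    clean_prev(None)
    clean_bureau(None)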
print("Pad Sequences Start!!") train_word_sequences = pad_sequences(x_train, maxlen=max_length, padding='post') test_word_sequences = pad_sequences(x_test, maxlen=max_length, padding='post') # train_word_sequences = pad_sequences(train_word_sequences, maxlen=max_length, padding='post') # test_word_sequences = pad_sequences(test_word_sequences, maxlen=max_length, padding='post') pred_prob = np.zeros((len(test_word_sequences),), dtype=np.float32) del x_train, x_test gc.collect() #======================================================================== # Numericの結合 is_num = 1 if is_num: train_word_sequences = np.hstack((train_word_sequences, num_train.values)) test_word_sequences = np.hstack((test_word_sequences, num_test.values)) print(f"Train: {train_word_sequences.shape} | Test: {test_word_sequences.shape}") print(train_word_sequences[:1]) print(test_word_sequences[:1]) # Train Test Set tx_train = train_word_sequences.copy() x_test = test_word_sequences.copy() if is_save: utils.to_df_pkl(df=pd.DataFrame(tx_train), path='../input', fname=f'0306_MS_NLP_train_only_feat{tx_train.shape[1]}') utils.to_df_pkl(df=pd.DataFrame(x_test), path='../input', fname=f'0306_MS_NLP_test_only_feat{x_test.shape[1]}') del train_word_sequences, test_word_sequences gc.collect() #========================================================================
print(f"{cnt} :", len(df[f'ir_{cnt}@'].dropna())) if len(df[f'ir_{cnt}@'].dropna())<len(df)*0.001: df.drop(f'ir_{cnt}@', axis=1, inplace=True) continue else: ir = ( (df[aan].values * df[cpy].values) / df[acr].values ) - 1.0 df[f'ir_pred@'] = ir df[f'ir_pred@'] = df[f'ir_pred@'].map(lambda x: x if (0.08<=x) and (x<=0.5) else np.nan) cnt = 'pred' ir_cols = [col for col in df.columns if col.count('ir_')] df['ir_mean'] = df[ir_cols].mean(axis=1) df['ir_max'] = df[ir_cols].max(axis=1) df['ir_min'] = df[ir_cols].min(axis=1) df['ir_std'] = df[ir_cols].std(axis=1) utils.to_df_pkl(df=df, path='../eda/', fname='1024_prev_ir') # Curren Applicationに対するCNT_PAYMENTの予測値 df = utils.read_df_pkl('../input/clean_cpy*') df['Pred_CPY_diff_Cal_CPY@'] = df['CNT_PAYMENT'].values - (df['AMT_CREDIT'].values / df['AMT_ANNUITY'].values) if bureau: kb = 'SK_ID_BUREAU' bur = utils.read_df_pkl('../input/clean_bur*')[[key, kb]].groupby(key)[kb].max().reset_index() df = df.reset_index().merge(bur, on=key, how='left') df['bur_bin'] = 'train_no_bureau' df['bur_bin'] = df.where(df[kb].isnull(), np.nan)['bur_bin'] if standard:
import re
import sys

import pandas as pd
from wordcloud import STOPWORDS


def quara_load_data():
    # Read the pickled train / test sets
    train = utils.read_df_pkl(path='../input/train*.p')
    test = utils.read_df_pkl(path='../input/test*.p')
    return train, test


if is_pickle:
    train = pd.read_csv('../input/train.csv')
    test = pd.read_csv('../input/test.csv')
    utils.to_df_pkl(path='../input/', fname='train', df=train)
    utils.to_df_pkl(path='../input/', fname='test', df=test)

if is_base:
    train, test = quara_load_data()
    df = pd.concat([train, test], axis=0)
    utils.to_pkl_gzip(obj=df[['qid', 'target']], path='../input/base')
    sys.exit()


def cleansing_text(text, remove_stopwords=True, stem_words=False):
    # Convert words to lower case and split them
    text = re.sub("_", " ", text, flags=re.IGNORECASE)
    text = text.lower().split()
    regex_num = re.compile(u"[0-9０-９]")
train.drop(drop_list, axis=1, inplace=True)
test.drop(drop_list, axis=1, inplace=True)

#========================================================================
# Train & Prediction Start
#========================================================================
LGBM = LGBM.cross_prediction(train=train,
                             test=test,
                             key=key,
                             target=target,
                             fold_type=fold_type,
                             fold=fold,
                             group_col_name=group_col_name,
                             params=params,
                             num_boost_round=num_boost_round,
                             early_stopping_rounds=early_stopping_rounds,
                             oof_flg=oof_flg)
cv_score = LGBM.cv_score
result = LGBM.prediction
cv_feim = LGBM.cv_feim
feature_num = len(LGBM.use_cols)

cv_feim.to_csv(
    f'../valid/{start_time[4:12]}_feim_feat{feature_num}_CV{cv_score}.csv',
    index=False)

test[target] = result.astype(np.int64)
utils.to_df_pkl(df=test, path='../input/', fname='clean_cpy_application')
#========================================================================
# Tokenize
#========================================================================
start_time = time.time()
print("Transforming...")

tx_col = "text"
if is_make:
    # Tokenize the sentences: join all feature columns into a single text field
    use_cols = [col for col in nlp_train.columns if col not in ignore_list]
    # nlp_train[tx_col] = nlp_train[use_cols].apply(lambda x: ' '.join([str(tx) for tx in x]), axis=1)
    nlp_train[tx_col] = nlp_train[use_cols].apply(
        lambda x: ' '.join(x.values.tolist()), axis=1)
    utils.to_df_pkl(df=nlp_train[[key, tx_col, target]], path='../input/',
                    fname=f'0305_MS_NLP_feat{len(use_cols)}')
else:
    base = utils.read_df_pkl(path='../input/base_Av*')
    # len_train = base[~base[target].isnull()]
    nlp_train = utils.read_df_pkl(path='../input/0305_MS_NLP_feat*')
    nlp_train[target] = base[target].values

text_list = nlp_train[tx_col].values.tolist()

max_features = 10000
nb_words = max_features
max_length = 100
tokenizer = Tokenizer(num_words=max_features, split=" ")
tokenizer.fit_on_texts(text_list)
del text_list
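# Sketch (an assumption, not shown in this excerpt) of the step between fitting
# the tokenizer here and the pad_sequences block earlier in this section:
# converting the texts to integer id sequences and recovering the train / test
# split from the target column (test rows have a null target).
sequences = tokenizer.texts_to_sequences(nlp_train[tx_col].values.tolist())
n_train = int((~nlp_train[target].isnull()).sum())
x_train = sequences[:n_train]
x_test = sequences[n_train:]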
    train.loc[train[col].isin(no_train_list), col] = major_val
    test.loc[test[col].isin(no_train_list), col] = major_val
    # Flag minor categories explicitly so they stay recognisable as minor
    train.loc[train[col].isin(less_than), col] = -1
    test.loc[test[col].isin(less_than), col] = -1
    print(f"{col} Complete!")

if is_save:
    no_test_idx_list = list(set(no_test_idx_list))
    print(f"All Train shape: {train.shape}")
    # Drop train rows carrying categories that never appear in test
    train = train.loc[~train[key].isin(no_test_idx_list), :]
    print(f"Exclude No Test Category Train shape: {train.shape}")

    df_feat = pd.concat([train, test], axis=0, ignore_index=True)
    if is_base:
        base = df_feat[[key, target, 'country_group']]
        utils.to_df_pkl(df=base, path='../input', fname='base_exclude_no_test')

    feat_cols = [
        col for col in train.columns
        if col not in ignore_list and not col.count('country_')
    ]
    df_feat = df_feat[feat_cols]
    feat_cols = [col.replace('f000_', '') for col in df_feat.columns]
    df_feat.columns = feat_cols
    MS_utils.save_feature(df_feat=df_feat, prefix=prefix)
    print(f"Feature Save Complete!!")