Example #1
def clean_pos(pos):
    logger.info('''
    #==============================================================================
    # POS CLEANSING
    #==============================================================================''')

    pos = pos.query(
        "NAME_CONTRACT_STATUS!='Signed' and NAME_CONTRACT_STATUS!='Approved' and NAME_CONTRACT_STATUS!='XNA'"
    )
    pos.loc[(pos.NAME_CONTRACT_STATUS == 'Completed') &
            (pos.CNT_INSTALMENT_FUTURE != 0),
            'NAME_CONTRACT_STATUS'] = 'Active'

    pos_0 = pos.query('CNT_INSTALMENT_FUTURE==0').copy()
    pos_1 = pos.query('CNT_INSTALMENT_FUTURE>0')
    pos_0['NAME_CONTRACT_STATUS'] = 'Completed'
    pos_0.sort_values(by=['SK_ID_PREV', 'MONTHS_BALANCE'],
                      ascending=[True, False],
                      inplace=True)
    pos_0.drop_duplicates('SK_ID_PREV', keep='last', inplace=True)
    pos = pd.concat([pos_0, pos_1], ignore_index=True)
    del pos_0, pos_1
    gc.collect()

    utils.to_df_pkl(df=pos, path='../input', fname='clean_pos')
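
All of these examples rely on a project-local `utils` module whose `to_df_pkl` / `read_df_pkl` pair is never shown. A minimal sketch of what such helpers might look like, assuming a simple one-pickle-per-name layout (the names, glob convention, and file extension are assumptions, not the project's actual implementation):

import glob
import pandas as pd

def to_df_pkl(df, path, fname):
    # Persist a DataFrame as a pickle under the given directory.
    df.to_pickle(f'{path}/{fname}.p')

def read_df_pkl(path):
    # Load the first pickle matching a glob such as '../input/clean_pos*.p'.
    return pd.read_pickle(sorted(glob.glob(path))[0])
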
Example #2
def clean_ccb(ccb):

    amt_cols = [col for col in ccb.columns if 'AMT' in col]
    cnt_cols = [col for col in ccb.columns if 'CNT' in col]
    amt_cnt_cols = list(set(amt_cols + cnt_cols))
    for col in amt_cnt_cols:
        ccb[col].fillna(0, inplace=True)

    utils.to_df_pkl(df=ccb, path='../input', fname='clean_ccb')
Example #3
def clean_prev(pre):
    logger.info('''
    #==============================================================================
    # PREV CLEANSING
    #==============================================================================''')

    cash = 'Cash loans'
    revo = 'Revolving loans'
    pre = utils.read_df_pkl(path='../input/previous*.p')
    pre['AMT_CREDIT'] = pre['AMT_CREDIT'].where(pre['AMT_CREDIT'] > 0, np.nan)
    pre['AMT_ANNUITY'] = pre['AMT_ANNUITY'].where(pre['AMT_ANNUITY'] > 0,
                                                  np.nan)
    pre['AMT_APPLICATION'] = pre['AMT_APPLICATION'].where(
        pre['AMT_APPLICATION'] > 0, np.nan)
    pre['CNT_PAYMENT'] = pre['CNT_PAYMENT'].where(pre['CNT_PAYMENT'] > 0,
                                                  np.nan)
    pre['AMT_DOWN_PAYMENT'] = pre['AMT_DOWN_PAYMENT'].where(
        pre['AMT_DOWN_PAYMENT'] > 0, np.nan)
    pre['RATE_DOWN_PAYMENT'] = pre['RATE_DOWN_PAYMENT'].where(
        pre['RATE_DOWN_PAYMENT'] > 0, np.nan)

    pre['DAYS_FIRST_DRAWING'] = pre['DAYS_FIRST_DRAWING'].where(
        pre['DAYS_FIRST_DRAWING'] < 100000, np.nan)
    pre['DAYS_FIRST_DUE'] = pre['DAYS_FIRST_DUE'].where(
        pre['DAYS_FIRST_DUE'] < 100000, np.nan)
    pre['DAYS_LAST_DUE_1ST_VERSION'] = pre['DAYS_LAST_DUE_1ST_VERSION'].where(
        pre['DAYS_LAST_DUE_1ST_VERSION'] < 100000, np.nan)
    pre['DAYS_LAST_DUE'] = pre['DAYS_LAST_DUE'].where(
        pre['DAYS_LAST_DUE'] < 100000, np.nan)
    pre['DAYS_TERMINATION'] = pre['DAYS_TERMINATION'].where(
        pre['DAYS_TERMINATION'] < 100000, np.nan)
    #  pre['SELLERPLACE_AREA']          = pre['SELLERPLACE_AREA'].where(pre['SELLERPLACE_AREA']     <200, 200)

    ignore_list = [
        'SK_ID_CURR', 'SK_ID_PREV', 'NAME_CONTRACT_TYPE',
        'NAME_CONTRACT_STATUS'
    ]
    # revo: for Revolving loans, null out CNT_PAYMENT and the AMT_* columns
    #  for col in pre.columns:
    #      if col in ignore_list:
    #          logger.info(f'CONTINUE: {col}')
    #          continue
    #      pre[f'revo_{col}'] = pre[col].where(pre[f'NAME_CONTRACT_TYPE']==revo, np.nan)
    #      pre[col] = pre[col].where(pre[f'NAME_CONTRACT_TYPE']!=revo, np.nan)

    pre['NAME_TYPE_SUITE'].fillna('XNA', inplace=True)
    pre['PRODUCT_COMBINATION'].fillna('XNA', inplace=True)

    pre = utils.to_df_pkl(df=pre, path='../input', fname='clean_prev')
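
The `DAYS_*` columns in the Home Credit data use a large positive sentinel (365243) to mean "not applicable", which the `< 100000` guards above mask to NaN. A minimal standalone illustration of the `Series.where` pattern (the values here are made up):

import numpy as np
import pandas as pd

days = pd.Series([-1700, -42, 365243, -365])
# where() keeps values satisfying the condition and replaces the rest,
# so the 365243 sentinel becomes NaN.
print(days.where(days < 100000, np.nan))
# 0     -1700.0
# 1       -42.0
# 2         NaN
# 3      -365.0
# dtype: float64
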
Example #4
def clean_app(app):
    logger.info('''
    #==============================================================================
    # APPLICATION
    #==============================================================================''')

    app['CODE_GENDER'].replace('XNA', 'F', inplace=True)

    cat_cols = get_categorical_features(df=app, ignore_list=[])
    for col in cat_cols:
        app[col].fillna('XNA', inplace=True)

    # revo
    #  revo = 'Revolving loans'
    #  amt_list = ['AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE']
    #  for col in amt_list:
    #      app[f'revo_{col}'] = app[col].where(app[f'NAME_CONTRACT_TYPE']==revo, np.nan)
    #      app[col] = app[col].where(app[f'NAME_CONTRACT_TYPE']!=revo, np.nan)

    utils.to_df_pkl(df=app,
                    path='../input',
                    fname='clean_application_train_test')
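
`get_categorical_features` is another project helper that is not shown. A plausible minimal sketch, assuming it simply returns the object-dtype columns minus an ignore list (this is an assumption about its behavior, not the actual implementation):

def get_categorical_features(df, ignore_list):
    # Object-dtype columns are treated as categorical features.
    return [col for col in df.columns
            if df[col].dtype == 'object' and col not in ignore_list]
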
Example #5
def clean_bureau(bur):
    logger.info('''
    #==============================================================================
    # BUREAU CLEANSING
    #==============================================================================''')

    bur = utils.read_df_pkl(path='../input/bureau*.p')
    bur = bur[bur['CREDIT_CURRENCY'] == 'currency 1']
    bur['DAYS_CREDIT_ENDDATE'] = bur['DAYS_CREDIT_ENDDATE'].where(
        bur['DAYS_CREDIT_ENDDATE'] > -36000, np.nan)
    bur['DAYS_ENDDATE_FACT'] = bur['DAYS_ENDDATE_FACT'].where(
        bur['DAYS_ENDDATE_FACT'] > -36000, np.nan)
    bur['DAYS_CREDIT_UPDATE'] = bur['DAYS_CREDIT_UPDATE'].where(
        bur['DAYS_CREDIT_UPDATE'] > -36000, np.nan)
    bur = utils.to_df_pkl(df=bur, path='../input', fname='clean_bureau')
Example #6
def to_pkl():
    app_train = pd.read_csv('../input/application_train.csv')
    app_test = pd.read_csv('../input/application_test.csv')
    app = pd.concat([app_train, app_test], axis=0)
    utils.to_df_pkl(df=app, path='../input', fname='application_train_test')
    app_eda = eda.df_info(app)
    app_eda.to_csv('../eda/application_eda.csv')

    bur = pd.read_csv('../input/bureau.csv')
    utils.to_df_pkl(df=bur, path='../input', fname='bureau')
    bur_eda = eda.df_info(bur)
    bur_eda.to_csv('../eda/bureau_eda.csv')

    pre = pd.read_csv('../input/previous_application.csv')
    utils.to_df_pkl(df=pre, path='../input', fname='previous_application')
    pre_eda = eda.df_info(pre)
    pre_eda.to_csv('../eda/prev_eda.csv')

    ins = pd.read_csv('../input/installments_payments.csv')
    utils.to_df_pkl(df=ins, path='../input', fname='installments_payments')
    ins_eda = eda.df_info(ins)
    ins_eda.to_csv('../eda/install_eda.csv')

    ccb = pd.read_csv('../input/credit_card_balance.csv')
    utils.to_df_pkl(df=ccb, path='../input', fname='credit_card_balance')
    ccb_eda = eda.df_info(ccb)
    ccb_eda.to_csv('../eda/credit_eda.csv')

    pos = pd.read_csv('../input/POS_CASH_balance.csv')
    utils.to_df_pkl(df=pos, path='../input', fname='POS_CASH_balance')
    pos_eda = eda.df_info(pos)
    pos_eda.to_csv('../eda/pos_eda.csv')
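
`eda.df_info` is also project-specific. A hedged sketch of the kind of per-column summary it might produce (the exact summary columns are assumptions):

import pandas as pd

def df_info(df):
    # One row per column: dtype, missing-value count, and cardinality.
    return pd.DataFrame({
        'dtype': df.dtypes.astype(str),
        'null_cnt': df.isnull().sum(),
        'nunique': df.nunique(),
    })
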
Example #7
def clean_ins(ins):

    # Unclear why these are 0, and with only 290 such rows, just drop them
    ins = ins.query("AMT_INSTALMENT>0")

    utils.to_df_pkl(df=ins, path='../input', fname='clean_install')
Example #8
    print("Pad Sequences Start!!")
    train_word_sequences = pad_sequences(x_train, maxlen=max_length, padding='post')
    test_word_sequences = pad_sequences(x_test, maxlen=max_length, padding='post')
    # train_word_sequences = pad_sequences(train_word_sequences, maxlen=max_length, padding='post')
    # test_word_sequences = pad_sequences(test_word_sequences, maxlen=max_length, padding='post')

    pred_prob = np.zeros((len(test_word_sequences),), dtype=np.float32)
del x_train, x_test
gc.collect()

#========================================================================
# Concatenate the numeric features
is_num = 1
if is_num:
    train_word_sequences = np.hstack((train_word_sequences, num_train.values))
    test_word_sequences = np.hstack((test_word_sequences, num_test.values))

print(f"Train: {train_word_sequences.shape} | Test: {test_word_sequences.shape}")
print(train_word_sequences[:1])
print(test_word_sequences[:1])

# Train Test Set
tx_train = train_word_sequences.copy()
x_test = test_word_sequences.copy()
if is_save:
    utils.to_df_pkl(df=pd.DataFrame(tx_train), path='../input', fname=f'0306_MS_NLP_train_only_feat{tx_train.shape[1]}')
    utils.to_df_pkl(df=pd.DataFrame(x_test), path='../input', fname=f'0306_MS_NLP_test_only_feat{x_test.shape[1]}')
del train_word_sequences, test_word_sequences
gc.collect()
#========================================================================
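
`pad_sequences` here is the Keras preprocessing utility: with `padding='post'` it right-pads each integer sequence with zeros up to `maxlen`. A tiny self-contained illustration:

from keras.preprocessing.sequence import pad_sequences  # tensorflow.keras has the same utility

seqs = [[1, 2, 3], [4, 5]]
print(pad_sequences(seqs, maxlen=4, padding='post'))
# [[1 2 3 0]
#  [4 5 0 0]]
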
Example #9
            print(f"{cnt} :", len(df[f'ir_{cnt}@'].dropna()))
            if len(df[f'ir_{cnt}@'].dropna())<len(df)*0.001:
                df.drop(f'ir_{cnt}@', axis=1, inplace=True)
                continue
        else:
            ir = ((df[aan].values * df[cpy].values) / df[acr].values) - 1.0
            df['ir_pred@'] = ir
            df['ir_pred@'] = df['ir_pred@'].map(lambda x: x if 0.08 <= x <= 0.5 else np.nan)
            cnt = 'pred'

    ir_cols = [col for col in df.columns if 'ir_' in col]
    df['ir_mean'] = df[ir_cols].mean(axis=1)
    df['ir_max'] = df[ir_cols].max(axis=1)
    df['ir_min'] = df[ir_cols].min(axis=1)
    df['ir_std'] = df[ir_cols].std(axis=1)
    utils.to_df_pkl(df=df, path='../eda/', fname='1024_prev_ir')


# Predicted CNT_PAYMENT for the current application
df = utils.read_df_pkl('../input/clean_cpy*')
df['Pred_CPY_diff_Cal_CPY@'] = df['CNT_PAYMENT'].values - (df['AMT_CREDIT'].values / df['AMT_ANNUITY'].values)


if bureau:
    kb = 'SK_ID_BUREAU'
    bur = utils.read_df_pkl('../input/clean_bur*')[[key, kb]].groupby(key)[kb].max().reset_index()
    df = df.reset_index().merge(bur, on=key, how='left')
    df['bur_bin'] = 'train_no_bureau'
    df['bur_bin'] = df.where(df[kb].isnull(), np.nan)['bur_bin']

if standard:
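
The `ir` formula above treats (annuity × number of payments) / credit − 1 as an implied total interest rate for a previous loan, then keeps only plausible values in [0.08, 0.5]. A worked instance of the arithmetic (figures are made up):

amt_annuity = 10000.0
cnt_payment = 12
amt_credit = 110000.0

# Total amount repaid relative to the amount borrowed, minus 1.
ir = (amt_annuity * cnt_payment) / amt_credit - 1.0
print(round(ir, 4))  # 0.0909 -> kept, since 0.08 <= ir <= 0.5
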
Example #10
import re
import pandas as pd
from wordcloud import STOPWORDS


def quara_load_data():
    # read pickle
    train = utils.read_df_pkl(path='../input/train*.p')
    test = utils.read_df_pkl(path='../input/test*.p')
    return train, test


if is_pickle:
    train = pd.read_csv('../input/train.csv')
    test = pd.read_csv('../input/test.csv')
    utils.to_df_pkl(path='../input/', fname='train', df=train)
    utils.to_df_pkl(path='../input/', fname='test', df=test)

if is_base:
    train, test = quara_load_data()
    df = pd.concat([train, test], axis=0)
    utils.to_pkl_gzip(obj=df[['qid', 'target']], path='../input/base')
    sys.exit()


def cleansing_text(text, remove_stopwords=True, stem_words=False):

    # Convert words to lower case and split them
    text = re.sub("_", " ", text, flags=re.IGNORECASE)
    text = text.lower().split()
    regex_num = re.compile(u"[0-9０-９]")
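
The digit character class appears garbled in the source; `[0-9０-９]`, covering both ASCII and full-width digits, is the assumed intent. A quick check of that assumption:

import re

regex_num = re.compile(u"[0-9０-９]")
print(bool(regex_num.search("abc3")))   # True  (ASCII digit)
print(bool(regex_num.search("abc３")))  # True  (full-width digit)
print(bool(regex_num.search("abc")))    # False
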
Example #11
    train.drop(drop_list, axis=1, inplace=True)
    test.drop(drop_list, axis=1, inplace=True)

#========================================================================
# Train & Prediction Start
#========================================================================
LGBM = LGBM.cross_prediction(train=train,
                             test=test,
                             key=key,
                             target=target,
                             fold_type=fold_type,
                             fold=fold,
                             group_col_name=group_col_name,
                             params=params,
                             num_boost_round=num_boost_round,
                             early_stopping_rounds=early_stopping_rounds,
                             oof_flg=oof_flg)

cv_score = LGBM.cv_score
result = LGBM.prediction
cv_feim = LGBM.cv_feim
feature_num = len(LGBM.use_cols)

cv_feim.to_csv(
    f'../valid/{start_time[4:12]}_feim_feat{feature_num}_CV{cv_score}.csv',
    index=False)

test[target] = result.astype(np.int64)

utils.to_df_pkl(df=test, path='../input/', fname='clean_cpy_application')
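
`LGBM.cross_prediction` is a project wrapper around out-of-fold training and prediction. A minimal sketch of the general pattern with scikit-learn and LightGBM (all names and parameters here are illustrative, not the wrapper's actual API):

import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold

def cross_prediction(X, y, X_test, n_splits=5, params=None):
    # X, y, X_test are numpy arrays.
    oof = np.zeros(len(X))
    pred = np.zeros(len(X_test))
    for tr_idx, val_idx in KFold(n_splits, shuffle=True, random_state=0).split(X):
        model = lgb.LGBMRegressor(**(params or {}))
        model.fit(X[tr_idx], y[tr_idx])
        oof[val_idx] = model.predict(X[val_idx])  # out-of-fold predictions for the CV score
        pred += model.predict(X_test) / n_splits  # test predictions averaged over folds
    return oof, pred
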
Example #12
#========================================================================

#========================================================================
# Tokenize
start_time = time.time()
print("Transforming...")

tx_col = "text"

if is_make:
    ## Tokenize the sentences
    use_cols = [col for col in nlp_train.columns if col not in ignore_list]
    # nlp_train[tx_col] = nlp_train[use_cols].apply(lambda x: ' '.join([ str(tx) for tx in x]), axis=1)
    nlp_train[tx_col] = nlp_train[use_cols].apply(
        lambda x: ' '.join(x.values.tolist()), axis=1)
    utils.to_df_pkl(df=nlp_train[[key, tx_col, target]],
                    path='../input/',
                    fname=f'0305_MS_NLP_feat{len(use_cols)}')
else:
    base = utils.read_df_pkl(path='../input/base_Av*')
    #  len_train = base[~base[target].isnull()]
    nlp_train = utils.read_df_pkl(path='../input/0305_MS_NLP_feat*')
    nlp_train[target] = base[target].values

text_list = nlp_train[tx_col].values.tolist()

max_features = 10000
nb_words = max_features
max_length = 100
tokenizer = Tokenizer(num_words=max_features, split=" ")
tokenizer.fit_on_texts(text_list)
del text_list
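
After `fit_on_texts`, the tokenizer is typically applied with `texts_to_sequences` to produce the integer sequences that Example #8 then pads. A minimal illustration of the Keras Tokenizer round-trip:

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10, split=" ")
tokenizer.fit_on_texts(["cash loans approved", "revolving loans refused"])
print(tokenizer.texts_to_sequences(["cash loans refused"]))
# [[2, 1, 5]] -- indices depend on word frequency order
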
Example #13
        train.loc[train[col].isin(no_train_list), col] = major_val
        test.loc[test[col].isin(no_train_list), col] = major_val

        # Mark minor categories so they stay identifiable as minor
        train.loc[train[col].isin(less_than), col] = -1
        test.loc[test[col].isin(less_than), col] = -1

    print(f"{col} Complete!")

if is_save:
    no_test_idx_list = list(set(no_test_idx_list))
    print(f"All Train shape: {train.shape}")
    train = train.loc[~train[key].isin(no_test_idx_list), :]
    print(f"Exclude No Test Category Train shape: {train.shape}")
    df_feat = pd.concat([train, test], axis=0, ignore_index=True)

    if is_base:
        base = df_feat[[key, target, 'country_group']]
        utils.to_df_pkl(df=base, path='../input', fname='base_exclude_no_test')

    feat_cols = [
        col for col in train.columns
        if col not in ignore_list and 'country_' not in col
    ]
    df_feat = df_feat[feat_cols]
    feat_cols = [col.replace('f000_', '') for col in df_feat.columns]
    df_feat.columns = feat_cols

    MS_utils.save_feature(df_feat=df_feat, prefix=prefix)
    print(f"Feature Save Complete!!")