def aggregate(args):
    """Sum installment columns per (customer, prev loan, month), aggregate
    per customer, and write the result as train/test feature files."""
    path, pref = args

    base = utils.read_pickles(path, [KEY, 'SK_ID_PREV', 'month'] + COL)

    # Collapse to one row per (customer, previous loan, month) first.
    base = base.groupby([KEY, 'SK_ID_PREV', 'month'])[COL].sum().reset_index()

    agg = base.groupby(KEY).agg(dict(num_agg))
    # Flatten the (column, statistic) MultiIndex into flat names.
    agg.columns = pd.Index([f'{col}_{stat}' for col, stat in agg.columns])

    agg['INS_COUNT'] = base.groupby(KEY).size()
    agg = agg.add_prefix(pref).reset_index()

    # Drop zero-variance columns (estimated on a sample).
    utils.remove_feature(agg, var_limit=0, sample_size=19999)

    for frame, out_dir in ((train, '../feature/train'),
                           (test, '../feature/test')):
        merged = pd.merge(frame, agg, on=KEY, how='left').drop(KEY, axis=1)
        utils.to_feature(merged.add_prefix(PREF), out_dir)

    return
def multi_agg(args):
    """Aggregate installments for one contract type and emit train/test features."""
    path, pref, cont_type, cont_type_pref = args
    print(args)

    ins = utils.read_pickles(path)
    ins = ins[ins['DAYS_INSTALMENT'].between(day_start, day_end)]
    ins = pd.merge(ins, prev, on='SK_ID_PREV', how='left')
    gc.collect()
    del ins['SK_ID_CURR']

    # 'NA' selects the rows whose contract type is missing.
    if cont_type == 'NA':
        sub = ins[ins['NAME_CONTRACT_TYPE'].isnull()]
    else:
        sub = ins[ins['NAME_CONTRACT_TYPE'] == cont_type]

    agg = sub.groupby(KEY).agg(dict(utils_agg.ins_num_aggregations))
    agg.columns = pd.Index([f'{col}_{stat}' for col, stat in agg.columns])

    agg['INS_COUNT'] = sub.groupby(KEY).size()
    agg = agg.add_prefix(pref + cont_type_pref).reset_index()

    utils.remove_feature(agg, var_limit=0, sample_size=19999)

    for frame, out_dir in ((train, '../feature_prev/train'),
                           (test, '../feature_prev/test')):
        merged = pd.merge(frame, agg, on=KEY, how='left').drop(KEY, axis=1)
        utils.to_feature(merged.add_prefix(PREF), out_dir)

    return
# ----- Example #3 ("예제 #3" — scraped snippet separator; vote count: 0) -----
def aggregate(args):
    """Stats of gaps between consecutive payment days, per contract type."""
    path, cont_type, pref = args

    df = utils.read_pickles(path, [KEY, 'DAYS_ENTRY_PAYMENT'])
    in_window = df['DAYS_ENTRY_PAYMENT'].between(day_start, day_end)
    df = df[in_window].sort_values([KEY, 'DAYS_ENTRY_PAYMENT'])
    df = pd.merge(df, prev, on=KEY, how='left')
    gc.collect()

    if cont_type == 'NA':
        df = df[df['NAME_CONTRACT_TYPE'].isnull()]
    else:
        df = df[df['NAME_CONTRACT_TYPE'] == cont_type]

    # Day gap between successive payments of the same customer.
    df['DEP_diff'] = df.groupby(KEY).DAYS_ENTRY_PAYMENT.diff()
    stats = ['min', 'mean', 'max', 'var', 'nunique']
    feature = df.groupby(KEY).agg({'DEP_diff': stats})
    feature.columns = pd.Index([f'{col}_{stat}' for col, stat in feature.columns])
    feature.reset_index(inplace=True)

    utils.remove_feature(feature, var_limit=0, sample_size=19999)

    for frame, out_dir in ((train, '../feature_prev/train'),
                           (test, '../feature_prev/test')):
        merged = pd.merge(frame, feature, on=KEY, how='left').drop(KEY, axis=1)
        utils.to_feature(merged.add_prefix(PREF + pref), out_dir)

    return
# ----- Example #4 ("예제 #4" — scraped snippet separator; vote count: 0) -----
def plot_accuracy(X_train, Y_train, X_test, Y_test, model, title):
    """Plot accuracy obtained when excluding each feature in turn.

    Draws a horizontal baseline at the full-feature-set accuracy, then one
    point per feature index with that feature removed via
    utils.remove_feature.
    """
    n_features = X_train.shape[1]
    fig, ax = plt.subplots(figsize=(7, 6))
    ax.set_title(title)

    # Baseline: fit and score on the complete feature set.
    model.fit(X_train, Y_train)
    baseline_pred = model.predict(X_test)
    acc_for_all = accuracy_score(Y_test, baseline_pred)
    print(classification_report(Y_test, baseline_pred))
    print(confusion_matrix(Y_test, baseline_pred))
    ax.hlines(acc_for_all * 100, 0, n_features - 1,
              label='Full dataset accuracy = %.2f%%' % (acc_for_all * 100))

    # Re-fit once with each feature excluded.
    acc = []
    for idx in range(n_features):
        reduced_train = utils.remove_feature(X_train, idx)
        reduced_test = utils.remove_feature(X_test, idx)
        model.fit(reduced_train, Y_train)
        acc.append(accuracy_score(Y_test, model.predict(reduced_test)) * 100)

    positions = list(range(n_features))
    plt.xticks(positions, [p + 1 for p in positions])  # 1-based tick labels
    ax.plot(positions, acc)
    ax.set_xlabel('Excluded feature number')
    ax.set_ylabel('Accuracy')
    plt.legend(loc='upper right')
    plt.show()
def aggregate():
    """POS-cash aggregation: numeric plus one-hot categorical stats per customer."""
    df = utils.get_dummies(pos)

    # Dummy columns generated from the original categorical columns.
    dummy_cols = [c1 for c1 in df.columns
                  if any(c1.startswith(c2 + '_') for c2 in col_cat)]

    cat_aggregations = {c: ['mean', 'sum'] for c in dummy_cols}

    agg = df.groupby(KEY).agg({**utils_agg.pos_num_aggregations,
                               **cat_aggregations})
    agg.columns = pd.Index([f'{col}_{stat}' for col, stat in agg.columns])

    agg['POS_COUNT'] = df.groupby(KEY).size()
    agg.reset_index(inplace=True)

    utils.remove_feature(agg, var_limit=0, corr_limit=0.98, sample_size=19999)

    for frame, out_dir in ((train, '../feature_prev/train'),
                           (test, '../feature_prev/test')):
        merged = pd.merge(frame, agg, on=KEY, how='left').drop(KEY, axis=1)
        utils.to_feature(merged.add_prefix(PREF), out_dir)

    return
# ----- Example #6 ("예제 #6" — scraped snippet separator; vote count: 0) -----
def aggregate(args):
    """Two-level aggregation: stats per previous loan, then mean/var per customer."""
    path, pref = args

    df = utils.read_pickles(path)
    df = df[df['DAYS_INSTALMENT'].between(day_start, day_end)]
    key_map = df[[KEY, 'SK_ID_PREV']].drop_duplicates()

    # Level 1: aggregate installments within each previous loan.
    per_prev = df.groupby('SK_ID_PREV').agg(dict(utils_agg.ins_num_aggregations))
    per_prev.columns = pd.Index([f'{col}_{stat}' for col, stat in per_prev.columns])
    per_prev['INS_COUNT'] = df.groupby('SK_ID_PREV').size()
    per_prev = per_prev.add_prefix(pref).reset_index()

    utils.remove_feature(per_prev, var_limit=0, sample_size=19999)

    # Attach the customer key, then drop the loan id.
    per_prev = pd.merge(per_prev, key_map, on='SK_ID_PREV',
                        how='left').drop('SK_ID_PREV', axis=1)

    # Level 2: summarize the per-loan stats per customer.
    per_cust = per_prev.groupby(KEY).agg(['mean', 'var'])
    per_cust.columns = pd.Index([f'{col}_{stat}' for col, stat in per_cust.columns])
    per_cust.reset_index(inplace=True)

    for frame, out_dir in ((train, '../feature/train'),
                           (test, '../feature/test')):
        merged = pd.merge(frame, per_cust, on=KEY, how='left').drop(KEY, axis=1)
        utils.to_feature(merged.add_prefix(PREF), out_dir)

    return
def aggregate(args):
    """Aggregate previous applications filtered by column k == v.

    Adds std/mean and max/min ratio features on top of the base statistics.
    """
    print(args)
    k, v, prefix = args

    df = utils.get_dummies(prev[prev[k] == v])

    # Dummy columns generated from the original categorical columns.
    dummy_cols = [c1 for c1 in df.columns
                  if any(c1.startswith(c2 + '_') for c2 in col_cat)]
    cat_aggregations = {c: ['mean', 'sum'] for c in dummy_cols}

    agg = df.groupby('SK_ID_CURR').agg({**utils_agg.prev_num_aggregations,
                                        **cat_aggregations})
    agg.columns = pd.Index(
        [prefix + f'{col}_{stat}' for col, stat in agg.columns])

    # Dispersion relative to the mean.
    for c in [c for c in agg.columns if c.endswith('_std')]:
        agg[f'{c}-d-mean'] = agg[c] / agg[c.replace('_std', '_mean')]

    # Range ratio.
    for c in [c for c in agg.columns if c.endswith('_max')]:
        agg[f'{c}-d-min'] = agg[c] / agg[c.replace('_max', '_min')]

    agg[prefix + 'PREV_COUNT'] = df.groupby('SK_ID_CURR').size()
    agg.reset_index(inplace=True)

    utils.remove_feature(agg,
                         var_limit=0,
                         corr_limit=0.98,
                         sample_size=19999)

    for frame, out_dir in ((train, '../feature/train'),
                           (test, '../feature/test')):
        merged = pd.merge(frame, agg, on=KEY, how='left').drop(KEY, axis=1)
        utils.to_feature(merged.add_prefix(PREF), out_dir)

    return
# ----- Example #8 ("예제 #8" — scraped snippet separator; vote count: 0) -----
def multi_agg(args):
    """Aggregate installment rows for one contract type and write features.

    args: (path, pref, cont_type, cont_type_pref)
      path           -- pickle directory for the installments table
      pref           -- column prefix for aggregated statistics
      cont_type      -- 'NA' selects rows with missing NAME_CONTRACT_TYPE
      cont_type_pref -- extra prefix identifying the contract type
    """
    path, pref, cont_type, cont_type_pref = args
    print(args)

    ins = utils.read_pickles(path)
    ins = ins[ins['DAYS_INSTALMENT'].between(day_start, day_end)]
    ins = pd.merge(ins, prev, on='SK_ID_PREV', how='left')
    gc.collect()
    del ins['SK_ID_PREV']

    if cont_type == 'NA':
        df = ins[ins['NAME_CONTRACT_TYPE'].isnull()]
    else:
        df = ins[ins['NAME_CONTRACT_TYPE'] == cont_type]

    df_agg = df.groupby(KEY).agg({**utils_agg.ins_num_aggregations})
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # std / mean
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    # max / min: the matching '_min' column may not exist for every '_max'
    # column, so only that missing-column KeyError is tolerated.  The
    # original bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace(
                '_max', '_min')]
            df_agg[f'{c}-m-min'] = df_agg[c] - df_agg[c.replace(
                '_max', '_min')]
        except KeyError:
            pass

    df_agg['INS_COUNT'] = df.groupby(KEY).size()
    df_agg = df_agg.add_prefix(pref + cont_type_pref).reset_index()

    utils.remove_feature(df_agg, var_limit=0, sample_size=19999)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
def aggregate(args):
    """Aggregate all installments in the day window per customer.

    args: (path, pref) -- pickle directory and column prefix.
    Writes the merged features to ../feature/train and ../feature/test.
    """
    path, pref = args

    df = utils.read_pickles(path)
    df = df[df['DAYS_INSTALMENT'].between(day_start, day_end)]
    del df['SK_ID_PREV']

    df_agg = df.groupby(KEY).agg({**utils_agg.ins_num_aggregations})
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # std / mean
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    # max / min: the matching '_min' column may not exist for every '_max'
    # column, so only that missing-column KeyError is tolerated.  The
    # original bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace(
                '_max', '_min')]
            df_agg[f'{c}-m-min'] = df_agg[c] - df_agg[c.replace(
                '_max', '_min')]
        except KeyError:
            pass

    df_agg['INS_COUNT'] = df.groupby(KEY).size()
    df_agg = df_agg.add_prefix(pref).reset_index()

    utils.remove_feature(df_agg, var_limit=0, sample_size=19999)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
    feature[f'{c}_ratio'] = (dup[c] / gr[c].shift(1)).values
    feature[f'{c}_min'] = pd.concat([dup[c], gr[c].shift(1), gr[c].shift(2)],
                                    axis=1).min(1).values
    feature[f'{c}_max'] = pd.concat([dup[c], gr[c].shift(1), gr[c].shift(2)],
                                    axis=1).max(1).values
    feature[f'{c}_mean'] = pd.concat([dup[c], gr[c].shift(1), gr[c].shift(2)],
                                     axis=1).mean(1).values

#    feature[f'{c}_diff_r'] = gr[c].diff(-1).values
#    feature[f'{c}_ratio_r'] = ( dup[c] / gr[c].shift(-1) ).values
#    feature[f'{c}_min_r'] = pd.concat([ dup[c], gr[c].shift(-1), gr[c].shift(-2)], axis=1).min(1).values
#    feature[f'{c}_max_r'] = pd.concat([ dup[c], gr[c].shift(-1), gr[c].shift(-2)], axis=1).max(1).values
#    feature[f'{c}_mean_r'] = pd.concat([ dup[c], gr[c].shift(-1), gr[c].shift(-2)], axis=1).mean(1).values

# Drop rows where every feature is NaN, then prune degenerate columns.
feature.dropna(how='all', inplace=True)
utils.remove_feature(feature, var_limit=0, corr_limit=0.98, sample_size=19999)

train = utils.load_train([KEY])
test = utils.load_test([KEY])

# Re-attach the key column (presumably user ids via user_id — defined
# earlier in this script) so the feature frame can be merged.
feature.reset_index(inplace=True)
feature = pd.merge(feature, user_id, on=KEY, how='left')

# Left-join onto train/test and write the feature files.
tmp = pd.merge(train, feature, on=KEY, how='left').drop(KEY, axis=1)
utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

tmp = pd.merge(test, feature, on=KEY, how='left').drop(KEY, axis=1)
utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

# =============================================================================
# drop old user
# ----- Example #11 ("예제 #11" — scraped snippet separator; vote count: 0) -----
def test_fetures(X, Y):
    """Compare classifiers on the full feature set vs. leave-one-feature-out.

    For each model, prints the accuracy on all features, then one accuracy
    per removed feature (via utils.remove_feature(X, i)).

    Bug fixed: the original reused the leftover `tmp` (X minus the LAST
    feature) as the "full dataset" for every model after K-Neighbors; the
    baseline is now always evaluated on X itself.
    """
    def _evaluate(header, make_model):
        # One full-dataset fit, then one fit per excluded feature.
        print(header)
        X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.2,
                                                            random_state=42)
        model = make_model()
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        print("%.2f%%" % (accuracy_score(Y_test, pred) * 100))
        for i in range(X.shape[1]):
            reduced = utils.remove_feature(X, i)
            X_train, X_test, Y_train, Y_test = train_test_split(
                reduced, Y, test_size=0.2, random_state=42)
            model = make_model()
            model.fit(X_train, Y_train)
            pred = model.predict(X_test)
            print("%.2f%%" % (accuracy_score(Y_test, pred) * 100), end=' ')

    _evaluate('K-Neighbors', lambda: KNeighborsClassifier(n_neighbors=5))
    _evaluate('\nSVM', SVC)
    _evaluate('\nNaive Bayes Classifier', GaussianNB)
    _evaluate('\nDecision Tree', DecisionTreeClassifier)
    _evaluate('\nRandom Forest', RandomForestClassifier)
# ----- Example #12 ("예제 #12" — scraped snippet separator; vote count: 0) -----
def prepare_dataset(X_train, X_test):
    """Drop features 17 down to 6 from both splits via utils.remove_feature.

    Indices are removed in descending order so earlier removals do not shift
    the indices of later ones; all train removals happen before the test
    removals, matching the original call sequence.  Features 0-5 are kept
    on purpose (their removal was commented out in the original).
    """
    tmp_X_train = X_train
    tmp_X_test = X_test
    for idx in range(17, 5, -1):
        tmp_X_train = utils.remove_feature(tmp_X_train, idx)
    for idx in range(17, 5, -1):
        tmp_X_test = utils.remove_feature(tmp_X_test, idx)
    return tmp_X_train, tmp_X_test
# ----- Example #13 ("예제 #13" — scraped snippet separator; vote count: 0) -----
            #                         gr2.sum().add_suffix('_sum'),
        ],
        axis=1)
    return feature


# Column groups to aggregate (the col_* lists and mk_feature are defined
# earlier in this script, outside the visible chunk).
col_list = [
    col_delayed_day, col_delayed_money, col_delayed_money_ratio,
    col_not_delayed_day, col_not_delayed_money, col_not_delayed_money_ratio
]

# Build one feature frame per column group, then join them side by side.
feature = [mk_feature(col) for col in col_list]

feature = pd.concat(feature, axis=1)

# Prune degenerate columns in place (default thresholds).
utils.remove_feature(feature)
feature.reset_index(inplace=True)

# =============================================================================
# merge
# =============================================================================

train = utils.load_train([KEY])

test = utils.load_test([KEY])

# Left-join the features onto train/test and write the feature files.
train = pd.merge(train, feature, on=KEY, how='left').drop(KEY, axis=1)
utils.to_feature(train.add_prefix(PREF), '../feature/train')

test = pd.merge(test, feature, on=KEY, how='left').drop(KEY, axis=1)
utils.to_feature(test.add_prefix(PREF), '../feature/test')
from keras.models import Model
from keras.layers import Dense, Dropout, Activation, BatchNormalization, Input
from keras.layers import Average, Add, Maximum
from keras.callbacks import TensorBoard
from keras.utils import plot_model
from sklearn.model_selection import train_test_split
import numpy as np
import utils
import os

# Load the dataset and drop features 17..6.  Descending order keeps the
# lower indices stable while removing (the original spelled out the twelve
# calls one by one).
X, Y = utils.get_dataset('dataset_threshold_100_shift_05_2.txt')

for _idx in range(17, 5, -1):
    X = utils.remove_feature(X, _idx)

X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)
# Shape of a single sample and number of output classes for the keras model.
input_shape = X[0].shape
output_num = Y.shape[1]
# ----- Example #15 ("예제 #15" — scraped snippet separator; vote count: 0) -----
def aggregate(args):
    """Row-to-row diff / percent-change features per previous loan.

    args: (path, cont_type, pref).  Reads installment payment columns,
    filters by NAME_CONTRACT_TYPE ('NA' = missing), walks the rows in
    chronological order within each SK_ID_PREV to compute per-payment
    differences and percentage changes, aggregates them per KEY, and
    writes train/test feature files.
    """
    path, cont_type, pref = args

    df = utils.read_pickles(path,
                            [KEY, 'SK_ID_PREV', 'DAYS_ENTRY_PAYMENT'] + COL)
    #    df = df[df['DAYS_ENTRY_PAYMENT'].between(day_start, day_end)].sort_values([KEY, 'DAYS_ENTRY_PAYMENT'])
    df = pd.merge(df, prev, on='SK_ID_PREV', how='left')
    gc.collect()

    if cont_type == 'NA':
        df = df[df['NAME_CONTRACT_TYPE'].isnull()]
    else:
        df = df[df['NAME_CONTRACT_TYPE'] == cont_type]

    # Chronological order within each previous loan; the reset index lets
    # the per-column Series built below align with df by position.
    df.sort_values(['SK_ID_PREV', 'DAYS_ENTRY_PAYMENT'], inplace=True)
    df.reset_index(drop=True, inplace=True)

    li = []
    for c in COL:
        ret_diff = []
        ret_pctchng = []
        # key_bk / x_bk hold the previous row's loan id and value.
        key_bk = x_bk = None
        for key, x in df[['SK_ID_PREV', c]].values:

            if key_bk is None:
                # Very first row: no predecessor.
                ret_diff.append(None)
                ret_pctchng.append(None)
            else:
                if key_bk == key:
                    # Same loan as previous row: change vs. previous payment.
                    # NOTE: pct change here is (prev - cur) / prev, i.e. the
                    # NEGATIVE of pandas' pct_change convention.
                    ret_diff.append(x - x_bk)
                    ret_pctchng.append((x_bk - x) / x_bk)
                else:
                    # Loan boundary: restart the sequence.
                    ret_diff.append(None)
                    ret_pctchng.append(None)
            key_bk = key
            x_bk = x

        ret_diff = pd.Series(ret_diff, name=f'{c}_diff')
        ret_pctchng = pd.Series(ret_pctchng, name=f'{c}_pctchange')
        ret = pd.concat([ret_diff, ret_pctchng], axis=1)
        li.append(ret)
    callback = pd.concat(li, axis=1)
    col_ = callback.columns.tolist()
    # Positional alignment with df (both carry a fresh RangeIndex).
    callback[KEY] = df[KEY]

    # Aggregate every derived column with the same statistics.
    num_agg = {}
    for c in col_:
        num_agg[c] = ['min', 'mean', 'max', 'var']

    feature = callback.groupby(KEY).agg(num_agg)
    feature.columns = pd.Index(
        [e[0] + "_" + e[1] for e in feature.columns.tolist()])
    feature.reset_index(inplace=True)

    utils.remove_feature(feature, var_limit=0, sample_size=19999)

    tmp = pd.merge(train, feature, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF + pref), '../feature/train')

    tmp = pd.merge(test, feature, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF + pref), '../feature/test')

    return
# ----- Example #16 ("예제 #16" — scraped snippet separator; vote count: 0) -----
def get_part(X, n):
    """Return X with one of three predefined feature subsets removed.

    n == 1: drop features 17..6  (keep 0..5)
    n == 2: drop features 17..12 and 5..0 (keep 6..11)
    else  : drop features 11..0  (keep 12..17)

    Indices are removed in descending order within each run so earlier
    removals do not shift the indices of later ones — the exact call
    sequence of the original unrolled version.
    """
    if n == 1:
        to_drop = list(range(17, 5, -1))
    elif n == 2:
        to_drop = list(range(17, 11, -1)) + list(range(5, -1, -1))
    else:
        to_drop = list(range(11, -1, -1))

    tmp = X
    for idx in to_drop:
        tmp = utils.remove_feature(tmp, idx)
    return tmp
# ----- Example #17 ("예제 #17" — scraped snippet separator; vote count: 0) -----
        print(id_curr, c)
        raise
    tmp = pd.DataFrame.from_dict(di, orient='index').T
    tmp['SK_ID_CURR'] = id_curr
    return tmp.set_index('SK_ID_CURR')


# =============================================================================
# main
# =============================================================================
# Fan `multi` out over all ids in parallel; each call returns a one-row
# frame indexed by SK_ID_CURR (see the function defined above this chunk).
pool = Pool(NTHREAD)
callback = pool.map(multi, ids)
pool.close()

base = pd.concat(callback)
# Prune degenerate columns in place (default thresholds).
utils.remove_feature(base)
# =============================================================================
# merge
# =============================================================================
base.reset_index(inplace=True)
# Duplicate feature names would silently collide on merge — fail fast.
if base.columns.duplicated().sum() != 0:
    raise Exception(base.columns[base.columns.duplicated()])

train = utils.load_train([KEY])
train = pd.merge(train, base, on=KEY, how='left').drop(KEY, axis=1)
utils.to_feature(train.add_prefix(PREF), '../feature/train')

test = utils.load_test([KEY])
test = pd.merge(test, base, on=KEY, how='left').drop(KEY, axis=1)
utils.to_feature(test.add_prefix(PREF), '../feature/test')