def aggregate():

    df = utils.get_dummies(cre)

    # keep only the one-hot columns derived from the categorical features
    li = []
    for c1 in df.columns:
        for c2 in col_cat:
            if c1.startswith(c2 + '_'):
                li.append(c1)
                break

    # aggregate each dummy column by its mean (share) and sum (count)
    cat_aggregations = {}
    for cat in li:
        cat_aggregations[cat] = ['mean', 'sum']

    df_agg = df.groupby(KEY).agg({
        **utils_agg.cre_num_aggregations,
        **cat_aggregations
    })
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    df_agg['CRE_COUNT'] = df.groupby(KEY).size()
    df_agg.reset_index(inplace=True)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
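A minimal, self-contained sketch of the groupby-and-flatten pattern used above, on a hypothetical toy frame (the key and column names are illustrative and do not come from utils_agg):

import pandas as pd

toy = pd.DataFrame({'SK_ID_CURR': [1, 1, 2],
                    'AMT_BALANCE': [10.0, 30.0, 5.0]})
agg = toy.groupby('SK_ID_CURR').agg({'AMT_BALANCE': ['mean', 'sum']})
# agg.columns is a MultiIndex of (column, stat) pairs; joining the two
# levels yields flat names such as 'AMT_BALANCE_mean' and 'AMT_BALANCE_sum'
agg.columns = pd.Index([e[0] + '_' + e[1] for e in agg.columns.tolist()])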
Example #2
def aggregate():

    df = utils.get_dummies(pos)

    li = []
    for c1 in df.columns:
        for c2 in col_cat:
            if c1.startswith(c2 + '_'):
                li.append(c1)
                break

    cat_aggregations = {}
    for cat in li:
        cat_aggregations[cat] = ['mean', 'sum']

    df_agg = df.groupby(KEY).agg({
        **utils_agg.pos_num_aggregations,
        **cat_aggregations
    })
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # std / mean: dispersion relative to the level (coefficient of variation)
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    # max / min: range-style ratio per aggregated column
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace(
                '_max', '_min')]
        except KeyError:
            # no matching '_min' aggregation for this column
            pass

    df_agg['POS_COUNT'] = df.groupby(KEY).size()
    df_agg.reset_index(inplace=True)

    utils.remove_feature(df_agg,
                         var_limit=0,
                         corr_limit=0.98,
                         sample_size=19999)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
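The std/mean and max/min blocks above derive ratio features from the aggregated columns. A hedged sketch of the same idea on a hypothetical aggregate frame (column names are illustrative, not taken from pos_num_aggregations):

import pandas as pd

df_agg = pd.DataFrame({'SK_DPD_mean': [1.0, 4.0],
                       'SK_DPD_std': [0.5, 2.0],
                       'SK_DPD_max': [2.0, 8.0],
                       'SK_DPD_min': [0.0, 1.0]})

# coefficient of variation: dispersion relative to the level
for c in [c for c in df_agg.columns if c.endswith('_std')]:
    df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

# range-style ratio; a zero minimum yields inf, as in the original code
for c in [c for c in df_agg.columns if c.endswith('_max')]:
    df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace('_max', '_min')]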
Example #3
def aggregate(args):
    print(args)
    k, v, prefix = args

    df = utils.get_dummies(bure[bure[k] == v])

    li = []
    for c1 in df.columns:
        for c2 in col_cat:
            if c1.startswith(c2 + '_'):
                li.append(c1)
                break

    cat_aggregations = {}
    for cat in li:
        cat_aggregations[cat] = ['mean', 'sum']

    df_agg = df.groupby(KEY).agg({
        **utils_agg.bure_num_aggregations,
        **cat_aggregations
    })
    df_agg.columns = pd.Index(
        [prefix + e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # std / mean
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    # max / min: ratio and difference per aggregated column
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace(
                '_max', '_min')]
            df_agg[f'{c}-m-min'] = df_agg[c] - df_agg[c.replace(
                '_max', '_min')]
        except KeyError:
            # no matching '_min' aggregation for this column
            pass

    df_agg[f'{prefix}BURE_COUNT'] = df.groupby(KEY).size()
    df_agg.reset_index(inplace=True)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
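The aggregate(args) in Example #3 takes a (column, value, prefix) tuple, which suggests it is meant to be fanned out over several bureau subsets with multiprocessing.Pool (imported in Example #6 below). A hypothetical driver, assuming bure, col_cat, utils_agg and the other module-level names are defined as in the surrounding snippets; the argument tuples are illustrative only:

from multiprocessing import Pool

argss = [('CREDIT_ACTIVE', 'Active', 'act_'),
         ('CREDIT_ACTIVE', 'Closed', 'cls_')]

with Pool(len(argss)) as pool:
    pool.map(aggregate, argss)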
def aggregate():

    df = utils.get_dummies(ins)

    df_agg = df.groupby(KEY).agg({**num_aggregations})
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    df_agg.reset_index(inplace=True)

    #    utils.remove_feature(df_agg, var_limit=0, corr_limit=0.98, sample_size=19999)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
Example #5
col_num = [
    'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT_ENDDATE',
    'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE', 'CNT_CREDIT_PROLONG',
    'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT',
    'AMT_CREDIT_SUM_OVERDUE', 'DAYS_CREDIT_UPDATE', 'AMT_ANNUITY'
]

col_cat = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']

col_group = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']

# =============================================================================
# feature
# =============================================================================
bureau = utils.read_pickles('../data/bureau')

bureau = bureau[bureau['DAYS_CREDIT_ENDDATE'].between(-365, 0)]
bureau = utils.get_dummies(bureau)

bureau.drop('SK_ID_BUREAU', axis=1, inplace=True)
gr = bureau.groupby(KEY)

train = utils.load_train([KEY])

test = utils.load_test([KEY])


def nunique(x):
    return len(set(x))


# =============================================================================
# gr1
Example #6
from multiprocessing import Pool
import utils
utils.start(__file__)
#==============================================================================

KEY = 'SK_ID_CURR'
PREF = 'prev_102'

# =============================================================================
# feature
# =============================================================================
prev = utils.read_pickles('../data/previous_application')

prev = prev[prev['NAME_CONTRACT_STATUS'] == 'Refused']

prev = utils.get_dummies(prev)
prev.columns = [c.replace('/', '') for c in prev.columns]
prev.drop('SK_ID_PREV', axis=1, inplace=True)

base = prev[[KEY]].drop_duplicates().set_index(KEY)

gr = prev.groupby(KEY)

train = utils.load_train([KEY])

test = utils.load_test([KEY])


def nunique(x):
    return len(set(x))

col_num = [
    'MONTHS_BALANCE', 'CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE', 'SK_DPD',
    'SK_DPD_DEF'
]

col_cat = ['NAME_CONTRACT_STATUS']

col_group = ['SK_ID_PREV', 'NAME_CONTRACT_STATUS']

# =============================================================================
# feature
# =============================================================================
pos = utils.read_pickles('../data/POS_CASH_balance')

pos = pos[pos['MONTHS_BALANCE'] > -12]

pos = utils.get_dummies(pos)
pos.drop('SK_ID_PREV', axis=1, inplace=True)

base = pos[[KEY]].drop_duplicates().set_index(KEY)

gr = pos.groupby(KEY)

train = utils.load_train([KEY])

test = utils.load_test([KEY])


def nunique(x):
    return len(set(x))

PREF = f'ins_{No}'

col_num = [
    'NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT',
    'DAYS_ENTRY_PAYMENT', 'AMT_INSTALMENT', 'AMT_PAYMENT'
]

col_group = ['SK_ID_PREV', 'NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER']

# =============================================================================
# feature
# =============================================================================
ins = utils.read_pickles('../data/installments_payments')
ins = ins[ins['DAYS_INSTALMENT'].between(-365, 0)]

ins = utils.get_dummies(ins)
ins.drop('SK_ID_PREV', axis=1, inplace=True)

base = ins[[KEY]].drop_duplicates().set_index(KEY)

gr = ins.groupby(KEY)

train = utils.load_train([KEY])

test = utils.load_test([KEY])


def nunique(x):
    return len(set(x))

KEY = 'SK_ID_CURR'
No = '301'
PREF = f'ins_{No}'
NTHREAD = 3

col_num = [
    'NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT',
    'DAYS_ENTRY_PAYMENT', 'AMT_INSTALMENT', 'AMT_PAYMENT'
]

col_group = ['SK_ID_PREV', 'NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER']

# =============================================================================
# feature
# =============================================================================
ins = utils.get_dummies(utils.read_pickles('../data/installments_payments'))
ins.drop('SK_ID_PREV', axis=1, inplace=True)

base = ins[[KEY]].drop_duplicates().set_index(KEY)

gr = ins.groupby(KEY)

train = utils.load_train([KEY])

test = utils.load_test([KEY])


def nunique(x):
    return len(set(x))

#           'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT',
#           'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY',
#           'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT',
#           'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE',
#           'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT',
#           'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT',
#           'CNT_INSTALMENT_MATURE_CUM', 'SK_DPD', 'SK_DPD_DEF']
#
#col_cat = ['CNT_DRAWINGS_OTHER_CURRENT', 'NAME_CONTRACT_STATUS']
#
#col_group = ['SK_ID_PREV', 'CNT_DRAWINGS_OTHER_CURRENT', 'NAME_CONTRACT_STATUS']

# =============================================================================
# feature
# =============================================================================
cre = utils.get_dummies(utils.read_pickles('../data/credit_card_balance'))
cre.drop('SK_ID_PREV', axis=1, inplace=True)

base = cre[[KEY]].drop_duplicates().set_index(KEY)


gr = cre.groupby(KEY)

train = utils.load_train([KEY])

test = utils.load_test([KEY])

def nunique(x):
    return len(set(x))

# =============================================================================
Example #11
KEY = 'SK_ID_CURR'
PREF = 'pos_201'

NTHREAD = 2

col_num = ['MONTHS_BALANCE', 'CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE',
           'SK_DPD', 'SK_DPD_DEF']

col_cat = ['NAME_CONTRACT_STATUS']

col_group = ['SK_ID_PREV', 'NAME_CONTRACT_STATUS']

# =============================================================================
# feature
# =============================================================================
pos = utils.get_dummies(utils.read_pickles('../data/POS_CASH_balance'))
pos.drop('SK_ID_PREV', axis=1, inplace=True)

base = pos[[KEY]].drop_duplicates().set_index(KEY)

gr = pos.groupby(KEY)

train = utils.load_train([KEY])

test = utils.load_test([KEY])



def nunique(x):
    return len(set(x))

col_num = [
    'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT_ENDDATE',
    'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE', 'CNT_CREDIT_PROLONG',
    'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT',
    'AMT_CREDIT_SUM_OVERDUE', 'DAYS_CREDIT_UPDATE', 'AMT_ANNUITY'
]

col_cat = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']

col_group = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']

# =============================================================================
# feature
# =============================================================================
bureau = utils.get_dummies(utils.read_pickles('../data/bureau'))

bureau.drop('SK_ID_BUREAU', axis=1, inplace=True)
gr = bureau.groupby(KEY)

train = utils.load_train([KEY])

test = utils.load_test([KEY])


def nunique(x):
    return len(set(x))


# =============================================================================
# gr1

def run_training():
    if torch.cuda.is_available():
        DEVICE = 'cuda'
    else:
        DEVICE = 'cpu'
    df_train = pd.read_csv(PATH + 'train_features.csv')
    targets = pd.read_csv(PATH + 'train_targets_scored.csv')
    # one-hot encode the categorical columns, keeping the returned frame
    df_train = utils.get_dummies(df_train, ['cp_type', 'cp_dose', 'cp_time'])
    sig_ids = df_train['sig_id']
    df_train.drop('sig_id', axis=1, inplace=True)
    targets.drop('sig_id', axis=1, inplace=True)

    # TODO use unscored data for training as well
    X_train, X_val, y_train, y_val = train_test_split(df_train.values,
                                                      targets.values,
                                                      test_size=0.3,
                                                      random_state=42)

    train_dataset = utils.ModelDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=4)

    val_dataset = utils.ModelDataset(X_val, y_val)
    val_loader = DataLoader(val_dataset, batch_size=1)

    model = utils.Model(X_train.shape[1], y_train.shape[1], num_layers,
                        hidden_size)
    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=30,
                                                gamma=0.1)

    engine = utils.Engine(model, optimizer, device=DEVICE)

    best_loss = np.inf
    early_stopping = 10
    early_stopping_counter = 0

    # TODO use optuna for trials
    for epoch in range(EPOCHS):
        train_loss = engine.train(train_loader)
        val_loss = engine.validate(val_loader)
        # StepLR advances on epoch count and does not take a metric
        scheduler.step()

        print(f'Epoch {epoch}, train_loss {train_loss}, val_loss {val_loss}')

        if val_loss < best_loss:
            best_loss = val_loss
            early_stopping_counter = 0  # reset the patience counter on improvement
            torch.save(model.state_dict(), '/models')
        else:
            early_stopping_counter += 1

        if early_stopping_counter > early_stopping:
            break

    print(f'best loss {best_loss}')
    return best_loss
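A hypothetical entry point for the training snippet above, assuming PATH, EPOCHS, BATCH_SIZE, num_layers and hidden_size are defined at module level:

if __name__ == '__main__':
    best = run_training()
    print(f'finished with best validation loss {best:.5f}')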