# Пример #1 (Example #1) — scraped snippet separator; score: 0
                     free_raw_data=False)
# Free memory before the CV loop; dtrain was built just above this snippet.
gc.collect()

model_all = []    # fold models accumulated across loop iterations
nround_mean = 0   # running sum of best-iteration counts (averaged below)
wloss_list = []   # final weighted-loss CV score of each iteration
y_preds = []      # out-of-fold predictions of each iteration
for i in range(1):  # NOTE(review): single pass; loop kept so more seeds can be averaged
    gc.collect()
    param['seed'] = np.random.randint(9999)
    # LightGBM CV with custom weighted-loss objective/metric; 99999 rounds
    # effectively means "until early stopping" (patience 100).
    ret, models = lgb.cv(param, dtrain, 99999, nfold=NFOLD, 
                         fobj=utils_metric.wloss_objective, 
                         feval=utils_metric.wloss_metric,
                         early_stopping_rounds=100, verbose_eval=50,
                         seed=SEED)
    # Out-of-fold predictions from the fold models (fold split reproduced via SEED).
    y_pred = ex.eval_oob(X, y, models, SEED, stratified=True, shuffle=True, 
                         n_class=y.unique().shape[0])
    y_preds.append(y_pred)
    model_all += models
    nround_mean += len(ret['wloss-mean'])
    wloss_list.append( ret['wloss-mean'][-1] )

# Mean best round over the 1 iteration, inflated 1.3x as a margin for
# retraining without early stopping.
nround_mean = int((nround_mean/1) * 1.3)

result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}"
print(result)

# Feature importance: normalise split/gain counts to [0, 1] and combine.
imp = ex.getImp(model_all)
imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']
# best(gal)
# =============================================================================
# Galactic model: CV on the top-N features with group-aware folds.
N = best_N
#N = 250
dtrain = lgb.Dataset(X_gal[COL[:N]], y_gal, #categorical_feature=CAT, 
                     free_raw_data=False)
# 99999 rounds effectively means "until early stopping" (patience 100);
# folds come from group_kfold so objects of one group stay in one fold.
ret, models = lgb.cv(param, dtrain, 99999, nfold=NFOLD, 
                     fobj=utils_metric.wloss_objective_gal, 
                     feval=utils_metric.wloss_metric_gal,
                     folds=group_kfold.split(X_gal, y_gal, group_gal),
                     early_stopping_rounds=100, verbose_eval=50,
                     seed=SEED)

# Final weighted-loss CV score of this run.
score = ret['wloss-mean'][-1]

# BUG FIX: n_class was computed from y_exgal (copy-paste from the exgal
# section); the galactic OOF predictions must use the galactic label set.
y_pred_gal = ex.eval_oob(X_gal[COL[:N]], y_gal, models, SEED, stratified=True, shuffle=True, 
                           n_class=y_gal.unique().shape[0])

# =============================================================================
# cv(exgal)
# =============================================================================
# Extragalactic model: multiclass with 9 classes (per LightGBM's num_class).
print('==== EXGAL ====')
param['num_class'] = 9
param['learning_rate'] = 0.1

dtrain = lgb.Dataset(X_exgal, y_exgal, #categorical_feature=CAT, 
                     free_raw_data=False)
gc.collect()

# Accumulators for the CV loop that follows.
model_all = []
nround_mean = 0
wloss_list = []
        # NOTE(review): truncated snippet — this is the body of a loop whose
        # ``for`` header is missing from this chunk; indentation does not match
        # the surrounding lines and names such as y_preds/model_all/loss_list
        # are defined outside it.
        param['seed'] = np.random.randint(9999)

        # CV run; ``i`` offsets the seed so each loop uses a different split.
        ret, models = lgb.cv(param,
                             dtrain,
                             NROUND,
                             nfold=NFOLD,
                             stratified=True,
                             shuffle=True,
                             feval=ex.eval_auc,
                             early_stopping_rounds=ESR,
                             verbose_eval=VERBOSE_EVAL,
                             seed=SEED + i)

        # Out-of-fold predictions with the same seed offset as the CV split.
        y_pred = ex.eval_oob(X_train,
                             y_train.values,
                             models,
                             SEED + i,
                             stratified=True,
                             shuffle=True)
        y_preds.append(y_pred)

        model_all += models
        nround_mean += len(ret['auc-mean'])
        loss_list.append(ret['auc-mean'][-1])

    # Mean best round over LOOP iterations, padded 1.3x for final training.
    nround_mean = int((nround_mean / LOOP) * 1.3)

    # Normalised feature importance (split + gain), highest first.
    imp = ex.getImp(model_all)
    imp['split'] /= imp['split'].max()
    imp['gain'] /= imp['gain'].max()
    imp['total'] = imp['split'] + imp['gain']
    imp.sort_values('total', ascending=False, inplace=True)
def mk_submit(HEAD):
    """Train on the top-``HEAD`` features and write a gzip submission CSV.

    Loads the per-feature feather files selected by the module-level
    importance table ``imp``, runs ``LOOP`` rounds of LightGBM CV,
    rank-averages the out-of-fold and test predictions, and writes the
    submission to ``SUBMIT_FILE_PATH`` with 'feature' replaced by ``HEAD``.

    NOTE(review): depends on module-level globals (imp, param, LOOP, NFOLD,
    SUBMIT_FILE_PATH, utils, utils_cat, ex) — confirm they are initialised
    before calling.
    """

    SUBMIT_FILE_PATH_ = SUBMIT_FILE_PATH.replace('feature', str(HEAD))
    files_tr = ('../feature/train_' + imp.head(HEAD).feature + '.f').tolist()
    files_te = ('../feature/test_' + imp.head(HEAD).feature + '.f').tolist()

    # =============================================================================
    # load
    # =============================================================================
    # train: one feather file per feature, concatenated column-wise
    X_train = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)], axis=1)
    y_train = utils.read_pickles('../data/label').TARGET

    # Save the head of the training matrix next to the submission for audit.
    X_train.head().to_csv(SUBMIT_FILE_PATH_.replace('.csv', '_X.csv'),
                          index=False,
                          compression='gzip')

    # Guard against duplicated feature columns after the concat.
    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')

    gc.collect()

    # Categorical columns = intersection with the project-wide list.
    CAT = list(set(X_train.columns) & set(utils_cat.ALL))

    COL = X_train.columns.tolist()

    # test (reindexed to the training column order)
    X_test = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_te, mininterval=60)],
        axis=1)[COL]

    # =============================================================================
    # training with cv
    # =============================================================================
    dtrain = lgb.Dataset(X_train,
                         y_train,
                         categorical_feature=CAT,
                         free_raw_data=False)

    model_all = []
    y_pred = pd.Series(0, index=y_train.index)  # rank-sum of OOF predictions
    for i in range(LOOP):
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param,
                             dtrain,
                             9999,
                             nfold=NFOLD,
                             early_stopping_rounds=100,
                             verbose_eval=50,
                             seed=i)
        model_all += models
        # Rank-transform so predictions from different loops are scale-free.
        y_pred += ex.eval_oob(X_train, y_train, models, i).rank()
    y_pred /= y_pred.max()  # normalise accumulated ranks to (0, 1]

    auc_mean = roc_auc_score(y_train, y_pred)
    result = f"CV auc-mean(feature {HEAD}): {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')

    gc.collect()

    label_name = 'TARGET'

    # Rank-average the test predictions of every fold model.
    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH_, index=False, compression='gzip')
# Пример #5 (Example #5) — scraped snippet separator; score: 0
def mk_submit():
    """Train with CV on the loader-provided features and submit.

    Loads train/test matrices via the project ``loader``, drops the
    ``f702_*`` feature group, runs ``LOOP`` rounds of LightGBM CV with
    rank-averaged out-of-fold predictions, writes the gzip submission to
    ``SUBMIT_FILE_PATH`` and optionally submits it when ``EXE_SUBMIT``.

    NOTE(review): depends on module-level globals (loader, param, LOOP,
    NFOLD, SUBMIT_FILE_PATH, EXE_SUBMIT, COMMENT, utils, ex) — confirm
    they are initialised before calling.
    """

    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = loader.train()
    # Drop the f702_ feature group (excluded from this model).
    col = [c for c in X_train.columns if c.startswith('f702_')]
    X_train.drop(col, axis=1, inplace=True)

    y_train = utils.read_pickles('../data/label').TARGET

    # Save the head of the training matrix next to the submission for audit.
    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False,
                          compression='gzip')

    # Guard against duplicated feature columns.
    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')

    gc.collect()

    # Categorical columns = intersection with the loader's category list.
    CAT = list(set(X_train.columns) & set(loader.category()))
    print('CAT :', CAT)

    COL = X_train.columns.tolist()

    # test (reindexed to the training column order)
    X_test = loader.test()[COL]

    # =============================================================================
    # training with cv
    # =============================================================================
    dtrain = lgb.Dataset(X_train,
                         y_train,
                         categorical_feature=CAT,
                         free_raw_data=False)

    model_all = []
    y_pred = pd.Series(0, index=y_train.index)  # rank-sum of OOF predictions
    for i in range(LOOP):
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param,
                             dtrain,
                             9999,
                             nfold=NFOLD,
                             early_stopping_rounds=100,
                             verbose_eval=50,
                             seed=i)
        model_all += models
        # Rank-transform so predictions from different loops are scale-free.
        y_pred += ex.eval_oob(X_train, y_train, models, i).rank()

        # Progress report: AUC of the running (not yet normalised) rank-sum.
        auc_mean = roc_auc_score(y_train, y_pred)
        result = f"CV auc-mean(loop {i}): {auc_mean} {ret['auc-mean'][-1]}"
        print(result)
        utils.send_line(result)

    y_pred /= y_pred.max()  # normalise accumulated ranks to (0, 1]

    auc_mean = roc_auc_score(y_train, y_pred)
    result = f"CV auc-mean: {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')

    gc.collect()

    label_name = 'TARGET'

    # Rank-average the test predictions of every fold model.
    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
# Пример #6 (Example #6) — scraped snippet separator; score: 0
# Collect out-of-fold predictions over LOOP CV runs with random seeds.
# NOTE(review): model_all / nround_mean / wloss_list are initialised outside
# this fragment.
oofs = []
for i in range(LOOP):

    gc.collect()
    param['seed'] = np.random.randint(9999)
    # 99999 rounds effectively means "until early stopping" (patience 100).
    ret, models = lgb.cv(param,
                         dtrain,
                         99999,
                         nfold=NFOLD,
                         early_stopping_rounds=100,
                         verbose_eval=50,
                         seed=SEED)
    # Out-of-fold predictions (fold split reproduced via SEED).
    y_pred = ex.eval_oob(
        X_52_90,
        y_52_90.values,
        models,
        SEED,
        stratified=True,
        shuffle=True,
    )
    oofs.append(y_pred)
    model_all += models
    nround_mean += len(ret['auc-mean'])
    wloss_list.append(ret['auc-mean'][-1])

# BUG FIX(review): the best-round total is accumulated over LOOP iterations,
# but the original divided by a hard-coded 2 (matching other snippets where
# LOOP happened to be 2); divide by LOOP so the mean stays correct for any
# LOOP — confirm LOOP was indeed 2 here.
nround_mean = int((nround_mean / LOOP) * 1.3)
utils.send_line(f'nround_mean: {nround_mean}')

result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}"
utils.send_line(result)

for i, y_pred in enumerate(oofs):
# =============================================================================
# cv
# =============================================================================
dtrain = lgb.Dataset(X, y, categorical_feature=CAT)
gc.collect()

# Single CV run with externally supplied fold indices (``folds``).
ret, models = lgb.cv(param,
                     dtrain,
                     9999,
                     folds=folds,
                     early_stopping_rounds=100,
                     verbose_eval=50,
                     seed=SEED)

# Out-of-fold predictions from the fold models (split reproduced via SEED).
y_pred = ex.eval_oob(X, y, models, SEED)

result = f"CV auc-mean({COMMENT}): {ret['auc-mean'][-1]} + {ret['auc-stdv'][-1]}"
print(result)
utils.send_line(result)

# Feature importance: normalise split/gain to [0, 1], combine, sort, save.
imp = ex.getImp(models)

imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']

imp.sort_values('total', ascending=False, inplace=True)
imp.reset_index(drop=True, inplace=True)

imp.to_csv('LOG/imp_f021.csv', index=False)
# Пример #8 (Example #8) — scraped snippet separator; score: 0
        # NOTE(review): truncated snippet — the ``ret, models = lgb.cv(``
        # opener for this argument list is missing from this chunk.
        param,
        dtrain,
        NROUND,
        #                          nfold=NFOLD,
        # Group-aware folds so rows of one group never straddle folds.
        folds=group_kfold.split(X_train_, y_train_, group),
        stratified=True,
        shuffle=True,
        feval=ex.eval_auc,
        early_stopping_rounds=ESR,
        verbose_eval=VERBOSE_EVAL,
        seed=SEED + i)

    # Out-of-fold predictions; a fresh group_kfold generator is passed because
    # the first one was consumed by lgb.cv above.
    y_pred = ex.eval_oob(X_train_,
                         y_train_.values,
                         models,
                         SEED + i,
                         folds=group_kfold.split(X_train_, y_train_, group),
                         stratified=True,
                         shuffle=True)
    y_preds.append(y_pred)

    model_all += models
    nround_mean += len(ret['auc-mean'])
    loss_list.append(ret['auc-mean'][-1])

# Mean best round over LOOP iterations, padded 1.3x for final training.
nround_mean = int((nround_mean / LOOP) * 1.3)

# =============================================================================
# test
# =============================================================================
    # NOTE(review): truncated snippet — loop body without its ``for i in …``
    # header; i / X_train / col / model_all / loss_list come from outside.
    param['seed'] = np.random.randint(9999)

    ret, models = lgb.cv(param,
                         dtrain,
                         NROUND,
                         nfold=NFOLD,
                         stratified=True,
                         shuffle=True,
                         feval=ex.eval_auc,
                         early_stopping_rounds=ESR,
                         verbose_eval=VERBOSE_EVAL,
                         seed=SEED + i)

    # Out-of-fold predictions on the selected columns, same seed offset.
    p_train = ex.eval_oob(X_train[col],
                          y_train.values,
                          models,
                          SEED + i,
                          stratified=True,
                          shuffle=True)

    model_all += models
    nround_mean += len(ret['auc-mean'])
    loss_list.append(ret['auc-mean'][-1])

    utils.send_line(
        f'oof AUC({i}): {round(roc_auc_score(y_train, p_train), 5)}')

#==============================================================================
# End-of-script housekeeping (notification); instance stop left disabled.
utils.end(__file__)
#utils.stop_instance()
# =============================================================================
# cv
# =============================================================================
dtrain = lgb.Dataset(X, y)
gc.collect()

# Plain 7-fold CV; 99999 rounds effectively means "until early stopping"
# (patience 100). Seed is fixed to 111 for reproducibility.
ret, models = lgb.cv(param,
                     dtrain,
                     99999,
                     nfold=7,
                     early_stopping_rounds=100,
                     verbose_eval=50,
                     seed=111)

# Out-of-fold predictions using the same seed as the CV split.
y_pred = ex.eval_oob(X, y, models, 111)

result = f"CV auc-mean: {ret['auc-mean'][-1]} + {ret['auc-stdv'][-1]}"
print(result)
utils.send_line(result)

imp = ex.getImp(models)
# =============================================================================
# cv loop
# =============================================================================
from sklearn.metrics import roc_auc_score

dtrain = lgb.Dataset(X, y, free_raw_data=False)
gc.collect()
# =============================================================================
# cv loop
# =============================================================================

# Rebuild the dataset on the reduced feature matrix with categoricals.
dtrain = lgb.Dataset(X_new, y, categorical_feature=CAT, free_raw_data=False)
gc.collect()

# Rank-sum of out-of-fold predictions, accumulated over 5 seeded runs.
y_pred = pd.Series(0, index=y.index)

for seed in range(5):
    param['seed'] = seed
    ret, models = lgb.cv(param, dtrain, 99999,
                         nfold=7,
                         seed=seed,
                         early_stopping_rounds=100,
                         verbose_eval=50)
    # Rank-transform each run's OOF predictions so the sums are scale-free.
    y_pred += ex.eval_oob(X_new, y, models, seed).rank()

# Normalise the accumulated ranks to (0, 1].
y_pred /= y_pred.max()

auc_mean = roc_auc_score(y, y_pred)
result = f"CV auc-mean(ext imp): {auc_mean}"
print(result)
utils.send_line(result)

#==============================================================================
utils.end(__file__)
#utils.stop_instance()



def mk_submit():
    """Train with CV on the listed features, save OOF, and submit.

    Loads the per-feature feather files in ``features``, removes rows whose
    ``SK_ID_CURR`` is in ``drop_ids`` (old users), runs ``LOOP`` rounds of
    LightGBM CV with rank-averaged out-of-fold predictions, saves the OOF
    table, writes the gzip submission to ``SUBMIT_FILE_PATH`` and optionally
    submits it when ``EXE_SUBMIT``.

    NOTE(review): depends on module-level globals (features, SK_ID_CURR,
    drop_ids, param, LOOP, NFOLD, SUBMIT_FILE_PATH, EXE_SUBMIT, COMMENT,
    utils, utils_cat, ex) — confirm they are initialised before calling.
    """

    files_tr = ('../feature/train_' + features + '.f').tolist()
    files_te = ('../feature/test_' + features + '.f').tolist()

    # =============================================================================
    # load
    # =============================================================================
    # train: one feather file per feature, concatenated column-wise
    X_train = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)], axis=1)
    y_train = utils.read_pickles('../data/label').TARGET

    # =============================================================================
    # remove old users
    # =============================================================================
    X_train['SK_ID_CURR'] = SK_ID_CURR

    # Filter the label first, then the features, with the same mask.
    y_train = y_train[~X_train.SK_ID_CURR.isin(drop_ids)]
    X_train = X_train[~X_train.SK_ID_CURR.isin(drop_ids)]
    oof_train = X_train[['SK_ID_CURR']]  # id frame to attach OOF scores to
    X_train.drop('SK_ID_CURR', axis=1, inplace=True)

    # Save the head of the training matrix next to the submission for audit.
    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False,
                          compression='gzip')

    # Guard against duplicated feature columns after the concat.
    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')

    gc.collect()

    # Categorical columns = intersection with the project-wide list.
    CAT = list(set(X_train.columns) & set(utils_cat.ALL))

    COL = X_train.columns.tolist()

    # test (reindexed to the training column order)
    X_test = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_te, mininterval=60)],
        axis=1)[COL]

    # =============================================================================
    # training with cv
    # =============================================================================
    dtrain = lgb.Dataset(X_train,
                         y_train,
                         categorical_feature=CAT,
                         free_raw_data=False)

    model_all = []
    y_pred = pd.Series(0, index=y_train.index)  # rank-sum of OOF predictions
    for i in range(LOOP):
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param,
                             dtrain,
                             9999,
                             nfold=NFOLD,
                             early_stopping_rounds=100,
                             verbose_eval=50,
                             seed=i)
        model_all += models
        # Rank-transform so predictions from different loops are scale-free.
        y_pred += ex.eval_oob(X_train, y_train, models, i).rank()

        # Progress report: AUC of the running (not yet normalised) rank-sum.
        auc_mean = roc_auc_score(y_train, y_pred)
        result = f"CV auc-mean(loop {i}): {auc_mean}"
        print(result)
        utils.send_line(result)

    y_pred /= y_pred.max()  # normalise accumulated ranks to (0, 1]

    auc_mean = roc_auc_score(y_train, y_pred)
    result = f"CV auc-mean: {auc_mean}"
    print(result)
    utils.send_line(result)

    # save out-of-fold predictions for later stacking
    oof_train['oof'] = y_pred
    oof_train.to_csv('../output/onodera-last-oof.csv', index=False)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')

    gc.collect()

    label_name = 'TARGET'

    # Rank-average the test predictions of every fold model.
    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
# =============================================================================
# cv
# =============================================================================
dtrain = lgb.Dataset(X, y, categorical_feature=CAT)
gc.collect()

# Single CV run with externally supplied fold indices (``folds``).
ret, models = lgb.cv(param,
                     dtrain,
                     9999,
                     folds=folds,
                     early_stopping_rounds=100,
                     verbose_eval=50,
                     seed=SEED)

# Out-of-fold predictions from the fold models (split reproduced via SEED).
y_pred = ex.eval_oob(X, y, models, SEED)

result = f"CV auc-mean({COMMENT}): {ret['auc-mean'][-1]} + {ret['auc-stdv'][-1]}"
print(result)
utils.send_line(result)

# Feature importance: normalise split/gain to [0, 1], combine, sort.
imp = ex.getImp(models)

imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']

imp.sort_values('total', ascending=False, inplace=True)
imp.reset_index(drop=True, inplace=True)

# =============================================================================
# Пример #14 (Example #14) — scraped snippet separator; score: 0
    # NOTE(review): truncated snippet — loop body whose ``for`` header is
    # missing; oofs_gal / nround_mean / wloss_list are defined outside it.
    gc.collect()
    param['seed'] = np.random.randint(9999)
    # Group-aware CV with the galactic weighted-loss objective/metric.
    ret, models = lgb.cv(param,
                         dtrain,
                         99999,
                         nfold=NFOLD,
                         fobj=utils_metric.wloss_objective_gal,
                         feval=utils_metric.wloss_metric_gal,
                         early_stopping_rounds=100,
                         verbose_eval=50,
                         folds=group_kfold.split(X_gal, y_gal, group_gal),
                         seed=SEED)
    # NOTE(review): n_class=True looks odd — other snippets pass an int
    # (class count); presumably ex.eval_oob also accepts a flag. TODO confirm.
    oof = ex.eval_oob(X_gal,
                      y_gal.values,
                      models,
                      SEED,
                      stratified=True,
                      shuffle=True,
                      n_class=True)
    oofs_gal.append(oof)
    #    model_all += models
    nround_mean += len(ret['wloss-mean'])
    wloss_list.append(ret['wloss-mean'][-1])

# Mean best round (hard-coded divisor 2 — presumably the loop count here),
# padded 1.3x for final training.
nround_mean = int((nround_mean / 2) * 1.3)
utils.send_line(f'nround_mean: {nround_mean}')

result = f"CV GAL wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}"
utils.send_line(result)
# =============================================================================
    # NOTE(review): truncated snippet — loop body whose ``for i in …`` header
    # is missing; feature_set / y_preds / model_all etc. come from outside.
    gc.collect()
    param['seed'] = np.random.randint(9999)
    ret, models = lgb.cv(param,
                         dtrain,
                         99999,
                         nfold=NFOLD,
                         fobj=utils_metric.wloss_objective,
                         feval=utils_metric.wloss_metric,
                         early_stopping_rounds=100,
                         verbose_eval=50,
                         seed=SEED)
    # OOF predictions on the i-th feature subset; n_class=True presumably a
    # flag mode of ex.eval_oob (an int is passed elsewhere) — TODO confirm.
    y_pred = ex.eval_oob(X[feature_set[i]],
                         y.values,
                         models,
                         SEED,
                         stratified=True,
                         shuffle=True,
                         n_class=True)
    y_preds.append(y_pred)
    model_all += models
    nround_mean += len(ret['wloss-mean'])
    wloss_list.append(ret['wloss-mean'][-1])

# Mean best round over MOD_N iterations, padded 1.3x for final training.
nround_mean = int((nround_mean / MOD_N) * 1.3)
utils.send_line(f'nround_mean: {nround_mean}')

result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}"
print(result)
utils.send_line(result)
# Пример #16 (Example #16) — scraped snippet separator; score: 0
dtrain = lgb.Dataset(X, y, categorical_feature=CAT, free_raw_data=False)

model_all = []
y_pred = pd.Series(0, index=y.index)  # sum of raw OOF predictions over loops
for i in range(LOOP):
    gc.collect()
    param['seed'] = i
    ret, models = lgb.cv(param,
                         dtrain,
                         9999,
                         nfold=NFOLD,
                         early_stopping_rounds=100,
                         verbose_eval=50,
                         seed=i)
    model_all += models
    # Raw (un-ranked) predictions are summed here, unlike the rank-averaged
    # variants elsewhere in this file.
    y_pred += ex.eval_oob(X, y, models, i)

    # Progress report: AUC of the running (un-averaged) sum plus fold CV AUC.
    auc_mean = roc_auc_score(y, y_pred)
    result = f"CV auc-mean(loop {i}): {auc_mean} {ret['auc-mean'][-1]}"
    print(result)
    utils.send_line(result)

# Average the summed predictions over the LOOP runs.
y_pred /= LOOP

auc_mean = roc_auc_score(y, y_pred)
result = f"CV auc-mean: {auc_mean}"
print(result)
utils.send_line(result)

# Package the averaged OOF prediction as a single-column frame.
y_pred.name = 'f190_adv'
y_pred = y_pred.to_frame()
# Пример #17 (Example #17) — scraped snippet separator; score: 0
    # NOTE(review): truncated snippet — loop body whose ``for`` header is
    # missing; COL / y_preds / model_all / wloss_list come from outside.
    gc.collect()
    param['seed'] = np.random.randint(9999)
    ret, models = lgb.cv(param,
                         dtrain,
                         99999,
                         nfold=NFOLD,
                         fobj=utils_metric.wloss_objective,
                         feval=utils_metric.wloss_metric,
                         early_stopping_rounds=100,
                         verbose_eval=50,
                         seed=SEED)
    # OOF predictions; n_class=True presumably a flag mode of ex.eval_oob
    # (an int class count is passed elsewhere) — TODO confirm.
    y_pred = ex.eval_oob(X[COL],
                         y.values,
                         models,
                         SEED,
                         stratified=True,
                         shuffle=True,
                         n_class=True)
    y_preds.append(y_pred)
    model_all += models
    nround_mean += len(ret['wloss-mean'])
    wloss_list.append(ret['wloss-mean'][-1])

# Mean best round (hard-coded divisor 2 — presumably the loop count here),
# padded 1.3x for final training.
nround_mean = int((nround_mean / 2) * 1.3)
utils.send_line(f'nround_mean: {nround_mean}')

result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}"
print(result)
utils.send_line(result)
# Пример #18 (Example #18) — scraped snippet separator; score: 0
# CV loop: accumulate models, best-round counts, and per-loop AUC scores.
gc.collect()

model_all = []
nround_mean = 0   # running sum of best-iteration counts (averaged below)
auc_list = []     # final CV AUC of each loop iteration
for i in range(LOOP):
    gc.collect()
    param['seed'] = np.random.randint(9999)
    # 99999 rounds effectively means "until early stopping" (patience 100).
    ret, models = lgb.cv(param,
                         dtrain,
                         99999,
                         nfold=NFOLD,
                         early_stopping_rounds=100,
                         verbose_eval=50,
                         seed=SEED)
    # Out-of-fold predictions (fold split reproduced via SEED).
    # NOTE(review): y_pred is computed but never used in this fragment.
    y_pred = ex.eval_oob(X, y, models, SEED, stratified=True, shuffle=True)

    model_all += models
    nround_mean += len(ret['auc-mean'])
    auc_list.append(ret['auc-mean'][-1])

# Mean best round over LOOP iterations, padded 1.3x for final training.
nround_mean = int((nround_mean / LOOP) * 1.3)

# BUG FIX: the original reported np.mean/np.std of the scalar ``nround_mean``
# (always the value itself and 0.0) and labelled the AUC metric "wloss";
# report mean/std of the collected per-loop AUC scores instead.
result = f"CV auc: {np.mean(auc_list)} + {np.std(auc_list)}"
print(result)

utils.send_line(result)
# Feature importance: normalise split/gain to [0, 1] and combine.
imp = ex.getImp(model_all)
imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']