Exemplo n.º 1
0
def get_imp(shuffle=True, loop=1):
    """Run repeated LightGBM CV and return normalized feature importance.

    Reads the module-level ``X``, ``y``, ``param``, ``NFOLD`` and ``SEED``.

    Args:
        shuffle: When true, the labels are randomly permuted with
            ``y.sample(frac=1)`` before the dataset is built, so the models
            are fitted against shuffled targets. When false, ``y`` is used
            unchanged.
        loop: Number of ``lgb.cv`` runs. The models returned by every run are
            pooled before the importance is computed.

    Returns:
        The frame produced by ``ex.getImp`` for the pooled models, indexed by
        ``feature`` and sorted by ``total`` in descending order. ``split`` and
        ``gain`` are each divided by their column maximum; ``total`` is their
        sum.
    """
    # Both modes build the same dataset; only the label vector differs.
    labels = y.sample(frac=1).values if shuffle else y.values
    dtrain = lgb.Dataset(X, labels, free_raw_data=False)
    gc.collect()

    model_all = []
    for i in range(loop):
        gc.collect()
        # NOTE: this writes into the module-level `param` dict, so the last
        # random seed drawn here stays visible to later users of `param`.
        param['seed'] = np.random.randint(9999)
        # Only the fold models are needed here. The evaluation history is
        # discarded, so no particular metric key has to be present in it.
        _, models = lgb.cv(param,
                           dtrain,
                           99999,
                           nfold=NFOLD,
                           fobj=utils_metric.wloss_objective,
                           feval=utils_metric.wloss_metric,
                           early_stopping_rounds=100,
                           verbose_eval=200,
                           seed=SEED + i)
        model_all += models

    imp = ex.getImp(model_all)
    imp['split'] /= imp['split'].max()
    imp['gain'] /= imp['gain'].max()
    imp['total'] = imp['split'] + imp['gain']

    imp.sort_values('total', ascending=False, inplace=True)

    # set_index replaces whatever index the frame has, so no intermediate
    # reset_index is required.
    return imp.set_index('feature')
# Categorical columns: names present in X that are also listed in utils_cat.ALL.
CAT = list(set(X.columns) & set(utils_cat.ALL))

# Abort before training when any column name occurs more than once.
if X.columns.duplicated().any():
    raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }')
print('no dup :) ')
print(f'X.shape {X.shape}')

gc.collect()

# =============================================================================
# imp
# =============================================================================
# Fit a single 1000-round model on all columns and rank the features by gain
# (ties broken by feature name).
dtrain = lgb.Dataset(X, y, categorical_feature=CAT)
model = lgb.train(params, dtrain, 1000)
imp = ex.getImp(model).sort_values(
    by=['gain', 'feature'],
    ascending=[False, True],
)

# Search space: every feature with a non-zero split count, in ranked order.
# The first 20 of them form the starting set.
features_search = imp.loc[imp['split'] > 0, 'feature'].tolist()
features_curr = features_search[:20]

# =============================================================================
# stepwise
# =============================================================================

ex.stepwise(
    params,
    X,
    y,
    features_search,
    features_curr,
    best_score=0,
    send_line=utils.send_line,
    shuffle=True,
    feval=ex.eval_auc,
    early_stopping_rounds=ESR,
    verbose_eval=VERBOSE_EVAL,
    categorical_feature=CAT,
    seed=SEED,
)

# Persist every model in `models` as ../data/lgb<i>.model.
# NOTE(review): `models` is not assigned anywhere in the visible code above;
# presumably it comes from a CV run in the part of the script that is missing.
for i, model in enumerate(models):
    model.save_model(f'../data/lgb{i}.model')

# Disabled alternative: rebuild `models` by loading the saved boosters from
# disk instead of reusing the in-memory objects.
#models = []
#for i in range(LOOP):
#    model = lgb.Booster(model_file=f'../data/lgb{i}.model')
#    models.append(model)

# Importance over all models: split and gain are each divided by their column
# maximum and summed into `total`, then the frame is ordered by `total`
# (descending) and given a fresh positional index.
imp = ex.getImp(models)
imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']
imp.sort_values('total', ascending=False, inplace=True)
imp.reset_index(drop=True, inplace=True)

# Write the table and a plot of `total` under LOG/, with this script's
# __file__ value embedded in the file names.
imp.to_csv(f'LOG/imp_{__file__}.csv', index=False)
utils.savefig_imp(imp, f'LOG/imp_{__file__}.png', x='total')

# Record the run settings and the CV outcome. The length of the 'auc-mean'
# history is stored as the round count and its last entry as the AUC.
# NOTE(review): `ret` and `RESULT_DICT` are defined outside the visible code,
# and this section reads `param` while the training code above uses
# `params` — confirm both names exist.
RESULT_DICT['nfold'] = NFOLD
RESULT_DICT['seed'] = SEED
RESULT_DICT['eta'] = param['learning_rate']
RESULT_DICT['best NROUND'] = len(ret['auc-mean'])
RESULT_DICT['train AUC'] = ret['auc-mean'][-1]
Exemplo n.º 4
0
                         feval=utils_metric.wloss_metric,
                         early_stopping_rounds=100, verbose_eval=50,
                         seed=SEED)
    y_pred = ex.eval_oob(X, y, models, SEED, stratified=True, shuffle=True, 
                         n_class=y.unique().shape[0])
    y_preds.append(y_pred)
    model_all += models
    nround_mean += len(ret['wloss-mean'])
    wloss_list.append( ret['wloss-mean'][-1] )

# Scale the accumulated round count by 1.3 and truncate it to an int; the
# division by 1 leaves the value unchanged.
# NOTE(review): the divisor presumably stands for the number of CV
# repetitions — confirm before changing the loop count.
nround_mean = int((nround_mean/1) * 1.3)

# Summarize the collected final wloss values as "mean + standard deviation".
result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}"
print(result)

# Importance over all pooled models: split and gain are each divided by their
# column maximum and summed into `total`.
imp = ex.getImp(model_all)
imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']

# Order by `total` (descending) and replace the index with a positional one.
imp.sort_values('total', ascending=False, inplace=True)
imp.reset_index(drop=True, inplace=True)


# Write the table under LOG/, with this script's __file__ value in the name.
imp.to_csv(f'LOG/imp_{__file__}.csv', index=False)

# Save the plot of `total`, then pass the summary string together with the
# image path to utils.send_line.
png = f'LOG/imp_{__file__}.png'
utils.savefig_imp(imp, png, x='total', title=f'{__file__}')
utils.send_line(result, png)

for i,y_pred in enumerate(y_preds):