Пример #1
0
def fit_predict(X, y, X_pred):
    """Fit a stacked + bagged LightGBM ensemble and predict on ``X_pred``.

    Stage 1 builds an out-of-fold prediction column via stratified K-fold
    stacking and appends it to the feature matrix; stage 2 trains
    ``bagging_num`` models on random train/test splits of the augmented
    matrix and averages their predictions.

    Parameters
    ----------
    X : pandas.DataFrame of training features.
    y : pandas.Series of targets (binary/stratifiable).
    X_pred : 2-D array-like with the same columns as ``X``.

    Returns
    -------
    tuple
        (averaged bagging predictions for ``X_pred``, feature-importance
        Series from the last bagging model).

    Notes
    -----
    Relies on module-level names being in scope: ``param`` (LightGBM
    parameter dict), ``lgb``, ``np``, ``pd``, ``StratifiedKFold``,
    ``train_test_split`` and ``mean_squared_error``.
    """
    predictors = [i for i in X.columns]
    stacking_num = 5
    bagging_num = 3
    bagging_test_size = 0.33
    num_boost_round = 500
    early_stopping_rounds = 100

    stacking_model = []
    bagging_model = []

    l2_error = []
    X = X.values
    y = y.values
    # Column 1 of layer_train collects the out-of-fold stacking predictions.
    layer_train = np.zeros((X.shape[0], 2))
    SK = StratifiedKFold(n_splits=stacking_num, shuffle=True, random_state=1)
    for k, (train_index, test_index) in enumerate(SK.split(X, y)):
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test)

        gbm = lgb.train(param,
                        lgb_train,
                        num_boost_round=num_boost_round,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=early_stopping_rounds)
        # BUG FIX: the OOF predictions were never stored, leaving the
        # stacking feature all zeros; write them back for this fold.
        layer_train[test_index, 1] = gbm.predict(
            X_test, num_iteration=gbm.best_iteration)
        stacking_model.append(gbm)

    # Append the stacking OOF feature as one extra column.
    X = np.hstack((X, layer_train[:, 1].reshape((-1, 1))))

    predictors.append('lgb_result')

    for bn in range(bagging_num):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=bagging_test_size, random_state=bn)

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test)

        gbm = lgb.train(param,
                        lgb_train,
                        num_boost_round=10000,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=200)

        bagging_model.append(gbm)

        l2_error.append(
            mean_squared_error(
                gbm.predict(X_test, num_iteration=gbm.best_iteration), y_test))

        # Only the last fold's importances are kept (matches original intent).
        feat_imp = pd.Series(gbm.feature_importance(),
                             predictors).sort_values(ascending=False)

    # Reproduce the stacking feature for X_pred: average the fold models.
    test_pred = np.zeros((X_pred.shape[0], stacking_num))
    for sn, gbm in enumerate(stacking_model):
        test_pred[:, sn] = gbm.predict(X_pred, num_iteration=gbm.best_iteration)

    # BUG FIX: moved out of the loop above — appending inside it grew
    # X_pred by one column per iteration, breaking the feature count on
    # every predict() after the first fold.
    X_pred = np.hstack((X_pred, test_pred.mean(axis=1).reshape((-1, 1))))

    for bn, gbm in enumerate(bagging_model):
        pred = gbm.predict(X_pred, num_iteration=gbm.best_iteration)
        if bn == 0:
            pred_out = pred
        else:
            pred_out += pred
    return pred_out / bagging_num, feat_imp
    col += [c for i, c in enumerate(COL[USE_FEATURES:]) if i % MOD_N == j]
    feature_set[j] = col

# =============================================================================
# cv
# =============================================================================

gc.collect()

model_all = []
nround_mean = 0
loss_list = []
y_preds = []
for i in range(MOD_N):
    dtrain = lgb.Dataset(
        X[feature_set[i]],
        y.values,  #categorical_feature=CAT, 
        free_raw_data=False)

    gc.collect()
    param['seed'] = np.random.randint(9999)
    ret, models = lgb.cv(param,
                         dtrain,
                         99999,
                         nfold=NFOLD,
                         early_stopping_rounds=100,
                         verbose_eval=50,
                         seed=SEED)
    y_pred = ex.eval_oob(X[feature_set[i]],
                         y.values,
                         models,
                         SEED,
Пример #3
0
        'min_child_weight':
        0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'scale_pos_weight':
        400,  # because training data is extremely unbalanced
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain':
        0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': NUM_CORES,
        'verbose': 0,
    }

    print("Preparing validation datasets")
    xgtrain = lgb.Dataset(train_df[predictors].values,
                          label=train_df[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical)
    del train_df
    gc.collect()

    xgvalid = lgb.Dataset(val_df[predictors].values,
                          label=val_df[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical)
    del val_df
    gc.collect()

    evals_results = {}
    print('LGB PARAMETER: ', lgb_params)
    bst = lgb.train(lgb_params,
                    xgtrain,
Пример #4
0
iris['Species'] = load_iris().target % 2

## train test split
train = iris[0:130]
test = iris[130:]
X_train = train.filter(
    items=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'])
X_test = test.filter(
    items=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'])
y_train = train[["Species"]]
y_test = test[["Species"]]
# y_train = train[[train.Species.name]]
# y_test = test[[test.Species.name]]

## build lgb model
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 16,
    'num_trees': 100,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
gbm = lgb.train(params=params,
Пример #5
0
    'max_depth': -1,
    'reg_alpha': 0.2,
    'reg_lambda': 0.4,
    'nthread': 8
}
# If `chunk` is lowered, the number of refit passes must rise accordingly.
chunk = 90000
count = 0
# Shuffle train_set first so each chunk is an i.i.d. sample.
train_set = train_set.sample(frac=1).reset_index(drop=True)
gc.collect()
# NOTE(review): the original comment said "split into 14 parts, keep the
# last part as test_set", but the loop walks chunks 1..129 — confirm the
# intended chunk count against the data size.
for i in range(1, 130):
    train_i = train_set[(i - 1) * chunk:i * chunk]
    train_x, train_y = train_i[features], train_i['label']
    if count == 0:
        print(count)
        num_round = 2000
        trn_data = lgb.Dataset(train_x, label=train_y)
        clf = lgb.train(params, trn_data, num_round)
    else:
        # BUG FIX: was `1 / count + 1`, i.e. (1/count) + 1 > 1 — LightGBM's
        # refit decay_rate must lie in [0, 1]; use the (presumably intended)
        # harmonic decay 1/(count+1) instead.
        decay_rate = 1 / (count + 1)
        clf = clf.refit(train_x, train_y, decay_rate=decay_rate)
    count += 1

# Held-out tail beyond the last training chunk.
test_i = train_set[129 * chunk:]
# BUG FIX: was built from `train_i` (the final *training* chunk), which
# evaluated the model on data it had just been refit on.
test_x, test_y = test_i[features], test_i['label'].values

pred_y = clf.predict(test_x)
mse = mean_squared_error(test_y.reshape(-1), pred_y.reshape(-1))
Пример #6
0
    'metric': 'auc',
}

N = 10
kf = KFold(n_splits=N)
importance = pd.DataFrame(
    np.zeros((X_train.shape[1], N)),
    columns=['Fold_{}'.format(i) for i in range(1, N + 1)],
    index=X_train.columns)
scores = []
y_pred = np.zeros(X_test.shape[0])
oof = np.zeros(X_train.shape[0])

for fold, (trn_idx, val_idx) in enumerate(kf.split(X_train, y_train), 1):
    print('Fold {}'.format(fold))
    trn_data = lgb.Dataset(X_train.iloc[trn_idx, :].values,
                           label=y_train.iloc[trn_idx].values)
    val_data = lgb.Dataset(X_train.iloc[val_idx, :].values,
                           label=y_train.iloc[val_idx].values)
    clf = lgb.train(lgb_param,
                    trn_data,
                    10000,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=500,
                    early_stopping_rounds=500)
    predictions = clf.predict(X_train.iloc[val_idx, :].values)
    importance.iloc[:, fold - 1] = clf.feature_importance()
    oof[val_idx] = predictions
    score = roc_auc_score(y_train.iloc[val_idx].values, predictions)
    scores.append(score)
    print('Fold {} ROC AUC Score {}\\n'.format(fold, score))
    y_pred += clf.predict(X_test) / N
Пример #7
0
train['month'] = train.transactiondate.dt.month + (
    train.transactiondate.dt.year - 2016) * 12
train_df = train.merge(properties, how='left', on='parcelid')

del properties
gc.collect()

train_df = train_df[train_df.logerror > -0.16]
train_df = train_df[train_df.logerror < 0.17]

x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)
col_lgb = x_train.columns.values
y_train = train_df["logerror"].values.astype(np.float32)

d_train = lgb.Dataset(x_train, label=y_train)
categorical = [
    'airconditioningtypeid',
    'architecturalstyletypeid',
    'buildingclasstypeid',
    'buildingqualitytypeid',
    'fips',
    'heatingorsystemtypeid',
    'propertycountylandusecode',
    'propertylandusetypeid',
    'propertyzoningdesc',
    'rawcensustractandblock',
    'regionidcity',
    'regionidcounty',
    'regionidneighborhood',
    'regionidzip',
Пример #8
0
# We now train the model. Here, we use a standard KFold split of the dataset in order to validate the results and to stop the training. Interstingly, during the writing of this kernel, the model was enriched adding new features, which improved the CV score. **The variations observed on the CV were found to be quite similar to the variations on the LB**: it seems that the current competition won't give us headaches to define the correct validation scheme:

# In[ ]:

folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
start = time.time()
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx,
            val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features],
                           label=target.iloc[trn_idx],
                           categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train.iloc[val_idx][features],
                           label=target.iloc[val_idx],
                           categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=100,
                    early_stopping_rounds=200)

    oof[val_idx] = clf.predict(train.iloc[val_idx][features],
                               num_iteration=clf.best_iteration)
Пример #9
0
train_columns = x_train.columns

y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)
print x_train.columns
pd.Series(list(x_train.columns)).to_csv('../../data/columns.csv')


del df_train; gc.collect()


x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

print('Building DMatrix...')

d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)

del x_train, x_valid; gc.collect()

print('Training ...')
params = {'max_bin': 10, 'learning_rate': 0.0021, 'boosting_type': 'gbdt', 'objective': 'regression',
                      'metric': 'l1', 'sub_feature': 0.5, 'bagging_fraction': 0.85, 'bagging_freq': 40,
                      'num_leaves': 512, 'min_data': 500, 'min_hessian': 0.05, 'verbose': 0 }

print(params)

watchlist = [d_valid]
clf = lgb.train(params, d_train, 10000, watchlist,  early_stopping_rounds=100)

print("Features importance...")
Пример #10
0
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'feature_fraction': 0.2319,
    'feature_fraction_seed': 9,
    'bagging_seed': 9,
    'min_data_in_leaf': 6,
    'min_sum_hessian_in_leaf': 11
}

for fold_n, (train_index, valid_index) in tqdm(enumerate(folds.split(x, y))):
    print('Fold', fold_n, 'started at', time.ctime())

    x_train, x_valid = x.iloc[train_index], x.iloc[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    train_lgb = lgb.Dataset(x_train, y_train)
    val_lgb = lgb.Dataset(x_valid, y_valid)

    lgbm = lgb.train(
        params,
        train_lgb,
        #                num_boost_round  = 1000,
        valid_sets=[val_lgb, train_lgb],
        early_stopping_rounds=200,
        fobj=smape_objective,
        feval=smape_error,
        verbose_eval=100)

    y_pred_lgb += lgbm.predict(
        X_test, num_iteration=lgbm.best_iteration) / folds.n_splits
dftest = pd.get_dummies(xtest,
                        columns=xtest.columns,
                        dtype='float64',
                        drop_first=True)

#Implementing lightGBM
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 2,
    'learning_rate': 0.3,
    'feature_fraction': 0.2,
    'is_unbalance': True
}

train_data = lgb.Dataset(xtrain, ytrain)
test_data = lgb.Dataset(xtest, reference=train_data)
lgb_train = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, test_data],
    num_boost_round=5000,
)
predicted = lgb_train.predict(xtest)

#Submission
submission1 = pd.DataFrame(predicted, columns=['target'])
submission1['id'] = dftest['id'].astype('int32')
submission1 = submission1[['id', 'target']]
submission1.to_csv('10.OneHotEncodeAllLightGBM.csv', header=True, index=False)
Пример #12
0
def test_lightgbm(tmp_path, num_classes, n_categorical):
    """Round-trip a LightGBM model through ForestInference (FIL) and check
    that FIL's class predictions and probabilities match native LightGBM.
    """
    import lightgbm as lgb

    # Problem size depends on whether categorical columns are requested.
    if n_categorical > 0:
        num_features = 10
        num_rows = 1000
        informative = num_features
    else:
        num_features = 10 if num_classes == 2 else 50
        num_rows = 500
        informative = 'auto'

    X, y = simulate_data(num_rows,
                         num_features,
                         num_classes,
                         n_informative=informative,
                         random_state=43210,
                         classification=True)
    if n_categorical > 0:
        X_fit, X_predict = to_categorical(X,
                                          n_categorical=n_categorical,
                                          invalid_frac=0.1,
                                          random_state=43210)
    else:
        X_fit = X_predict = X

    dtrain = lgb.Dataset(X_fit, label=y)
    n_rounds = 5
    model_file = str(os.path.join(tmp_path, 'lgb.model'))

    if num_classes == 2:
        # Binary: train via the native booster API.
        booster = lgb.train({'objective': 'binary',
                             'metric': 'binary_logloss',
                             'num_class': 1},
                            dtrain,
                            n_rounds)
        booster.save_model(model_file)
        fil = ForestInference.load(model_file,
                                   algo='TREE_REORG',
                                   output_class=True,
                                   model_type="lightgbm")
        # Compare hard labels first, then probabilities.
        proba_lgb = booster.predict(X_predict)
        proba_fil = fil.predict_proba(X_predict)[:, 1]
        labels_lgb = (proba_lgb > 0.5).astype(float)
        labels_fil = fil.predict(X_predict)
        assert array_equal(labels_lgb, labels_fil)
        np.testing.assert_allclose(proba_lgb, proba_fil,
                                   atol=proba_atol[num_classes > 2])
    else:
        # Multi-class: train via the sklearn-style wrapper.
        sk_model = lgb.LGBMClassifier(objective='multiclass',
                                      boosting_type='gbdt',
                                      n_estimators=n_rounds)
        sk_model.fit(X_fit, y)
        sk_model.booster_.save_model(model_file)
        labels_lgb = sk_model.predict(X_predict).astype(int)
        fil = ForestInference.load(model_file,
                                   algo='TREE_REORG',
                                   output_class=True,
                                   model_type="lightgbm")
        # Wrapper labels must agree with the raw booster argmax, and FIL
        # labels must agree with the wrapper.
        raw_argmax = sk_model.booster_.predict(X_predict).argmax(axis=1)
        assert array_equal(raw_argmax, labels_lgb)
        assert array_equal(labels_lgb, fil.predict(X_predict))
        # lightgbm uses float64 thresholds, while FIL uses float32
        np.testing.assert_allclose(sk_model.predict_proba(X_predict),
                                   fil.predict_proba(X_predict),
                                   atol=proba_atol[num_classes > 2])
Пример #13
0
from xgboost_ray import RayXGBClassifier

start = time.time()
model = RayXGBClassifier(
    n_jobs=10,  # In XGBoost-Ray, n_jobs sets the number of actors
    random_state=1)
model.fit(X_train, y_train)
print(f"executed Ray XGBoost in {time.time() - start}")
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

print('light GBM')
# see https://www.analyticsvidhya.com/blog/2017/06/which-algorithm-takes-the-crown-light-gbm-vs-xgboost/
import lightgbm as lgb

train_data = lgb.Dataset(X_train, label=y_train)
param = {
    'num_leaves': 150,
    'objective': 'binary',
    'learning_rate': .05,
    'max_bin': 200
}
param['metric'] = ['auc', 'binary_logloss']
start = time.time()
model = lgb.train(param, train_data, 100)
print(f"executed GBM in {time.time() - start}")
y_pred = model.predict(X_test)
#converting probabilities into 0 or 1
for i in range(len(y_pred)):
    if y_pred[i] >= .5:  # setting threshold to .5
        y_pred[i] = 1
        del train_set, val_set

        t = len(Y_tr)
        t1 = sum(Y_tr)
        t0 = t - t1
        print('train size:', t, 'number of 1:', t1, 'number of 0:', t0)
        print('train: 1 in all:', t1 / t, '0 in all:', t0 / t, '1/0:', t1 / t0)
        t = len(Y_val)
        t1 = sum(Y_val)
        t0 = t - t1
        print('val size:', t, 'number of 1:', t1, 'number of 0:', t0)
        print('val: 1 in all:', t1 / t, '0 in all:', t0 / t, '1/0:', t1 / t0)
        print()
        print()

        train_set = lgb.Dataset(X_tr, Y_tr)
        val_set = lgb.Dataset(X_val, Y_val)
        del X_tr, Y_tr, X_val, Y_val

        print('Training...')

        model = lgb.train(
            params,
            train_set,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            valid_sets=val_set,
            verbose_eval=verbose_eval,
        )
        print('best score:', model.best_score['valid_0']['auc'])
        print('best iteration:', model.best_iteration)
Пример #15
0
def train(x_train):
    """Grid-search LightGBM params with 3-fold CV, then retrain the best.

    Targets are loaded from 'y_train.npy' and must align row-wise with
    ``x_train``. The working set is deliberately shrunk to one held-out
    fold (~20%) of a 5-fold split to keep the search affordable.

    Parameters
    ----------
    x_train : pandas.DataFrame of features.

    Side effects (all under the module-level ``DIR``): usecols.pkl,
    train_cv_pred_<fold>.pkl, model_<fold>.pkl, train_cv_tmp.pkl,
    feature_importances_0.csv and model.pkl; progress goes to the
    module-level ``logger``. Relies on module-level ``lgb``, ``np``,
    ``pd``, ``KFold``, ``ParameterGrid``, ``tqdm``, ``pickle``, ``gc``
    and ``mean_squared_error``.
    """
    # y_train = pd.read_feather('../protos/train_0618.ftr')['t_deal_probability'].values
    # np.savetxt('y_train.npy', y_train)
    y_train = np.loadtxt('y_train.npy')
    usecols = x_train.columns.values.tolist()

    cv = KFold(n_splits=5, shuffle=True, random_state=871)

    with open(DIR + 'usecols.pkl', 'wb') as f:
        pickle.dump(usecols, f, -1)
    # Keep only the first fold's test split (~20% of rows) as the working set.
    for _, test in cv.split(x_train, y_train):
        x_train = x_train.iloc[test].values
        y_train = y_train[test]

        break

    # Cap on boosting rounds passed to lgb.train below; also the fallback
    # tree count when early stopping never fires.
    max_boost_round = 100000

    all_params = {
        'boosting_type': 'gbdt',
        'colsample_bytree': 0.8,
        'learning_rate': 0.01,
        'max_bin': 255,
        'max_depth': -1,
        'metric': 'rmse',
        'min_child_weight': 50,
        'min_split_gain': 0.01,
        'num_leaves': 15,
        # BUG FIX: was 'xentropy ' (trailing space) — LightGBM rejects it
        # as an unknown objective.
        'objective': 'xentropy',
        'reg_alpha': 0,
        'scale_pos_weight': 1,
        'seed': 114514,
        'subsample': 1,
        'subsample_freq': 0,
        'verbose': -1
    }
    # Wrap every value in a list so ParameterGrid yields one (or more)
    # parameter combinations.
    all_params = {k: [v] for k, v in all_params.items()}
    use_score = 0
    min_score = (100, 100, 100)
    min_params = None  # guards against NameError if no grid point improves
    cv = KFold(n_splits=3, shuffle=True, random_state=871)
    for params in tqdm(list(ParameterGrid(all_params))):
        cnt = -1
        list_score = []
        list_score2 = []
        list_best_iter = []
        all_pred = np.zeros(y_train.shape[0])
        for train, test in cv.split(x_train, y_train):
            cnt += 1
            trn_x = x_train[train]
            val_x = x_train[test]
            trn_y = y_train[train]
            val_y = y_train[test]

            train_data = lgb.Dataset(trn_x, label=trn_y, feature_name=usecols)
            test_data = lgb.Dataset(val_x, label=val_y, feature_name=usecols)
            del trn_x
            gc.collect()
            clf = lgb.train(
                params,
                train_data,
                max_boost_round,
                early_stopping_rounds=100,
                valid_sets=[test_data],
                # feval=cst_metric_xgb,
                # callbacks=[callback],
                verbose_eval=10)
            # Clip to [0, 1] — xentropy predictions are probabilities.
            pred = clf.predict(val_x).clip(0, 1)

            all_pred[test] = pred

            _score = np.sqrt(mean_squared_error(val_y, pred))
            _score2 = _score  # - roc_auc_score(val_y, pred)

            logger.info('   _score: %s' % _score)
            logger.info('   _score2: %s' % _score2)

            list_score.append(_score)
            list_score2.append(_score2)

            if clf.best_iteration != 0:
                list_best_iter.append(clf.best_iteration)
            else:
                # BUG FIX: was params['n_estimators'] — KeyError, the grid
                # defines no such key; fall back to the round cap instead.
                list_best_iter.append(max_boost_round)

            with open(DIR + 'train_cv_pred_%s.pkl' % cnt, 'wb') as f:
                pickle.dump(pred, f, -1)
            with open(DIR + 'model_%s.pkl' % cnt, 'wb') as f:
                pickle.dump(clf, f, -1)
            gc.collect()
        with open(DIR + 'train_cv_tmp.pkl', 'wb') as f:
            pickle.dump(all_pred, f, -1)

        logger.info('trees: {}'.format(list_best_iter))
        score = (np.mean(list_score), np.min(list_score), np.max(list_score))
        score2 = (np.mean(list_score2), np.min(list_score2),
                  np.max(list_score2))

        logger.info('param: %s' % (params))
        logger.info('cv: {})'.format(list_score))
        logger.info('cv2: {})'.format(list_score2))

        logger.info('loss: {} (avg min max {})'.format(score[use_score],
                                                       score))
        logger.info('all loss: {}'.format(
            np.sqrt(mean_squared_error(y_train, all_pred))))
        logger.info('qwk: {} (avg min max {})'.format(score2[use_score],
                                                      score2))

        # Track the best (lowest mean-RMSE) parameter combination.
        if min_score[use_score] > score[use_score]:
            min_score = score
            min_params = params
        logger.info('best score: {} {}'.format(min_score[use_score],
                                               min_score))

        logger.info('best params: {}'.format(min_params))

    # Importance report comes from the last fold's model of the last grid
    # point (matches original behavior).
    imp = pd.DataFrame(clf.feature_importance(), columns=['imp'])
    imp['col'] = usecols
    n_features = imp.shape[0]
    imp = imp.sort_values('imp', ascending=False)
    imp.to_csv(DIR + 'feature_importances_0.csv')
    logger.info('imp use {} {}'.format(imp[imp.imp > 0].shape, n_features))

    del val_x
    del trn_y
    del val_y
    del train_data
    del test_data
    gc.collect()

    # Retrain on the whole working set with ~10% extra rounds beyond the
    # mean early-stopped iteration count.
    trees = np.mean(list_best_iter)

    logger.info('all data size {}'.format(x_train.shape))

    train_data = lgb.Dataset(x_train, label=y_train, feature_name=usecols)
    del x_train
    gc.collect()
    logger.info('train start')
    clf = lgb.train(min_params,
                    train_data,
                    int(trees * 1.1),
                    valid_sets=[train_data],
                    verbose_eval=10)
    logger.info('train end')
    with open(DIR + 'model.pkl', 'wb') as f:
        pickle.dump(clf, f, -1)
    gc.collect()

    logger.info('save end')
from sklearn.model_selection import train_test_split
import pyarrow.feather as pyfa
import lightgbm as lgb
import gc

# Load the full training frame; the last 5M rows become the validation split.
train_data = pyfa.read_feather('train_data.feather')

test_data = train_data[(train_data.shape[0] - 5000000):train_data.shape[0]]
train_data = train_data[0:(train_data.shape[0] - 5000000)]
gc.collect()

target = 'is_attributed'
# BUG FIX: predictors previously included the target column itself (label
# leakage into the feature matrix) and was a pandas Index, which LightGBM's
# `feature_name` argument does not accept — it expects a list of str.
predictors = [c for c in train_data.columns if c != target]
categorical = ['app', 'device', 'os', 'channel', 'hour']

# free_raw_data=False keeps the numpy copy alive so save_binary can run.
xgtrain = lgb.Dataset(train_data[predictors].values,
                      label=train_data[target].values,
                      feature_name=predictors,
                      categorical_feature=categorical,
                      free_raw_data=False)
xgtrain.save_binary('train_data.bin')
del train_data
gc.collect()

xgtest = lgb.Dataset(test_data[predictors].values,
                     label=test_data[target].values,
                     feature_name=predictors,
                     categorical_feature=categorical,
                     free_raw_data=False,
                     reference=xgtrain)
xgtest.save_binary('test_data.bin')
del test_data
gc.collect()

lgb_params = {
    'learning_rate': 0.1,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 7,  # we should let it be smaller than 2^(max_depth)
    def objective(self, trial):
        """Optuna objective: train LightGBM over up to five pre-built folds
        and return ``1 - cv_auc`` (so Optuna minimizes it).

        Entries of ``self.params["trn_params"]`` given as lists become
        search dimensions — float bounds -> suggest_uniform, int bounds ->
        suggest_int, anything else -> suggest_categorical; scalar entries
        pass through unchanged.

        Returns None when resuming and every fold is already trained.
        """
        # Extract optuna attribs from the input json
        optuna_trn_params = {}
        for key, val in self.params["trn_params"].items():
            if type(val) != list:
                optuna_trn_params[key] = val
            else:
                if type(val[0]) == float:
                    optuna_trn_params[key] = trial.suggest_uniform(key, val[0], val[1])
                elif type(val[0]) == int:
                    optuna_trn_params[key] = trial.suggest_int(key, val[0], val[1])
                else:
                    optuna_trn_params[key] = trial.suggest_categorical(key, val)

        # Initialize parameters
        mtd_params = self.params["mtd_params"]
        validity = None  # lazily-built frame of out-of-fold predictions
        model_path = Path(__file__).absolute().parents[2] / "data" / "model" / str(get_version())
        Path.mkdir(model_path, exist_ok=True, parents=True)
        START_FOLD = 0
        # Resume support: skip folds whose .model files already exist.
        if get_back_training():
            START_FOLD = len(list(model_path.glob('**/*.model')))
        END_FOLD = 5
        # Smoke-test mode: train only the first fold.
        if train_one_round():
            START_FOLD = 0
            END_FOLD = 1
        if START_FOLD == END_FOLD:
            return None

        start2 = time.time()
        getLogger(get_version()).info("\t [OPTUNA] {}th optimization starts".format(self.optimized_count))
        send_message("\t [OPTUNA] :sushi: {} th optimization starts".format(self.optimized_count))
        # Process for each fold
        for fold in range(START_FOLD, END_FOLD):
            start = time.time()
            getLogger(get_version()).info("\t [OPTUNA] >> {} folds start".format(fold))
            send_message("\t [OPTUNA] :sushi: {} folds start".format(fold))

            # Generate dataset (fold-specific train/validate feature frames,
            # indexed by MachineIdentifier; HasDetections is the target).
            valid = "valid{}".format(str(fold))
            trn_x = super().get_feature_df(self.feature_names, valid, "train")
            val_x = super().get_feature_df(self.feature_names, valid, "validate")
            trn_x.set_index("MachineIdentifier", inplace=True)
            val_x.set_index("MachineIdentifier", inplace=True)
            trn_y = trn_x["HasDetections"].astype(np.int8)
            val_y = val_x["HasDetections"].astype(np.int8)
            train_dataset = lgb.Dataset(trn_x, trn_y)
            valid_dataset = lgb.Dataset(val_x, val_y)

            # Initialize variables for scoring — built once, from the first
            # processed fold's train+validate index union.
            if validity is None:
                validity = pd.DataFrame()
                validity["HasDetections"] = pd.concat([trn_y, val_y])
                validity["Predict"] = 0

            # Delete needless features (the target must not stay in X)
            del trn_x["HasDetections"], val_x["HasDetections"]

            # Classify
            clf = lgb.train(optuna_trn_params,
                            train_dataset,
                            mtd_params["num_boost_round"],
                            valid_sets=[train_dataset, valid_dataset],
                            feval=eval_auc,
                            verbose_eval=mtd_params["verbose_eval"],
                            early_stopping_rounds=mtd_params["early_stopping_rounds"])
            # Store this fold's out-of-fold predictions for the CV score.
            validity.loc[validity.index.isin(val_x.index), "Predict"] = clf.predict(val_x, num_iteration=clf.best_iteration)

            # Log the resolved LightGBM params once per objective call.
            if fold == START_FOLD:
                getLogger(get_version()).info("\t {}".format(clf.params))
                send_message("\t {}".format(clf.params))

            # Report best metric values for both the train and valid sets.
            for train_or_valid, metrics in clf.best_score.items():
                for metric, score in metrics.items():
                    getLogger(get_version()).info("\t\t >> Best {} {}: {}".format(train_or_valid, metric, score))
                    send_message("\t\t :star-struck: Best {} {}: {}".format(train_or_valid, metric, score))

            # Post-process this fold
            del train_dataset, valid_dataset
            gc.collect()
            elapsed_time = int(time.time() - start)
            minutes, sec = divmod(elapsed_time, 60)
            hour, minutes = divmod(minutes, 60)
            getLogger(get_version()).info(
                "\t [OPTUNA] >> {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
                .format(fold, hour, minutes, sec))
            send_message("\t [OPTUNA] :sushi: {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}".format(fold, hour, minutes, sec))

        elapsed_time = int(time.time() - start2)
        minutes, sec = divmod(elapsed_time, 60)
        hour, minutes = divmod(minutes, 60)
        getLogger(get_version()).info(
            "\t [OPTUNA] >> {}th optimization finishes: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(self.optimized_count, hour, minutes, sec))
        send_message("\t [OPTUNA] :sushi: {}th optimiaztion finishes: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}".format(self.optimized_count, hour, minutes, sec))
        self.optimized_count += 1

        # Output CV score over all out-of-fold predictions.
        validity = validity.reset_index()
        columns_order = ["MachineIdentifier", "HasDetections", "Predict"]
        validity = validity.sort_values("MachineIdentifier").reset_index(drop=True).loc[:, columns_order]
        cv_auc = (fast_auc(validity["HasDetections"], np.array(validity["Predict"])))

        return 1 - cv_auc
Пример #18
0
}

fi = []
cv_score = []
test_pred = np.zeros((test.shape[0], ))
skf = StratifiedKFold(n_splits=5, random_state=2019, shuffle=True)

from xgboost import XGBRegressor

for index, (train_index, test_index) in enumerate(skf.split(train, y)):
    print(index)
    train_x, test_x, train_y, test_y = train.iloc[train_index], train.iloc[
        test_index], y.iloc[train_index], y.iloc[test_index]
    lgb_model = lgb.train(
        lgb_paras,
        train_set=lgb.Dataset(train_x[feature], train_y),
        valid_sets=[lgb.Dataset(test_x[feature], test_y)],
        num_boost_round=800,
        feval=lgb_roc_auc_score,
        verbose_eval=100,
        categorical_feature=object_col,
    )
    y_val = lgb_model.predict(test_x[feature])
    print("roc_auc:", roc_auc_score(test_y, y_val))
    cv_score.append(roc_auc_score(test_y, y_val))
    print("cv_score:", cv_score[index])
    test_pred += lgb_model.predict(test[feature]) / 5

submission['Label'] = test_pred
submission.to_csv('submission_light_gbm.csv', index=False)
Пример #19
0
def main():
    """Train LightGBM on a long-format dataset built by stacking 200
    per-variable views of X_train, then combine the per-variable predictions
    with an odds product.  Repeats 5-fold CV ten times with different seeds
    and pickles OOF / test predictions for every repetition.
    """
    model_output_dir = f'../processed/lgb_output/'
    if not os.path.isdir(model_output_dir):
        os.makedirs(model_output_dir)

    dataset_dir = '../processed/dataset/'
    X_train = pd.read_pickle(os.path.join(dataset_dir, 'X_train.pickle'))
    y_train = pd.read_pickle(os.path.join(dataset_dir, 'y_train.pickle'))
    X_test = pd.read_pickle(os.path.join(dataset_dir, 'X_test.pickle'))

    params = {
        'bagging_freq': 5,
        'bagging_fraction': 0.95,
        'boost_from_average': 'false',
        'boost': 'gbdt',
        'feature_fraction': 1.0,
        'learning_rate': 0.005,
        'max_depth': -1,
        'metric': 'binary_logloss',
        'min_data_in_leaf': 30,
        'min_sum_hessian_in_leaf': 10.0,
        'num_leaves': 64,
        'num_threads': 32,
        'tree_learner': 'serial',
        'objective': 'binary',
        'verbosity': 1
    }

    # Build one sub-dataset per variable (assumes arrange_dataset(X, cnum)
    # returns X's rows restricted/reshaped to variable `cnum` with a
    # 'var_num' column — TODO confirm against its definition) and stack them
    # into a single long frame.
    dset_list = []
    for cnum in range(200):
        _dset = arrange_dataset(X_train, cnum)
        dset_list.append(_dset)

    concat_X_train = pd.concat(dset_list, axis=0)
    concat_X_train['var_num'] = concat_X_train['var_num'].astype('category')

    # Labels are repeated once per stacked copy so they line up row-for-row.
    train_dset = lgb.Dataset(concat_X_train,
                             pd.concat([y_train for c in range(200)], axis=0),
                             free_raw_data=False)

    for fold_set_number in range(10):
        print('### start iter {} in 10 ###'.format(fold_set_number + 1))
        skf = StratifiedKFold(n_splits=5,
                              shuffle=True,
                              random_state=2019 + fold_set_number)
        # Fold indices are computed on the original rows, then offset by
        # i * len(X_train) so every fold covers the same rows in each of the
        # 200 stacked copies (no leakage between copies of the same row).
        folds = [[
            np.concatenate([_trn + i * X_train.shape[0] for i in range(200)]),
            np.concatenate([_val + i * X_train.shape[0] for i in range(200)])
        ] for _trn, _val in skf.split(X_train, y_train)]

        extraction_cb = ModelExtractionCallback()
        callbacks = [
            extraction_cb,
        ]

        print('start training. ')
        cv_result = lgb.cv(params,
                           train_set=train_dset,
                           num_boost_round=100000,
                           early_stopping_rounds=100,
                           verbose_eval=100,
                           folds=folds,
                           callbacks=callbacks)
        # The callback exposes the raw per-fold boosters and the shared best
        # iteration found by lgb.cv's early stopping.
        bsts = extraction_cb.raw_boosters
        best_iteration = extraction_cb.best_iteration
        print('training end. ')

        print('start predicting. ')
        oof_pred_array = np.ones((X_train.shape[0], 200))
        test_pred_array = np.ones((X_test.shape[0], 5, 200))
        for cnum in tqdm(range(200)):
            for i, bst in enumerate(bsts):
                # used_indices spans all 200 stacked copies; the first
                # 1/200th maps back to original row numbers.
                cv_valid_index = bst.valid_sets[0].used_indices
                cv_valid_index = cv_valid_index[:int(cv_valid_index.shape[0] /
                                                     200)]
                # OOF prediction for this fold's validation rows
                cv_valid_data = arrange_dataset(
                    X_train, cnum).iloc[cv_valid_index].values
                oof_pred_array[cv_valid_index, cnum] = bst.predict(
                    cv_valid_data, num_iteration=best_iteration)
                # test-set prediction
                test_pred_array[:, i, cnum] = bst.predict(
                    arrange_dataset(X_test, cnum).values,
                    num_iteration=best_iteration)
        print('prediction end. ')

        print('start postprocess. ')
        # Combine per-variable probabilities as a product of odds p/(1-p),
        # keeping only variables whose standalone OOF AUC clears `thr`.
        thr = 0.500
        oof_pred_odds_prod = np.ones((X_train.shape[0]))
        test_pred_odds_prod = np.ones((X_test.shape[0], 5))
        for cnum in tqdm(range(200)):
            tmp_auc = roc_auc_score(y_train, oof_pred_array[:, cnum])
            if tmp_auc >= thr:
                oof_pred_odds_prod *= oof_pred_array[:, cnum] / (
                    1 - oof_pred_array[:, cnum])
                test_pred_odds_prod *= test_pred_array[:, :, cnum] / (
                    1 - test_pred_array[:, :, cnum])
        print('postprocess end. auc : {0:.6f}'.format(
            roc_auc_score(y_train, oof_pred_odds_prod)))

        print('save iteration results')
        pd.DataFrame(oof_pred_odds_prod, index=X_train.index, columns=['pred'])\
            .to_pickle(os.path.join(model_output_dir, f'oof_preds_{fold_set_number}.pkl.gz'), compression='gzip')
        for fold_num in range(5):
            # 5 folds x 10 repetitions -> 50 test-prediction files.
            model_management_num = fold_num + fold_set_number * 5
            pd.DataFrame(test_pred_odds_prod[:, fold_num], index=X_test.index, columns=['pred'])\
                .to_pickle(os.path.join(model_output_dir, f'test_preds_{model_management_num}.pkl.gz'), compression='gzip')
Пример #20
0
# LightGBM training over an unshuffled KFold split; only the first two folds
# are trained (the loop breaks at fold index 2) and their test predictions
# are averaged into the submission frame.
folds = KFold(n_splits=n_fold, shuffle=False)

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
lgb_sub = sub.copy()
lgb_sub['isFraud'] = 0
aucs = []
training_start_time = time()
for fold_no, (tr_idx, va_idx) in enumerate(folds.split(X)):
    # Stop after the first two folds (see the divisor below).
    if fold_no == 2:
        break

    start_time = time()
    print('Training on fold {}'.format(fold_no + 1))

    dtrain = lgb.Dataset(X.iloc[tr_idx], label=y.iloc[tr_idx])
    dvalid = lgb.Dataset(X.iloc[va_idx], label=y.iloc[va_idx])
    clf = lgb.train(params, dtrain, num_boost_round=10000,
                    valid_sets=[dvalid], verbose_eval=100,
                    early_stopping_rounds=500)

    pred = clf.predict(test_X)
    val = clf.predict(X.iloc[va_idx])
    fold_auc = roc_auc_score(y.iloc[va_idx], val)
    print('ROC accuracy: {}'.format(fold_auc))
    aucs.append(fold_auc)

    # drop the last fold only:
    # lgb_sub['isFraud'] = lgb_sub['isFraud'] + pred / (n_fold - 1)

    # drop the last three folds:
    lgb_sub['isFraud'] = lgb_sub['isFraud'] + pred / (n_fold - 3)
Пример #21
0
def test_lightgbm_cpu_airlines_full(booster):
    """Smoke-test LightGBM CV (CPU build) on the full airlines dataset."""
    import numpy as np
    import pandas as pd
    from h2o4gpu.util.lightgbm_dynamic import got_cpu_lgb, got_gpu_lgb
    import lightgbm as lgb

    # Column dtype map assembled from three homogeneous groups:
    # categoricals, float32 measurements, int32 calendar fields.
    cat_cols = ['UniqueCarrier', 'Origin', 'Dest', 'TailNum',
                'CancellationCode', 'IsArrDelayed', 'IsDepDelayed',
                'Cancelled']
    f32_cols = ['DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime',
                'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
                'DepDelay', 'Distance', 'TaxiIn', 'TaxiOut', 'Diverted',
                'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
                'LateAircraftDelay']
    i32_cols = ['Year', 'Month', 'DayOfWeek', 'DayofMonth']
    dtypes = {c: 'category' for c in cat_cols}
    dtypes.update({c: np.float32 for c in f32_cols})
    dtypes.update({c: np.int32 for c in i32_cols})

    data = pd.read_csv('./open_data/allyears.1987.2013.zip', dtype=dtypes)

    # Binary target: the arrival-delayed flag encoded as category codes.
    y = data["IsArrDelayed"].cat.codes
    feature_cols = [
        'UniqueCarrier', 'Origin', 'Dest', 'IsDepDelayed', 'Year', 'Month',
        'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime',
        'CRSArrTime', 'FlightNum', 'TailNum', 'ActualElapsedTime',
        'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Distance',
        'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', 'Diverted',
        'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
        'LateAircraftDelay'
    ]
    data = data[feature_cols]

    lgb_params = {
        'learning_rate': 0.1,
        'boosting': booster,
        'objective': 'binary',
        'metric': 'rmse',
        'feature_fraction': 0.9,
        'bagging_fraction': 0.75,
        'num_leaves': 31,
        'bagging_freq': 1,
        'min_data_per_leaf': 250
    }
    lgb_train = lgb.Dataset(data=data, label=y)
    # Unstratified 5-fold CV with early stopping as a functional check.
    cv = lgb.cv(lgb_params,
                lgb_train,
                num_boost_round=50,
                early_stopping_rounds=5,
                stratified=False,
                verbose_eval=10)
Пример #22
0
                      early_stopping_rounds=10)
    y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
    score = log_loss(y_valid, y_pred_valid)
    return score


# Hyper-parameter search with Optuna (random sampler), then retrain a final
# LightGBM model with the best found parameters and write the submission.
study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=0))
study.optimize(objective, n_trials=40)

best = study.best_params
params = {
    'objective': 'binary',
    'max_bin': best['max_bin'],
    'learning_rate': 0.05,
    'num_leaves': best['num_leaves'],
}

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=10,
    num_boost_round=1000,
    early_stopping_rounds=10,
)
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# Threshold the predicted probabilities at 0.5 to get hard 0/1 labels.
y_pred = (y_pred > 0.5).astype(int)
sub['Perished'] = y_pred
sub.to_csv('./submission_net1.csv', index=False)
Пример #23
0
def kfold_lightgbm(params,df, predictors,target,num_folds, stratified = True,
                   objective='', metrics='',debug= False,
                   feval = f1_score_vali, early_stopping_rounds=100, num_boost_round=100, verbose_eval=50, categorical_features=None,sklearn_mertric = evaluate_macroF1_lgb ):
    """Run (stratified) k-fold LightGBM for an 11-class problem.

    Rows of `df` with a non-null `target` are the training set; null-target
    rows are the test set.  Writes per-class probability CSVs under ./cv,
    label CSVs under ./sub, and displays feature importances.

    NOTE(review): depends on module-level globals `USE_KFOLD`, `train_old`
    and `label2current_service` — confirm they exist before calling.
    """
    lgb_params = params
    
    # Split labelled (train) vs. unlabelled (test) rows.
    train_df = df[df[target].notnull()]
    test_df = df[df[target].isnull()]
    
    # Divide in training/validation and test data
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df[predictors].shape, test_df[predictors].shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1234)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1234)
#    folds = GroupKFold(n_splits=5)
    # Create arrays and dataframes to store results
    # 11 columns: one probability per class.
    oof_preds = np.zeros((train_df.shape[0],11))
    sub_preds = np.zeros((test_df.shape[0],11))
    feature_importance_df = pd.DataFrame()
    feats = predictors
    cv_resul = []
    '''
    perm = [i for i in range(len(train_df))]
    perm = pd.DataFrame(perm)
    perm.columns = ['index_']

    for n_fold in range(5):
        train_idx = np.array(perm[train_df['cv'] != n_fold]['index_'])
        valid_idx = np.array(perm[train_df['cv'] == n_fold]['index_'])
    '''
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df[target])):
        # When USE_KFOLD is off, train only the first fold.
        if (USE_KFOLD == False) and (n_fold == 1):
            break
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[target].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[target].iloc[valid_idx]

        # Augment each fold's training split with the extra `train_old` rows.
        train_x = pd.concat([train_x,train_old[feats]])
        train_y = pd.concat([train_y,train_old[target]])

        train_y_t = train_y.values
        valid_y_t = valid_y.values
        print(train_y_t)
        xgtrain = lgb.Dataset(train_x.values, label = train_y_t,
                              feature_name=predictors,
                              categorical_feature=categorical_features
                              )
        xgvalid = lgb.Dataset(valid_x.values, label = valid_y_t,
                              feature_name=predictors,
                              categorical_feature=categorical_features
                              )

        clf = lgb.train(lgb_params, 
                         xgtrain, 
                         valid_sets=[xgvalid],#, xgtrain], 
                         valid_names=['valid'],#,'train'], 
                         num_boost_round=num_boost_round,
                         early_stopping_rounds=early_stopping_rounds,
                         verbose_eval=verbose_eval, 
#                         feval=feval
                         )



        # Out-of-fold class probabilities and fold-averaged test predictions.
        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration)/ folds.n_splits


        # Per-fold importances, gain normalised to percent of total gain.
        gain = clf.feature_importance('gain')
        fold_importance_df = pd.DataFrame({'feature':clf.feature_name(),
                                           'split':clf.feature_importance('split'),
                                           'gain':100*gain/gain.sum(),
                                           'fold':n_fold,                        
                                           }).sort_values('gain',ascending=False)
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        
        result = evaluate_macroF1_lgb(valid_y, oof_preds[valid_idx])
#        result = clf.best_score['valid']['macro_f1_score']
        print('Fold %2d macro-f1 : %.6f' % (n_fold + 1, result))
        cv_resul.append(round(result,5))
        gc.collect()
        
    #score = np.array(cv_resul).mean()\
    # `score` is used as a file-name tag, not a numeric score.
    score = 'model_3_1'
    if USE_KFOLD:
       #print('Full f1 score %.6f' % score)
        for i in range(11):
            train_df["class_" + str(i)] = oof_preds[:,i]
            test_df["class_" + str(i)] = sub_preds[:,i]
        train_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/val_prob_{}.csv'.format(score), index= False, float_format = '%.4f')
        test_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/sub_prob_{}.csv'.format(score), index= False, float_format = '%.4f')   
        # Collapse the probability matrices to hard class labels.
        oof_preds = [np.argmax(x)for x in oof_preds]
        sub_preds = [np.argmax(x)for x in sub_preds]    
        train_df[target] = oof_preds
        test_df[target] = sub_preds
        print(test_df[target].mean())
        train_df[target] = oof_preds
        # Map internal label ids back to the original service codes.
        train_df[target] = train_df[target].map(label2current_service)
        test_df[target] = sub_preds
        test_df[target] = test_df[target].map(label2current_service)
        print('all_cv', cv_resul)
        train_df[['user_id', target]].to_csv('./sub/val_{}.csv'.format(score), index= False)
        test_df[['user_id', target]].to_csv('./sub/sub_{}.csv'.format(score), index= False)
        print("test_df mean:")
    
    display_importances(feature_importance_df,score)
          "nthread": 15,
          'metric': 'multi_logloss',
          "random_state": 2019,
          # 'device': 'gpu'
          }


# Shuffled 5-fold CV collecting out-of-fold class-probability predictions;
# test_pred_prob is pre-allocated for test-set predictions.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
prob_oof = np.zeros((train_x.shape[0], category))
test_pred_prob = np.zeros((test_x.shape[0], category))

## train and predict
feature_importance_df = pd.DataFrame()
for fold_no, (fit_idx, oof_idx) in enumerate(folds.split(train_data)):
    print("fold {}".format(fold_no + 1))
    fit_set = lgb.Dataset(train_x.iloc[fit_idx], label=train_y.iloc[fit_idx])
    oof_set = lgb.Dataset(train_x.iloc[oof_idx], label=train_y.iloc[oof_idx])

    clf = lgb.train(params,
                    fit_set,
                    num_round,
                    valid_sets=[fit_set, oof_set],
                    verbose_eval=20,
                    early_stopping_rounds=60)
    # Out-of-fold probabilities at the early-stopped best iteration.
    prob_oof[oof_idx] = clf.predict(train_x.iloc[oof_idx],
                                    num_iteration=clf.best_iteration)
Пример #25
0
if __name__ == '__main__':

    # Pipeline: load, drop duplicate and constant columns, hold out 20% for
    # validation, train a default-parameter LightGBM and report AUC.
    train = load_file('../data/train.csv')  # (76020, 371)
    test = load_file('../data/test.csv')  # (75818, 370)
    verbalise_dataset(train, test)

    train, test = remove_duplicate_col(train, test)
    verbalise_dataset(train, test)  # (76020, 371), (75818, 308)

    train, test = remove_constant_col(train, test)
    verbalise_dataset(train, test)  # (76020, 308), (75818, 307)

    # Separate features from the label.
    X = train.drop(["TARGET", "ID"], axis=1)
    Y = train['TARGET'].values

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.20, random_state=1632)
    print(X_train.shape, X_test.shape)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
    }
    d_train = lgb.Dataset(X_train, label=Y_train)
    clf = lgb.train(train_set=d_train, params=params)

    Y_pred = clf.predict(X_test)
    print("Score: " + str(roc_auc_score(Y_test, Y_pred)))
def lgb_modelfit_nocv(params,
                      dtrain,
                      dtrain_target,
                      dvalid,
                      predictors,
                      target='target',
                      objective='binary',
                      metrics='auc',
                      feval=None,
                      early_stopping_rounds=20,
                      num_boost_round=3000,
                      verbose_eval=10,
                      categorical_features=None):
    """Train a single LightGBM booster without cross-validation.

    Parameters
    ----------
    params : dict
        Overrides merged on top of the defaults in ``lgb_params`` below.
    dtrain, dvalid : pandas.DataFrame
        Training / validation frames containing the ``predictors`` columns.
    dtrain_target : pandas.DataFrame
        Frame holding the ``target`` column for the training rows.
    predictors : list of str
        Feature column names fed to the model.
    target, objective, metrics, feval, early_stopping_rounds,
    num_boost_round, verbose_eval, categorical_features
        Standard LightGBM training knobs.

    Returns
    -------
    lgb.Booster
        The fitted booster; ``best_iteration`` is set by early stopping.
    """
    # Defaults; any key may be overridden through `params`.
    # FIX: the original dict listed 'metric' twice with the same value — the
    # duplicate entry has been removed (dict literals keep only the last).
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric': metrics,
        'learning_rate': 0.01,
        'is_unbalance':
        'true',  # because training data is unbalanced (replaced with scale_pos_weight)
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples':
        20,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bin for feature values
        'subsample': 0.6,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequence of subsample, <=0 means no enable
        'colsample_bytree':
        0.3,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight':
        5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain':
        0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 8,
        'verbose': 0
    }

    lgb_params.update(params)

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values,
                          label=dtrain_target[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features)
    # NOTE(review): xgvalid is built without a label yet passed as a
    # valid_set below; LightGBM needs labels to evaluate — confirm callers
    # supply labelled validation data or never rely on its metrics.
    xgvalid = lgb.Dataset(dvalid[predictors].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features)

    evals_results = {}

    # FIX: the original hard-coded verbose_eval=50, silently ignoring the
    # function's `verbose_eval` parameter; it is now honoured.
    bst1 = lgb.train(lgb_params,
                     xgtrain,
                     valid_sets=xgvalid,
                     evals_result=evals_results,
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=verbose_eval,
                     feval=feval)

    n_estimators = bst1.best_iteration
    print("\nModel Report")
    print("n_estimators : ", n_estimators)
    # print(metrics+":", evals_results['valid'][metrics][n_estimators-1])

    return bst1
    num_round = default_num_round
    
# NOTE(review): the bare {u_limit} / {u_limit_apply} braces below are set
# literals at runtime; this block looks like a source template meant to be
# filled in via str.format before execution — confirm before running as-is.
print ("train " + target + " mean:", x_train[target].mean())
# Replace target outliers beyond the configured upper/lower limits.
x_train.loc[x_train[target]>{u_limit}, target] = {u_limit_apply}
x_train.loc[x_train[target]<{l_limit}, target] = {l_limit_apply}
print ("train " + target + " mean:", x_train[target].mean())
            
print ("x_test rows count: " + str(len(x_test)))
print ("x_train rows count: " + str(len(x_train)))

# Split label from features; drop(target, 1) is the positional axis=1 form.
y_train = x_train[target]
x_train = x_train.drop(target, 1)

x_test = x_test.drop(target, 1)

d_train = lgb.Dataset(x_train, label=y_train)
if not output_mode:
    # Evaluation mode: validate on the held-out test frame (labels known).
    d_valid = lgb.Dataset(x_test, label=y_test)
else:
    # Output mode: no test labels, so validate on the training data itself.
    d_valid = lgb.Dataset(x_train, label=y_train)

watchlist = [d_valid]
print("\nFitting LightGBM model ...")
predictor = lgb.train(params, d_train, num_round, watchlist, verbose_eval = 100, early_stopping_rounds=100)

prediction = predictor.predict(x_test)

if not output_mode:
    result = my_log_loss(y_test, prediction)
    print ("fitness="+str(result))
     'feature_fraction': [0.8],
     'max_depth': [13],
     'num_leaves': [200],
     'bagging_fraction': [0.8],
     'bagging_freq': [5],
     'min_data_in_leaf': [15],
     'min_gain_to_split': [0],
     'num_iterations': [best_iterations],
     'lambda_l1': [0.01],
     'lambda_l2': [1],
     'verbose': [0],
     'is_unbalance': [True]
 }
 params = list(ParameterGrid(params))
 lgbtrain = lgb.Dataset(train_feat,
                        label=train_label,
                        feature_name=feat_names,
                        categorical_feature=categorical_feat_names)
 lgbtest = test_feat[feat_names]
 for param in params:
     clf = lgb.train(param,
                     lgbtrain,
                     num_boost_round=param['num_iterations'],
                     categorical_feature=categorical_feat_names)
     pred = clf.predict(lgbtest)
     predict_label = np.argmax(pred, axis=1)
     rows = test_feat['row_id'].values
     shop_ids = []
     for l in predict_label:
         shop_ids.append(map_dict[l])
     results = pd.DataFrame([list(rows), list(shop_ids)],
                            index=['row_id', 'shop_ids'])
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 4
}

# Multi-horizon training: one LightGBM model per each of the 16 columns of
# y_train (assumption — the columns are successive forecast steps; TODO
# confirm against the upstream data preparation).
MAX_ROUNDS = 1000
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i + 1))
    print("=" * 50)
    # Perishable items get 25% extra sample weight; the *4 concat presumably
    # matches a 4x row replication in X_train — TODO confirm.
    dtrain = lgb.Dataset(X_train,
                         label=y_train[:, i],
                         categorical_feature=cate_vars,
                         weight=pd.concat([items["perishable"]] * 4) * 0.25 +
                         1)
    dval = lgb.Dataset(X_val,
                       label=y_val[:, i],
                       reference=dtrain,
                       weight=items["perishable"] * 0.25 + 1,
                       categorical_feature=cate_vars)
    bst = lgb.train(params,
                    dtrain,
                    num_boost_round=MAX_ROUNDS,
                    valid_sets=[dtrain, dval],
                    early_stopping_rounds=50,
                    verbose_eval=50)
    print("\n".join(
        ("%s: %.2f" % x)
Пример #30
0
def kfold_lightgbm(train_df,
                   num_folds=5,
                   feat=None,
                   target=None,
                   classification=False):
    """K-fold LightGBM with out-of-fold predictions and gain importances.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Data holding both the features and the `target` column.
    num_folds : int
        Number of KFold splits (shuffled, fixed seed for reproducibility).
    feat : list of str, optional
        Explicit feature subset; defaults to all columns except `target`.
    target : str
        Name of the label column.
    classification : bool
        True -> multiclass model scored with accuracy; False -> regression
        model scored with MSE.

    Returns
    -------
    tuple (oof_preds, acc)
        Out-of-fold predictions for every row and the overall accuracy/MSE.
    """
    folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    feature_importance_df = pd.DataFrame()

    if feat is not None:
        feats = [f for f in feat if f not in [target]]
    else:
        feats = [f for f in train_df.columns if f not in [target]]

    # FIX: the two task flavours previously duplicated the whole params dict
    # and lgb.train call, differing only in objective/metric — merged here.
    params = {
        'num_leaves': 32,
        'max_depth': -1,
        'learning_rate': 0.05,
        "boosting": "gbdt",
        "verbosity": -1,
        "random_state": 2019
    }
    if classification:
        params['objective'] = 'multiclass'
        # NOTE(review): multiclass normally requires 'num_class' in params,
        # and predict() then returns an (n, n_class) matrix that cannot be
        # assigned into the 1-D oof_preds below — confirm this path is
        # actually exercised by callers.
    else:
        params['objective'] = 'regression'
        params['metric'] = 'mse'

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df[target])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            target].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            target].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

        reg = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=-1)

        oof_preds[valid_idx] = reg.predict(valid_x,
                                           num_iteration=reg.best_iteration)

        # Per-fold gain importances, log1p-compressed for nicer averaging.
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(
            reg.feature_importance(importance_type='gain',
                                   iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1

        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        if classification:
            print('Fold {} accuracy : {}'.format(
                n_fold + 1, accuracy_score(valid_y, oof_preds[valid_idx])))
        else:
            print('Fold {} mse : {}'.format(
                n_fold + 1, mean_squared_error(valid_y, oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y

    # display importances

    feature_importance_df = feature_importance_df.groupby('feature').agg(
        {'importance': ['mean']})
    feature_importance_df.columns = ['importance']
    feature_importance_df = feature_importance_df.sort_values(by='importance',
                                                              ascending=False)
    display_importances(feature_importance_df)

    # Overall OOF metric.  Note the (pred, true) argument order — both
    # metrics used here are symmetric, so the value is unaffected.
    if classification:
        acc = accuracy_score(oof_preds, train_df[target])
        print('LGBM oof accuracy: {}'.format(
            accuracy_score(oof_preds, train_df[target])))
    else:
        acc = mean_squared_error(oof_preds, train_df[target])
        print('LGBM oof mse: {}'.format(
            mean_squared_error(oof_preds, train_df[target])))
    return oof_preds, acc