Example #1
def model_pred(trn_x, trn_y, val_x, val_y, tst_x):
    global trn_tmp_x
    global trn_tmp_y
    global val_tmp_x
    global val_tmp_y
    trn_tmp_x = trn_x
    trn_tmp_y = trn_y
    val_tmp_x = val_x
    val_tmp_y = val_y

    global pred
    pred = []

    modelBO = BO(model_run, {
        'max_depth': (1, 10),
        'min_child_samples': (10, 600),
        'min_child_weight': (0.001, 20),
        'colsample_bytree': (0.1, 1),
        'subsample': (0.5, 1),
        'subsample_freq': (10, 100),
        'num_leaves': (10, 600),
        'alpha': (0, 20),
        'scale_pos_weight': (1, 4)
    },
                 random_state=1987)
    modelBO.maximize(init_points=10, n_iter=20, acq='rnd')

    # Refit with the best hyper-parameters found by the Bayesian optimisation.
    best = modelBO.res['max']['max_params']
    model = lgb.LGBMClassifier(
        boosting_type='gbdt',
        n_estimators=2000,
        max_depth=int(round(best['max_depth'])),
        objective='binary',  # binary classification (log-loss objective)
        learning_rate=0.01,
        num_leaves=int(best['num_leaves']),
        subsample=best['subsample'],  # bagging_fraction
        subsample_freq=int(best['subsample_freq']),
        min_child_samples=int(best['min_child_samples']),
        min_child_weight=best['min_child_weight'],  # min_sum_hessian_in_leaf
        colsample_bytree=best['colsample_bytree'],  # feature_fraction
        scale_pos_weight=best['scale_pos_weight'],
        reg_alpha=best['alpha'],
        reg_lambda=1.3,
        n_jobs=8)

    fit_model = model.fit(trn_tmp_x,
                          trn_tmp_y,
                          eval_set=[(val_tmp_x, val_tmp_y)],
                          eval_metric=gini_metric,
                          early_stopping_rounds=100,
                          verbose=False)

    max_pred_ind = (np.in1d(modelBO.res['all']['values'],
                            modelBO.res['max']['max_val'])).argmax()
    return pred[max_pred_ind], fit_model.predict_proba(
        tst_x, num_iteration=model.best_iteration_)[:, 1]
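The BO objective model_run referenced above is not shown on this page. Below is a minimal, hypothetical sketch of what it presumably does, assuming it trains the same LGBMClassifier on the module-level fold, appends the validation predictions to the global pred list (so that pred and modelBO.res['all']['values'] stay index-aligned, which is what the pred[max_pred_ind] lookup relies on), and returns the validation Gini; eval_gini follows the later examples.

def model_run(max_depth, min_child_samples, min_child_weight, colsample_bytree,
              subsample, subsample_freq, num_leaves, alpha, scale_pos_weight):
    # Hypothetical objective for the optimiser: train on the fold stored in
    # the module-level globals, keep the validation predictions, and return
    # the score that BO maximises.
    model = lgb.LGBMClassifier(
        boosting_type='gbdt',
        n_estimators=2000,
        learning_rate=0.01,
        max_depth=int(round(max_depth)),
        num_leaves=int(num_leaves),
        min_child_samples=int(min_child_samples),
        min_child_weight=min_child_weight,
        colsample_bytree=colsample_bytree,
        subsample=subsample,
        subsample_freq=int(subsample_freq),
        scale_pos_weight=scale_pos_weight,
        reg_alpha=alpha,
        reg_lambda=1.3,
        n_jobs=8)
    model.fit(trn_tmp_x, trn_tmp_y,
              eval_set=[(val_tmp_x, val_tmp_y)],
              eval_metric=gini_metric,
              early_stopping_rounds=100,
              verbose=False)
    val_pred = model.predict_proba(val_tmp_x,
                                   num_iteration=model.best_iteration_)[:, 1]
    pred.append(val_pred)  # consumed later via pred[max_pred_ind]
    return eval_gini(val_tmp_y, val_pred)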
Example #2
def model_pred(trn_x, trn_y, val_x, val_y, tst_x):
    global trn_tmp_x
    global trn_tmp_y
    global val_tmp_x
    global val_tmp_y
    trn_tmp_x = trn_x
    trn_tmp_y = trn_y
    val_tmp_x = val_x
    val_tmp_y = val_y

    global pred
    pred = []

    modelBO = BO(model_run, {
        'depth': (3, 10),
        'reg_lambda': (0, 20),
        'feature_fraction': (0.5, 1)
    },
                 random_state=1987)
    #    modelBO.explore({'depth': [6],
    #                     'reg_lambda': [14],
    #                     'feature_fraction': [1]})
    modelBO.maximize(init_points=10, n_iter=20, acq='rnd')

    # Refit CatBoost with the best hyper-parameters found by the optimisation.
    best = modelBO.res['max']['max_params']
    model = catb.CatBoostClassifier(
        iterations=2000,
        learning_rate=0.05,
        depth=int(round(best['depth'])),
        l2_leaf_reg=best['reg_lambda'],
        rsm=best['feature_fraction'],  # fraction of features used per split
        loss_function='Logloss',
        thread_count=16,
        random_seed=1987,
        use_best_model=True,
        od_type='Iter',  # overfitting detector: stop after od_wait bad rounds
        od_wait=100,
        eval_metric=gini_metric(),
        verbose=False)
    cat_f = data_categ(trn_tmp_x)  # indices of the categorical features
    fit_model = model.fit(X=trn_tmp_x,
                          y=trn_tmp_y,
                          cat_features=cat_f,
                          use_best_model=True,
                          eval_set=(val_tmp_x, val_tmp_y),
                          verbose=False)

    max_pred_ind = (np.in1d(modelBO.res['all']['values'],
                            modelBO.res['max']['max_val'])).argmax()
    return pred[max_pred_ind], fit_model.predict_proba(tst_x)[:, 1]
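data_categ is another helper that is not defined on this page; CatBoost's cat_features argument expects the positions (or names) of the categorical columns, so a minimal, hypothetical version could simply flag every non-float column (adapt the rule to whatever convention the real dataset uses).

import numpy as np

def data_categ(df):
    # Hypothetical: return the column indices CatBoost should treat as categorical.
    return [i for i, c in enumerate(df.columns)
            if not np.issubdtype(df[c].dtype, np.floating)]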
Example #3
def forloop(j):
    trn_tmp_x = trn_x.iloc[cv_ind[~flag_valid[:, j]]]
    trn_tmp_y = trn_y.iloc[cv_ind[~flag_valid[:, j]]]
    val_tmp_x = trn_x.iloc[cv_ind[flag_valid[:, j]]]
    val_tmp_y = trn_y.iloc[cv_ind[flag_valid[:, j]]]

    def one_pass(C):
        global pred_trn, pred_tst
        pred_tst[:, j] = 0
        print('jfold:', j)
        pred_trn_tmp, pred_tst_tmp, best_iter = m1logit.model_pred(
            trn_tmp_x, trn_tmp_y, val_tmp_x, val_tmp_y, tst_x, C)
        print('double check gini of jfold=', j, ' pred = ',
              eval_gini(val_tmp_y, pred_trn_tmp))
        pred_trn[cv_ind[flag_valid[:, j]], j] = pred_trn_tmp
        pred_tst[:, j] = pred_tst_tmp

        for i in range(nfold):
            if i == j:
                continue
            else:
                trn_tmp_x_ = trn_x.iloc[cv_ind[~flag_valid[:, i]
                                               & ~flag_valid[:, j]]]
                trn_tmp_y_ = trn_y.iloc[cv_ind[~flag_valid[:, i]
                                               & ~flag_valid[:, j]]]
                val_tmp_x_ = trn_x.iloc[cv_ind[flag_valid[:, i]]]
                val_tmp_y_ = trn_y.iloc[cv_ind[flag_valid[:, i]]]
                print('ifold:', i)
                pred_trn_tmp, pred_tst_tmp, best_iter = m1logit.model_pred(
                    trn_tmp_x_, trn_tmp_y_, val_tmp_x_, val_tmp_y_, tst_x, C)
                print('double check gini of ifold=', i, ' pred = ',
                      eval_gini(val_tmp_y_, pred_trn_tmp))
                pred_trn[cv_ind[flag_valid[:, i]], j] = pred_trn_tmp
                pred_tst[:, j] = pred_tst[:, j] + pred_tst_tmp

        return eval_gini(trn_y, pred_trn[:, j])

    modelBO = BO(one_pass, {'C': (0.001, 3.0)}, random_state=1987)
    modelBO.maximize(init_points=20, n_iter=20, acq='rnd')

    one_pass(modelBO.res['max']['max_params']['C'])
    #    one_pass()

    print "\nGini for full training set ", j, ':', eval_gini(
        trn_y, pred_trn[:, j])

    return [pred_trn[:, j], pred_tst[:, j] / float(nfold)]
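The fold loops in Examples #3 through #5 lean on eval_gini, which is also not shown here. For a binary target the normalised Gini coefficient equals 2 * AUC - 1, so a self-contained stand-in (assuming that is indeed what the original helper computes) is:

from sklearn.metrics import roc_auc_score

def eval_gini(y_true, y_prob):
    # Normalised Gini coefficient for binary classification: 2 * AUC - 1.
    return 2.0 * roc_auc_score(y_true, y_prob) - 1.0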
Example #4
def forloop(j):
    trn_tmp_x = trn_x.iloc[cv_ind[~flag_valid[:, j]]]
    trn_tmp_y = trn_y.iloc[cv_ind[~flag_valid[:, j]]]
    val_tmp_x = trn_x.iloc[cv_ind[flag_valid[:, j]]]
    val_tmp_y = trn_y.iloc[cv_ind[flag_valid[:, j]]]

    def one_pass(max_depth, min_child_samples, min_child_weight,
                 colsample_bytree, subsample, subsample_freq, num_leaves,
                 alpha, scale_pos_weight):
        global pred_trn
        print('jfold:', j)
        pred_trn_tmp, pred_tst_tmp = m1lgbrf.model_pred(
            trn_tmp_x, trn_tmp_y, val_tmp_x, val_tmp_y, tst_x, max_depth,
            min_child_samples, min_child_weight, colsample_bytree, subsample,
            subsample_freq, num_leaves, alpha, scale_pos_weight)
        print('double check gini of jfold=', j, ' pred = ',
              eval_gini(val_tmp_y, pred_trn_tmp))
        pred_trn[cv_ind[flag_valid[:, j]], j] = pred_trn_tmp

        for i in range(nfold):
            if i == j:
                continue
            else:
                trn_tmp_x_ = trn_x.iloc[cv_ind[~flag_valid[:, i]
                                               & ~flag_valid[:, j]]]
                trn_tmp_y_ = trn_y.iloc[cv_ind[~flag_valid[:, i]
                                               & ~flag_valid[:, j]]]
                val_tmp_x_ = trn_x.iloc[cv_ind[flag_valid[:, i]]]
                val_tmp_y_ = trn_y.iloc[cv_ind[flag_valid[:, i]]]
                print('ifold:', i)
                pred_trn_tmp, pred_tst_tmp = m1lgbrf.model_pred(
                    trn_tmp_x_, trn_tmp_y_, val_tmp_x_, val_tmp_y_, tst_x,
                    max_depth, min_child_samples, min_child_weight,
                    colsample_bytree, subsample, subsample_freq, num_leaves,
                    alpha, scale_pos_weight)
                print('double check gini of ifold=', i, ' pred = ',
                      eval_gini(val_tmp_y_, pred_trn_tmp))
                pred_trn[cv_ind[flag_valid[:, i]], j] = pred_trn_tmp

        return eval_gini(trn_y, pred_trn[:, j])

    modelBO = BO(one_pass, {
        'max_depth': (1, 10),
        'min_child_samples': (10, 600),
        'min_child_weight': (0.001, 20),
        'colsample_bytree': (0.1, 1),
        'subsample': (0.5, 1),
        'subsample_freq': (10, 100),
        'num_leaves': (10, 600),
        'alpha': (0, 20),
        'scale_pos_weight': (1, 4)
    },
                 random_state=1987)
    modelBO.maximize(init_points=10, n_iter=20, acq='rnd')

    one_pass(modelBO.res['max']['max_params']['max_depth'],
             modelBO.res['max']['max_params']['min_child_samples'],
             modelBO.res['max']['max_params']['min_child_weight'],
             modelBO.res['max']['max_params']['colsample_bytree'],
             modelBO.res['max']['max_params']['subsample'],
             modelBO.res['max']['max_params']['subsample_freq'],
             modelBO.res['max']['max_params']['num_leaves'],
             modelBO.res['max']['max_params']['alpha'],
             modelBO.res['max']['max_params']['scale_pos_weight'])

    print "\nGini for full training set ", j, ':', eval_gini(
        trn_y, pred_trn[:, j])

    return pred_trn[:, j]
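The fold bookkeeping shared by both forloop variants (cv_ind, flag_valid, nfold, pred_trn) comes from the surrounding script and is not reproduced here. One plausible construction, sketched purely as an assumption about how the folds were built, uses StratifiedKFold over the trn_x/trn_y frames from these examples:

import numpy as np
from sklearn.model_selection import StratifiedKFold

nfold = 5
cv_ind = np.arange(len(trn_y))                     # row positions into trn_x / trn_y
flag_valid = np.zeros((len(trn_y), nfold), dtype=bool)
skf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=1987)
for j, (_, val_idx) in enumerate(skf.split(trn_x, trn_y)):
    flag_valid[val_idx, j] = True                  # fold j's validation rows
pred_trn = np.zeros((len(trn_y), nfold))           # out-of-fold predictions, one column per fold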
Example #5
    # Tail of the one_pass objective maximised below: merge each fold's
    # out-of-fold predictions and return their overall Gini score.
    for i in range(nfold):
        pred_trn[cv_ind[flag_valid[:, i]]] = pred_trn_separate[
            cv_ind[flag_valid[:, i]], i]

    return eval_gini(trn_y, pred_trn)


t0 = timer()

modelBO = BO(one_pass, {
    'max_depth': (1, 10),
    'min_child_samples': (10, 600),
    'min_child_weight': (0.001, 20),
    'colsample_bytree': (0.1, 1),
    'subsample': (0.5, 1),
    'subsample_freq': (10, 100),
    'num_leaves': (10, 600),
    'alpha': (0, 20),
    'scale_pos_weight': (1, 4)
},
             random_state=1987)
modelBO.maximize(init_points=50, n_iter=50, acq='rnd')
best = modelBO.res['max']['max_params']
print(best['max_depth'], best['min_child_samples'], best['min_child_weight'],
      best['colsample_bytree'], best['subsample'], best['subsample_freq'],
      best['num_leaves'], best['alpha'], best['scale_pos_weight'])
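All five examples read results through the pre-1.0 bayes_opt dictionary layout (res['max']['max_params'], res['all']['values']), and acq='rnd' is not one of the stock acquisition functions, so the optimizer here is likely a local modification. If BO is bayes_opt.BayesianOptimization from a 1.x release, the equivalent lookups would roughly be:

best_params = modelBO.max['params']                # was res['max']['max_params']
best_value = modelBO.max['target']                 # was res['max']['max_val']
all_values = [r['target'] for r in modelBO.res]    # was res['all']['values']
print(best_params['max_depth'], best_params['num_leaves'], best_value)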