'eta': 0.1,
    'eval_metric': 'auc',
    'max_depth': 4,
    'objective': 'binary:logistic',
    'silent': 1,
    'tree_method': 'hist',
    'nthread': 64,
    'seed': SEED
}

#dtrain = xgb.DMatrix(X, y)
#col = X.columns
#del X, y; gc.collect()

# Force a garbage-collection pass before the memory-heavy CV run.
gc.collect()
# Out-of-fold stacking with 5-fold CV.
# NOTE(review): ex.stacking signature inferred from this call site —
# 9999 presumably caps boosting rounds and esr=30 is the early-stopping
# patience; confirm against the ex module.
yhat, imp, ret = ex.stacking(X, y, param, 9999, nfold=5, esr=30)

# Persist feature importances, file name stamped with today's date.
imp.to_csv('imp_{}.csv'.format(datetime.today().date()), index=False)

# =============================================================================
# cv
# =============================================================================

#model = xgb.train(param, dbuild, NROUND, watchlist, verbose_eval=10,
#                  early_stopping_rounds=50)
#
#imp = ex.getImp(model)
#imp.to_csv('imp.csv', index=False)

#==============================================================================
# Marks script completion (presumably logging/timing — see the utils module).
utils.end(__file__)
# 예제 #2 (Example #2 — next scraped snippet; the stray "0" was a scrape artifact)
    ])
    return df


# Assemble the training matrix column-wise: question ids, each pickled
# feature block, and the text-vector features.
X = pd.concat([question_id] + [utils.read_pickles(f)
                               for f in train_files] + [vec],
              axis=1)
#X = pd.concat([question_id]+[subsample(f) for f in train_files], axis=1)
# Regression target: log1p-transformed answer score.
y = utils.read_pickles('../data/label')['answer_score'].map(np.log1p)

# =============================================================================
# xgb
# =============================================================================
# XGBoost hyper-parameters for the answer-score regression.
params = {
    'max_depth': 5,           # tree depth
    'eta': 0.1,               # learning rate
    'colsample_bytree': 0.7,  # column subsample per tree
    'silent': 1,              # NOTE(review): deprecated in xgboost >= 1.0 (use 'verbosity')
    'eval_metric': 'rmse',
    'objective': 'reg:linear',   # NOTE(review): alias renamed to 'reg:squarederror' in newer xgboost
    'tree_method': 'hist'     # histogram-based split finding
}

# Out-of-fold stacking predictions; by='question_id' presumably keeps
# rows of the same question inside one fold — confirm in ex.stacking.
# 9999 caps rounds, esr=50 is early-stopping patience.
yhat, imp, ret = ex.stacking(X,
                             y,
                             params,
                             9999,
                             esr=50,
                             by='question_id',
                             seed=SEED)
# 예제 #3 (Example #3 — next scraped snippet; the stray "0" was a scrape artifact)
}

y = X.is_duplicate
# Columns that must not leak into the feature matrix (ids, raw text, target).
col = ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']

lls = []    # per-iteration log-loss
iters = []  # per-iteration best boosting round

for i in range(loop):

    # Down-sample (rate p, presumably of the majority class — see utils).
    X_, y_ = utils.down_sampling(X, y, p)

    # .copy(): without it, `sub` is a view of X_ and the later
    # `sub['yhat'] = ...` raises SettingWithCopyWarning and may silently
    # fail or mutate the training frame.
    sub = X_[col].copy()
    X_.drop(col, axis=1, inplace=True)

    # Out-of-fold stacking, 5-fold CV, 30-round early stopping.
    yhat, imp, ret = ex.stacking(X_, y_, param, nround, esr=30, nfold=5)

    ll = log_loss(y_, yhat)
    iter_ = ret['best_iter']

    print('998-1_cv LOGLOSS:', ll)
    print('998-1_cv best iter:', iter_)

    lls.append(ll)
    iters.append(iter_)

print(np.mean(lls), np.mean(iters))

# Rank the LAST iteration's predictions by absolute error for inspection.
sub['yhat'] = yhat
sub['d'] = abs(sub.is_duplicate - sub.yhat)
sub.sort_values('d', ascending=False, inplace=True)
# 예제 #4 (Example #4 — next scraped snippet; the stray "0" was a scrape artifact)
# Keep the id column aside for the output feature file.
# .copy(): `test_sub` receives a 'yhat' column below; without the copy it
# is a view of `test` and the assignment hits SettingWithCopyWarning.
test_sub = test[['test_id']].copy()

for i in range(1):
    # Merge precomputed feature blocks (second arg: 1 = train side, 0 = test side).
    train = pd.merge(train, utils.load_010([0, 1, 2], 1), on='id', how='left')
    test = pd.merge(test,
                    utils.load_010([0, 1, 2], 0),
                    on='test_id',
                    how='left')

    # Drop columns whose names XGBoost cannot handle.
    col = get_valid_col(train.columns)
    train.drop(col, axis=1, inplace=True)  # was inplace=1; use the boolean flag
    test.drop(col, axis=1, inplace=True)

    # Out-of-fold train predictions plus test-set predictions (returned
    # in ret under 'test') with 30-round early stopping.
    yhat, imp, ret = ex.stacking(train.drop('id', axis=1),
                                 y_train,
                                 param,
                                 nround,
                                 esr=30,
                                 test=test.drop('test_id', axis=1))
    gc.collect()

    print('log_loss:', log_loss(y_train, yhat))
    train_sub['yhat'] = yhat
    test_sub['yhat'] = ret.get('test')

train_sub.drop('is_duplicate',
               axis=1).to_csv('../feature/train_f009_xgb.csv.gz',
                              index=False,
                              compression='gzip')
# BUG FIX: the original wrote `train_sub` to the *test* feature file,
# discarding the test-set predictions assembled above; write test_sub.
test_sub.to_csv('../feature/test_f009_xgb.csv.gz',
                index=False,
                compression='gzip')
# 예제 #5 (Example #5 — next scraped snippet; the stray "0" was a scrape artifact)
y_train = X.is_duplicate
# .copy(): `sub` later receives a 'yhat' column; without the copy it is a
# view of X and the assignment triggers SettingWithCopyWarning.
sub = X[['id', 'is_duplicate']].copy()
X.drop(col, axis=1, inplace=True)  # was inplace=1; pass the boolean flag


def get_valid_col(col):
    """Return the column names containing any of ',', '[', ']' or '>'.

    XGBoost rejects feature names with these characters, so callers drop
    the returned columns before training.  (NOTE(review): despite the
    name, this returns the *invalid* columns — the ones to remove; name
    kept for caller compatibility.)

    Parameters
    ----------
    col : iterable of str
        Candidate column names.

    Returns
    -------
    list of str
        Names containing at least one forbidden character, in input order.
    """
    bad_chars = (',', '[', ']', '>')
    # `ch in c` is the idiomatic membership test (was `c.count(ch) > 0`).
    return [c for c in col if any(ch in c for ch in bad_chars)]


# Drop columns whose names XGBoost cannot handle.
col = get_valid_col(X.columns)
X.drop(col, axis=1, inplace=True)  # was inplace=1; pass the boolean flag

# Out-of-fold stacking predictions with 30-round early stopping.
yhat, imp, ret = ex.stacking(X, y_train, param, nround, esr=30)

sub['yhat'] = yhat

# Persist CV predictions and feature importances.
sub[['id', 'yhat']].to_csv('../output/cv.csv.gz',
                           index=False,
                           compression='gzip')
imp.to_csv('../output/imp.csv.gz', index=False, compression='gzip')

print(log_loss(y_train, yhat))
"""
eta: 0.02
CV: 0.317928303359 ==> LB: 0.28018

eta: 0.2
CV: 0.363839232148