Example #1
test_X = test[feature_names]

# training
print('training...')

clf = LGBMClassifier(learning_rate=0.2,
                     n_estimators=1000,
                     subsample=0.4,
                     subsample_freq=1,
                     colsample_bytree=0.4,
                     random_state=2019,
                     num_leaves=10,
                     min_child_samples=20,
                     max_depth=3)
clf.fit(train_X, train_y,
        eval_set=[(train_X, train_y), (val_X, val_y)],
        early_stopping_rounds=10)
joblib.dump(clf, 'treemodel/lgb_final.model')

# num_leaves, min_child_samples and subsample_freq are LightGBM-only
# parameters, so they are omitted from the XGBoost counterpart.
clf_xgb = XGBClassifier(learning_rate=0.2,
                        n_estimators=1000,
                        subsample=0.4,
                        colsample_bytree=0.4,
                        random_state=2019,
                        max_depth=3)
clf_xgb.fit(train_X, train_y,
            eval_set=[(train_X, train_y), (val_X, val_y)],
            early_stopping_rounds=10)
Example #2
    amt_oof = np.zeros(train_num)
    prob_oof = np.zeros((train_num, 33))
    test_pred_prob = np.zeros((x_test.shape[0], 33))
    for i, (trn_idx, val_idx) in enumerate(skf.split(x_train, y_train)):
        print(i, 'fold...')

        trn_x, trn_y = x_train[trn_idx], y_train[trn_idx]
        val_x, val_y = x_train[val_idx], y_train[val_idx]
        val_repay_amt = label_amt[val_idx]

        val_due_amt = x_train_due_amt.iloc[val_idx]

        clf.fit(trn_x,
                trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                early_stopping_rounds=100,
                verbose=5)
        joblib.dump(clf, '../model/lgb.pkl')
        # shape = (-1, 33)
        val_pred_prob_everyday = clf.predict_proba(
            val_x, num_iteration=clf.best_iteration_)
        prob_oof[val_idx] = val_pred_prob_everyday
        # probability mass the model assigns to the true repay day of each row
        val_pred_prob_today = [
            val_pred_prob_everyday[j][val_y[j]]
            for j in range(val_pred_prob_everyday.shape[0])
        ]
        val_pred_repay_amt = val_due_amt['due_amt'].values * val_pred_prob_today
        print('val rmse:',
              np.sqrt(mean_squared_error(val_repay_amt, val_pred_repay_amt)))
        print('val mae:',
              mean_absolute_error(val_repay_amt, val_pred_repay_amt))
Example #3
def main():
    train_transaction = pd.read_csv('../data/train_transaction.csv')
    test_transaction = pd.read_csv('../data/test_transaction.csv')
    test_transaction['split'] = 2
    train_transaction['split'] = 1
    transaction = pd.concat([train_transaction, test_transaction])

    aer = pd.read_csv('../data/ae_result.csv')
    transaction = pd.merge(transaction,
                           aer[['TransactionID', 'autoscore']],
                           on='TransactionID',
                           how='left')

    categoricalDomain = [
        'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'P_emaildomain',
        'ProductCD', 'R_emaildomain', 'card4', 'card6'
    ]
    continuousDomain = []
    for i in transaction:
        if i not in categoricalDomain and i != 'TransactionID' and i != 'split':
            continuousDomain.append(i)

    transaction = transaction.fillna(-1)
    step1 = ('label_encode', label_encoder_sk(cols=categoricalDomain))

    pipeline = Pipeline(steps=[step1])
    transaction_new = pipeline.fit_transform(transaction)

    feature = [
        f for f in transaction_new.columns
        if f != 'TransactionID' and f != 'split' and f != 'isFraud'
    ]

    transaction_new.to_csv('transaction_new.csv', index=False)

    data = transaction_new[transaction_new['split'] == 1]
    # .copy() avoids SettingWithCopyWarning when isFraud is assigned later
    valid = transaction_new[transaction_new['split'] == 2].copy()
    train, test = train_test_split(data, test_size=0.3, random_state=42)
    train_x = train[feature]
    test_x = test[feature]
    train_y = train['isFraud']
    test_y = test['isFraud']

    parms = {
        # 'x_train':X_train,
        # 'y_train':y_train,
        'num_leaves': (5, 40),
        'colsample_bytree': (0.1, 0.5),
        'drop_rate': (0.1, 1),
        'learning_rate': (0.001, 0.1),
        'max_bin': (10, 1000),
        'max_depth': (2, 5),
        'min_split_gain': (0.1, 0.9),
        'min_child_samples': (2, 10000),
        'n_estimators': (50, 2000),
        'reg_alpha': (0.1, 1000),
        'reg_lambda': (0.1, 1000),
        'sigmoid': (0.1, 1),
        'subsample': (0.1, 1),
        'subsample_for_bin': (100, 50000),
        'subsample_freq': (1, 10)
    }

    def roc_auc_score_fix(y_true, y_score):
        score = metrics.roc_auc_score(y_true, y_score)
        if score > 0.8:
            return 0
        else:
            return score

    # Parameter type hints for the search; only the parameters that actually
    # appear in parms need to be listed here.
    intdeal = [
        'max_bin', 'max_depth', 'max_drop', 'min_child_samples',
        'min_child_weight', 'n_estimators', 'num_leaves', 'scale_pos_weight',
        'subsample_for_bin', 'subsample_freq'
    ]  # integer parameters
    middledeal = [
        'colsample_bytree', 'drop_rate', 'learning_rate', 'min_split_gain',
        'skip_drop', 'subsample'
    ]  # floats restricted to (0, 1)
    maxdeal = ['reg_alpha', 'reg_lambda', 'sigmoid']  # floats that may exceed 1

    others = {'is_unbalance': True, 'random_state': 24}

    bayesopsObj = bayes_ops(estimator=LGBMClassifier,
                            param_grid=parms,
                            cv=5,
                            intdeal=intdeal,
                            middledeal=middledeal,
                            maxdeal=maxdeal,
                            score_func=make_scorer(
                                score_func=roc_auc_score_fix,
                                greater_is_better=True,
                                needs_threshold=True),
                            init_points=3,
                            n_iter=10,
                            acq="ucb",
                            kappa=0.1,
                            others=others)
    bayesopsObj.run(X=train_x, Y=train_y)
    parms = bayesopsObj.baseparms
    print(parms)

    clf = LGBMClassifier(**parms)
    clf.fit(train_x, train_y)
    train_y_pred = clf.predict_proba(train_x)[:, 1]
    train_ks = cal_ks_scipy(train_y_pred, train_y)
    y_pred = clf.predict_proba(test_x)[:, 1]
    test_ks = cal_ks_scipy(y_pred, test_y)
    print(train_ks, test_ks)
    tr_auc = metrics.roc_auc_score(train_y, train_y_pred)
    te_auc = metrics.roc_auc_score(test_y, y_pred)
    print(tr_auc, te_auc)

    valid['isFraud'] = clf.predict_proba(
        valid[clf._Booster.feature_name()])[:, 1]
    valid[['TransactionID', 'isFraud']].to_csv('submitops.csv', index=False)
Example #4
clf = LGBMClassifier(random_state=50, n_jobs=-1)
best_clf = GridSearchCV(clf,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1,
                        param_grid={
                            'n_estimators': [100, 300, 600],
                            'reg_lambda': [0.001, 0.01, 0.1, 1]
                        })
best_clf.fit(train_data, train_label)
print(
    "Selected best LGB model with n_estimators = {} and best_score = {}".format(
        best_clf.best_params_['n_estimators'], best_clf.best_score_))
#%%
for a in [100, 300, 600, 1000]:
    for b in [0.0001, 0.001, 0.01, 0.1, 1, 10]:
        LBMclf = LGBMClassifier(random_state=50,
                                n_jobs=-1,
                                n_estimators=a,
                                reg_lambda=b)
        LBMclf.fit(train_data, train_label)
        print(
            "The resulting ROC AUC of the LightGBM with n_estimators={} and reg_lambda={} on test data is"
            .format(a, b),
            roc_auc_score(test_label.tolist(),
                          LBMclf.predict_proba(test_data)[:, 1].tolist()))

#%%
# Make the model with the specified regularization parameter
clf = LogisticRegression()
best_clf = GridSearchCV(clf,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1,
                        param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]})
best_clf.fit(train_data, train_label)
Example #5
params = {
    'boosting_type': 'goss',
    'objective': 'binary',
    'is_unbalance': True,
    'categorical_feature': [0, 1, 3, 5, 6, 12, 15, 16, 17, 18, 19, 20],
    'n_jobs': 4,
    'learning_rate': 0.01,
    #'n_estimators':n_estimators_1,
    'num_leaves': 75,
    'max_depth': 6,
    'min_child_samples': 40,
    'colsample_bytree': 0.4
}

lg = LGBMClassifier(silent=False, **params)
lg.fit(X_train, y_train)

# ## Save the model for later testing

# In[26]:

import pickle

pickle.dump(lg, open("HappyBank_LightGBM_.pkl", 'wb'))
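As a sketch of the "later testing" the heading refers to, the pickled model can be reloaded and scored; the filename comes from the snippet, while X_test here is a hypothetical held-out frame that the original does not define.

import pickle

# Reload the pickled model saved above.
with open("HappyBank_LightGBM_.pkl", 'rb') as f:
    lg_loaded = pickle.load(f)

# Positive-class probabilities on a hypothetical held-out frame X_test.
probs = lg_loaded.predict_proba(X_test)[:, 1]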

# ### Feature importance

# In[27]:

df = pd.DataFrame({
    "columns": list(feat_names),
    "importance": list(lg.feature_importances_)
}).sort_values(by='importance', ascending=False)
Example #6
clf = LGBMClassifier(subsample=0.8,
                     colsample_bytree=0.8,
                     random_state=2019,
                     bagging_fraction=0.9,
                     bagging_freq=8,
                     lambda_l1=0.5,
                     lambda_l2=0,
                     cat_smooth=10,
                     is_unbalance=True,
                     metric=None)

print('************** training **************')
print(train_x.shape, val_x.shape)
clf.fit(train_x,
        train_y,
        eval_set=[(val_x, val_y)],
        eval_metric='auc',
        early_stopping_rounds=50,
        verbose=100)

train_df = pd.read_csv('../user_data/train_df.csv')
train_df = train_df.merge(serial, how='left', on=['serial_number', 'model'])
train_df['dt'] = pd.to_datetime(train_df['dt'], format='%Y-%m-%d %H:%M:%S')
train_df['dt_first'] = pd.to_datetime(train_df['dt_first'],
                                      format='%Y-%m-%d %H:%M:%S')
train_df['days'] = (train_df['dt'] - train_df['dt_first']).dt.days

train_df = train_df.merge(tag, how='left', on=['serial_number', 'model'])
train_df['days_1'] = (train_df['dt'] - train_df['fault_time_1']).dt.days
train_df.loc[train_df.days_1 <= 0, 'tag'] = None
train_df.loc[train_df.days_1 <= 0, 'days_1'] = None
train_df['days_2'] = (train_df['fault_time_1'] - train_df['dt_first']).dt.days
Example #7
import pandas as pd
import numpy as np
from lightgbm.sklearn import LGBMClassifier
import xgboost as xgb
from scipy.stats import rankdata
import gc

# Load the preprocessed data
ts = pd.read_csv('../input/test_3comb_nmf.csv').sort_values(
    by='test_id').reset_index(drop=True)
X_cols = ts.columns.drop(['date_time', 'test_id', 'is_arrested']).tolist()

n_ = 5
preds = np.zeros([len(ts), n_])

for i in range(n_):
    gc.collect()
    d_sample = pd.read_csv('../input/under_sampled_{}_lda_nmf.csv'.format(
        101 + i))  # build the model multiple times for bagging
    #    clf = xgb.XGBClassifier(eta=0.05, min_child_weight=1, subsample=0.9, colsample_bylevel = 0.2, reg_lambda=1, reg_alpha=0.4)
    clf = LGBMClassifier()
    clf.fit(d_sample[X_cols], d_sample['is_arrested'])
    preds[:, i] = rankdata(clf.predict_proba(ts[X_cols])[:, 1]) / len(ts)

res = pd.DataFrame()
res['test_id'] = ts['test_id']
res['pred_proba'] = preds.mean(axis=1)
res = res.sort_values(by='test_id')
res.to_csv('../output/sub.csv')
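A note on the rankdata(...) / len(ts) line above: it replaces each model's raw probabilities with normalized ranks in (0, 1], so the bagged average combines models on a comparable scale even when their probability calibrations differ. A self-contained toy illustration (the values are made up):

import numpy as np
from scipy.stats import rankdata

scores = np.array([0.02, 0.90, 0.35, 0.35])
# Ties receive the average rank; dividing by the length maps into (0, 1].
print(rankdata(scores) / len(scores))  # -> [0.25, 1.0, 0.625, 0.625]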
Example #8
fea_imp_list = []
clf = LGBMClassifier(learning_rate=0.01,
                     n_estimators=6000,
                     num_leaves=255,
                     subsample=0.9,
                     colsample_bytree=0.8,
                     random_state=2019,
                     metric=None,
                     n_jobs=20)

print('************** training **************')
clf.fit(
    train_x,
    train_y,
    eval_set=[(val_x, val_y)],
    eval_metric='auc',
    # categorical_feature=cate_cols,
    early_stopping_rounds=200,
    verbose=50)
print('runtime:', time.time() - t)

print('************** validation result **************')
best_rounds = clf.best_iteration_
best_score = clf.best_score_['valid_0']['auc']
val_pred = clf.predict_proba(val_x)[:, 1]
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)

print(
    '=============================================== whole dataset training  ==============================================='
)
Example #9
    fea_imp_list = []
    clf = LGBMClassifier(
        learning_rate=0.07,
        n_estimators=300,
        num_leaves=512,
        subsample=0.8,
        max_depth=-1,
        colsample_bytree=0.8,
        random_state=2018,
        is_unbalance=True,
        objective='multiclass',
        #     metric=['binary_logloss']
    )  # 'binary_error','xentropy'
    print('************** training **************')

    clf = clf.fit(
        train_x[smartcol],
        train_y,
        eval_set=[(val_x[smartcol], val_y)],
        eval_metric=['multi_logloss'],  # ,'binary_error','xentropy'
        categorical_feature='auto',
        early_stopping_rounds=50,
        verbose=50)
    tag_pred = clf.predict_proba(data_45678[smartcol])
    fea_imp_list.append(clf.feature_importances_)

    # 保存模型
    with open('tag_model.pkl', 'wb') as pickle_file:
        pickle.dump(clf, pickle_file)
Example #10
test = data[data.TARGET == -9999]
X = train.drop(['EID', 'TARGET'], axis=1).values
y = train.TARGET.values.ravel()

clf = LGBMClassifier(boosting_type='gbdt',
                     objective='binary',
                     max_depth=-1,
                     learning_rate=0.01,
                     n_estimators=2000,
                     subsample=0.6,
                     colsample_bytree=0.6,
                     reg_alpha=5.39,
                     reg_lambda=10,
                     num_leaves=2**6,
                     min_child_weight=10,
                     min_split_gain=0.05,
                     scale_pos_weight=1,
                     random_state=999,
                     n_jobs=-1)

clf.fit(X, y)


def get_res(clf, path='cv.07470.csv'):
    res = clf.predict_proba(test.drop(['EID', 'TARGET'], axis=1))[:, 1]
    test[['EID']].assign(FORTARGET=0, PROB=res).to_csv(path, index=False)


if __name__ == '__main__':
    get_res(clf)
Example #11
def train_lightgbm(verbose=True):
    """Train a boosted tree with LightGBM."""
    logger.info("Training with LightGBM")
    df = pd.read_csv(STAGE1_LABELS)
    """
    data = []
    for id in df['id'].tolist():
        dt = np.load(FEATURE_FOLDER + '/%s.npy' % str(id))
        dt = np.r_[np.mean(dt, axis=0), np.max(dt, axis=0), np.min(dt, axis=0), np.var(dt, axis=0)]
        data.append(dt)
    x = np.array(data)[:, FEATURE]
    """
    x = np.array([
        np.load(FEATURE_FOLDER + '/%s.npy' % str(id))
        for id in df['id'].tolist()
    ])
    """
    x2 = np.array([np.load(FEATURE_FOLDER_2 + '/%s.npy' % str(id))
                   for id in df['id'].tolist()])[:, FEATURE]
    x = np.c_[x, x2]
    """
    """
    x2 = np.array([np.r_[np.mean(np.load(FEATURE_FOLDER_2 + '/%s.npy' % str(id)), axis=0)]
                   for id in df['id'].tolist()])

    """
    # x = np.array([np.load(FEATURE_FOLDER + '/%s.npy' % str(id))[:30].flatten()
    #              for id in df['id'].tolist()])[:, FEATURE]
    y = df['cancer'].values  # .as_matrix() was removed from pandas
    """
    trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=42, stratify=y,
                                                                   test_size=0.20)
    """
    logger.info('data size: {}'.format(x.shape))
    all_params = {
        'max_depth': [3, 5, 10],
        'learning_rate': [0.06, 0.1, 0.2],
        'n_estimators': [1500],
        'min_child_weight': [0],
        'subsample': [1],
        'colsample_bytree': [0.5, 0.6],
        'boosting_type': ['gbdt'],
        #'num_leaves': [2, 3],
        #'reg_alpha': [0.1, 0, 1],
        #'reg_lambda': [0.1, 0, 1],
        #'is_unbalance': [True, False],
        #'subsample_freq': [1, 3],
        'seed': [2261]
    }
    min_score = 100
    min_params = None
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=871)
    for params in ParameterGrid(all_params):
        list_score = []
        for train, test in cv.split(x, y):
            trn_x = x[train]
            val_x = x[test]
            trn_y = y[train]
            val_y = y[test]

            clf = LGBMClassifier(**params)
            clf.fit(
                trn_x,
                trn_y,
                eval_set=[(val_x, val_y)],
                verbose=verbose,
                # eval_metric=log_loss,
                early_stopping_rounds=300)
            _score = log_loss(val_y, clf.predict_proba(val_x)[:, 1])
            # logger.debug('   _score: %s' % _score)
            list_score.append(_score)
        score = np.mean(list_score)
        params['n_estimators'] = clf.best_iteration_
        logger.info('param: %s' % (params))
        logger.info('score: %s (avg %s min %s max %s)' %
                    (score, np.mean(list_score), np.min(list_score),
                     np.max(list_score)))
        if min_score > score:
            min_score = score
            min_params = params
        logger.info('best score: %s' % min_score)
        logger.info('best_param: %s' % (min_params))
    """
    imp = pd.DataFrame(clf.feature_importances_, columns=['imp'])
    with open('features.py', 'a') as f:
        f.write('FEATURE = [' + ','.join(map(str, imp[imp['imp'] > 0].index.values)) + ']\n')
    """
    clf = LGBMClassifier(**min_params)
    clf.fit(x, y)

    return clf
Example #12
lgb = LGBMClassifier(subsample_for_bin=800,
                     n_jobs=4)
# # specify your configurations as a dict
# param_grid_xgboost={'min_child_samples':np.arange(10,100,10)}
# start_time=time.clock()
# grid_lgb=GridSearchCV(lgb,param_grid_xgboost,cv=5,scoring='accuracy')
# grid_lgb.fit(X,y)
# endtime=time.clock()
# print('score',grid_lgb.grid_scores_)
# print('Xgboost_best_estimator_',grid_lgb.best_estimator_)
# print('Xgboost_best_score_',grid_lgb.best_score_)
# print('Xgboost_best_params_',grid_lgb.best_params_)
# print("run_time",endtime-start_time)

start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
score_all = 0
kf = KFold(n_splits=5, shuffle=True)
for train, test in kf.split(X):
    print(len(train), len(test))
    X_train = X[train]
    X_test = X[test]
    y_train = y[train]
    y_test = y[test]
    lgb.fit(X_train, y_train)
    preds = lgb.predict(X_test)
    score = accuracy_score(y_test, preds)
    print("score:", score)
    score_all = score_all + score
print("score_all", score_all / 5)
endtime = time.perf_counter()
print("run_time", endtime - start_time)
Example #13
def multi_machine_learing_models(data_train, data_cv):
    print('Training the models...')
    # Note: data_cv is merged into the training set here, so the scores below
    # are computed on data the models have already seen.
    data_train = pd.concat([data_train, data_cv], axis=0)
    y_train = data_train['label'].apply(lambda x: 0 if x == 'good' else 1)
    y_test = data_cv['label'].apply(lambda x: 0 if x == 'good' else 1)

    X_train = data_train.drop(['URL', 'label'], axis=1)
    X_test = data_cv.drop(['URL', 'label'], axis=1)

    filename_bayes = 'classifier_model/c_bayes.model'
    filename_LGB = 'classifier_model/c_LGB.model'
    filename_ada = 'classifier_model/c_ada.model'
    filename_rf = 'classifier_model/c_rf.model'
    filename_decision_tree = 'classifier_model/c_decision_tree.model'
    filename_lgs = 'classifier_model/c_lgs.model'

    vote = [0] * len(y_test)

    bayes = BernoulliNB()
    bayes.fit(X_train, y_train)
    print('\nBayes model accuracy:', bayes.score(X_test, y_test))
    predict = bayes.predict(X_test)
    vote = list(map(lambda x: x[0] + x[1], zip(predict, vote)))  # weight 1 in the ensemble vote
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precison:", precision)
    print("recall:", recall)
    joblib.dump(bayes, filename_bayes)

    gbc = LGBMClassifier(n_estimators=200, objective='binary')
    gbc.fit(X_train, y_train)
    print('LGBMClassifier model accuracy:', gbc.score(X_test, y_test))
    predict = gbc.predict(X_test)
    vote = list(map(lambda x: 3 * x[0] + x[1], zip(predict, vote)))
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precison:", precision)
    print("recall:", recall)
    joblib.dump(gbc, filename_LGB)

    ada = AdaBoostClassifier(n_estimators=100)  # 100 boosting iterations
    ada.fit(X_train, y_train)
    print('AdaBoost model accuracy:', ada.score(X_test, y_test))
    predict = ada.predict(X_test)
    vote = list(map(lambda x: 2 * x[0] + x[1], zip(predict, vote)))
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precison:", precision)
    print("recall:", recall)
    joblib.dump(ada, filename_ada)

    rf = RandomForestClassifier(n_estimators=100, oob_score=True)
    rf.fit(X_train, y_train)
    print('\nRF model accuracy:', rf.score(X_test, y_test))
    predict = rf.predict(X_test)
    vote = list(map(lambda x: x[0] * 3 + x[1], zip(predict, vote)))
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precison:", precision)
    print("recall:", recall)
    joblib.dump(rf, filename_rf)

    decision_tree = tree.DecisionTreeClassifier()
    decision_tree.fit(X_train, y_train)
    print('\ndecision_tree model accuracy:', decision_tree.score(X_test, y_test))
    predict = decision_tree.predict(X_test)
    vote = list(map(lambda x: x[0] * 2 + x[1], zip(predict, vote)))
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precison:", precision)
    print("recall:", recall)
    joblib.dump(decision_tree, filename_decision_tree)

    lgs = LogisticRegression()
    lgs.fit(X_train, y_train)
    print('\nLogisticRegression model accuracy:', lgs.score(X_test, y_test))
    predict = lgs.predict(X_test)
    vote = list(map(lambda x: x[0] * 2 + x[1], zip(predict, vote)))
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precison:", precision)
    print("recall:", recall)
    joblib.dump(lgs, filename_lgs)

    print('\nVoting result:')
    # flag a sample when its weighted vote total reaches 3
    vote_r = [1 if v >= 3 else 0 for v in vote]
    precision = metrics.precision_score(y_test, vote_r)
    recall = metrics.recall_score(y_test, vote_r)
    acc = metrics.accuracy_score(y_test, vote_r)
    print('Accuracy:', acc)
    print("precison:", precision)
    print("recall:", recall)
Example #14
            clf = LGBMClassifier(
                learning_rate=i_params["learning_rate"][j],
                n_estimators=i_params["n_estimators"][j],
                num_leaves=i_params["num_leaves"][j],
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=2019,
                is_unbalance=True,
                metric=None)

            print('************** training **************')
            print(train_x.shape)
            clf.fit(
                train_x, train_y,
                eval_set=[(train_x, train_y)],
                eval_metric='auc',
                early_stopping_rounds=10,
                verbose=10
            )
            # Save the model
            joblib.dump(clf, './model/model_saved/lgb_voting_{}.pkl'.format(model_index))
            model_index += 1

    # Prediction part
    test_data_dir = "./data/disk_sample_smart_log_round2"
    test_file_list = os.listdir(test_data_dir)

    submit = pd.DataFrame([])
    new_disks = []
    for day in pd.date_range("2018-08-20", "2018-09-30"):
        print("start predicting for {}".format(day.strftime("%Y-%m-%d")))
Example #15
def train_k_fold_lgbm(X, y, features, FOLDS=5, RANDOM_STATE=707, PARAM_COMBINATION=40):
    print(f'X shape: {X.shape}')

    lgbm_default = LGBMClassifier(learning_rate=0.1, n_estimators=450,
                                  max_depth=7, min_child_weight=1, subsample=0.8,
                                  class_weight='balanced', boosting='gbdt')
    lgbm_params = {
        'num_leaves': [6, 12, 24, 64],
        'max_depth': [3, 5, 7, 14],
        'min_data_in_leaf': [20, 40, 80],
        'min_sum_hessian_in_leaf': [1e-5, 1e-2, 1, 1e2, 1e4],
        'bagging_fraction': [i / 10.0 for i in range(7, 11)],
        'bagging_freq': [0, 5, 10, 20, 30],
        'feature_fraction': [i / 10.0 for i in range(3, 7)],
        'lambda_l1': [0, 1e-5, 1e-2],
        'lambda_l2': [0, 1e-5, 1e-2]
    }
    print('lgbm params: ', lgbm_params)

    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)

    rd_lgbm = RandomizedSearchCV(
        estimator=lgbm_default,
        param_distributions=lgbm_params,
        scoring='f1_macro',
        n_jobs=-1,
        pre_dispatch='2*n_jobs',
        cv=skf.split(X.loc[:, features], y),
        verbose=1,
        random_state=RANDOM_STATE,
        n_iter=PARAM_COMBINATION
    )

    print(f'randomcv shape: {X.loc[:, features].shape}')

    rd_lgbm.fit(
        X=X.loc[:, features],
        y=y
    )

    feature_importance = pd.DataFrame(
        rd_lgbm.best_estimator_.feature_importances_,
        index=features,
        columns=['importance']
    ).sort_values('importance', ascending=False)

    feature_hyped = feature_importance[feature_importance['importance'] > 0].index
    lgbm_hyped = LGBMClassifier(**rd_lgbm.best_estimator_.get_params())

    print('Training on whole population with best parameters and features...')
    final_features = list(feature_hyped)
    print(f'training final shape: {X.loc[:, final_features].shape}')
    lgbm_hyped.fit(
        X=X.loc[:, final_features],
        y=y
    )

    feature_importance = pd.DataFrame(
        lgbm_hyped.feature_importances_,
        index=final_features,
        columns=['importance']
    ).sort_values('importance', ascending=False)

    print('Finished!')
    return feature_importance, final_features, lgbm_hyped
Example #16
lgb_params_classif2 = {
    'colsample_bytree': 0.9497036,
    'subsample': 0.8715623,
    'max_depth': 8,
    'reg_alpha': 0.041545473,
    'reg_lambda': 0.0735294,
    'min_child_weight': 2,
    'silent': -1,
    'verbose': -1,
    'objective': 'binary',
    'seed': 3
}

model_single = LGBMClassifier(**lgb_params_classif2)
model_single.fit(x_train,
                 y_train,
                 eval_set=[(x_val, y_val)],
                 verbose=False,
                 early_stopping_rounds=10)
preds_val = model_single.predict_proba(x_val)
print('Acc single model :', acc(y_val, preds_val))

model = CrossValClassifier(LGBMClassifier(**lgb_params_classif2), n_split=10)
model.fit(x_train, y_train, x_val, y_val, eval_metric=acc)
model.save_models('test_lgbclassifcv.pkl')
del model

with open('test_lgbclassifcv.pkl', 'rb') as f:
    model = pickle.load(f)
preds = model.predict_proba(x_val)
print("Evaluation CV : ", acc(y_val, preds))
Example #17
pf = pd.read_csv("../../Datasets/Cancer.csv")
X = pf.drop(['Unnamed: 32', "id", "diagnosis"], axis=1)
Y = np.array(pd.get_dummies(pf['diagnosis'],
                            drop_first=True)).reshape(X.shape[0])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.21,
                                                    random_state=42)

# build the model

LightGBM = LGBMClassifier()

LightGBM.fit(X_train, y_train)

# make predictions with the model
pred = LightGBM.predict(X_test)

# baseline accuracy score
print(f"Baseline accuracy: {accuracy_score(y_test, pred)}")

# choose hyperparameters to tune

hiperparams = {
    'max_depth': np.arange(2, 10, 2),
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1],
    'n_estimators': np.arange(200, 1000, 200)
}
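The example cuts off before the search itself; below is a minimal sketch of how the hiperparams grid would typically be plugged into a search. GridSearchCV with cv=5 is an assumption here, not part of the original snippet.

from sklearn.model_selection import GridSearchCV

# Hypothetical continuation: exhaustive search over the grid defined above.
search = GridSearchCV(LGBMClassifier(), hiperparams, cv=5, n_jobs=-1)
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)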
Example #18
        clf = LGBMClassifier(
            objective='multiclass',
            num_leaves=63,
            learning_rate=0.01,
            n_estimators=10000,
            subsample_freq=1,
            subsample=0.8,
            colsample_bytree=0.8,
            min_child_weight=5,
            random_state=2020,
            n_jobs=24,
        )
        clf.fit(
            X_trn,
            Y_trn,
            eval_set=[(X_val, Y_val)],
            early_stopping_rounds=500,
            verbose=200,
        )
        print('val_acc: {:.5f}'.format(
            accuracy_score(Y_val, clf.predict(X_val))))
        oof[val_idx] = clf.predict_proba(X_val)
        sub += clf.predict_proba(X_sub) / skf.n_splits

    print('cv_acc : {:.5f}'.format(accuracy_score(Y_train,
                                                  oof.argmax(axis=1))))
    print(
        classification_report(Y_train,
                              oof.argmax(axis=1),
                              target_names=lbl.classes_))
Example #19
def train_merged_Model(trainset_path,
                       valset_path,
                       model_save_folder,
                       lr=0.001,
                       isLGB=True):
    '''
    Train the merged (ensemble) model.
    :param trainset_path:
    :param valset_path:
    :param model_save_folder:
    :param lr:
    :param isLGB:
    :return:
    '''
    os.makedirs(model_save_folder, exist_ok=True)
    cols = ['model', 'days', 'label']
    cols += ['p_' + str(i) for i in range(8)]

    def load_set(set_path):
        df = pd.read_csv(set_path, usecols=cols)
        df['model_1'] = df['model'].apply(lambda x: int(x == 1))
        df['model_2'] = df['model'].apply(lambda x: int(x == 2))
        sety = df['label']
        setX = df.drop(['label', 'model'], axis=1)
        del df
        return setX, sety

    def auc_prc(y_true, y_pred):
        return 'AUC_PRC', average_precision_score(y_true, y_pred), True

    trainX, trainy = load_set(trainset_path)
    valX, valy = load_set(valset_path)
    print('trainset info, shape: {},value_counts: {}'.format(
        trainX.shape, trainy.value_counts()))
    print('valset info, shape: {},value_counts: {}'.format(
        valX.shape, valy.value_counts()))

    ##########LGBMClassifier
    clf = LGBMClassifier(
        num_leaves=127,
        learning_rate=lr,
        n_estimators=10000,
        objective='binary',
        is_unbalance=True,
        subsample=0.8,
        colsample_bytree=0.8,
    ) if isLGB else RandomForestClassifier()
    t0 = time.time()
    if isLGB:
        clf.fit(trainX,
                trainy,
                eval_set=[(valX, valy)],
                eval_metric=auc_prc,
                early_stopping_rounds=50,
                verbose=100)
    else:
        clf.fit(trainX, trainy)
    print('fit time: {:.4f}'.format(time.time() - t0))
    save_name = (('LGBM_Merged_' if isLGB else 'RF_Merged_')
                 + datetime.now().strftime('%Y%m%d_%H%M%S'))
    joblib.dump(clf, os.path.join(model_save_folder, save_name))
    print('Merged model is saved to {}'.format(save_name))