Пример #1
0
def lgbm_regressor():
    """Run ``training_lgbm_regressor`` on the ``data_560.csv`` dataset.

    Loads ``../input/data_560.csv`` through ``load_datasets``, unpacks the
    four values returned by ``getDataSet`` and forwards them, together with
    99 model names (``lr_data_560201`` .. ``lr_data_560299``) and
    ``ratio=0.8``, to ``training_lgbm_regressor``.
    """
    train_df, test_df = load_datasets(filename='../input/data_560.csv')
    features, target, test_features, column_names = getDataSet(
        train_df, test_df)

    # One model name per numeric suffix in [201, 300).
    model_names = ['lr_data_560' + str(suffix) for suffix in range(201, 300)]

    training_lgbm_regressor(
        features, target, test_features, column_names,
        ratio=0.8, model_name=model_names)
Пример #2
0
def main():
    """Train on batches of the training set and export the averaged scores.

    For every batch yielded by ``bacth(train, batch_size)`` the helper
    ``sub_train`` is called; its train/test predictions are stored in one
    column per batch and averaged across columns afterwards.

    Files written:
      * raw test scores (with header) under ``../models/__models__/``
      * thresholded test scores (no header) under ``../result/``
      * raw train scores, at the raw test path with ``'test'`` replaced by
        ``'train'``
    """
    print('load train test datasets')
    train, test = load_datasets(dropDuplicate=False)
    batch_size = 1
    submit_df = pd.DataFrame({'Id': test['Id']})

    # One column per batch; the columns are averaged once training is done.
    submit_pred_n = np.zeros((test.shape[0], batch_size))
    submit_train_n = np.zeros((train.shape[0], batch_size))

    test_rmses = []
    for batch_idx, train_all in enumerate(bacth(train, batch_size)):
        # Message: "batch <n> dataset starts training".
        print('第 %d 批 dataset 开始训练' % batch_idx)
        y_train_pred, y_test_pred, mean_test_rmse = sub_train(train_all,
                                                              test,
                                                              hasFilter=False)
        submit_pred_n[:, batch_idx] = y_test_pred.reshape(-1)
        submit_train_n[:, batch_idx] = y_train_pred.reshape(-1)

        test_rmses.append(mean_test_rmse)
        # Message: "batch <n> dataset finished training".
        print('第 %d 批 dataset 训练结束' % batch_idx)

    print('train finished...')

    test_rmse = np.mean(test_rmses)
    print('mean test rmse: ', test_rmse)

    # Average over batches, kept as (n_rows, 1) column vectors.
    submit_pred = submit_pred_n.mean(1).reshape(-1, 1)
    submit_train = submit_train_n.mean(1).reshape(-1, 1)

    # Compute the timestamp once so every output file of this run shares it,
    # even if the run crosses a minute boundary between writes.
    timestamp = time.strftime('%m%d%H%M', time.localtime(time.time()))

    # test
    submit_df['Score_xgb_bagging'] = submit_pred
    submission_path_raw = '../models/__models__/{}_{}_{}.csv'.format(
        'xgboost_bagging_test', test_rmse, timestamp)
    submission_path_threshold = '../result/{}_{}_{}.csv'.format(
        'xgboost_threshold', test_rmse, timestamp)

    submit_df.to_csv(submission_path_raw, index=False)
    submit_df = threshold(submit_df, feature='Score_xgb_bagging')
    submit_df.to_csv(submission_path_threshold, index=False, header=False)

    # train
    # Explicit copy: adding a column to a frame derived from `train` would
    # otherwise raise pandas' SettingWithCopyWarning.
    submit_train_df = train[['Id']].copy()
    submit_train_df['Score_xgb_bagging'] = submit_train

    submit_train_df.to_csv(submission_path_raw.replace('test', 'train'),
                           index=False)
    print('done.')
Пример #3
0
def regression():
    """Run ``training_regression`` twice over two Ridge and two Lasso models.

    The four estimators share every hyper-parameter except ``normalize``.
    The first run saves them under the ``*_1`` / ``*_2`` names; the second
    run reuses the same estimator objects, additionally passes a
    ``MinMaxScaler((-1, 1))`` and saves under the ``*_3`` / ``*_4`` names.
    """
    train_df, test_df = load_datasets(fillNan=True)
    features, target, test_features, column_names = getDataSet(
        train_df, test_df)

    # Hyper-parameters common to all four estimators.
    shared = dict(fit_intercept=True,
                  alpha=8.858667904100823,
                  max_iter=500,
                  tol=0.01)
    clfs = [
        Ridge(normalize=False, **shared),
        Ridge(normalize=True, **shared),
        Lasso(normalize=True, **shared),
        Lasso(normalize=False, **shared),
    ]

    # Settings passed identically to both training runs.
    run_options = dict(kBest=True, k=476, ratio=1)

    training_regression(
        features, target, test_features, column_names, clfs,
        model_name=['ridge_1', 'ridge_2', 'lasso_1', 'lasso_2'],
        **run_options)

    training_regression(
        features, target, test_features, column_names, clfs,
        minMaxScaler=MinMaxScaler((-1, 1)),
        model_name=['ridge_3', 'ridge_4', 'lasso_3', 'lasso_4'],
        **run_options)
Пример #4
0
def main():
    """Call ``_train`` on the batched datasets or on the single 560 dataset.

    ``usebatch`` is hard-coded to ``False``, so by default one train/test
    pair is loaded from ``../input/data_560.csv``. The file
    ``../input/feature.info`` is opened in append mode and handed to every
    ``_train`` call.
    """
    print('load train test datasets')
    usebatch = False

    # configuration display
    batch_label = "Yes" if usebatch else "No"
    print('batch is used: {}'.format(batch_label))

    with open('../input/feature.info', 'a+') as outf:
        if not usebatch:
            train, test = load_datasets(filename='../input/data_560.csv')
            _train(train, test, outf)
            return

        batches = batch_with_gain_importance(
            featurefile='../models/info/gain_importance_data.csv',
            filename='../input/data.csv')
        for train, test in batches:
            _train(train, test, outf)
Пример #5
0
def main():
    """Greedy forward feature selection scored with a LightGBM regressor.

    Candidate features are taken one at a time from the list returned by
    ``most_importance``. Each candidate is added to the features accepted so
    far, a model is trained on a fixed 70/30 split, and the validation RMSE
    is turned into the score ``1 / (1 + rmse)`` (higher is better).

    * A candidate is accepted when its score is at most 0.000245 below the
      best score seen so far; accepted features are appended to
      ``../models/info/sub_feature_02182137.csv``.
    * Otherwise it goes into a drop queue. When the queue reaches
      ``drop_len`` entries, features dropped only once are put back at the
      front of the candidate list, the queue is emptied and ``drop_len`` is
      doubled. The queue state is appended to
      ``../models/info/sub_feature_02182137.log``.
    """
    # Load the dataset; only the training part is used here.
    train, _ = load_datasets()

    # Load feature importance
    features = most_importance('split_importance_02181349.csv')

    accepted_path = '../models/info/sub_feature_02182137.csv'
    drop_log_path = '../models/info/sub_feature_02182137.log'

    # Loop-invariant: the same parameters are used for every candidate.
    lgbm_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'min_child_weight': 20,
        'num_leaves': 2**5,
        'lambda_l2': 2,
        'subsample': 0.5,
        'colsample_bytree': 1,
        'learning_rate': 0.1,
        'seed': 2017,
        'verbose': 100,
        'silent': True,
    }

    # Loop-invariant: the target column does not depend on the candidate.
    labels = train['Score']

    best_feature = []
    best_score = 0

    drop_feature = []
    drop_len = 10
    drop_freq = defaultdict(int)

    while features:
        choose_feature = features.pop(0)
        candicate_feature = best_feature + [choose_feature]

        # Fixed random_state keeps the split identical for every candidate.
        train_feature, valid_feature, train_label, valid_label = train_test_split(
            train[candicate_feature], labels, test_size=0.3, random_state=0)
        train_feature = np.array(train_feature.values)
        valid_feature = np.array(valid_feature.values)

        train_label = np.array(train_label).reshape(-1)
        valid_label = np.array(valid_label).reshape(-1)

        lgb_train = lgbm.Dataset(train_feature, label=train_label)
        lgb_eval = lgbm.Dataset(valid_feature,
                                label=valid_label,
                                reference=lgb_train)

        # Pass a fresh copy so the shared dict can never be altered by a run.
        model = lgbm.train(dict(lgbm_params),
                           lgb_train,
                           num_boost_round=5000,
                           valid_sets=lgb_eval,
                           early_stopping_rounds=100)

        valid_pred = model.predict(valid_feature,
                                   num_iteration=model.best_iteration)

        # Map RMSE into (0, 1] so that a larger score means a better model.
        valid_score = 1 / (1 + rmse(valid_label, valid_pred))

        if valid_score >= best_score - 0.000245:
            best_feature.append(choose_feature)
            best_score = max(best_score, valid_score)
            with open(accepted_path, 'a+') as out:
                out.write(
                    str(valid_score) + ',' + choose_feature + ',' +
                    str(len(drop_feature)) + '\n')
        else:
            drop_freq[choose_feature] += 1
            drop_feature.append(choose_feature)
            if len(drop_feature) == drop_len:
                # Give features dropped only once a second chance. Inserting
                # at index 0 while walking the queue backwards keeps their
                # original relative order at the front of the list.
                for val in drop_feature[::-1]:
                    if drop_freq[val] <= 1:
                        features.insert(0, val)
                drop_feature = []
                drop_len += drop_len

            with open(drop_log_path, 'a+') as out:
                # Header text: "current drop queue length".
                out.write('当前 drop 队列长度 {}:'.format(len(drop_feature)) + '\n')
                out.write(' || '.join(drop_feature) + '\n')
                for drop in drop_feature:
                    out.write('{} {}'.format(drop_freq[drop], drop) + '\n')
def main():
    """Train a 5-class XGBoost model on ``Score`` and write test predictions.

    Steps:
      1. Load train/test data; drop ``Id`` (and ``Score`` for train).
      2. Run 5-fold stratified cross-validation with early stopping to pick
         the number of boosting rounds.
      3. Retrain on the full training set with that number of rounds.
      4. Predict the test set, shift classes back by +1 and write
         ``../result/xgb_<mean test mlogloss>.csv`` without header or index.
    """
    print('load train test datasets')
    train, test = load_datasets()

    submit_df = pd.DataFrame({'userid': test['Id']})

    X_train = train.drop(['Id', 'Score'], axis=1)
    X_test = test.drop(['Id'], axis=1)

    # Shift labels down by one; the prediction is shifted back up below.
    # Presumably Score takes values 1..5 so classes become 0..4 (num_class=5)
    # -- verify against the dataset.
    y_train = train['Score'] - 1
    df_columns = X_train.columns

    xgb_params = {
        'eta': 0.01,
        'min_child_weight': 20,
        'colsample_bytree': 0.5,
        'max_depth': 10,
        'subsample': 0.5,
        'lambda': 2.0,
        'scale_pos_weight': 1,
        'eval_metric': 'mlogloss',
        'objective': 'multi:softmax',
        'silent': 1,
        'booster': 'gbtree',
        'num_class': 5
    }

    dtrain_all = xgb.DMatrix(X_train.values,
                             y_train.values,
                             feature_names=df_columns)
    dtest = xgb.DMatrix(X_test.values, feature_names=df_columns)

    # 5-fold cross-validation
    nfold = 5
    cv_result = xgb.cv(dict(xgb_params),
                       dtrain_all,
                       nfold=nfold,
                       stratified=True,
                       num_boost_round=10000,
                       early_stopping_rounds=100,
                       verbose_eval=100,
                       show_stdv=False)

    best_num_boost_rounds = len(cv_result)

    # Average the metrics over the final rows of the CV table. `.loc` slices
    # are label-based and inclusive on both ends.
    last_round = best_num_boost_rounds - 1
    final_rounds = cv_result.loc[last_round - 10:last_round]
    mean_train_mlogloss = final_rounds['train-mlogloss-mean'].mean()
    mean_test_mlogloss = final_rounds['test-mlogloss-mean'].mean()
    print('best_num_boost_rounds = {}'.format(best_num_boost_rounds))

    # The reported values are train/test mlogloss means, so label them so.
    print('mean_train_mlogloss = {:.7f} , mean_test_mlogloss = {:.7f}\n'.format(
        mean_train_mlogloss, mean_test_mlogloss))

    print('---> training on total dataset')
    model = xgb.train(dict(xgb_params),
                      dtrain_all,
                      num_boost_round=best_num_boost_rounds)

    print('---> predict test')
    # NOTE(review): `ntree_limit` / `best_ntree_limit` are deprecated in
    # recent XGBoost releases -- confirm against the pinned version.
    y_pred = model.predict(dtest, ntree_limit=model.best_ntree_limit)
    submit_df['Score'] = y_pred
    # Undo the label shift applied before training.
    submit_df['Score'] = submit_df['Score'] + 1
    print(y_pred)
    submission_path = '../result/{}_{}.csv'.format('xgb', mean_test_mlogloss)

    submit_df.to_csv(submission_path, index=False, header=False)
    print('done.')