Пример #1
0
def try_some(train, test, bagging_num=1, split_num=5, random_state=4590):
    """Fit bagged, outlier-stratified K-fold models and write out predictions.

    For every bag the training frame is split into ``split_num`` shuffled
    folds, stratified on whether ``target`` is below -30. One model is
    trained per fold via ``pocket_lgb.GoldenLgb.do_train_direct``. Its
    predictions on ``test`` are averaged into the submission, and its
    predictions on the held-out fold form the out-of-fold (OOF) column.

    Side effects: prints shapes and summaries, reports through ``logger``
    and ``timer``, and writes two CSV files to ``path_const.OUTPUT_SUB``
    and ``path_const.OUTPUT_OOF``. Nothing is returned.

    Args:
        train: DataFrame with ``card_id``, ``target`` and every column
            listed in the module-level ``pred_col``.
        test: DataFrame with ``card_id`` and the ``pred_col`` columns.
        bagging_num: How many times the whole K-fold procedure is repeated.
        split_num: Number of folds per bag.
        random_state: Fold seed of the first bag; bag ``i`` uses
            ``random_state + i``.

    Raises:
        ValueError: If ``bagging_num`` is smaller than 1.
    """
    if bagging_num < 1:
        # Guard the averaging below against a division by zero.
        raise ValueError("bagging_num must be at least 1")

    train_x, test_x = train[pred_col], test[pred_col]
    train_y = train["target"]
    print(train_x.shape)
    print(train_y.shape)
    print(test_x.shape)

    # Accumulator for test-set predictions, summed over all folds and bags.
    submission = pd.DataFrame()
    submission["card_id"] = test["card_id"]
    submission["target"] = 0

    # Accumulator for out-of-fold predictions, summed over bags.
    train_cv = pd.DataFrame()
    train_cv["card_id"] = train["card_id"]
    train_cv["cv_pred"] = 0

    # Binary stratification label: 1 where the target is below -30.
    outliers = (train["target"] < -30).astype(int).values

    for bagging_index in range(bagging_num):
        # Each bag needs its own seed; a shared seed would reproduce the
        # very same folds in every bag and make bagging a no-op.
        bag_seed = random_state + bagging_index
        skf = model_selection.StratifiedKFold(n_splits=split_num,
                                              shuffle=True,
                                              random_state=bag_seed)
        logger.print("random_state=" + str(bag_seed))
        lgb = pocket_lgb.GoldenLgb()
        total_score = 0
        models = []
        train_preds = []
        for train_index, valid_index in skf.split(train, outliers):
            fit_x, valid_x = train_x.iloc[train_index], train_x.iloc[valid_index]
            fit_y, valid_y = train_y.iloc[train_index], train_y.iloc[valid_index]

            model = lgb.do_train_direct(fit_x, valid_x, fit_y, valid_y)
            total_score += model.best_score["valid_0"]["rmse"]
            test_pred = model.predict(test_x)
            valid_pred = model.predict(valid_x)
            models.append(model)

            submission["target"] = submission["target"] + test_pred

            # Keep the held-out rows' original index so the predictions can
            # be aligned back onto train_cv after all folds are done.
            fold_oof = pd.DataFrame()
            fold_oof["card_id"] = train.iloc[valid_index]["card_id"]
            fold_oof["cv_pred"] = valid_pred
            train_preds.append(fold_oof)
            timer.time("done one set in")

        # Series addition aligns on index labels, so fold order is
        # irrelevant here (assumes train's index is unique - TODO confirm).
        train_output = pd.concat(train_preds, axis=0)
        train_cv["cv_pred"] += train_output["cv_pred"]

        lgb.show_feature_importance(models[0], path_const.FEATURE_GAIN)
        avg_score = str(total_score / split_num)
        logger.print("average score= " + avg_score)
        timer.time("end train in ")

    # Test predictions were summed once per fold per bag.
    submission["target"] = submission["target"] / (bagging_num * split_num)
    submission.to_csv(path_const.OUTPUT_SUB, index=False)

    # Every training row is held out exactly once per bag.
    train_cv["cv_pred"] = train_cv["cv_pred"] / bagging_num
    train_cv.to_csv(path_const.OUTPUT_OOF, index=False)

    rmse_score = evaluator.rmse(train_y, train_cv["cv_pred"])
    logger.print("evaluator rmse score= " + str(rmse_score))

    print(train["target"].describe())
    logger.print(train_cv.describe())
    logger.print(submission.describe())
    timer.time("done submission in ")
Пример #2
0
# Script fragment: sets up an outlier-stratified K-fold run in which each
# fold's model is trained and validated on non-outlier rows only.
# `train`, `test` and `use_col` are defined before this fragment.
test_x = test[use_col]
print(train.shape)
print(test.shape)
timer.time("load csv in ")

# Test-set prediction frame; "target" starts at 0.
submission = pd.DataFrame()
submission["card_id"] = test["card_id"]
submission["target"] = 0

# Binary stratification label: 1 where the target is below -30.
outliers = (train["target"] < -30).astype(int).values
split_num = 5
random_state = 4590

skf = model_selection.StratifiedKFold(n_splits=split_num, shuffle=True, random_state=random_state)
logger.print("random_state=" + str(random_state))
lgb = pocket_lgb.GoldenLgb()
total_score = 0
models = []
train_preds = []
no_out_preds = []
for train_index, test_index in skf.split(train, outliers):
    _train, _test = train.iloc[train_index], train.iloc[test_index]

    # Set the held-out fold's outlier rows aside before they are filtered out.
    _outlier = _test[_test["target"] < -30]
    outlier_x, outlier_y = _outlier[use_col], _outlier["target"]

    # Drop outliers from both the fitting and the validation part.
    # NOTE(review): rows with target exactly -30 match neither `< -30` above
    # nor `> -30` here, so they are excluded everywhere - confirm intended.
    _train, _test = _train[_train["target"] > -30], _test[_test["target"] > -30]
    X_train, X_test = _train[use_col], _test[use_col]
    y_train, y_test = _train["target"], _test["target"]

    # Arguments are (fit features, validation features, fit target,
    # validation target), as ordered on the lines above.
    model = lgb.do_train_direct(X_train, X_test, y_train, y_test)