예제 #1
0
def ensemble(train_x=None, train_y=None, test_x=None, ids=None):
    """Average the predictions of five ``predict_cv`` runs.

    Run ``i`` stores ``random_seed = i`` and ``depth = 5 + i`` in the
    module-level ``params`` mapping (the writes persist after the call) and
    invokes ``predict_cv`` with ``seed=i + 100``. Metrics are reported after
    every run and once more for the averaged training predictions.

    Args:
        train_x: Training features. When ``None``, all four inputs are
            loaded through ``data_frame.main()``.
        train_y: Training target.
        test_x: Test features.
        ids: Loaded together with the data but not used by this function.

    Returns:
        ``(pred_train, pred_test)``: element-wise means over the five runs.
    """
    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()

    run_outputs = []
    for run in range(5):
        # NOTE: these writes go to the shared ``params`` object.
        params["random_seed"] = run
        params["depth"] = 5 + run
        fitted, forecast = predict_cv(params,
                                      train_x,
                                      train_y,
                                      test_x,
                                      seed=run + 100)
        run_outputs.append((fitted, forecast))
        learn_lgb.output_metrics(train_y, fitted)

    pred_train = np.mean([fitted for fitted, _ in run_outputs], axis=0)
    pred_test = np.mean([forecast for _, forecast in run_outputs], axis=0)

    learn_lgb.output_metrics(train_y, pred_train)
    # Second report after expm1 -- presumably the target is log1p-scaled;
    # TODO confirm against the data preparation code.
    learn_lgb.output_metrics(np.expm1(train_y), np.expm1(pred_train))

    return pred_train, pred_test
예제 #2
0
def em(complement=True):
    """Run the feature-selected, pseudo-labelled ensemble and dump CSVs.

    Steps: load the data, restrict both frames to the features returned by
    ``feature_selection.null_importance(..., create=False)`` (always
    including ``categoryId_TE_mean`` and ``ratings_disabled``), extend the
    data with ``pseudo.get_pseudo_data_set`` (threshold 0.3), call
    ``ensemble`` and write the expm1-transformed predictions to
    ``./data/output``.

    Args:
        complement: Forwarded to ``data_frame.main`` and embedded in the
            output file names.
    """
    train_x, train_y, test_x, ids = data_frame.main(complement=complement)
    selected = feature_selection.null_importance(train_x,
                                                 train_y,
                                                 test_x,
                                                 ids,
                                                 create=False)
    # These two columns are always kept, whatever the selector returned.
    for required in ("categoryId_TE_mean", "ratings_disabled"):
        if required not in selected:
            selected.append(required)
    train_x, test_x = train_x[selected], test_x[selected]

    train_x, train_y, test_x = pseudo.get_pseudo_data_set(train_x,
                                                          train_y,
                                                          test_x,
                                                          threshold=0.3)
    pred_train, pred_test = ensemble(train_x, train_y, test_x, ids)

    test_out = pd.DataFrame()
    test_out["id"] = ids
    test_out['y'] = np.expm1(pred_test)
    test_out.to_csv(f'./data/output/test_cat_complement_{complement}.csv',
                    index=False)

    train_out = pd.DataFrame()
    train_out['y'] = np.expm1(pred_train)
    train_out.to_csv(f'./data/output/train_cat_complement_{complement}.csv',
                     index=False)
def ensemble_diff_category_TE(train_x=None,
                              train_y=None,
                              test_x=None,
                              ids=None):
    """Ensemble trained on the target minus ``categoryId_TE_mean``.

    The ``categoryId_TE_mean`` column of ``train_x`` is subtracted from the
    target before training. Five ``main`` runs (seeds 0-4, each also written
    to the shared ``params["random_state"]``) are averaged, and the column is
    added back to the target and to both prediction vectors before metrics
    are reported.

    Args:
        train_x: Training features with a ``categoryId_TE_mean`` column.
            When ``None``, inputs are loaded through ``data_frame.main()``.
        train_y: Training target.
        test_x: Test features with a ``categoryId_TE_mean`` column.
        ids: Forwarded to ``main``.

    Returns:
        ``(pred_train, pred_test)`` on the original target scale.
    """
    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()

    # Model the residual with respect to the category encoding.
    train_y = train_y - train_x["categoryId_TE_mean"]

    run_outputs = []
    for run in range(5):
        params["random_state"] = run
        run_outputs.append(main(train_x, train_y, test_x, ids, run))

    pred_train = np.mean([fitted for fitted, _ in run_outputs], axis=0)
    pred_test = np.mean([forecast for _, forecast in run_outputs], axis=0)

    # Undo the shift on the target and on both prediction vectors.
    train_y = train_y + train_x["categoryId_TE_mean"]
    pred_train = pred_train + train_x["categoryId_TE_mean"]
    pred_test = pred_test + test_x["categoryId_TE_mean"]

    output_metrics(train_y, pred_train)
    output_metrics(np.expm1(train_y), np.expm1(pred_train))

    return pred_train, pred_test
def ensemble_div_period(train_x=None, train_y=None, test_x=None, ids=None):
    """Ensemble trained on the target divided by ``log1p(period)``.

    The target is divided by ``log1p`` of the ``period`` column, five
    ``main`` runs are averaged, and the scaling is multiplied back into the
    target and both prediction vectors. Every run sets
    ``params["random_state"]`` and decrements ``params["num_leaves"]`` on the
    shared ``params`` object, so the decrement accumulates across runs and
    calls. The expm1-transformed test predictions are written to
    ``./data/output/lgb.csv``.

    Args:
        train_x: Training features with a ``period`` column. When ``None``,
            inputs are loaded through ``data_frame.main()``.
        train_y: Training target.
        test_x: Test features with a ``period`` column.
        ids: Written to the ``id`` column of the CSV and forwarded to
            ``main``.

    Returns:
        ``(pred_train, pred_test)`` on the original target scale.
    """

    def _log_period(frame):
        # Scaling factor: log1p applied to the ``period`` column.
        return frame["period"].apply(np.log1p)

    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()

    train_y = train_y / _log_period(train_x)

    run_outputs = []
    for run in range(5):
        params["random_state"] = run
        params["num_leaves"] -= 1
        run_outputs.append(main(train_x, train_y, test_x, ids, run))

    pred_train = np.mean([fitted for fitted, _ in run_outputs], axis=0)
    pred_test = np.mean([forecast for _, forecast in run_outputs], axis=0)

    # Undo the scaling on the target and on both prediction vectors.
    train_y = train_y * _log_period(train_x)
    pred_train = pred_train * _log_period(train_x)
    pred_test = pred_test * _log_period(test_x)

    output_metrics(train_y, pred_train)
    output_metrics(np.expm1(train_y), np.expm1(pred_train))

    submission = pd.DataFrame()
    submission["id"] = ids
    submission['y'] = np.expm1(pred_test)
    submission.to_csv('./data/output/lgb.csv', index=False)

    return pred_train, pred_test
def ensemble(train_x=None, train_y=None, test_x=None, ids=None):
    """Average five ``main`` runs with a growing ``num_leaves``.

    Every run sets ``params["random_state"]`` to the run index and increments
    ``params["num_leaves"]`` by one on the shared ``params`` object, so the
    increment accumulates across runs and calls.

    Args:
        train_x: Training features. When ``None``, inputs are loaded through
            ``data_frame.main()``.
        train_y: Training target.
        test_x: Test features.
        ids: Forwarded to ``main``.

    Returns:
        ``(pred_train, pred_test)``: element-wise means over the five runs.
    """
    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()

    run_outputs = []
    for run in range(5):
        params["random_state"] = run
        params["num_leaves"] += 1
        run_outputs.append(main(train_x, train_y, test_x, ids, run))

    pred_train = np.mean([fitted for fitted, _ in run_outputs], axis=0)
    pred_test = np.mean([forecast for _, forecast in run_outputs], axis=0)

    output_metrics(train_y, pred_train)
    output_metrics(np.expm1(train_y), np.expm1(pred_train))

    return pred_train, pred_test
def em(complement=True):
    """Blend three ensemble strategies and write the blended predictions.

    After feature selection via ``feature_selection.main`` (always keeping
    ``categoryId_TE_mean`` and ``ratings_disabled``) and pseudo-labelling
    with threshold 0.3, the three ensemble functions are run in order and
    their predictions averaged. Metrics are reported, and the
    expm1-transformed predictions are written to ``./data/output``.

    Args:
        complement: Forwarded to ``data_frame.main`` and embedded in the
            output file names.

    Returns:
        ``(pred_train, pred_test, preds_train, preds_test)``: the blended
        predictions followed by the per-strategy prediction lists.
    """
    train_x, train_y, test_x, ids = data_frame.main(complement=complement)
    selected = feature_selection.main(train_x, train_y, test_x, ids)
    # These two columns are always kept, whatever the selector returned.
    for required in ("categoryId_TE_mean", "ratings_disabled"):
        if required not in selected:
            selected.append(required)
    train_x, test_x = train_x[selected], test_x[selected]

    train_x, train_y, test_x = pseudo.get_pseudo_data_set(train_x,
                                                          train_y,
                                                          test_x,
                                                          threshold=0.3)

    strategies = (
        ensemble,                   # plain target
        ensemble_diff_category_TE,  # difference from the categoryId TE
        ensemble_div_period,        # target divided by the period term
    )
    preds_train = []
    preds_test = []
    for strategy in strategies:
        fitted, forecast = strategy(train_x, train_y, test_x, ids)
        preds_train.append(fitted)
        preds_test.append(forecast)

    # Take the mean over the strategies.
    pred_train = np.mean(preds_train, axis=0)
    pred_test = np.mean(preds_test, axis=0)

    output_metrics(train_y, pred_train)
    output_metrics(np.expm1(train_y), np.expm1(pred_train))

    test_out = pd.DataFrame()
    test_out["id"] = ids
    test_out['y'] = np.expm1(pred_test)
    test_out.to_csv(f'./data/output/test_lgb_complement_{complement}.csv',
                    index=False)

    train_out = pd.DataFrame()
    train_out['y'] = np.expm1(pred_train)
    train_out.to_csv(f'./data/output/train_lgb_complement_{complement}.csv',
                     index=False)

    return pred_train, pred_test, preds_train, preds_test
예제 #7
0
def select_k_best(train_x=None,
                  train_y=None,
                  test_x=None,
                  ids=None,
                  seed=22,
                  k=300):
    """Keep the ``k`` features that score highest under ``f_regression``.

    The selector is fitted on the training data only and then applied to
    both the training and the test features.

    Args:
        train_x: Training features. When ``None``, inputs are loaded through
            ``data_frame.main()``. Assumed to expose ``.shape`` (DataFrame or
            2-D array) -- TODO confirm for other callers.
        train_y: Training target.
        test_x: Test features with the same columns as ``train_x``.
        ids: Unused; kept for a signature parallel to the sibling functions.
        seed: Unused; kept for a signature parallel to the sibling functions.
        k: Number of features to keep. Values larger than the number of
            available columns are clamped to that number.

    Returns:
        ``(train_selected, test_selected)`` as returned by
        ``SelectKBest.transform``.
    """
    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()

    # Honour the caller's ``k`` (a hard-coded 5 used to override it), but
    # never ask for more features than the data actually has.
    n_features = train_x.shape[1]
    selector = SelectKBest(score_func=f_regression, k=min(k, n_features))
    selector.fit(train_x, train_y)
    return selector.transform(train_x), selector.transform(test_x)
def ensemble():
    """Average five seeded ``main`` runs and write ``ridge.csv``.

    Data is loaded through ``data_frame.main()``; ``main`` is called with
    seeds 0-4 and the predictions are averaged element-wise. Metrics are
    reported on the raw and on the expm1-transformed scale, and the
    expm1-transformed test predictions are written to
    ``./data/output/ridge.csv``.
    """
    train_x, train_y, test_x, ids = data_frame.main()

    run_outputs = [main(train_x, train_y, test_x, ids, run)
                   for run in range(5)]

    pred_train = np.mean([fitted for fitted, _ in run_outputs], axis=0)
    pred_test = np.mean([forecast for _, forecast in run_outputs], axis=0)

    learn_lgb.output_metrics(train_y, pred_train)
    learn_lgb.output_metrics(np.expm1(train_y), np.expm1(pred_train))

    submission = pd.DataFrame()
    submission["id"] = ids
    submission['y'] = np.expm1(pred_test)
    submission.to_csv('./data/output/ridge.csv', index=False)
def em2():
    """Blend ``em(complement=True)`` with ``em(complement=False)``.

    The two blended prediction pairs are averaged element-wise, and metrics
    are reported against the target loaded with ``complement=True``. The
    expm1-transformed predictions are written to
    ``./data/output/test_lgb.csv`` and ``./data/output/train_lgb.csv``.
    """
    # Only ``train_y`` and ``ids`` from this load are used below.
    _, train_y, _, ids = data_frame.main(complement=True)

    with_complement = em(complement=True)
    without_complement = em(complement=False)
    train_a, test_a, _, _ = with_complement
    train_b, test_b, _, _ = without_complement

    # Take the mean of the two variants.
    pred_train = np.mean([train_a, train_b], axis=0)
    pred_test = np.mean([test_a, test_b], axis=0)

    output_metrics(train_y, pred_train)
    output_metrics(np.expm1(train_y), np.expm1(pred_train))

    test_out = pd.DataFrame()
    test_out["id"] = ids
    test_out['y'] = np.expm1(pred_test)
    test_out.to_csv('./data/output/test_lgb.csv', index=False)

    train_out = pd.DataFrame()
    train_out['y'] = np.expm1(pred_train)
    train_out.to_csv('./data/output/train_lgb.csv', index=False)
예제 #10
0
def main(train_x=None, train_y=None, test_x=None, ids=None, seed=22):
    """Select features with ``SelectFromModel`` around an ``LGBMRegressor``.

    Args:
        train_x: Training features. When ``None``, inputs are loaded through
            ``data_frame.main()``.
        train_y: Training target.
        test_x: Not used by the selection itself.
        ids: Not used by the selection itself.
        seed: Not used; the regressor is built with ``random_state=0``.

    Returns:
        List of the column names of ``train_x`` kept by the selector
        (threshold ``'1.25*median'``).
    """
    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()

    regressor_kwargs = dict(
        n_estimators=500,
        learning_rate=0.1,
        num_leaves=86,
        subsample_freq=1,
        subsample=0.9,
        feature_fraction=0.2,
        bagging_seed=11,
        metrics="rmse",
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=0,
    )
    selector = SelectFromModel(LGBMRegressor(**regressor_kwargs),
                               threshold='1.25*median')
    selector.fit(train_x, train_y)

    # Boolean mask over the columns -> names of the surviving features.
    keep_mask = selector.get_support()
    kept_columns = train_x.columns[keep_mask].tolist()
    print(len(kept_columns), 'selected features')
    return kept_columns
def main(train_x=None, train_y=None, test_x=None, ids=None, seed=22):
    """Drop unusable columns and run cross-validated prediction.

    Columns ``publishedAt``, ``categoryId`` and ``collection_date`` are
    removed from both frames, together with every column that contains a
    missing value in either the training or the test frame.

    Args:
        train_x: Training features. When ``None``, inputs are loaded through
            ``data_frame.main()``.
        train_y: Training target.
        test_x: Test features.
        ids: Loaded together with the data but not used here.
        seed: Forwarded to ``predict_cv``.

    Returns:
        ``(pred_train, pred_test)`` as returned by ``predict_cv``.
    """

    def _columns_with_nan(frame):
        # Names of the columns holding at least one missing value.
        return frame.keys()[frame.isna().any()].to_list()

    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()

    incomplete = set(_columns_with_nan(test_x)) | set(
        _columns_with_nan(train_x))
    unwanted = ["publishedAt", "categoryId", "collection_date"]
    unwanted = unwanted + list(incomplete)
    train_x = train_x.drop(unwanted, axis=1)
    test_x = test_x.drop(unwanted, axis=1)

    pred_train, pred_test = predict_cv(train_x, train_y, test_x, seed=seed)

    learn_lgb.output_metrics(train_y, pred_train)

    return pred_train, pred_test
def main(train_x=None, train_y=None, test_x=None, ids=None, seed=22):
    """Run ``predict_cv`` with the module-level ``params``.

    Args:
        train_x: Training features. When ``None``, inputs are loaded through
            ``data_frame.main()``.
        train_y: Training target.
        test_x: Test features.
        ids: Loaded together with the data but not used here.
        seed: Passed to ``predict_cv`` as its fifth positional argument.

    Returns:
        ``(pred_train, pred_test)`` as returned by ``predict_cv``.
    """
    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()

    fitted, forecast = predict_cv(params, train_x, train_y, test_x, seed)
    return fitted, forecast
예제 #13
0
def null_importance(train_x=None,
                    train_y=None,
                    test_x=None,
                    ids=None,
                    seed=22,
                    create=False):
    """Select features whose importance beats their null importance.

    With ``create=False`` the scores cached in ``./data/null_importance.csv``
    are read and every feature whose ``score`` reaches the threshold is
    returned; no model is trained.

    With ``create=True`` the importances obtained from
    ``get_feature_importances`` on the real target are compared with those
    from 100 runs with ``shuffle=True``. A feature's score is the percentage
    of shuffled runs whose importance is strictly below the real one. The
    scores are written back to ``./data/null_importance.csv``.

    Args:
        train_x: Training features. When ``None`` (and ``create`` is true),
            inputs are loaded through ``data_frame.main()``.
        train_y: Training target.
        test_x: Unused; kept for a signature parallel to the sibling
            functions.
        ids: Unused; kept for a signature parallel to the sibling functions.
        seed: Unused; the shuffled runs are seeded with their run index.
        create: Recompute and persist the scores when true; otherwise read
            the cached CSV.

    Returns:
        List of feature names whose score is at least 40 (percent).
    """
    # Minimum score (in percent) a feature needs in order to be selected.
    THRESHOLD = 40

    if not create:
        print(f"Create {create}")
        cached = pd.read_csv("./data/null_importance.csv")
        imp_features = [
            feature
            for feature, score in zip(cached["feature"], cached["score"])
            if score >= THRESHOLD
        ]
        print(f"{len(imp_features)} selected features")
        return imp_features

    if train_x is None:
        train_x, train_y, test_x, ids = data_frame.main()

    # Importances for the real target (the helper is expected to return a
    # frame with ``feature`` and ``importance`` columns, as used below).
    actual_importance = get_feature_importances(train_x,
                                                train_y,
                                                shuffle=False)

    # Importances for shuffled targets. Collect the per-run frames and
    # concatenate once; concatenating inside the loop re-copies every
    # earlier run on each iteration.
    N_RUNS = 100
    null_frames = []
    for i in range(N_RUNS):
        imp_df = get_feature_importances(train_x,
                                         train_y,
                                         shuffle=True,
                                         seed=i)
        imp_df["run"] = i + 1
        null_frames.append(imp_df)
    null_importances = pd.concat(null_frames)

    score_list = []
    imp_features = []
    for feature in actual_importance["feature"]:
        # Boolean masks instead of ``query`` strings, so feature names that
        # contain quotes or other special characters cannot break the lookup.
        actual_value = actual_importance.loc[
            actual_importance["feature"] == feature, "importance"].values
        null_value = null_importances.loc[
            null_importances["feature"] == feature, "importance"].values
        # Share of shuffled runs that the real importance strictly beats.
        percentage = (null_value < actual_value).sum() / null_value.size * 100
        score_list.append(percentage)
        if percentage >= THRESHOLD:
            imp_features.append(feature)

    # Persist the scores so later calls can take the ``create=False`` path.
    actual_importance["score"] = score_list
    actual_importance.to_csv("./data/null_importance.csv", index=False)

    print(f"{len(imp_features)} selected features")
    return imp_features