示例#1
0
    def doit(self):
        """Two-stage stacking run over the ``subset_exp_*`` prediction files.

        Stage one loads all 100 subset files, prints their summary,
        correlation and score reports, and passes them to ``do_preds``.  The
        indices returned by ``do_preds`` pick the base files for stage two;
        those are handed to ``make_files`` together with ``subset_exp_0`` ..
        ``subset_exp_9`` under ``"bin"``-prefixed column names, and the
        combined set goes through ``print_corr`` and ``do_cv_pred``.
        """
        timer = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())

        # Every entry is a (file_name, col_name) pair.
        files = [
            (name, name)
            for name in ("subset_exp_" + str(i) for i in range(100))
        ]

        train, test = self.make_files(files)
        timer.time("load csv in ")
        print(train.describe())

        self.print_corr(train, test, files)
        timer.time("corr check")
        self.print_score(train, files)
        timer.time("score check")
        sig_idx = self.do_preds(train, files)

        # Second stage: selected subsets plus the first ten files as "bin" columns.
        base_files = []
        for idx in sig_idx:
            name = "subset_exp_" + str(idx)
            base_files.append((name, name))
        bin_files = []
        for idx in range(10):
            name = "subset_exp_" + str(idx)
            bin_files.append((name, "bin" + name))
        train, test = self.make_files(base_files, bin_files)

        files = base_files + bin_files
        self.print_corr(train, test, files)
        self.do_cv_pred(train, test, files)
示例#2
0
    def doit(self):
        """Stack a fixed selection of ``subset_exp_*`` prediction files.

        Loads the files named by ``sig_idx``, prints the summary, correlation
        and score reports, then runs ``do_cv_pred`` on them.
        """
        timer = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())

        # Hand-picked subset indices.  Only ``sig_idx`` is consumed below; the
        # other two lists are kept as alternative selections.
        sig_idx = [
            0, 9, 12, 13, 18, 19, 22, 24, 28, 29, 33, 35, 36, 37, 43, 45, 52,
            54, 55, 56, 58, 59, 61, 65, 67, 69, 71, 72, 74, 75, 76, 77, 78, 81,
            83, 84, 88, 90, 91, 97, 98
        ]
        pos_sig_idx = [
            9, 12, 19, 24, 33, 36, 45, 52, 54, 55, 58, 59, 65, 67, 69, 72, 74,
            75, 77, 78, 83, 88, 91, 97
        ]
        sig2_idx = [12, 24, 36, 58, 65, 67, 69, 72, 75, 91, 97]

        # Every entry is a (file_name, col_name) pair.
        # To use all files instead: iterate over range(100).
        files = []
        for idx in sig_idx:
            name = "subset_exp_" + str(idx)
            files.append((name, name))

        train, test = self.make_files(files)
        timer.time("load csv in ")
        print(train.describe())

        self.print_corr(train, test, files)
        timer.time("corr check")
        self.print_score(train, files)
        timer.time("score check")
        self.do_cv_pred(train, test, files)
示例#3
0
    def doit(self):
        """Stack two named models with a list of team prediction files.

        Builds the (file_name, col_name) list, loads it through
        ``make_files``, prints the summary, correlation and score reports,
        then runs ``do_cv_pred``.
        """
        timer = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())

        # Team files use their file name as the column name as well.
        team_files = [
            'select_v44_ridge',
            'tune_stack_57_v1',
            'select_v51_ridge',
            # 'tune_stack_57_2_v1',
            'tune_stack_cgb_v1',
            # 'elo_rnd_feat_bridge',
            'outlier_lgb_v3_kh_time_feature2_pocket',
            # 'delete_outlier_kh_pocket_stack_correct_ridge',
            # 'outlier_lgb_pocket_logistic',
            'delete_outlier_kh_pocket_stack_correct2_ridge'
        ]

        # (file_name, col_name)
        files = [("team_v63", "lgb"), ("big_mlp", "mlp")]
        files.extend((name, name) for name in team_files)

        train, test = self.make_files(files)
        timer.time("load csv in ")
        print(train.describe())

        self.print_corr(train, test, files)
        timer.time("corr check")
        self.print_score(train, files)
        timer.time("score check")
        self.do_cv_pred(train, test, files)
示例#4
0
    def doit(self):
        """Blend several model outputs after gating the "no_out" columns.

        After loading, each ``no_out*`` training column is multiplied by
        ``1 - <matching bin column>``; the reports are then printed and the
        result is passed to ``do_preds``.
        """
        timer = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())

        # (file_name, col_name)
        files = [
            ("org_param", "big"),
            ("medium", "medium"),
            ("mlp3", "mlp"),
            # ("mlp_rank", "mlp_rank"),
            ("bin", "bin"),
            ("no_out2", "no_out2"),
            ("bin_large", "bin_large"),
            ("no_out_large", "no_out_large"),
            ("tune_param", "tune_param")
        ]

        train, test = self.make_files(files)
        timer.time("load csv in ")

        # Only the training frame is rescaled; test is left as loaded.
        gated_pairs = (("bin", "no_out2"), ("bin_large", "no_out_large"))
        for gate_col, value_col in gated_pairs:
            train[value_col] = (1 - train[gate_col]) * train[value_col]
        print(train.describe())

        self.print_corr(train, test, files)
        timer.time("corr check")
        self.print_score(train, files)
        timer.time("score check")
        self.do_preds(train, test, files)
示例#5
0
    def load_whole_input(self):
        """Load train/test, merge the feature files, and apply ``JitFe``.

        Each path in the feature list is merged in through
        ``load_file_and_merge``; the NEW_DAY_PRED out-of-fold / submission
        files are then left-joined on ``card_id``.

        Returns:
            ``(train, test)`` after ``jit_fe.JitFe().do_fe`` was applied to both.
        """
        timer = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())
        csv_io = pocket_file_io.GoldenCsv()

        train = csv_io.read_file(path_const.TRAIN1)
        test = csv_io.read_file(path_const.TEST1)

        feature_paths = (
            path_const.RE_NEW_TRANS1,
            path_const.RE_OLD_TRANS1,
            path_const.OLD_TRANS3,
            path_const.NEW_TRANS6,
            path_const.OLD_TRANS6,
            path_const.OLD_TRANS9,
            # path_const.NEW_TRANS11,
            # path_const.OLD_TRANS11,
        )
        for feature_path in feature_paths:
            train, test = self.load_file_and_merge(
                train, test, feature_path, csv_io)

        # Day-level predictions: OOF rows go to train, submission rows to test.
        oof_pred = csv_io.read_file(path_const.NEW_DAY_PRED_OOF)
        sub_pred = csv_io.read_file(path_const.NEW_DAY_PRED_SUB)
        train = pd.merge(train, oof_pred, how="left", on="card_id")
        test = pd.merge(test, sub_pred, how="left", on="card_id")
        # train, test = self.load_lda(train, test, csv_io)

        for frame in (train, test):
            print(frame.shape)
        timer.time("load csv in ")

        feature_engineer = jit_fe.JitFe()
        train = feature_engineer.do_fe(train)
        test = feature_engineer.do_fe(test)
        return train, test
示例#6
0
    def load_ts():
        """Read the NEW_NUM, NEW_CAT and NEW_KEY files and mark the timer.

        NOTE(review): as visible here the three loaded objects are neither
        returned nor stored, and the function takes no ``self`` although it is
        indented like a method.  The rest of the definition (and possibly a
        ``@staticmethod`` decorator) may lie outside this excerpt -- confirm
        against the full file before relying on this.
        """
        logger = pocket_logger.get_my_logger()
        timer = pocket_timer.GoldenTimer(logger)
        csv_io = pocket_file_io.GoldenCsv()

        # Three separate inputs, one per path constant.
        num = csv_io.read_file(path_const.NEW_NUM)
        cat = csv_io.read_file(path_const.NEW_CAT)
        key = csv_io.read_file(path_const.NEW_KEY)
        timer.time("load ts")
示例#7
0
    def __init__(self):
        """Define the feature-column lists and create the logger and timer.

        ``medium_col`` is ``small_col`` plus ``"pred_diff"``.  The numeric
        notes beside the column names are kept from the original author; their
        exact meaning is not documented in this block.
        """
        small = [
            # "new_trans_elapsed_days_max", "new_trans_elapsed_days_min", "new_trans_elapsed_days_mean",  # 0.001
            "old_trans_elapsed_days_max",
            "old_trans_elapsed_days_min",
            "old_trans_elapsed_days_mean",  # 0.025 mean001
            # "new_last_day",  # 0.005
            "new_to_last_day",
            "old_installments_sum",
            "old_installments_mean",  # 0.005
            "old_month_nunique",
            "old_woy_nunique",  # 0.010
            "old_merchant_id_nunique",  # 0.002
            "new_month_lag_mean",
            "old_month_lag_mean",
            "elapsed_days",  # 0.010
            "new_purchase_amount_max",
            "new_purchase_amount_count",
            "new_purchase_amount_mean",  # 0.020
            "old_purchase_amount_max",
            "old_purchase_amount_count",
            "old_purchase_amount_mean",  # 0.002
            "old_category_1_mean",
            "new_category_1_mean",  # 0.006
            "old_authorized_flag_sum",  # "old_authorized_flag_mean", bad?
            "old_no_city_purchase_amount_min",  # 0.003
            "old_no_city_purchase_amount_max",
            "old_no_city_purchase_amount_mean",  # 0.002
            "rec1_purchase_amount_count",  # 0.005
            "old_month_lag_max",  # 0.002
            "new_time_diff_mean",
            "new_trans_elapsed_days_std",  # 0.002
            "old_month_diff_mean",
            "old_pa2_month_diff_min",  # 0.004
            # "old_mer_cnt_whole_mean",  # 0.001
        ]
        self.small_col = small

        # LDA columns: five components for each of the two source fields.
        # Currently not added to any attribute (see the disabled line below).
        lda_col = [
            'lda-%s-%d' % (field, component)
            for field in ('merchant_category_id', 'month_lag')
            for component in range(5)
        ]
        # self.small_col += lda_col
        self.medium_col = small + ["pred_diff"]

        self.logger = pocket_logger.get_my_logger()
        self.timer = pocket_timer.GoldenTimer(self.logger)
示例#8
0
    def doit(self):
        """Run the report-and-predict pipeline on the team ensemble list.

        The (file_name, col_name) pairs come from
        ``ens_loader.get_team_ens()``.
        """
        timer = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())

        # (file_name, col_name)
        ensemble_files = ens_loader.get_team_ens()

        train, test = self.make_files(ensemble_files)
        timer.time("load csv in ")
        summary = train.describe()
        print(summary)

        self.print_corr(train, test, ensemble_files)
        timer.time("corr check")
        self.print_score(train, ensemble_files)
        timer.time("score check")
        self.do_preds(train, test, ensemble_files)
示例#9
0
    def doit(self):
        """Run the report-and-predict pipeline on ``subset_exp_40`` .. ``subset_exp_69``."""
        timer = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())

        # Every entry is a (file_name, col_name) pair; both are the file name.
        files = []
        for idx in range(40, 70):
            name = "subset_exp_" + str(idx)
            files.append((name, name))

        train, test = self.make_files(files)
        timer.time("load csv in ")
        print(train.describe())

        self.print_corr(train, test, files)
        timer.time("corr check")
        self.print_score(train, files)
        timer.time("score check")
        self.do_preds(train, files)
示例#10
0
    def doit(self):
        """Stack five team-level model outputs with ``do_cv_pred``."""
        timer = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())

        # (file_name, col_name)
        files = [
            ("team_v63", "lgb"),
            ("bin_team", "bin"),
            ("no_out_team", "no_out"),
            ("rnd_feat_bridge", "rnd_feat"),
            ("small_team", "small"),
        ]

        train, test = self.make_files(files)
        timer.time("load csv in ")
        summary = train.describe()
        print(summary)

        self.print_corr(train, test, files)
        timer.time("corr check")
        self.print_score(train, files)
        timer.time("score check")
        self.do_cv_pred(train, test, files)
示例#11
0
def get_cv_score(param):
    """Return the cross-validated RMSE obtained with ``param``.

    One ``pocket_lgb.OptLgb`` model is trained per fold produced by the
    module-level ``skf`` splitter.  The held-out predictions of all folds are
    stacked, joined back onto ``train`` by ``card_id`` and scored against the
    ``target`` column.

    Reads the module-level names ``logger``, ``skf``, ``train``, ``outliers``,
    ``train_x``, ``train_y`` and ``evaluator``.

    Args:
        param: parameter object handed to ``pocket_lgb.OptLgb``.

    Returns:
        The value of ``evaluator.rmse`` over the out-of-fold predictions.
    """
    local_timer = pocket_timer.GoldenTimer(logger)
    lgb = pocket_lgb.OptLgb(param)

    # Fresh list on every call.  Appending to a list shared across calls would
    # carry predictions from earlier parameter sets into this call's concat
    # and distort the returned score.
    fold_preds = []
    for train_index, test_index in skf.split(train, outliers):
        X_train, X_test = train_x.iloc[train_index], train_x.iloc[test_index]
        y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]

        model = lgb.do_train_direct(X_train, X_test, y_train, y_test)
        valid_set_pred = model.predict(X_test)

        # Pair each held-out prediction with its card_id for the later join.
        train_id = train.iloc[test_index]
        train_cv_prediction = pd.DataFrame()
        train_cv_prediction["card_id"] = train_id["card_id"]
        train_cv_prediction["cv_pred"] = valid_set_pred
        fold_preds.append(train_cv_prediction)

    train_output = pd.concat(fold_preds, axis=0)
    local_timer.time("end train in ")
    train_output = pd.merge(train_output, train, on="card_id", how="left")
    score = evaluator.rmse(train_output["target"], train_output["cv_pred"])
    return score
示例#12
0
 def __init__(self, prefix="", split_num=32):
     """Store the split count and build the aggregator and timer.

     Args:
         prefix: passed straight to ``agg_fe.AggFe``.
         split_num: stored as ``_SPLIT_NUM``.
     """
     self._SPLIT_NUM = split_num
     aggregator = agg_fe.AggFe(prefix)
     self.fer = aggregator
     stopwatch = pocket_timer.GoldenTimer()
     self.timer = stopwatch
示例#13
0
 def __init__(self, fer, split_num=32):
     """Store the injected feature engineer and split count; start a timer.

     Args:
         fer: object stored as ``self.fer``.
         split_num: stored as ``_SPLIT_NUM``.
     """
     self._SPLIT_NUM = split_num
     self.fer = fer
     stopwatch = pocket_timer.GoldenTimer()
     self.timer = stopwatch
示例#14
0
    def __init__(self):
        """Define the column lists used by the loader and create logger/timer.

        Attributes set here:
            small_col: the compact feature list.
            medium_col: ``small_col`` plus ``"pred_diff"``.
            drop_col: columns listed for dropping.
            team_small_col: a second short feature list.

        The numeric notes beside the column names are kept from the original
        author; their exact meaning is not documented in this block.
        """
        small = [
            # "new_trans_elapsed_days_max", "new_trans_elapsed_days_min", "new_trans_elapsed_days_mean",  # 0.001
            "old_trans_elapsed_days_max",
            "old_trans_elapsed_days_min",
            "old_trans_elapsed_days_mean",  # 0.025 mean001
            # "new_last_day",  # 0.005
            "new_to_last_day",
            "old_installments_sum",
            "old_installments_mean",  # 0.005
            "old_month_nunique",
            "old_woy_nunique",  # 0.010
            "old_merchant_id_nunique",  # 0.002
            "new_month_lag_mean",
            "old_month_lag_mean",
            "elapsed_days",  # 0.010
            "new_purchase_amount_max",
            "new_purchase_amount_count",
            "new_purchase_amount_mean",  # 0.020
            "old_purchase_amount_max",
            "old_purchase_amount_count",
            "old_purchase_amount_mean",  # 0.002
            "old_category_1_mean",
            "new_category_1_mean",  # 0.006
            "old_authorized_flag_sum",  # "old_authorized_flag_mean", bad?
            "old_no_city_purchase_amount_min",  # 0.003
            "old_no_city_purchase_amount_max",
            "old_no_city_purchase_amount_mean",  # 0.002
            "rec1_purchase_amount_count",  # 0.005
            "old_month_lag_max",  # 0.002
            "new_time_diff_mean",
            "new_trans_elapsed_days_std",  # 0.002
            "old_month_diff_mean",
            "old_pa2_month_diff_min",  # 0.004
            "old_mer_cnt_whole_mean",  # 0.001
        ]
        self.small_col = small
        self.medium_col = small + ["pred_diff"]

        dropped = [
            "card_id",
            "target",  # "feature_1", "feature_2", "feature_3",
            "old_weekend_mean",
            "new_weekend_mean",
            "new_authorized_flag_mean",
            "old_null_state",
            "new_null_state",
            "new_null_install",  # "old_null_install",
            "old_cat3_pur_mean",
            "new_cat3_pur_mean",
            "old_cat2_pur_mean",
            "new_cat2_pur_mean",
            "new_category_4_mean",  # "new_merchant_group_id_nunique", "old_merchant_group_id_nunique"
            "new_mon_nunique_mean",
            "new_woy_nunique_mean",
            # "new_month_lag_ptp", "new_month_lag_min",
            "new_purchase_amount_skew",  # "new_purchase_amount_std",
            "old_purchase_amount_skew",  # "old_purchase_amount_std",
            # "new_category_2_nunique", "old_category_2_nunique",
            # "old_null_merchant", "new_null_merchant",
            # "old_ym_target_encode_mean", "new_ym_target_encode_mean",
            # "old_hour_target_encode_mean", "new_hour_target_encode_mean",
            # "old_subsector_id_target_encode_mean",
            # "new_merchant_id_target_encode_mean", "old_merchant_id_target_encode_mean",
            "pred_new",
            "old_same_buy_count",
            "old_purchase_amount_nunique",
            "new_purchase_amount_nunique",
            "old_installments_nunique",
            "new_installments_nunique",  # "pred_new_pur_max",
            "new_trans_elapsed_days_max",
            "new_trans_elapsed_days_min",
            "new_trans_elapsed_days_mean",  # +0.001
        ]
        self.drop_col = dropped

        team_small = [
            "first_mer_old_woy_nunique",
            "kh_hist_kh__purchase_date_seconds_diff_std",
            "merchant_id_most",
            "new_category_3_mean",
            "old_time_diff_std",
            "old_hour_0_count",
            "old_time_diff_min",
            "authorized_flag_y_ratio",
            "hist_merchant_id_nunique",
            "kh_ratio_kh__purchase_days_diff_min",
            "new_subsector_id_nunique",
        ]
        self.team_small_col = team_small

        self.logger = pocket_logger.get_my_logger()
        self.timer = pocket_timer.GoldenTimer(self.logger)
示例#15
0
    def load_small_input():
        """Load train/test, join four aggregate files, and select the small feature set.

        The aggregate files are left-joined on ``card_id`` in the order
        RE_NEW_TRANS1, RE_OLD_TRANS1, NEW_TRANS6, OLD_TRANS6, first onto train
        and then onto test, before ``JitFe.do_fe`` is applied to each frame.

        Returns:
            A 5-tuple ``(train[["card_id", "target"]], test[["card_id"]],
            train_x, train_y, test_x)`` where the ``*_x`` frames hold only the
            columns in ``use_col`` and ``train_y`` is the ``target`` column.
        """
        timer = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())
        csv_io = pocket_file_io.GoldenCsv()

        train = csv_io.read_file(path_const.TRAIN1)
        test = csv_io.read_file(path_const.TEST1)
        aggregate_paths = (
            path_const.RE_NEW_TRANS1,
            path_const.RE_OLD_TRANS1,
            path_const.NEW_TRANS6,
            path_const.OLD_TRANS6,
        )
        aggregates = [csv_io.read_file(path) for path in aggregate_paths]
        for frame in (train, test):
            print(frame.shape)
        timer.time("load csv in ")

        # Same join sequence for both frames; train is completed before test.
        for aggregate in aggregates:
            train = pd.merge(train, aggregate, how="left", on="card_id")
        for aggregate in aggregates:
            test = pd.merge(test, aggregate, how="left", on="card_id")

        feature_engineer = jit_fe.JitFe()
        train = feature_engineer.do_fe(train)
        test = feature_engineer.do_fe(test)

        train_y = train["target"]
        # 3.660 - 3.658
        use_col = [
            "new_trans_elapsed_days_max",
            "new_trans_elapsed_days_min",
            "new_trans_elapsed_days_mean",  # 0.001
            "old_trans_elapsed_days_max",
            "old_trans_elapsed_days_min",
            "old_trans_elapsed_days_mean",  # 0.025 mean001
            "new_last_day",  # 0.005
            "old_installments_sum",
            "old_installments_mean",  # 0.005
            "old_month_nunique",
            "old_woy_nunique",  # 0.010
            "old_merchant_id_nunique",  # 0.002
            "new_month_lag_mean",
            "old_month_lag_mean",
            "elapsed_days",  # 0.010
            "new_purchase_amount_max",
            "new_purchase_amount_count",
            "new_purchase_amount_mean",  # 0.020
            "old_purchase_amount_max",
            "old_purchase_amount_count",
            "old_purchase_amount_mean",  # 0.002
            "old_category_1_mean",
            "new_category_1_mean",  # 0.006
            "old_authorized_flag_sum",  # "old_authorized_flag_mean", bad?
            "old_no_city_purchase_amount_min",  # 0.003
            "old_no_city_purchase_amount_max",
            "old_no_city_purchase_amount_mean",  # 0.002
            "rec1_purchase_amount_count",  # 0.005
            "old_month_lag_max",  # 0.002
            "new_time_diff_mean",
            "new_trans_elapsed_days_std",  # 0.002
            "old_month_diff_mean",
            "old_pa2_month_diff_min",  # 0.004
        ]
        train_x = train[use_col]
        test_x = test[use_col]

        for part in (train_x, train_y, test_x):
            print(part.shape)
        timer.time("prepare train in ")

        train_keys = train[["card_id", "target"]]
        test_keys = test[["card_id"]]
        return train_keys, test_keys, train_x, train_y, test_x
示例#16
0
import os, sys
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.common import pocket_lgb, evaluator
from elo.loader import input_loader
from sklearn import model_selection

# Module-level setup: runs at import time and performs the full input load.
logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# Load the complete train/test frames once; functions below read these globals.
loader = input_loader.GoldenLoader()
_train, _test = loader.load_whole_input()
timer.time("load csv")
# Feature columns used for prediction: the loader's "small" column list.
pred_col = loader.small_col


def try_some(train, test):
    """Select the ``pred_col`` features and start a submission frame.

    Uses the module-level ``pred_col`` list to pick feature columns from both
    frames and prints the resulting shapes.

    NOTE(review): in this excerpt the function stops right after creating the
    ``submission`` frame and returns nothing; the remainder of the body is
    presumably outside the visible source -- confirm against the full file.
    """
    train_x, test_x = train[pred_col], test[pred_col]
    train_y = train["target"]
    print(train_x.shape)
    print(train_y.shape)
    print(test_x.shape)

    # Submission skeleton keyed by the test card ids.
    submission = pd.DataFrame()
    submission["card_id"] = test["card_id"]
示例#17
0
    def do_cv(self, data):
        """Train one MLP per stratified fold and write submission and OOF files.

        For every fold a new network is built and trained, the test set is
        predicted and accumulated into the submission, and the held-out fold
        predictions are collected as out-of-fold (OOF) predictions.  The
        averaged submission goes to ``path_const.OUTPUT_SUB`` and the clipped
        OOF predictions to ``path_const.OUTPUT_OOF``.

        Args:
            data: 5-item sequence unpacked as
                ``(train, test, train_x, train_y, test_x)``; every item must
                expose ``.shape``.

        Returns:
            None.  Results are written to CSV and logged.
        """
        for d in data:
            print(d.shape)
        train, test, train_x, train_y, test_x = data
        timer = pocket_timer.GoldenTimer(self.logger)

        # Accumulators: test predictions are summed here and averaged at the end.
        submission = pd.DataFrame()
        submission["card_id"] = test["card_id"]
        submission["target"] = 0
        train_cv = pd.DataFrame()
        train_cv["card_id"] = train["card_id"]
        train_cv["cv_pred"] = 0

        # Binary flag (target < -30) used only as the stratification label.
        outliers = (train["target"] < -30).astype(int).values
        bagging_num = 1
        split_num = 5
        for bagging_index in range(bagging_num):
            skf = model_selection.StratifiedKFold(n_splits=split_num, shuffle=True, random_state=4590)

            total_score = 0
            train_preds = []
            for idx, (train_index, test_index) in enumerate(skf.split(train, outliers)):
                # A fresh learning-rate schedule and network are built for each fold.
                lr_schedule = learning_rate.GoldenLearningRate(0.1, 50).cosine_annealing_scheduler()
                mlp = pocket_network2.GoldenMlp2(self.epochs, self.batch_size, lr_schedule)
                network = mlp.build_model(train_x.shape[1])
                X_train, X_test = train_x.iloc[train_index], train_x.iloc[test_index]
                y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]

                print("start train")
                model, history = mlp.do_train_direct(str(idx), network, X_train, X_test, y_train, y_test)
                mlp.save_history(history, str(idx))
                print('Loading Best Model')
                # Presumably the checkpoint written during do_train_direct for
                # this fold index -- confirm against pocket_network2.
                model.load_weights(path_const.get_weight_file(str(idx)))

                # Test-set prediction: flattened to 1-D and clipped to the fixed range.
                y_pred = model.predict(test_x, batch_size=self.batch_size)
                y_pred = np.reshape(y_pred, -1)
                y_pred = np.clip(y_pred, -33.219281, 18.0)
                # NOTE(review): unlike y_pred, the validation prediction is
                # neither flattened nor clipped before scoring/storing --
                # confirm this asymmetry is intended.
                valid_set_pred = model.predict(X_test, batch_size=self.batch_size)
                score = evaluator.rmse(y_test, valid_set_pred)
                print(score)
                total_score += score

                submission["target"] = submission["target"] + y_pred
                # Keep the held-out rows' card ids next to their predictions.
                train_id = train.iloc[test_index]
                train_cv_prediction = pd.DataFrame()
                train_cv_prediction["card_id"] = train_id["card_id"]
                train_cv_prediction["cv_pred"] = valid_set_pred
                train_preds.append(train_cv_prediction)
                timer.time("done one set in")

            # The in-place add aligns on the row index (pandas semantics), so
            # this relies on train's index labels being unique.
            train_output = pd.concat(train_preds, axis=0)
            train_cv["cv_pred"] += train_output["cv_pred"]

            avg_score = str(total_score / split_num)
            self.logger.print("average score= " + avg_score)
            timer.time("end train in ")

        # Average the summed test predictions over every fold of every bagging round.
        submission["target"] = submission["target"] / (bagging_num * split_num)
        # submission["target"] = np.clip(submission["target"], -33.219281, 18.0)
        submission.to_csv(path_const.OUTPUT_SUB, index=False)

        # OOF predictions are averaged over bagging rounds, then clipped.
        train_cv["cv_pred"] = train_cv["cv_pred"] / bagging_num
        train_cv["cv_pred"] = np.clip(train_cv["cv_pred"], -33.219281, 18.0)
        train_cv.to_csv(path_const.OUTPUT_OOF, index=False)

        # Overall score of the clipped OOF predictions against the true targets.
        y_true = train_y
        y_pred = train_cv["cv_pred"]
        rmse_score = evaluator.rmse(y_true, y_pred)
        self.logger.print("evaluator rmse score= " + str(rmse_score))

        print(train["target"].describe())
        self.logger.print(train_cv.describe())
        self.logger.print(submission.describe())
        timer.time("done submission in ")