예제 #1
0
    def load_whole_input(self):
        """Assemble the train and test inputs and run feature engineering.

        Steps, in order:

        1. Read the base train/test files (``path_const.TRAIN1`` /
           ``path_const.TEST1``).
        2. Fold every file in ``feature_paths`` into both inputs via
           ``self.load_file_and_merge``.
        3. Left-join the files at ``path_const.NEW_DAY_PRED_OOF`` (train) and
           ``path_const.NEW_DAY_PRED_SUB`` (test) on ``card_id``.
        4. Print both shapes, record the elapsed load time, and pass each
           frame through ``jit_fe.JitFe.do_fe``.

        Returns:
            tuple: ``(train, test)`` as returned by ``do_fe``.
        """
        stopwatch = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())
        reader = pocket_file_io.GoldenCsv()

        train = reader.read_file(path_const.TRAIN1)
        test = reader.read_file(path_const.TEST1)

        # NEW_TRANS11 / OLD_TRANS11 are deliberately left out of this list.
        feature_paths = (
            path_const.RE_NEW_TRANS1,
            path_const.RE_OLD_TRANS1,
            path_const.OLD_TRANS3,
            path_const.NEW_TRANS6,
            path_const.OLD_TRANS6,
            path_const.OLD_TRANS9,
        )
        for feature_path in feature_paths:
            train, test = self.load_file_and_merge(
                train, test, feature_path, reader)

        # Both prediction files are attached with the same left join so no
        # row of the base frames is dropped.
        join_args = {"on": "card_id", "how": "left"}
        train_pred = reader.read_file(path_const.NEW_DAY_PRED_OOF)
        test_pred = reader.read_file(path_const.NEW_DAY_PRED_SUB)
        train = pd.merge(train, train_pred, **join_args)
        test = pd.merge(test, test_pred, **join_args)
        # The self.load_lda(train, test, reader) step is currently disabled.

        for frame in (train, test):
            print(frame.shape)
        stopwatch.time("load csv in ")

        engineer = jit_fe.JitFe()
        return engineer.do_fe(train), engineer.do_fe(test)
예제 #2
0
    def load_whole_input(self, use_pred=True):
        """Assemble the train and test inputs starting from bare ``card_id`` frames.

        The original train/test files are read and reduced to their
        ``card_id`` column; everything else is attached afterwards:

        * paired (train file, test file) sources go through
          ``self.load_train_test_and_merge``;
        * single files listed in ``shared_paths`` go through
          ``self.load_file_and_merge``.

        Args:
            use_pred: When truthy, the NEW_DAY_PRED and NEW_PUR_MAX_PRED file
                pairs are merged in addition to the TRAIN1/TEST1 pair. When
                falsy, only the TRAIN1/TEST1 pair is used.

        Returns:
            tuple: ``(train, test)`` as returned by ``jit_fe.JitFe.do_fe``.
        """
        reader = pocket_file_io.GoldenCsv()

        train = reader.read_file(path_const.ORG_TRAIN)[["card_id"]]
        test = reader.read_file(path_const.ORG_TEST)[["card_id"]]

        every_pair = [
            (path_const.TRAIN1, path_const.TEST1),
            (path_const.NEW_DAY_PRED_OOF, path_const.NEW_DAY_PRED_SUB),
            (path_const.NEW_PUR_MAX_PRED_OOF, path_const.NEW_PUR_MAX_PRED_SUB),
        ]
        # Without predictions only the first (base) pair is kept.
        paired_paths = every_pair if use_pred else every_pair[:1]

        # Not enabled here: NEW_TRANS13 / OLD_TRANS13, FEAT_FROM_TS_NEW /
        # FEAT_FROM_TS_OLD and FEAT_FROM_TS_NEW2 / FEAT_FROM_TS_OLD2.
        shared_paths = (
            path_const.RE_NEW_TRANS1,
            path_const.RE_OLD_TRANS1,
            path_const.OLD_TRANS3,
            path_const.NEW_TRANS6,
            path_const.OLD_TRANS6,
            path_const.OLD_TRANS9,
            path_const.NEW_TRANS11,
            path_const.OLD_TRANS11,
        )

        for train_path, test_path in paired_paths:
            train, test = self.load_train_test_and_merge(
                train, test, train_path, test_path, reader)
        for shared_path in shared_paths:
            train, test = self.load_file_and_merge(
                train, test, shared_path, reader)

        for frame in (train, test):
            print(frame.shape)

        engineer = jit_fe.JitFe()
        return engineer.do_fe(train), engineer.do_fe(test)
예제 #3
0
    def load_small_input():
        """Load a reduced feature set restricted to a fixed list of columns.

        Reads the base train/test files plus four transaction files, left-joins
        each transaction file onto both base frames on ``card_id``, runs
        ``jit_fe.JitFe.do_fe`` on the results, and then selects the columns
        named in ``selected_cols``.

        Returns:
            tuple: ``(train_ids, test_ids, train_x, train_y, test_x)`` where
            ``train_ids`` is ``train[["card_id", "target"]]``, ``test_ids`` is
            ``test[["card_id"]]``, ``train_x`` / ``test_x`` hold the selected
            columns and ``train_y`` is ``train["target"]``.
        """
        stopwatch = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())
        reader = pocket_file_io.GoldenCsv()

        train = reader.read_file(path_const.TRAIN1)
        test = reader.read_file(path_const.TEST1)
        # Read in the same order in which the frames are merged below.
        extra_frames = [
            reader.read_file(extra_path)
            for extra_path in (
                path_const.RE_NEW_TRANS1,
                path_const.RE_OLD_TRANS1,
                path_const.NEW_TRANS6,
                path_const.OLD_TRANS6,
            )
        ]
        for frame in (train, test):
            print(frame.shape)
        stopwatch.time("load csv in ")

        # Left joins keep every row of the base frames.
        for extra in extra_frames:
            train = pd.merge(train, extra, on="card_id", how="left")
        for extra in extra_frames:
            test = pd.merge(test, extra, on="card_id", how="left")

        engineer = jit_fe.JitFe()
        train = engineer.do_fe(train)
        test = engineer.do_fe(test)

        # Trailing numbers below are the original author's notes (presumably
        # validation-score deltas -- unverified).
        # 3.660 - 3.658
        selected_cols = [
            "new_trans_elapsed_days_max",
            "new_trans_elapsed_days_min",
            "new_trans_elapsed_days_mean",  # 0.001
            "old_trans_elapsed_days_max",
            "old_trans_elapsed_days_min",
            "old_trans_elapsed_days_mean",  # 0.025 mean001
            "new_last_day",  # 0.005
            "old_installments_sum",
            "old_installments_mean",  # 0.005
            "old_month_nunique",
            "old_woy_nunique",  # 0.010
            "old_merchant_id_nunique",  # 0.002
            "new_month_lag_mean",
            "old_month_lag_mean",
            "elapsed_days",  # 0.010
            "new_purchase_amount_max",
            "new_purchase_amount_count",
            "new_purchase_amount_mean",  # 0.020
            "old_purchase_amount_max",
            "old_purchase_amount_count",
            "old_purchase_amount_mean",  # 0.002
            "old_category_1_mean",
            "new_category_1_mean",  # 0.006
            "old_authorized_flag_sum",  # "old_authorized_flag_mean", bad?
            "old_no_city_purchase_amount_min",  # 0.003
            "old_no_city_purchase_amount_max",
            "old_no_city_purchase_amount_mean",  # 0.002
            "rec1_purchase_amount_count",  # 0.005
            "old_month_lag_max",  # 0.002
            "new_time_diff_mean",
            "new_trans_elapsed_days_std",  # 0.002
            "old_month_diff_mean",
            "old_pa2_month_diff_min",  # 0.004
        ]
        train_y = train["target"]
        train_x = train[selected_cols]
        test_x = test[selected_cols]

        for part in (train_x, train_y, test_x):
            print(part.shape)
        stopwatch.time("prepare train in ")

        train_ids = train[["card_id", "target"]]
        test_ids = test[["card_id"]]
        return train_ids, test_ids, train_x, train_y, test_x
예제 #4
0
train = pd.merge(train, old_trans3, on="card_id", how="left")
train = pd.merge(train, new_trans6, on="card_id", how="left")
train = pd.merge(train, old_trans6, on="card_id", how="left")
train = pd.merge(train, old_trans9, on="card_id", how="left")
#
test = pd.merge(test, new_trans, on="card_id", how="left")
test = pd.merge(test, old_trans, on="card_id", how="left")
test = pd.merge(test, old_trans3, on="card_id", how="left")
test = pd.merge(test, new_trans6, on="card_id", how="left")
test = pd.merge(test, old_trans6, on="card_id", how="left")
test = pd.merge(test, old_trans9, on="card_id", how="left")

print(train.shape)
print(test.shape)
#
fer = jit_fe.JitFe()
train = fer.do_fe(train)
test = fer.do_fe(test)

pred_train = csv_io.read_file(path_const.NEW_DAY_PRED_OOF)
pred_test = csv_io.read_file(path_const.NEW_DAY_PRED_SUB)
train = pd.merge(train, pred_train, on="card_id", how="left")
train["pred_diff"] = train["pred_new"] - train["new_to_last_day"]
test = pd.merge(test, pred_test, on="card_id", how="left")
test["pred_diff"] = test["pred_new"] - test["new_to_last_day"]

train_y = train["target"]
drop_col = [
    "card_id",
    "target",  # "feature_1", "feature_2", "feature_3",
    "old_weekend_mean",