def load_whole_input(self):
    """Assemble the train and test frames used by the full model.

    Steps, in order:
      1. Read the base train/test CSVs (``TRAIN1`` / ``TEST1``).
      2. For every path in the feature list, let
         ``self.load_file_and_merge`` fold that file into both frames.
      3. Left-join the ``NEW_DAY_PRED`` out-of-fold / submission files
         onto train / test respectively, keyed on ``card_id``.
      4. Print both shapes and report to the timer.
      5. Run ``JitFe.do_fe`` on each frame.

    Returns:
        A ``(train, test)`` tuple of the feature-engineered frames.
    """
    stopwatch = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())
    reader = pocket_file_io.GoldenCsv()

    train_df = reader.read_file(path_const.TRAIN1)
    test_df = reader.read_file(path_const.TEST1)

    feature_paths = [
        path_const.RE_NEW_TRANS1,
        path_const.RE_OLD_TRANS1,
        path_const.OLD_TRANS3,
        path_const.NEW_TRANS6,
        path_const.OLD_TRANS6,
        path_const.OLD_TRANS9,
        # Disabled in the original:
        # path_const.NEW_TRANS11,
        # path_const.OLD_TRANS11,
    ]
    for feature_path in feature_paths:
        train_df, test_df = self.load_file_and_merge(
            train_df, test_df, feature_path, reader)

    # Attach the prediction files; a left join keeps every base row.
    join_on_card = {"on": "card_id", "how": "left"}
    oof_pred = reader.read_file(path_const.NEW_DAY_PRED_OOF)
    sub_pred = reader.read_file(path_const.NEW_DAY_PRED_SUB)
    train_df = pd.merge(train_df, oof_pred, **join_on_card)
    test_df = pd.merge(test_df, sub_pred, **join_on_card)

    # Disabled in the original:
    # train_df, test_df = self.load_lda(train_df, test_df, reader)

    for frame in (train_df, test_df):
        print(frame.shape)
    stopwatch.time("load csv in ")

    engineer = jit_fe.JitFe()
    train_df = engineer.do_fe(train_df)
    test_df = engineer.do_fe(test_df)
    return train_df, test_df
def load_whole_input(self, use_pred=True):
    """Assemble the train and test frames, starting from bare ``card_id`` keys.

    Both frames begin as the ``card_id`` column of the original train/test
    files. Paired (train-file, test-file) sources are merged first through
    ``self.load_train_test_and_merge``, then the shared feature files
    through ``self.load_file_and_merge``. Finally ``JitFe.do_fe`` is run on
    each frame.

    Args:
        use_pred: When truthy, the two prediction file pairs
            (``NEW_DAY_PRED_*`` and ``NEW_PUR_MAX_PRED_*``) are merged in
            addition to ``TRAIN1`` / ``TEST1``. When falsy, only
            ``TRAIN1`` / ``TEST1`` is used.

    Returns:
        A ``(train, test)`` tuple of the feature-engineered frames.
    """
    reader = pocket_file_io.GoldenCsv()
    train_df = reader.read_file(path_const.ORG_TRAIN)[["card_id"]]
    test_df = reader.read_file(path_const.ORG_TEST)[["card_id"]]

    # The first pair is always used; the rest only when use_pred is set.
    all_pairs = [
        (path_const.TRAIN1, path_const.TEST1),
        (path_const.NEW_DAY_PRED_OOF, path_const.NEW_DAY_PRED_SUB),
        (path_const.NEW_PUR_MAX_PRED_OOF, path_const.NEW_PUR_MAX_PRED_SUB),
    ]
    paired_paths = all_pairs if use_pred else all_pairs[:1]

    shared_paths = [
        path_const.RE_NEW_TRANS1,
        path_const.RE_OLD_TRANS1,
        path_const.OLD_TRANS3,
        path_const.NEW_TRANS6,
        path_const.OLD_TRANS6,
        path_const.OLD_TRANS9,
        path_const.NEW_TRANS11,
        path_const.OLD_TRANS11,
        # Disabled in the original:
        # path_const.NEW_TRANS13,
        # path_const.OLD_TRANS13,
        # path_const.FEAT_FROM_TS_NEW,
        # path_const.FEAT_FROM_TS_OLD,
        # path_const.FEAT_FROM_TS_NEW2,
        # path_const.FEAT_FROM_TS_OLD2,
    ]

    for train_path, test_path in paired_paths:
        train_df, test_df = self.load_train_test_and_merge(
            train_df, test_df, train_path, test_path, reader)

    for shared_path in shared_paths:
        train_df, test_df = self.load_file_and_merge(
            train_df, test_df, shared_path, reader)

    for frame in (train_df, test_df):
        print(frame.shape)

    engineer = jit_fe.JitFe()
    train_df = engineer.do_fe(train_df)
    test_df = engineer.do_fe(test_df)
    return train_df, test_df
def load_small_input():
    """Load a reduced, hand-picked feature set for train and test.

    Reads the base train/test CSVs plus four transaction feature files,
    left-joins each feature file onto both frames by ``card_id``, runs
    ``JitFe.do_fe`` on the results and selects the columns in ``use_col``.

    Returns:
        A 5-tuple ``(train_id, test_id, train_x, train_y, test_x)`` where
        ``train_id`` is ``train[["card_id", "target"]]``, ``test_id`` is
        ``test[["card_id"]]``, ``train_x`` / ``test_x`` are the selected
        feature columns and ``train_y`` is the ``target`` column of train.
    """
    stopwatch = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())
    reader = pocket_file_io.GoldenCsv()

    train_df = reader.read_file(path_const.TRAIN1)
    test_df = reader.read_file(path_const.TEST1)
    # Order matters: the tables are merged in exactly this sequence.
    side_tables = [
        reader.read_file(path_const.RE_NEW_TRANS1),
        reader.read_file(path_const.RE_OLD_TRANS1),
        reader.read_file(path_const.NEW_TRANS6),
        reader.read_file(path_const.OLD_TRANS6),
    ]
    print(train_df.shape)
    print(test_df.shape)
    stopwatch.time("load csv in ")

    # Left joins keep every base row even when a card has no feature row.
    for table in side_tables:
        train_df = pd.merge(train_df, table, on="card_id", how="left")
    for table in side_tables:
        test_df = pd.merge(test_df, table, on="card_id", how="left")

    engineer = jit_fe.JitFe()
    train_df = engineer.do_fe(train_df)
    test_df = engineer.do_fe(test_df)

    train_y = train_df["target"]

    # The numeric comments below are carried over verbatim from the
    # original. They look like per-group score deltas, but nothing in this
    # file says so -- TODO confirm with the author.
    # 3.660 - 3.658
    use_col = [
        "new_trans_elapsed_days_max",
        "new_trans_elapsed_days_min",
        "new_trans_elapsed_days_mean",
        # 0.001
        "old_trans_elapsed_days_max",
        "old_trans_elapsed_days_min",
        "old_trans_elapsed_days_mean",
        # 0.025 mean001
        "new_last_day",
        # 0.005
        "old_installments_sum",
        "old_installments_mean",
        # 0.005
        "old_month_nunique",
        "old_woy_nunique",
        # 0.010
        "old_merchant_id_nunique",
        # 0.002
        "new_month_lag_mean",
        "old_month_lag_mean",
        "elapsed_days",
        # 0.010
        "new_purchase_amount_max",
        "new_purchase_amount_count",
        "new_purchase_amount_mean",
        # 0.020
        "old_purchase_amount_max",
        "old_purchase_amount_count",
        "old_purchase_amount_mean",
        # 0.002
        "old_category_1_mean",
        "new_category_1_mean",
        # 0.006
        "old_authorized_flag_sum",
        # "old_authorized_flag_mean", bad?
        "old_no_city_purchase_amount_min",
        # 0.003
        "old_no_city_purchase_amount_max",
        "old_no_city_purchase_amount_mean",
        # 0.002
        "rec1_purchase_amount_count",
        # 0.005
        "old_month_lag_max",
        # 0.002
        "new_time_diff_mean",
        "new_trans_elapsed_days_std",
        # 0.002
        "old_month_diff_mean",
        "old_pa2_month_diff_min",
        # 0.004
    ]

    train_x = train_df[use_col]
    test_x = test_df[use_col]
    for part in (train_x, train_y, test_x):
        print(part.shape)
    stopwatch.time("prepare train in ")

    train_id = train_df[["card_id", "target"]]
    test_id = test_df[["card_id"]]
    return train_id, test_id, train_x, train_y, test_x
train = pd.merge(train, old_trans3, on="card_id", how="left") train = pd.merge(train, new_trans6, on="card_id", how="left") train = pd.merge(train, old_trans6, on="card_id", how="left") train = pd.merge(train, old_trans9, on="card_id", how="left") # test = pd.merge(test, new_trans, on="card_id", how="left") test = pd.merge(test, old_trans, on="card_id", how="left") test = pd.merge(test, old_trans3, on="card_id", how="left") test = pd.merge(test, new_trans6, on="card_id", how="left") test = pd.merge(test, old_trans6, on="card_id", how="left") test = pd.merge(test, old_trans9, on="card_id", how="left") print(train.shape) print(test.shape) # fer = jit_fe.JitFe() train = fer.do_fe(train) test = fer.do_fe(test) pred_train = csv_io.read_file(path_const.NEW_DAY_PRED_OOF) pred_test = csv_io.read_file(path_const.NEW_DAY_PRED_SUB) train = pd.merge(train, pred_train, on="card_id", how="left") train["pred_diff"] = train["pred_new"] - train["new_to_last_day"] test = pd.merge(test, pred_test, on="card_id", how="left") test["pred_diff"] = test["pred_new"] - test["new_to_last_day"] train_y = train["target"] drop_col = [ "card_id", "target", # "feature_1", "feature_2", "feature_3", "old_weekend_mean",