class Handler:
    """GTK signal handler for a pomodoro-timer window.

    Owns a queue fed by a background ``Watch`` (project class; appears to
    push elapsed-time ticks — TODO confirm tick unit) and mirrors the
    remaining time into the ``lbl_time`` label.
    """

    def __init__(self, lbl_time):
        self.q = queue.Queue()
        self.lbl_time = lbl_time
        self.watch = Watch(self.q, 1)
        # Fix: self.time was previously unset until one of the mode buttons
        # was clicked, so pressing Start or Reset first raised
        # AttributeError.  Default to the pomodoro duration (25 min).
        self.time = 25 * 60.0

    def convert(self, t):
        """Format a non-negative duration ``t`` (seconds) as ``'MM:SS'``."""
        m = int(t / 60)
        s = int(t % 60)
        return '{:02d}:{:02d}'.format(m, s)

    def worker(self):
        """Consume tick events from the watch and refresh the label.

        NOTE(review): ``lbl_time.set_text`` is invoked from this worker
        thread; GTK widgets are generally not thread-safe — consider
        routing the update through ``GLib.idle_add``.
        """
        self.rem = self.time
        while self.watch.is_alive():
            # Fix: a blocking get() could hang this thread forever if the
            # watch stopped between the is_alive() check and the get();
            # poll with a timeout instead and re-check liveness.
            try:
                t = self.q.get(timeout=1)
            except queue.Empty:
                continue
            self.rem -= t
            self.lbl_time.set_text(self.convert(self.rem))
            self.q.task_done()

    def window_destroy_cb(self, *args):
        """Window 'destroy' signal: terminate the GTK main loop."""
        Gtk.main_quit()

    def btn_pomodoro_clicked_cb(self, button):
        """Select a 25-minute pomodoro session."""
        self.lbl_time.set_text("25:00")
        self.time = 25 * 60.0

    def btn_short_break_clicked_cb(self, button):
        """Select a 5-minute short break."""
        self.lbl_time.set_text("05:00")
        self.time = 5 * 60.0

    def btn_long_break_clicked_cb(self, button):
        """Select a 10-minute long break."""
        self.lbl_time.set_text("10:00")
        self.time = 10 * 60.0

    def btn_start_clicked_cb(self, button):
        """Start the countdown: spawn a fresh watch plus a label-updater.

        A thread object cannot be restarted, hence a new Watch per run.
        """
        self.watch = Watch(self.q, 1)
        self.watch.set_t_max(self.time)
        self.watch.start()
        thread = threading.Thread(target=self.worker)
        thread.start()

    def btn_stop_clicked_cb(self, button):
        """Stop the running countdown."""
        self.watch.stop()

    def btn_reset_clicked_cb(self, button):
        """Reset the label to the currently selected duration."""
        self.lbl_time.set_text(self.convert(self.time))
def run():
    """End-to-end training pipeline: load, preprocess, validate, predict.

    Reads application/bureau data, preprocesses train and test sets,
    validates an XGBoost classifier (k-fold or hold-out), and — unless
    stopped after validation — trains on the full set, dumps the model,
    and writes train/test probability CSVs.
    """
    # --- run configuration flags ---
    cross_validation = True        # k-fold CV vs hold-out split below
    perform_imputation = False     # if False, raw (possibly NaN) values pass through
    stop_after_validation = True   # skip final training + submission output
    rand_seed = 1

    # --- load raw data (timed by project Watch stopwatch) ---
    print("Reading data")
    read_watch = Watch("Reading data")
    read_watch.start()
    df_app_train, df_app_test = load_app_data()
    read_watch.stop()
    print("Finish reading data")

    # Columns with missing values, split by imputation strategy.
    missing_fill_mean = ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]
    missing_fill_most_freq = [
        "CNT_FAM_MEMBERS", "AMT_ANNUITY", "DAYS_LAST_PHONE_CHANGE"
    ]
    mean_imputer = SimpleImputer(strategy="mean")
    most_freq_imputer = SimpleImputer(strategy="most_frequent")

    # --- preprocessing ---
    preprocess_watch = Watch("Preprocess")
    print("Preprocess training data")
    preprocess_watch.start()
    df_bureau_agg = None
    df_prev_app_agg = None
    df_bureau_agg = get_preprocessed_bureau_data()
    print("Finish preprocessing bureau data")
    # df_prev_app_agg = get_preprocessed_previous_app_data(False, False)
    # print("Finish preprocessing previous application data")
    df_app_train = shuffle(df_app_train, random_state=rand_seed)
    X_train = preprocess_app(df_app_train, df_bureau_agg, df_prev_app_agg)
    if perform_imputation:
        # Fit imputers on TRAIN only; test set reuses the fitted statistics.
        X_train[missing_fill_mean] = pd.DataFrame(mean_imputer.fit_transform(
            df_app_train[missing_fill_mean]), index=df_app_train.index)
        X_train[missing_fill_most_freq] = pd.DataFrame(
            most_freq_imputer.fit_transform(
                df_app_train[missing_fill_most_freq]),
            index=df_app_train.index)
    else:
        X_train[missing_fill_mean] = df_app_train[missing_fill_mean]
        X_train[missing_fill_most_freq] = df_app_train[missing_fill_most_freq]
    y_train = df_app_train["TARGET"]

    print("Preprocess test data")
    X_test = preprocess_app(df_app_test, df_bureau_agg, df_prev_app_agg)
    if perform_imputation:
        # transform (not fit_transform): apply train-set statistics.
        X_test[missing_fill_mean] = pd.DataFrame(mean_imputer.transform(
            df_app_test[missing_fill_mean]), index=df_app_test.index)
        X_test[missing_fill_most_freq] = pd.DataFrame(
            most_freq_imputer.transform(df_app_test[missing_fill_most_freq]),
            index=df_app_test.index)
    else:
        X_test[missing_fill_mean] = df_app_test[missing_fill_mean]
        X_test[missing_fill_most_freq] = \
            df_app_test[missing_fill_most_freq]

    # Align test columns to train: add missing ones as 0, drop extras,
    # then reorder so both frames share an identical column layout.
    if not X_test.columns.equals(X_train.columns):
        X_test[X_train.columns.difference(X_test.columns)] = 0
        X_test.drop(X_test.columns.difference(X_train.columns),
                    axis=1,
                    inplace=True)
        X_test = X_test.reindex(columns=X_train.columns, axis=1)
    assert X_train.columns.equals(X_test.columns)
    preprocess_watch.stop()
    print("Training data shape:", X_train.shape)
    X_train.info(verbose=5)

    # --- classifier setup (alternatives kept for experimentation) ---
    print("Initializing classifier")
    weight_dict = {0: 1, 1: 1}
    clf = XGBClassifier(max_depth=10,
                        min_child_weight=10,
                        seed=rand_seed,
                        tree_method="gpu_hist")
    # clf = XGBClassifier(max_depth=8, min_child_weight=12, seed=1)
    # clf = GradientBoostingClassifier(max_depth=10, min_samples_split=15, verbose=5)
    # clf = DecisionTreeClassifier(class_weight=weight_dict, max_depth=15, min_samples_split=4)
    # clf = LogisticRegression(class_weight=weight_dict)
    # clf = LGBMClassifier(
    #     n_jobs=8,
    #     n_estimators=10000,
    #     learning_rate=0.02,
    #     num_leaves=34,
    #     colsample_bytree=0.9497036,
    #     subsample=0.8715623,
    #     max_depth=8,
    #     reg_alpha=0.041545473,
    #     reg_lambda=0.0735294,
    #     min_split_gain=0.0222415,
    #     min_child_weight=39.3259775,
    #     silent=-1,
    #     verbose=-1)

    print("Choosing classifier parameters")
    # model_selection_watch = Watch("Model selection")
    # params = {"max_depth": [5, 8, 10], "min_child_weight": [10, 12]}
    # model_selection_watch.start()
    # grid_clf = GridSearchCV(clf, param_grid=params, scoring="roc_auc", cv=5, verbose=5).fit(X_train, y_train)
    # model_selection_watch.stop()
    # print(grid_clf.best_score_)
    # print(grid_clf.best_params_)
    # print(grid_clf.cv_results_)
    # clf = grid_clf.best_estimator_

    # --- validation: k-fold CV or a single hold-out split ---
    w = Watch("Validation")
    w.start()
    if cross_validation:
        k_fold = 5
        print("Perform {:d}-fold cross validation".format(k_fold))
        # Mean AUC across folds.
        score_val = sum(
            cross_val_score(clf,
                            X_train,
                            y_train,
                            cv=k_fold,
                            scoring="roc_auc",
                            verbose=5,
                            n_jobs=2)) / k_fold
    else:
        test_size = 0.1
        print("Perform hold-out validation (Test size: {:.0%})".format(
            test_size))
        # NOTE: X_train/y_train are re-bound to the reduced split here, so
        # the optional final training below fits on the split, not the full set.
        X_train, X_val, y_train, y_val = \
            train_test_split(
                X_train, y_train, test_size=test_size,
                random_state=rand_seed)
        print(X_train[:10])
        clf.fit(X_train, y_train)
        # mean_imputer.transform(X_val[missing_fill_mean])
        # most_freq_imputer.transform(X_val[missing_fill_most_freq])
        prob_val = clf.predict_proba(X_val)[:, 1]
        score_val = roc_auc_score(y_val, prob_val)
    w.stop()
    print("Validation AUC: %.6f" % score_val)
    # print(clf.feature_importances_)
    if stop_after_validation:
        Watch.print_all()
        return

    # --- final training and artifact output ---
    print("Training classifier")
    train_watch = Watch("Training")
    train_watch.start()
    clf.fit(X_train, y_train)
    train_watch.stop()
    print("Dumping trained classifier")
    from joblib import dump
    dump(clf, 'boost_tree_gpu_0.joblib')
    print("Classify test set")
    train_prob_df = pd.DataFrame(clf.predict_proba(X_train)[:, 1],
                                 index=X_train.index,
                                 columns=["PRED_PROB"])
    train_prob_df.to_csv("train_prob.csv")
    test_prob_df = pd.DataFrame(clf.predict_proba(X_test)[:, 1],
                                index=X_test.index,
                                columns=["TARGET"])
    test_prob_df.to_csv("submission.csv")
    Watch.print_all()
def clean_inst_pay():
    """Clean the installments-payments table and write a processed CSV.

    Two aggregation passes:
      1. collapse multiple installment rows that share one payment event,
      2. collapse multiple payment rows that belong to one installment,
    then derive overdue/unpaid amount features and save the result to
    ``data\\installments_payments_processed.csv``.

    Asserts throughout validate data-consistency assumptions; ``del`` calls
    release large intermediates to keep peak memory down.
    """
    df_inst_pay = load_install_payments(False)
    print_memory_usage(df_inst_pay, "installment_payments")
    # Sentinels for missing values: 0 for entry day, -1 for payment amount,
    # so groupby keys below stay well-defined.
    df_inst_pay.DAYS_ENTRY_PAYMENT.fillna(0, inplace=True)
    df_inst_pay.AMT_PAYMENT.fillna(-1, inplace=True)
    # A record is valid when either a payment or an installment amount is positive.
    df_inst_pay_valid_filter = (df_inst_pay["AMT_PAYMENT"] >
                                0) | (df_inst_pay["AMT_INSTALMENT"] > 0)
    print("Remove {:d} invalid records.".format(
        (~df_inst_pay_valid_filter).sum()))
    # Pass 1 key: one group per payment event of a given installment number.
    df_inst_pay_group = df_inst_pay[df_inst_pay_valid_filter].groupby([
        "SK_ID_PREV", "NUM_INSTALMENT_NUMBER", "DAYS_ENTRY_PAYMENT",
        "AMT_PAYMENT"
    ])
    del df_inst_pay_valid_filter
    w = Watch("Aggregation 1")
    print("Aggregate multiple installments for one payment")
    w.start()
    df_inst_pay_group_cnt = df_inst_pay_group.size()
    df_inst_agg = df_inst_pay_group.agg({
        "SK_ID_CURR": ["min", "max"],
        "NUM_INSTALMENT_VERSION": ["max", "nunique"],
        "DAYS_INSTALMENT": ["min", "max"],
        "AMT_INSTALMENT": ["min", "max", "sum"]
    })
    # Flatten the (column, stat) MultiIndex into e.g. "SK_ID_CURR_min".
    df_inst_agg.columns = ['_'.join(col) for col in df_inst_agg.columns]
    del df_inst_pay_group
    w.stop()
    print_memory_usage(df_inst_agg, "installment_pay_aggregation_1")

    print("Processing 1")
    # Each group must map to exactly one customer.
    assert (
        df_inst_agg["SK_ID_CURR_min"] == df_inst_agg["SK_ID_CURR_max"]).all(
            axis=None), "Inconsistent SK_ID_CURR"
    df_inst_pay_processed = pd.DataFrame(index=df_inst_agg.index)
    df_inst_pay_processed["SK_ID_CURR"] = df_inst_agg["SK_ID_CURR_min"]
    df_inst_pay_group_cnt_distict = df_inst_agg[
        "NUM_INSTALMENT_VERSION_nunique"]
    # Multi-row groups must be exactly a pair of distinct versions,
    # otherwise a single version per group.
    df_inst_pay_group_check = ((df_inst_pay_group_cnt == 2) |
                               (df_inst_pay_group_cnt_distict == 1))
    assert df_inst_pay_group_check.all(axis=None)
    del df_inst_pay_group_cnt, df_inst_pay_group_check
    df_inst_pay_processed["NUM_INSTALMENT_VERSION"] = df_inst_agg[
        "NUM_INSTALMENT_VERSION_max"]
    assert (df_inst_agg["DAYS_INSTALMENT_min"] ==
            df_inst_agg["DAYS_INSTALMENT_max"]).all(axis=None)
    df_inst_pay_processed["DAYS_INSTALMENT"] = df_inst_agg[
        "DAYS_INSTALMENT_min"]
    # Groups spanning two installment versions: sum their amounts;
    # single-version groups must carry one consistent amount.
    df_agg_filter = (df_inst_pay_group_cnt_distict == 2)
    assert (df_agg_filter |
            (df_inst_agg["AMT_INSTALMENT_min"] ==
             df_inst_agg["AMT_INSTALMENT_max"])).all(axis=None)
    df_inst_pay_processed["AMT_INSTALMENT"] = df_inst_agg["AMT_INSTALMENT_min"]
    df_inst_pay_processed.loc[
        df_agg_filter, "AMT_INSTALMENT"] = df_inst_agg["AMT_INSTALMENT_sum"]
    print("%d payments aggregated" % df_agg_filter.sum())
    del df_inst_pay_group_cnt_distict, df_agg_filter
    df_inst_pay_processed.reset_index(inplace=True)

    # Downcast for memory; -1 payment sentinel becomes -inf so that
    # summing in pass 2 keeps missing-payment groups negative.
    # df_inst_pay_processed["DAYS_ENTRY_PAYMENT"].astype(np.float16, copy=False)
    df_inst_pay_processed["DAYS_ENTRY_PAYMENT"] = df_inst_pay_processed[
        "DAYS_ENTRY_PAYMENT"].astype(np.float16, copy=False)
    df_inst_pay_processed["AMT_PAYMENT"] = df_inst_pay_processed[
        "AMT_PAYMENT"].astype(np.float32, copy=False)
    df_inst_pay_processed["AMT_PAYMENT"].replace(-1, -np.inf, inplace=True)
    # Missing payment amount implies missing entry day (sentinel 0).
    assert ((df_inst_pay_processed["AMT_PAYMENT"] >= 0) |
            (df_inst_pay_processed["DAYS_ENTRY_PAYMENT"] == 0)).all(axis=None)
    # Days between actual payment and due date (<= 0 means paid on time).
    df_diff_entry_offset = df_inst_pay_processed[
        "DAYS_ENTRY_PAYMENT"] - df_inst_pay_processed["DAYS_INSTALMENT"]
    # Amount paid by the due date / within 30 days of it.
    df_inst_pay_processed["AMT_DUE_PAYMENT"] = (
        np.fmax(df_inst_pay_processed["AMT_PAYMENT"], 0) *
        (df_diff_entry_offset <= 0))
    df_inst_pay_processed["AMT_DUE30_PAYMENT"] = (
        np.fmax(df_inst_pay_processed["AMT_PAYMENT"], 0) *
        (df_diff_entry_offset <= 30))
    print_memory_usage(df_inst_pay_processed, "inst_pay_processed_1")
    # print(df_inst_pay_processed.query("(SK_ID_PREV == 1001758) & (NUM_INSTALMENT_NUMBER == 24)").transpose())

    # Pass 2 key: one group per installment (possibly paid in several parts).
    df_inst_pay_group = df_inst_pay_processed.groupby(
        ["SK_ID_PREV", "NUM_INSTALMENT_NUMBER", "NUM_INSTALMENT_VERSION"])
    del df_diff_entry_offset, df_inst_pay_processed, df_inst_agg
    w = Watch("Aggregation 2")
    print("Aggregate multiple payments for one installment")
    w.start()
    df_inst_pay_group_cnt = df_inst_pay_group.size()
    df_inst_agg = df_inst_pay_group.agg(
        {
            "SK_ID_CURR": ["min", "max"],
            # "NUM_INSTALMENT_VERSION": ["min", "max"],
            "DAYS_INSTALMENT": ["min", "max"],
            "DAYS_ENTRY_PAYMENT": ["min", "max"],
            "AMT_INSTALMENT": ["min", "max", "sum"],
            "AMT_PAYMENT": ["sum"],
            "AMT_DUE_PAYMENT": ["sum"],
            "AMT_DUE30_PAYMENT": ["sum"]
        },
        skipna=False)
    df_inst_agg.columns = ['_'.join(col) for col in df_inst_agg.columns]
    del df_inst_pay_group
    w.stop()
    print("Finish aggregations")
    gc.collect()
    print_memory_usage(df_inst_agg, "installment_pay_aggregation_2")

    print("Processing 2")
    w = Watch("Processing 2")
    w.start()
    assert (df_inst_agg["SK_ID_CURR_min"] == df_inst_agg["SK_ID_CURR_max"]
            ).all(), "Inconsistent SK_ID_CURR"
    df_inst_pay_processed = pd.DataFrame(index=df_inst_agg.index)
    df_inst_pay_processed["SK_ID_CURR"] = df_inst_agg["SK_ID_CURR_min"]
    # df_inst_agg_INST_VER = df_inst_agg["NUM_INSTALMENT_VERSION"]
    # assert (df_inst_agg_INST_VER["min"] == df_inst_agg_INST_VER["max"]).all(axis=None), "Inconsistent NUM_INSTALMENT_VERSION"
    # df_inst_pay_processed["NUM_INSTALMENT_VERSION"] = df_inst_agg_INST_VER["min"]
    assert (df_inst_agg["DAYS_INSTALMENT_min"] ==
            df_inst_agg["DAYS_INSTALMENT_max"]).all(
                axis=None), "Inconsistent DAYS_INSTALMENT"
    df_inst_pay_processed["DAYS_INSTALMENT"] = df_inst_agg[
        "DAYS_INSTALMENT_min"]
    # Sentinel 0 entry day back to NaN for first/last payment dates.
    df_inst_pay_processed["DAYS_FIRST_PAYMENT"] = df_inst_agg[
        "DAYS_ENTRY_PAYMENT_min"].replace(0, np.nan)
    df_inst_pay_processed["DAYS_LAST_PAYMENT"] = df_inst_agg[
        "DAYS_ENTRY_PAYMENT_max"].replace(0, np.nan)
    assert (df_inst_agg["AMT_INSTALMENT_min"] ==
            df_inst_agg["AMT_INSTALMENT_max"]).all(axis=None)
    df_inst_pay_processed["AMT_INSTALMENT"] = df_inst_agg["AMT_INSTALMENT_min"]
    # Fix missing installment info
    # df_prev_app_ann = pd.read_csv(r"data\previous_application.csv", index_col=0, usecols=[0, 3])
    # df_inst_agg = df_inst_agg.join(df_prev_app_ann, how="left")
    #
    # df_annuity_check = ((df_inst_agg.index.get_level_values(2) != 1) | df_inst_agg["AMT_ANNUITY"].isna() |
    #                     (df_inst_agg["AMT_INSTALMENT_min"] == 0) |
    #                     ((df_inst_agg["AMT_ANNUITY"] - df_inst_agg["AMT_INSTALMENT_min"]).abs() < 0.01))
    # assert df_annuity_check.all(axis=None)
    # inst_fix_filter = ((df_inst_agg["NUM_INSTALMENT_VERSION"] == 1) & (df_inst_agg["AMT_INSTALMENT_min"] == 0))
    # df_inst_pay_processed.loc[inst_fix_filter, "AMT_INSTALMENT"] = df_inst_agg.loc[inst_fix_filter, "AMT_ANNUITY"]
    # del df_annuity_check, inst_fix_filter
    # inst_fix_filter = (df_inst_agg["AMT_INSTALMENT_min"] == 0)
    # df_inst_pay_processed.loc[inst_fix_filter, "AMT_INSTALMENT"] = df_inst_agg.loc[inst_fix_filter, "AMT_PAYMENT_sum"]
    # del inst_fix_filter

    # Negative payment sum means the -inf sentinel survived, i.e. no real
    # payment recorded; such groups must be singletons, and their payment
    # becomes NaN with zero payment count.
    df_inst_pay_invalid_filter = (df_inst_agg["AMT_PAYMENT_sum"] < 0)
    assert ((~df_inst_pay_invalid_filter) |
            (df_inst_pay_group_cnt == 1)).all(axis=None)
    df_inst_pay_processed["AMT_PAYMENT"] = df_inst_agg["AMT_PAYMENT_sum"]
    df_inst_pay_processed.loc[df_inst_pay_invalid_filter,
                              "AMT_PAYMENT"] = np.nan
    assert (df_inst_pay_processed["AMT_PAYMENT"] != 0).all(axis=None)
    df_inst_pay_invalid_filter = df_inst_pay_processed["AMT_PAYMENT"].isnull()
    df_inst_pay_processed["NUM_PAYMENTS"] = df_inst_pay_group_cnt.astype(
        np.uint16)
    df_inst_pay_processed.loc[df_inst_pay_invalid_filter,
                              "NUM_PAYMENTS"] = np.uint16(0)
    print("%d installments aggregated" % (df_inst_pay_group_cnt > 1).sum())
    del df_inst_pay_group_cnt, df_inst_pay_invalid_filter
    # Derived features; the "*= (x >= 0.01)" pattern zeroes out sub-cent
    # residue left by floating-point subtraction.
    df_inst_pay_processed["AMT_OVERDUE"] = np.fmax(
        df_inst_pay_processed["AMT_INSTALMENT"] -
        df_inst_agg["AMT_DUE_PAYMENT_sum"], 0)
    df_inst_pay_processed["AMT_OVERDUE"] *= (
        df_inst_pay_processed["AMT_OVERDUE"] >= 0.01)
    df_inst_pay_processed["AMT_DPD30"] = np.fmax(
        df_inst_pay_processed["AMT_INSTALMENT"] -
        df_inst_agg["AMT_DUE30_PAYMENT_sum"], 0)
    df_inst_pay_processed["AMT_DPD30"] *= (df_inst_pay_processed["AMT_DPD30"]
                                           >= 0.01)
    df_inst_pay_processed["AMT_UNPAID"] = np.fmax(
        df_inst_pay_processed["AMT_INSTALMENT"] -
        df_inst_pay_processed["AMT_PAYMENT"].fillna(0), 0)
    df_inst_pay_processed["AMT_UNPAID"] *= (df_inst_pay_processed["AMT_UNPAID"]
                                            >= 0.01)
    df_inst_pay_processed.reset_index(inplace=True)
    # df_inst_pay_processed.rename(columns={"NUM_INSTALMENT_NUMBER": "NUM_INSTALMENT_NUMBER",
    #                                       "NUM_INSTALMENT_VERSION": "INSTALMENT_VER"})
    del df_inst_agg
    w.stop()
    print("Finish processing")
    print_memory_usage(df_inst_pay_processed, "inst_pay_processed_2")
    gc.collect()

    # --- persist the processed table ---
    columns_to_write = [
        "SK_ID_PREV", "SK_ID_CURR", "NUM_INSTALMENT_VERSION",
        "NUM_INSTALMENT_NUMBER", "DAYS_INSTALMENT", "DAYS_FIRST_PAYMENT",
        "DAYS_LAST_PAYMENT", "NUM_PAYMENTS", "AMT_INSTALMENT", "AMT_PAYMENT",
        "AMT_OVERDUE", "AMT_DPD30", "AMT_UNPAID"
    ]
    w = Watch("Save file")
    w.start()
    df_inst_pay_processed.to_csv(r"data\installments_payments_processed.csv",
                                 index=False,
                                 columns=columns_to_write)
    w.stop()
    Watch.print_all()