def main():
    train_df = pd.read_pickle('./mnt/inputs/origin/train.pkl.gz')
    test_df = pd.read_csv('./mnt/inputs/origin/test.csv')

    # ==============================
    # start processing
    # ==============================
    use_feature = {
        "EventCount": [EventCount, False],  # class, is_overwrite
        "EventCount2": [EventCount2, False],  # class, is_overwrite
        "Worldcount": [Worldcount, False],
        "SessionTime": [SessionTime2, False],
        # "AssessEventCount": [AssessEventCount, False],
        "EncodingTitles": [EncodingTitles, False],
        # "encodingTitleOrder": [encodingTitleOrder, False],
        # "PrevAssessResult": [PrevAssessResult, True],
        # "PrevAssessAcc": [PrevAssessAcc, True],
        "PrevAssessAccByTitle": [PrevAssessAccByTitle, False],
        "dtFeatures": [dtFeatures, False],
        # "eventCodeRatioFeatures": [eventCodeRatioFeatures, False],
        # "eventIDRatioFeatures": [eventIDRatioFeatures, False],
        "immediatelyBeforeFeatures": [immediatelyBeforeFeatures, False],
        "worldLabelEncodingDiffFeatures": [worldLabelEncodingDiffFeatures, True],
    }

    is_local = False
    if is_local:
        base_path = "../input"  # at local
        train_df, test_df = preprocess_dfs(use_feature,
                                           is_local=is_local,
                                           logger=None,
                                           debug=False)
    else:
        base_path = './mnt/inputs/origin'  # at kaggle kernel
        sub = pd.read_csv(f'{base_path}/sample_submission.csv')
        # base_path = '/kaggle/input/data-science-bowl-2019'  # at kaggle kernel
        # if len(sub) == 1000:
        if False:
            sub.to_csv('submission.csv', index=False)
            exit(0)
        else:
            train_df, test_df = preprocess_dfs(use_feature,
                                               is_local=is_local,
                                               logger=None,
                                               debug=is_debug)

    # remove ',' from column names to avoid an error in LightGBM
    train_df.columns = [col.replace(',', '_') for col in train_df.columns]
    test_df.columns = [col.replace(',', '_') for col in test_df.columns]

    # previous multiclass configuration, kept for reference:
    # train_params = {
    #     'learning_rate': 0.01,
    #     'bagging_fraction': 0.90,
    #     'feature_fraction': 0.85,
    #     'max_depth': 5,
    #     'lambda_l1': 0.7,
    #     'lambda_l2': 0.7,
    #     'metric': 'multiclass',
    #     'objective': 'multiclass',
    #     'num_classes': 4,
    #     'random_state': 773,
    #     "n_estimators": 3000
    # }
    train_params = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        # 'num_leaves': 64,
        'num_leaves': 16,
        'bagging_fraction': 0.9,
        'bagging_freq': 1,
        'feature_fraction': 0.7,
        'max_depth': -1,
        'lambda_l1': 0.2,
        'lambda_l2': 0.4,
        'seed': 19930802,
        'n_estimators': 100000,
        'importance_type': 'gain',
    }

    # features dropped after manual screening (strings kept verbatim, typos
    # such as 'meaan'/'duraation' included, because they must match the
    # actual generated column names)
    bad_feats = [
        'prev_gs_duration', 'session_intervalrmin', 'session_intervalrstd',
        'session_intervalrmax', 'session_interval', 'accum_acc_gr_-99',
        'session_intervalrmean', 'ass_session_interval',
        'prev_gs_durationrmean', 'prev_gs_durationrmax', 'ev_cnt4070',
        'prev_gs_durationrstd', 'mean_g_duration_meaan', 'ev_cnt3010',
        'g_duration_std', 'ev_cnt4030', 'ev_cnt3110', 'g_duration_mean',
        'meaan_g_duration_min', 'ass_session_interval_rmin', 'accum_acc_gr_3',
        'g_duration_min', 'mean_g_duraation_std'
    ]

    no_use_cols = [
        "accuracy", "accuracy_group", "game_session", "installation_id",
        "title", "type", "world", "pred_y"
    ] + list(set(train_df.columns) - set(test_df.columns)) + bad_feats
    train_cols = [c for c in list(train_df.columns) if c not in no_use_cols]
    print(f"train_df shape: {train_df.shape}")
    print(train_cols)

    cat_cols = []
    # logger.log(logging.DEBUG, f"categorical cols: {cat_cols}")
    target = "accuracy_group"
    # target = "accuracy"

    model_conf = {
        "predict_type": "regressor",
        "train_params": train_params,
        "train_cols": train_cols,
        "cat_cols": cat_cols,
        "target": target,
        "is_debug": is_debug,
    }
    validation_param = {
        "model_name": "LGBM",
    }
    exp_conf = {
        "train_small_dataset": False,
        "use_feature": {
            "sample": True
        },
        "train_params": train_params,
        "exp_name": exp_name
    }

    v = Validation(validation_param, exp_conf, train_df, test_df, logger)
    # do_valid_kfold returns six values (see below), so unpack them all
    clf_list, oof, prediction, feature_importance_df, optimizers, valid_qwks \
        = v.do_valid_kfold(model_conf)

    # optimize the regression->class thresholds on the full OOF vector
    optR = OptimizedRounder()
    optR.fit(oof, train_df['accuracy_group'])
    coefficients = optR.coefficients()
    opt_preds = optR.predict(oof, coefficients)

    oof_dir = f'./mnt/oofs/{EXP_ID}'
    if not os.path.exists(oof_dir):
        os.mkdir(oof_dir)
    with open(f'{oof_dir}/{EXP_ID}_oof.pkl', 'wb') as fout:
        pickle.dump(oof, fout)

    res_qwk = qwk(train_df['accuracy_group'], opt_preds)
    print(f'res_qwk : {res_qwk}')
    logger.log(logging.DEBUG, f'qwk -- {res_qwk}')
    # print(f'qwk -- {np.mean(valid_qwks)} +- {np.std(valid_qwks)}')
    # logger.log(
    #     logging.DEBUG,
    #     f'qwk -- {np.mean(valid_qwks)} +- {np.std(valid_qwks)}')

    # save feature importances
    feature_importance_df.to_csv(f'./mnt/importances/{EXP_ID}.csv',
                                 index=False)
def do_valid_kfold(self, model_conf, n_splits=5):
    sp = Splitter()
    target = model_conf["target"]
    split_x = self.train["installation_id"]
    split_y = self.train[target]
    seed = 773
    # grouped split on installation_id so that no user leaks across folds
    sp.get_kfold_idx(split_x,
                     split_y,
                     seed,
                     n_cv=n_splits,
                     stratified=False,
                     group=True,
                     pref=self.exp_conf["exp_name"])

    oof: np.ndarray = np.zeros(self.train.shape[0])
    prediction = np.zeros(self.test.shape[0])
    clf_list = []

    self.logger.log(logging.DEBUG, "[train cols] " + "-" * 50)
    self.logger.log(logging.DEBUG, model_conf["train_cols"])

    self.validation_scores = []
    optimizers = []
    valid_qwks = []
    for i, (trn_idx, val_idx) in enumerate(sp.idx_list):
        self.logger.log(logging.DEBUG, "-" * 60)
        self.logger.log(logging.DEBUG, f"start training: {i}")
        with timer(f"fold {i}", self.logger):
            train_df, valid_df = (self.train.loc[trn_idx],
                                  self.train.loc[val_idx])
            model = self.generate_model(model_conf)
            clf, fold_oof, feature_importance_df = model.train(
                train_df, valid_df, self.logger)
            # fold_oof_class = fold_oof.argmax(axis=1)
            fold_prediction = model.predict(self.test, self.logger)
            # fold_val_score = get_val_score(
            #     valid_df[target], fold_oof_class, "QWK")

            # per-fold threshold search so each fold reports its own QWK
            optR = OptimizedRounder()
            optR.fit(fold_oof, valid_df[target])
            coefficients = optR.coefficients()
            opt_preds = optR.predict(fold_oof, coefficients)
            fold_qwk = qwk(valid_df[target], opt_preds)
            optimizers.append(optR)
            valid_qwks.append(fold_qwk)

            clf_list.append(clf)
            oof[val_idx] = fold_oof
            prediction += fold_prediction / n_splits
            feature_importance_df["fold"] = i
            self.feature_importance.append(feature_importance_df)

    # self.logger.log(
    #     logging.DEBUG,
    #     f"Total Validation Score: "
    #     f"{sum(self.validation_scores) / len(self.validation_scores):,.5f}")
    self.feature_importance = pd.concat(self.feature_importance, axis=0)
    return (clf_list, oof, prediction, self.feature_importance,
            optimizers, valid_qwks)
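
# ----------------------------------------------------------------------------
# `Splitter` is defined elsewhere in the repo. A minimal sketch of the
# interface do_valid_kfold() relies on: get_kfold_idx() populates idx_list
# with (trn_idx, val_idx) pairs, grouped by installation_id above so that no
# user appears in both train and validation folds. The `pref` argument looks
# like a cache-file prefix; caching is omitted here as an assumption.
# ----------------------------------------------------------------------------
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold


class Splitter:
    def __init__(self):
        self.idx_list = []

    def get_kfold_idx(self, split_x, split_y, seed, n_cv=5,
                      stratified=False, group=False, pref=''):
        if group:
            # the group labels themselves come from split_x
            # (GroupKFold is deterministic, so `seed` is unused here)
            folds = GroupKFold(n_splits=n_cv)
            splits = folds.split(split_x, split_y, groups=split_x)
        elif stratified:
            folds = StratifiedKFold(n_splits=n_cv, shuffle=True,
                                    random_state=seed)
            splits = folds.split(split_x, split_y)
        else:
            folds = KFold(n_splits=n_cv, shuffle=True, random_state=seed)
            splits = folds.split(split_x)
        self.idx_list = list(splits)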
def main():
    # train_df = pd.read_pickle('./mnt/inputs/origin/train.pkl.gz')
    # test_df = pd.read_csv('./mnt/inputs/origin/test.csv')

    # ==============================
    # start processing
    # ==============================
    use_feature = {
        # "EventCount": [EventCount, False],  # class, is_overwrite
        # "EventCount2": [EventCount2, False],  # class, is_overwrite
        "Worldcount": [Worldcount, False],
        "SessionTime": [SessionTime2, False],
        # "AssessEventCount": [AssessEventCount, False],
        "EncodingTitles": [EncodingTitles, False],
        # "encodingTitleOrder": [encodingTitleOrder, False],
        # "PrevAssessResult": [PrevAssessResult, False],
        "PrevAssessAcc": [PrevAssessAcc, False],
        "PrevAssessAccByTitle": [PrevAssessAccByTitle, False],
        "GameDurMiss": [GameDurMiss, False],
        # "dtFeatures": [dtFeatures, False],
        # "eventCodeRatioFeatures": [eventCodeRatioFeatures, False],
        # "eventIDRatioFeatures": [eventIDRatioFeatures, False],
        "immediatelyBeforeFeatures": [immediatelyBeforeFeatures, False],
        # "worldLabelEncodingDiffFeatures": [worldLabelEncodingDiffFeatures, False],
        # "worldNumeriacalFeatures": [worldNumeriacalFeatures, False],
        # "worldAssessmentNumeriacalFeatures": [worldAssessmentNumeriacalFeatures, False],
        # "worldActivityNumeriacalFeatures": [worldActivityNumeriacalFeatures, False],
        "worldGameNumeriacalFeatures": [worldGameNumeriacalFeatures, False],
        # "worldEventDataFeatures1": [worldEventDataFeatures1, False],  # to debug! killer features!
        # "worldEventDataFeaturesRolling5": [worldEventDataFeaturesRolling5, False],
        # "worldNumeriacalFeatures2": [worldNumeriacalFeatures2, False],
        # "currentSessionInfo": [currentSessionInfo, False],
        # "sameWorldBaseFeatures": [sameWorldBaseFeatures, False],
        "befTargetCntFeatures": [befTargetCntFeatures, False],
    }

    is_local = False
    if is_local:
        base_path = "../input"  # at local
        train_df, test_df = preprocess_dfs(use_feature,
                                           is_local=is_local,
                                           logger=None,
                                           debug=False)
    else:
        base_path = './mnt/inputs/origin'  # at kaggle kernel
        sub = pd.read_csv(f'{base_path}/sample_submission.csv')
        # base_path = '/kaggle/input/data-science-bowl-2019'  # at kaggle kernel
        # if len(sub) == 1000:
        if False:
            sub.to_csv('submission.csv', index=False)
            exit(0)
        else:
            train_df, test_df = preprocess_dfs(use_feature,
                                               is_local=is_local,
                                               logger=None,
                                               debug=is_debug)

    # remove ',' from column names to avoid an error in LightGBM
    train_df.columns = [col.replace(',', '_') for col in train_df.columns]
    test_df.columns = [col.replace(',', '_') for col in test_df.columns]

    # previous multiclass configuration, kept for reference:
    # train_params = {
    #     'learning_rate': 0.01,
    #     'bagging_fraction': 0.90,
    #     'feature_fraction': 0.85,
    #     'max_depth': 5,
    #     'lambda_l1': 0.7,
    #     'lambda_l2': 0.7,
    #     'metric': 'multiclass',
    #     'objective': 'multiclass',
    #     'num_classes': 4,
    #     'random_state': 773,
    #     "n_estimators": 3000
    # }
    train_params = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 64,
        # 'num_leaves': 16,
        'bagging_fraction': 0.9,
        'bagging_freq': 1,
        'feature_fraction': 0.7,
        'max_depth': -1,
        # 'lambda_l1': 0.2,
        # 'lambda_l2': 0.4,
        'lambda_l1': 1,
        'lambda_l2': 1,
        'seed': 19930802,
        'n_estimators': 100000,
        'importance_type': 'gain',
    }

    # features dropped after manual screening (strings kept verbatim, typos
    # included, because they must match the generated column names)
    bad_feats = [
        'prev_gs_duration', 'session_intervalrmin', 'session_intervalrstd',
        'session_intervalrmax', 'session_interval', 'accum_acc_gr_-99',
        'session_intervalrmean', 'ass_session_interval',
        'prev_gs_durationrmean', 'prev_gs_durationrmax', 'ev_cnt4070',
        'prev_gs_durationrstd', 'mean_g_duration_meaan', 'ev_cnt3010',
        'g_duration_std', 'ev_cnt4030', 'ev_cnt3110', 'g_duration_mean',
        'meaan_g_duration_min', 'ass_session_interval_rmin', 'accum_acc_gr_3',
        'g_duration_min', 'mean_g_duraation_std', 'f019_bef_target_cnt',
    ]

    no_use_cols = [
        "accuracy", "accuracy_group", "game_session", "installation_id",
        "title", "type", "world", "pred_y"
    ] + list(set(train_df.columns) - set(test_df.columns)) + bad_feats
    train_cols = [c for c in list(train_df.columns) if c not in no_use_cols]

    # set train cols
    # train_cols = pd.read_csv('./mnt/importances/e026_temp.csv').groupby(
    #     'feature').importance.mean().sort_values(
    #         ascending=False)[:10].index.tolist()

    # drop features whose train/test distributions diverge
    # (see the exclude() sketch after this function)
    to_exclude, test_df = exclude(train_df, test_df, train_cols)
    train_cols = [col for col in train_cols if col not in to_exclude]

    # print(f"train_df shape: {train_df.shape}")
    logger.log(logging.DEBUG, f"train_df shape: {train_df.shape}")
    print(train_cols)

    cat_cols = []
    # logger.log(logging.DEBUG, f"categorical cols: {cat_cols}")
    target = "accuracy_group"
    # target = "accuracy"

    model_conf = {
        "predict_type": "regressor",
        "train_params": train_params,
        "train_cols": train_cols,
        "cat_cols": cat_cols,
        "target": target,
        "is_debug": is_debug,
    }
    validation_param = {
        "model_name": "LGBM",
    }
    exp_conf = {
        "train_small_dataset": False,
        "use_feature": {
            "sample": True
        },
        "train_params": train_params,
        "exp_name": exp_name
    }

    # experiments with a rescaled / binarized target, kept for reference:
    # train_df[target] = train_df[target] / 3.
    # train_df.loc[train_df[target] <= 1, target] = 0
    # train_df.loc[train_df[target] > 1, target] = 1
    # print(train_df[target].head())

    # v = Validation(validation_param, exp_conf, train_df, test_df, logger)
    v = Validation2(validation_param, exp_conf, train_df, test_df, logger)
    clf, oof, prediction, feature_importance_df, labels \
        = v.do_valid_kfold(model_conf,
                           trn_mode='simple',  # trn_mode='last_truncated',
                           val_mode='last_truncated')  # val_mode='simple'

    optR = OptimizedRounder()
    # optR.fit(oof, train_df['accuracy_group'], [[1.0, 1.5, 2.9]])
    # optR.fit(oof, train_df['accuracy_group'], [[1.0, 1.5, 2.0]])
    # under truncated validation only part of the OOF vector is filled,
    # so fit and evaluate the thresholds on the non-zero entries only
    optR.fit(oof[oof != 0], labels[oof != 0], [[1.0, 1.5, 2.0]])
    coefficients = optR.coefficients()
    opt_preds = optR.predict(oof[oof != 0], coefficients)

    oof_dir = f'./mnt/oofs/{EXP_ID}'
    if not os.path.exists(oof_dir):
        os.mkdir(oof_dir)
    with open(f'{oof_dir}/{EXP_ID}_oof.pkl', 'wb') as fout:
        pickle.dump(oof, fout)
    with open(f'{oof_dir}/{EXP_ID}_label.pkl', 'wb') as fout:
        pickle.dump(labels, fout)

    res_qwk = qwk(labels[oof != 0], opt_preds)
    print(f'res_qwk : {res_qwk}')
    logger.log(logging.DEBUG, f'qwk -- {res_qwk}')
    # print(f'qwk -- {np.mean(valid_qwks)} +- {np.std(valid_qwks)}')
    # logger.log(
    #     logging.DEBUG,
    #     f'qwk -- {np.mean(valid_qwks)} +- {np.std(valid_qwks)}')

    # save feature importances
    feature_importance_df.to_csv(f'./mnt/importances/{EXP_ID}.csv',
                                 index=False)
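
# ----------------------------------------------------------------------------
# `exclude` is defined elsewhere in the repo. A plausible sketch, modeled on a
# helper that circulated in public Data Science Bowl 2019 kernels: features
# whose train/test means diverge too much are dropped, and the remaining test
# columns are rescaled so their means match train. The 10x / 0.1x thresholds
# are an assumption, not taken from this repo.
# ----------------------------------------------------------------------------
import numpy as np


def exclude(train_df, test_df, features):
    to_exclude = []
    adjusted_test = test_df.copy()
    for feature in features:
        train_mean = train_df[feature].mean()
        test_mean = adjusted_test[feature].mean()
        adjust_factor = train_mean / test_mean
        if not np.isfinite(adjust_factor) or not (0.1 <= adjust_factor <= 10):
            # distribution shift too large (or degenerate) -> drop feature
            to_exclude.append(feature)
        else:
            # mild shift -> rescale the test column toward the train mean
            adjusted_test[feature] *= adjust_factor
    return to_exclude, adjusted_test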
def do_adversarial_valid_kfold(self, model_conf, n_splits=2):
    sp = Splitter()
    target = "is_test"
    split_x = self.train["installation_id"]
    split_y = self.train[target]
    seed = 773
    sp.get_kfold_idx(split_x,
                     split_y,
                     seed,
                     n_cv=n_splits,
                     stratified=True,
                     pref="adv")

    oof: np.ndarray = np.zeros(self.train.shape[0])
    prediction = np.zeros(self.test.shape[0])
    clf_list = []

    self.logger.log(logging.DEBUG, "[train cols] " + "-" * 50)
    self.logger.log(logging.DEBUG, model_conf["train_cols"])

    self.validation_scores = []
    optimizers = []
    valid_qwks = []
    for i, (trn_idx, val_idx) in enumerate(sp.idx_list):
        self.logger.log(logging.DEBUG, "-" * 60)
        self.logger.log(logging.DEBUG, f"start training: {i}")
        with timer(f"fold {i}", self.logger):
            train_df, valid_df = (self.train.loc[trn_idx],
                                  self.train.loc[val_idx])
            model = self.generate_model(model_conf)
            clf, fold_oof, feature_importance_df = model.train(
                train_df, valid_df, self.logger)

            # calc validation score using clf.best_iteration_
            fold_val_score = get_val_score(valid_df[target], fold_oof)
            self.validation_scores.append(fold_val_score)

            optR = OptimizedRounder()
            optR.fit(fold_oof, valid_df[target])
            coefficients = optR.coefficients()
            opt_preds = optR.predict(fold_oof, coefficients)
            fold_qwk = qwk(valid_df[target], opt_preds)
            optimizers.append(optR)
            valid_qwks.append(fold_qwk)

            self.logger.log(logging.DEBUG,
                            f"fold_val_score: {fold_val_score:,.5f}")
            clf_list.append(clf)
            oof[val_idx] = fold_oof
            feature_importance_df["fold"] = i
            self.feature_importance.append(feature_importance_df)

    self.logger.log(
        logging.DEBUG,
        f"Total Validation Score: "
        f"{sum(self.validation_scores) / len(self.validation_scores):,.5f}")

    oof = np.expm1(oof)
    self.train["pred_y"] = oof
    self.feature_importance = pd.concat(self.feature_importance, axis=0)
    return (clf_list, oof, prediction, self.feature_importance,
            optimizers, valid_qwks)
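
# ----------------------------------------------------------------------------
# `get_val_score` is defined elsewhere in the repo. A minimal sketch of the
# assumed interface: AUC by default, which fits the binary `is_test` target
# used by the adversarial validation above, and QWK when requested (as in the
# commented-out call in do_valid_kfold). The default metric is an assumption.
# ----------------------------------------------------------------------------
from sklearn.metrics import cohen_kappa_score, roc_auc_score


def get_val_score(y_true, y_pred, metric="AUC"):
    if metric == "QWK":
        return cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return roc_auc_score(y_true, y_pred)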