def hyperopt_search(variant: cco.Variant, data, max_evals, mix_algo_ratio=None, random_state=None):
    """Run a hyperopt-based parameter search for the variant's classifier.

    Dispatches to the GB, RF or native-CatBoost search helper depending on
    the variant kind, then logs how long the whole search took.
    """
    search_timer = stopwatch.Timer()
    if variant.is_gb():
        depth_space = hp.pchoice("max_depth", [(0.40, 1), (0.60, 2)])
        clf_hp_gb.do_fmin(
            data,
            max_evals,
            mix_algo_ratio=mix_algo_ratio,
            max_depth_space=depth_space,
            random_state=random_state,
        )
    elif variant.is_rf():
        depth_space = hp.pchoice("max_depth", [(0.45, 1), (0.55, 2)])
        clf_hp_rf.do_fmin(
            data,
            max_evals,
            mix_algo_ratio=mix_algo_ratio,
            max_depth_space=depth_space,
            random_state=random_state,
        )
    elif variant.is_cb_native():
        # native CatBoost searches over Pool objects built from the raw data
        clf_hp_cb.do_fmin(
            clf_cat_tools.make_pools(data),
            max_evals,
            mix_algo_ratio=mix_algo_ratio,
            random_state=random_state,
            how="native",
        )
    cco.out("{} done for {}".format(variant.name, cco.fmt(search_timer.elapsed)))
def drop_by_condition(variant: cco.Variant, df: pd.DataFrame) -> None:
    """Drop rows unsuitable for training (in place), then apply the
    variant-specific drop rules."""

    def _unsuitable(d):
        absent = d["sspd_absent_games"] >= 3
        empty = d["sspd_empty_games"] >= 4
        # drop tough decset (exist 7-5, 7-6, 8-6 and so on) as noisy:
        tough_decset = (d["s3_fst_games"] + d["s3_snd_games"]) >= 12
        bo5 = d["best_of_five"] == 1
        return absent | empty | tough_decset | bo5

    cco.drop_by_condition(df, _unsuitable)
    variant.drop_by_condition(df)
def make_preanalyze_df(variant: cco.Variant):
    """Assemble the pre-analysis feature frame and dump it to CSV.

    Reads the decided-set analysis file for the variant's sex, applies the
    primary edits and derived columns, then writes only the pre-analysis
    feature subset (plus the label) to <persist_dir>/preanalyze/df.csv and
    records a size note.

    Raises:
        ValueError: if the prepared dataset has fewer than
            ``cco.DATASET_MIN_LEN`` rows.
    """
    feature_subset = [
        "fst_bet_chance",
        "dset_ratio_dif",
        "decided_win_snd_plr_by_score",
        "dif_elo_alt_pts",
        "dif_surf_elo_alt_pts",
        "dif_srv_ratio",
        "spd_fst_lastrow_games",
        "fst_s2choke_adv",
        "dif_service_win",
        "dif_receive_win",
        "dif_age",
        "avg_age",
        "dif_plr_tour_adapt",
        "dif_fatigue",
        "h2h_direct",
    ]
    sex = variant.sex
    src_file = cfg_dir.analysis_data_file(sex, typename="decidedset")
    frame = pd.read_csv(src_file, sep=",")
    primary_edit(variant, frame)
    add_columns(variant.sex, frame)
    n_rows = frame.shape[0]
    if n_rows < cco.DATASET_MIN_LEN:
        msg = f"after drop nan poor dataset ({n_rows} rows) sex:{sex}"
        print(msg)
        raise ValueError(msg)
    out_frame = frame[feature_subset + [LABEL_NAME]]
    out_path = os.path.join(variant.persist_dirname(MODEL), "preanalyze", "df.csv")
    out_frame.to_csv(out_path, index=False)
    write_note(variant, subname="preanalyze", text=f"size {out_frame.shape}")
def read_df(variant: cco.Variant, subname: str) -> pd.DataFrame:
    """Load the persisted dataset *subname* for the variant.

    Raises:
        ValueError: if the dataset file is absent, or it holds fewer rows
            than ``cco.DATASET_MIN_LEN``.
    """
    dirname = variant.persist_dirname(MODEL)
    loaded = cco.load_df(dirname, subname=subname)
    if loaded is None:
        raise ValueError(f"no dataset {variant.sex} dirname: {dirname}")
    if loaded.shape[0] < cco.DATASET_MIN_LEN:
        raise ValueError(f"few dataset {variant.sex} dirname: {dirname}")
    return loaded
def random_train(variant: cco.Variant, msg="", split=True, plot=False):
    """Train the variant's classifier over the full seed/random-state grid
    and log averaged quality metrics and feature importances.

    For every (seed, random_state) pair from ``random_args`` the data is
    re-filled (optionally shuffled), a fresh classifier is fitted, and
    precision / accuracy / ROC-AUC on the test split are accumulated,
    along with per-feature importances and win/loss stats.
    """
    # running sum of importance per feature name, across all fits
    all_name_imp = defaultdict(lambda: 0.0)
    prc_list, acc_list, auc_list, treecnt_list, lrate_list = [], [], [], [], []
    all_wl = st.WinLoss()
    all_test_size = 0
    for seed in random_args.iter_seeds():
        put_seed(seed)
        variant.set_random_state(seed)
        for random_state in random_args.iter_states():
            log.info(f"random_state={random_state} start learning")
            data, _ = fill_data(
                variant,
                split=split,
                is_shuffle=args.shuffle,
                random_state=random_state,
            )
            clf = variant.make_clf_fit(data, metric_name, random_seed=seed, plot=plot)
            name_imp = variant.get_features_importance(
                variant.feature_names.get_list())
            for name, imp in name_imp.items():
                all_name_imp[name] += imp
            # metrics on the held-out test split of this fill
            prec = precision_score(data.test.y, clf.predict(data.test.X))
            acc = accuracy_score(data.test.y, clf.predict(data.test.X))
            auc = roc_auc_score(data.test.y, clf.predict_proba(data.test.X)[:, 1])
            prc_list.append(prec)
            acc_list.append(acc)
            auc_list.append(auc)
            if variant.is_cb_native():
                # CatBoost exposes the fitted tree count / learning rate
                treecnt_list.append(clf.tree_count_)
                lrate_list.append(clf.learning_rate_)
            # running mean accuracy so far
            log.info(f"gomean acc {sum(acc_list) / len(acc_list)}")
            res = variant.make_test_result(data)
            all_wl += res.poswl + res.negwl
            all_test_size += data.test.X.shape[0]
    log.info(f"******************************************\n"
             f"*****{msg}*** {variant.name} results******\n")
    # NOTE(review): means divide by random_args.space_size() — presumably
    # equal to len(seeds) * len(states), i.e. the number of fits; confirm.
    log.info(f"mean_prc {sum(prc_list) / random_args.space_size()}")
    log.info(f"mean_acc {sum(acc_list) / random_args.space_size()}")
    log.info(f"mean_auc {sum(auc_list) / random_args.space_size()}")
    if variant.is_cb_native():
        log.info(f"treecnt {sum(treecnt_list) / random_args.space_size()}")
        log.info(f"lrate {sum(lrate_list) / random_args.space_size()}")
    log.info(f"all_wl {all_wl.ratio_size_str(precision=4)} "
             f"ratio {round(all_wl.size / all_test_size, 3)}")
    log.info("all_name_imp:")
    # average importance per feature, sorted descending
    all_name_imp_list = [(k, v / random_args.space_size())
                         for k, v in all_name_imp.items()]
    all_name_imp_list.sort(key=lambda it: it[1], reverse=True)
    log.info("\n" + pprint.pformat(all_name_imp_list))
def go_hyperopt_search(variant: cco.Variant):
    """Seed the RNGs, prepare data and launch a 100-evaluation hyperopt
    search mixing TPE and annealing algorithms 50/50."""
    put_seed(random_args.get_any_seed())
    random_state = random_args.get_any_state()
    dataset, _ = fill_data(
        variant,
        split=variant.is_cb_native(),
        is_shuffle=args.shuffle,
        random_state=random_state,
    )
    algo_mix = clf_hp.MixAlgoRatio(tpe=0.50, anneal=0.50, rand=0.00)
    hyperopt_search(
        variant,
        dataset,
        max_evals=100,
        mix_algo_ratio=algo_mix,
        random_state=random_state,
    )
def final_train_save(variant: cco.Variant, seed: int, random_state: int):
    """Fit the classifier on the whole dataset and persist it, recording a
    note with the seed and random state used."""
    put_seed(seed)
    variant.set_random_state(seed)
    # split=None (rather than False, which supplies eval=None): no split at all
    dataset, _ = fill_data(
        variant,
        split=None,
        is_shuffle=args.shuffle,
        random_state=random_state,
    )
    variant.make_clf_fit(dataset, metric_name, random_seed=seed)
    variant.save_clf(model_name=MODEL, metric_name=metric_name)
    msg = (f"final_train_save done var: {variant} "
           f"seed: {seed} random_state={random_state}")
    write_note(variant, subname="clf", text=msg)
    cco.out(msg)
def fill_data(
    variant: cco.Variant,
    split: Optional[bool],
    is_shuffle: bool = False,
    random_state: int = 0,
):
    """Read the variant's main dataset, shuffle/augment it, verify the
    feature columns are NaN-free, and split it.

    Returns the (data, split dataframe) pair produced by the chrono or
    stratified splitter, depending on ``CHRONO_SPLIT``.
    """
    df = cco.stage_shuffle(
        read_df(variant, subname="main"), is_shuffle, random_state=random_state
    )
    add_columns(variant.sex, df)
    feature_list = variant.feature_names.get_list()
    clf_columns.with_nan_columns(df, columns=feature_list, raise_ifnan=True)
    # categorical feature indexes only matter for native CatBoost
    cat_feats_idx = (
        variant.feature_names.cat_indexes() if variant.is_cb_native() else None
    )
    shared_kwargs = dict(
        feature_names=feature_list,
        label_name=LABEL_NAME,
        other_names=variant.key.get_stratify_names(is_text_style=False),
        cat_features_idx=cat_feats_idx,
        weight_mode=variant.weight_mode,
    )
    if CHRONO_SPLIT:
        return cco.fill_data_ending_chrono(df, split, **shared_kwargs)
    return cco.fill_data_ending_stratify_n(
        df,
        split,
        test_size=DEFAULT_TEST_SIZE,
        eval_size=DEFAULT_EVAL_SIZE,
        random_state=random_state,
        **shared_kwargs,
    )
def write_df(variant: cco.Variant, df: pd.DataFrame, subname: str):
    """Persist *df* under the variant's model directory as dataset *subname*."""
    cco.save_df(df, variant.persist_dirname(MODEL), subname=subname)
def write_note(variant: cco.Variant, subname: str, text: str):
    """Write *text* to <persist_dir>/<subname>/note.txt, replacing any
    previous note.

    Creates the target sub-directory if it does not exist yet, so a note
    can be written before anything else has been saved there (the original
    raised FileNotFoundError in that case).
    """
    dirname = variant.persist_dirname(MODEL)
    note_dir = os.path.join(dirname, subname)
    os.makedirs(note_dir, exist_ok=True)  # robust to a fresh persist dir
    filename = os.path.join(note_dir, "note.txt")
    with open(filename, "w") as f:
        f.write(f"{text}\n")