Example #1
def hyperopt_search(variant: cco.Variant,
                    data,
                    max_evals,
                    mix_algo_ratio=None,
                    random_state=None):
    timer = stopwatch.Timer()
    if variant.is_gb():
        clf_hp_gb.do_fmin(
            data,
            max_evals,
            mix_algo_ratio=mix_algo_ratio,
            max_depth_space=hp.pchoice("max_depth", [(0.40, 1), (0.60, 2)]),
            random_state=random_state,
        )
    elif variant.is_rf():
        clf_hp_rf.do_fmin(
            data,
            max_evals,
            mix_algo_ratio=mix_algo_ratio,
            max_depth_space=hp.pchoice("max_depth", [(0.45, 1), (0.55, 2)]),
            random_state=random_state,
        )
    elif variant.is_cb_native():
        pools = clf_cat_tools.make_pools(data)
        clf_hp_cb.do_fmin(
            pools,
            max_evals,
            mix_algo_ratio=mix_algo_ratio,
            random_state=random_state,
            how="native",
        )
    sec_elapsed = timer.elapsed
    cco.out(f"{variant.name} done in {cco.fmt(sec_elapsed)}")
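
The clf_hp_gb.do_fmin / clf_hp_rf.do_fmin / clf_hp_cb.do_fmin helpers are project code; the mix_algo_ratio argument presumably blends several hyperopt suggest algorithms. A minimal sketch of how such a mix can be wired with stock hyperopt (the fmin_mixed helper, its parameters, and the objective handling are illustrative assumptions, not the project's implementation):

from functools import partial

import numpy as np
from hyperopt import Trials, anneal, fmin, mix, rand, tpe


def fmin_mixed(objective, space, max_evals,
               tpe_ratio=0.50, anneal_ratio=0.50, rand_ratio=0.00,
               random_state=None):
    # Each new trial is drawn from one of the listed suggest algorithms
    # with the given probabilities (the ratios should sum to 1).
    algo = partial(
        mix.suggest,
        p_suggest=[
            (tpe_ratio, tpe.suggest),
            (anneal_ratio, anneal.suggest),
            (rand_ratio, rand.suggest),
        ],
    )
    trials = Trials()
    best = fmin(
        objective,
        space,
        algo=algo,
        max_evals=max_evals,
        trials=trials,
        # recent hyperopt versions expect a numpy Generator here;
        # older ones take np.random.RandomState instead
        rstate=np.random.default_rng(random_state),
    )
    return best, trials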
Example #2
def drop_by_condition(variant: cco.Variant, df: pd.DataFrame) -> None:
    cco.drop_by_condition(
        df,
        lambda d:
        ((d["sspd_absent_games"] >= 3)
         | (d["sspd_empty_games"] >= 4)
         # drop tough decided sets (scores such as 7-5, 7-6, 8-6) as noisy:
         | ((d["s3_fst_games"] + d["s3_snd_games"]) >= 12)
         | (d["best_of_five"] == 1)),
    )
    variant.drop_by_condition(df)
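
cco.drop_by_condition is project code; from the call above it takes a DataFrame and a predicate returning a boolean mask, and drops the matching rows in place. A stand-in under that assumption (not the actual implementation):

import pandas as pd


def drop_by_condition(df: pd.DataFrame, predicate) -> None:
    # Drop rows for which predicate(df) is True, modifying df in place.
    mask = predicate(df)
    df.drop(index=df.index[mask], inplace=True)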
Example #3
def make_preanalyze_df(variant: cco.Variant):
    preanalyze_fnames = [
        "fst_bet_chance",
        "dset_ratio_dif",
        "decided_win_snd_plr_by_score",
        "dif_elo_alt_pts",
        "dif_surf_elo_alt_pts",
        "dif_srv_ratio",
        "spd_fst_lastrow_games",
        "fst_s2choke_adv",
        "dif_service_win",
        "dif_receive_win",
        "dif_age",
        "avg_age",
        "dif_plr_tour_adapt",
        "dif_fatigue",
        "h2h_direct",
    ]
    sex = variant.sex
    filename = cfg_dir.analysis_data_file(sex, typename="decidedset")
    data0 = pd.read_csv(filename, sep=",")
    primary_edit(variant, data0)
    add_columns(variant.sex, data0)
    if data0.shape[0] < cco.DATASET_MIN_LEN:
        msg = f"dataset too small after dropping NaNs ({data0.shape[0]} rows) sex: {sex}"
        print(msg)
        raise ValueError(msg)
    df_out = data0[preanalyze_fnames + [LABEL_NAME]]
    dirname = variant.persist_dirname(MODEL)
    filename = os.path.join(dirname, "preanalyze", "df.csv")
    df_out.to_csv(filename, index=False)
    write_note(variant, subname="preanalyze", text=f"size {df_out.shape}")
Example #4
def read_df(variant: cco.Variant, subname: str) -> pd.DataFrame:
    dirname = variant.persist_dirname(MODEL)
    df = cco.load_df(dirname, subname=subname)
    if df is None:
        raise ValueError(f"no dataset {variant.sex} dirname: {dirname}")
    if df.shape[0] < cco.DATASET_MIN_LEN:
        raise ValueError(f"few dataset {variant.sex} dirname: {dirname}")
    return df
Example #5
def random_train(variant: cco.Variant, msg="", split=True, plot=False):
    all_name_imp = defaultdict(float)
    prc_list, acc_list, auc_list, treecnt_list, lrate_list = [], [], [], [], []
    all_wl = st.WinLoss()
    all_test_size = 0

    for seed in random_args.iter_seeds():
        put_seed(seed)
        variant.set_random_state(seed)

        for random_state in random_args.iter_states():
            log.info(f"random_state={random_state} start learning")
            data, _ = fill_data(
                variant,
                split=split,
                is_shuffle=args.shuffle,
                random_state=random_state,
            )
            clf = variant.make_clf_fit(data,
                                       metric_name,
                                       random_seed=seed,
                                       plot=plot)
            name_imp = variant.get_features_importance(
                variant.feature_names.get_list())
            for name, imp in name_imp.items():
                all_name_imp[name] += imp
            test_pred = clf.predict(data.test.X)  # predict once, reuse below
            prec = precision_score(data.test.y, test_pred)
            acc = accuracy_score(data.test.y, test_pred)
            auc = roc_auc_score(data.test.y,
                                clf.predict_proba(data.test.X)[:, 1])
            prc_list.append(prec)
            acc_list.append(acc)
            auc_list.append(auc)
            if variant.is_cb_native():
                treecnt_list.append(clf.tree_count_)
                lrate_list.append(clf.learning_rate_)
            log.info(f"gomean acc {sum(acc_list) / len(acc_list)}")
            res = variant.make_test_result(data)
            all_wl += res.poswl + res.negwl
            all_test_size += data.test.X.shape[0]

    log.info(f"******************************************\n"
             f"*****{msg}*** {variant.name} results******\n")
    log.info(f"mean_prc {sum(prc_list) / random_args.space_size()}")
    log.info(f"mean_acc {sum(acc_list) / random_args.space_size()}")
    log.info(f"mean_auc {sum(auc_list) / random_args.space_size()}")
    if variant.is_cb_native():
        log.info(f"treecnt {sum(treecnt_list) / random_args.space_size()}")
        log.info(f"lrate {sum(lrate_list) / random_args.space_size()}")
    log.info(f"all_wl {all_wl.ratio_size_str(precision=4)} "
             f"ratio {round(all_wl.size / all_test_size, 3)}")
    log.info("all_name_imp:")
    all_name_imp_list = [(k, v / random_args.space_size())
                         for k, v in all_name_imp.items()]
    all_name_imp_list.sort(key=lambda it: it[1], reverse=True)
    log.info("\n" + pprint.pformat(all_name_imp_list))
Example #6
def go_hyperopt_search(variant: cco.Variant):
    put_seed(random_args.get_any_seed())
    random_state = random_args.get_any_state()
    data, _ = fill_data(
        variant,
        split=variant.is_cb_native(),
        is_shuffle=args.shuffle,
        random_state=random_state,
    )
    hyperopt_search(
        variant,
        data,
        max_evals=100,
        mix_algo_ratio=clf_hp.MixAlgoRatio(tpe=0.50, anneal=0.50, rand=0.00),
        random_state=random_state,
    )
Example #7
def final_train_save(variant: cco.Variant, seed: int, random_state: int):
    put_seed(seed)
    variant.set_random_state(seed)
    data, _ = fill_data(
        variant,
        # split=False,  # would supply eval=None
        split=None,
        is_shuffle=args.shuffle,
        random_state=random_state,
    )
    variant.make_clf_fit(data, metric_name, random_seed=seed)
    variant.save_clf(model_name=MODEL, metric_name=metric_name)
    msg = (f"final_train_save done var: {variant} "
           f"seed: {seed} random_state={random_state}")
    write_note(variant, subname="clf", text=msg)
    cco.out(msg)
Example #8
def fill_data(
    variant: cco.Variant,
    split: Optional[bool],
    is_shuffle: bool = False,
    random_state: int = 0,
):
    vrnt_key = variant.key
    df0 = read_df(variant, subname="main")
    df = cco.stage_shuffle(df0, is_shuffle, random_state=random_state)
    add_columns(variant.sex, df)

    clf_columns.with_nan_columns(df,
                                 columns=variant.feature_names.get_list(),
                                 raise_ifnan=True)
    cat_feats_idx = (variant.feature_names.cat_indexes()
                     if variant.is_cb_native() else None)
    if CHRONO_SPLIT:
        vrnt_data, df_spl = cco.fill_data_ending_chrono(
            df,
            split,
            feature_names=variant.feature_names.get_list(),
            label_name=LABEL_NAME,
            other_names=(vrnt_key.get_stratify_names(is_text_style=False)),
            cat_features_idx=cat_feats_idx,
            weight_mode=variant.weight_mode,
        )
    else:
        vrnt_data, df_spl = cco.fill_data_ending_stratify_n(
            df,
            split,
            test_size=DEFAULT_TEST_SIZE,
            eval_size=DEFAULT_EVAL_SIZE,
            feature_names=variant.feature_names.get_list(),
            label_name=LABEL_NAME,
            other_names=(vrnt_key.get_stratify_names(is_text_style=False)),
            cat_features_idx=cat_feats_idx,
            weight_mode=variant.weight_mode,
            random_state=random_state,
        )
    return vrnt_data, df_spl
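
cco.fill_data_ending_chrono and cco.fill_data_ending_stratify_n are project helpers; the sketch below only illustrates the split idea they choose between (sizes, names, and the handling of labels, categorical indexes and weights are assumptions):

import pandas as pd
from sklearn.model_selection import train_test_split


def chrono_split(df: pd.DataFrame, test_size=0.15, eval_size=0.15):
    # Chronological split: the newest rows become eval/test, no shuffling,
    # so validation always happens on matches played after the training ones.
    n = len(df)
    n_test = int(n * test_size)
    n_eval = int(n * eval_size)
    train = df.iloc[: n - n_eval - n_test]
    evl = df.iloc[n - n_eval - n_test: n - n_test]
    test = df.iloc[n - n_test:]
    return train, evl, test


def stratified_split(df: pd.DataFrame, label_name: str,
                     test_size=0.15, random_state=0):
    # Stratified split: label proportions are preserved in both parts,
    # row order is ignored.
    return train_test_split(df, test_size=test_size,
                            stratify=df[label_name],
                            random_state=random_state)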
Example #9
def write_df(variant: cco.Variant, df: pd.DataFrame, subname: str):
    dirname = variant.persist_dirname(MODEL)
    cco.save_df(df, dirname, subname=subname)
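
cco.save_df / cco.load_df are project helpers; judging by make_preanalyze_df above, the on-disk layout is probably <dirname>/<subname>/df.csv. A stand-in pair under that assumption:

import os
from typing import Optional

import pandas as pd


def save_df(df: pd.DataFrame, dirname: str, subname: str) -> None:
    # assumed layout: <dirname>/<subname>/df.csv
    subdir = os.path.join(dirname, subname)
    os.makedirs(subdir, exist_ok=True)
    df.to_csv(os.path.join(subdir, "df.csv"), index=False)


def load_df(dirname: str, subname: str) -> Optional[pd.DataFrame]:
    filename = os.path.join(dirname, subname, "df.csv")
    if not os.path.isfile(filename):
        return None  # read_df above turns a missing file into a ValueError
    return pd.read_csv(filename)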
Example #10
def write_note(variant: cco.Variant, subname: str, text: str):
    dirname = variant.persist_dirname(MODEL)
    filename = os.path.join(dirname, subname, "note.txt")
    with open(filename, "w") as f:
        f.write(f"{text}\n")
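
write_note assumes the <dirname>/<subname> directory already exists. A defensive variant (an assumption, not the project's code) would create it first:

import os


def write_note_safe(variant: cco.Variant, subname: str, text: str) -> None:
    dirname = variant.persist_dirname(MODEL)
    note_dir = os.path.join(dirname, subname)
    os.makedirs(note_dir, exist_ok=True)  # create the target folder if missing
    with open(os.path.join(note_dir, "note.txt"), "w") as f:
        f.write(f"{text}\n")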