Пример #1
0
def hyperopt_search(variant: cco.Variant,
                    data,
                    max_evals,
                    mix_algo_ratio=None,
                    random_state=None):
    """Run a hyperopt search for the classifier family selected by `variant`.

    Dispatch (first matching predicate wins):
      * ``variant.is_gb()``        -> ``clf_hp_gb.do_fmin`` with ``max_depth``
        drawn from {1, 2} with probabilities 0.40 / 0.60;
      * ``variant.is_rf()``        -> ``clf_hp_rf.do_fmin`` with ``max_depth``
        drawn from {1, 2} with probabilities 0.45 / 0.55;
      * ``variant.is_cb_native()`` -> pools are built from `data` and passed to
        ``clf_hp_cb.do_fmin`` with ``how="native"``.

    When no predicate matches, no search is run. In every case the elapsed
    time is reported through ``cco.out``. Returns None.
    """
    watch = stopwatch.Timer()
    # Keyword arguments common to every backend's do_fmin.
    shared_kwargs = dict(mix_algo_ratio=mix_algo_ratio,
                         random_state=random_state)

    if variant.is_gb():
        depth_space = hp.pchoice("max_depth", [(0.40, 1), (0.60, 2)])
        clf_hp_gb.do_fmin(data, max_evals,
                          max_depth_space=depth_space, **shared_kwargs)
    elif variant.is_rf():
        depth_space = hp.pchoice("max_depth", [(0.45, 1), (0.55, 2)])
        clf_hp_rf.do_fmin(data, max_evals,
                          max_depth_space=depth_space, **shared_kwargs)
    elif variant.is_cb_native():
        cat_pools = clf_cat_tools.make_pools(data)
        clf_hp_cb.do_fmin(cat_pools, max_evals, how="native", **shared_kwargs)

    elapsed_text = cco.fmt(watch.elapsed)
    cco.out("{} done for {}".format(variant.name, elapsed_text))
Пример #2
0
def make_main_reserve(
    variant: cco.Variant,
    reserve_size: float,
    is_shuffle: bool = False,
    random_state: int = RESERVE_RANDOM_STATE,
):
    """Build and store the main/reserve datasets for `variant`.

    Steps:
      1. read the "decidedset" analysis csv for ``variant.sex``;
      2. run ``primary_edit`` on it in place (per the original notes this
         drops rows whose important columns are NaN or that violate the
         rating / deciding-set density limits, and adds a few columns such as
         'year' and 'opener_lose_match' -- not visible from here);
      3. shuffle the rows when `is_shuffle` is set;
      4. split into main/reserve: by year when ``CHRONO_SPLIT`` is truthy,
         otherwise stratified with ``test_size=reserve_size``;
      5. write both parts plus a text note for each into the variant's
         "main" and "reserve" locations.

    Raises:
        ValueError: fewer than ``cco.DATASET_MIN_LEN`` rows remain after
            ``primary_edit``.
        AssertionError: either resulting part is empty.
    """
    sex = variant.sex
    raw_df = pd.read_csv(
        cfg_dir.analysis_data_file(sex, typename="decidedset"), sep=",")
    primary_edit(variant, raw_df)

    n_rows = raw_df.shape[0]
    if n_rows < cco.DATASET_MIN_LEN:
        msg = f"after drop nan poor dataset ({n_rows} rows) sex:{sex}"
        print(msg)
        raise ValueError(msg)

    df = cco.stage_shuffle(raw_df, is_shuffle, random_state=random_state)
    # Computed unconditionally, although only the stratified split uses it.
    target_names = [LABEL_NAME]
    target_names = target_names + variant.key.get_stratify_names(
        is_text_style=True)

    if CHRONO_SPLIT:
        parts = cco.split2_by_year(df, test_years=cco.CHRONO_TEST_YEARS)
    else:
        parts = cco.split2_stratified(
            df,
            target_names=target_names,
            test_size=reserve_size,
            random_state=random_state,
        )
    df_main, df_reserve = parts
    assert df_main.shape[0] > 0 and df_reserve.shape[0] > 0

    if variant.weight_mode == cco.WeightMode.BALANCED:
        # NOTE(review): the weight column is added to `df` after the split;
        # whether df_main/df_reserve see it depends on the split returning
        # views or copies -- confirm this is intended.
        # Per the original note, df_main is weighted later, during
        # fill_data_ending_stratify_n.
        clf_columns.add_weight_column(variant.weight_mode, df, LABEL_NAME)

    for subname, part in (("main", df_main), ("reserve", df_reserve)):
        write_df(variant, df=part, subname=subname)

    header = (f"shuffle: {is_shuffle} chrono_split: {CHRONO_SPLIT} "
              f"random_seed: {random_seed} random_state: {random_state}\n")
    msg_m = f"reserving size {1 - reserve_size} in {df.shape}, main {df_main.shape}\n"
    msg_r = f"reserving size {reserve_size} in {df.shape}, reserve {df_reserve.shape}\n"
    write_note(variant, subname="main", text=header + msg_m)
    write_note(variant, subname="reserve", text=header + msg_r)
    cco.out(msg_m)
Пример #3
0
def with_nan_columns(df, columns=None, raise_ifnan=False):
    """Return the names (as str) of the columns of `df` that contain nulls.

    Args:
        df: DataFrame to inspect.
        columns: optional iterable of column labels to check. Any iterable is
            accepted, including a pandas Index or a numpy array. ``None`` or
            an empty iterable means "check every column of `df`".
        raise_ifnan: when true and at least one column has nulls, raise
            instead of returning.

    Returns:
        List of column names, in checking order, for which
        ``df[col].isnull().any()`` is true. Empty when none has nulls.

    Raises:
        co.TennisError: nulls were found and `raise_ifnan` is true.
    """
    # Materialize first: testing an Index/ndarray for truthiness directly
    # raises "truth value is ambiguous", so test the plain list instead.
    check_columns = [] if columns is None else list(columns)
    if not check_columns:
        check_columns = df.columns

    result = [str(col) for col in check_columns if df[col].isnull().any()]
    if result:
        err_text = "detected columns with nan: {}".format(result)
        out(err_text)
        if raise_ifnan:
            raise co.TennisError(err_text)
    return result
Пример #4
0
def final_train_save(variant: cco.Variant, seed: int, random_state: int):
    """Fit the variant's classifier on the filled data and save it.

    Sets the global seed via ``put_seed(seed)`` and the variant's random state
    to `seed`, fills the data with ``split=None``, fits with `seed` as
    ``random_seed``, saves the classifier under ``MODEL`` / ``metric_name``,
    then records a summary line both as a "clf" note and through ``cco.out``.

    Args:
        variant: variant whose classifier is trained and saved.
        seed: seed for ``put_seed``, ``variant.set_random_state`` and the fit.
        random_state: passed to ``fill_data`` only.
    """
    put_seed(seed)
    variant.set_random_state(seed)

    # split=None (rather than False): presumably no evaluation part is
    # produced, so the second returned item is discarded -- verify against
    # fill_data.
    fill_kwargs = dict(
        split=None,
        is_shuffle=args.shuffle,
        random_state=random_state,
    )
    train_data, _ = fill_data(variant, **fill_kwargs)

    variant.make_clf_fit(train_data, metric_name, random_seed=seed)
    variant.save_clf(model_name=MODEL, metric_name=metric_name)

    summary = (f"final_train_save done var: {variant} "
               f"seed: {seed} random_state={random_state}")
    write_note(variant, subname="clf", text=summary)
    cco.out(summary)
Пример #5
0
    def objective(space):
        """Hyperopt objective: score one CatBoost parameter sample.

        Reads ``metric_name``, ``random_state``, ``how`` and ``pools`` from
        the enclosing scope, and updates the module globals ``best_score``
        and ``trials_count``.

        Scoring depends on ``how``:
          * "cv"      -- max of the ``test-<metric>-mean`` column returned by
            ``cv`` on ``pools.train``;
          * "sklearn" -- ROC AUC of ``CatBoostClassifier`` on ``pools.eval``;
          * "native"  -- ROC AUC of ``CatBoost`` on ``pools.eval``.

        Returns:
            dict with ``loss = 1.0 - score`` and ``status = STATUS_OK``.

        Raises:
            co.TennisAbortError: a quit was requested (polled every 5th trial).
            Exception: ``how`` is not one of the values above.
        """
        global best_score, trials_count
        trials_count += 1
        # Poll for a user quit only on every 5th trial.
        if trials_count % 5 == 0 and is_quit_pressed():
            raise co.TennisAbortError

        # Fixed settings first; sampled values override them on key clashes.
        params = dict(
            eval_metric=metric_name,
            random_seed=random_state,
            logging_level="Silent",
        )
        params.update(dict(**space))

        if how == "cv":
            cv_table = cv(pools.train, params, stratified=True)
            scr_val = np.max(cv_table[f"test-{metric_name}-mean"])
        elif how == "sklearn":
            mdl = CatBoostClassifier(**params)
            mdl.fit(pools.train)
            positive_proba = mdl.predict_proba(pools.eval)[:, 1]
            scr_val = roc_auc_score(pools.eval.y, positive_proba)
        elif how == "native":
            mdl = CatBoost(params)
            # No eval_set is given to fit; the eval pool is only scored below.
            mdl.fit(pools.train, eval_set=None, silent=True)
            positive_proba = mdl.predict(
                pools.eval, prediction_type="Probability")[:, 1]
            scr_val = roc_auc_score(pools.eval.get_label(), positive_proba)
        else:
            raise Exception("bad how arg {}".format(how))

        is_new_best = scr_val > best_score
        if is_new_best:
            if how == "cv":
                # The cv path has no fitted model to describe.
                cco.out("achieved best {} at {}".format(scr_val, params))
            else:
                cco.out("achieved best {} at {} lrate: {} ntrees: {}".format(
                    scr_val, mdl.get_params(), mdl.learning_rate_,
                    mdl.tree_count_))
            best_score = scr_val
        return dict(loss=1.0 - scr_val, status=STATUS_OK)
Пример #6
0
 def objective(space):
     """Hyperopt objective: mean cross-validation score of one sample.

     Builds a classifier from ``classifier`` / ``scaler`` (enclosing scope)
     with the sampled parameters, scores it with ``cross_val_score`` on
     ``data.train``, and updates the module globals ``best_score`` and
     ``trials_count``.

     Returns:
         dict with ``loss = 1.0 - mean score`` and ``status = STATUS_OK``.

     Raises:
         co.TennisAbortError: a quit was requested (polled every 10th trial).
     """
     global best_score, trials_count
     trials_count += 1
     # Poll for a user quit only on every 10th trial.
     if trials_count % 10 == 0 and is_quit_pressed():
         raise co.TennisAbortError

     args_dct = dict(**space)
     estimator = cco.make_clf(classifier, args_dct, scaler=scaler)
     fold_scores = cross_val_score(
         estimator,
         data.train.X,
         data.train.y,
         scoring=scoring,
         cv=cv,
     )
     scr_val = fold_scores.mean()

     is_new_best = scr_val > best_score
     if is_new_best:
         cco.out("achieved best score {} at {}".format(scr_val, args_dct))
         best_score = scr_val
     return dict(loss=1.0 - scr_val, status=STATUS_OK)
Пример #7
0
def do_fmin_impl(objective,
                 space,
                 max_evals,
                 mix_algo_ratio=None,
                 random_state=None):
    """Run ``fmin`` on a fresh ``Trials`` object and report the outcome.

    Args:
        objective: function minimized by hyperopt.
        space: hyperopt search space.
        max_evals: evaluation budget passed to ``fmin``.
        mix_algo_ratio: when None ``tpe.suggest`` is used, otherwise the
            algorithm returned by ``clf_hp.get_mix_algo(mix_algo_ratio)``.
        random_state: seed for the ``np.random.RandomState`` given to ``fmin``.

    Returns:
        The ``Trials`` object holding the evaluations performed.

    A ``co.TennisAbortError`` raised during the search is caught and logged;
    the best trial collected so far is still reported.
    """
    trials = Trials()
    best = None
    try:
        if mix_algo_ratio is None:
            suggest_algo = tpe.suggest
        else:
            suggest_algo = clf_hp.get_mix_algo(mix_algo_ratio)
        best = fmin(
            fn=objective,
            space=space,
            algo=suggest_algo,
            max_evals=max_evals,
            trials=trials,
            rstate=np.random.RandomState(random_state),
        )
    except co.TennisAbortError:
        cco.out("user quit event")
    else:
        print("fmin done")

    if best is not None:
        cco.out("best: {}".format(best))
    best_result = trials.best_trial.get("result")
    cco.out("best_trial result: {}".format(best_result))
    return trials
Пример #8
0
def load_trials(max_evals):
    """Load previously saved hyperopt trials from ``./trials.pkl`` if possible.

    Args:
        max_evals: number of additional evaluations requested.

    Returns:
        ``(trials, new_max_evals)``. On success `trials` is the unpickled
        object and `new_max_evals` is ``len(trials.trials) + max_evals``, so
        the search continues for `max_evals` more evaluations. On any failure
        (missing file, unreadable pickle, unexpected object) the failure is
        logged and a fresh ``Trials()`` with the unchanged `max_evals` is
        returned.
    """
    try:
        # NOTE(security): unpickling can execute arbitrary code; only load a
        # trials file this project wrote itself.
        # The context manager closes the handle even when unpickling fails.
        with open("./trials.pkl", "rb") as fh:
            trials = pickle.load(fh)
        cco.out("Found saved Trials! Loading...")
        n_done = len(trials.trials)
        new_max_evals = n_done + max_evals
        cco.out("Rerunning from {} trials to {} (+{}) trials".format(
            n_done, new_max_evals, max_evals))
    except Exception as err:
        # Deliberate best-effort: any problem means "start from scratch".
        cco.out("load_trials failed: {}".format(err))
        trials = Trials()
        new_max_evals = max_evals
    return trials, new_max_evals
Пример #9
0
def scores_reserve(variant: cco.Variant, head: str = ""):
    """Log how ``variant.clf`` scores on the variant's reserve dataset.

    Reads the "reserve" frame, adds columns via ``add_columns``, extracts X/y
    for the variant's feature list and ``LABEL_NAME``, then logs accuracy,
    ROC AUC, the classifier's ``tree_count_`` and ``learning_rate_``, and a
    win/loss summary taken from ``cco.get_result``.

    Args:
        variant: variant with a fitted ``clf``.
        head: prefix for the accuracy/AUC log line.

    Raises:
        NotImplementedError: ``variant.feature_names.exist_cat()`` is true
            (per the message, a pool built from the reserve set is needed).
    """
    sex = variant.sex
    if variant.feature_names.exist_cat():
        raise NotImplementedError(f"need pool from reserve, sex: {sex}")

    df = read_df(variant, subname="reserve")
    add_columns(variant.sex, df)
    feature_list = variant.feature_names.get_list()
    X, y = cco.get_xy(df, feature_list, LABEL_NAME)

    predicted_labels = variant.clf.predict(X)
    acc = accuracy_score(y, predicted_labels)
    positive_proba = variant.clf.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, positive_proba)
    cco.out(f"{head} acc: {acc} auc: {auc}")
    cco.out(f"treecnt: {variant.clf.tree_count_}")
    cco.out(f"lrate: {variant.clf.learning_rate_}")

    res = cco.get_result(variant, variant.clf, X_test=X, y_test=y)
    # Combined negative + positive win/loss counters.
    wl = res.negwl + res.poswl
    cco.out(f"wl {wl.ratio_size_str(precision=3)} "
            f"rat {round(wl.size / y.shape[0], 3)}")
Пример #10
0
def do_fmin(objective, space, max_evals, mix_algo_ratio=None):
    """Run ``fmin``, resuming from and then saving the persisted trials.

    Trials and the effective evaluation budget come from
    ``load_trials(max_evals)``. After the search, or after a user abort, the
    best trial is logged and the trials are written back with ``save_trials``.

    Args:
        objective: function minimized by hyperopt.
        space: hyperopt search space.
        max_evals: number of additional evaluations requested.
        mix_algo_ratio: when None ``tpe.suggest`` is used, otherwise the
            algorithm returned by ``get_mix_algo(mix_algo_ratio)``.

    Returns:
        The trials object used for the search.
    """
    trials, eval_budget = load_trials(max_evals)
    best = None
    try:
        if mix_algo_ratio is None:
            suggest_algo = tpe.suggest
        else:
            suggest_algo = get_mix_algo(mix_algo_ratio)
        best = fmin(
            fn=objective,
            space=space,
            algo=suggest_algo,
            max_evals=eval_budget,
            trials=trials,
        )
    except co.TennisAbortError:
        cco.out("user quit event")

    if best is not None:
        cco.out("best: {}".format(best))
    best_result = trials.best_trial.get("result")
    cco.out("best_trial result: {}".format(best_result))
    save_trials(trials)
    return trials
Пример #11
0
def save_trials(trials):
    """Pickle `trials` into ``./trials.pkl`` and log that it was saved.

    The object is serialized before the file is opened for writing. If
    pickling fails, the exception propagates and an existing ``trials.pkl``
    is left intact instead of being truncated.
    """
    payload = pickle.dumps(trials)
    with open("./trials.pkl", "wb") as fh:
        fh.write(payload)
    cco.out("saved trials")
Пример #12
0
def put_seed(seed: int) -> None:
    """Store `seed` in the module-level ``random_seed`` and apply it.

    The seed is passed to ``cco.set_seed`` and the stored value is logged
    through ``cco.out``.
    """
    global random_seed
    random_seed = seed
    cco.set_seed(seed)
    cco.out(f"set random_seed {random_seed}")