def hyperopt_search(variant: cco.Variant, data, max_evals, mix_algo_ratio=None, random_state=None):
    """Run the hyperopt search that matches the classifier family of ``variant``.

    Dispatches to ``clf_hp_gb.do_fmin`` (``variant.is_gb()``),
    ``clf_hp_rf.do_fmin`` (``variant.is_rf()``) or ``clf_hp_cb.do_fmin``
    (``variant.is_cb_native()``, after converting ``data`` with
    ``clf_cat_tools.make_pools``), then reports the elapsed time.

    If none of the three predicates holds, no search is run but the
    "done" message is still emitted. Nothing is returned.
    """
    timer = stopwatch.Timer()

    if variant.is_gb():
        depth_space = hp.pchoice("max_depth", [(0.40, 1), (0.60, 2)])
        clf_hp_gb.do_fmin(
            data,
            max_evals,
            mix_algo_ratio=mix_algo_ratio,
            max_depth_space=depth_space,
            random_state=random_state,
        )
    elif variant.is_rf():
        depth_space = hp.pchoice("max_depth", [(0.45, 1), (0.55, 2)])
        clf_hp_rf.do_fmin(
            data,
            max_evals,
            mix_algo_ratio=mix_algo_ratio,
            max_depth_space=depth_space,
            random_state=random_state,
        )
    elif variant.is_cb_native():
        # The CatBoost tuner consumes pools built from `data`, not `data` itself.
        cb_pools = clf_cat_tools.make_pools(data)
        clf_hp_cb.do_fmin(
            cb_pools,
            max_evals,
            mix_algo_ratio=mix_algo_ratio,
            random_state=random_state,
            how="native",
        )

    elapsed_sec = timer.elapsed
    cco.out("{} done for {}".format(variant.name, cco.fmt(elapsed_sec)))
def make_main_reserve(
    variant: cco.Variant,
    reserve_size: float,
    is_shuffle: bool = False,
    random_state: int = RESERVE_RANDOM_STATE,
):
    """Build and store the main/reserve datasets for ``variant``.

    Reads the source csv file, cleans it (removes rows where important
    columns are empty (NaN), or where the rating restrictions are not met,
    or where the deciding-set density restrictions are not met),
    shuffles the rows (optional, when ``is_shuffle`` is set),
    adds only some columns ('year', 'opener_lose_match'),
    splits into main/reserve parts according to ``reserve_size`` and
    CHRONO_SPLIT, and saves the sets into the main/reserve folders of the
    given ``variant``.

    Raises:
        ValueError: if fewer than ``cco.DATASET_MIN_LEN`` rows remain after
            ``primary_edit``.
    """
    sex = variant.sex
    csv_path = cfg_dir.analysis_data_file(sex, typename="decidedset")
    data0 = pd.read_csv(csv_path, sep=",")
    primary_edit(variant, data0)

    # Refuse to continue with a dataset that is too small after cleaning.
    if data0.shape[0] < cco.DATASET_MIN_LEN:
        msg = f"after drop nan poor dataset ({data0.shape[0]} rows) sex:{sex}"
        print(msg)
        raise ValueError(msg)

    df = cco.stage_shuffle(data0, is_shuffle, random_state=random_state)

    # Computed unconditionally, although only the stratified split uses it.
    stratify_names = [LABEL_NAME] + variant.key.get_stratify_names(is_text_style=True)
    if not CHRONO_SPLIT:
        df_main, df_reserve = cco.split2_stratified(
            df,
            target_names=stratify_names,
            test_size=reserve_size,
            random_state=random_state,
        )
    else:
        df_main, df_reserve = cco.split2_by_year(
            df, test_years=cco.CHRONO_TEST_YEARS)
    assert min(df_main.shape[0], df_reserve.shape[0]) > 0

    if variant.weight_mode == cco.WeightMode.BALANCED:
        # Original note: df_main will be weighted later, during
        # fill_data_ending_stratify_n.
        # NOTE(review): the weight column is added to `df` after the split,
        # so it presumably does not reach df_main/df_reserve written below,
        # but may change the `df.shape` reported in the notes - confirm.
        clf_columns.add_weight_column(variant.weight_mode, df, LABEL_NAME)

    write_df(variant, df=df_main, subname="main")
    write_df(variant, df=df_reserve, subname="reserve")

    # `random_seed` is read from module scope, not passed in.
    msg = (f"shuffle: {is_shuffle} chrono_split: {CHRONO_SPLIT} "
           f"random_seed: {random_seed} random_state: {random_state}\n")
    msg_m = f"reserving size {1 - reserve_size} in {df.shape}, main {df_main.shape}\n"
    msg_r = f"reserving size {reserve_size} in {df.shape}, reserve {df_reserve.shape}\n"
    write_note(variant, subname="main", text=msg + msg_m)
    write_note(variant, subname="reserve", text=msg + msg_r)
    cco.out(msg_m)
def with_nan_columns(df, columns=None, raise_ifnan=False):
    """Return the names (as ``str``) of the columns of ``df`` that contain nulls.

    Args:
        df: DataFrame to inspect.
        columns: optional iterable of column labels to check (list, tuple,
            pandas Index, numpy array, ...). ``None`` or an empty iterable
            means "check every column of ``df``".
        raise_ifnan: when true, raise instead of returning if any checked
            column contains a null.

    Returns:
        List of column names, in checking order; empty when nothing is null.

    Raises:
        co.TennisError: if ``raise_ifnan`` is set and at least one checked
            column contains a null.
    """
    # Do not rely on `if columns`: the truth value of a pandas Index or a
    # multi-element numpy array is ambiguous and raises ValueError.
    # Materialising to a list lets emptiness be tested safely.
    check_columns = [] if columns is None else list(columns)
    if not check_columns:
        check_columns = df.columns

    result = [str(col) for col in check_columns if df[col].isnull().any()]
    if result:
        err_text = "detected columns with nan: {}".format(result)
        out(err_text)
        if raise_ifnan:
            raise co.TennisError(err_text)
    return result
def final_train_save(variant: cco.Variant, seed: int, random_state: int):
    """Fit the classifier of ``variant`` and save it.

    Seeds the run via ``put_seed`` and ``variant.set_random_state``, loads
    the data with ``fill_data``, fits and saves the classifier, then
    records a short note about the run. Nothing is returned.
    """
    put_seed(seed)
    variant.set_random_state(seed)

    # split=None is passed here; an earlier alternative, split=False, was
    # noted by the author as supplying eval=None.
    dataset, _ = fill_data(
        variant,
        split=None,
        is_shuffle=args.shuffle,
        random_state=random_state,
    )

    variant.make_clf_fit(dataset, metric_name, random_seed=seed)
    variant.save_clf(model_name=MODEL, metric_name=metric_name)

    summary = (f"final_train_save done var: {variant} "
               f"seed: {seed} random_state={random_state}")
    write_note(variant, subname="clf", text=summary)
    cco.out(summary)
def objective(space):
    """Hyperopt objective: score one CatBoost parameter set and return its loss.

    Reads names from the surrounding scope: ``how`` selects the evaluation
    mode ("cv", "sklearn" or "native"), ``pools`` supplies the ``train`` and
    ``eval`` data, ``metric_name`` and ``random_state`` go into the base
    parameters. The globals ``best_score`` and ``trials_count`` are updated
    across calls.

    Returns:
        dict with "loss" (``1.0 - score``) and "status" (``STATUS_OK``).

    Raises:
        co.TennisAbortError: when a quit request is seen (polled on every
            5th call).
        Exception: when ``how`` is not one of the supported modes.
    """
    global best_score, trials_count

    trials_count += 1
    # The quit key is polled only on every 5th trial.
    if trials_count % 5 == 0:
        if is_quit_pressed():
            raise co.TennisAbortError

    sampled = dict(**space)
    params = {
        "eval_metric": metric_name,
        "random_seed": random_state,
        "logging_level": "Silent",
    }
    # Sampled values win over the base settings when keys clash.
    params.update(sampled)

    if how == "cv":
        cv_table = cv(pools.train, params, stratified=True)
        score = np.max(cv_table[f"test-{metric_name}-mean"])
    elif how == "sklearn":
        model = CatBoostClassifier(**params)
        model.fit(pools.train)
        class1_proba = model.predict_proba(pools.eval)[:, 1]
        score = roc_auc_score(pools.eval.y, class1_proba)
    elif how == "native":
        model = CatBoost(params)
        # No eval_set is passed to fit; pools.eval is used only for scoring.
        model.fit(
            pools.train,
            eval_set=None,
            silent=True,
        )
        class1_proba = model.predict(pools.eval, prediction_type="Probability")[:, 1]
        score = roc_auc_score(pools.eval.get_label(), class1_proba)
    else:
        raise Exception("bad how arg {}".format(how))

    if score > best_score:
        if how != "cv":
            # A fitted model exists only in the non-cv modes.
            cco.out("achieved best {} at {} lrate: {} ntrees: {}".format(
                score, model.get_params(), model.learning_rate_, model.tree_count_))
        else:
            cco.out("achieved best {} at {}".format(score, params))
        best_score = score

    return {"loss": 1.0 - score, "status": STATUS_OK}
def objective(space):
    """Hyperopt objective: cross-validate one parameter set and return its loss.

    Builds a classifier with ``cco.make_clf`` from the sampled ``space``
    (using ``classifier`` and ``scaler`` from the surrounding scope), scores
    it with ``cross_val_score`` on ``data.train`` and uses the mean score.
    The globals ``best_score`` and ``trials_count`` are updated across calls.

    Returns:
        dict with "loss" (``1.0 - mean score``) and "status" (``STATUS_OK``).

    Raises:
        co.TennisAbortError: when a quit request is seen (polled on every
            10th call).
    """
    global best_score, trials_count

    trials_count += 1
    # The quit key is polled only on every 10th trial.
    if trials_count % 10 == 0:
        if is_quit_pressed():
            raise co.TennisAbortError

    sampled = dict(**space)
    estimator = cco.make_clf(classifier, sampled, scaler=scaler)
    fold_scores = cross_val_score(
        estimator, data.train.X, data.train.y, scoring=scoring, cv=cv)
    mean_score = fold_scores.mean()

    if mean_score > best_score:
        cco.out("achieved best score {} at {}".format(mean_score, sampled))
        best_score = mean_score

    return {"loss": 1.0 - mean_score, "status": STATUS_OK}
def do_fmin_impl(objective, space, max_evals, mix_algo_ratio=None, random_state=None):
    """Run ``fmin`` on a fresh ``Trials`` object and return that object.

    Uses ``tpe.suggest`` unless ``mix_algo_ratio`` is given, in which case
    the algorithm comes from ``clf_hp.get_mix_algo``. A
    ``co.TennisAbortError`` raised during the search is treated as a user
    quit: it is reported and the trials collected so far are returned.
    """
    trials = Trials()
    best = None
    try:
        if mix_algo_ratio is None:
            suggest_algo = tpe.suggest
        else:
            suggest_algo = clf_hp.get_mix_algo(mix_algo_ratio)
        # NOTE(review): some newer hyperopt releases appear to expect a
        # numpy Generator for `rstate` - confirm against the pinned version.
        best = fmin(
            fn=objective,
            space=space,
            algo=suggest_algo,
            max_evals=max_evals,
            trials=trials,
            rstate=np.random.RandomState(random_state),
        )
        print("fmin done")
    except co.TennisAbortError:
        cco.out("user quit event")

    # Without a completed search there is no best result to report.
    if best is None:
        return trials

    cco.out("best: {}".format(best))
    cco.out("best_trial result: {}".format(trials.best_trial.get("result")))
    return trials
def load_trials(max_evals):
    """Load previously saved trials, or start fresh if that is not possible.

    Tries to unpickle ``./trials.pkl``. On success the evaluation budget is
    extended by the number of trials already stored, so the search continues
    for ``max_evals`` additional evaluations. On any failure (missing file,
    unreadable pickle, ...) the reason is reported and a new ``Trials``
    object is returned with the budget unchanged.

    Args:
        max_evals: number of new evaluations requested.

    Returns:
        Tuple ``(trials, new_max_evals)``.
    """
    try:
        # The context manager closes the file even if unpickling fails;
        # a bare open() inside pickle.load() would leak the handle.
        # SECURITY: unpickling can execute arbitrary code - ./trials.pkl
        # must come from a trusted source (normally save_trials).
        with open("./trials.pkl", "rb") as fh:
            trials = pickle.load(fh)
        cco.out("Found saved Trials! Loading...")
        new_max_evals = len(trials.trials) + max_evals
        cco.out("Rerunning from {} trials to {} (+{}) trials".format(
            len(trials.trials), new_max_evals, max_evals))
    except Exception as err:
        # Deliberate best-effort: any load problem falls back to a new search.
        cco.out("load_trials failed: {}".format(err))
        trials = Trials()
        new_max_evals = max_evals
    return trials, new_max_evals
def scores_reserve(variant: cco.Variant, head: str = ""):
    """Report how the fitted classifier of ``variant`` scores on the reserve set.

    Reads the stored "reserve" frame, adds columns via ``add_columns``,
    then prints accuracy, ROC AUC, the classifier's ``tree_count_`` and
    ``learning_rate_``, and a win/loss summary taken from ``cco.get_result``.

    Raises:
        NotImplementedError: if the variant's feature names include
            categorical features (``exist_cat()`` is true).
    """
    sex = variant.sex
    if variant.feature_names.exist_cat():
        raise NotImplementedError(f"need pool from reserve, sex: {sex}")

    reserve_df = read_df(variant, subname="reserve")
    add_columns(variant.sex, reserve_df)
    feature_list = variant.feature_names.get_list()
    X, y = cco.get_xy(reserve_df, feature_list, LABEL_NAME)

    predicted = variant.clf.predict(X)
    acc = accuracy_score(y, predicted)
    # Column 1 of predict_proba is what feeds the AUC.
    class1_proba = variant.clf.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, class1_proba)

    cco.out(f"{head} acc: {acc} auc: {auc}")
    cco.out(f"treecnt: {variant.clf.tree_count_}")
    cco.out(f"lrate: {variant.clf.learning_rate_}")

    res = cco.get_result(variant, variant.clf, X_test=X, y_test=y)
    wl = res.negwl + res.poswl
    cco.out(f"wl {wl.ratio_size_str(precision=3)} "
            f"rat {round(wl.size / y.shape[0], 3)}")
def do_fmin(objective, space, max_evals, mix_algo_ratio=None):
    """Run ``fmin`` continuing from saved trials, then save and return them.

    The trials and the adjusted evaluation budget come from ``load_trials``.
    Uses ``tpe.suggest`` unless ``mix_algo_ratio`` is given, in which case
    the algorithm comes from ``get_mix_algo``. A ``co.TennisAbortError``
    raised during the search is treated as a user quit. The trials are
    saved with ``save_trials`` whether or not the search completed.
    """
    trials, max_evals = load_trials(max_evals)
    best = None
    try:
        if mix_algo_ratio is None:
            suggest_algo = tpe.suggest
        else:
            suggest_algo = get_mix_algo(mix_algo_ratio)
        best = fmin(
            fn=objective,
            space=space,
            algo=suggest_algo,
            max_evals=max_evals,
            trials=trials,
        )
    except co.TennisAbortError:
        cco.out("user quit event")

    found_best = best is not None
    if found_best:
        cco.out("best: {}".format(best))
        cco.out("best_trial result: {}".format(trials.best_trial.get("result")))

    save_trials(trials)
    return trials
def save_trials(trials):
    """Pickle ``trials`` to ``./trials.pkl`` and report that it was saved."""
    with open("./trials.pkl", "wb") as sink:
        pickle.dump(trials, sink)
    cco.out("saved trials")
def put_seed(seed: int) -> None:
    """Store ``seed`` in the module-level ``random_seed`` and apply it.

    The seed is passed to ``cco.set_seed`` and the stored value is then
    reported through ``cco.out``.
    """
    global random_seed
    random_seed = seed
    cco.set_seed(seed)
    cco.out(f"set random_seed {random_seed}")