def train_lightgbm(X: pd.DataFrame, y: pd.Series, stored_models_key: str, save_to_disk: bool, config: Config):
    config[stored_models_key] = []

    # Build the LightGBM dataset once and reuse it across hyperopt iterations.
    data = lgb.Dataset(X, label=y, free_raw_data=False)
    data.construct()
    gc.collect()

    params = {
        "objective": config["objective"],
        "metric": config["metric"],
        "seed": config["seed"],
        "num_threads": config["n_threads"],
        "verbosity": -1,
    }
    seed = config["seed"]

    # Hyperopt search space: model hyperparameters, a nested choice of boosting type,
    # and a few training-procedure parameters (early stopping, CV splits, shuffling).
    space = {
        "learning_rate": hp.uniform("learning_rate", 0.01, 0.4),
        "max_depth": hp.choice("max_depth", [-1, 2, 3, 4, 5, 6, 10]),
        "num_leaves": hp.choice("num_leaves", np.linspace(4, 200, 50, dtype=int)),
        "feature_fraction": hp.quniform("feature_fraction", 0.1, 1., 0.1),
        "bagging_fraction": hp.quniform("bagging_fraction", 0.1, 1., 0.1),
        "bagging_freq": hp.choice("bagging_freq", np.linspace(0, 20, 10, dtype=int)),
        "reg_alpha": hp.uniform("reg_alpha", 0, 30),
        "reg_lambda": hp.uniform("reg_lambda", 0, 30),
        "min_child_weight": hp.uniform("min_child_weight", 1e-10, 20),
        "max_bin": hp.choice("max_bin", [50, 100, 255]),
        "boosting_type": hp.choice(
            "boosting_type",
            [
                {
                    "boosting_type": "gbdt",
                },
                {
                    "boosting_type": "dart",
                    "drop_rate": hp.uniform("drop_rate", 0.01, 0.6),
                    "max_drop": hp.choice(
                        "max_drop",
                        np.linspace(5, config["train_num_boost_round"] * .9, 10, dtype=int)),
                    "skip_drop": hp.uniform("skip_drop", 0.1, 0.7),
                },
                # {"boosting_type": "rf",
                #  "bagging_freq": 1,
                #  },
                # {"boosting_type": "goss",
                #  "bagging_freq": 0,
                #  },
            ]),

        # training-procedure parameters
        "early_stopping_rounds": hp.choice("early_stopping_rounds", [None, 50]),
        "cv_splits": hp.choice("cv_splits", np.linspace(3, 12, 10, dtype=int)),  # [4, 8]
        "shuffle": hp.choice("shuffle", [True, False]),
    }

    if config.is_classification():
        space["scale_pos_weight"] = hp.uniform("scale_pos_weight", 0.5, 10)
    else:
        space["objective"] = hp.choice(
            "objective",
            [
                "regression",
                "huber",
                # "fair",
                # "regression_l1",
            ])

    def objective(space_sample):
        iteration_start = time.time()

        # Flatten the nested boosting_type choice into the final hyperparameter dict.
        hyperparams = copy.deepcopy(space_sample)
        boosting_type = {}
        if "boosting_type" in hyperparams.keys():
            boosting_type = hyperparams.pop("boosting_type")
        hyperparams = {**params, **hyperparams, **boosting_type}

        scores, models, y_oof = train_lightgbm_cv(data=data, hyperparams=hyperparams, config=config)
        if config.is_classification():
            # hyperopt minimizes the loss, so negate the out-of-fold score for classification
            scores["oof"] = -scores["oof"]

        iteration_time = time.time() - iteration_start
        log("iteration time %.1f, loss %.5f" % (iteration_time, scores["oof"]))

        # Persist the models only if enough of the time budget remains.
        elapsed_time = time.time() - config["start_time"]
        have_time = (config["time_limit"] - elapsed_time - iteration_time) > 25
        if have_time:
            save_model(models, hyperparams, scores, y_oof, stored_models_key, save_to_disk, config)
            status = STATUS_OK
        else:
            status = STATUS_FAIL

        return {
            "loss": scores["oof"],
            "runtime": iteration_time,
            "scores": scores,
            "models": models,
            "y_oof": y_oof,
            "status": status,
        }

    # Run hyperopt one evaluation at a time so the time budget can be checked between trials.
    have_time = True
    eval_n = 0
    trials = Trials()
    while have_time:
        iteration_start = time.time()
        best = hyperopt.fmin(
            fn=objective,
            space=space,
            trials=trials,
            algo=tpe.suggest,
            max_evals=eval_n + 1,
            verbose=1,
            rstate=np.random.RandomState(eval_n)
        )  # TODO: (bug) if the seed stays the same, in some cases it samples the same values forever

        iteration_time = time.time() - iteration_start
        elapsed_time = time.time() - config["start_time"]
        have_time = (config["time_limit"] - elapsed_time - iteration_time) > 25
        eval_n += 1
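
# A minimal, hypothetical sketch (not part of the original module): after the search loop in
# train_lightgbm exits, the hyperopt Trials object can be queried for the best trial. The
# "result" keys below mirror the dict returned by objective() above; the helper name is
# illustrative only.
def _best_trial_artifacts(trials: Trials):
    best_result = trials.best_trial["result"]  # trial with the lowest 'loss' among STATUS_OK trials
    return best_result["models"], best_result["scores"], best_result["y_oof"]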
def time_series_detect(df: pd.DataFrame, config: Config):
    sample_size = 10000
    model_params = {
        "objective": "regression" if config["mode"] == "regression" else "binary",
        "metric": "rmse" if config["mode"] == "regression" else "auc",
        "learning_rate": 0.01,
        "verbosity": -1,
        "seed": 1,
        "max_depth": -1,
    }

    if config.is_train():
        datetime_columns = [c for c in df if c.startswith("datetime_")]
        id_columns = [c for c in df if c.startswith("id_")]

        # Candidate sort orders: each datetime column alone, each (id, datetime) pair,
        # and (via the for-else clause, which always runs since the loop never breaks)
        # each id column alone.
        sort_columns = []
        for dc in datetime_columns:
            sort_columns.append([dc])
            for ic in id_columns:
                sort_columns.append([ic, dc])
        else:
            for ic in id_columns:
                sort_columns.append([ic])

        scores = []
        config.limit_time_fraction(0.1)
        for sc in sort_columns:
            if config.is_time_fraction_limit():
                break

            Log.silent(True)
            df.sort_values(sc, inplace=True)

            # Work on a sample of the most recent rows under the candidate sort order.
            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            df_sample = df_sample[[c for c in df_sample if c.startswith("number_") or c == "target" or c in sc]]
            shift_columns(df_sample, group=sc[0] if len(sc) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)
            X_train, X_test, y_train, y_test = ts_split(X, y, test_size=0.5)

            # Compare a model that sees the shifted (lag) features against one that does not.
            model_sorted = lgb.train(model_params, lgb.Dataset(X_train, label=y_train), 3000,
                                     lgb.Dataset(X_test, label=y_test),
                                     early_stopping_rounds=100, verbose_eval=False)
            score_sorted = model_sorted.best_score["valid_0"][model_params["metric"]]

            sampled_columns = [c for c in X if "_shift" not in c]
            model_sampled = lgb.train(model_params, lgb.Dataset(X_train[sampled_columns], label=y_train), 3000,
                                      lgb.Dataset(X_test[sampled_columns], label=y_test),
                                      early_stopping_rounds=100, verbose_eval=False)
            score_sampled = model_sampled.best_score["valid_0"][model_params["metric"]]

            if config.is_classification():
                # AUC is maximized, so negate it to keep "lower is better" everywhere.
                score_sorted = -score_sorted
                score_sampled = -score_sampled

            Log.silent(False)
            Log.print("Sort: {}. Score sorted: {:0.4f}. Score sampled: {:0.4f}".format(sc, score_sorted, score_sampled))

            # Keep this sort order only if the lag features improve the score noticeably.
            score_ratio = score_sampled / score_sorted if config.is_regression() else abs(score_sorted / score_sampled)
            if score_ratio >= 1.03:
                Log.print(score_ratio)
                scores.append((score_sorted, sc))

        if len(scores) > 0:
            scores = sorted(scores, key=lambda x: x[0])
            Log.print("Scores: {}".format(scores))

            # Re-sort the full frame by the best sort order and keep only the shift columns
            # that carry enough feature importance.
            config["sort_values"] = scores[0][1]
            df.sort_values(config["sort_values"], inplace=True)

            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            shift_columns(df_sample, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            model = lgb.train(model_params, lgb.Dataset(X, label=y), 1000)

            fi = pd.Series(model.feature_importance(importance_type="gain"), index=X.columns)
            fi = fi[fi > 0].sort_values()
            selected_columns = fi[fi >= fi.quantile(0.75)].index.tolist()
            selected_shift_columns = [c.replace("_shift", "") for c in selected_columns if "_shift" in c]

            if len(selected_shift_columns) > 0:
                Log.print("Shift columns: {}".format(selected_shift_columns))
                config["shift_columns"] = selected_shift_columns

    if "shift_columns" in config:
        shift_columns(df, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None,
                      number_columns=config["shift_columns"])
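
# A minimal, hypothetical sketch of the kind of lag-feature helper that shift_columns is used as
# above; the project's actual implementation is not shown in this section, and the function name
# here is illustrative only. For each "number_" column (or an explicit list of base columns), it
# appends a one-step lag as "<column>_shift", optionally grouped by an id column.
def shift_columns_sketch(df: pd.DataFrame, group=None, number_columns=None):
    columns = number_columns if number_columns is not None else [c for c in df if c.startswith("number_")]
    for c in columns:
        # shift within each group when a grouping column is given, otherwise over the whole frame
        df[c + "_shift"] = df.groupby(group)[c].shift(1) if group is not None else df[c].shift(1)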