def execute():
    #################### Environment ####################
    env = Environment(
        train_dataset=get_boston_data(),
        results_path="HyperparameterHunterAssets",
        holdout_dataset=get_holdout_data,
        target_column="DIS",
        metrics=["r2_score", "median_absolute_error"],
        cv_type="KFold",
        cv_params=dict(n_splits=10, random_state=1),
    )

    #################### CVExperiment ####################
    exp_0 = CVExperiment(
        model_initializer=Ridge,
        model_init_params=dict(),
        feature_engineer=FeatureEngineer([quantile_transform]),
    )

    #################### Optimization ####################
    # `opt_0` recognizes `exp_0`'s `feature_engineer` and its results as valid learning material
    # This is because `opt_0` marks the engineer step functions omitted by `exp_0` as `optional=True`
    opt_0 = DummyOptPro(iterations=10)
    opt_0.forge_experiment(
        model_initializer=Ridge,
        model_init_params=dict(),
        feature_engineer=FeatureEngineer([
            Categorical([quantile_transform, log_transform], optional=True),
            Categorical([standard_scale, standard_scale_BAD], optional=True),
            Categorical([square_sum_feature], optional=True),
        ]),
    )
    opt_0.go()
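# Illustrative sketch only: the engineer step functions referenced above (`quantile_transform`,
# `log_transform`, `standard_scale`, etc.) are defined elsewhere in this example script. A
# target-transforming step might look roughly like the following, requesting datasets by
# parameter name and returning the fitted transformer as an extra value so that predictions can
# later be inverted. The exact body here is an assumption, not the library's definition.
from sklearn.preprocessing import QuantileTransformer


def quantile_transform(train_targets, non_train_targets):
    transformer = QuantileTransformer(output_distribution="normal")
    train_targets[train_targets.columns] = transformer.fit_transform(train_targets)
    non_train_targets[non_train_targets.columns] = transformer.transform(non_train_targets)
    return train_targets, non_train_targets, transformer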
def test_feature_engineer_list_experiment_equality(env_boston, steps_0, steps_1):
    """Test that the `feature_engineer` attribute constructed by
    :class:`~hyperparameter_hunter.experiments.CVExperiment` is the same whether it was given a
    list as input, or a :class:`~hyperparameter_hunter.feature_engineering.FeatureEngineer`"""
    exp_0 = CVExperiment(Ridge, feature_engineer=steps_0)
    exp_1 = CVExperiment(Ridge, feature_engineer=FeatureEngineer(steps_1))
    assert exp_0.feature_engineer == exp_1.feature_engineer

    # Repeat above, but switch which steps are wrapped in `FeatureEngineer`
    exp_2 = CVExperiment(Ridge, feature_engineer=steps_1)
    exp_3 = CVExperiment(Ridge, feature_engineer=FeatureEngineer(steps_0))
    assert exp_2.feature_engineer == exp_3.feature_engineer
def test_feature_engineer_list_optimization_equality(env_boston, steps_0, steps_1):
    """Test that the `feature_engineer` attribute constructed by an OptPro is the same whether
    given a list as input, or a
    :class:`~hyperparameter_hunter.feature_engineering.FeatureEngineer`"""
    opt_0, opt_1, opt_2, opt_3 = GBRT(), GBRT(), GBRT(), GBRT()

    opt_0.forge_experiment(Ridge, feature_engineer=steps_0)
    opt_1.forge_experiment(Ridge, feature_engineer=FeatureEngineer(steps_1))
    assert opt_0.feature_engineer == opt_1.feature_engineer

    # Repeat above, but switch which steps are wrapped in `FeatureEngineer`
    opt_2.forge_experiment(Ridge, feature_engineer=steps_1)
    opt_3.forge_experiment(Ridge, feature_engineer=FeatureEngineer(steps_0))
    assert opt_2.feature_engineer == opt_3.feature_engineer
def test_feature_engineer_list_experiment_inequality(env_boston, steps_0, steps_1):
    """Test that the `feature_engineer` attribute constructed by
    :class:`~hyperparameter_hunter.experiments.CVExperiment` is NOT the same when given a list as
    input vs. a :class:`~hyperparameter_hunter.feature_engineering.FeatureEngineer` when the two
    are actually different. This is a sanity check to make sure that the related test in this
    module, :func:`test_feature_engineer_list_experiment_equality`, is not simply equating
    everything"""
    exp_0 = CVExperiment(Ridge, feature_engineer=steps_0)
    exp_1 = CVExperiment(Ridge, feature_engineer=FeatureEngineer(steps_1))
    assert exp_0.feature_engineer != exp_1.feature_engineer

    # Repeat above, but switch which steps are wrapped in `FeatureEngineer`
    exp_2 = CVExperiment(Ridge, feature_engineer=steps_1)
    exp_3 = CVExperiment(Ridge, feature_engineer=FeatureEngineer(steps_0))
    assert exp_2.feature_engineer != exp_3.feature_engineer
def execute():
    env = Environment(
        train_dataset=get_imbalanced_dataset(),
        results_path="HyperparameterHunterAssets",
        target_column="target",
        metrics=["roc_auc_score", "accuracy_score"],
        cv_type="KFold",
        cv_params=dict(n_splits=5, random_state=7),
    )

    # Since this is HyperparameterHunter, after all, we'll throw in some classic hyperparameter
    # optimization just for fun. If you're like most people and you think it's absurd to test
    # 18 different `imblearn` techniques, feel free to comment out some `EngineerStep`s below
    opt_0 = ET(iterations=20, random_state=32)
    opt_0.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            max_depth=Integer(2, 20),
            n_estimators=Integer(50, 900),
            learning_rate=Real(0.0001, 0.9),
            subsample=0.5,
            booster=Categorical(["gbtree", "gblinear"]),
        ),
        feature_engineer=FeatureEngineer([
            Categorical(
                [
                    EngineerStep(resample_smote_tomek, stage="intra_cv"),
                    EngineerStep(over_sample_random, stage="intra_cv"),
                    EngineerStep(over_sample_smote, stage="intra_cv"),
                    EngineerStep(under_sample_random, stage="intra_cv"),
                    EngineerStep(under_sample_cluster_centroids, stage="intra_cv"),
                    EngineerStep(under_sample_tomek_links, stage="intra_cv"),
                    #################### GROUP 2 (EXTENDED) ####################
                    EngineerStep(resample_smote_enn, stage="intra_cv"),
                    EngineerStep(over_sample_ADASYN, stage="intra_cv"),
                    EngineerStep(over_sample_BorderlineSMOTE, stage="intra_cv"),
                    EngineerStep(over_sample_SVMSMOTE, stage="intra_cv"),
                    EngineerStep(under_sample_NearMiss, stage="intra_cv"),
                    EngineerStep(under_sample_CondensedNearestNeighbour, stage="intra_cv"),
                    EngineerStep(under_sample_OneSidedSelection, stage="intra_cv"),
                    EngineerStep(under_sample_NeighbourhoodCleaningRule, stage="intra_cv"),
                    EngineerStep(under_sample_EditedNearestNeighbours, stage="intra_cv"),
                    EngineerStep(under_sample_RepeatedEditedNearestNeighbour, stage="intra_cv"),
                    EngineerStep(under_sample_AllKNN, stage="intra_cv"),
                    EngineerStep(under_sample_InstanceHardnessThreshold, stage="intra_cv"),
                ],
                optional=True,
            )
        ]),
    )
    opt_0.go()
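# Illustrative sketch only: each of the resampling steps above is defined elsewhere in this
# example. One of them might look roughly like the following, applying imblearn's
# `RandomOverSampler` to the training fold only (hence `stage="intra_cv"`). The function body
# and the DataFrame re-wrapping are assumptions, not the example's actual definition.
import pandas as pd
from imblearn.over_sampling import RandomOverSampler


def over_sample_random(train_inputs, train_targets):
    sampler = RandomOverSampler(random_state=32)
    new_inputs, new_targets = sampler.fit_resample(train_inputs, train_targets.values.ravel())
    train_inputs = pd.DataFrame(new_inputs, columns=train_inputs.columns)
    train_targets = pd.DataFrame(new_targets, columns=train_targets.columns)
    return train_inputs, train_targets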
def prepped_experiment(request):
    """Build a partially prepared :class:`~hyperparameter_hunter.experiments.CVExperiment`
    instance. Specifically, automatic execution is disabled via `auto_start=False`, then the
    following methods are called:

    1. :meth:`~hyperparameter_hunter.experiments.BaseExperiment.preparation_workflow`,
    2. :meth:`~hyperparameter_hunter.experiments.BaseExperiment._initialize_random_seeds`, and
    3. :meth:`~hyperparameter_hunter.experiments.BaseExperiment.on_exp_start`, which initializes
       the four :mod:`~hyperparameter_hunter.data.datasets` classes, then performs pre-CV
       feature engineering

    Notes
    -----
    Directly calling `on_exp_start` is ok in this test because, after calling
    `_initialize_random_seeds`, `BaseExperiment` calls `execute`, which is implemented by
    `BaseCVExperiment` and only calls `cross_validation_workflow`, whose first task is to call
    `on_exp_start`. So nothing gets skipped in between"""
    #################### Build `feature_engineer` ####################
    feature_engineer = FeatureEngineer(steps=request.param)

    #################### Partially Prepare `CVExperiment` ####################
    experiment = CVExperiment(
        model_initializer=AdaBoostClassifier,
        model_init_params=dict(),
        feature_engineer=feature_engineer,
        auto_start=False,
    )
    experiment.preparation_workflow()
    # noinspection PyProtectedMember
    experiment._initialize_random_seeds()
    experiment.on_exp_start()
    return experiment
def test_validate_fe_steps_error_candidate_too_big(candidate, template, candidate_step_cast):
    """Test that `IncompatibleCandidateError` is raised by `validate_fe_steps` when `candidate`
    has more steps than `template`. See `test_validate_fe_steps` for parameter descriptions"""
    with pytest.raises(IncompatibleCandidateError):
        validate_fe_steps(candidate_step_cast(candidate), FeatureEngineer(template))
def test_feature_engineer_list_optimization_inequality(env_boston, steps_0, steps_1):
    """Test that the `feature_engineer` attribute constructed by an OptPro is NOT the same when
    given a list as input vs. a
    :class:`~hyperparameter_hunter.feature_engineering.FeatureEngineer` when the two are actually
    different. This is a sanity check to make sure that the related test in this module,
    :func:`test_feature_engineer_list_optimization_equality`, is not simply equating everything"""
    opt_0, opt_1, opt_2, opt_3 = GBRT(), GBRT(), GBRT(), GBRT()

    opt_0.forge_experiment(Ridge, feature_engineer=steps_0)
    opt_1.forge_experiment(Ridge, feature_engineer=FeatureEngineer(steps_1))
    assert opt_0.feature_engineer != opt_1.feature_engineer

    # Repeat above, but switch which steps are wrapped in `FeatureEngineer`
    opt_2.forge_experiment(Ridge, feature_engineer=steps_1)
    opt_3.forge_experiment(Ridge, feature_engineer=FeatureEngineer(steps_0))
    assert opt_2.feature_engineer != opt_3.feature_engineer
def test_validate_fe_steps_error_categorical_mismatch(candidate, template, candidate_step_cast):
    """Test that `IncompatibleCandidateError` is raised by `validate_fe_steps` when `candidate`
    has a step that does not fit in a `Categorical` step in `template`. See
    `test_validate_fe_steps` for parameter descriptions"""
    with pytest.raises(IncompatibleCandidateError):
        validate_fe_steps(candidate_step_cast(candidate), FeatureEngineer(template))
def test_validate_fe_steps(candidate, template, expected, candidate_step_cast):
    """Test that `validate_fe_steps` produces the `expected` output

    Parameters
    ----------
    candidate: List
        `candidate` value given to :func:`~hyperparameter_hunter.result_reader.validate_fe_steps`
    template: List
        `template` value given to :func:`~hyperparameter_hunter.result_reader.validate_fe_steps`
    expected: List
        Output expected from invoking `validate_fe_steps` with `candidate` and `template`
    candidate_step_cast: Callable
        Fixture (see :func:`candidate_step_cast`) applied to `candidate` before invoking
        `validate_fe_steps`"""
    actual = validate_fe_steps(candidate_step_cast(candidate), FeatureEngineer(template))
    # Because `actual` is going to be a list of `EngineerStep`/`RejectedOptional`, `expected` must
    # also be passed through a `FeatureEngineer` to convert each function to an `EngineerStep`
    assert actual == FeatureEngineer(expected).steps
def test_validate_fe_steps_error_concrete_missing(candidate, template, candidate_step_cast):
    """Test that `IncompatibleCandidateError` is raised by `validate_fe_steps` when `candidate`
    is missing a concrete (non-`Categorical`) step in `template`. See `test_validate_fe_steps`
    for parameter descriptions"""
    with pytest.raises(IncompatibleCandidateError):
        validate_fe_steps(candidate_step_cast(candidate), FeatureEngineer(template))
def fe_optimizer(request):
    if request.param is not None:
        request.param = FeatureEngineer(request.param)
    opt = BayesianOptPro()
    opt.forge_experiment(
        model_initializer=Ridge, model_init_params={}, feature_engineer=request.param
    )
    opt.go()
    return opt
def test_do_not_validate(env_boston):
    exp = CVExperiment(
        model_initializer=Ridge,
        model_init_params={},
        feature_engineer=FeatureEngineer([standard_scale], do_validate=False),
    )
    for step in exp.feature_engineer.steps:
        assert step.original_hashes == {}
        assert step.updated_hashes == {}
def test_inverse_type_error(env_boston):
    """Test that an error is raised if an `EngineerStep` function returns an extra value that is
    not a function or class instance. Extra return values are used for inverse transformations"""
    with pytest.raises(TypeError, match="`inversion` must be callable, or class with .*"):
        exp = CVExperiment(
            model_initializer=Ridge,
            model_init_params={},
            feature_engineer=FeatureEngineer([bad_quantile_transform]),
        )
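# Illustrative sketch only: `bad_quantile_transform` is defined elsewhere in this test module.
# It presumably mirrors a valid target-transforming step, except that its extra return value
# (the slot reserved for the inverse transformer) is not callable, which should trigger the
# `TypeError` asserted above. The body below is an assumption:
from sklearn.preprocessing import QuantileTransformer


def bad_quantile_transform(train_targets, non_train_targets):
    transformer = QuantileTransformer(output_distribution="normal")
    train_targets[train_targets.columns] = transformer.fit_transform(train_targets)
    non_train_targets[non_train_targets.columns] = transformer.transform(non_train_targets)
    return train_targets, non_train_targets, "not callable"  # Invalid `inversion` value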
def experiment_fixture(request):
    #################### Build `feature_engineer` ####################
    feature_engineer = FeatureEngineer(steps=request.param)

    #################### Execute `CVExperiment` ####################
    experiment = CVExperiment(
        model_initializer=AdaBoostClassifier,
        model_init_params=dict(),
        feature_engineer=feature_engineer,
    )
    return experiment
def test_reg_engineer(env_boston_regression, hh_assets, opt_pro):
    """Demonstrate problem with `BayesianOptPro` specifically - same configuration is fine with
    all other `OptPro`s"""
    opt = opt_pro(iterations=3, random_state=32, n_initial_points=1)
    opt.forge_experiment(
        model_initializer=AdaBoostRegressor,
        model_init_params=dict(),
        feature_engineer=FeatureEngineer([Categorical([standard_scale, min_max_scale, normalize])]),
    )
    opt.go()
def test_reg_engineer_integer_ok(env_boston_regression, hh_assets, opt_pro):
    """Identical to `test_reg_engineer`, except an `Integer` dimension is added to show that
    everything is fine now. The problem is limited not only to `BayesianOptPro`, but also to
    exclusively-`Categorical` search spaces"""
    opt = opt_pro(iterations=3, random_state=32, n_initial_points=1)
    opt.forge_experiment(
        model_initializer=AdaBoostRegressor,
        model_init_params=dict(n_estimators=Integer(10, 40)),
        feature_engineer=FeatureEngineer([Categorical([standard_scale, min_max_scale, normalize])]),
    )
    opt.go()
def test_reg_engineer_categorical(env_boston_regression, hh_assets, opt_pro):
    """Demonstrate that `BayesianOptPro` breaks with multiple `Categorical`s when
    `FeatureEngineer` is included in the dimensions"""
    opt = opt_pro(iterations=3, random_state=32, n_initial_points=1)
    opt.forge_experiment(
        model_initializer=AdaBoostRegressor,
        model_init_params=dict(loss=Categorical(["linear", "square", "exponential"])),
        feature_engineer=FeatureEngineer([Categorical([standard_scale, min_max_scale, normalize])]),
    )
    opt.go()
def test_similar_experiments_unordered():
    """Check that an experiment with a single `EngineerStep` is considered "similar" by an
    Optimization Protocol whose `FeatureEngineer` has two `optional` `EngineerStep`s, where the
    second step is identical to the single step used by the standalone experiment

    As of v3.0.0alpha2, this is expected to fail because the otherwise identical engineer steps
    occur at different indexes in `FeatureEngineer.steps` for the experiment and the OptPro. The
    experiment has `sqr_sum_feature` at index=0, while the same step in the OptPro is at index=1.
    Note that the step index in the OptPro is still 1 despite the fact that the other step
    immediately preceding it is `optional`"""
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path=assets_dir,
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    exp = CVExperiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(objective="reg:linear", subsample=0.5, max_depth=3),
        feature_engineer=FeatureEngineer([EngineerStep(sqr_sum_feature)]),
    )

    opt = BayesianOptPro(iterations=1)
    opt.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(objective="reg:linear", subsample=0.5, max_depth=3),
        feature_engineer=FeatureEngineer([
            Categorical([standard_scale, normalize, min_max_scale], optional=True),
            Categorical([sqr_sum_feature], optional=True),
        ]),
    )
    opt.go()

    assert exp.experiment_id in [_[2] for _ in opt.similar_experiments]
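# Illustrative sketch only: `sqr_sum_feature` is defined elsewhere in this test module. Judging
# by its name, it likely appends an engineered column such as the row-wise sum of squared
# feature values; the column name and exact formula below are assumptions:
def sqr_sum_feature(all_inputs):
    all_inputs["sqr_sum"] = (all_inputs ** 2).sum(axis="columns")
    return all_inputs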
def test_1():
    train_inputs, train_targets, holdout_inputs, holdout_targets = get_pima_data()

    feature_engineer = FeatureEngineer()
    feature_engineer.add_step(set_nan_0)
    assert feature_engineer._steps[-1].name == "set_nan_0"
    feature_engineer.add_step(impute_negative_one_0)
    assert feature_engineer._steps[-1].name == "impute_negative_one_0"

    feature_engineer(
        "pre_cv", train_inputs=train_inputs.copy(), holdout_inputs=holdout_inputs.copy()
    )

    expected_train_inputs = [
        [1, 85, 66, 29, -1, 26.6, 0.351, 31],
        [8, 183, 64, -1, -1, 23.3, 0.672, 32],
        [1, 89, 66, 23, 94, 28.1, 0.167, 21],
        [0, 137, 40, 35, 168, 43.1, 2.288, 33],
    ]
    expected_holdout_inputs = [[6, 148, 72, 35, -1, 33.6, 0.627, 50]]

    assert_array_almost_equal(feature_engineer.datasets["train_inputs"], expected_train_inputs)
    assert_array_almost_equal(feature_engineer.datasets["holdout_inputs"], expected_holdout_inputs)
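# Illustrative sketch only: `set_nan_0` and `impute_negative_one_0` are defined elsewhere in
# this test module. Judging by the expected values above (zeros in measurement columns such as
# insulin end up as -1), they likely behave roughly as follows; the exact column selection is an
# assumption:
import numpy as np


def set_nan_0(all_inputs):
    # Replace zeros (which denote missing measurements in the Pima dataset) with NaN
    all_inputs.iloc[:, 1:6] = all_inputs.iloc[:, 1:6].replace(0, np.nan)
    return all_inputs


def impute_negative_one_0(all_inputs):
    # Fill the NaNs created by `set_nan_0` with -1
    all_inputs = all_inputs.fillna(-1)
    return all_inputs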
def engineer_experiment(request):
    """`CVExperiment` fixture that supports provision of a `feature_engineer` through `request`

    Parameters
    ----------
    request: Object
        If `request` has a "param" attribute, it must be a list of feature engineering steps to
        provide to :class:`~hyperparameter_hunter.feature_engineering.FeatureEngineer`"""
    feature_engineer = FeatureEngineer(steps=getattr(request, "param", None))
    experiment = CVExperiment(
        model_initializer=SVC, model_init_params=dict(), feature_engineer=feature_engineer
    )
    return experiment
def test_reg_engineer_categorical_integer_ok(env_boston_regression, hh_assets, opt_pro):
    """Identical to `test_reg_engineer_categorical`, except `Integer` added to demonstrate that
    all `OptPro`s can optimize with `FeatureEngineer` if space is not exclusively `Categorical`"""
    opt = opt_pro(iterations=3, random_state=32, n_initial_points=1)
    opt.forge_experiment(
        model_initializer=AdaBoostRegressor,
        model_init_params=dict(
            loss=Categorical(["linear", "square", "exponential"]),
            n_estimators=Integer(10, 40),
        ),
        feature_engineer=FeatureEngineer([Categorical([standard_scale, min_max_scale, normalize])]),
    )
    opt.go()
def experiment_prep_fixture(request):
    #################### Build `feature_engineer` ####################
    feature_engineer = FeatureEngineer(steps=request.param)

    #################### Partially Prepare `CVExperiment` ####################
    experiment = CVExperiment(
        model_initializer=AdaBoostClassifier,
        model_init_params=dict(),
        feature_engineer=feature_engineer,
        auto_start=False,
    )
    experiment.preparation_workflow()
    # noinspection PyProtectedMember
    experiment._initialize_random_seeds()
    # noinspection PyProtectedMember
    experiment._initial_preprocessing()
    return experiment
def opt_pro(optimization_protocol):
    opt = optimization_protocol(iterations=3, random_state=32, n_initial_points=1)
    opt.forge_experiment(
        model_initializer=XGBRegressor,
        model_init_params=dict(
            max_depth=Integer(2, 10),
            n_estimators=Integer(50, 300),
            learning_rate=Real(0.1, 0.9),
            subsample=0.5,
            booster=Categorical(["gbtree", "gblinear"]),
        ),
        model_extra_params=dict(fit=dict(eval_metric=Categorical(["rmse", "mae"]))),
        feature_engineer=FeatureEngineer([Categorical([nothing_transform], optional=True)]),
    )
    opt.go()
    return opt
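# Illustrative sketch only: `nothing_transform` is defined elsewhere in this test module. As an
# identity step, it presumably returns its inputs unchanged, along with a no-op inversion so the
# step still satisfies the `inversion` requirements. The body below is an assumption:
def nothing_transform(train_targets, non_train_targets):
    return train_targets, non_train_targets, lambda d: d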
def test_2():
    train_inputs, train_targets, holdout_inputs, holdout_targets = get_pima_data()

    feature_engineer = FeatureEngineer()
    feature_engineer.add_step(set_nan_0)
    feature_engineer.add_step(impute_negative_one_0)
    feature_engineer.add_step(standard_scale_0)

    feature_engineer(
        "pre_cv", train_inputs=train_inputs.copy(), holdout_inputs=holdout_inputs.copy()
    )

    expected_train_inputs = [
        [-0.468521, -0.962876, 0.636364, 0.548821, -0.929624, -0.48321, -0.618238, 0.363422],
        [1.717911, 1.488081, 0.454545, -1.646464, -0.929624, -0.917113, -0.235491, 0.571092],
        [-0.468521, -0.862837, 0.636364, 0.109764, 0.408471, -0.285982, -0.837632, -1.713275],
        [-0.780869, 0.337632, -1.727273, 0.987878, 1.450776, 1.686305, 1.691360, 0.778761],
    ]
    expected_holdout_inputs = [
        [1.093216, 0.612739, 1.181818, 0.987878, -0.929624, 0.437190, -0.289147, 4.309145]
    ]

    assert_array_almost_equal(feature_engineer.datasets["train_inputs"], expected_train_inputs)
    assert_array_almost_equal(feature_engineer.datasets["holdout_inputs"], expected_holdout_inputs)
def test_validate_fe_steps_error_categorical_missing(
    candidate, template, candidate_suffix, template_suffix, candidate_step_cast
):
    """Test that `IncompatibleCandidateError` is raised by `validate_fe_steps` when `candidate`
    is missing a non-`optional` `Categorical` step in `template`

    Parameters
    ----------
    candidate: List
        `candidate` value given to :func:`~hyperparameter_hunter.result_reader.validate_fe_steps`
    template: List
        `template` value given to :func:`~hyperparameter_hunter.result_reader.validate_fe_steps`
    candidate_suffix: List
        Additional steps to append to the end of `candidate` before invoking `validate_fe_steps`
    template_suffix: List
        Additional steps to append to the end of `template` before invoking `validate_fe_steps`"""
    with pytest.raises(IncompatibleCandidateError):
        validate_fe_steps(
            candidate_step_cast(candidate + candidate_suffix),
            FeatureEngineer(template + template_suffix),
        )
def es_d(all_inputs):
    return all_inputs


def es_e(all_inputs):
    return all_inputs


##################################################
# Fixtures
##################################################
@pytest.fixture(
    params=[FeatureEngineer, lambda _: FeatureEngineer(_).get_key_data()["steps"]],
    ids=["EngineerSteps", "step_dicts"],
)
def candidate_step_cast(request):
    """Processing method applied to `candidate` to produce the candidate steps passed to
    :func:`~hyperparameter_hunter.result_reader.validate_fe_steps`. May be either 1) instantiation
    as a `FeatureEngineer` (which is how `template` is processed), or 2) the result of invoking
    :meth:`~hyperparameter_hunter.feature_engineering.FeatureEngineer.get_key_data` on the former,
    then taking its "steps" value. The second method produces a list of `EngineerStep`-like dicts,
    which more closely resembles a candidate retrieved from a saved Experiment result description
    file"""
    return request.param


##################################################
def execute(): env = Environment( train_dataset="data/train.csv", test_dataset="data/test.csv", results_path="HyperparameterHunterAssets", target_column="target", metrics=dict(gini=gini_normalized_c), id_column="id", cv_type=StratifiedKFold, cv_params=dict(n_splits=5, shuffle=True, random_state=15), do_predict_proba=1, to_csv_params=dict( index=False), # Drops index from final prediction files ) exp = CVExperiment( model_initializer=XGBClassifier, model_init_params=dict( n_estimators=200, max_depth=4, objective="binary:logistic", learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, gamma=1, reg_alpha=0, reg_lambda=1, nthread=2, ), model_extra_params=dict( eval_set=[ (env.train_input, env.train_target), (env.validation_input, env.validation_target), ], eval_metric=gini_xgb, early_stopping_rounds=None, verbose=False, ), feature_engineer=FeatureEngineer([ feature_combinations, EngineerStep(upsample_train_data, stage="intra_cv") ]), feature_selector=[ "ps_car_13", # : 1571.65 / shadow 609.23 "ps_reg_03", # : 1408.42 / shadow 511.15 "ps_ind_05_cat", # : 1387.87 / shadow 84.72 "ps_ind_03", # : 1219.47 / shadow 230.55 "ps_ind_15", # : 922.18 / shadow 242.00 "ps_reg_02", # : 920.65 / shadow 267.50 "ps_car_14", # : 798.48 / shadow 549.58 "ps_car_12", # : 731.93 / shadow 293.62 "ps_car_01_cat", # : 698.07 / shadow 178.72 "ps_car_07_cat", # : 694.53 / shadow 36.35 "ps_ind_17_bin", # : 620.77 / shadow 23.15 "ps_car_03_cat", # : 611.73 / shadow 50.67 "ps_reg_01", # : 598.60 / shadow 178.57 "ps_car_15", # : 593.35 / shadow 226.43 "ps_ind_01", # : 547.32 / shadow 154.58 "ps_ind_16_bin", # : 475.37 / shadow 34.17 "ps_ind_07_bin", # : 435.28 / shadow 28.92 "ps_car_06_cat", # : 398.02 / shadow 212.43 "ps_car_04_cat", # : 376.87 / shadow 76.98 "ps_ind_06_bin", # : 370.97 / shadow 36.13 "ps_car_09_cat", # : 214.12 / shadow 81.38 "ps_car_02_cat", # : 203.03 / shadow 26.67 "ps_ind_02_cat", # : 189.47 / shadow 65.68 "ps_car_11", # : 173.28 / shadow 76.45 "ps_car_05_cat", # : 172.75 / shadow 62.92 "ps_calc_09", # : 169.13 / shadow 129.72 "ps_calc_05", # : 148.83 / shadow 120.68 "ps_ind_08_bin", # : 140.73 / shadow 27.63 "ps_car_08_cat", # : 120.87 / shadow 28.82 "ps_ind_09_bin", # : 113.92 / shadow 27.05 "ps_ind_04_cat", # : 107.27 / shadow 37.43 "ps_ind_18_bin", # : 77.42 / shadow 25.97 "ps_ind_12_bin", # : 39.67 / shadow 15.52 "ps_ind_14", # : 37.37 / shadow 16.65 "ps_car_11_cat", # Very nice spot from Tilii : https://www.kaggle.com/tilii7 ], )
def feature_engineer(request):
    return FeatureEngineer(steps=request.param)
def fe_experiment(request):
    if request.param is not None:
        request.param = FeatureEngineer(request.param)
    return CVExperiment(
        model_initializer=Ridge, model_init_params={}, feature_engineer=request.param
    )