def test_similar_experiments_unordered():
    """Check that an experiment with a single `EngineerStep` is considered "similar" by an
    Optimization Protocol with two `optional` `EngineerStep`s, where the second step is identical
    to the single step used by the standalone experiment. As of v3.0.0alpha2, this is expected to
    fail because the otherwise identical engineer steps occur at different indexes in
    `FeatureEngineer.steps` for the experiment and the OptPro. The experiment has
    `sqr_sum_feature` at index=0, while the same step in the OptPro is at index=1. Note that the
    step index in the OptPro is still 1 despite the fact that the other step immediately
    preceding it is `optional`"""
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path=assets_dir,
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    exp = CVExperiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(objective="reg:linear", subsample=0.5, max_depth=3),
        feature_engineer=FeatureEngineer([EngineerStep(sqr_sum_feature)]),
    )

    opt = BayesianOptPro(iterations=1)
    opt.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(objective="reg:linear", subsample=0.5, max_depth=3),
        feature_engineer=FeatureEngineer([
            Categorical([standard_scale, normalize, min_max_scale], optional=True),
            Categorical([sqr_sum_feature], optional=True),
        ]),
    )
    opt.go()

    assert exp.experiment_id in [_[2] for _ in opt.similar_experiments]
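# A minimal sketch of the index mismatch described in the docstring above (the
# layout is an editorial illustration, not OptPro internals):
#
#   experiment steps: [EngineerStep(sqr_sum_feature)]                    # index 0
#   OptPro steps:     [Categorical([...scalers...], optional=True),      # index 0
#                      Categorical([sqr_sum_feature], optional=True)]    # index 1
#
# Because similarity matching compares steps by position, the experiment's
# index-0 step is never paired with the OptPro's index-1 step, even though the
# two steps are otherwise identical, and even though the preceding `optional`
# choice may be dropped entirely.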
class ChoiceMMNormalizeSS:
    functions = Categorical([min_max_scale, normalize, standard_scale])
    engineers = Categorical(
        [EngineerStep(min_max_scale), EngineerStep(normalize), EngineerStep(standard_scale)]
    )
    o_functions = Categorical([min_max_scale, normalize, standard_scale], optional=True)
    o_engineers = Categorical(
        [EngineerStep(min_max_scale), EngineerStep(normalize), EngineerStep(standard_scale)],
        optional=True,
    )
def test_honorary_step_from_dict_value_error(step_dict, dimension):
    with pytest.raises(ValueError, match="`step_dict` could not be found in `dimension`"):
        EngineerStep.honorary_step_from_dict(step_dict, dimension)
def test_honorary_step_from_dict(step_dict, dimension, expected):
    actual = EngineerStep.honorary_step_from_dict(step_dict, dimension)
    assert isinstance(actual, EngineerStep)
    assert actual == expected
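# Editorial note, inferred from the two tests above rather than from library
# documentation: `EngineerStep.honorary_step_from_dict` reconstructs a concrete
# `EngineerStep` from its dict form (`name`/`f`/`params`/`stage`/`do_validate`)
# by locating the matching step inside the given `Categorical` dimension, and
# it raises `ValueError` ("`step_dict` could not be found in `dimension`") when
# no step in the dimension matches.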
##################################################
# `EngineerStep.honorary_step_from_dict` Tests
##################################################
@pytest.mark.parametrize(
    ["step_dict", "dimension", "expected"],
    [
        (
            dict(
                name="nothing_transform",
                f="2jDrngAKAWUo9OtZOL7VNfoJBj7XXy340dsgNjVj7AE=",
                params=["train_targets", "non_train_targets"],
                stage="intra_cv",
                do_validate=False,
            ),
            Categorical([EngineerStep(nothing_transform)], optional=True),
            EngineerStep(nothing_transform),
        ),
        (
            dict(
                name="nothing_transform",
                f="2jDrngAKAWUo9OtZOL7VNfoJBj7XXy340dsgNjVj7AE=",
                params=["train_targets", "non_train_targets"],
                stage="pre_cv",
                do_validate=False,
            ),
            Categorical(
                [EngineerStep(nothing_transform), EngineerStep(nothing_transform, stage="pre_cv")]
            ),
            EngineerStep(nothing_transform, stage="pre_cv"),
        end_data_unchanged[3],
    )


@pytest.mark.parametrize(
    ["prepped_experiment", "end_data"],
    [
        (None, end_data_unchanged),
        ([set_nan_0], end_data_sn),
        ([impute_negative_one_0], end_data_unchanged),
        ([set_nan_0, impute_negative_one_0], end_data_sn_ino),
        ([set_nan_0, impute_negative_one_1], end_data_sn_ino),
        ([set_nan_0, impute_negative_one_0, standard_scale_0], end_data_sn_ino_ss),
        ([set_nan_0, standard_scale_0], end_data_sn_ss),
        ([set_nan_0, EngineerStep(standard_scale_0)], end_data_sn_ss),
    ],
    indirect=["prepped_experiment"],
)
def test_feature_engineer_experiment(toy_environment_fixture, prepped_experiment, end_data):
    assert_frame_equal(prepped_experiment.data_train.input.T.d, end_data[0], check_dtype=False)
    assert_frame_equal(prepped_experiment.data_train.target.T.d, end_data[1], check_dtype=False)
    assert_frame_equal(prepped_experiment.data_holdout.input.T.d, end_data[2], check_dtype=False)
    assert_frame_equal(prepped_experiment.data_holdout.target.T.d, end_data[3], check_dtype=False)
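# `indirect=["prepped_experiment"]` is standard pytest indirect parametrization:
# each step list above is handed to the `prepped_experiment` fixture via
# `request.param`, so the fixture builds the prepped experiment before the
# frame comparisons run.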
def execute():
    env = Environment(
        train_dataset=get_imbalanced_dataset(),
        results_path="HyperparameterHunterAssets",
        target_column="target",
        metrics=["roc_auc_score", "accuracy_score"],
        cv_type="KFold",
        cv_params=dict(n_splits=5, random_state=7),
    )

    # Since this is HyperparameterHunter, after all, we'll throw in some classic hyperparameter
    # optimization just for fun. If you're like most people and you think it's absurd to test
    # 18 different `imblearn` techniques, feel free to comment out some `EngineerStep`s below
    opt_0 = ET(iterations=20, random_state=32)
    opt_0.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            max_depth=Integer(2, 20),
            n_estimators=Integer(50, 900),
            learning_rate=Real(0.0001, 0.9),
            subsample=0.5,
            booster=Categorical(["gbtree", "gblinear"]),
        ),
        feature_engineer=FeatureEngineer([
            Categorical(
                [
                    EngineerStep(resample_smote_tomek, stage="intra_cv"),
                    EngineerStep(over_sample_random, stage="intra_cv"),
                    EngineerStep(over_sample_smote, stage="intra_cv"),
                    EngineerStep(under_sample_random, stage="intra_cv"),
                    EngineerStep(under_sample_cluster_centroids, stage="intra_cv"),
                    EngineerStep(under_sample_tomek_links, stage="intra_cv"),
                    #################### GROUP 2 (EXTENDED) ####################
                    EngineerStep(resample_smote_enn, stage="intra_cv"),
                    EngineerStep(over_sample_ADASYN, stage="intra_cv"),
                    EngineerStep(over_sample_BorderlineSMOTE, stage="intra_cv"),
                    EngineerStep(over_sample_SVMSMOTE, stage="intra_cv"),
                    EngineerStep(under_sample_NearMiss, stage="intra_cv"),
                    EngineerStep(under_sample_CondensedNearestNeighbour, stage="intra_cv"),
                    EngineerStep(under_sample_OneSidedSelection, stage="intra_cv"),
                    EngineerStep(under_sample_NeighbourhoodCleaningRule, stage="intra_cv"),
                    EngineerStep(under_sample_EditedNearestNeighbours, stage="intra_cv"),
                    EngineerStep(under_sample_RepeatedEditedNearestNeighbour, stage="intra_cv"),
                    EngineerStep(under_sample_AllKNN, stage="intra_cv"),
                    EngineerStep(under_sample_InstanceHardnessThreshold, stage="intra_cv"),
                ],
                optional=True,
            )
        ]),
    )
    opt_0.go()
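# Hypothetical entry point for running this example as a script (the guard is an
# editorial assumption; it is not part of the excerpt above):
if __name__ == "__main__":
    execute()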
class ChoiceTarget:
    functions = Categorical([quantile_transform, nothing_transform])
    engineers = Categorical([EngineerStep(quantile_transform), EngineerStep(nothing_transform)])
class ChoiceNormalizeSS:
    functions = Categorical([normalize, standard_scale])
    engineers = Categorical([EngineerStep(normalize), EngineerStep(standard_scale)])
class ChoiceUpsample:
    functions = Categorical([pos_upsample, neg_upsample])
    engineers = Categorical([EngineerStep(pos_upsample), EngineerStep(neg_upsample)])
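# Editorial note on the membership tests below, inferred from the parametrized
# cases rather than from `Categorical`'s documentation: membership appears to
# match an `EngineerStep` by its underlying function, tolerating differing
# `stage` and `params` values. For example, `EngineerStep(pos_upsample,
# stage="pre_cv")` still counts as belonging to `ChoiceUpsample.engineers`.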
        ),
    ],
    indirect=["feature_engineer"],
)
def test_is_choice_dimension(feature_engineer, expected_choices):
    choices = get_choice_dimensions(
        feature_engineer, iter_attrs=lambda p, k, v: isinstance(v, FeatureEngineer)
    )
    assert choices == expected_choices


#################### ChoiceUpsample Contains ####################
@pytest.mark.parametrize(
    "space_item",
    [
        pytest.param(EngineerStep(pos_upsample), id="E(pos_upsample)"),
        pytest.param(EngineerStep(neg_upsample), id="E(neg_upsample)"),
        pytest.param(EngineerStep(pos_upsample, stage="pre_cv"), id="E(pos_upsample, stage)"),
        pytest.param(
            EngineerStep(neg_upsample, params=("train_inputs", "train_targets")),
            id="E(neg_upsample, params)",
        ),
    ],
)
def test_in_upsample_space(space_item):
    assert space_item in ChoiceUpsample.engineers


@pytest.mark.parametrize(
def execute():
    env = Environment(
        train_dataset="data/train.csv",
        test_dataset="data/test.csv",
        results_path="HyperparameterHunterAssets",
        target_column="target",
        metrics=dict(gini=gini_normalized_c),
        id_column="id",
        cv_type=StratifiedKFold,
        cv_params=dict(n_splits=5, shuffle=True, random_state=15),
        do_predict_proba=1,
        to_csv_params=dict(index=False),  # Drops index from final prediction files
    )

    exp = CVExperiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            n_estimators=200,
            max_depth=4,
            objective="binary:logistic",
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            gamma=1,
            reg_alpha=0,
            reg_lambda=1,
            nthread=2,
        ),
        model_extra_params=dict(
            eval_set=[
                (env.train_input, env.train_target),
                (env.validation_input, env.validation_target),
            ],
            eval_metric=gini_xgb,
            early_stopping_rounds=None,
            verbose=False,
        ),
        feature_engineer=FeatureEngineer(
            [feature_combinations, EngineerStep(upsample_train_data, stage="intra_cv")]
        ),
        feature_selector=[
            "ps_car_13",  # : 1571.65 / shadow 609.23
            "ps_reg_03",  # : 1408.42 / shadow 511.15
            "ps_ind_05_cat",  # : 1387.87 / shadow 84.72
            "ps_ind_03",  # : 1219.47 / shadow 230.55
            "ps_ind_15",  # : 922.18 / shadow 242.00
            "ps_reg_02",  # : 920.65 / shadow 267.50
            "ps_car_14",  # : 798.48 / shadow 549.58
            "ps_car_12",  # : 731.93 / shadow 293.62
            "ps_car_01_cat",  # : 698.07 / shadow 178.72
            "ps_car_07_cat",  # : 694.53 / shadow 36.35
            "ps_ind_17_bin",  # : 620.77 / shadow 23.15
            "ps_car_03_cat",  # : 611.73 / shadow 50.67
            "ps_reg_01",  # : 598.60 / shadow 178.57
            "ps_car_15",  # : 593.35 / shadow 226.43
            "ps_ind_01",  # : 547.32 / shadow 154.58
            "ps_ind_16_bin",  # : 475.37 / shadow 34.17
            "ps_ind_07_bin",  # : 435.28 / shadow 28.92
            "ps_car_06_cat",  # : 398.02 / shadow 212.43
            "ps_car_04_cat",  # : 376.87 / shadow 76.98
            "ps_ind_06_bin",  # : 370.97 / shadow 36.13
            "ps_car_09_cat",  # : 214.12 / shadow 81.38
            "ps_car_02_cat",  # : 203.03 / shadow 26.67
            "ps_ind_02_cat",  # : 189.47 / shadow 65.68
            "ps_car_11",  # : 173.28 / shadow 76.45
            "ps_car_05_cat",  # : 172.75 / shadow 62.92
            "ps_calc_09",  # : 169.13 / shadow 129.72
            "ps_calc_05",  # : 148.83 / shadow 120.68
            "ps_ind_08_bin",  # : 140.73 / shadow 27.63
            "ps_car_08_cat",  # : 120.87 / shadow 28.82
            "ps_ind_09_bin",  # : 113.92 / shadow 27.05
            "ps_ind_04_cat",  # : 107.27 / shadow 37.43
            "ps_ind_18_bin",  # : 77.42 / shadow 25.97
            "ps_ind_12_bin",  # : 39.67 / shadow 15.52
            "ps_ind_14",  # : 37.37 / shadow 16.65
            "ps_car_11_cat",  # Very nice spot from Tilii : https://www.kaggle.com/tilii7
        ],
    )
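# Editorial note on `model_extra_params["eval_set"]` above: `env.train_input`,
# `env.train_target`, `env.validation_input`, and `env.validation_target` are
# HyperparameterHunter dataset sentinels rather than concrete DataFrames; they
# are resolved to the actual fold data when each model is fit, which is what
# lets `eval_set` track the current fold inside cross-validation.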
        model_initializer=Ridge,
        model_init_params={},
        feature_engineer=FeatureEngineer([bad_quantile_transform]),
    )


##################################################
# `CVExperiment`: `FeatureEngineer` as List
##################################################
#################### Equality ####################
@pytest.mark.parametrize(
    ["steps_0", "steps_1"],
    [
        ([standard_scale], [standard_scale]),
        ([standard_scale, standard_scale], [standard_scale, standard_scale]),
        ([standard_scale], [EngineerStep(standard_scale, stage="intra_cv")]),
        ([nothing_transform, standard_scale], [nothing_transform, standard_scale]),
        ([nothing_transform, standard_scale], [EngineerStep(nothing_transform), standard_scale]),
        (
            [EngineerStep(nothing_transform, name="nothing_transform"), standard_scale],
            [nothing_transform, standard_scale],
        ),
    ],
)
def test_feature_engineer_list_experiment_equality(env_boston, steps_0, steps_1):