def test_similar_experiments_unordered():
    """Check that an experiment with a single `EngineerStep` is considered "similar" by an
    Optimization Protocol, with two `optional` `EngineerStep`s, where the second step is identical
    to the single step used by the standalone experiment. As of v3.0.0alpha2, this is expected to
    fail because the otherwise identical engineer steps occur at different indexes in
    `FeatureEngineer.steps` for the experiment and the OptPro. The experiment has `sqr_sum_feature`
    at index=0, while the same step in the OptPro is at index=1. Note that the step index in OptPro
    is still 1 despite the fact that the other step immediately preceding it is `optional`"""
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path=assets_dir,
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    exp = CVExperiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(objective="reg:linear",
                               subsample=0.5,
                               max_depth=3),
        feature_engineer=FeatureEngineer([EngineerStep(sqr_sum_feature)]),
    )

    opt = BayesianOptPro(iterations=1)
    opt.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(objective="reg:linear",
                               subsample=0.5,
                               max_depth=3),
        feature_engineer=FeatureEngineer([
            Categorical([standard_scale, normalize, min_max_scale],
                        optional=True),
            Categorical([sqr_sum_feature], optional=True),
        ]),
    )
    opt.go()

    assert exp.experiment_id in [_[2] for _ in opt.similar_experiments]
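

# A minimal sketch (not part of the original test) of the index mismatch described in the
# docstring above: the experiment's only step sits at index 0 of its `FeatureEngineer.steps`,
# while the matching step in the OptPro sits at index 1, because the `optional` scaling choice
# still occupies index 0 of the OptPro's `FeatureEngineer.steps`.
#
#   Experiment steps:  [sqr_sum_feature]                              # match at index 0
#   OptPro steps:      [<optional scaling choice>, sqr_sum_feature]   # match at index 1
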
class ChoiceMMNormalizeSS:
    functions = Categorical([min_max_scale, normalize, standard_scale])
    engineers = Categorical([
        EngineerStep(min_max_scale),
        EngineerStep(normalize),
        EngineerStep(standard_scale)
    ])
    o_functions = Categorical([min_max_scale, normalize, standard_scale],
                              optional=True)
    o_engineers = Categorical(
        [
            EngineerStep(min_max_scale),
            EngineerStep(normalize),
            EngineerStep(standard_scale)
        ],
        optional=True,
    )
Example #3
def test_honorary_step_from_dict_value_error(step_dict, dimension):
    with pytest.raises(ValueError,
                       match="`step_dict` could not be found in `dimension`"):
        EngineerStep.honorary_step_from_dict(step_dict, dimension)
Example #4
def test_honorary_step_from_dict(step_dict, dimension, expected):
    actual = EngineerStep.honorary_step_from_dict(step_dict, dimension)
    assert isinstance(actual, EngineerStep)
    assert actual == expected
Example #5
##################################################
# `EngineerStep.honorary_step_from_dict` Tests
##################################################
@pytest.mark.parametrize(
    ["step_dict", "dimension", "expected"],
    [
        (
            dict(
                name="nothing_transform",
                f="2jDrngAKAWUo9OtZOL7VNfoJBj7XXy340dsgNjVj7AE=",
                params=["train_targets", "non_train_targets"],
                stage="intra_cv",
                do_validate=False,
            ),
            Categorical([EngineerStep(nothing_transform)], optional=True),
            EngineerStep(nothing_transform),
        ),
        (
            dict(
                name="nothing_transform",
                f="2jDrngAKAWUo9OtZOL7VNfoJBj7XXy340dsgNjVj7AE=",
                params=["train_targets", "non_train_targets"],
                stage="pre_cv",
                do_validate=False,
            ),
            Categorical([
                EngineerStep(nothing_transform),
                EngineerStep(nothing_transform, stage="pre_cv")
            ]),
            EngineerStep(nothing_transform, stage="pre_cv"),
Example #6
    end_data_unchanged[3],
)


@pytest.mark.parametrize(
    ["prepped_experiment", "end_data"],
    [
        (None, end_data_unchanged),
        ([set_nan_0], end_data_sn),
        ([impute_negative_one_0], end_data_unchanged),
        ([set_nan_0, impute_negative_one_0], end_data_sn_ino),
        ([set_nan_0, impute_negative_one_1], end_data_sn_ino),
        ([set_nan_0, impute_negative_one_0, standard_scale_0], end_data_sn_ino_ss),
        ([set_nan_0, standard_scale_0], end_data_sn_ss),
        ([set_nan_0, EngineerStep(standard_scale_0)], end_data_sn_ss),
    ],
    indirect=["prepped_experiment"],
)
def test_feature_engineer_experiment(toy_environment_fixture,
                                     prepped_experiment, end_data):
    assert_frame_equal(prepped_experiment.data_train.input.T.d,
                       end_data[0],
                       check_dtype=False)
    assert_frame_equal(prepped_experiment.data_train.target.T.d,
                       end_data[1],
                       check_dtype=False)
    assert_frame_equal(prepped_experiment.data_holdout.input.T.d,
                       end_data[2],
                       check_dtype=False)
    assert_frame_equal(prepped_experiment.data_holdout.target.T.d,


def execute():
    env = Environment(
        train_dataset=get_imbalanced_dataset(),
        results_path="HyperparameterHunterAssets",
        target_column="target",
        metrics=["roc_auc_score", "accuracy_score"],
        cv_type="KFold",
        cv_params=dict(n_splits=5, random_state=7),
    )

    # Since this is HyperparameterHunter, after all, we'll throw in some classic hyperparameter
    #   optimization just for fun. If you're like most people and you think it's absurd to test
    #   18 different `imblearn` techniques, feel free to comment out some `EngineerStep`s below

    opt_0 = ET(iterations=20, random_state=32)
    opt_0.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            max_depth=Integer(2, 20),
            n_estimators=Integer(50, 900),
            learning_rate=Real(0.0001, 0.9),
            subsample=0.5,
            booster=Categorical(["gbtree", "gblinear"]),
        ),
        feature_engineer=FeatureEngineer([
            Categorical(
                [
                    #################### GROUP 1 ####################
                    EngineerStep(resample_smote_tomek, stage="intra_cv"),
                    EngineerStep(over_sample_random, stage="intra_cv"),
                    EngineerStep(over_sample_smote, stage="intra_cv"),
                    EngineerStep(under_sample_random, stage="intra_cv"),
                    EngineerStep(under_sample_cluster_centroids,
                                 stage="intra_cv"),
                    EngineerStep(under_sample_tomek_links, stage="intra_cv"),
                    #################### GROUP 2 (EXTENDED) ####################
                    EngineerStep(resample_smote_enn, stage="intra_cv"),
                    EngineerStep(over_sample_ADASYN, stage="intra_cv"),
                    EngineerStep(over_sample_BorderlineSMOTE,
                                 stage="intra_cv"),
                    EngineerStep(over_sample_SVMSMOTE, stage="intra_cv"),
                    EngineerStep(under_sample_NearMiss, stage="intra_cv"),
                    EngineerStep(under_sample_CondensedNearestNeighbour,
                                 stage="intra_cv"),
                    EngineerStep(under_sample_OneSidedSelection,
                                 stage="intra_cv"),
                    EngineerStep(under_sample_NeighbourhoodCleaningRule,
                                 stage="intra_cv"),
                    EngineerStep(under_sample_EditedNearestNeighbours,
                                 stage="intra_cv"),
                    EngineerStep(under_sample_RepeatedEditedNearestNeighbour,
                                 stage="intra_cv"),
                    EngineerStep(under_sample_AllKNN, stage="intra_cv"),
                    EngineerStep(under_sample_InstanceHardnessThreshold,
                                 stage="intra_cv"),
                ],
                optional=True,
            )
        ]),
    )
    opt_0.go()


class ChoiceTarget:
    functions = Categorical([quantile_transform, nothing_transform])
    engineers = Categorical(
        [EngineerStep(quantile_transform),
         EngineerStep(nothing_transform)])


class ChoiceNormalizeSS:
    functions = Categorical([normalize, standard_scale])
    engineers = Categorical(
        [EngineerStep(normalize),
         EngineerStep(standard_scale)])


class ChoiceUpsample:
    functions = Categorical([pos_upsample, neg_upsample])
    engineers = Categorical(
        [EngineerStep(pos_upsample),
         EngineerStep(neg_upsample)])
        ),
    ],
    indirect=["feature_engineer"],
)
def test_is_choice_dimension(feature_engineer, expected_choices):
    choices = get_choice_dimensions(
        feature_engineer,
        iter_attrs=lambda p, k, v: isinstance(v, FeatureEngineer))
    assert choices == expected_choices


#################### ChoiceUpsample Contains ####################
@pytest.mark.parametrize(
    "space_item",
    [
        pytest.param(EngineerStep(pos_upsample), id="E(pos_upsample)"),
        pytest.param(EngineerStep(neg_upsample), id="E(neg_upsample)"),
        pytest.param(EngineerStep(pos_upsample, stage="pre_cv"),
                     id="E(pos_upsample, stage)"),
        pytest.param(
            EngineerStep(neg_upsample,
                         params=("train_inputs", "train_targets")),
            id="E(neg_upsample, params)",
        ),
    ],
)
def test_in_upsample_space(space_item):
    assert space_item in ChoiceUpsample.engineers


@pytest.mark.parametrize(
Example #12
def execute():
    env = Environment(
        train_dataset="data/train.csv",
        test_dataset="data/test.csv",
        results_path="HyperparameterHunterAssets",
        target_column="target",
        metrics=dict(gini=gini_normalized_c),
        id_column="id",
        cv_type=StratifiedKFold,
        cv_params=dict(n_splits=5, shuffle=True, random_state=15),
        do_predict_proba=1,
        to_csv_params=dict(
            index=False),  # Drops index from final prediction files
    )

    exp = CVExperiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            n_estimators=200,
            max_depth=4,
            objective="binary:logistic",
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            gamma=1,
            reg_alpha=0,
            reg_lambda=1,
            nthread=2,
        ),
        model_extra_params=dict(
            eval_set=[
                (env.train_input, env.train_target),
                (env.validation_input, env.validation_target),
            ],
            eval_metric=gini_xgb,
            early_stopping_rounds=None,
            verbose=False,
        ),
        feature_engineer=FeatureEngineer([
            feature_combinations,
            EngineerStep(upsample_train_data, stage="intra_cv")
        ]),
        feature_selector=[
            "ps_car_13",  # : 1571.65 / shadow  609.23
            "ps_reg_03",  # : 1408.42 / shadow  511.15
            "ps_ind_05_cat",  # : 1387.87 / shadow   84.72
            "ps_ind_03",  # : 1219.47 / shadow  230.55
            "ps_ind_15",  # :  922.18 / shadow  242.00
            "ps_reg_02",  # :  920.65 / shadow  267.50
            "ps_car_14",  # :  798.48 / shadow  549.58
            "ps_car_12",  # :  731.93 / shadow  293.62
            "ps_car_01_cat",  # :  698.07 / shadow  178.72
            "ps_car_07_cat",  # :  694.53 / shadow   36.35
            "ps_ind_17_bin",  # :  620.77 / shadow   23.15
            "ps_car_03_cat",  # :  611.73 / shadow   50.67
            "ps_reg_01",  # :  598.60 / shadow  178.57
            "ps_car_15",  # :  593.35 / shadow  226.43
            "ps_ind_01",  # :  547.32 / shadow  154.58
            "ps_ind_16_bin",  # :  475.37 / shadow   34.17
            "ps_ind_07_bin",  # :  435.28 / shadow   28.92
            "ps_car_06_cat",  # :  398.02 / shadow  212.43
            "ps_car_04_cat",  # :  376.87 / shadow   76.98
            "ps_ind_06_bin",  # :  370.97 / shadow   36.13
            "ps_car_09_cat",  # :  214.12 / shadow   81.38
            "ps_car_02_cat",  # :  203.03 / shadow   26.67
            "ps_ind_02_cat",  # :  189.47 / shadow   65.68
            "ps_car_11",  # :  173.28 / shadow   76.45
            "ps_car_05_cat",  # :  172.75 / shadow   62.92
            "ps_calc_09",  # :  169.13 / shadow  129.72
            "ps_calc_05",  # :  148.83 / shadow  120.68
            "ps_ind_08_bin",  # :  140.73 / shadow   27.63
            "ps_car_08_cat",  # :  120.87 / shadow   28.82
            "ps_ind_09_bin",  # :  113.92 / shadow   27.05
            "ps_ind_04_cat",  # :  107.27 / shadow   37.43
            "ps_ind_18_bin",  # :   77.42 / shadow   25.97
            "ps_ind_12_bin",  # :   39.67 / shadow   15.52
            "ps_ind_14",  # :   37.37 / shadow   16.65
            "ps_car_11_cat",  # Very nice spot from Tilii : https://www.kaggle.com/tilii7
        ],
    )
Example #13
            model_initializer=Ridge,
            model_init_params={},
            feature_engineer=FeatureEngineer([bad_quantile_transform]),
        )


##################################################
# `CVExperiment`: `FeatureEngineer` as List
##################################################
#################### Equality ####################
@pytest.mark.parametrize(
    ["steps_0", "steps_1"],
    [
        ([standard_scale], [standard_scale]),
        ([standard_scale, standard_scale], [standard_scale, standard_scale]),
        ([standard_scale], [EngineerStep(standard_scale, stage="intra_cv")]),
        ([nothing_transform, standard_scale], [nothing_transform, standard_scale]),
        ([nothing_transform, standard_scale], [EngineerStep(nothing_transform), standard_scale]),
        (
            [
                EngineerStep(nothing_transform, name="nothing_transform"),
                standard_scale
            ],
            [nothing_transform, standard_scale],
        ),
    ],
)
def test_feature_engineer_list_experiment_equality(env_boston, steps_0,
                                                   steps_1):