def _execute():
    # To start, take a look at "examples/environment_params.json" - This is the file we're giving our Environment below
    # In this file, we can define a bunch of default Environment parameters that we don't want to always explicitly provide

    # It works really well for things that won't be changing often, like the following:
    # - `root_results_path`, which we probably never want to change, so all our results go to one place;
    # - `target_column`, which will probably be a constant for your data
    # - `metrics_map`, if you're not using any fancy metrics, and you already know what you want
    # - `file_blacklist`, if you're angry at me for adding that one result file that's always useless
    # Any other parameter can be given a default here too, then overridden explicitly whenever you need something different (as we do with `cross_validation_params` below)
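
    # For reference, a minimal "environment_params.json" along those lines might look something like
    #   this (the keys match the attributes printed below; the values here are just hypothetical):
    #       {
    #           "root_results_path": "HyperparameterHunterAssets",
    #           "target_column": "diagnosis",
    #           "metrics_map": ["roc_auc_score"],
    #           "cross_validation_type": "StratifiedKFold",
    #           "runs": 1,
    #           "file_blacklist": ["heartbeat"]
    #       }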

    env = Environment(
        train_dataset=get_breast_cancer_data(),  # If your dataset is a str path, you can even add it to environment_params
        environment_params_path="./environment_params.json",  # Use this file for parameters not explicitly given
        cross_validation_params=dict(
            n_splits=5, shuffle=True, random_state=32
        ),  # Here we decide to override our default values
    )

    print(env.root_results_path)
    print(env.target_column)
    print(env.metrics_map)
    print(env.cross_validation_type)
    print(env.runs)
    print(env.file_blacklist)  # This includes some other values too, but you can ignore them
    # All of the above are from `environment_params_path`
    print(
        env.cross_validation_params
    )  # This is the value we provided above, rather than our `environment_params_path` default

    experiment = CVExperiment(model_initializer=KNeighborsClassifier, model_init_params={})
Example #2
def execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score'],
        cross_validation_type='StratifiedKFold',
        cross_validation_params=dict(n_splits=5, shuffle=True,
                                     random_state=32),
    )

    experiment = CrossValidationExperiment(
        model_initializer=KerasClassifier,
        model_init_params=build_fn,
        model_extra_params=dict(
            callbacks=[
                ModelCheckpoint(filepath=os.path.abspath('foo_checkpoint'),
                                save_best_only=True,
                                verbose=1),
                ReduceLROnPlateau(patience=5),
            ],
            batch_size=32,
            epochs=10,
            verbose=0,
            shuffle=True,
        ),
    )
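
# `build_fn` above is not defined in this snippet. For the Keras experiments it is assumed to be a
#   standard Keras model-building function, along the lines of this minimal sketch (the name, layer
#   sizes, and the `input_shape` parameter -- which HyperparameterHunter is assumed to supply -- are
#   illustrative, not taken from the original example):
def example_build_fn(input_shape=(30,)):
    from keras.layers import Dense, Dropout
    from keras.models import Sequential

    model = Sequential([
        Dense(32, activation="relu", input_shape=input_shape),
        Dropout(0.5),
        Dense(1, activation="sigmoid"),  # Single sigmoid unit for binary classification
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model
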
def _execute():
    #################### Environment ####################
    env = Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        root_results_path="HyperparameterHunterAssets",
        metrics_map=["roc_auc_score"],
        cross_validation_type="StratifiedKFold",
        cross_validation_params=dict(n_splits=5, shuffle=True,
                                     random_state=32),
    )

    #################### Experimentation ####################
    experiment = CVExperiment(
        model_initializer=KerasClassifier,
        model_init_params=dict(build_fn=_build_fn_experiment),
        model_extra_params=dict(callbacks=[ReduceLROnPlateau(patience=5)],
                                batch_size=32,
                                epochs=10,
                                verbose=0),
    )

    #################### Optimization ####################
    optimizer = BayesianOptimization(iterations=10)
    optimizer.set_experiment_guidelines(
        model_initializer=KerasClassifier,
        model_init_params=dict(build_fn=_build_fn_optimization),
        model_extra_params=dict(
            callbacks=[ReduceLROnPlateau(patience=Integer(5, 10))],
            batch_size=Categorical([32, 64], transform="onehot"),
            epochs=10,
            verbose=0,
        ),
    )
    optimizer.go()
Example #4
def _execute():
    env = Environment(
        train_dataset=get_toy_classification_data(),
        root_results_path='HyperparameterHunterAssets',
        metrics_map=['roc_auc_score'],
        cross_validation_type='StratifiedKFold',
        cross_validation_params=dict(n_splits=5, shuffle=True,
                                     random_state=32),
        runs=1,
    )

    optimizer = GradientBoostedRegressionTreeOptimization(
        iterations=10,
        read_experiments=True,
        random_state=None,
    )

    optimizer.set_experiment_guidelines(
        model_initializer=CatBoostClassifier,
        model_init_params=dict(iterations=100,
                               eval_metric=Categorical(
                                   ['Logloss', 'Accuracy', 'AUC'],
                                   transform='onehot'),
                               learning_rate=Real(low=0.0001, high=0.5),
                               depth=Integer(4, 7),
                               save_snapshot=False),
    )

    optimizer.go()

    print('')
def execute():
    #################### Environment ####################
    env = Environment(
        train_dataset=get_boston_data(),
        results_path="HyperparameterHunterAssets",
        holdout_dataset=get_holdout_data,
        target_column="DIS",
        metrics=["r2_score", "median_absolute_error"],
        cv_type="KFold",
        cv_params=dict(n_splits=10, random_state=1),
    )

    #################### CVExperiment ####################
    exp_0 = CVExperiment(
        model_initializer=Ridge,
        model_init_params=dict(),
        feature_engineer=FeatureEngineer([quantile_transform]),
    )

    #################### Optimization ####################
    # `opt_0` recognizes `exp_0`'s `feature_engineer` and its results as valid learning material
    # This is because `opt_0` marks the engineer step functions omitted by `exp_0` as `optional=True`
    opt_0 = DummyOptPro(iterations=10)
    opt_0.forge_experiment(
        model_initializer=Ridge,
        model_init_params=dict(),
        feature_engineer=FeatureEngineer([
            Categorical([quantile_transform, log_transform], optional=True),
            Categorical([standard_scale, standard_scale_BAD], optional=True),
            Categorical([square_sum_feature], optional=True),
        ]),
    )
    opt_0.go()
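
# The engineer step functions above (`quantile_transform`, `standard_scale`, `log_transform`, etc.)
#   are defined elsewhere. As a rough sketch, a scaling step is assumed to follow
#   HyperparameterHunter's convention of naming parameters after dataset segments and returning the
#   transformed frames (the function name and the `non_train_inputs` segment are assumptions here):
def example_standard_scale(train_inputs, non_train_inputs):
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    train_inputs[train_inputs.columns] = scaler.fit_transform(train_inputs.values)
    non_train_inputs[non_train_inputs.columns] = scaler.transform(non_train_inputs.values)
    return train_inputs, non_train_inputs
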
def _execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score'],
        cross_validation_type=StratifiedKFold,
        cross_validation_params=dict(n_splits=10, shuffle=True, random_state=32),
        runs=2,
    )

    optimizer = BayesianOptimization(iterations=100, read_experiments=True, random_state=None)

    optimizer.set_experiment_guidelines(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            max_depth=Integer(2, 20),
            learning_rate=Real(0.0001, 0.5),
            n_estimators=200,
            subsample=0.5,
            booster=Categorical(['gbtree', 'gblinear', 'dart']),
        ),
        model_extra_params=dict(
            fit=dict(
                eval_metric=Categorical(['auc', 'rmse', 'mae'])
            )
        ),
    )

    optimizer.go()
def execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path="HyperparameterHunterAssets",
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    experiment = CVExperiment(
        model_initializer=KerasClassifier,
        model_init_params=build_fn,
        model_extra_params=dict(
            callbacks=[
                ModelCheckpoint(
                    filepath=os.path.abspath("foo_checkpoint"), save_best_only=True, verbose=1
                ),
                ReduceLROnPlateau(patience=5),
            ],
            batch_size=32,
            epochs=10,
            verbose=0,
            shuffle=True,
        ),
    )
def _execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path="HyperparameterHunterAssets",
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type=StratifiedKFold,
        cv_params=dict(n_splits=10, shuffle=True, random_state=32),
        runs=2,
    )

    optimizer = BayesianOptPro(iterations=10, read_experiments=True, random_state=None)

    optimizer.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            max_depth=Integer(2, 20),
            learning_rate=Real(0.0001, 0.5),
            n_estimators=200,
            subsample=0.5,
            booster=Categorical(["gbtree", "gblinear", "dart"]),
        ),
        model_extra_params=dict(fit=dict(eval_metric=Categorical(["auc", "rmse", "mae"]))),
    )

    optimizer.go()
def execute():
    env = Environment(
        train_dataset=get_imbalanced_dataset(),
        results_path="HyperparameterHunterAssets",
        target_column="target",
        metrics=["roc_auc_score", "accuracy_score"],
        cv_type="KFold",
        cv_params=dict(n_splits=5, random_state=7),
    )

    # Since this is HyperparameterHunter, after all, we'll throw in some classic hyperparameter
    #   optimization just for fun. If you're like most people and you think it's absurd to test
    #   18 different `imblearn` techniques, feel free to comment out some `EngineerStep`s below

    opt_0 = ET(iterations=20, random_state=32)
    opt_0.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            max_depth=Integer(2, 20),
            n_estimators=Integer(50, 900),
            learning_rate=Real(0.0001, 0.9),
            subsample=0.5,
            booster=Categorical(["gbtree", "gblinear"]),
        ),
        feature_engineer=FeatureEngineer([
            Categorical(
                [
                    EngineerStep(resample_smote_tomek, stage="intra_cv"),
                    EngineerStep(over_sample_random, stage="intra_cv"),
                    EngineerStep(over_sample_smote, stage="intra_cv"),
                    EngineerStep(under_sample_random, stage="intra_cv"),
                    EngineerStep(under_sample_cluster_centroids,
                                 stage="intra_cv"),
                    EngineerStep(under_sample_tomek_links, stage="intra_cv"),
                    #################### GROUP 2 (EXTENDED) ####################
                    EngineerStep(resample_smote_enn, stage="intra_cv"),
                    EngineerStep(over_sample_ADASYN, stage="intra_cv"),
                    EngineerStep(over_sample_BorderlineSMOTE,
                                 stage="intra_cv"),
                    EngineerStep(over_sample_SVMSMOTE, stage="intra_cv"),
                    EngineerStep(under_sample_NearMiss, stage="intra_cv"),
                    EngineerStep(under_sample_CondensedNearestNeighbour,
                                 stage="intra_cv"),
                    EngineerStep(under_sample_OneSidedSelection,
                                 stage="intra_cv"),
                    EngineerStep(under_sample_NeighbourhoodCleaningRule,
                                 stage="intra_cv"),
                    EngineerStep(under_sample_EditedNearestNeighbours,
                                 stage="intra_cv"),
                    EngineerStep(under_sample_RepeatedEditedNearestNeighbour,
                                 stage="intra_cv"),
                    EngineerStep(under_sample_AllKNN, stage="intra_cv"),
                    EngineerStep(under_sample_InstanceHardnessThreshold,
                                 stage="intra_cv"),
                ],
                optional=True,
            )
        ]),
    )
    opt_0.go()
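
# The resampling steps referenced above (`over_sample_random`, `resample_smote_tomek`, ...) are
#   likewise defined elsewhere. They are assumed to wrap `imblearn` samplers and return resampled
#   training inputs and targets - a hypothetical sketch (assumes the targets arrive as a one-column
#   DataFrame):
def example_over_sample_random(train_inputs, train_targets):
    import pandas as pd
    from imblearn.over_sampling import RandomOverSampler

    sampler = RandomOverSampler(random_state=32)
    inputs, targets = sampler.fit_resample(train_inputs.values, train_targets.values.ravel())
    train_inputs = pd.DataFrame(inputs, columns=train_inputs.columns)
    train_targets = pd.DataFrame(targets, columns=train_targets.columns)
    return train_inputs, train_targets
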
def _execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score'],
        cross_validation_type=StratifiedKFold,
        cross_validation_params=dict(n_splits=10,
                                     shuffle=True,
                                     random_state=32),
        runs=1,
    )

    optimizer = RandomForestOptimization(
        iterations=100,
        read_experiments=True,
    )
    optimizer.set_experiment_guidelines(
        model_initializer=LGBMClassifier,
        model_init_params=dict(boosting_type=Categorical(['gbdt', 'dart']),
                               num_leaves=Integer(5, 20),
                               max_depth=-1,
                               min_child_samples=5,
                               subsample=0.5),
    )
    optimizer.go()
def execute():
    env = Environment(
        train_dataset=get_toy_classification_data(),
        root_results_path="HyperparameterHunterAssets",
        metrics_map=["roc_auc_score"],
        cross_validation_type=RepeatedStratifiedKFold,
        cross_validation_params=dict(n_splits=3, n_repeats=2, random_state=32),
        do_full_save=do_full_save,
    )

    experiment_0 = CVExperiment(model_initializer=XGBClassifier,
                                model_init_params=dict(subsample=0.01))
    # Pro Tip: By setting XGBoost's subsample ridiculously low, we can get bad scores on purpose

    # Upon completion of this Experiment, we see a warning that not all result files will be saved
    # This is because the final score of the Experiment was below our threshold of 0.75
    # Specifically, we skipped saving prediction files (OOF, holdout, test, or in-fold), and the heartbeat file

    # What still got saved is the Experiment's: key information, leaderboard position, and description file
    # These are saved to allow us to use the information for future hyperparameter optimization, and detect repeated Experiments
    # Additionally, the Experiment's script backup is saved, but that's because it's one of the first things that happens
    # For even finer control over what gets saved, use `do_full_save` together with `file_blacklist`
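
    # For reference, the `do_full_save` callable given to `Environment` above is just a function that
    #   receives an Experiment's result description dict and returns a bool. The one assumed here
    #   (matching the 0.75 threshold mentioned above) would look roughly like:
    #       def do_full_save(experiment_result):
    #           return experiment_result["final_evaluations"]["oof"]["roc_auc_score"] > 0.75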

    # Now, let's perform another Experiment that does a bit better than our intentionally miserable one
    experiment_1 = CVExperiment(model_initializer=XGBClassifier,
                                model_init_params=dict(subsample=0.5))
def env_3():
    def printer_callback():
        def printer_helper(_rep, _fold, _run, last_evaluation_results):
            print(f"{_rep}.{_fold}.{_run}   {last_evaluation_results}")

        return lambda_callback(
            on_experiment_start=printer_helper,
            on_experiment_end=printer_helper,
            on_repetition_start=printer_helper,
            on_repetition_end=printer_helper,
            on_fold_start=printer_helper,
            on_fold_end=printer_helper,
            on_run_start=printer_helper,
            on_run_end=printer_helper,
        )

    return Environment(
        train_dataset=get_toy_classification_data(),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        holdout_dataset=get_toy_classification_data(),
        cv_type=RepeatedStratifiedKFold,
        cv_params=dict(n_splits=3, n_repeats=2, random_state=32),
        runs=2,
        experiment_callbacks=[
            printer_callback(),
            confusion_matrix_oof(),
            confusion_matrix_holdout(),
        ],
    )
def _execute():
    env = Environment(
        train_dataset=get_toy_classification_data(target='diagnosis'),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score'],
        cross_validation_type=RepeatedStratifiedKFold,
        cross_validation_params=dict(n_splits=5, n_repeats=2, random_state=32),
    )

    optimizer = ExtraTreesOptimization(
        iterations=10,
        read_experiments=True,
        random_state=None,
    )

    optimizer.set_experiment_guidelines(
        model_initializer=RGFClassifier,
        model_init_params=dict(max_leaf=1000,
                               algorithm=Categorical(
                                   ['RGF', 'RGF_Opt', 'RGF_Sib']),
                               l2=Real(0.01, 0.3),
                               normalize=Categorical([True, False]),
                               learning_rate=Real(0.3, 0.7),
                               loss=Categorical(['LS', 'Expo', 'Log', 'Abs'])),
    )

    optimizer.go()
Example #14
def env_1():
    return Environment(
        train_dataset=get_breast_cancer_data(),
        environment_params_path="examples/advanced_examples/environment_params.json",
        results_path=assets_dir,
        cv_params=dict(n_splits=3, shuffle=True, random_state=32),
    )
def execute():
    """This is going to be a very simple example to illustrate what exactly HyperparameterHunter does, and how it revolutionizes
    hyperparameter optimization."""

    # Start by creating an `Environment` - This is where you define how Experiments (and optimization) will be conducted
    env = Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        root_results_path="HyperparameterHunterAssets",
        metrics_map=["roc_auc_score"],
        cross_validation_type="StratifiedKFold",
        cross_validation_params=dict(n_splits=10,
                                     shuffle=True,
                                     random_state=32),
    )

    # Now, conduct an `Experiment`
    # This tells HyperparameterHunter to use the settings in the active `Environment` to train a model with these hyperparameters
    experiment = CVExperiment(model_initializer=XGBClassifier,
                              model_init_params=dict(objective="reg:linear",
                                                     max_depth=3))

    # That's it. No annoying boilerplate code to fit models and record results
    # Now, the `Environment`'s `root_results_path` directory will contain new files describing the Experiment just conducted

    # Time for the fun part. We'll set up some hyperparameter optimization by first defining the `OptimizationProtocol` we want
    optimizer = BayesianOptimization(verbose=1)

    # Now we're going to say which hyperparameters we want to optimize.
    # Notice how this looks just like our `experiment` above
    optimizer.set_experiment_guidelines(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            # We're setting `objective` as a constant guideline - Not one to optimize
            objective="reg:linear",
            # Instead of using an int like the `experiment` above, we provide a space to search
            max_depth=Integer(2, 10),
        ),
    )
    # Notice that our range for `max_depth` includes the `max_depth=3` value we used in our `experiment` earlier

    optimizer.go()  # Now, we go

    assert experiment.experiment_id in [
        _[2] for _ in optimizer.similar_experiments
    ]
    # Here we're verifying that the `experiment` we conducted first was found by `optimizer` and used as learning material
    # You can also see via the console that we found `experiment`'s saved files, and used it to start optimization

    last_experiment_id = optimizer.current_experiment.experiment_id
    # Let's save the id of the experiment that was just conducted by `optimizer`

    optimizer.go()  # Now, we'll start up `optimizer` again...

    # And we can see that this second optimization round learned from both our first `experiment` and our first optimization round
    assert experiment.experiment_id in [
        _[2] for _ in optimizer.similar_experiments
    ]
    assert last_experiment_id in [_[2] for _ in optimizer.similar_experiments]
Example #16
def initialization_matching_env():
    return Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        cv_type="KFold",
        cv_params=dict(n_splits=2, shuffle=True, random_state=32),
    )
Example #17
def toy_environment_fixture():
    return Environment(
        train_dataset=pima_indians_head,
        holdout_dataset=holdout_first_row,
        metrics=["roc_auc_score"],
        target_column="class",
        cv_params=dict(n_splits=3, shuffle=True, random_state=32),
    )
Example #18
def env_0():
    return Environment(
        train_dataset=get_toy_classification_data(),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        cv_type="RepeatedStratifiedKFold",
        cv_params=dict(n_splits=3, n_repeats=2, random_state=32),
    )
Example #19
def env_0():
    return Environment(
        train_dataset=get_diabetes_data(target="target"),
        results_path=assets_dir,
        metrics=["mean_absolute_error"],
        cv_type="KFold",
        cv_params=dict(n_splits=2, shuffle=True, random_state=32),
    )
Example #20
def env_digits():
    return Environment(
        train_dataset=get_digits_data(),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=3, shuffle=True, random_state=32),
    )
Example #21
def env_breast_cancer():
    env = Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )
    return env
Example #22
def env_0():
    return Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        root_results_path=assets_dir,
        metrics_map=["roc_auc_score"],
        cross_validation_type="StratifiedKFold",
        cross_validation_params=dict(n_splits=3, shuffle=True,
                                     random_state=32),
    )
Example #23
def env_boston():
    return Environment(
        train_dataset=get_boston_data(),
        results_path=assets_dir,
        target_column="DIS",
        metrics=["r2_score"],
        cv_type="KFold",
        cv_params=dict(n_splits=3, random_state=1),
    )
Example #24
def dataset_recorder_env(request):
    return Environment(
        train_dataset=small_toy_dataset,
        holdout_dataset=getattr(request, "param", None),
        metrics=["accuracy_score"],
        target_column="t",
        cv_params=dict(n_splits=4, shuffle=True, random_state=32),
        experiment_callbacks=[dataset_recorder()],
    )
def boston_env():
    return Environment(
        train_dataset=boston_head,
        holdout_dataset=holdout_last_row,
        target_column="DIS",
        metrics=["r2_score", "median_absolute_error"],
        cv_type="RepeatedKFold",
        cv_params=dict(n_repeats=2, n_splits=3, random_state=1),
        experiment_callbacks=[dataset_recorder()],
    )
def env_5(request):
    return Environment(
        train_dataset=get_breast_cancer_data(),
        results_path=assets_dir,
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type=StratifiedKFold,
        cv_params=dict(n_splits=3, shuffle=True, random_state=32),
        experiment_recorders=request.param,
    )
def env_boston_regression():
    env = Environment(
        train_dataset=get_boston_data(),
        results_path=assets_dir,
        target_column="DIS",
        metrics=["median_absolute_error"],
        cv_type="KFold",
        cv_params=dict(n_splits=3, random_state=1),
    )
    return env
Example #28
def env_0():
    def do_full_save(experiment_result):
        return experiment_result["final_evaluations"]["oof"]["roc_auc_score"] > 0.75

    return Environment(
        train_dataset=get_toy_classification_data(),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        cv_type=RepeatedStratifiedKFold,
        cv_params=dict(n_splits=3, n_repeats=2, random_state=32),
        do_full_save=do_full_save,
    )
def env_boston():
    return Environment(
        train_dataset=get_boston_data(),
        results_path=assets_dir,
        holdout_dataset=get_holdout_data,
        target_column="DIS",
        metrics=["r2_score", "median_absolute_error"],
        cv_type="KFold",
        cv_params=dict(n_splits=3, random_state=1),
        runs=1,
        verbose=1,
    )
Example #30
def env_0():
    """`Environment` fixture that has `holdout_dataset` identical to `train_dataset` and is given
    `experiment_callbacks` consisting of the `lambda_callback` result of :func:`sentinel_checker`"""
    return Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        results_path=assets_dir,
        holdout_dataset=get_breast_cancer_data(target="target"),
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=2, shuffle=True, random_state=32),
        experiment_callbacks=[sentinel_checker()],
    )