Example #1
def test_AUTO_stopping_metric_with_auc_sorting_metric():
    print("Check leaderboard with AUTO stopping metric and auc sorting metric")
    ds = prepare_data('binomial')
    exclude_algos = ["DeepLearning", "GLM", "StackedEnsemble"]
    aml = H2OAutoML(
        project_name="py_aml_lb_test_auto_stopping_metric_auc_sorting",
        seed=automl_seed,
        max_models=10,
        exclude_algos=exclude_algos,
        sort_metric='auc')
    aml.train(y=ds.target, training_frame=ds.train)

    check_leaderboard(
        aml, exclude_algos,
        ["auc", "logloss", "mean_per_class_error", "rmse", "mse"], "auc", True)
    non_se = get_partitioned_model_names(aml.leaderboard).non_se
    check_model_property(non_se, 'stopping_metric', True, "logloss")
Example #2
def test_balance_classes():
    print("Check balance_classes & related args work properly")
    ds = import_dataset()
    aml = H2OAutoML(
        project_name="py_aml_balance_classes_etc",
        exclude_algos=['XGBoost'],  # XGB doesn't support balance_classes
        max_models=3,
        balance_classes=True,
        class_sampling_factors=[0.2, 1.4],
        max_after_balance_size=3.0,
        seed=1)
    aml.train(y=ds['target'], training_frame=ds['train'])
    _, non_se, _ = get_partitioned_model_names(aml.leaderboard)
    amodel = h2o.get_model(non_se[0])
    assert amodel.params['balance_classes']['actual'] == True
    assert amodel.params['max_after_balance_size']['actual'] == 3.0
    assert amodel.params['class_sampling_factors']['actual'] == [0.2, 1.4]
Example #3
def test_stacked_ensembles_are_trained_after_timeout():
    print("Check that Stacked Ensembles are still trained after timeout")
    max_runtime_secs = 20
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_SE_after_timeout",
                    seed=1,
                    max_runtime_secs=max_runtime_secs,
                    exclude_algos=['XGBoost', 'DeepLearning'])
    start = time.time()
    aml.train(y=ds['target'], training_frame=ds['train'])
    end = time.time()
    assert end - start - max_runtime_secs > 0

    _, _, se = get_partitioned_model_names(aml.leaderboard)
    # We don't need to check that every SE was built: there may be only one if just one model type was built.
    assert len(se) > 0, "StackedEnsemble should still be trained after timeout"
Example #4
def fit_model_into_data_by(model_type, features,
                           model_params, X_train, X_test, X_train_selected, X_test_selected, y_train, y_test):
    if features == 'all':
        feature_cols = [col for col in list(X_train.columns) if
                        col not in model_params['TRAIN_TEST_SPLIT']['EXCLUDE_COL']]
        train = X_train
        test = X_test
    elif features == 'selected':
        feature_cols = [col for col in list(X_train_selected.columns) if
                        col not in model_params['TRAIN_TEST_SPLIT']['EXCLUDE_COL']]
        train = X_train_selected
        test = X_test_selected
    target_col = model_params['TRAIN_TEST_SPLIT']['TARGET_COL']

    if model_type == 'h2o':
        h2o.init(ip="127.0.0.1", max_mem_size_GB=2)
        hdf = h2o.H2OFrame(pd.concat([train, y_train], axis=1))
        aml = H2OAutoML(max_models=5, seed=1, max_runtime_secs=432000)
        aml.train(
            x=feature_cols,
            y=target_col,
            training_frame=hdf
        )
        # h2o.save_model expects a single H2O model, so save the AutoML leader rather than the AutoML object itself
        h2o.save_model(model=aml.leader,
                       path=model_params['DIR_NAME_OF_MODEL_PICKLE']+'fitted_'+features+'_features_'+model_type+'_model',
                       force=True)
        # joblib.dump(aml,
        #             model_params['DIR_NAME_OF_MODEL_PICKLE']+'fitted_'+features+'_features_'+model_type+'_model.pkl')
        y_test_pred = aml.predict(h2o.H2OFrame(test)).as_data_frame()['predict']
    elif model_type == 'rf':
        rf_reg = RandomForestRegressor(
            n_estimators=1000,
            max_depth=10,
            max_features='sqrt',
            n_jobs=-1,
            verbose=1,
            random_state=1
        )
        rf_reg.fit(train, y_train)
        joblib.dump(rf_reg, model_params['DIR_NAME_OF_MODEL_PICKLE']+'fitted_'+features+'_features_'+model_type+'_model.pkl')
        y_test_pred = rf_reg.predict(test)

    # sklearn metrics expect (y_true, y_pred); the order matters for r2_score
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
    print('R2: ', r2_score(y_test, y_test_pred))
    return y_test_pred
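For readers wiring this helper into a pipeline, a minimal usage sketch follows. It is only a sketch: the `model_params` layout mirrors the keys the function reads, while the CSV path, column names, and the 'selected' feature subset are hypothetical placeholders.

import pandas as pd
from sklearn.model_selection import train_test_split

# Hypothetical configuration mirroring the keys read by fit_model_into_data_by
model_params = {
    'TRAIN_TEST_SPLIT': {'EXCLUDE_COL': ['id'], 'TARGET_COL': 'price'},
    'DIR_NAME_OF_MODEL_PICKLE': './models/',
}

df = pd.read_csv('housing.csv')  # placeholder dataset with an 'id' column and a 'price' target
X = df.drop(columns=['price'])
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# The 'selected' features are just an illustrative subset here
selected_cols = [c for c in X_train.columns if c != 'id'][:10]
X_train_sel, X_test_sel = X_train[selected_cols], X_test[selected_cols]

y_pred = fit_model_into_data_by('rf', 'selected', model_params,
                                X_train, X_test, X_train_sel, X_test_sel,
                                y_train, y_test)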
Example #5
    def run_example(self):

        h2o.init()

        # Import a sample binary outcome train/test set into H2O
        train = h2o.import_file("./data/churn-train.csv")
        test = h2o.import_file("./data/churn-test.csv")
        #df = h2o.import_file("./data/churn.csv")
        #train, test = df.split_frame(ratios=[.75])

        # Identify predictors and response
        x = train.columns
        y = "churn_probability"
        x.remove(y)

        # For binary classification, response should be a factor
        #train[y] = train[y].asfactor()
        #test[y] = test[y].asfactor()

        # Run AutoML for up to 20 seconds, sorting the leaderboard by MAE
        aml = H2OAutoML(max_runtime_secs=20, seed=1, sort_metric="mae")
        aml.train(x=x, y=y, training_frame=train)

        # View the AutoML Leaderboard
        lb = aml.leaderboard
        lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

        # The leader model is stored at aml.leader; evaluate it on the test set
        print(aml.leader.model_performance(test))

        # To generate predictions on a test set, you can call predict() directly
        # on the H2OAutoML object or on the leader model

        preds = aml.predict(test)

        # or:
        preds = aml.leader.predict(test)
        print(preds)

        resp = [aml, aml.leader, preds.as_data_frame()]

        h2o.shutdown()

        return resp
Example #6
def test_remove_automl_with_xval_when_keeping_all_cv_details():
    target, train, valid, test = prepare_data()
    project_name = 'aml_with_xval_remove_test'
    max_models = 3
    nfolds = 5
    aml = H2OAutoML(project_name=project_name,
                    nfolds=nfolds,
                    max_models=max_models,
                    seed=1,
                    keep_cross_validation_predictions=True,
                    keep_cross_validation_fold_assignment=True,
                    keep_cross_validation_models=True)
    aml.train(y=target, training_frame=train)

    keys = list_keys_in_memory()
    # print(keys['all'].values)
    assert contains_leaderboard(project_name, keys)
    assert contains_event_log(project_name, keys)
    expectations = dict(
        models_base=max_models + 2,  # 2 SEs
        cv_models=(max_models+2) * nfolds,  # 1 cv model per fold for all models, incl. SEs
        predictions=(len(keys['cv_models'])  # cv predictions
                     + len(keys['models_base'])  # cv holdout predictions
                     ),
        metrics=(len(keys['cv_models']) * 3  # for each cv model, 1 on training frame, 1 on validation frame (=training for cv), one on adapted frame (to be removed with PUBDEV-6638)
                 + len(keys['models_base'])  # for each model, 1 on training_frame
                 + (2 * 1)  # for each SE, 1 on levelone training
                 )
    )
    for k, v in expectations.items():
        assert len(keys[k]) == v, "expected {} {}, but got {}".format(v, k, len(keys[k]))

    h2o.remove(aml)
    clean = list_keys_in_memory()
    print(clean['all'].values)
    assert not contains_leaderboard(project_name, clean)
    assert not contains_event_log(project_name, clean)
    assert len(clean['models_base']) == 0
    assert len(clean['cv_models']) == 0
    assert len(clean['models_all']) == 0
    assert len(clean['predictions']) == 0
    assert len(clean['metrics']) == 0
    assert len(clean['automl']) == 0
    for frame in [train, valid, test]:
        assert frame_in_cluster(frame), "frame {} has been removed from cluster".format(frame.frame_id)
Example #7
def test_frames_can_be_passed_as_key():
    print("Check that all AutoML frames can be passed as keys.")
    ds = import_dataset()

    kw_args = [
        dict(training_frame=ds.train.frame_id),
        dict(training_frame=ds.train, validation_frame=ds.valid.frame_id),
        dict(training_frame=ds.train, blending_frame=ds.valid.frame_id),
        dict(training_frame=ds.train, leaderboard_frame=ds.test.frame_id),
    ]

    for kwargs in kw_args:
        aml = H2OAutoML(project_name="py_aml_frames_as_keys",
                        seed=1,
                        max_models=1,
                        nfolds=0)
        aml.train(y=ds.target, **kwargs)
        h2o.remove(aml)
Example #8
    def train_automl(self, train: h2o.H2OFrame, x: List[str], y: str, weight: str) -> H2OGenericEstimator:
        """ Use AutoML to build a model.

        Args:
            train (h2o dataframe): training data containing columns x, y, and weight
            x (list of str): column names of model features
            y (str): column name of the ground truth
            weight (str): column name of row weights

        Returns:
            H2OGenericEstimator: best model found by the AutoML search

        """
        aml = H2OAutoML(max_runtime_secs=self.search_time, seed=1)
        aml.train(x=x, y=y, training_frame=train, weights_column=weight)
        best_model = aml.leader

        return best_model
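As a hedged illustration of how `train_automl` might be called, the sketch below wraps the same method body in a hypothetical host class exposing `search_time`; the CSV path and column names are placeholders, not part of the original code.

from typing import List

import h2o
from h2o.automl import H2OAutoML

class AutoMLTrainer:
    """Hypothetical host class for the train_automl method shown above."""
    def __init__(self, search_time: int = 120):
        self.search_time = search_time

    def train_automl(self, train: h2o.H2OFrame, x: List[str], y: str, weight: str):
        aml = H2OAutoML(max_runtime_secs=self.search_time, seed=1)
        aml.train(x=x, y=y, training_frame=train, weights_column=weight)
        return aml.leader

h2o.init()
train_hf = h2o.import_file("train.csv")  # placeholder path; assumed to contain a 'target' column
train_hf["w"] = 1  # constant row weights, for illustration only
leader = AutoMLTrainer(search_time=120).train_automl(
    train_hf,
    x=[c for c in train_hf.columns if c not in ("target", "w")],
    y="target",
    weight="w")
print(leader.model_performance(train_hf))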
Example #9
def test_optional_SEs_not_trained_in_reproducible_mode():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_SEs_reproducible_mode",
                    seed=1,
                    max_runtime_secs=30,
                    max_models=3,
                    include_algos=['StackedEnsemble', 'GLM', 'GBM'])  # 2 base models in group 1, 1 in group 2
    aml.train(y=ds.target, training_frame=ds.train)
    lb = get_leaderboard(aml, ['provider', 'step']).as_data_frame()
    print(lb)
    steps_SE = lb.query("provider == 'StackedEnsemble'").step.to_list()
    assert len(steps_SE) == 2
    assert 'best_of_family_1' not in steps_SE, "no SE should be built for first group (sequential reproducible mode)"
    assert 'best_of_family_2' not in steps_SE, "no SE should be built for second group (sequential reproducible mode)"
    assert 'best_of_family_3' not in steps_SE, "no SE should be built for third group (sequential reproducible mode)"
    assert 'best_of_family_xglm' in steps_SE, "final SE is missing"
    assert 'all_xglm' in steps_SE, "final SE is missing"
    assert 'best_of_family_gbm' not in steps_SE, 'no optional SE should be trained (sequential reproducible mode)'
Example #10
def test_leaderboard_for_binary_with_custom_sorting():
    print("Check leaderboard for Binomial sort by logloss")
    ds = import_dataset('binary', split=False)
    exclude_algos = ["GLM", "DeepLearning", "DRF"]
    aml = H2OAutoML(project_name="py_aml_lb_test_custom_binom_sort",
                    seed=automl_seed,
                    max_models=8,
                    nfolds=2,
                    stopping_rounds=1,
                    stopping_tolerance=0.5,
                    exclude_algos=exclude_algos,
                    sort_metric="logloss")
    aml.train(y=ds.target, training_frame=ds.train)

    check_leaderboard(
        aml, exclude_algos,
        ["logloss", "auc", "aucpr", "mean_per_class_error", "rmse", "mse"],
        "logloss")
Example #11
def test_custom_leaderboard_as_method():
    ds = import_dataset('binary')
    aml = H2OAutoML(project_name="py_aml_custom_lb_method_test",
                    max_models=5,
                    seed=42)
    aml.train(y=ds.target, training_frame=ds.train)

    assert_frame_equal(aml.get_leaderboard().as_data_frame(),
                       aml.leaderboard.as_data_frame())
    lb_ext = get_leaderboard(aml, extra_columns='ALL')
    assert_frame_equal(
        aml.get_leaderboard('ALL').as_data_frame(), lb_ext.as_data_frame())

    aml2 = get_automl(aml.project_name)
    assert_frame_equal(aml2.get_leaderboard().as_data_frame(),
                       aml.leaderboard.as_data_frame())
    assert_frame_equal(
        aml2.get_leaderboard('ALL').as_data_frame(), lb_ext.as_data_frame())
Example #12
def test_leaderboard_for_regression_with_custom_sorting_deviance():
    print("Check leaderboard for Regression sort by deviance")
    ds = import_dataset('regression', split=False)
    exclude_algos = ["GBM", "DeepLearning"]
    aml = H2OAutoML(project_name="py_aml_lb_test_custom_regr_deviance",
                    exclude_algos=exclude_algos,
                    max_models=10,
                    nfolds=2,
                    stopping_rounds=1,
                    stopping_tolerance=0.5,
                    seed=automl_seed,
                    sort_metric="deviance")
    aml.train(y=ds.target, training_frame=ds.train)

    check_leaderboard(
        aml, exclude_algos,
        ["mean_residual_deviance", "rmse", "mse", "mae", "rmsle"],
        "mean_residual_deviance")
Example #13
def test_no_x_train_set_only():
    print("AutoML run with x not provided and train set only")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml1",
                    stopping_rounds=3,
                    stopping_tolerance=0.001,
                    stopping_metric="AUC",
                    max_models=max_models,
                    seed=1234)
    aml.train(y=ds['target'], training_frame=ds['train'])
    assert aml.project_name == "py_aml1", "Project name is not set"
    assert aml.stopping_rounds == 3, "stopping_rounds is not set to 3"
    assert aml.stopping_tolerance == 0.001, "stopping_tolerance is not set to 0.001"
    assert aml.stopping_metric == "AUC", "stopping_metric is not set to `AUC`"
    assert aml.max_models == 2, "max_models is not set to 2"
    assert aml.seed == 1234, "seed is not set to `1234`"
    print("Check leaderboard")
    print(aml.leaderboard)
Example #14
def test_keep_cross_validation_fold_assignment_enabled_with_nfolds_eq_0():
    print(
        "Check that fold assignments were skipped when `keep_cross_validation_fold_assignment` = True and nfolds = 0"
    )
    ds = import_dataset()
    aml = H2OAutoML(
        project_name="py_aml_keep_cross_validation_fold_assignment_2",
        nfolds=0,
        max_models=3,
        seed=1,
        keep_cross_validation_fold_assignment=True)
    aml.train(y=ds['target'], training_frame=ds['train'])
    _, non_se, _ = get_partitioned_model_names(aml.leaderboard)
    amodel = h2o.get_model(non_se[0])
    assert amodel.params['keep_cross_validation_fold_assignment']['actual'] is False
    assert amodel._model_json["output"]["cross_validation_fold_assignment_frame_id"] is None
Example #15
def test_frames_cannot_be_passed_as_key():
    print("Check that all AutoML frames can be passed as keys.")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_frames_as_keys", seed=1, max_models=3, nfolds=0)

    kw_args = [
        dict(training_frame=ds['train'].frame_id),
        dict(training_frame=ds['train'], validation_frame=ds['valid'].frame_id),
        dict(training_frame=ds['train'], blending_frame=ds['valid'].frame_id),
        dict(training_frame=ds['train'], leaderboard_frame=ds['test'].frame_id),
    ]
    for kwargs in kw_args:
        try:
            aml.train(y=ds['target'], **kwargs)
            assert False, "should have thrown due to wrong frame key"
        except H2OTypeError as e:
            attr = next(k for k, v in kwargs.items() if v is not ds['train'])
            assert "'{}' must be a valid H2OFrame".format(attr) in str(e)
Example #16
def test_keep_cross_validation_fold_assignment_enabled_with_nfolds_neq_0():
    print(
        "Check that fold assignments were kept when `keep_cross_validation_fold_assignment` = True and nfolds > 1"
    )
    ds = import_dataset()
    aml = H2OAutoML(
        project_name="py_aml_keep_cross_validation_fold_assignment_1",
        nfolds=3,
        max_models=3,
        seed=1,
        keep_cross_validation_fold_assignment=True)
    aml.train(y=ds.target, training_frame=ds.train)
    base_models = get_partitioned_model_names(aml.leaderboard).base
    amodel = h2o.get_model(base_models[0])
    assert amodel.params['keep_cross_validation_fold_assignment']['actual'] is True
    assert amodel._model_json["output"]["cross_validation_fold_assignment_frame_id"] is not None
Example #17
    def execute(self, params, **kwargs):
        import h2o
        from h2o.automl import H2OAutoML

        h2o.init()

        train_X_frame = h2o.H2OFrame.from_python(
            self.marvin_dataset['train_X'])
        test_X_frame = h2o.H2OFrame.from_python(self.marvin_dataset['test_X'])

        x = train_X_frame.columns
        y = 'Species'
        x.remove(y)

        automl = H2OAutoML(max_models=20, seed=1)
        automl.train(x=x, y=y, training_frame=train_X_frame)

        self.marvin_model = automl
Example #18
def auto_ML(df, n_models, validation_ratio=.5):
    """
    Initialize H2O, create a new AutoML object, and train it, holding out a leaderboard (test) frame according to the validation ratio.
    """
    h2o.init(ip="localhost", port=54323)
    aml = H2OAutoML(max_models=n_models, seed=1)
    X, y, train, test = split(df, validation_ratio)

    aml.train(x=list(df.loc[:, 'f1':'f20'].columns),
              y='REDSHIFT_SPEC',
              training_frame=train,
              leaderboard_frame=test)

    lb = aml.leaderboard
    print("Leaderboard: ", lb.head(rows=lb.nrows), '\n')
    print("Leader: ", aml.leader, "\n")

    return aml
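A short, hypothetical usage sketch for `auto_ML` follows; it assumes a pandas DataFrame with feature columns f1..f20 and a REDSHIFT_SPEC target, plus the `split` helper referenced above, none of which are defined in this snippet.

import pandas as pd
import h2o

df = pd.read_csv("photometry.csv")  # placeholder file with columns f1..f20 and REDSHIFT_SPEC
aml = auto_ML(df, n_models=10, validation_ratio=0.5)

# Score new data with the leader model; the column layout must match the training features
preds = aml.leader.predict(h2o.H2OFrame(df.loc[:, 'f1':'f20'])).as_data_frame()
print(preds.head())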
Example #19
def test_event_log():
    ds = import_dataset()
    aml = H2OAutoML(project_name="test_event_log",
                    max_models=2,
                    seed=1234)
    aml.train(y=ds.target, training_frame=ds.train)

    print(aml.event_log)
    assert aml.event_log.columns == ['timestamp', 'level', 'stage', 'message', 'name', 'value']
    assert aml.event_log.nrows > 10

    print(aml.training_info)
    assert int(aml.training_info['stop_epoch']) > int(aml.training_info['start_epoch'])
    stop_dt = dt.datetime.fromtimestamp(int(aml.training_info['stop_epoch']))
    now = dt.datetime.now()
    # test that stop_epoch is time encoded as unix epoch
    assert abs(stop_dt - now) < dt.timedelta(minutes=1)
    assert abs(int(aml.training_info['duration_secs']) - (int(aml.training_info['stop_epoch']) - int(aml.training_info['start_epoch']))) <= 1
Example #20
def test_automl_stops_after_max_models():
    print("Check that automl gets interrupted after `max_models`")
    ds = import_dataset()
    max_models = 5
    aml = H2OAutoML(project_name="py_aml_max_models",
                    seed=1,
                    max_models=max_models)
    aml.train(y=ds['target'], training_frame=ds['train'])

    base_models = [
        m for m in
        [aml.leaderboard[i, 0] for i in range(0, (aml.leaderboard.nrows))]
        if not m.startswith('StackedEnsemble')
    ]
    assert len(
        base_models
    ) == max_models, "obtained {} base models when {} are expected".format(
        len(base_models), max_models)
Example #21
def test_modeling_plan_using_simplified_syntax():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_modeling_plan_simple_syntax",
                    max_models=3,
                    modeling_plan=[
                        ('DRF', ['XRT', 'def_1']),
                        ('GBM', 'grids'),
                        ('StackedEnsemble', ['best'])
                    ],
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    _, non_se, se = get_partitioned_model_names(aml.leaderboard)
    assert len(non_se) == 3
    assert len(se) == 1
    assert any('DRF' in name for name in non_se)
    assert any('XRT' in name for name in non_se)
    assert any('GBM_grid' in name for name in non_se)
    assert any('BestOfFamily' in name for name in se)
Example #22
def test_AUTO_stopping_metric_with_custom_sorting_metric():
    print(
        "Check leaderboard with AUTO stopping metric and rmse sorting metric")
    ds = prepare_data('regression')
    exclude_algos = ["DeepLearning", "GLM"]
    aml = H2OAutoML(
        project_name="py_aml_lb_test_auto_stopping_metric_custom_sorting",
        exclude_algos=exclude_algos,
        max_models=10,
        seed=automl_seed,
        sort_metric="rmse")
    aml.train(y=ds.target, training_frame=ds.train)

    check_leaderboard(
        aml, exclude_algos,
        ["rmse", "mean_residual_deviance", "mse", "mae", "rmsle"], "rmse")
    non_se = get_partitioned_model_names(aml.leaderboard).non_se
    check_model_property(non_se, 'stopping_metric', True, "RMSE")
Example #23
def test_max_runtime_secs_can_be_set_in_combination_with_max_models_and_max_runtime_wins():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_all_stopping_constraints",
                    seed=1,
                    max_models=20,
                    max_runtime_secs=12)
    aml.train(y=ds['target'], training_frame=ds['train'])
    max_runtime = aml._build_resp['build_control']['stopping_criteria'][
        'max_runtime_secs']
    max_models = aml._build_resp['build_control']['stopping_criteria'][
        'max_models']
    assert max_runtime == 12
    assert max_models == 20
    assert aml.leaderboard.nrows < 20
    assert int(
        aml.training_info['duration_secs']
    ) < 2 * max_runtime  # being generous to avoid errors on slow Jenkins
Example #24
def prostate_automl():

    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    # Split frames; make the splits repeatable to test multiple runs
    # TODO: note that frames with the following names get created, but some Python binding temp
    # magic gives random names to the frames that are given to AutoML.  See PUBDEV-4634.
    fr = df.split_frame(ratios=[.8, .1],
                        destination_frames=[
                            "prostate_train", "prostate_valid", "prostate_test"
                        ],
                        seed=42)

    #Set up train, validation, and test sets
    train = fr[0]
    valid = fr[1]
    test = fr[2]

    #    aml = H2OAutoML(max_models = 2, stopping_rounds=3, stopping_tolerance=0.001, project_name='prostate')
    aml = H2OAutoML(max_models=2,
                    stopping_rounds=2,
                    stopping_tolerance=0.05,
                    project_name='prostate',
                    exclude_algos=["GLM", "DeepLearning"])
    # aml = H2OAutoML(max_models=8, stopping_rounds=2, seed=42, project_name='prostate')

    train["CAPSULE"] = train["CAPSULE"].asfactor()
    valid["CAPSULE"] = valid["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    print(
        "AutoML (Binomial) run with x not provided with train, valid, and test"
    )
    aml.train(y="CAPSULE",
              training_frame=train,
              validation_frame=valid,
              leaderboard_frame=test)
    print(aml.leader)
    print(aml.leaderboard)
    assert set(aml.leaderboard.columns) == set([
        "model_id", "auc", "logloss", "aucpr", "mean_per_class_error", "rmse",
        "mse"
    ])
Example #25
def test_early_stopping_args():
    print("Check arguments to H2OAutoML class")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml0",
                    stopping_rounds=3,
                    stopping_tolerance=0.001,
                    stopping_metric="AUC",
                    max_models=max_models,
                    seed=1234,
                    exclude_algos=["DeepLearning"])
    aml.train(y=ds['target'], training_frame=ds['train'])
    assert aml.project_name == "py_aml0", "Project name is not set"
    assert aml.stopping_rounds == 3, "stopping_rounds is not set to 3"
    assert aml.stopping_tolerance == 0.001, "stopping_tolerance is not set to 0.001"
    assert aml.stopping_metric == "AUC", "stopping_metric is not set to `AUC`"
    assert aml.max_models == 2, "max_models is not set to 2"
    assert aml.seed == 1234, "seed is not set to `1234`"
    print("Check leaderboard")
    print(aml.leaderboard)
Example #26
def test_remove_automl_with_xval():
    ds = import_dataset()
    project_name = 'aml_with_xval_remove_test'
    max_models = 5
    nfolds = 5
    aml = H2OAutoML(project_name=project_name,
                    nfolds=nfolds,
                    max_models=max_models,
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train, validation_frame=ds.valid, leaderboard_frame=ds.test)

    keys = list_keys_in_memory()
    assert aml.key.startswith(project_name)
    assert contains_leaderboard(aml.key, keys)
    assert contains_event_log(aml.key, keys)
    num_SEs = len(keys['metalearners'])
    print({k: len(v) for k, v in keys.items()})
    expectations = dict(
        models_base=max_models + num_SEs,
        cv_models=0,
        predictions=0,
        metrics=(max_models * 3  # for each non-SE model, 1 on training_frame, 1 on validation_frame, 1 on leaderboard_frame
                 + (num_SEs * 2)  # for each SE model, 1 on training frame, 1 on leaderboard frame
                 + (num_SEs * 2)  # for each SE metalearner, 1+1 on levelone training+validation
                 + (1 if any(("DeepLearning" in x for x in keys["metrics"])) else 0)  # DeepLearning has 2 training metrics (IDK why)
                 )
    )
    for k, v in expectations.items():
        assert len(keys[k]) == v, "expected {} {}, but got {}".format(v, k, len(keys[k]))

    h2o.remove(aml)
    clean = list_keys_in_memory()
    print(clean['all'].values)
    assert not contains_leaderboard(aml.key, clean)
    assert not contains_event_log(aml.key, clean)
    assert len(clean['models_base']) == 0
    assert len(clean['cv_models']) == 0
    assert len(clean['models_all']) == 0
    assert len(clean['predictions']) == 0
    assert len(clean['metrics']) == 0
    assert len(clean['automl']) == 0
    for frame in [ds.train, ds.valid, ds.test]:
        assert frame_in_cluster(frame), "frame {} has been removed from cluster".format(frame.frame_id)
Example #27
def test_stacked_ensembles_are_trained_after_max_models():
    print(
        "Check that Stacked Ensembles are still trained after max models have been trained"
    )
    max_models = 5
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_SE_after_max_models",
                    seed=1,
                    max_models=max_models)
    aml.train(y=ds['target'], training_frame=ds['train'])

    stacked_ensembles = [
        m for m in
        [aml.leaderboard[i, 0] for i in range(0, (aml.leaderboard.nrows))]
        if m.startswith('StackedEnsemble')
    ]
    assert len(
        stacked_ensembles
    ) == 2, "StackedEnsemble should still be trained after max models have been reached"
Example #28
def automl_pojo():
    fr1 = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    fr1["CAPSULE"] = fr1["CAPSULE"].asfactor()
    aml = H2OAutoML(max_models=2, project_name="py_lb_test_aml1", seed=1234)
    aml.train(y="CAPSULE", training_frame=fr1)

    # download pojo
    if aml.leader.algo != "stackedensemble":
        model_zip_path = os.path.join(tempfile.mkdtemp(), 'model.zip')
        time0 = time.time()
        print("\nDownloading POJO @... " + model_zip_path)
        pojo_file = aml.download_pojo(model_zip_path)
        print("    => %s  (%d bytes)" %
              (pojo_file, os.stat(pojo_file).st_size))
        assert os.path.exists(pojo_file)
        print("    Time taken = %.3fs" % (time.time() - time0))
        assert os.path.isfile(model_zip_path)
        os.remove(model_zip_path)
Example #29
def iris_automl():

    df = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Split frames
    fr = df.split_frame(ratios=[.8,.1])

    # Set up train, validation, and test sets
    train = fr[0]
    valid = fr[1]
    test = fr[2]

    aml = H2OAutoML(max_runtime_secs=3, stopping_rounds=3, stopping_tolerance=0.001)

    print("AutoML (Multinomial) run with x not provided; uses train, valid, and leaderboard (test) frame")
    aml.train(y="class", training_frame=train, validation_frame=valid, leaderboard_frame=test)
    print(aml.leader)
    print(aml.leaderboard)
    assert set(aml.leaderboard.columns) == set(["model_id", "mean_per_class_error", "logloss", "rmse", "mse"])
Example #30
def automl_pojo():
    ds = import_dataset()
    aml = H2OAutoML(max_models=2,
                    project_name="py_lb_test_aml1",
                    exclude_algos=['XGBoost', 'StackedEnsemble'],  # no POJO export for XGB or SE
                    seed=1234)
    aml.train(y=ds.target, training_frame=ds.train)

    # download pojo
    model_zip_path = tempfile.mkdtemp()
    model_zip_file_path = os.path.join(model_zip_path, aml._leader_id + ".java")
    time0 = time.time()
    print("\nDownloading POJO @... " + model_zip_file_path)
    pojo_file = aml.download_pojo(model_zip_path)
    print("    => %s  (%d bytes)" % (pojo_file, os.stat(pojo_file).st_size))
    assert os.path.exists(pojo_file)
    print("    Time taken = %.3fs" % (time.time() - time0))
    assert os.path.isfile(model_zip_file_path)
    shutil.rmtree(model_zip_path)