예제 #1
0
def test_get_automl():
    """Check that an AutoML run re-fetched via ``get_automl`` matches the original.

    Trains a small AutoML run on the prostate dataset, fetches it back by
    project name, and compares the fetched object with the trained one
    through dict-style access, attribute-style access, predictions and
    ``get_leaderboard`` output.
    """
    target = 'CAPSULE'
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    # The response is converted to a factor before training.
    frame[target] = frame[target].asfactor()

    aml = H2OAutoML(project_name="test_get_automl", max_models=2, seed=1234)
    aml.train(y=target, training_frame=frame)

    get_aml = get_automl(aml.project_name)

    # Dict-style access on the fetched object.
    assert aml.project_name == get_aml["project_name"]
    assert aml.leader.model_id == get_aml["leader"].model_id
    trained_lb_data = aml.leaderboard.get_frame_data()
    assert trained_lb_data == get_aml["leaderboard"].get_frame_data()
    trained_log_data = aml.event_log.get_frame_data()
    assert trained_log_data == get_aml["event_log"].get_frame_data()
    assert aml.training_info == get_aml['training_info']

    # PUBDEV-6599: attribute-style access on the fetched object.
    assert aml.project_name == get_aml.project_name
    assert aml.leader.model_id == get_aml.leader.model_id
    assert aml.leaderboard.frame_id == get_aml.leaderboard.frame_id
    assert aml.event_log.frame_id == get_aml.event_log.frame_id
    assert aml.training_info == get_aml.training_info

    # Both objects must produce the same predictions.
    trained_preds = aml.predict(frame)
    fetched_preds = get_aml.predict(frame)
    assert (trained_preds == fetched_preds).all()

    # PUBDEV-7454: get_leaderboard accepts either object.
    default_lb_match = get_leaderboard(aml) == get_leaderboard(get_aml)
    assert default_lb_match.all()
    full_lb_match = get_leaderboard(aml, 'ALL') == get_leaderboard(get_aml, 'ALL')
    assert full_lb_match.all()
예제 #2
0
def test_get_automl():
    """Check that an AutoML run re-fetched via ``get_automl`` matches the original.

    Trains a small AutoML run on the dataset returned by ``import_dataset``,
    fetches it back by project name, and compares the fetched object with the
    trained one through dict-style access, attribute-style access,
    predictions on the test frame and ``get_leaderboard`` output.
    """
    dataset = import_dataset()
    aml = H2OAutoML(project_name="test_get_automl", max_models=2, seed=1234)
    aml.train(y=dataset.target, training_frame=dataset.train)

    get_aml = get_automl(aml.project_name)

    # Dict-style access on the fetched object.
    assert aml.project_name == get_aml["project_name"]
    assert aml.leader.model_id == get_aml["leader"].model_id
    trained_lb_data = aml.leaderboard.get_frame_data()
    assert trained_lb_data == get_aml["leaderboard"].get_frame_data()
    trained_log_data = aml.event_log.get_frame_data()
    assert trained_log_data == get_aml["event_log"].get_frame_data()
    assert aml.training_info == get_aml['training_info']

    # PUBDEV-6599: attribute-style access on the fetched object.
    assert aml.project_name == get_aml.project_name
    assert aml.leader.model_id == get_aml.leader.model_id
    assert aml.leaderboard.frame_id == get_aml.leaderboard.frame_id
    assert aml.event_log.frame_id == get_aml.event_log.frame_id
    assert aml.training_info == get_aml.training_info

    # Both objects must produce the same predictions.
    trained_preds = aml.predict(dataset.test)
    fetched_preds = get_aml.predict(dataset.test)
    assert (trained_preds == fetched_preds).all()

    # PUBDEV-7454: get_leaderboard accepts either object.
    default_lb_match = get_leaderboard(aml) == get_leaderboard(get_aml)
    assert default_lb_match.all()
    full_lb_match = get_leaderboard(aml, 'ALL') == get_leaderboard(get_aml, 'ALL')
    assert full_lb_match.all()
예제 #3
0
def test_custom_leaderboard():
    """Verify the columns ``get_leaderboard`` returns for various ``extra_columns``.

    Covers the default call, an empty list, ``'ALL'``, unknown names and
    explicit lists, then checks that the extended leaderboard's non-id
    columns are numeric and that the timing columns are non-negative /
    strictly positive.
    """
    print("Check custom leaderboard")
    ds = prepare_data('binomial')
    aml = H2OAutoML(project_name="py_aml_custom_lb_test",
                    max_models=5,
                    seed=automl_seed)
    aml.train(y=ds.target, training_frame=ds.train)

    base_cols = [
        "model_id",
        "auc",
        "logloss",
        "aucpr",
        "mean_per_class_error",
        "rmse",
        "mse",
    ]
    train_time = "training_time_ms"
    predict_time = "predict_time_per_row_ms"

    assert aml.leaderboard.names == base_cols
    assert get_leaderboard(aml).names == base_cols
    assert get_leaderboard(aml, extra_columns=[]).names == base_cols

    names_all = get_leaderboard(aml, extra_columns='ALL').names
    assert names_all == base_cols + [train_time, predict_time]

    # Unknown column names are expected to be dropped.
    names_unknown = get_leaderboard(aml, extra_columns="unknown").names
    assert names_unknown == base_cols

    names_single = get_leaderboard(aml, extra_columns=[train_time]).names
    assert names_single == base_cols + [train_time]

    # Extra columns are expected in the order they were requested.
    names_pair = get_leaderboard(
        aml, extra_columns=[predict_time, train_time]).names
    assert names_pair == base_cols + [predict_time, train_time]

    names_mixed = get_leaderboard(
        aml, extra_columns=["unknown", train_time]).names
    assert names_mixed == base_cols + [train_time]

    lb_ext = get_leaderboard(aml, extra_columns='ALL')
    print(lb_ext)
    # Every column after the first (model_id) must be numeric.
    numeric_flags = lb_ext[:, 1:].isnumeric()
    assert all(numeric_flags), "metrics and extension columns should all be numeric"
    assert (lb_ext[train_time].as_data_frame().values >= 0).all()
    assert (lb_ext[predict_time].as_data_frame().values > 0).all()
예제 #4
0
def test_optional_SEs_trained_by_default_when_no_time_limit():
    """Check which StackedEnsemble steps appear when only ``max_models`` is set.

    Trains AutoML with ``max_models=3`` and no runtime limit, then inspects
    the ``step`` values of leaderboard rows whose ``provider`` is
    ``StackedEnsemble``.
    """
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_SEs_with_no_time_limit",
                    seed=1,
                    max_models=3)
    aml.train(y=ds.target, training_frame=ds.train)

    leaderboard = get_leaderboard(aml, ['provider', 'step']).as_data_frame()
    se_rows = leaderboard.query("provider == 'StackedEnsemble'")
    se_steps = se_rows.step.to_list()
    assert len(se_steps) > 1

    # (step name, expected to be present, failure message), checked in order.
    expectations = [
        ('best_of_family_1', True, "default SE for first group is missing"),
        ('best_of_family_2', False, 'all other SEs should be optional ones'),
        ('all_1', False, 'all other SEs should be optional ones'),
        ('all_2', False, 'all other SEs should be optional ones'),
    ]
    for step_name, should_exist, message in expectations:
        assert (step_name in se_steps) == should_exist, message
예제 #5
0
def train(data_path, max_models, model_name):
    """Train an H2O AutoML model and record the run in MLflow.

    Args:
        data_path: Passed to ``prepare_data`` to load the data.
        max_models: Forwarded to ``H2OAutoML(max_models=...)`` and logged as
            an MLflow parameter.
        model_name: Name under which the leader model is registered
            (``registered_model_name`` of ``mlflow.h2o.log_model``).

    Side effects:
        Starts an MLflow run; logs a parameter, the leader's RMSE, several
        tags, and three artifacts written to the current working directory
        (``leaderboard.csv``, ``leaderboard.txt``, ``models.csv``); logs and
        registers the leader model.
    """
    # Use the function's own arguments rather than a global ``args`` so the
    # function works for any caller.
    # NOTE(review): prepare_data is assumed to return
    # (train frame, test frame, predictor column names) - confirm.
    train_data, test_data, train_cols = prepare_data(data_path)
    target_col = "quality"

    with mlflow.start_run() as run:
        print("run_id:", run.info.run_id)
        model = H2OAutoML(max_models=max_models,
                          max_runtime_secs=300,
                          seed=24,
                          nfolds=6)
        model.train(x=train_cols,
                    y=target_col,
                    training_frame=train_data,
                    validation_frame=test_data)
        mlflow.log_param("max_models", max_models)
        mlflow.log_metric("rmse", model.leader.rmse())

        mlflow.set_tag("mlflow_version", mlflow.__version__)
        mlflow.set_tag("h2o_version", h2o.__version__)
        mlflow.set_tag("model.leader.class", qname(model.leader.__class__))
        mlflow.set_tag("model.leader.estimator_type",
                       model.leader._estimator_type)
        mlflow.set_tag("num_leaderboard_models", model.leaderboard.nrows)

        lb = get_leaderboard(model, extra_columns='ALL')
        print(lb)

        # Artifact 1: the extended leaderboard exported as CSV.
        path = "leaderboard.csv"
        h2o.export_file(lb, path=path, force=True)
        mlflow.log_artifact(path)

        # Artifact 2: the same leaderboard rendered as a psql-style text table.
        from tabulate import tabulate
        df = lb.as_data_frame()
        table = tabulate(df, headers="keys", tablefmt="psql", showindex=False)
        path = "leaderboard.txt"
        with open(path, "w") as f:
            f.write(table)
        mlflow.log_artifact(path)

        # Artifact 3: just the model ids, one per line, without a header.
        df = df[["model_id"]]
        with open("models.csv", "w") as f:
            df.to_csv(f, index=False, header=False)
        mlflow.log_artifact("models.csv")

        mlflow.h2o.log_model(model.leader,
                             "h2o-model",
                             registered_model_name=model_name)
예제 #6
0
def test_smoke_automl():
    """Smoke-test a full AutoML run.

    Trains with a model budget large enough to run every step, then checks
    that ``train`` returns a ``ModelBase`` and that the leaderboard has more
    rows than ``max_models``.
    """
    nmodels = 20  # enough models to run every step (all base models, all grids, all SEs...)

    ds = import_dataset()
    automl_params = dict(
        project_name="py_aml_smoke",
        max_models=nmodels,
        nfolds=3,
        stopping_tolerance=0.5,
        stopping_rounds=2,
        seed=42,
        verbosity='debug',
    )
    aml = H2OAutoML(**automl_params)
    trained = aml.train(y=ds.target, training_frame=ds.train)

    assert isinstance(trained, ModelBase)
    extra_cols = ['algos', 'provider', 'step', 'group']
    leaderboard = get_leaderboard(aml, extra_cols)
    print(leaderboard)
    assert leaderboard.nrows > nmodels
def test_optional_SEs_not_trained_in_reproducible_mode():
    """Check the StackedEnsemble steps produced when ``max_models`` is set.

    Trains AutoML restricted to StackedEnsemble, GLM and GBM with both
    ``max_runtime_secs`` and ``max_models`` set, then asserts that exactly
    two SE steps are present: the final ``best_of_family_xglm`` and
    ``all_xglm``.
    """
    ds = import_dataset()
    aml = H2OAutoML(
        project_name="py_SEs_reproducible_mode",
        seed=1,
        max_runtime_secs=30,
        max_models=3,
        # 2 base model in group 1, 1 in group 2
        include_algos=['StackedEnsemble', 'GLM', 'GBM'],
    )
    aml.train(y=ds.target, training_frame=ds.train)

    leaderboard = get_leaderboard(aml, ['provider', 'step']).as_data_frame()
    print(leaderboard)
    se_rows = leaderboard.query("provider == 'StackedEnsemble'")
    se_steps = se_rows.step.to_list()
    assert len(se_steps) == 2

    # (step name, expected to be present, failure message), checked in order.
    expectations = [
        ('best_of_family_1', False,
         "no SE should be built for first group (sequential reproducible mode)"),
        ('best_of_family_2', False,
         "no SE should be built for second group (sequential reproducible mode)"),
        ('best_of_family_3', False,
         "no SE should be built for third group (sequential reproducible mode)"),
        ('best_of_family_xglm', True, "final SE is missing"),
        ('all_xglm', True, "final SE is missing"),
        ('best_of_family_gbm', False,
         'no optional SE should be trained (sequential reproducible mode)'),
    ]
    for step_name, should_exist, message in expectations:
        assert (step_name in se_steps) == should_exist, message
예제 #8
0
def test_custom_leaderboard_as_method():
    """Check that the ``get_leaderboard`` method matches the module-level function.

    Compares the method's output with ``aml.leaderboard`` and with
    ``get_leaderboard(aml, extra_columns='ALL')``, for both the trained
    object and one re-fetched through ``get_automl``.
    """
    ds = import_dataset('binary')
    aml = H2OAutoML(project_name="py_aml_custom_lb_method_test",
                    max_models=5,
                    seed=42)
    aml.train(y=ds.target, training_frame=ds.train)

    # Method on the trained object.
    method_default = aml.get_leaderboard().as_data_frame()
    assert_frame_equal(method_default, aml.leaderboard.as_data_frame())
    lb_ext = get_leaderboard(aml, extra_columns='ALL')
    method_all = aml.get_leaderboard('ALL').as_data_frame()
    assert_frame_equal(method_all, lb_ext.as_data_frame())

    # Same checks on the object fetched back by project name.
    aml2 = get_automl(aml.project_name)
    fetched_default = aml2.get_leaderboard().as_data_frame()
    assert_frame_equal(fetched_default, aml.leaderboard.as_data_frame())
    fetched_all = aml2.get_leaderboard('ALL').as_data_frame()
    assert_frame_equal(fetched_all, lb_ext.as_data_frame())
def test_optional_SEs_trained_in_non_reproducible_mode():
    """Check the StackedEnsemble steps produced when only a time limit is set.

    Trains AutoML restricted to StackedEnsemble, GLM and DRF with
    ``max_runtime_secs`` but no ``max_models``, then asserts which SE steps
    are present on the leaderboard.
    """
    ds = import_dataset()
    aml = H2OAutoML(
        project_name="py_SEs_non_reproducible_mode",
        seed=1,
        max_runtime_secs=30,
        # 1 base model in each group: 1, 2, 3
        include_algos=['StackedEnsemble', 'GLM', 'DRF'],
    )
    aml.train(y=ds.target, training_frame=ds.train)

    leaderboard = get_leaderboard(aml, ['provider', 'step']).as_data_frame()
    print(leaderboard)
    se_rows = leaderboard.query("provider == 'StackedEnsemble'")
    se_steps = se_rows.step.to_list()
    assert len(se_steps) > 2

    # (step name, expected to be present, failure message), checked in order.
    expectations = [
        ('best_of_family_1', False,
         "no SE should be built for first group (1 base nodel only)"),
        ('best_of_family_2', True, 'SE best_of_family from group 2 is missing'),
        ('best_of_family_3', True, 'SE best_of_family from group 3 is missing'),
        ('best_of_family_4', False, 'all other SEs should be optional ones'),
        ('all_1', False, 'all other SEs should be optional ones'),
        ('all_2', False, 'all other SEs should be optional ones'),
        ('all_3', False, 'all other SEs should be optional ones'),
        ('best_of_family_gbm', True,
         'optional SE best_of_family should have been trained'),
    ]
    for step_name, should_exist, message in expectations:
        assert (step_name in se_steps) == should_exist, message
예제 #10
0
def test_custom_leaderboard():
    """Verify the columns ``get_leaderboard`` returns for various ``extra_columns``.

    Covers the default call, an empty list, ``'ALL'`` (which here also
    includes ``algo``), unknown names and explicit lists, then validates the
    contents of the extended leaderboard.
    """
    print("Check custom leaderboard")
    ds = import_dataset('binary')
    aml = H2OAutoML(project_name="py_aml_custom_lb_test",
                    max_models=5,
                    seed=42)
    aml.train(y=ds.target, training_frame=ds.train)

    base_cols = [
        "model_id",
        "auc",
        "logloss",
        "aucpr",
        "mean_per_class_error",
        "rmse",
        "mse",
    ]
    train_time = "training_time_ms"
    predict_time = "predict_time_per_row_ms"

    assert aml.leaderboard.names == base_cols
    assert get_leaderboard(aml).names == base_cols
    assert get_leaderboard(aml, extra_columns=[]).names == base_cols

    names_all = get_leaderboard(aml, extra_columns='ALL').names
    assert names_all == base_cols + [train_time, predict_time, "algo"]

    # Unknown column names are expected to be dropped.
    names_unknown = get_leaderboard(aml, extra_columns="unknown").names
    assert names_unknown == base_cols

    names_single = get_leaderboard(aml, extra_columns=[train_time]).names
    assert names_single == base_cols + [train_time]

    # Extra columns are expected in the order they were requested.
    names_pair = get_leaderboard(
        aml, extra_columns=[predict_time, train_time]).names
    assert names_pair == base_cols + [predict_time, train_time]

    names_mixed = get_leaderboard(
        aml, extra_columns=["unknown", train_time]).names
    assert names_mixed == base_cols + [train_time]

    lb_ext = get_leaderboard(aml, extra_columns='ALL')
    print(lb_ext)
    # Everything except the model id and the algo name must be numeric.
    numeric_cols = [
        col for col in lb_ext.columns if col not in ("model_id", "algo")
    ]
    numeric_flags = lb_ext[:, numeric_cols].isnumeric()
    assert all(numeric_flags), "metrics and extension columns should all be numeric"
    assert (lb_ext[train_time].as_data_frame().values >= 0).all()
    assert (lb_ext[predict_time].as_data_frame().values > 0).all()
    known_algos = [
        "DRF", "DeepLearning", "GBM", "GLM", "StackedEnsemble", "XGBoost"
    ]
    assert (lb_ext["algo"].as_data_frame().isin(known_algos).all().all())
from h2o.automl import H2OAutoML, get_leaderboard

# Connect to (or start) an H2O cluster before any frames or models are used.
h2o.init()

# Use every column except the response as a predictor.
# NOTE(review): `train` (and `test` below) are not defined in this snippet;
# presumably they are H2OFrames loaded earlier - confirm.
x = train.columns
y = "Label"
x.remove(y)

# Run AutoML with a budget of 20 models and a fixed seed.
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=train)

# AutoML Leaderboard
lb = aml.leaderboard

# Optionally add extra model information to the leaderboard
lb = get_leaderboard(aml, extra_columns='ALL')

# Print all rows (instead of default 10 rows)
lb.head(rows=lb.nrows)

# The leader model is stored here
aml.leader

# If you need to generate predictions on a test set, you can make
# predictions directly on the `"H2OAutoML"` object, or on the leader
# model object directly

preds = aml.predict(test)

# or:
preds = aml.leader.predict(test)
예제 #12
0
    'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday',
    'Dept', 'A', 'B', 'C', 'Day', 'Month', 'Year'
]
# Response column for the AutoML run.
Y = 'Weekly_Sales'

# Restrict AutoML to these algorithm families.
algos = ['DRF', 'XGBoost', 'GBM', 'DeepLearning', 'StackedEnsemble']
#aml = H2OAutoML(max_models=30, max_runtime_secs=300, seed=1)
aml = H2OAutoML(max_runtime_secs=300, seed=1, include_algos=algos)
# NOTE(review): `train`, `test` and `X` are defined outside this fragment;
# `train`/`test` are presumably pandas DataFrames - confirm.
h2o_frame = h2o.H2OFrame(train)
aml.train(x=X, y=Y, training_frame=h2o_frame)

# AutoML Leaderboard
lb1 = aml.leaderboard.as_data_frame()

# Optionally add extra model information to the leaderboard
lb = get_leaderboard(aml, extra_columns='ALL').as_data_frame()

# Preview the first rows of the extended leaderboard. `lb` is already the
# result of `as_data_frame()`, so it must not be converted a second time.
lb.head()

# The leader model is stored here
aml.leader

# Score the held-out data with the leader model.
h2o_frame_test = h2o.H2OFrame(test)
preds = aml.predict(h2o_frame_test)
perf = aml.leader.model_performance(h2o_frame_test)

#################################################


def plot_corr_vars(df):
예제 #13
0
 def view_leaderboard(self, auto_ml):
     """Log the full extended leaderboard and show the leader's performance.

     Args:
         auto_ml: AutoML object passed to ``get_leaderboard``; its ``leader``
             is handed to ``h2o_util.show_model_performance``.
     """
     extended_lb = get_leaderboard(auto_ml, extra_columns='ALL')
     # Request every row rather than head()'s default.
     all_rows = extended_lb.head(rows=extended_lb.nrows)
     logger.info('Leaderboard: \n{}'.format(all_rows))
     h2o_util.show_model_performance(auto_ml.leader)