Example No. 1
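# Hedged preamble (not part of the original snippets): the examples below assume
# roughly the following imports. Module paths follow the h2o-3 Python package;
# the pyunit_utils import depends on the h2o-3 test-suite layout and is an
# assumption here.
import math
from collections import OrderedDict
from platform import platform

import h2o
from h2o.automl import H2OAutoML
from h2o.estimators import (H2OAutoEncoderEstimator, H2ODeepLearningEstimator,
                            H2OGeneralizedLinearEstimator,
                            H2OGradientBoostingEstimator,
                            H2ORandomForestEstimator,
                            H2OStackedEnsembleEstimator, H2OXGBoostEstimator)
from h2o.exceptions import H2OResponseError
from h2o.grid.grid_search import H2OGridSearch
from tests import pyunit_utils
from tests import pyunit_utils as pu  # one snippet below uses this shorter alias
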
def glm_grid_search_on_weights():
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    train = train.drop("ID")
    train["CAPSULE"] = train["CAPSULE"].asfactor()

    response = "CAPSULE"
    features = list(train.col_names)
    features.remove(response)

    train['wt_2'] = (train["CAPSULE"] == "1").ifelse(2, 1)
    train['wt_100'] = (train['CAPSULE'] == "1").ifelse(100, 1)

    hyper_parameters = OrderedDict()
    hyper_parameters["weights_column"] = ["wt_2", "wt_100"]
    print("GLM grid with the following hyper_parameters:", hyper_parameters)

    gs = H2OGridSearch(H2OGeneralizedLinearEstimator,
                       hyper_params=hyper_parameters)
    gs.train(x=features, y=response, training_frame=train)
    for m in gs.get_grid().models:
        used_features = [x[1] for x in m.varimp()]
        assert "wt_2" not in used_features
        assert "wt_100" not in used_features
    loglosses = gs.sorted_metric_table()["logloss"]
    assert loglosses.nunique() == 2  # models are not identical (=> weights are considered)
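    # Hedged follow-up (not in the original snippet): the better-scoring of the
    # two weighted models could be pulled from the same grid via the sorting API
    # used elsewhere in these examples.
    best = gs.get_grid(sort_by="logloss", decreasing=False).models[0]
    print(best.model_id, best.logloss())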
Example No. 2
def model_build():

    bc_data_set1 = "/home/wso2123/My Work/Datasets/Breast cancer wisconsin/data.csv"
    bc_data_train_dataset = "/home/wso2123/My Work/Datasets/Breast cancer wisconsin/uncorrected_train.csv"
    bc_data_validate_dataset = "/home/wso2123/My Work/Datasets/Breast cancer wisconsin/validate.csv"
    bc_data_test_dataset = "/home/wso2123/My Work/Datasets/Breast cancer wisconsin/test.csv"

    train_data = h2o.import_file(bc_data_train_dataset)
    validate_data = h2o.import_file(bc_data_validate_dataset)
    test_data = h2o.import_file(bc_data_test_dataset)

    #
    # Train deep autoencoder learning model on "normal"
    # training data, y ignored
    #
    hyper_parameters = {
        'hidden': list(range(10, 30)),
        'activation': [
            "tanh", "tanh_with_dropout", "rectifier", "rectifier_with_dropout",
            "maxout", "maxout_with_dropout"
        ]
    }
    grid_search = H2OGridSearch(H2ODeepLearningEstimator,
                                hyper_params=hyper_parameters)
    grid_search.train(x=train_data.names,
                      training_frame=train_data,
                      validation_frame=validate_data)
    grid_search.show()
    v_frame = grid_search.varimp(True)
    print(v_frame)
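
    # Hedged sketch (not in the original): the comment above mentions a deep
    # autoencoder, but the grid uses a plain supervised H2ODeepLearningEstimator;
    # an unsupervised variant of the same search would set autoencoder=True and
    # train on x only, e.g.:
    ae_grid = H2OGridSearch(H2ODeepLearningEstimator(autoencoder=True),
                            hyper_params=hyper_parameters)
    ae_grid.train(x=train_data.names,
                  training_frame=train_data,
                  validation_frame=validate_data)
    ae_grid.show()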
def test_stackedensemble_respects_the_max_runtime_secs():
    max_runtime_secs = 1
    hyper_parameters = dict()
    hyper_parameters["ntrees"] = [1, 2, 3, 4, 5]
    params = dict(
        fold_assignment="modulo",
        nfolds=3
    )

    data = prepare_data()

    gs1 = H2OGridSearch(H2OGradientBoostingEstimator(**params), hyper_params=hyper_parameters)
    gs1.train(data.x, data.y, data.train, validation_frame=data.train)

    big_blending_frame = data.train
    for i in range(15):
        big_blending_frame = big_blending_frame.rbind(big_blending_frame)

    se = H2OStackedEnsembleEstimator(
        base_models=gs1.model_ids,
        max_runtime_secs=max_runtime_secs,
        blending_frame=big_blending_frame)
    try:
        se.train(data.x, data.y, data.train)
        assert False, "This should have failed due to time out."
    except H2OResponseError:
        pass
def test_stackedensemble_propagates_the_max_runtime_secs():
    max_runtime_secs = 5
    hyper_parameters = dict()
    hyper_parameters["ntrees"] = [1, 3, 5]
    params = dict(
        fold_assignment="modulo",
        nfolds=3,
        keep_cross_validation_predictions=True
    )

    data = prepare_data()

    gs1 = H2OGridSearch(H2OGradientBoostingEstimator(**params), hyper_params=hyper_parameters)
    gs1.train(data.x, data.y, data.train, validation_frame=data.train)

    se = H2OStackedEnsembleEstimator(base_models=[gs1], max_runtime_secs=max_runtime_secs)
    se.train(data.x, data.y, data.train)
    metalearner = h2o.get_model(se.metalearner()["name"])

    # metalearner has the set max_runtime_secs
    assert metalearner.actual_params['max_runtime_secs'] <= max_runtime_secs
    assert metalearner.actual_params['max_runtime_secs'] > 0

    # stacked ensemble has the set max_runtime_secs
    assert se.max_runtime_secs == max_runtime_secs
def _prepare_test_env():
    hyper_parameters = dict()
    hyper_parameters["ntrees"] = [1, 3, 5]
    params = dict(fold_assignment="modulo",
                  nfolds=3,
                  keep_cross_validation_predictions=True)

    data = prepare_data()

    drf = H2ORandomForestEstimator(**params)
    drf.train(data.x, data.y, data.train, validation_frame=data.train)

    gs1 = H2OGridSearch(H2OGradientBoostingEstimator(**params),
                        hyper_params=hyper_parameters)
    gs1.train(data.x, data.y, data.train, validation_frame=data.train)

    gs2 = H2OGridSearch(H2ORandomForestEstimator(**params),
                        hyper_params=hyper_parameters)
    gs2.train(data.x, data.y, data.train, validation_frame=data.train)

    return dict(data=data, drf=drf, gs1=gs1, gs2=gs2)
Example No. 6
def test_mean_per_class_error_grid():
    gbm = H2OGradientBoostingEstimator(nfolds=3, fold_assignment="Random", seed=1234)
    cars = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars["cylinders"] = cars["cylinders"].asfactor()
    r = cars[0].runif(seed=1234)
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "cylinders"
    predictors = ["displacement","power","weight","acceleration","year"]
    gbm.distribution="multinomial"
    
    ## Early stopping
    gbm.stopping_rounds=2
    gbm.stopping_metric="mean_per_class_error"
    gbm.ntrees=10000
    gbm.max_depth=3
    gbm.min_rows=1
    gbm.learn_rate=0.01
    gbm.score_tree_interval=1
    gbm.nfolds=None
    gbm.fold_assignment=None
    gbm.train(x=predictors,y=response_col, training_frame=train, validation_frame=valid)
    print(gbm)
    print(gbm.scoring_history())


    ## Grid search
    hyper_params_tune = {
        'max_depth': list(range(1, 10 + 1, 1)),
        'sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate_per_tree': [x / 100. for x in range(20, 101)],
        'col_sample_rate_change_per_level': [x / 100. for x in range(90, 111)],
        'min_rows': [2**x for x in range(0, int(math.log(train.nrow, 2) - 2) + 1)],
        'nbins': [2**x for x in range(4, 11)],
        'nbins_cats': [2**x for x in range(4, 13)],
        'min_split_improvement': [0, 1e-8, 1e-6, 1e-4],
        'histogram_type': ["UniformAdaptive", "QuantilesGlobal", "RoundRobin"]
    }

    search_criteria_tune = {
        'strategy': "RandomDiscrete",
        'max_runtime_secs': 600,  ## limit the runtime to 10 minutes
        'max_models': 10,
        'seed': 1234,
        'stopping_rounds': 5,
        'stopping_metric': "mean_per_class_error",
        'stopping_tolerance': 1e-3
    }

    grid = H2OGridSearch(H2OGradientBoostingEstimator,
                         hyper_params=hyper_params_tune,
                         search_criteria=search_criteria_tune)
    grid.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid,
               distribution="multinomial", seed=1234,
               stopping_rounds=10, stopping_metric="mean_per_class_error", stopping_tolerance=1e-3)

    print(grid) ## sorted by logloss
    print(grid.get_grid("mean_per_class_error"))
Example No. 7
    def search(self, score_cutoff, param_space,
               rand_seed, n_models,
               const_params, cv_folds, training_frame,
               model_directory, predictors, response):
        if ("Windows" in platform()) and (self.estimator == H2OXGBoostEstimator):
            incompatible_message = "Windows currently doesn't support H2OXGBoostEstimator. " \
                                   "No xgboost models will be trained."
            self.logger.info(incompatible_message)
            print(incompatible_message)
            return
        criteria = {
            'strategy': 'RandomDiscrete',
            'max_models': n_models,
            'seed': rand_seed,
            # limit the runtime to self.max_minutes minutes
            'max_runtime_secs': self.max_minutes * 60,
            # early stopping once the leaderboard of the top 5 models has converged within a 0.1% relative difference
            'stopping_rounds': 5,
            'stopping_metric': self.eval_metric,
            'stopping_tolerance': 1e-3
        }
        # Required for H2OStackedEnsembleEstimator
        const_params.update({
            'nfolds': cv_folds,
            'keep_cross_validation_predictions': True,
            'fold_assignment': "Modulo",
            'seed': rand_seed
        })
        grid = H2OGridSearch(model=self.estimator(**const_params),
                             grid_id=self.name + '_grid',
                             hyper_params=param_space,
                             search_criteria=criteria)
        self.logger.info("Training {} models ...".format(self.name))
        # grid.train(x=X, y=Y, nfolds=configuration.CV_FOLDS, seed=rand_seed, training_frame=credit_data)
        try:
            grid.train(x=predictors, y=response, training_frame=training_frame)
        except H2OResponseError:
            self.logger.error('Encountered server error. Skipping ' + self.name)
            return
        self.logger.info("Finished training {} models.".format(self.name))
        # Get the grid results, sorted
        results = grid.get_grid(sort_by=self.eval_metric, decreasing=True)

        for x in results:
            print(get_model_cv_metric(x, self.eval_metric))

        high_scoring = [model for model in results if get_model_cv_metric(model, self.eval_metric) > score_cutoff]
        if not high_scoring:
            self.logger.info('Failed to find models that meet the cut off.')
            return
        self.log_training_results(results=results, search_grid=param_space)
        self.save_model_list(model_lst=high_scoring, seed=rand_seed, directory=model_directory)
Example No. 8
def test_train_returns_the_trained_models():
    fr = h2o.import_file(path=pu.locate("smalldata/prostate/prostate.csv"))
    target = "CAPSULE"
    fr[target] = fr[target].asfactor()
    
    grid = H2OGridSearch(
        H2OGradientBoostingEstimator,
        dict(
            ntrees=[5, 10],
            learn_rate=[0.1, 0.5]
        )
    )
    result = grid.train(y=target, training_frame=fr)
    assert isinstance(result, H2OGridSearch)
    assert result is grid
    result.predict(fr)
Example No. 9
def test_make_leaderboard_with_leaderboard_frame():
    train = h2o.upload_file(
        pyunit_utils.locate("smalldata/titanic/titanic_expanded.csv"))
    train["name"] = train["name"].asfactor()
    y = "fare"

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    aml2 = H2OAutoML(seed=134, max_models=5)
    aml2.train(y=y, training_frame=train)

    grid = H2OGridSearch(H2OGradientBoostingEstimator(),
                         hyper_params={"ntrees": [1, 2, 3]})
    grid.train(y=y, training_frame=train)
    # with leaderboard frame
    expected_cols = ("model_id", "rmse", "mse", "mae", "rmsle",
                     "mean_residual_deviance", "training_time_ms",
                     "predict_time_per_row_ms", "algo")
    ldb = h2o.make_leaderboard(aml, train, extra_columns="ALL")

    for c in expected_cols:
        assert c in ldb.columns

    for score_data in ("AUTO", "xval", "valid", "train"):
        assert h2o.make_leaderboard(aml, train,
                                    scoring_data=score_data).nrow > 0
        assert h2o.make_leaderboard(
            [aml, aml2], train, scoring_data=score_data).nrow > 0
        assert h2o.make_leaderboard(grid, scoring_data=score_data).nrow > 0
        assert h2o.make_leaderboard(
            [aml, grid, aml2.leader], train, scoring_data=score_data).nrow > 0

    # extra columns
    for ec in ("training_time_ms", "predict_time_per_row_ms", "algo"):
        assert ec in h2o.make_leaderboard(grid, train,
                                          extra_columns=ec).columns

    # extra columns without leaderboard frame
    for ec in ("training_time_ms", "algo"):
        assert ec in h2o.make_leaderboard(grid, extra_columns=ec).columns

    # sort metrics
    for sm in ("rmse", "mse", "mae", "rmsle", "mean_residual_deviance"):
        assert h2o.make_leaderboard(grid, train,
                                    sort_metric=sm).columns[1] == sm
Example No. 10
def grid_search(training_df, attribute_property_length):
    h2o.init()
    h2o.connect()
    training_array = training_df.values
    x = training_array[:, 0:attribute_property_length]
    y = training_array[:, attribute_property_length - 1]
    tr_df = h2o.H2OFrame(x)
    training_columns = [
        'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10'
    ]
    response_column = 'C11'
    hyper_parameters = {'ntrees': [15, 20, 25], 'max_depth': [15, 20]}
    random_plus_manual = H2OGridSearch(
        H2ORandomForestEstimator(nfolds=n_splits), hyper_parameters)
    random_plus_manual.train(x=training_columns,
                             y=response_column,
                             training_frame=tr_df)
    random_plus_manual.show()
def train_base_models(data):
    grid = H2OGridSearch(
        H2OGradientBoostingEstimator,
        search_criteria=dict(
            strategy='RandomDiscrete',
            max_models=5,
            seed=seed,
        ),
        hyper_params=dict(
            learn_rate=[0.5, 0.8, 1.0],
            max_depth=[2, 3, 4, 5],
            ntrees=[5, 10, 15],
        ),
    )
    grid.train(data.x,
               data.y,
               data.train,
               nfolds=5,
               fold_assignment='Modulo',
               keep_cross_validation_predictions=True)
    return grid.models
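
# Hedged usage sketch (not from the original source): the base models above keep
# cross-validation predictions with a modulo fold assignment, so they could feed
# a stacked ensemble as in the earlier stacked-ensemble snippets.
data = prepare_data()
base_models = train_base_models(data)
ensemble = H2OStackedEnsembleEstimator(base_models=base_models)
ensemble.train(data.x, data.y, data.train)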
Example No. 12
def test_make_leaderboard_without_leaderboard_frame():
    train = h2o.upload_file(
        pyunit_utils.locate("smalldata/titanic/titanic_expanded.csv"))
    train["name"] = train["name"].asfactor()
    y = "fare"

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    aml2 = H2OAutoML(seed=134, max_models=5)
    aml2.train(y=y, training_frame=train)

    grid = H2OGridSearch(H2OGradientBoostingEstimator(),
                         hyper_params={"ntrees": [1, 2, 3]})
    grid.train(y=y, training_frame=train)

    assert h2o.make_leaderboard(aml).nrow > 0
    # creating the same leaderboard doesn't end up with duplicate models
    assert h2o.make_leaderboard(aml).nrow == h2o.make_leaderboard(aml).nrow
    assert h2o.make_leaderboard(grid).nrow > 0
    assert h2o.make_leaderboard([aml, aml2, grid, aml.leader]).nrow > 0

    # without leaderboard frame
    for score_data in ("AUTO", "xval", "valid", "train"):
        assert h2o.make_leaderboard(aml, scoring_data=score_data).nrow > 0
        assert h2o.make_leaderboard([aml, aml2],
                                    scoring_data=score_data).nrow > 0
        assert h2o.make_leaderboard(grid, scoring_data=score_data).nrow > 0
        assert h2o.make_leaderboard([aml, grid, aml2.leader],
                                    scoring_data=score_data).nrow > 0

    try:
        print(
            h2o.make_leaderboard(aml, extra_columns="predict_time_per_row_ms"))
        assert False, "Should fail - Cannot calculate the predict time without leaderboard frame"
    except h2o.exceptions.H2OResponseError:
        pass
Example No. 13
def grid_metric_accessors():

    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]

    # regression
    response_col = "economy"
    distribution = "gaussian"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]

    gbm = H2OGradientBoostingEstimator(nfolds=3,
                                       distribution=distribution,
                                       fold_assignment="Random")
    gbm_grid = H2OGridSearch(gbm, hyper_params=dict(ntrees=[1, 2, 3]))
    gbm_grid.train(x=predictors,
                   y=response_col,
                   training_frame=train,
                   validation_frame=valid)

    # using list from http://docs.h2o.ai/h2o/latest-stable/h2o-docs/performance-and-prediction.html#regression
    for metric in ['r2', 'mse', 'rmse', 'rmsle', 'mae']:
        val = getattr(gbm_grid, metric)()
        assert isinstance(val, dict)
        for v in val.values():
            assert isinstance(v, float), \
                "expected a float for metric {} but got {}".format(metric, v)

    # binomial
    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "economy_20mpg"
    distribution = "bernoulli"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm = H2OGradientBoostingEstimator(nfolds=3,
                                       distribution=distribution,
                                       fold_assignment="Random")
    gbm_grid = H2OGridSearch(gbm, hyper_params=dict(ntrees=[1, 2, 3]))
    gbm_grid.train(x=predictors,
                   y=response_col,
                   training_frame=train,
                   validation_frame=valid)

    # using list from http://docs.h2o.ai/h2o/latest-stable/h2o-docs/performance-and-prediction.html#classification
    # + common ones
    for metric in ['gini', 'logloss', 'auc', 'aucpr', 'mse', 'rmse']:
        val = getattr(gbm_grid, metric)()
        assert isinstance(val, dict)
        for v in val.values():
            assert isinstance(v, float), \
                "expected a float for metric {} but got {}".format(metric, v)

    for metric in [
            'mcc', 'F1', 'F0point5', 'F2', 'accuracy', 'mean_per_class_error'
    ]:
        val = getattr(gbm_grid, metric)()
        assert isinstance(val, dict)
        for v in val.values():
            assert isinstance(v[0][1], float), \
                "expected a float for metric {} but got {}".format(metric, v)

    # multinomial
    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars["cylinders"] = cars["cylinders"].asfactor()
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "cylinders"
    distribution = "multinomial"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm = H2OGradientBoostingEstimator(nfolds=3,
                                       distribution=distribution,
                                       fold_assignment="Random")
    gbm_grid = H2OGridSearch(gbm, hyper_params=dict(ntrees=[1, 2, 3]))
    gbm_grid.train(x=predictors,
                   y=response_col,
                   training_frame=train,
                   validation_frame=valid)

    # using list from http://docs.h2o.ai/h2o/latest-stable/h2o-docs/performance-and-prediction.html#classification
    # + common ones
    for metric in ['logloss', 'mse', 'rmse', 'mean_per_class_error']:
        val = getattr(gbm_grid, metric)()
        assert isinstance(val, dict)
        for v in val.values():
            assert isinstance(v, float), \
                "expected a float for metric {} but got {}".format(metric, v)
Example No. 14
h2o.export_file(hTrain, "hTrainMy.csv", force=True)
h2o.export_file(hValidate, "hValidateMy.csv", force=True)
h2o.export_file(hTesting, "hTestingMy.csv", force=True)

training_columns = list(pData.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')

response_column = 'RUL'

hyper_parameters = {
    'activation': [
        'Tanh', 'TanhWithDropout', 'Rectifier', 'RectifierWithDropout',
        'Maxout', 'MaxoutWithDropout'
    ],
    'epochs': [10, 50, 100],
    'hidden': [32, 64, 128, 256, 512, 1024]
}

grid_search = H2OGridSearch(H2ODeepLearningEstimator,
                            hyper_params=hyper_parameters)
grid_search.train(x=training_columns,
                  y='RUL',
                  training_frame=hTrain,
                  validation_frame=hValidate)
grid_search.show()
models = grid_search.sort_by("mse")
print(models)
h2o.export_file(hTrain, "hTrainMy.csv", force=True)
h2o.export_file(hValidate, "hValidateMy.csv", force=True)
h2o.export_file(hTesting, "hTestingMy.csv", force=True)

training_columns = list(pData.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')

response_column = 'RUL'

hyper_parameters = {
    'distribution': [
        'auto', 'bernoulli', 'multinomial', 'gaussian', 'poisson', 'gamma',
        'tweedie', 'laplace', 'quantile', 'huber'
    ],
    'fold_assignment': ['auto', 'random', 'modulo', 'stratified'],
    'histogram_type':
    ['auto', 'uniform_adaptive', 'random', 'quantiles_global', 'round_robin']
}

grid_search = H2OGridSearch(H2OGradientBoostingEstimator,
                            hyper_params=hyper_parameters)
grid_search.train(x=training_columns,
                  y='RUL',
                  training_frame=hTrain,
                  validation_frame=hValidate)
grid_search.show()
models = grid_search.sort_by("mse")
print(models)
Example No. 16
# Split data into training and validation
hTrain, hValidate = hData.split_frame(ratios=[0.8])

h2o.export_file(hTrain, "hTrainMy.csv", force=True)
h2o.export_file(hValidate, "hValidateMy.csv", force=True)
h2o.export_file(hTesting, "hTestingMy.csv", force=True)

training_columns = list(pData.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')

response_column = 'RUL'

hyper_parameters = {
    'ntrees': [50, 75, 100],
    'max_depth': [20, 50],
    'nbins': [100, 250]
}

grid_search = H2OGridSearch(H2ORandomForestEstimator,
                            hyper_params=hyper_parameters)
grid_search.train(x=training_columns,
                  y='RUL',
                  training_frame=hTrain,
                  validation_frame=hValidate)
grid_search.show()
models = grid_search.sort_by("mse")
print(models)
Example No. 17
    def test_pubdev_6416(self):
        # Attempt to add a model to the grid by specifying invalid hyperparameters search range.
        # Should fail and generate error
        data = h2o.import_file(
            pyunit_utils.locate('smalldata/iris/iris_train.csv'))
        hyper_params = {
            'max_depth': [8],
            'sample_rate': [.9],
            'col_sample_rate': [.9],
            'col_sample_rate_per_tree': [.9],
            'col_sample_rate_change_per_level': [.9],
            'min_rows': [5000000],  # Invalid hyperparameter
            'min_split_improvement': [1e-4],
            'histogram_type': ["UniformAdaptive"]
        }

        search_criteria = {
            'strategy': "RandomDiscrete",
            'max_runtime_secs': 3600,
            'max_models': 1,
            'seed': 12345,
            'stopping_rounds': 5,
            'stopping_metric': "MSE",
            'stopping_tolerance': 1e-3
        }

        gbm = H2OGradientBoostingEstimator(distribution='multinomial',
                                           ntrees=5,
                                           learn_rate=0.05,
                                           score_tree_interval=5,
                                           seed=1,
                                           stopping_rounds=5,
                                           stopping_metric="MSE",
                                           stopping_tolerance=1e-4)

        grid = H2OGridSearch(gbm,
                             hyper_params=hyper_params,
                             grid_id="grid_pubdev6416",
                             search_criteria=search_criteria)

        with self.assertRaises(ValueError) as err:
            grid.train(x=["sepal_len", "sepal_wid"],
                       y="species",
                       max_runtime_secs=3600,
                       training_frame=data)
        # During the first search, the error should be present
        assert "Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=5000000.0: must have at least 1.0E7 (weighted) rows" \
               in str(err.exception)
        assert len(grid.models) == 0

        hyper_params = {
            'max_depth': [8],
            'sample_rate': [.9],
            'col_sample_rate': [.9],
            'col_sample_rate_per_tree': [.9],
            'col_sample_rate_change_per_level': [.9],
            'min_rows': [10],
            'min_split_improvement': [1e-4],
            'histogram_type': ["UniformAdaptive"]
        }
        gbm = H2OGradientBoostingEstimator(distribution='multinomial',
                                           ntrees=5,
                                           learn_rate=0.05,
                                           learn_rate_annealing=0.99,
                                           score_tree_interval=5,
                                           seed=1,
                                           stopping_rounds=5,
                                           stopping_metric="MSE",
                                           stopping_tolerance=1e-4)

        grid = H2OGridSearch(gbm,
                             hyper_params=hyper_params,
                             grid_id="grid_pubdev6416",
                             search_criteria=search_criteria)

        grid.train(x=["sepal_len", "sepal_wid"],
                   y="species",
                   max_runtime_secs=3600,
                   training_frame=data)

        # Assert the model is actually trained and added to the grid, not affected by previous exceptions
        assert len(grid.models) == 1
Example No. 18
# Grid search param options:
# http://docs.h2o.ai/h2o/latest-stable/h2o-docs/grid-search.html#xgboost-hyperparameters

gbm_params2 = {
    'learn_rate': [i * 0.01 for i in range(1, 14)],
    'max_depth': list(range(6, 11)),
    'sample_rate': [i * 0.1 for i in range(4, 11)],
    'col_sample_rate': [i * 0.1 for i in range(4, 11)]
}

# Train and validate a random grid of GBMs
gbm_grid2 = H2OGridSearch(
    model=H2OGradientBoostingEstimator(ntrees=max_trees,
                                       stopping_rounds=3,
                                       stopping_tolerance=stop_tol,
                                       keep_cross_validation_predictions=True),
    hyper_params=gbm_params2,
    search_criteria=search_criteria)
print("grid searching gbm")
gbm_grid2.train(x=train_cols,
                y='Dry_Yield',
                training_frame=df,
                fold_column='YearID_KFold')
print(gbm_grid2)
gbm_best = gbm_grid2.get_grid(sort_by='mae').models[0]
print(gbm_best)
gbm_ids = [m.model_id for m in gbm_grid2.models[:stack_top_n_grid_results]]

# random forest
drf_params = {
Example No. 19
def pyunit_mean_per_class_error():
    gbm = H2OGradientBoostingEstimator(nfolds=3,
                                       fold_assignment="Random",
                                       seed=1234)

    ## Binomial
    cars = h2o.import_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    r = cars[0].runif(seed=1234)
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "economy_20mpg"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm.distribution = "bernoulli"
    gbm.train(y=response_col,
              x=predictors,
              validation_frame=valid,
              training_frame=train)
    print(gbm)
    mpce = gbm.mean_per_class_error([0.5, 0.8])  ## different thresholds
    assert (abs(mpce[0][1] - 0.004132231404958664) < 1e-5)
    assert (abs(mpce[1][1] - 0.021390374331550777) < 1e-5)

    ## score on train first
    print(
        gbm.model_performance(train).mean_per_class_error(
            thresholds=[0.3, 0.5]))

    ## Multinomial
    cars = h2o.import_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars["cylinders"] = cars["cylinders"].asfactor()
    r = cars[0].runif(seed=1234)
    train = cars[r > .2]
    valid = cars[r <= .2]
    response_col = "cylinders"
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    gbm.distribution = "multinomial"
    gbm.train(x=predictors,
              y=response_col,
              training_frame=train,
              validation_frame=valid)
    print(gbm)
    mpce = gbm.mean_per_class_error(train=True)
    assert (mpce == 0)
    mpce = gbm.mean_per_class_error(valid=True)
    # assert(abs(mpce - 0.207142857143 ) < 1e-5)
    assert (abs(mpce - 0.407142857143) < 1e-5)
    mpce = gbm.mean_per_class_error(xval=True)
    # assert(abs(mpce - 0.350071715433 ) < 1e-5)
    assert (abs(mpce - 0.35127653471) < 1e-5)

    ## Early stopping
    gbm.stopping_rounds = 2
    gbm.stopping_metric = "mean_per_class_error"
    gbm.ntrees = 10000
    gbm.max_depth = 3
    gbm.min_rows = 1
    gbm.learn_rate = 0.01
    gbm.score_tree_interval = 1
    gbm.nfolds = None
    gbm.fold_assignment = None
    gbm.train(x=predictors,
              y=response_col,
              training_frame=train,
              validation_frame=valid)
    print(gbm)
    print(gbm.scoring_history())

    ## Grid search
    hyper_params_tune = {
        'max_depth': list(range(1, 10 + 1, 1)),
        'sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate': [x / 100. for x in range(20, 101)],
        'col_sample_rate_per_tree': [x / 100. for x in range(20, 101)],
        'col_sample_rate_change_per_level': [x / 100. for x in range(90, 111)],
        'min_rows': [2**x for x in range(0, int(math.log(train.nrow, 2) - 2) + 1)],
        'nbins': [2**x for x in range(4, 11)],
        'nbins_cats': [2**x for x in range(4, 13)],
        'min_split_improvement': [0, 1e-8, 1e-6, 1e-4],
        'histogram_type': ["UniformAdaptive", "QuantilesGlobal", "RoundRobin"]
    }

    search_criteria_tune = {
        'strategy': "RandomDiscrete",
        'max_runtime_secs': 600,  ## limit the runtime to 10 minutes
        'max_models': 10,
        'seed': 1234,
        'stopping_rounds': 5,
        'stopping_metric': "mean_per_class_error",
        'stopping_tolerance': 1e-3
    }

    grid = H2OGridSearch(H2OGradientBoostingEstimator,
                         hyper_params=hyper_params_tune,
                         search_criteria=search_criteria_tune)
    grid.train(x=predictors,
               y=response_col,
               training_frame=train,
               validation_frame=valid,
               distribution="multinomial",
               seed=1234,
               stopping_rounds=10,
               stopping_metric="mean_per_class_error",
               stopping_tolerance=1e-3)

    print(grid)  ## sorted by logloss
    print(grid.get_grid("mean_per_class_error"))
Example No. 20
# Split data into training and validation
hTrain, hValidate = hData.split_frame(ratios=[0.8])

h2o.export_file(hTrain, "hTrainMy.csv", force=True)
h2o.export_file(hValidate, "hValidateMy.csv", force=True)

response_column = 'RUL'

hyper_parameters = {
    'activation': [
        'Tanh', 'TanhWithDropout', 'Rectifier', 'RectifierWithDropout',
        'Maxout', 'MaxoutWithDropout'
    ],
    'hidden': [4, 6, 8, 10, 12, 14, 16, 18, 20],
    'epochs': [50, 100, 150],
    'loss': ['Quadratic', 'Absolute', 'Huber'],
    'distribution': [
        'AUTO', 'bernoulli', 'multinomial', 'poisson', 'gamma', 'tweedie',
        'laplace', 'huber', 'quantile', 'gaussian'
    ]
}

grid_search = H2OGridSearch(H2OAutoEncoderEstimator,
                            hyper_params=hyper_parameters)
grid_search.train(x=selected_columns,
                  training_frame=hTrain,
                  validation_frame=hValidate)
grid_search.show()
models = grid_search.sort_by("mse")
print(models)