Example #1
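All of the examples below assume a running H2O cluster and the h2o and pyunit_utils modules at module scope; a minimal setup sketch (the "from tests import pyunit_utils" path is an assumption based on the h2o-3 test-suite layout) looks like:

import h2o

# start, or connect to, a local H2O cluster before running any example
h2o.init()

# pyunit_utils is the h2o-3 test-harness helper that locates bundled datasets;
# outside that repo, replace pyunit_utils.locate(...) with a direct file path
from tests import pyunit_utils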
def scale_svd_rf_pipe():
    # h2o and pyunit_utils are assumed at module level (see the setup sketch above)
    from h2o.transforms.decomposition import H2OSVD
    from h2o.transforms.preprocessing import H2OScaler
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from h2o.cross_validation import H2OKFold
    from h2o.model.regression import h2o_r2_score
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.metrics import make_scorer
    from scipy.stats import randint

    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    # build a transformation pipeline using sklearn's Pipeline and H2OSVD
    pipe = Pipeline([("standardize", H2OScaler()), ("svd", H2OSVD()),
                     ("rf", H2ORandomForestEstimator())])

    params = {
        "standardize__center": [True, False],
        "standardize__scale": [True, False],
        "svd__nv": [2, 3],
        "rf__ntrees": randint(50, 60),
        "rf__max_depth": randint(4, 8),
        "rf__min_rows": randint(5, 10),
        "svd__transform": ["none", "standardize"],
    }

    custom_cv = H2OKFold(arrests, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe,
                                       params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)

    random_search.fit(arrests[1:], arrests[0])
    print(random_search.best_estimator_)
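One detail worth noting in the params dicts throughout these examples: RandomizedSearchCV samples list-valued entries uniformly, while scipy.stats distributions such as randint are sampled through their rvs() method. For instance:

from scipy.stats import randint

ntrees_dist = randint(50, 60)            # discrete uniform over 50..59 (upper bound exclusive)
print(ntrees_dist.rvs(random_state=42))  # one draw, as RandomizedSearchCV takes per iteration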
Example #2

def scale_pca_rf_pipe_new_import():
    from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
    from h2o.transforms.preprocessing import H2OScaler
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from h2o.cross_validation import H2OKFold
    from h2o.model.regression import h2o_r2_score
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.metrics import make_scorer
    from scipy.stats import randint

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms;
    # init_for_pipeline() wraps the PCA estimator so it exposes the transformer
    # interface sklearn expects from intermediate pipeline steps
    pipe = Pipeline([
        ("standardize", H2OScaler()),
        ("pca", H2OPrincipalComponentAnalysisEstimator().init_for_pipeline()),
        ("rf", H2ORandomForestEstimator())
    ])

    params = {"standardize__center":    [True, False],             # Parameters to test
              "standardize__scale":     [True, False],
              "pca__k":                 randint(2, iris[1:].shape[1]),
              "rf__ntrees":             randint(50, 60),
              "rf__max_depth":          randint(4, 8),
              "rf__min_rows":           randint(5, 10),
              "pca__transform":         ["none", "standardize"],
              }

    custom_cv = H2OKFold(iris, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe,
                                       params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)

    random_search.fit(iris[1:], iris[0])

    print(random_search.best_estimator_)
Example #3

import numpy as np
import h2o
from h2o.cross_validation import H2OKFold
from h2o.sklearn import H2OGradientBoostingClassifier
from h2o.transforms.decomposition import H2OSVD
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline


def test_h2o_only_pipeline_with_h2o_frames():
    # seed, scores, _get_data and _h2o_accuracy are module-level fixtures and
    # helpers defined elsewhere in the original test file
    pipeline = Pipeline([('svd', H2OSVD(seed=seed)),
                         ('estimator',
                          H2OGradientBoostingClassifier(seed=seed))])

    params = dict(
        svd__nv=[2, 3],
        svd__transform=['DESCALE', 'DEMEAN', 'NONE'],
        estimator__ntrees=[5, 10],
        estimator__max_depth=[1, 2, 3],
        estimator__learn_rate=[0.1, 0.2],
    )
    search = RandomizedSearchCV(
        pipeline,
        params,
        n_iter=5,
        random_state=seed,
        n_jobs=1,  # fails with parallel jobs
    )
    data = _get_data(format='h2o', n_classes=3)
    assert isinstance(data.X_train, h2o.H2OFrame)

    search.set_params(
        scoring=make_scorer(_h2o_accuracy),
        cv=H2OKFold(data.X_train, n_folds=3, seed=seed),
    )

    search.fit(data.X_train, data.y_train)
    preds = search.predict(data.X_test)
    assert isinstance(preds, h2o.H2OFrame)
    assert preds.dim == [len(data.X_test), 1]
    probs = search.predict_proba(data.X_test)
    assert probs.dim == [len(data.X_test), 3]
    assert np.allclose(np.sum(probs.as_data_frame().values, axis=1),
                       1.), "`predict_proba` didn't return probabilities"

    score = search.score(data.X_test, data.y_test)
    assert isinstance(score, float)
    skl_score = accuracy_score(data.y_test.as_data_frame().values,
                               preds.as_data_frame().values)
    assert abs(score - skl_score) < 1e-6, "score={}, skl_score={}".format(
        score, skl_score)
    scores['h2o_only_pipeline_with_h2o_frame'] = score
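The _h2o_accuracy helper used above is defined elsewhere in the test file; a plausible sketch, purely hypothetical and not the actual implementation, converts the H2O frames to numpy and delegates to sklearn:

from sklearn.metrics import accuracy_score

def _h2o_accuracy(actual, predicted):
    # hypothetical sketch: make_scorer passes (y_true, y_pred);
    # flatten each H2OFrame to a numpy array and reuse sklearn's metric
    return accuracy_score(actual.as_data_frame().values.ravel(),
                          predicted.as_data_frame().values.ravel())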
Example #4
def scale_pca_rf_pipe():

    from h2o.transforms.preprocessing import H2OScaler
    from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline
    # RandomizedSearchCV moved out of the long-removed sklearn.grid_search module
    from sklearn.model_selection import RandomizedSearchCV
    from h2o.cross_validation import H2OKFold
    from h2o.model.regression import h2o_r2_score
    from sklearn.metrics import make_scorer  # sklearn.metrics.scorer is gone
    from scipy.stats import randint

    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build a transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPrincipalComponentAnalysisEstimator()),
                     ("rf", H2ORandomForestEstimator())])

    params = {
        "standardize__center": [True, False],  # Parameters to test
        "standardize__scale": [True, False],
        "pca__k": randint(2, iris[1:].shape[1]),
        "rf__ntrees": randint(50, 60),
        "rf__max_depth": randint(4, 8),
        "rf__min_rows": randint(5, 10),
    }

    custom_cv = H2OKFold(iris, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe,
                                       params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)

    random_search.fit(iris[1:], iris[0])

    print(random_search.best_estimator_)
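As with the other pyunit examples, this function is normally invoked through the h2o-3 test harness; the standard runner idiom (assuming the h2o-3 checkout layout) is:

if __name__ == "__main__":
    pyunit_utils.standalone_test(scale_pca_rf_pipe)
else:
    scale_pca_rf_pipe()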
Example #5
                 ("pca", H2OPCA(k=2)),
                 ("gbm", H2OGradientBoostingEstimator(distribution="multinomial"))])
pipeline.fit(iris_df[:4],iris_df[4])

# Random CV using H2O and Scikit-learn
from sklearn.model_selection import RandomizedSearchCV  # formerly sklearn.grid_search
from h2o.cross_validation import H2OKFold
from h2o.model.regression import h2o_r2_score
from sklearn.metrics import make_scorer  # formerly sklearn.metrics.scorer
params = {"standardize__center":    [True, False],             # Parameters to test
          "standardize__scale":     [True, False],
          "pca__k":                 [2,3],
          "gbm__ntrees":            [10,20],
          "gbm__max_depth":         [1,2,3],
          "gbm__learn_rate":        [0.1,0.2]}
custom_cv = H2OKFold(iris_df, n_folds=5, seed=42)
pipeline = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPCA(k=2)),
                     ("gbm", H2OGradientBoostingEstimator(distribution="gaussian"))])
random_search = RandomizedSearchCV(pipeline, params,
                                   n_iter=5,
                                   scoring=make_scorer(h2o_r2_score),
                                   cv=custom_cv,
                                   random_state=42,
                                   n_jobs=1)
random_search.fit(iris_df[1:], iris_df[0])
print(random_search.best_estimator_)
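Besides best_estimator_, the fitted search exposes the winning parameter draw and its cross-validated score:

print(random_search.best_params_)   # the sampled parameter combination that won
print(random_search.best_score_)    # its mean cross-validated r2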
Example #6
# (snippet begins mid-FeatureUnion: the enclosing Pipeline, the FeatureUnion
# header, and the first sub-pipeline's ColumnExtractor step are not shown)
                    ('groupby_count', AddGroupByCount())])),
         ('Add_ip_address_num',
          Pipeline([('extract', ColumnExtractor(['user_id', 'ip_address'])),
                    ('groupby_count', AddGroupByCount())])),
         ('numerics',
          Pipeline([('extract', ColumnExtractor(NUM_FEATS)),
                    ('zero_fill', ZeroFillTransformer()),
                    ('log', Log1pTransformer())]))
     ]))
])
##############################
# Modeling + Tuning
##############################
from h2o.cross_validation import H2OKFold
dataset = pd.concat([X_train, y_train], axis=1)
cv = H2OKFold(dataset, n_folds=5, seed=42)
# H2O approach
# (reconstructed wrapper: the original snippet lists only the steps)
pipeline = Pipeline([
    ("H2OCreator", H2OFrameCreator()),
    # ('standardize', H2OScaler()),
    # ('pca', H2OPCA()),
    ('rf', H2ORandomForestEstimator(ntrees=20))
])

# something new to try
# from scipy.stats import randint
# params = {
#           # "standardize__center":    [True, False],
#           # "standardize__scale":     [True, False],
#           "pca__k":  2,
#               # randint(2, X_train[1:].shape[1]),
#           "rf__ntrees": 20
# # randint(50,60),
# }
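The custom transformers in this example (ColumnExtractor, AddGroupByCount, ZeroFillTransformer, Log1pTransformer, H2OFrameCreator) are project-specific and their definitions are not shown; a minimal, hypothetical ColumnExtractor sketch, assuming pandas DataFrame input, might look like:

from sklearn.base import BaseEstimator, TransformerMixin

class ColumnExtractor(BaseEstimator, TransformerMixin):
    """Hypothetical sketch: select a fixed list of columns from a DataFrame."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        # stateless transformer: nothing to learn
        return self

    def transform(self, X):
        return X[self.columns]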