def test_params_conflicting_with_sklearn_api_are_still_available():
    """Check that the `transform` param stays usable although it clashes with the `transform` method.

    The param must be readable through ``get_params()`` and the ``transform_``
    property, and writable through the constructor, ``set_params()`` and the
    ``transform_`` property, while ``pca.transform`` itself stays callable.

    NOTE(review): a second function with this exact name (using upper-case
    values) is defined right after this one, which rebinds the name at module
    level. Confirm which of the two variants is meant to be kept.
    """
    pca = H2OPCA()
    assert pca.transform != 'none'
    assert callable(pca.transform), "`transform` method from sklearn API has been replaced by a property"
    # conflicting param can be accessed normally using get_params()
    assert pca.get_params()['transform'] == 'none'
    # property is accessible directly using a trailing underscore
    assert pca.transform_ == 'none'

    pca = H2OPCA(transform='demean')
    assert callable(pca.transform), "`transform` method from sklearn API has been replaced by a property"
    assert pca.get_params()['transform'] == 'demean'
    assert pca.transform_ == 'demean'

    # conflicting param can be modified normally using set_params()
    pca.set_params(transform='descale')
    assert pca.get_params()['transform'] == 'descale'
    assert pca.transform_ == 'descale'

    # conflicting property can be set directly using a trailing underscore
    pca.transform_ = 'normalize'
    assert pca.get_params()['transform'] == 'normalize'
    assert pca.transform_ == 'normalize'
def test_params_conflicting_with_sklearn_api_are_still_available():
    """Verify that the `transform` param coexists with the `transform` method.

    Covers the default value, a value given to the constructor, a value
    changed with ``set_params()`` and a value assigned via ``transform_``.
    """
    still_a_method = "`transform` method from sklearn API has been replaced by a property"

    def assert_transform_param(estimator, expected):
        # the conflicting param is reachable through get_params() ...
        assert estimator.get_params()['transform'] == expected
        # ... and directly through the property with a trailing underscore
        assert estimator.transform_ == expected

    default_pca = H2OPCA()
    assert default_pca.transform != 'NONE'
    assert callable(default_pca.transform), still_a_method
    assert_transform_param(default_pca, 'NONE')

    custom_pca = H2OPCA(transform='DEMEAN')
    assert callable(custom_pca.transform), still_a_method
    assert_transform_param(custom_pca, 'DEMEAN')

    # set_params() updates the conflicting param like any other
    custom_pca.set_params(transform='DESCALE')
    assert_transform_param(custom_pca, 'DESCALE')

    # assigning to the trailing-underscore property updates it as well
    custom_pca.transform_ = 'NORMALIZE'
    assert_transform_param(custom_pca, 'NORMALIZE')
def test_params_are_correctly_passed_to_underlying_transformer():
    """Verify that params set on the wrapper end up in the underlying estimator's `_parms`."""
    transformer = H2OPCA(seed=seed)
    transformer.set_params(transform='demean', k=3)
    transformer.model_id = "dummy"
    # no underlying estimator exists before it is explicitly built
    assert transformer.estimator is None

    transformer._make_estimator()  # normally done when calling `fit`
    assert transformer.estimator

    forwarded = transformer.estimator._parms
    expected_pairs = (
        ('seed', seed),
        ('transform', 'demean'),
        ('k', 3),
        ('model_id', "dummy"),
    )
    for key, expected in expected_pairs:
        assert forwarded[key] == expected
    # a param that was never set is forwarded as None
    assert forwarded['max_iterations'] is None
def test_all_params_can_be_set_as_properties():
    """Assign params as plain attributes on each step and verify that `get_params()` reports them."""
    pipeline = Pipeline([
        ('standardize', H2OScaler()),
        ('pca', H2OPCA()),
        ('estimator', H2OGradientBoostingEstimator()),
    ])

    scaler = pipeline.named_steps.standardize
    scaler.center = True
    scaler.scale = False

    reducer = pipeline.named_steps.pca
    reducer.k = 2
    reducer.seed = seed

    booster = pipeline.named_steps.estimator
    booster.ntrees = 20
    booster.max_depth = 5
    booster.seed = seed

    reported = pipeline.get_params()

    assert isinstance(reported['standardize'], H2OScaler)
    assert reported['standardize__center'] is True
    assert reported['standardize__scale'] is False

    assert isinstance(reported['pca'], H2OPCA)
    for key, expected in (('pca__k', 2), ('pca__seed', seed)):
        assert reported[key] == expected

    assert isinstance(reported['estimator'], H2OGradientBoostingEstimator)
    for key, expected in (
        ('estimator__ntrees', 20),
        ('estimator__max_depth', 5),
        ('estimator__seed', seed),
    ):
        assert reported[key] == expected
def test_all_params_can_be_set_using_set_params():
    """Set nested params through `Pipeline.set_params()` and verify them as attributes of each step."""
    pipeline = Pipeline([
        ('standardize', H2OScaler()),
        ('pca', H2OPCA()),
        ('estimator', H2OGradientBoostingEstimator()),
    ])
    nested_params = {
        'standardize__center': True,
        'standardize__scale': False,
        'pca__k': 2,
        'pca__seed': seed,
        'estimator__ntrees': 20,
        'estimator__max_depth': 5,
        'estimator__seed': seed,
    }
    pipeline.set_params(**nested_params)

    scaler = pipeline.named_steps.standardize
    assert isinstance(scaler, H2OScaler)
    assert scaler.center is True
    assert scaler.scale is False

    reducer = pipeline.named_steps.pca
    assert isinstance(reducer, H2OPCA)
    assert reducer.k == 2
    assert reducer.seed == seed

    booster = pipeline.named_steps.estimator
    assert isinstance(booster, H2OGradientBoostingEstimator)
    assert booster.ntrees == 20
    assert booster.max_depth == 5
    assert booster.seed == seed
def test_all_params_are_accessible_as_properties():
    """Pass params to the constructors and verify that each one is readable as an attribute of its step."""
    pipeline = Pipeline([
        ('standardize', H2OScaler(center=True, scale=False)),
        ('pca', H2OPCA(k=2, seed=seed)),
        ('estimator', H2OGradientBoostingEstimator(ntrees=20, max_depth=5, seed=seed)),
    ])

    scaler = pipeline.named_steps.standardize
    assert isinstance(scaler, H2OScaler)
    assert scaler.center is True
    assert scaler.scale is False

    reducer = pipeline.named_steps.pca
    assert isinstance(reducer, H2OPCA)
    assert reducer.k == 2
    assert reducer.seed == seed

    booster = pipeline.named_steps.estimator
    assert isinstance(booster, H2OGradientBoostingEstimator)
    assert booster.ntrees == 20
    assert booster.max_depth == 5
    assert booster.seed == seed

    # params that were left out of the constructors are exposed too, as None
    assert pipeline.named_steps.pca.max_iterations is None
    assert pipeline.named_steps.estimator.learn_rate is None
def test_all_params_are_visible_in_get_params():
    """Pass params to the constructors and verify that `Pipeline.get_params()` lists every one of them."""
    pipeline = Pipeline([
        ('standardize', H2OScaler(center=True, scale=False)),
        ('pca', H2OPCA(k=2, seed=seed)),
        ('estimator', H2OGradientBoostingEstimator(ntrees=20, max_depth=5, seed=seed)),
    ])
    reported = pipeline.get_params()

    assert isinstance(reported['standardize'], H2OScaler)
    assert reported['standardize__center'] is True
    assert reported['standardize__scale'] is False

    assert isinstance(reported['pca'], H2OPCA)
    for key, expected in (('pca__k', 2), ('pca__seed', seed)):
        assert reported[key] == expected

    assert isinstance(reported['estimator'], H2OGradientBoostingEstimator)
    for key, expected in (
        ('estimator__ntrees', 20),
        ('estimator__max_depth', 5),
        ('estimator__seed', seed),
    ):
        assert reported[key] == expected

    # params that were left out of the constructors are listed too, as None
    for unset_key in ('pca__max_iterations', 'estimator__learn_rate'):
        assert reported[unset_key] is None
def test_h2o_only_pipeline_with_h2o_frames():
    """Fit, predict and score an all-H2O pipeline on H2OFrame inputs.

    The pipeline score is compared against `r2_score` computed from the
    predictions, then stored in `scores`.
    """
    h2o_steps = [
        ('standardize', H2OScaler()),
        ('pca', H2OPCA(k=2, seed=seed)),
        ('estimator', H2OGradientBoostingRegressor(seed=seed)),
    ]
    pipeline = Pipeline(h2o_steps)

    dataset = _get_data(format='h2o')
    assert isinstance(dataset.X_train, h2o.H2OFrame)

    pipeline.fit(dataset.X_train, dataset.y_train)
    predictions = pipeline.predict(dataset.X_test)
    assert isinstance(predictions, h2o.H2OFrame)
    assert predictions.dim == [len(dataset.X_test), 1]

    # to get it working, we need to score a fresh H2OFrame
    dataset = _get_data(format='h2o')
    score = pipeline.score(dataset.X_test, dataset.y_test)
    assert isinstance(score, float)

    actual_values = dataset.y_test.as_data_frame().values
    predicted_values = predictions.as_data_frame().values
    skl_score = r2_score(actual_values, predicted_values)
    difference = abs(score - skl_score)
    assert difference < 1e-6, "score={}, skl_score={}".format(score, skl_score)

    scores['h2o_only_pipeline_with_h2o_frame'] = score
def test_h2o_only_pipeline_with_numpy_arrays():
    """Fit, predict and score an all-H2O pipeline on numpy inputs.

    The pipeline score is compared against `r2_score` computed from the
    predictions, then stored in `scores`.
    """
    # Note that in normal situations (release build), init_connection_args can be omitted
    # otherwise, it should be set to the first H2O element in the pipeline.
    # Also note that in this specific case mixing numpy inputs with a fully H2O pipeline,
    # the last estimator requires the `data_conversion=True` param in order to return numpy arrays in predictions.
    scaler = H2OScaler(init_connection_args=init_connection_args)
    reducer = H2OPCA(k=2, seed=seed)
    regressor = H2OGradientBoostingRegressor(seed=seed, data_conversion=True)
    pipeline = Pipeline([
        ('standardize', scaler),
        ('pca', reducer),
        ('estimator', regressor),
    ])

    dataset = _get_data(format='numpy')
    assert isinstance(dataset.X_train, np.ndarray)

    pipeline.fit(dataset.X_train, dataset.y_train)
    predictions = pipeline.predict(dataset.X_test)
    assert isinstance(predictions, np.ndarray)
    expected_shape = (len(dataset.X_test), )
    assert predictions.shape == expected_shape

    score = pipeline.score(dataset.X_test, dataset.y_test)
    assert isinstance(score, float)
    skl_score = r2_score(dataset.y_test, predictions)
    difference = abs(score - skl_score)
    assert difference < 1e-6

    scores['h2o_only_pipeline_with_numpy_arrays'] = score