def init_for_pipeline(self):
    """
    Returns an H2OSVD object that implements the ``fit`` and ``transform`` methods required to use it
    properly inside a ``sklearn.pipeline.Pipeline``.
    All parameters defined in ``self._parms`` that are also input parameters of ``H2OSVD.__init__``
    are passed through to the new object.

    :returns: H2OSVD object

    :examples:

    >>> from h2o.transforms.preprocessing import H2OScaler
    >>> from h2o.estimators import H2ORandomForestEstimator
    >>> from h2o.estimators import H2OSingularValueDecompositionEstimator
    >>> from sklearn.pipeline import Pipeline
    >>> arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
    >>> pipe = Pipeline([("standardize", H2OScaler()),
    ...                  ("svd", H2OSingularValueDecompositionEstimator(nv=3).init_for_pipeline()),
    ...                  ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))])
    >>> pipe.fit(arrests[1:], arrests[0])
    """
    import inspect
    from h2o.transforms.decomposition import H2OSVD
    # check which parameters can be passed to H2OSVD init
    var_names = list(dict(inspect.getmembers(H2OSVD.__init__.__code__))['co_varnames'])
    parameters = {k: v for k, v in self._parms.items() if k in var_names}
    return H2OSVD(**parameters)
def scale_svd_rf_pipe():
    from h2o.transforms.decomposition import H2OSVD

    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2OSVD
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("svd", H2OSVD()),
                     ("rf", H2ORandomForestEstimator())])

    # hyperparameter search space spanning both the H2O transforms and the estimator
    params = {"standardize__center": [True, False],
              "standardize__scale":  [True, False],
              "svd__nv":             [2, 3],
              "svd__transform":      ["none", "standardize"],
              "rf__ntrees":          randint(50, 60),
              "rf__max_depth":       randint(4, 8),
              "rf__min_rows":        randint(5, 10),
              }

    # H2O-aware k-fold splitting so RandomizedSearchCV can cross-validate on an H2OFrame
    custom_cv = H2OKFold(arrests, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe,
                                       params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)

    random_search.fit(arrests[1:], arrests[0])
    print(random_search.best_estimator_)
def svd_1_golden():
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    print("Compare with SVD")
    from h2o.transforms.decomposition import H2OSVD
    fitH2O = H2OSVD(nv=4, transform="NONE", max_iterations=2000)
    fitH2O.train(x=list(range(4)), training_frame=arrestsH2O)

    print("Compare singular values (D)")
    h2o_d = fitH2O._model_json['output']['d']
    r_d = [1419.06139509772, 194.825846110138, 45.6613376308754, 18.0695566224677]
    print("R Singular Values: {0}".format(r_d))
    print("H2O Singular Values: {0}".format(h2o_d))
    for r, h in zip(r_d, h2o_d):
        assert abs(r - h) < 1e-6, "H2O got {0}, but R got {1}".format(h, r)

    print("Compare right singular vectors (V)")
    h2o_v = h2o.as_list(h2o.get_frame(fitH2O._model_json['output']['v_key']['name']), use_pandas=False)
    h2o_v.pop(0)
    r_v = [[-0.04239181,  0.01616262, -0.06588426,  0.99679535],
           [-0.94395706,  0.32068580,  0.06655170, -0.04094568],
           [-0.30842767, -0.93845891,  0.15496743,  0.01234261],
           [-0.10963744, -0.12725666, -0.98347101, -0.06760284]]
    print("R Right Singular Vectors: {0}".format(r_v))
    print("H2O Right Singular Vectors: {0}".format(h2o_v))
    # singular vectors are only determined up to sign, so compare absolute values
    for rl, hl in zip(r_v, h2o_v):
        for r, h in zip(rl, hl):
            assert abs(abs(r) - abs(float(h))) < 1e-5, "H2O got {0}, but R got {1}".format(h, r)

    print("Compare left singular vectors (U)")
    h2o_u = h2o.as_list(h2o.get_frame(fitH2O._model_json['output']['u_key']['name']), use_pandas=False)
    h2o_u.pop(0)
    r_u = [[-0.1716251,  0.096325710,  0.06515480,  0.15369551],
           [-0.1891166,  0.173452566, -0.42665785, -0.17801438],
           [-0.2155930,  0.078998111,  0.02063740, -0.28070784],
           [-0.1390244,  0.059889811,  0.01392269,  0.01610418],
           [-0.2067788, -0.009812026, -0.17633244, -0.21867425],
           [-0.1558794, -0.064555293, -0.28288280, -0.11797419]]
    print("R Left Singular Vectors: {0}".format(r_u))
    print("H2O Left Singular Vectors: {0}".format(h2o_u))
    for rl, hl in zip(r_u, h2o_u):
        for r, h in zip(rl, hl):
            assert abs(abs(r) - abs(float(h))) < 1e-5, "H2O got {0}, but R got {1}".format(h, r)
def scale_svd_rf_pipe():
    from h2o.transforms.decomposition import H2OSVD

    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([
        ("standardize", H2OScaler()),
        ("svd", H2OSVD(nv=3)),
        ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))
    ])

    pipe.fit(arrests[1:], arrests[0])
    print(pipe)
def init_for_pipeline(self):
    """
    Returns an H2OSVD object that implements the ``fit`` and ``transform`` methods required to use it
    properly inside a ``sklearn.pipeline.Pipeline``.
    All parameters defined in ``self._parms`` that are also input parameters of ``H2OSVD.__init__``
    are passed through to the new object.

    :returns: H2OSVD object
    """
    import inspect
    from h2o.transforms.decomposition import H2OSVD
    # check which parameters can be passed to H2OSVD init
    var_names = list(dict(inspect.getmembers(H2OSVD.__init__.__code__))['co_varnames'])
    parameters = {k: v for k, v in self._parms.items() if k in var_names}
    return H2OSVD(**parameters)
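# Usage sketch (assumed context; it mirrors the docstring example on the estimator above):
# init_for_pipeline() hands sklearn a fit/transform-capable H2OSVD, so the H2O estimator can
# participate in a Pipeline like any other transformer. Assumes a running H2O cluster and the
# same USArrests data used by the tests above.
import h2o
from sklearn.pipeline import Pipeline
from h2o.transforms.preprocessing import H2OScaler
from h2o.estimators import H2ORandomForestEstimator
from h2o.estimators import H2OSingularValueDecompositionEstimator

h2o.init()
arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")
pipe = Pipeline([("standardize", H2OScaler()),
                 ("svd", H2OSingularValueDecompositionEstimator(nv=3).init_for_pipeline()),
                 ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))])
pipe.fit(arrests[1:], arrests[0])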