def stackedensemble_metalearner_seed_test():
    # Import training set
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
                            destination_frame="higgs_train_5k")
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
                           destination_frame="higgs_test_5k")

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)

    # Convert response to a factor
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Set number of folds for base learners
    nfolds = 3

    # Metalearner params for gbm, drf, glm, and deep learning
    gbm_params = {"sample_rate": 0.3, "col_sample_rate": 0.3}

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                          ntrees=10,
                                          nfolds=nfolds,
                                          keep_cross_validation_predictions=True,
                                          seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=10,
                                     nfolds=nfolds,
                                     keep_cross_validation_predictions=True,
                                     seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # Train two SE models with the same metalearner seed
    stack_gbm1 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
                                             metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params,
                                             seed=55555)
    stack_gbm2 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
                                             metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params,
                                             seed=55555)
    stack_gbm1.train(x=x, y=y, training_frame=train)
    stack_gbm2.train(x=x, y=y, training_frame=train)
    meta_gbm1 = h2o.get_model(stack_gbm1.metalearner()['name'])
    meta_gbm2 = h2o.get_model(stack_gbm2.metalearner()['name'])

    assert meta_gbm1.rmse(train=True) == meta_gbm2.rmse(train=True), "RMSE should match if same seed"

    # Train two SE models with different metalearner seeds
    stack_gbm3 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
                                             metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params,
                                             seed=55555)
    stack_gbm4 = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
                                             metalearner_algorithm="gbm",
                                             metalearner_params=gbm_params,
                                             seed=98765)
    stack_gbm3.train(x=x, y=y, training_frame=train)
    stack_gbm4.train(x=x, y=y, training_frame=train)
    meta_gbm3 = h2o.get_model(stack_gbm3.metalearner()['name'])
    meta_gbm4 = h2o.get_model(stack_gbm4.metalearner()['name'])

    assert meta_gbm3.rmse(train=True) != meta_gbm4.rmse(train=True), "RMSE should NOT match if different seed"

def covtype_get_model():
    covtype = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))

    Y = 54
    X = list(range(0, 20)) + list(range(29, 54))

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    # Log.info(paste("Setting response column", myY, "to be indicator of class", res_class, "\n"))
    covtype[54] = (covtype[54] == res_class)

    # L2: alpha = 0, lambda = 0
    covtype_mod1 = H2OGeneralizedLinearEstimator(family="binomial", alpha=0, Lambda=0)
    covtype_mod1.train(x=X, y=Y, training_frame=covtype)
    covtype_mod1.show()
    covtype_mod1 = h2o.get_model(covtype_mod1.model_id)
    covtype_mod1.show()

    # Elastic: alpha = 0.5, lambda = 1e-4
    covtype_mod2 = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=1e-4)
    covtype_mod2.train(x=X, y=Y, training_frame=covtype)
    covtype_mod2.show()
    covtype_mod2 = h2o.get_model(covtype_mod2.model_id)
    covtype_mod2.show()

    # L1: alpha = 1, lambda = 1e-4
    covtype_mod3 = H2OGeneralizedLinearEstimator(family="binomial", alpha=1, Lambda=1e-4)
    covtype_mod3.train(x=X, y=Y, training_frame=covtype)
    covtype_mod3.show()
    covtype_mod3 = h2o.get_model(covtype_mod3.model_id)
    covtype_mod3.show()

def covtype_get_model(ip, port):
    # Log.info("Importing covtype.20k.data...\n")
    covtype = h2o.import_file(path=h2o.locate("smalldata/covtype/covtype.20k.data"))

    Y = 54
    X = range(0, 20) + range(29, 54)

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    # Log.info(paste("Setting response column", myY, "to be indicator of class", res_class, "\n"))
    covtype[54] = (covtype[54] == res_class)
    # covtype_data.summary()

    # L2: alpha = 0, lambda = 0
    covtype_mod1 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[0], Lambda=[0])
    covtype_mod1.show()
    covtype_mod1 = h2o.get_model(covtype_mod1._id)
    covtype_mod1.show()

    # Elastic: alpha = 0.5, lambda = 1e-4
    covtype_mod2 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[0.5], Lambda=[1e-4])
    covtype_mod2.show()
    covtype_mod2 = h2o.get_model(covtype_mod2._id)
    covtype_mod2.show()

    # L1: alpha = 1, lambda = 1e-4
    covtype_mod3 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[1], Lambda=[1e-4])
    covtype_mod3.show()
    covtype_mod3 = h2o.get_model(covtype_mod3._id)
    covtype_mod3.show()

def get_model_test():
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    # Regression
    regression_gbm1 = H2OGradientBoostingEstimator(distribution="gaussian")
    regression_gbm1.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=train)
    predictions1 = regression_gbm1.predict(test)

    regression_gbm2 = h2o.get_model(regression_gbm1._id)
    assert regression_gbm2._model_json['output']['model_category'] == "Regression"
    predictions2 = regression_gbm2.predict(test)

    for r in range(predictions1.nrow):
        p1 = predictions1[r, 0]
        p2 = predictions2[r, 0]
        assert p1 == p2, "expected regression predictions to be the same for row {}, but got {} and {}".format(r, p1, p2)

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = H2OGradientBoostingEstimator(distribution="bernoulli")
    bernoulli_gbm1.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=train)
    predictions1 = bernoulli_gbm1.predict(test)

    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._id)
    assert bernoulli_gbm2._model_json['output']['model_category'] == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)

    for r in range(predictions1.nrow):
        p1 = predictions1[r, 0]
        p2 = predictions2[r, 0]
        assert p1 == p2, "expected binomial predictions to be the same for row {}, but got {} and {}".format(r, p1, p2)

    # Clustering
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    km_h2o = H2OKMeansEstimator(k=3)
    km_h2o.train(x=list(range(benign_h2o.ncol)), training_frame=benign_h2o)
    benign_km = h2o.get_model(km_h2o._id)
    assert benign_km._model_json['output']['model_category'] == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = H2ODeepLearningEstimator(loss="CrossEntropy")
    multinomial_dl1.train(x=[0, 1], y=4, training_frame=train)
    predictions1 = multinomial_dl1.predict(test)

    multinomial_dl2 = h2o.get_model(multinomial_dl1._id)
    assert multinomial_dl2._model_json['output']['model_category'] == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)

    for r in range(predictions1.nrow):
        p1 = predictions1[r, 0]
        p2 = predictions2[r, 0]
        assert p1 == p2, "expected multinomial predictions to be the same for row {0}, but got {1} and {2}".format(r, p1, p2)

def get_model_test(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)

    regression_gbm2 = h2o.get_model(regression_gbm1._key)
    assert regression_gbm2._model_json['output']['model_category'] == "Regression"
    predictions2 = regression_gbm2.predict(test)

    for r in range(predictions1.nrow()):
        p1 = predictions1[r, 0]
        p2 = predictions2[r, 0]
        assert p1 == p2, "expected regression predictions to be the same for row {0}, but got {1} and {2}".format(r, p1, p2)

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)

    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._key)
    assert bernoulli_gbm2._model_json['output']['model_category'] == "Binomial"
    predictions2 = bernoulli_gbm2.predict(test)

    for r in range(predictions1.nrow()):
        p1 = predictions1[r, 0]
        p2 = predictions2[r, 0]
        assert p1 == p2, "expected binomial predictions to be the same for row {0}, but got {1} and {2}".format(r, p1, p2)

    # Clustering
    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._key)
    assert benign_km._model_json['output']['model_category'] == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)

    multinomial_dl2 = h2o.get_model(multinomial_dl1._key)
    assert multinomial_dl2._model_json['output']['model_category'] == "Multinomial"
    predictions2 = multinomial_dl2.predict(test)

    for r in range(predictions1.nrow()):
        p1 = predictions1[r, 0]
        p2 = predictions2[r, 0]
        assert p1 == p2, "expected multinomial predictions to be the same for row {0}, but got {1} and {2}".format(r, p1, p2)

def get_xval_models(self, key=None):
    """
    Return a Model object.

    :param key: If None, return all cross-validated models; otherwise return the model that key points to.
    :return: A model or list of models.
    """
    return h2o.get_model(key) if key is not None else [h2o.get_model(k) for k in self._xval_keys]

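# --- Added usage sketch (not from the original source) for get_xval_models above. ---
# Assumes a running H2O cluster and that the method is available on trained estimators
# (it belongs to the model base class here); the iris path is illustrative only.
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()
iris = h2o.import_file("smalldata/iris/iris_wheader.csv")  # hypothetical local path
gbm = H2OGradientBoostingEstimator(nfolds=3, seed=1)
gbm.train(x=list(range(4)), y=4, training_frame=iris)
for fold_model in gbm.get_xval_models():  # each fold model is fetched via h2o.get_model
    print(fold_model.model_id)
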
def test_param_disabled():
    print("\n=== disabling "+kcvp+" ===")
    aml = setup_and_train(False)
    _, non_se, se = get_partitioned_model_names(aml.leaderboard)
    keys = list_keys_in_memory()
    preds = len(keys['cv_predictions'])
    assert preds == 0, "{preds} CV predictions were not cleaned from memory".format(preds=preds)
    for m in non_se:
        assert_cv_predictions_on_model(m, False)
    for m in se:
        assert not h2o.get_model(h2o.get_model(m).metalearner()['name']).cross_validation_predictions()

def get_grid(self, sort_by=None, decreasing=None):
    """
    Retrieve an H2OGridSearch instance. Optionally specify a metric by which to sort models and a sort order.

    Note that if neither cross-validation nor a validation frame is used in the grid search, then the training
    metrics will display in the "get grid" output. If a validation frame is passed to the grid, and ``nfolds = 0``,
    then the validation metrics will display. However, if ``nfolds`` > 1, then cross-validation metrics will
    display even if a validation frame is provided.

    Parameters
    ----------
    sort_by : str, optional
      A metric by which to sort the models in the grid space. Choices are "logloss", "residual_deviance", "mse",
      "auc", "r2", "accuracy", "precision", "recall", "f1", etc.
    decreasing : bool, optional
      Sort the models in decreasing order of metric if true, otherwise sort in increasing order (default).

    Returns
    -------
      A new H2OGridSearch instance optionally sorted on the specified metric.
    """
    if sort_by is None and decreasing is None: return self

    grid_json = h2o.api("GET /99/Grids/%s" % self._id, data={"sort_by": sort_by, "decreasing": decreasing})
    grid = H2OGridSearch(self.model, self.hyper_params, self._id)
    grid.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]  # reordered
    first_model_json = h2o.api("GET /99/Models/%s" % grid_json['model_ids'][0]['name'])['models'][0]
    model_class = H2OGridSearch._metrics_class(first_model_json)
    m = model_class()
    m._id = self._id
    m._grid_json = grid_json
    # m._metrics_class = metrics_class
    m._parms = grid._parms
    H2OEstimator.mixin(grid, model_class)
    grid.__dict__.update(m.__dict__.copy())
    return grid

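# --- Added usage sketch (not from the original source) for get_grid above. ---
# Assumes `gs` is a trained H2OGridSearch on a binomial problem, as in the grid
# snippets elsewhere in this file; "auc" is only meaningful for binomial models.
sorted_gs = gs.get_grid(sort_by="auc", decreasing=True)
best_id = sorted_gs.model_ids[0]  # first id after sorting by decreasing AUC
best_model = h2o.get_model(best_id)
print(best_model.auc(train=True))
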
def predict(self, test_data):
    """
    Predict on a dataset.

    :param H2OFrame test_data: Data on which to make predictions.

    :returns: A new H2OFrame of predictions.

    :examples:
    >>> # Set up an H2OAutoML object
    >>> build_control = {
    >>>     'stopping_criteria': {
    >>>         'stopping_rounds': 3,
    >>>         'stopping_tolerance': 0.001
    >>>     }
    >>> }
    >>> aml = H2OAutoML(max_runtime_secs=30, build_control=build_control)
    >>> # Launch H2OAutoML
    >>> aml.train(y=y, training_frame=training_frame)
    >>> # Predict with #1 model from H2OAutoML leaderboard
    >>> aml.predict(test_data)
    """
    if self._fetch():
        self._model = h2o.get_model(self._leader_id)
        return self._model.predict(test_data)
    print("No model built yet...")

def test_workaround_for_distribution():
    try:
        h2o.rapids("(setproperty \"{}\" \"{}\")".format("sys.ai.h2o.automl.algo_parameters.all.enabled", "true"))
        ds = import_dataset('regression')
        aml = H2OAutoML(project_name="py_test",
                        algo_parameters=dict(distribution='poisson',
                                             family='poisson'),
                        exclude_algos=['StackedEnsemble'],
                        max_runtime_secs=60,
                        seed=1)
        aml.train(y=ds.target, training_frame=ds.train)
        model_names = [aml.leaderboard[i, 0] for i in range(aml.leaderboard.nrows)]
        for mn in model_names:
            m = h2o.get_model(mn)
            dist = m.params['distribution'] if 'distribution' in m.params \
                else m.params['family'] if 'family' in m.params \
                else None
            print("{}: distribution = {}".format(mn, dist))
    finally:
        # always restore the default so that other tests are not affected
        h2o.rapids("(setproperty \"{}\" \"{}\")".format("sys.ai.h2o.automl.algo_parameters.all.enabled", "false"))

def grid_resume():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM Grid Search
    ntrees_opts = [1, 3]
    learn_rate_opts = [0.1, .05]
    hyper_parameters = OrderedDict()
    hyper_parameters["learn_rate"] = learn_rate_opts
    hyper_parameters["ntrees"] = ntrees_opts
    print("GBM grid with the following hyper_parameters:", hyper_parameters)

    export_dir = pyunit_utils.locate("results")
    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)
    grid_id = gs.grid_id
    old_grid_model_count = len(gs.model_ids)
    print("Baseline grid has %d models" % old_grid_model_count)

    saved_path = h2o.save_grid(export_dir, grid_id)
    h2o.remove_all()

    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    grid = h2o.load_grid(saved_path)
    assert grid is not None
    assert len(grid.model_ids) == old_grid_model_count

    # Modify the hyperspace - should add new models to the grid
    hyper_parameters["ntrees"] = [2, 5]
    grid = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters, grid_id=grid.grid_id)
    grid.train(x=list(range(4)), y=4, training_frame=train)
    print("Newly trained grid has %d models" % len(grid.model_ids))
    assert len(grid.model_ids) == 2 * old_grid_model_count

    for model_id in grid.model_ids:
        model = h2o.get_model(model_id)
        assert model is not None

def test_algo_parameter_can_be_applied_only_to_a_specific_algo():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_specific_algo_param",
                    algo_parameters=dict(GBM__monotone_constraints=dict(AGE=1)),
                    max_models=6,
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    model_names, _, _ = get_partitioned_model_names(aml.leaderboard)
    models_supporting_monotone_constraints = [n for n in model_names if re.match(r"GBM|XGBoost", n)]
    assert next((m for m in models_supporting_monotone_constraints if m.startswith('GBM')), None), \
        "There should be at least one GBM model"
    for m in models_supporting_monotone_constraints:
        model = h2o.get_model(m)
        mc_value = next(v['actual'] for n, v in model.params.items() if n == 'monotone_constraints')
        if m.startswith('GBM'):
            assert isinstance(mc_value, list)
            age = next((v for v in mc_value if v['key'] == 'AGE'), None)
            assert age is not None
            assert age['value'] == 1.0
        else:
            assert mc_value is None

def get_modelKmeans(ip, port):
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=h2o.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = h2o.kmeans(x=benign_h2o, k=i)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
        km_sci.fit(benign_sci)
        print "scikit centers"
        print km_sci.cluster_centers_

def test_automl_creates_interpretable_SE_with_only_monotonic_models():
    ds = import_dataset()
    aml_mono = H2OAutoML(project_name="test_automl_creates_interpretable_se",
                         max_models=5,
                         include_algos=["GBM", "GLM", "XGBoost", "StackedEnsemble"],
                         monotone_constraints=dict(AGE=1, DPROS=1, DCAPS=1, PSA=1, VOL=1, GLEASON=1),
                         seed=1234)
    aml_mono.train(y=ds.target, training_frame=ds.train)
    leaderboard = aml_mono.leaderboard.as_data_frame()["model_id"]
    assert leaderboard.apply(lambda model_name: "Monotonic" in model_name).any()
    se_name = leaderboard[leaderboard.apply(lambda model_name: "Monotonic" in model_name)]
    se_mono = h2o.get_model(se_name.iloc[0])
    assert leaderboard.apply(lambda model_name: 'GLM' in model_name).any()
    assert all(['GBM' in bm or 'XGBoost' in bm for bm in se_mono.base_models])

def test_param_disabled():
    print("\n=== disabling "+kcvm+" ===")
    aml = setup_and_train(False)
    models, non_se, se = get_partitioned_model_names(aml.leaderboard)
    check_model_property(se, kcvm, False)
    check_model_property(non_se, kcvm, True, False, True)
    keys = list_keys_in_memory()
    tot, cv = len(keys['models']), len(keys['cv_models'])
    print("total models in memory = {tot}, among which {cv} CV models".format(tot=tot, cv=cv))
    assert tot > 0, "no models left in memory"
    assert cv == 0, "{cv} CV models were not cleaned from memory".format(cv=cv)
    for m in non_se:
        assert not h2o.get_model(m).cross_validation_models(), "unexpected cv models for model "+m
    for m in se:
        metal = h2o.get_model(h2o.get_model(m).metalearner()['name'])
        assert not metal.cross_validation_models(), "unexpected cv models for metalearner of model "+m

def test_monotone_constraints_can_be_passed_as_algo_parameter():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_monotone_constraints",
                    algo_parameters=dict(
                        monotone_constraints=dict(AGE=1, VOL=-1),  # constraints just for the sake of testing
                        # ntrees=10,
                    ),
                    max_models=6,
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    model_names, _, _ = get_partitioned_model_names(aml.leaderboard)
    models_supporting_monotone_constraints = [n for n in model_names if re.match(r"GBM|XGBoost", n)]
    assert len(models_supporting_monotone_constraints) < len(model_names), \
        "models not supporting the constraint should not have been skipped"
    for m in models_supporting_monotone_constraints:
        model = h2o.get_model(m)
        value = next(v['actual'] for n, v in model.params.items() if n == 'monotone_constraints')
        # print(param)
        assert isinstance(value, list)
        assert len(value) == 2
        age = next((v for v in value if v['key'] == 'AGE'), None)
        assert age is not None
        assert age['value'] == 1.0
        vol = next((v for v in value if v['key'] == 'VOL'), None)
        assert vol is not None
        assert vol['value'] == -1.0

def _fetch_state(aml_id, properties=None):
    state_json = h2o.api("GET /99/AutoML/%s" % aml_id)
    project_name = state_json["project_name"]
    if project_name is None:
        raise H2OValueError("No AutoML instance with id {}.".format(aml_id))

    leaderboard_list = [key["name"] for key in state_json['leaderboard']['models']]
    leader_id = leaderboard_list[0] if (leaderboard_list is not None and len(leaderboard_list) > 0) else None

    should_fetch = lambda prop: properties is None or prop in properties

    leader = None
    if should_fetch('leader'):
        leader = h2o.get_model(leader_id) if leader_id is not None else None

    leaderboard = None
    if should_fetch('leaderboard'):
        leaderboard = H2OAutoML._fetch_table(state_json['leaderboard_table'], key=project_name+"_leaderboard",
                                             progress_bar=False)
        leaderboard = h2o.assign(leaderboard[1:], project_name+"_leaderboard")  # removing index and reassign id to ensure persistence on backend

    event_log = None
    if should_fetch('event_log'):
        event_log = H2OAutoML._fetch_table(state_json['event_log_table'], key=project_name+"_eventlog",
                                           progress_bar=False)
        event_log = h2o.assign(event_log[1:], project_name+"_eventlog")  # removing index and reassign id to ensure persistence on backend

    return dict(
        project_name=project_name,
        json=state_json,
        leader_id=leader_id,
        leader=leader,
        leaderboard=leaderboard,
        event_log=event_log,
    )

def test_param_disabled():
    print("\n=== disabling "+kcvm+" ===")
    aml = setup_and_train(False)
    models, non_se, se = get_partitioned_model_names(aml.leaderboard)
    check_model_property(se, kcvm, False)
    check_model_property(non_se, kcvm, True, False, True)
    keys = list_keys_in_memory()
    tot, cv = len(keys['models_all']), len(keys['cv_models'])
    print("total models in memory = {tot}, among which {cv} CV models".format(tot=tot, cv=cv))
    assert tot > 0, "no models left in memory"
    assert cv == 0, "{cv} CV models were not cleaned from memory".format(cv=cv)
    for m in non_se:
        assert not h2o.get_model(m).cross_validation_models(), "unexpected cv models for model "+m
    for m in se:
        metal = h2o.get_model(h2o.get_model(m).metalearner()['name'])
        assert not metal.cross_validation_models(), "unexpected cv models for metalearner of model "+m

def get_modelKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values="NaN", strategy="mean", axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = H2OKMeansEstimator(k=i)
        km_h2o.train(x=list(range(benign_h2o.ncol)), training_frame=benign_h2o)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init="k-means++", n_init=1)
        km_sci.fit(benign_sci)
        print("scikit centers")
        print(km_sci.cluster_centers_)

def get_grid(self, sort_by=None, decreasing=None):
    """
    Retrieve an H2OGridSearch instance. Optionally specify a metric by which to sort models and a sort order.

    Parameters
    ----------
    sort_by : str, optional
      A metric by which to sort the models in the grid space. Choices are "logloss", "residual_deviance", "mse",
      "auc", "r2", "accuracy", "precision", "recall", "f1", etc.
    decreasing : bool, optional
      Sort the models in decreasing order of metric if true, otherwise sort in increasing order (default).

    Returns
    -------
      A new H2OGridSearch instance optionally sorted on the specified metric.
    """
    if sort_by is None and decreasing is None: return self

    grid_json = H2OConnection.get_json("Grids/"+self._id, sort_by=sort_by, decreasing=decreasing, _rest_version=99)
    grid = H2OGridSearch(self.model, self.hyper_params, self._id)
    grid.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]  # reordered
    first_model_json = H2OConnection.get_json("Models/"+grid_json['model_ids'][0]['name'],
                                              _rest_version=99)['models'][0]
    model_class = H2OGridSearch._metrics_class(first_model_json)
    m = model_class()
    m._id = self._id
    m._grid_json = grid_json
    # m._metrics_class = metrics_class
    m._parms = grid._parms
    H2OEstimator.mixin(grid, model_class)
    grid.__dict__.update(m.__dict__.copy())
    return grid

def test_base_models_are_populated():
    train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"))
    x = train.columns
    y = "species"
    x.remove(y)
    nfolds = 2
    gbm = H2OGradientBoostingEstimator(nfolds=nfolds,
                                       fold_assignment="Modulo",
                                       keep_cross_validation_predictions=True)
    gbm.train(x=x, y=y, training_frame=train)
    rf = H2ORandomForestEstimator(nfolds=nfolds,
                                  fold_assignment="Modulo",
                                  keep_cross_validation_predictions=True)
    rf.train(x=x, y=y, training_frame=train)
    se = H2OStackedEnsembleEstimator(training_frame=train,
                                     validation_frame=test,
                                     base_models=[gbm.model_id, rf.model_id])
    se.train(x=x, y=y, training_frame=train)
    retrieved_se = get_model(se.model_id)

    assert len(se.base_models) == 2
    assert len(retrieved_se.base_models) == 2
    assert se.base_models == retrieved_se.base_models
    # ensure that we are getting the model_ids
    assert pu.is_type(se.base_models, [str])
    assert pu.is_type(retrieved_se.base_models, [str])

def save_model(model_id, dest_dir='.', mformat='mojo'):
    model = h2o.get_model(model_id)
    if mformat == 'mojo':
        model.save_mojo(path=dest_dir)
        # model.download_mojo(path=dest_dir, get_genmodel_jar=True)
    else:
        model.save_model_details(path=dest_dir)

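# --- Added illustrative call (not from the original source) for the save_model helper above. ---
# `lb` stands in for an AutoML leaderboard H2OFrame, as used in the other snippets here;
# the destination directory is hypothetical.
top_model_id = lb[0, "model_id"]
save_model(top_model_id, dest_dir="/tmp/models", mformat="mojo")
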
def get_hyperparams_dict(self, id, display=True):
    """
    Derive and return the model parameters used to train the particular grid search model.

    Parameters
    ----------
    id: str
      The model id of the model with hyperparameters of interest.
    display: boolean
      Flag to indicate whether to display the hyperparameter names.

    Returns
    -------
      A dict of model parameters derived from the hyper-parameters used to train this particular model.
    """
    idx = id if is_int(id) else self.model_ids.index(id)
    model = self[idx]

    model_params = dict()

    # if cross-validation is turned on, the params of one of the fold models actually contain the
    # max_runtime_secs parameter, not the main model that is returned.
    if model._is_xvalidated:
        model = h2o.get_model(model._xval_keys[0])

    for param_name in self.hyper_names:
        model_params[param_name] = model.params[param_name]['actual'][0] if \
            isinstance(model.params[param_name]['actual'], list) else model.params[param_name]['actual']

    if display:
        print('Hyperparameters: [' + ', '.join(list(self.hyper_params.keys())) + ']')
    return model_params

def _model_build(self, x, y, tframe, vframe, kwargs):
    kwargs['training_frame'] = tframe
    if vframe is not None: kwargs["validation_frame"] = vframe
    if isinstance(y, int): y = tframe.names[y]
    if y is not None: kwargs['response_column'] = y
    if not isinstance(x, (list, tuple)): x = [x]
    if isinstance(x[0], int):
        x = [tframe.names[i] for i in x]
    offset = kwargs["offset_column"]
    folds = kwargs["fold_column"]
    weights = kwargs["weights_column"]
    ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
    kwargs["ignored_columns"] = None if ignored_columns == [] else [h2o.h2o._quoted(col) for col in ignored_columns]
    kwargs = dict([(k, kwargs[k].frame_id if isinstance(kwargs[k], H2OFrame) else kwargs[k]) for k in kwargs
                   if kwargs[k] is not None])  # gruesome one-liner
    algo = self.model._compute_algo()  # unique to grid search
    kwargs["_rest_version"] = 99  # unique to grid search

    grid = H2OJob(H2OConnection.post_json("Grid/"+algo, **kwargs), job_type=(algo+" Grid Build"))

    if self._future:
        self._job = grid
        return

    grid.poll()
    if '_rest_version' in kwargs.keys():
        grid_json = H2OConnection.get_json("Grids/"+grid.dest_key, _rest_version=kwargs['_rest_version'])
    else:
        grid_json = H2OConnection.get_json("Grids/"+grid.dest_key)

    self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

    # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
    first_model_json = H2OConnection.get_json("Models/"+grid_json['model_ids'][0]['name'],
                                              _rest_version=kwargs['_rest_version'])['models'][0]
    self._resolve_grid(grid.dest_key, grid_json, first_model_json)

def get_grid(self, sort_by=None, decreasing=None):
    """
    Retrieve an H2OGridSearch instance. Optionally specify a metric by which to sort models and a sort order.

    Parameters
    ----------
    sort_by : str, optional
      A metric by which to sort the models in the grid space. Choices are "logloss", "residual_deviance", "mse",
      "auc", "r2", "accuracy", "precision", "recall", "f1", etc.
    decreasing : bool, optional
      Sort the models in decreasing order of metric if true, otherwise sort in increasing order (default).

    Returns
    -------
      A new H2OGridSearch instance optionally sorted on the specified metric.
    """
    if sort_by is None and decreasing is None: return self

    grid_json = h2o.api("GET /99/Grids/%s" % self._id, data={"sort_by": sort_by, "decreasing": decreasing})
    grid = H2OGridSearch(self.model, self.hyper_params, self._id)
    grid.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]  # reordered
    first_model_json = h2o.api("GET /99/Models/%s" % grid_json['model_ids'][0]['name'])['models'][0]
    model_class = H2OGridSearch._metrics_class(first_model_json)
    m = model_class()
    m._id = self._id
    m._grid_json = grid_json
    # m._metrics_class = metrics_class
    m._parms = grid._parms
    H2OEstimator.mixin(grid, model_class)
    grid.__dict__.update(m.__dict__.copy())
    return grid

def get_hyperparams_dict(self, id, display=True):
    """
    Derive and return the model parameters used to train the particular grid search model.

    Parameters
    ----------
    id: str
      The model id of the model with hyperparameters of interest.
    display: boolean
      Flag to indicate whether to display the hyperparameter names.

    Returns
    -------
      A dict of model parameters derived from the hyper-parameters used to train this particular model.
    """
    idx = id if isinstance(id, int) else self.model_ids.index(id)
    model = self[idx]

    model_params = dict()

    # if cross-validation is turned on, the params of one of the fold models actually contain the
    # max_runtime_secs parameter, not the main model that is returned.
    if model._is_xvalidated:
        model = h2o.get_model(model._xval_keys[0])

    for param_name in self.hyper_names:
        model_params[param_name] = model.params[param_name]['actual'][0] if \
            isinstance(model.params[param_name]['actual'], list) else model.params[param_name]['actual']

    if display:
        print('Hyperparameters: [' + ', '.join(list(self.hyper_params.keys())) + ']')
    return model_params

def get_hyperparams(self, id, display=True):
    """
    Get the hyperparameters of a model explored by grid search.

    Parameters
    ----------
    id: str
      The model id of the model with hyperparameters of interest.
    display: boolean
      Flag to indicate whether to display the hyperparameter names.

    Returns
    -------
      A list of the hyperparameters for the specified model.
    """
    idx = id if isinstance(id, int) else self.model_ids.index(id)
    model = self[idx]

    # if cross-validation is turned on, the params of one of the fold models actually contain the
    # max_runtime_secs parameter, not the main model that is returned.
    if model._is_xvalidated:
        model = h2o.get_model(model._xval_keys[0])

    res = [model.params[h]['actual'][0] if isinstance(model.params[h]['actual'], list)
           else model.params[h]['actual']
           for h in self.hyper_params]
    if display:
        print('Hyperparameters: [' + ', '.join(list(self.hyper_params.keys())) + ']')
    return res

def test_stacked_ensembles_are_trained_with_blending_frame_even_if_nfolds_eq_0():
    print("Check that we can disable cross-validation when passing a blending frame "
          "and that Stacked Ensembles are trained using this frame.")
    max_models = 5
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_blending_frame",
                    seed=1,
                    max_models=max_models,
                    nfolds=0)
    aml.train(y=ds.target, training_frame=ds.train, blending_frame=ds.valid, leaderboard_frame=ds.test)

    se = get_partitioned_model_names(aml.leaderboard).se
    assert len(se) > 3, "In blending mode, StackedEnsemble should still be trained in spite of nfolds=0."
    for m in se:
        model = h2o.get_model(m)
        assert model.params['blending_frame']['actual']['name'] == ds.valid.frame_id
        assert model._model_json['output']['stacking_strategy'] == 'blending'

def test_stackedensemble_propagates_the_max_runtime_secs():
    max_runtime_secs = 5
    hyper_parameters = dict()
    hyper_parameters["ntrees"] = [1, 3, 5]
    params = dict(
        fold_assignment="modulo",
        nfolds=3,
        keep_cross_validation_predictions=True
    )
    data = prepare_data()
    gs1 = H2OGridSearch(H2OGradientBoostingEstimator(**params), hyper_params=hyper_parameters)
    gs1.train(data.x, data.y, data.train, validation_frame=data.train)
    se = H2OStackedEnsembleEstimator(base_models=[gs1], max_runtime_secs=max_runtime_secs)
    se.train(data.x, data.y, data.train)
    metalearner = h2o.get_model(se.metalearner()["name"])

    # metalearner has the set max_runtime_secs
    assert metalearner.actual_params['max_runtime_secs'] <= max_runtime_secs
    assert metalearner.actual_params['max_runtime_secs'] > 0

    # stacked ensemble has the set max_runtime_secs
    assert se.max_runtime_secs == max_runtime_secs

def get_automl(project_name):
    """
    Retrieve information about an AutoML instance.

    :param str project_name: A string indicating the project_name of the automl instance to retrieve.
    :returns: A dictionary containing the project_name, leader model, and leaderboard.
    """
    automl_json = h2o.api("GET /99/AutoML/%s" % project_name)
    project_name = automl_json["project_name"]
    leaderboard_list = [key["name"] for key in automl_json['leaderboard']['models']]

    if leaderboard_list is not None and len(leaderboard_list) > 0:
        leader_id = leaderboard_list[0]
    else:
        leader_id = None

    leader = h2o.get_model(leader_id)
    # Intentionally mask the progress bar here since showing multiple progress bars is confusing to users.
    # If any failure happens, revert back to user's original setting for progress and display the error message.
    is_progress = H2OJob.__PROGRESS_BAR__
    h2o.no_progress()
    try:
        # Parse leaderboard H2OTwoDimTable & return as an H2OFrame
        leaderboard = h2o.H2OFrame(automl_json["leaderboard_table"].cell_values,
                                   column_names=automl_json["leaderboard_table"].col_header)
    except Exception as ex:
        raise ex
    finally:
        if is_progress is True:
            h2o.show_progress()

    leaderboard = leaderboard[1:]
    automl_dict = {'project_name': project_name, "leader": leader, "leaderboard": leaderboard}
    return automl_dict

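# --- Added usage sketch (not from the original source) for get_automl above. ---
# Assumes an AutoML run with this project name has already finished on the cluster;
# the project name is borrowed from the nfolds=0 test elsewhere in this file.
state = get_automl("py_aml_nfolds0")
print(state["project_name"])
if state["leader"] is not None:
    print(state["leader"].model_id)
print(state["leaderboard"].head())
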
def test_api_timestamp():
    prostate_train = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    ntrees = 1
    learning_rate = 0.1
    depth = 5
    min_rows = 10
    # Build H2O GBM classification model:
    gbm_h2o = H2OGradientBoostingEstimator(ntrees=ntrees,
                                           learn_rate=learning_rate,
                                           max_depth=depth,
                                           min_rows=min_rows,
                                           distribution="bernoulli",
                                           model_id="test_timestamp")
    gbm_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE", training_frame=prostate_train)

    model = h2o.get_model(model_id="test_timestamp")
    models = h2o.api("GET /3/Models")
    assert model._model_json['timestamp'] == models["models"][0]["timestamp"], "Timestamp should be the same."
    assert gbm_h2o.start_time is not None and gbm_h2o.start_time > 0
    assert gbm_h2o.end_time is not None and gbm_h2o.end_time > 0
    assert gbm_h2o.run_time is not None and gbm_h2o.run_time > 0
    assert gbm_h2o.end_time - gbm_h2o.start_time == gbm_h2o.run_time

def test_maxrglm_gaussian_coefs():
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    maxrglm_model = maxrglm(seed=12345, max_predictor_number=7)
    maxrglm_model.train(training_frame=d, x=my_x, y=my_y)
    coefs = maxrglm_model.coef()
    coefs_norm = maxrglm_model.coef_norm()
    for ind in list(range(len(coefs))):
        one_coef = coefs[ind]
        one_coef_norm = coefs_norm[ind]
        # coefficients obtained by looking up the model id, fetching the model, and accessing its coefficients
        one_model = h2o.get_model(maxrglm_model._model_json["output"]["best_model_ids"][ind]['name'])
        model_coef = one_model.coef()
        model_coef_norm = one_model.coef_norm()
        # get coefficients for an individual predictor subset size
        subset_size = ind + 1
        one_model_coef = maxrglm_model.coef(subset_size)
        one_model_coef_norm = maxrglm_model.coef_norm(subset_size)

        # check that the coefficient dicts are equal
        pyunit_utils.assertCoefDictEqual(one_coef, model_coef, 1e-6)
        pyunit_utils.assertCoefDictEqual(one_coef_norm, model_coef_norm, 1e-6)
        pyunit_utils.assertCoefDictEqual(one_model_coef, model_coef, 1e-6)
        pyunit_utils.assertCoefDictEqual(one_model_coef_norm, model_coef_norm, 1e-6)

def get_grid(model, hyper_params, grid_id):
    """
    Retrieve an H2OGridSearch instance already trained given its original model, hyper_params, and grid_id.

    Parameters
    ----------
    model : H2O Estimator model
      The type of model explored that is initialized with optional parameters which are unchanged across
      explored models.
    hyper_params: dict
      A dictionary of string parameters (keys) and a list of values explored by grid search (values).
    grid_id : str, optional
      The unique id assigned to the grid object.

    Returns
    -------
      A new H2OGridSearch instance that is a replica of the H2OGridSearch instance with the specified grid_id.
    """
    kwargs = {'_rest_version': 99}
    grid_json = H2OConnection.get_json("Grids/"+grid_id, **kwargs)
    grid = H2OGridSearch(model, hyper_params, grid_id)
    grid.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]
    first_model_json = H2OConnection.get_json("Models/"+grid_json['model_ids'][0]['name'],
                                              _rest_version=kwargs['_rest_version'])['models'][0]
    model_class = H2OGridSearch._metrics_class(first_model_json)
    m = model_class()
    m._id = grid_id
    m._grid_json = grid_json
    # m._metrics_class = metrics_class
    m._parms = grid._parms
    H2OEstimator.mixin(grid, model_class)
    grid.__dict__.update(m.__dict__.copy())
    return grid

def get_grid(self, sort_by=None, decreasing=None):
    """
    Retrieve an H2OGridSearch instance.

    Optionally specify a metric by which to sort models and a sort order.
    Note that if neither cross-validation nor a validation frame is used in the grid search, then the
    training metrics will display in the "get grid" output. If a validation frame is passed to the grid, and
    ``nfolds = 0``, then the validation metrics will display. However, if ``nfolds`` > 1, then cross-validation
    metrics will display even if a validation frame is provided.

    :param str sort_by: A metric by which to sort the models in the grid space. Choices are: ``"logloss"``,
        ``"residual_deviance"``, ``"mse"``, ``"auc"``, ``"r2"``, ``"accuracy"``, ``"precision"``, ``"recall"``,
        ``"f1"``, etc.
    :param bool decreasing: Sort the models in decreasing order of metric if true, otherwise sort in increasing
        order (default).

    :returns: A new H2OGridSearch instance optionally sorted on the specified metric.
    """
    if sort_by is None and decreasing is None: return self

    grid_json = h2o.api("GET /99/Grids/%s" % self._id, data={"sort_by": sort_by, "decreasing": decreasing})
    grid = H2OGridSearch(self.model, self.hyper_params, self._id)
    grid.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]  # reordered
    first_model_json = h2o.api("GET /99/Models/%s" % grid_json['model_ids'][0]['name'])['models'][0]
    model_class = H2OGridSearch._metrics_class(first_model_json)
    m = model_class()
    m._id = self._id
    m._grid_json = grid_json
    # m._metrics_class = metrics_class
    m._parms = grid._parms
    H2OEstimator.mixin(grid, model_class)
    grid.__dict__.update(m.__dict__.copy())
    return grid

def get_hyperparams(self, id, display=True):
    """
    Get the hyperparameters of a model explored by grid search.

    Parameters
    ----------
    id: str
      The model id of the model with hyperparameters of interest.
    display: boolean
      Flag to indicate whether to display the hyperparameter names.

    Returns
    -------
      A list of the hyperparameters for the specified model.
    """
    idx = id if is_int(id) else self.model_ids.index(id)
    model = self[idx]

    # if cross-validation is turned on, the params of one of the fold models actually contain the
    # max_runtime_secs parameter, not the main model that is returned.
    if model._is_xvalidated:
        model = h2o.get_model(model._xval_keys[0])

    res = [model.params[h]['actual'][0] if isinstance(model.params[h]['actual'], list)
           else model.params[h]['actual']
           for h in self.hyper_params]
    if display:
        print('Hyperparameters: [' + ', '.join(list(self.hyper_params.keys())) + ']')
    return res

def test_maxrglm_gaussian():
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    maxrglm_model = maxrglm(seed=12345, max_predictor_number=7)
    maxrglm_model.train(training_frame=d, x=my_x, y=my_y)
    resultFrame = maxrglm_model.result()
    numRows = resultFrame.nrows
    best_r2_value = maxrglm_model.get_best_R2_values()
    for ind in list(range(numRows)):
        # r2 from attributes
        best_r2 = best_r2_value[ind]
        one_model = h2o.get_model(resultFrame["model_id"][ind, 0])
        pred = one_model.predict(d)
        print("last element of predictor frame: {0}".format(pred[pred.nrows-1, pred.ncols-1]))
        assert pred.nrows == d.nrows, "expected dataset row: {0}, actual dataset row: {1}".format(pred.nrows, d.nrows)
        # r2 from result frame
        frame_r2 = resultFrame["best_r2_value"][ind, 0]
        # r2 from model
        model_r2 = one_model.r2()
        # make sure all r2 are equal
        assert abs(best_r2 - frame_r2) < 1e-6, "expected best r2: {0}, actual best r2: {1}".format(best_r2, frame_r2)
        assert abs(frame_r2 - model_r2) < 1e-6, "expected best r2: {0}, actual best r2: {1}".format(model_r2, frame_r2)

def benign_grid():
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))

    Y = 3
    X = list(range(3)) + list(range(4, 11))

    # NOTE: this tests bad parameter value handling; 'a' is not a float:
    hyper_parameters = {'alpha': [0.01, 0.5, 'a'], 'lambda': [1e-5, 1e-6]}
    gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters)
    gs.train(x=X, y=Y, training_frame=training_data)
    for model in gs:
        assert isinstance(model, H2OGeneralizedLinearEstimator)
    gs.show()
    print(gs.sort_by('F1', False))
    best_model_id = gs.sort_by('F1', False)['Model Id'][0]
    best_model = h2o.get_model(best_model_id)
    best_model.predict(training_data)
    gs.predict(training_data)
    print(gs.get_hyperparams(best_model_id))
    print(gs.grid_id)
    assert best_model.params['family']['actual'] == 'binomial'

    # test search_criteria plumbing
    search_criteria = {'strategy': "RandomDiscrete", 'max_models': 3}
    max_models_g = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
                                 hyper_parameters,
                                 search_criteria=search_criteria)
    max_models_g.train(x=X, y=Y, training_frame=training_data)

    max_models_g.show()
    print(max_models_g.grid_id)
    print(max_models_g.sort_by('F1', False))

    assert len(max_models_g.models) == 3, "expected 3 models, got: {}".format(len(max_models_g.models))
    print(max_models_g.sorted_metric_table())
    print(max_models_g.get_grid("r2"))

def benign_grid():
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))

    Y = 3
    X = range(3) + range(4, 11)

    hyper_parameters = {'alpha': [0.01, 0.5, 'a'], 'lambda': [1e-5, 1e-6]}
    gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters)
    gs.train(x=X, y=Y, training_frame=training_data)
    gs.show()
    print gs.sort_by('F1', False)
    best_model_id = gs.sort_by('F1', False)['Model Id'][0]
    best_model = h2o.get_model(best_model_id)
    best_model.predict(training_data)
    gs.predict(training_data)
    print gs.get_hyperparams(best_model_id)
    print gs.grid_id

    new_g = H2OGridSearch.get_grid(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters, gs.grid_id)
    new_g.show()
    print new_g.grid_id
    print new_g.sort_by('F1', False)

    assert best_model.params['family']['actual'] == 'binomial'

def covtype_get_model(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    # Log.info("Importing covtype.20k.data...\n")
    covtype = h2o.import_frame(path=h2o.locate("smalldata/covtype/covtype.20k.data"))

    Y = 54
    X = range(0, 20) + range(29, 54)

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    # Log.info(paste("Setting response column", myY, "to be indicator of class", res_class, "\n"))
    covtype[54] = (covtype[54] == res_class)
    # covtype_data.summary()

    # L2: alpha = 0, lambda = 0
    covtype_mod1 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[0], Lambda=[0])
    covtype_mod1.show()
    covtype_mod1 = h2o.get_model(covtype_mod1._id)
    covtype_mod1.show()

    # Elastic: alpha = 0.5, lambda = 1e-4
    covtype_mod2 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[0.5], Lambda=[1e-4])
    covtype_mod2.show()
    covtype_mod2 = h2o.get_model(covtype_mod2._id)
    covtype_mod2.show()

    # L1: alpha = 1, lambda = 1e-4
    covtype_mod3 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[1], Lambda=[1e-4])
    covtype_mod3.show()
    covtype_mod3 = h2o.get_model(covtype_mod3._id)
    covtype_mod3.show()

def test_gaussian_result_frame_model_id():
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    maxr_model = modelSelection(seed=12345, max_predictor_number=7, mode="maxr")
    maxr_model.train(training_frame=d, x=my_x, y=my_y)
    allsubsets_model = modelSelection(seed=12345, max_predictor_number=7, mode="allsubsets")
    allsubsets_model.train(training_frame=d, x=my_x, y=my_y)

    result_frame_allsubsets = allsubsets_model.result()
    numRows = result_frame_allsubsets.nrows
    best_r2_allsubsets = allsubsets_model.get_best_R2_values()
    result_frame_maxr = maxr_model.result()
    best_r2_maxr = maxr_model.get_best_R2_values()

    for ind in list(range(numRows)):
        # r2 from attributes
        best_r2_value_allsubsets = best_r2_allsubsets[ind]
        one_model_allsubsets = h2o.get_model(result_frame_allsubsets["model_id"][ind, 0])
        pred_allsubsets = one_model_allsubsets.predict(d)
        print("last element of predictor frame: {0}".format(
            pred_allsubsets[pred_allsubsets.nrows-1, pred_allsubsets.ncols-1]))
        assert pred_allsubsets.nrows == d.nrows, "expected dataset row: {0}, actual dataset row: " \
                                                 "{1}".format(pred_allsubsets.nrows, d.nrows)
        best_r2_value_maxr = best_r2_maxr[ind]
        one_model_maxr = h2o.get_model(result_frame_maxr["model_id"][ind, 0])
        pred_maxr = one_model_maxr.predict(d)
        pyunit_utils.compare_frames_local(pred_maxr, pred_allsubsets, prob=1, tol=1e-6)  # compare allsubsets and maxr results
        # r2 from result frame
        frame_r2_allsubsets = result_frame_allsubsets["best_r2_value"][ind, 0]
        # r2 from model
        model_r2_allsubsets = one_model_allsubsets.r2()
        # make sure all r2 are equal
        assert abs(best_r2_value_allsubsets - frame_r2_allsubsets) < 1e-6, \
            "expected best r2: {0}, actual best r2: {1}".format(best_r2_value_allsubsets, frame_r2_allsubsets)
        assert abs(frame_r2_allsubsets - model_r2_allsubsets) < 1e-6, \
            "expected best r2: {0}, actual best r2: {1}".format(model_r2_allsubsets, frame_r2_allsubsets)
        assert abs(best_r2_value_maxr - model_r2_allsubsets) < 1e-6, \
            "expected best r2: {0}, maxr best r2: {1}".format(best_r2_value_maxr, model_r2_allsubsets)

def _model_build(self, x, y, tframe, vframe, kwargs):
    kwargs['training_frame'] = tframe
    if vframe is not None: kwargs["validation_frame"] = vframe
    if is_type(y, int): y = tframe.names[y]
    if y is not None: kwargs['response_column'] = y
    if not is_type(x, list, tuple): x = [x]
    if is_type(x[0], int):
        x = [tframe.names[i] for i in x]
    offset = kwargs["offset_column"]
    folds = kwargs["fold_column"]
    weights = kwargs["weights_column"]
    ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
    kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
    kwargs = dict([(k, kwargs[k].frame_id if isinstance(kwargs[k], H2OFrame) else kwargs[k]) for k in kwargs
                   if kwargs[k] is not None])  # gruesome one-liner
    algo = self.model._compute_algo()  # unique to grid search
    if self.grid_id is not None: kwargs["grid_id"] = self.grid_id
    rest_ver = kwargs.pop("_rest_version") if "_rest_version" in kwargs else None

    grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))

    if self._future:
        self._job = grid
        return

    grid.poll()

    grid_json = h2o.api("GET /99/Grids/%s" % grid.dest_key)
    failure_messages_stacks = ""
    error_index = 0
    if len(grid_json["failure_details"]) > 0:
        print("Errors/Warnings building gridsearch model\n")
        # will raise error if no grid model is returned, store error messages here

        for error_message in grid_json["failure_details"]:
            if isinstance(grid_json["failed_params"][error_index], dict):
                for h_name in grid_json['hyper_names']:
                    print("Hyper-parameter: {0}, {1}".format(h_name,
                                                             grid_json['failed_params'][error_index][h_name]))

            if len(grid_json["failure_stack_traces"]) > error_index:
                print("failure_details: {0}\nfailure_stack_traces: "
                      "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))
            failure_messages_stacks += error_message + '\n'
            error_index += 1

    self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

    # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
    # sometimes no model is returned due to bad parameter values provided by the user.
    if len(grid_json['model_ids']) > 0:
        first_model_json = h2o.api("GET /%d/Models/%s"
                                   % (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
        self._resolve_grid(grid.dest_key, grid_json, first_model_json)
    else:
        if len(failure_messages_stacks) > 0:
            raise ValueError(failure_messages_stacks)
        else:
            raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")

def rename_things():
    fr = h2o.import_file(tests.locate("smalldata/logreg/prostate.csv"))
    fr.frame_id = "mooochooo"
    print h2o.ls()

    zz = fr[1:2]
    zz.show()
    zz.frame_id = "black_sheep_LLC"
    print h2o.ls()

    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    m = H2OGradientBoostingEstimator(ntrees=5, max_depth=2)
    m.train(x=fr.names[2:], y=fr.names[1], training_frame=fr)
    print m.model_id
    m.model_id = "my_gbm_model_wwwww"
    print h2o.ls()
    print h2o.get_model("my_gbm_model_wwwww")
    print h2o.ls()

def get_model_by_algo(algo, models_dict):
    # Return the last model whose id starts with the 3-letter algo prefix, plus its id.
    mod = None
    mod_id = None
    for m in list(models_dict.keys()):
        if m[0:3] == algo:
            mod_id = m
            mod = h2o.get_model(m)
    return mod, mod_id

def stackedensemble(mod):
    coef_norm = None
    try:
        metalearner = h2o.get_model(mod.metalearner()['name'])
        coef_norm = metalearner.coef_norm()  # only available for GLM metalearners
    except Exception:
        pass
    return coef_norm

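# --- Added illustrative call (not from the original source) for the stackedensemble helper above. ---
# `se` stands in for a trained H2OStackedEnsembleEstimator; coef_norm() exists only for GLM
# metalearners, which is exactly why the helper swallows the failure and returns None otherwise.
coefs = stackedensemble(se)
if coefs is not None:
    print(sorted(coefs.items(), key=lambda kv: -abs(kv[1]))[:5])  # largest standardized coefficients
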
def test_modelselection_gaussian_model_id():
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    allsubsets_model = modelSelection(seed=12345, max_predictor_number=7, mode="allsubsets")
    allsubsets_model.train(training_frame=d, x=my_x, y=my_y)
    result_frame_allsubsets = allsubsets_model.result()
    numRows = result_frame_allsubsets.nrows
    modelIDs_allsubsets = allsubsets_model._model_json["output"]["best_model_ids"]
    maxr_model = modelSelection(seed=12345, max_predictor_number=7, mode="maxr")
    maxr_model.train(training_frame=d, x=my_x, y=my_y)
    result_frame_maxr = maxr_model.result()
    maxrsweep_model = modelSelection(seed=12345, max_predictor_number=7, mode="maxrsweep")
    maxrsweep_model.train(training_frame=d, x=my_x, y=my_y)

    # make sure results returned by maxr and maxrsweep are the same
    pyunit_utils.compare_frames_local(maxr_model.result()[2:4], maxrsweep_model.result()[2:4], prob=1.0, tol=1e-6)

    for ind in list(range(numRows)):
        model_from_frame_allsubsets = h2o.get_model(result_frame_allsubsets["model_id"][ind, 0])
        pred_frame_allsubsets = model_from_frame_allsubsets.predict(d)
        model_from_frame_allsubsets = h2o.get_model(modelIDs_allsubsets[ind]['name'])
        pred_id_allsubsets = model_from_frame_allsubsets.predict(d)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets, pred_id_allsubsets, prob=1)
        model_from_frame_maxr = h2o.get_model(result_frame_maxr["model_id"][ind, 0])
        pred_frame_maxr = model_from_frame_maxr.predict(d)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets, pred_frame_maxr, prob=1, tol=1e-6)

def h2o_print_leaderboard(lb_frame, top_n=999999):
    df = lb_frame.as_data_frame()
    for i in range(0, min(top_n, df.shape[0])):
        model_id = df['model_id'][i]
        print(df[i:i+1].to_string(index=False))
        best_model = h2o.get_model(model_id)
        pprint(h2o_not_default_params_str(best_model))
        print()

def test_nfolds_eq_0():
    print("Check nfolds = 0 works properly")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_nfolds0",
                    nfolds=0,
                    max_models=3,
                    seed=1)
    aml.train(y=ds['target'], training_frame=ds['train'])
    _, non_se, _ = get_partitioned_model_names(aml.leaderboard)
    amodel = h2o.get_model(non_se[0])
    assert amodel.params['nfolds']['actual'] == 0

def check_ignore_cols_automl(models, names, x, y):
    models = sum(models.as_data_frame().values.tolist(), [])  # flatten the model-id column into a plain list
    for model in models:
        if "StackedEnsemble" in model:
            continue
        else:
            assert set(h2o.get_model(model).params["ignored_columns"]["actual"]) == set(names) - {y} - set(x), \
                "ignored columns are not honored for model " + model

def test_param_enabled():
    print("\n=== enabling "+kcvm+" ===")
    aml = setup_and_train(True)
    models, non_se, se = get_partitioned_model_names(aml.leaderboard)
    check_model_property(se, kcvm, False)
    check_model_property(non_se, kcvm, True, True, False)
    keys = list_keys_in_memory()
    tot, cv = len(keys['models']), len(keys['cv_models'])
    print("total models in memory = {tot}, among which {cv} CV models".format(tot=tot, cv=cv))
    assert tot > 0, "no models left in memory"
    expected = len(models) * nfolds
    assert cv == expected, "missing CV models in memory, got {actual}, expected {expected}".format(actual=cv, expected=expected)
    for m in non_se:
        assert h2o.get_model(m).cross_validation_models(), "missing cv models for model "+m
    for m in se:
        metal = h2o.get_model(h2o.get_model(m).metalearner()['name'])
        assert metal.cross_validation_models(), "missing cv models for metalearner of model "+m

def test_param_enabled():
    print("\n=== enabling "+kcvm+" ===")
    aml = setup_and_train(True)
    models, non_se, se = get_partitioned_model_names(aml.leaderboard)
    check_model_property(se, kcvm, False)
    check_model_property(non_se, kcvm, True, True, True)
    keys = list_keys_in_memory()
    tot, cv = len(keys['models']), len(keys['cv_models'])
    print("total models in memory = {tot}, among which {cv} CV models".format(tot=tot, cv=cv))
    assert tot > 0, "no models left in memory"
    expected = len(models) * nfolds
    assert cv == expected, "missing CV models in memory, got {actual}, expected {expected}".format(actual=cv, expected=expected)
    for m in non_se:
        assert h2o.get_model(m).cross_validation_models(), "missing cv models for model "+m
    for m in se:
        metal = h2o.get_model(h2o.get_model(m).metalearner()['name'])
        assert metal.cross_validation_models(), "missing cv models for metalearner of model "+m

def test_explanation_list_of_models_binomial_classification():
    train = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "CAPSULE"
    train[y] = train[y].asfactor()
    # get at most one column from each type
    cols_to_test = []
    for col, typ in train.types.items():
        for ctt in cols_to_test:
            if typ == train.types[ctt] or col == y:
                break
        else:
            cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    models = [h2o.get_model(m[0]) for m in
              aml.leaderboard["model_id"].as_data_frame(use_pandas=False, header=False)]

    # Test named models as well
    gbm = H2OGradientBoostingEstimator(model_id="my_awesome_model")
    gbm.train(y=y, training_frame=train)
    models += [gbm]

    # test variable importance heatmap plot
    assert isinstance(h2o.varimp_heatmap(models).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test model correlation heatmap plot
    assert isinstance(h2o.model_correlation_heatmap(models, train).figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close()

    # test partial dependences
    for col in cols_to_test:
        assert isinstance(h2o.pd_multi_plot(models, train, col).figure(), matplotlib.pyplot.Figure)
        matplotlib.pyplot.close()

    # test learning curve
    for model in models:
        assert isinstance(model.learning_curve_plot().figure(), matplotlib.pyplot.Figure)
    matplotlib.pyplot.close("all")

    # test explain
    assert isinstance(h2o.explain(models, train, render=False), H2OExplanation)

    # test explain row
    assert isinstance(h2o.explain_row(models, train, 1, render=False), H2OExplanation)

def iris_get_model():
    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50)
    model.show()

    model = h2o.get_model(model._id)
    model.show()

def iris_get_model():
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    model = H2ORandomForestEstimator(ntrees=50)
    model.train(y=4, x=list(range(4)), training_frame=iris)
    model.show()

    model = h2o.get_model(model._id)
    model.show()

def cross_validation_models(self):
    """
    Obtain a list of cross-validation models.

    :return: list of H2OModel objects
    """
    cvmodels = self._model_json["output"]["cross_validation_models"]
    if cvmodels is None:
        return None
    m = []
    for p in cvmodels:
        m.append(h2o.get_model(p["name"]))
    return m

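# --- Added usage sketch (not from the original source) for cross_validation_models above. ---
# Assumes `gbm` was trained with nfolds > 1 and keep_cross_validation_models left at its
# default of True, so the fold models are still in the cluster.
cv_models = gbm.cross_validation_models()
if cv_models is not None:
    for fold_model in cv_models:
        print(fold_model.model_id, fold_model.mse())
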
def iris_get_model(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    model = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50)
    model.show()

    model = h2o.get_model(model._id)
    model.show()

def _model_build(self, x, y, tframe, vframe, kwargs):
    kwargs['training_frame'] = tframe
    if vframe is not None: kwargs["validation_frame"] = vframe
    if isinstance(y, int): y = tframe.names[y]
    if y is not None: kwargs['response_column'] = y
    if not isinstance(x, (list, tuple)): x = [x]
    if isinstance(x[0], int):
        x = [tframe.names[i] for i in x]
    offset = kwargs["offset_column"]
    folds = kwargs["fold_column"]
    weights = kwargs["weights_column"]
    ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
    kwargs["ignored_columns"] = None if ignored_columns == [] else [h2o.h2o._quoted(col) for col in ignored_columns]
    kwargs = dict([(k, kwargs[k].frame_id if isinstance(kwargs[k], H2OFrame) else kwargs[k]) for k in kwargs
                   if kwargs[k] is not None])  # gruesome one-liner
    algo = self.model._compute_algo()  # unique to grid search
    kwargs["_rest_version"] = 99  # unique to grid search
    if self.grid_id is not None: kwargs["grid_id"] = self.grid_id

    grid = H2OJob(H2OConnection.post_json("Grid/"+algo, **kwargs), job_type=(algo+" Grid Build"))

    if self._future:
        self._job = grid
        return

    grid.poll()
    if '_rest_version' in list(kwargs.keys()):
        grid_json = H2OConnection.get_json("Grids/"+grid.dest_key, _rest_version=kwargs['_rest_version'])

        error_index = 0
        if len(grid_json["failure_details"]) > 0:
            print("Errors/Warnings building gridsearch model\n")

            for error_message in grid_json["failure_details"]:
                if isinstance(grid_json["failed_params"][error_index], dict):
                    for h_name in grid_json['hyper_names']:
                        print("Hyper-parameter: {0}, {1}".format(h_name,
                                                                 grid_json['failed_params'][error_index][h_name]))

                if len(grid_json["failure_stack_traces"]) > error_index:
                    print("failure_details: {0}\nfailure_stack_traces: "
                          "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))
                error_index += 1
    else:
        grid_json = H2OConnection.get_json("Grids/"+grid.dest_key)

    self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

    # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
    # sometimes no model is returned due to bad parameter values provided by the user.
    if len(grid_json['model_ids']) > 0:
        first_model_json = H2OConnection.get_json("Models/"+grid_json['model_ids'][0]['name'],
                                                  _rest_version=kwargs['_rest_version'])['models'][0]
        self._resolve_grid(grid.dest_key, grid_json, first_model_json)
    else:
        raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")

def get_modelGBM(ip, port):
    prostate = h2o.import_file(path=h2o.locate("smalldata/logreg/prostate.csv"))
    prostate.describe()
    prostate[1] = prostate[1].asfactor()

    prostate_gbm = h2o.gbm(y=prostate[1], x=prostate[2:9], distribution="bernoulli")
    prostate_gbm.show()

    prostate_gbm.predict(prostate)
    model = h2o.get_model(prostate_gbm._id)
    model.show()

def get_model_gbm():
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate.describe()
    prostate[1] = prostate[1].asfactor()

    prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli")
    prostate_gbm.train(x=list(range(2, 9)), y=1, training_frame=prostate)
    prostate_gbm.show()

    prostate_gbm.predict(prostate)
    model = h2o.get_model(prostate_gbm.model_id)
    model.show()

def check_model_property(model_names, prop_name, present=True, actual_value=None, default_value=None):
    for mn in model_names:
        model = h2o.get_model(mn)
        if present:
            assert prop_name in model.params.keys(), \
                "missing {prop} in model {model}".format(prop=prop_name, model=mn)
            assert actual_value is None or model.params[prop_name]['actual'] == actual_value, \
                "actual value for {prop} in model {model} is {val}, expected {exp}".format(
                    prop=prop_name, model=mn, val=model.params[prop_name]['actual'], exp=actual_value)
            assert default_value is None or model.params[prop_name]['default'] == default_value, \
                "default value for {prop} in model {model} is {val}, expected {exp}".format(
                    prop=prop_name, model=mn, val=model.params[prop_name]['default'], exp=default_value)
        else:
            assert prop_name not in model.params.keys(), \
                "unexpected {prop} in model {model}".format(prop=prop_name, model=mn)