import tempfile
import time
from collections import OrderedDict

import numpy as np

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.grid.grid_search import H2OGridSearch
from tests import pyunit_utils  # import path may need adjusting to the local test harness


def grid_resume():
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM grid search
    ntrees_opts = [1, 3]
    learn_rate_opts = [0.1, 0.05]
    hyper_parameters = {"learn_rate": learn_rate_opts, "ntrees": ntrees_opts}
    print("GBM grid with the following hyper_parameters:", hyper_parameters)
    export_dir = pyunit_utils.locate("results") + "/grid_resume_new_hyperspace_1"
    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)
    grid_id = gs.grid_id
    old_grid_model_count = len(gs.model_ids)
    print("Baseline grid has %d models" % old_grid_model_count)
    saved_path = h2o.save_grid(export_dir, grid_id)
    h2o.remove_all()

    # Reload everything and restart the grid
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    grid = h2o.load_grid(saved_path)
    assert grid is not None
    assert len(grid.model_ids) == old_grid_model_count

    # Modify the hyperspace - should add new models to the grid
    hyper_parameters["ntrees"] = [2, 5]
    grid = H2OGridSearch(H2OGradientBoostingEstimator,
                         hyper_params=hyper_parameters,
                         grid_id=grid.grid_id)
    grid.train(x=list(range(4)), y=4, training_frame=train)
    print("Newly trained grid has %d models" % len(grid.model_ids))
    assert len(grid.model_ids) == 2 * old_grid_model_count
    for model_id in grid.model_ids:
        model = h2o.get_model(model_id)
        assert model is not None

    # Save again, this time including referenced objects (e.g. the training frame)
    export_dir2 = pyunit_utils.locate("results") + "/grid_resume_new_hyperspace_2"
    saved_path2 = h2o.save_grid(export_dir2, grid_id, save_params_references=True)
    h2o.remove_all()
    grid = h2o.load_grid(saved_path2, load_params_references=True)

    # Extend the hyperspace once more; load_params_references=True restored
    # the training frame into the cluster along with the grid
    hyper_parameters["ntrees"] = [6]
    grid.hyper_params = hyper_parameters
    grid.train(x=list(range(4)), y=4, training_frame=train)
    print("Newly trained grid has %d models" % len(grid.model_ids))
    assert len(grid.model_ids) == (2 * old_grid_model_count) + 2
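# The usual pyunit entry point for running this test standalone; a minimal
# sketch assuming the standard h2o-3 pyunit_utils harness (adjust if the
# surrounding test file wires this up differently).
if __name__ == "__main__":
    pyunit_utils.standalone_test(grid_resume)
else:
    grid_resume()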
def grid_resume():
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM grid search
    ntrees_opts = [1, 3]
    learn_rate_opts = [0.1, 0.01, 0.05]
    hyper_parameters = OrderedDict()
    hyper_parameters["learn_rate"] = learn_rate_opts
    hyper_parameters["ntrees"] = ntrees_opts
    print("GBM grid with the following hyper_parameters:", hyper_parameters)
    export_dir = pyunit_utils.locate("results")
    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)
    grid_id = gs.grid_id
    old_grid_model_count = len(gs.model_ids)
    print("Baseline grid has %d models" % old_grid_model_count)
    saved_path = h2o.save_grid(export_dir, grid_id)
    h2o.remove_all()

    # Reload the grid and retrain with an unchanged hyperspace - no new models expected
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    grid = h2o.load_grid(saved_path)
    assert grid is not None
    assert len(grid.model_ids) == old_grid_model_count
    grid.train(x=list(range(4)), y=4, training_frame=train)
    print("Newly trained grid has %d models" % len(grid.model_ids))
    assert len(grid.model_ids) == old_grid_model_count
    for model_id in grid.model_ids:
        model = h2o.get_model(model_id)
        assert model is not None
def grid_export_with_cv():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM grid search with cross-validation
    hyper_parameters = OrderedDict()
    hyper_parameters["ntrees"] = [1, 2]
    gs = H2OGridSearch(H2OGradientBoostingEstimator(nfolds=2,
                                                    keep_cross_validation_predictions=True,
                                                    seed=42),
                       hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)
    # materialize the ids with list() so the lazy map is evaluated before h2o.remove_all()
    holdout_frame_ids = list(map(lambda m: m.cross_validation_holdout_predictions().frame_id,
                                 gs.models))
    export_dir = pyunit_utils.locate("results")
    saved_path = h2o.save_grid(export_dir, gs.grid_id, export_cross_validation_predictions=True)
    h2o.remove_all()

    grid = h2o.load_grid(saved_path)
    assert grid is not None
    for holdout_frame_id in holdout_frame_ids:
        assert h2o.get_frame(holdout_frame_id) is not None

    # The CV holdout predictions were restored, so the grid models can serve
    # as base models for a stacked ensemble
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    stack = H2OStackedEnsembleEstimator(base_models=grid.model_ids)
    stack.train(x=list(range(4)), y=4, training_frame=train)
    predicted = stack.predict(train)
    assert predicted.nrow == train.nrow
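# A small helper (hypothetical, not part of the original test) that makes the
# holdout_frame_ids line above explicit: collect the CV holdout prediction
# frame ids eagerly, so the plain string ids remain usable after
# h2o.remove_all() wipes the cluster.
def cv_holdout_frame_ids(models):
    # cross_validation_holdout_predictions() is only available on models
    # trained with keep_cross_validation_predictions=True
    return [m.cross_validation_holdout_predictions().frame_id for m in models]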
def test_frame_reload(self):
    # "utils" here is the hadoop test harness module provided by the surrounding suite
    name_node = pyunit_utils.hadoop_namenode()
    work_dir = "hdfs://%s%s" % (name_node, utils.get_workdir())
    dataset = "/datasets/iris_wheader.csv"

    ntrees_opts = [100, 120, 130, 140]
    learn_rate_opts = [0.01, 0.02, 0.03, 0.04]
    grid_size = len(ntrees_opts) * len(learn_rate_opts)
    print("max models %s" % grid_size)
    grid_id = "grid_ft_resume"
    hyper_parameters = {
        "learn_rate": learn_rate_opts,
        "ntrees": ntrees_opts
    }

    cluster_1_name = "grid1-py"
    try:
        cluster_1 = utils.start_cluster(cluster_1_name)
        h2o.connect(url=cluster_1)
        train = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
        grid = H2OGridSearch(H2OGradientBoostingEstimator,
                             grid_id=grid_id,
                             hyper_params=hyper_parameters,
                             recovery_dir=work_dir)
        print("starting initial grid and sleeping...")
        grid.start(x=list(range(4)), y=4, training_frame=train)
        grid_in_progress = None
        times_waited = 0
        while (times_waited < 20) and (grid_in_progress is None
                                       or len(grid_in_progress.model_ids) == 0):
            time.sleep(5)  # give it time to train some models
            times_waited += 1
            try:
                grid_in_progress = h2o.get_grid(grid_id)
            except IndexError:
                print("no models trained yet")
        print("done sleeping")
        h2o.connection().close()
    finally:
        utils.stop_cluster(cluster_1_name)

    cluster_2_name = "grid2-py"
    try:
        cluster_2 = utils.start_cluster(cluster_2_name)
        h2o.connect(url=cluster_2)
        loaded = h2o.load_grid("%s/%s" % (work_dir, grid_id),
                               load_params_references=True)
        print("models after first run:")
        for x in sorted(loaded.model_ids):
            print(x)
        loaded.resume()
        print("models after second run:")
        for x in sorted(loaded.model_ids):
            print(x)
        print("Newly trained grid has %d models" % len(loaded.model_ids))
        self.assertEqual(len(loaded.model_ids), grid_size,
                         "The full grid was not trained.")
        h2o.connection().close()
    finally:
        utils.stop_cluster(cluster_2_name)
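# The inline wait loop above, factored into a reusable polling helper; a
# sketch using only the public h2o.get_grid API (the helper name and defaults
# are assumptions, not part of the original test).
def wait_for_first_model(grid_id, max_tries=20, interval=5):
    for _ in range(max_tries):
        try:
            grid = h2o.get_grid(grid_id)
            if len(grid.model_ids) > 0:
                return grid
        except IndexError:
            pass  # the grid is not registered on the backend yet
        time.sleep(interval)
    return None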
def test_resume_with_recovery():
    export_dir = tempfile.mkdtemp()
    grid_id = "resume_with_recovery_gbm"
    print("Using directory %s" % export_dir)
    hyper_parameters = {
        "learn_rate": [0.01, 0.05],
        "ntrees": [100, 110, 120, 130]
    }
    grid_size = 1
    for p in hyper_parameters:
        grid_size *= len(hyper_parameters[p])
    print("Grid size %d" % grid_size)

    print("Starting baseline grid")
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
    df["Angaus"] = df["Angaus"].asfactor()
    df["Weights"] = h2o.H2OFrame.from_python(abs(np.random.randn(df.nrow, 1)).tolist())[0]
    train, calib = df.split_frame(ratios=[.8],
                                  destination_frames=["eco_train", "eco_calib"],
                                  seed=42)
    params = {
        "distribution": "bernoulli",
        "min_rows": 10,
        "max_depth": 5,
        "weights_column": "Weights",
        "calibrate_model": True,
        "calibration_frame": calib
    }
    recovery_dir_1 = export_dir + "/recovery_1"
    grid = H2OGridSearch(
        H2OGradientBoostingEstimator,
        grid_id=grid_id,
        hyper_params=hyper_parameters,
        recovery_dir=recovery_dir_1
    )
    grid.start(x=list(range(2, train.ncol)), y="Angaus", training_frame=train, **params)
    grid_1_model_count = _wait_for_grid_models(grid, grid_id, 1, grid_size)
    loaded = h2o.load_grid("%s/%s" % (recovery_dir_1, grid_id), load_params_references=True)
    _check_grid_loaded_properly(loaded, train, grid_1_model_count)

    print("Resuming grid")
    recovery_dir_2 = export_dir + "/recovery_2"
    loaded.resume(detach=True, recovery_dir=recovery_dir_2)
    grid_2_model_count = _wait_for_grid_models(loaded, grid_id, len(loaded.model_ids) + 1, grid_size)
    loaded_2 = h2o.load_grid("%s/%s" % (recovery_dir_2, grid_id), load_params_references=True)
    _check_grid_loaded_properly(loaded_2, train, grid_2_model_count)

    print("Resuming grid to finish")
    loaded_2.resume()
    print("Finished grid has %d models" % len(loaded_2.model_ids))
    assert grid_size == len(loaded_2.model_ids)
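# _wait_for_grid_models and _check_grid_loaded_properly are defined elsewhere
# in the test file; a plausible sketch of their contracts as used above
# (signatures inferred from the call sites, bodies are assumptions).
def _wait_for_grid_models(grid, grid_id, at_least, grid_size):
    # wait until the detached grid has trained at_least models, then cancel it
    while len(h2o.get_grid(grid_id).model_ids) < at_least:
        time.sleep(1)
    grid.cancel()
    model_count = len(h2o.get_grid(grid_id).model_ids)
    assert model_count < grid_size, "The full grid should not have finished yet."
    return model_count


def _check_grid_loaded_properly(loaded, train, expected_model_count):
    assert loaded is not None
    assert len(loaded.model_ids) == expected_model_count
    # the training frame should have been restored by load_params_references=True
    assert h2o.H2OFrame.get_frame(train.frame_id) is not None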
def glrm_grid_user_y():
    export_dir = tempfile.mkdtemp()
    train_data = np.dot(np.random.rand(1000, 10), np.random.rand(10, 100))
    train = h2o.H2OFrame(train_data.tolist(), destination_frame="glrm_train")
    initial_y_data = np.random.rand(10, 100)
    initial_y_h2o = h2o.H2OFrame(initial_y_data.tolist(), destination_frame="glrm_initial_y")
    params = {
        "k": 10,
        "init": "User",
        "user_y": initial_y_h2o,
        "loss": "Quadratic",
        "regularization_x": "OneSparse",
        "regularization_y": "NonNegative"
    }
    hyper_params = {
        "transform": ["NONE", "STANDARDIZE"],
        "gamma_x": [0.1],
    }

    # train grid
    grid = H2OGridSearch(
        H2OGeneralizedLowRankEstimator,
        hyper_params=hyper_params
    )
    grid.train(x=train.names, training_frame=train, **params)
    print("first grid")
    print(grid)
    assert len(grid.model_ids) == 2
    archetypes1 = grid.models[0].archetypes()
    archetypes2 = grid.models[1].archetypes()
    grid_path = h2o.save_grid(export_dir, grid.grid_id)
    h2o.remove_all()

    # re-import the frames under the same destination ids and train some more
    train = h2o.H2OFrame(train_data.tolist(), destination_frame="glrm_train")
    initial_y = h2o.H2OFrame(initial_y_data.tolist(), destination_frame="glrm_initial_y")
    grid = h2o.load_grid(grid_path)
    grid.hyper_params["gamma_x"] = [0.1, 1]
    grid.train(x=train.names, training_frame=train, **params)
    print("second grid")
    print(grid)
    assert len(grid.model_ids) == 4

    # check that actual training occurred and the results differ:
    # the first two models are unchanged ...
    assert grid.models[0].archetypes() == archetypes1
    assert grid.models[1].archetypes() == archetypes2
    # ... while the new gamma_x value must produce different archetypes;
    # instead of a full comparison, just check that the first archetype rows
    # of consecutive models are not equal
    assert not pyunit_utils.equal_two_arrays(grid.models[1].archetypes()[0],
                                             grid.models[2].archetypes()[0],
                                             throw_error=False)
    assert not pyunit_utils.equal_two_arrays(grid.models[2].archetypes()[0],
                                             grid.models[3].archetypes()[0],
                                             throw_error=False)
def grid_ft_resume(train, grid_id, params, hyper_parameters, start_grid):
    print("TESTING %s\n-------------------" % grid_id)
    export_dir = tempfile.mkdtemp()
    print("Using directory %s" % export_dir)
    grid_size = 1
    for p in hyper_parameters:
        grid_size *= len(hyper_parameters[p])
    print("Grid size %d" % grid_size)

    print("Starting baseline grid")
    grid = start_grid(grid_id, export_dir, train, params, hyper_parameters)
    grid_in_progress = None
    times_waited = 0
    while (times_waited < 3000) and (grid_in_progress is None
                                     or len(grid_in_progress.model_ids) == 0):
        time.sleep(0.1)  # give it time to train some models
        times_waited += 1
        try:
            grid_in_progress = h2o.get_grid(grid_id)
        except IndexError:
            if times_waited % 100 == 0:
                print("no models trained yet after %ss" % (times_waited / 10))
    grid.cancel()

    grid = h2o.get_grid(grid_id)
    old_grid_model_count = len(grid.model_ids)
    print("Baseline grid has %d models:" % old_grid_model_count)
    assert old_grid_model_count < grid_size, "The full grid should not have finished yet."
    for x in sorted(grid.model_ids):
        print(x)
    h2o.remove_all()

    loaded = h2o.load_grid("%s/%s" % (export_dir, grid_id), load_params_references=True)
    assert loaded is not None
    assert len(loaded.model_ids) == old_grid_model_count
    loaded_train = h2o.H2OFrame.get_frame(train.frame_id)
    assert loaded_train is not None, "Train frame was not loaded"

    print("Starting final grid")
    loaded.resume()
    print("Newly trained grid has %d models:" % len(loaded.model_ids))
    for x in sorted(loaded.model_ids):
        print(x)
    assert len(loaded.model_ids) == grid_size, "The full grid was not trained."
    h2o.remove_all()
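# grid_ft_resume takes a start_grid callback so the same resume flow can be
# exercised with different algorithms; a hypothetical GBM example of the
# expected contract (the name and column choices are illustrative only):
def start_gbm_grid(grid_id, export_dir, train, params, hyper_parameters):
    grid = H2OGridSearch(H2OGradientBoostingEstimator,
                         grid_id=grid_id,
                         hyper_params=hyper_parameters,
                         recovery_dir=export_dir)
    # start() trains asynchronously, so the caller can cancel mid-flight
    grid.start(x=list(range(4)), y=4, training_frame=train, **params)
    return grid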
def grid_resume():
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM grid search with automatic checkpoint export
    hyper_parameters_1 = {"ntrees": [10, 50], "learn_rate": [0.01, 0.1]}
    grid_size_1 = len(hyper_parameters_1["ntrees"]) * len(hyper_parameters_1["learn_rate"])
    print("Training GBM grid with the following hyper_parameters:", hyper_parameters_1)
    export_dir = tempfile.mkdtemp()
    grid = H2OGridSearch(H2OGradientBoostingEstimator,
                         hyper_params=hyper_parameters_1,
                         export_checkpoints_dir=export_dir)
    grid.train(x=list(range(4)), y=4, training_frame=train)
    grid_id = grid.grid_id
    model_count_1 = len(grid.model_ids)
    print(grid)
    assert len(grid.model_ids) == grid_size_1, "There should be %d models" % grid_size_1
    print("Baseline grid has %d models" % model_count_1)
    h2o.remove_all()

    # start over from the exported checkpoints
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    grid = h2o.load_grid(export_dir + "/" + grid_id)
    assert len(grid.model_ids) == model_count_1
    hyper_parameters_2 = {"ntrees": [10, 20, 50], "learn_rate": [0.01, 0.1]}
    grid.hyper_params = hyper_parameters_2
    print("Training GBM grid with the following hyper_parameters:", hyper_parameters_2)
    grid.train(x=list(range(4)), y=4, training_frame=train)
    grid_size_2 = len(hyper_parameters_2["ntrees"]) * len(hyper_parameters_2["learn_rate"])
    print(grid)
    assert len(grid.model_ids) == grid_size_2, "There should be %d models" % grid_size_2
    print("Newly trained grid has %d models" % len(grid.model_ids))
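# The grid-size arithmetic above, generalized; a hypothetical helper that
# computes the cartesian-product size of any hyperparameter dictionary:
def cartesian_grid_size(hyper_params):
    size = 1
    for opts in hyper_params.values():
        size *= len(opts)
    return size  # e.g. cartesian_grid_size(hyper_parameters_2) == 6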
def grid_resume():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    hyper_parameters = {
        "learn_rate": [0.1, 0.01, 0.05],
        "ntrees": [10, 20]
    }
    export_dir = 'hdfs:///user/jenkins/grid_export_py'
    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)
    grid_id = gs.grid_id
    old_grid_model_count = len(gs.model_ids)
    print("Baseline grid has %d models" % old_grid_model_count)
    saved_path = h2o.save_grid(export_dir, grid_id)
    h2o.remove_all()

    # reload the grid from HDFS and retrain with an unchanged hyperspace -
    # no new models expected
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    grid = h2o.load_grid(saved_path)
    assert grid is not None
    assert len(grid.model_ids) == old_grid_model_count
    grid.train(x=list(range(4)), y=4, training_frame=train)
    print("Newly trained grid has %d models" % len(grid.model_ids))
    assert len(grid.model_ids) == old_grid_model_count
def glrm_grid_user_y():
    export_dir = tempfile.mkdtemp()
    train_data = np.dot(np.random.rand(1000, 10), np.random.rand(10, 100))
    train = h2o.H2OFrame(train_data.tolist(), destination_frame="glrm_train")
    initial_y_data = np.random.rand(10, 100)
    initial_y_h2o = h2o.H2OFrame(initial_y_data.tolist(), destination_frame="glrm_initial_y")
    numArchetypes = 10
    params = {
        "k": numArchetypes,
        "init": "User",
        "user_y": initial_y_h2o,
        "loss": "Quadratic",
        "regularization_x": "OneSparse",
        "regularization_y": "NonNegative",
        "seed": 12345
    }
    hyper_params = {
        "transform": ["NONE", "STANDARDIZE"],
        "gamma_x": [0.1],
    }

    # train grid
    grid = H2OGridSearch(H2OGeneralizedLowRankEstimator, hyper_params=hyper_params)
    grid.train(x=train.names, training_frame=train, **params)
    print("first grid")
    print(grid)
    assert len(grid.model_ids) == 2
    if grid.models[0].actual_params['transform'] == 'STANDARDIZE':
        archetypes0p1Standardize = grid.models[0].archetypes()
        archetypes0p1None = grid.models[1].archetypes()
    else:
        archetypes0p1Standardize = grid.models[1].archetypes()
        archetypes0p1None = grid.models[0].archetypes()
    grid_path = h2o.save_grid(export_dir, grid.grid_id)
    h2o.remove_all()

    # re-import the frames under the same destination ids and train some more
    train = h2o.H2OFrame(train_data.tolist(), destination_frame="glrm_train")
    initial_y = h2o.H2OFrame(initial_y_data.tolist(), destination_frame="glrm_initial_y")
    grid = h2o.load_grid(grid_path)
    grid.hyper_params["gamma_x"] = [0.1, 1]
    grid.train(x=train.names, training_frame=train, **params)
    print("second grid")
    print(grid)
    assert len(grid.model_ids) == 4

    # check actual training occurred and results are different: the gamma_x=0.1
    # models must be unchanged, and the gamma_x=1 archetypes are collected under
    # the matching transform (the original assignments were swapped)
    for oneGridModel in grid.models:
        gamma_x = oneGridModel.actual_params['gamma_x']
        transform = oneGridModel.actual_params['transform']
        if gamma_x == 0.1 and transform == 'STANDARDIZE':
            assert oneGridModel.archetypes() == archetypes0p1Standardize
        if gamma_x == 0.1 and transform == 'NONE':
            assert oneGridModel.archetypes() == archetypes0p1None
        if gamma_x == 1 and transform == 'STANDARDIZE':
            archetypes1Standardize = oneGridModel.archetypes()
        if gamma_x == 1 and transform == 'NONE':
            archetypes1None = oneGridModel.archetypes()

    archetypesNotEqual12 = not all(
        pyunit_utils.equal_two_arrays(archetypes1None[i], archetypes0p1None[i],
                                      throw_error=False)
        for i in range(numArchetypes))
    assert archetypesNotEqual12
    archetypesNotEqual23 = not all(
        pyunit_utils.equal_two_arrays(archetypes1Standardize[i],
                                      archetypes0p1Standardize[i],
                                      throw_error=False)
        for i in range(numArchetypes))
    assert archetypesNotEqual23
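# An equivalent archetype comparison using numpy instead of
# pyunit_utils.equal_two_arrays; a sketch assuming archetypes() rows convert
# cleanly to numeric arrays (the helper name and tolerance are assumptions):
def archetypes_differ(a, b, tol=1e-6):
    # True when at least one entry differs beyond the tolerance
    return not np.allclose(np.asarray(a, dtype=float),
                           np.asarray(b, dtype=float),
                           atol=tol)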