Example #1
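These snippets are h2o-3 pyunit-style tests; a minimal sketch of the imports
they collectively assume (estimator and grid-search paths per h2o-py; the
`tests.pyunit_utils` helper only resolves inside the h2o-3 source tree):

import time
import tempfile
from collections import OrderedDict

import h2o
import numpy as np
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator
from h2o.grid.grid_search import H2OGridSearch
from tests import pyunit_utils  # test helper; import path is an assumption
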
def grid_resume():
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM Grid Search
    ntrees_opts = [1, 3]
    learn_rate_opts = [0.1, 0.05]
    hyper_parameters = {"learn_rate": learn_rate_opts, "ntrees": ntrees_opts}
    print("GBM grid with the following hyper_parameters:", hyper_parameters)

    export_dir = pyunit_utils.locate(
        "results") + "/grid_resume_new_hyperspace_1"
    gs = H2OGridSearch(H2OGradientBoostingEstimator,
                       hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)
    grid_id = gs.grid_id
    old_grid_model_count = len(gs.model_ids)
    print("Baseline grid has %d models" % old_grid_model_count)
    saved_path = h2o.save_grid(export_dir, grid_id)
    h2o.remove_all()

    # reload everything and restart grid
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    grid = h2o.load_grid(saved_path)
    assert grid is not None
    assert len(grid.model_ids) == old_grid_model_count
    # Modify the hyperspace - should add new models to the grid
    hyper_parameters["ntrees"] = [2, 5]
    grid = H2OGridSearch(H2OGradientBoostingEstimator,
                         hyper_params=hyper_parameters,
                         grid_id=grid.grid_id)
    grid.train(x=list(range(4)), y=4, training_frame=train)
    print("Newly grained grid has %d models" % len(grid.model_ids))
    assert len(grid.model_ids) == 2 * old_grid_model_count

    for model_id in grid.model_ids:
        model = h2o.get_model(model_id)
        assert model is not None

    export_dir2 = pyunit_utils.locate(
        "results") + "/grid_resume_new_hyperspace_2"
    saved_path2 = h2o.save_grid(export_dir2,
                                grid_id,
                                save_params_references=True)
    h2o.remove_all()
    grid = h2o.load_grid(saved_path2, load_params_references=True)
    hyper_parameters["ntrees"] = [6]
    grid.hyper_params = hyper_parameters
    grid.train(x=list(range(4)), y=4, training_frame=train)
    print("Newly grained grid has %d models" % len(grid.model_ids))
    assert len(grid.model_ids) == (2 * old_grid_model_count) + 2
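
A typical runner for these pyunit test functions, following the standard h2o-3
harness (shown once here; the other examples are invoked the same way):

if __name__ == "__main__":
    pyunit_utils.standalone_test(grid_resume)
else:
    grid_resume()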
Example #2
def grid_resume():
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM Grid Search
    ntrees_opts = [1, 3]
    learn_rate_opts = [0.1, 0.01, 0.05]
    hyper_parameters = OrderedDict()
    hyper_parameters["learn_rate"] = learn_rate_opts
    hyper_parameters["ntrees"] = ntrees_opts
    print("GBM grid with the following hyper_parameters:", hyper_parameters)

    export_dir = pyunit_utils.locate("results")
    gs = H2OGridSearch(H2OGradientBoostingEstimator,
                       hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)
    grid_id = gs.grid_id
    old_grid_model_count = len(gs.model_ids)
    print("Baseline grid has %d models" % old_grid_model_count)
    saved_path = h2o.save_grid(export_dir, grid_id)
    h2o.remove_all()

    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    grid = h2o.load_grid(saved_path)
    assert grid is not None
    assert len(grid.model_ids) == old_grid_model_count
    grid.train(x=list(range(4)), y=4, training_frame=train)
    print("Newly grained grid has %d models" % len(grid.model_ids))
    assert len(grid.model_ids) == old_grid_model_count

    for model_id in grid.model_ids:
        model = h2o.get_model(model_id)
        assert model is not None
Example #3
def grid_export_with_cv():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM Grid Search
    hyper_parameters = OrderedDict()
    hyper_parameters["ntrees"] = [1, 2]

    # train with CV
    gs = H2OGridSearch(H2OGradientBoostingEstimator(nfolds=2, keep_cross_validation_predictions=True, seed=42),
                       hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)

    # materialize the ids now: a lazy map would only evaluate after remove_all()
    holdout_frame_ids = [m.cross_validation_holdout_predictions().frame_id for m in gs.models]

    export_dir = pyunit_utils.locate("results")
    saved_path = h2o.save_grid(export_dir, gs.grid_id, export_cross_validation_predictions=True)

    h2o.remove_all()

    grid = h2o.load_grid(saved_path)

    assert grid is not None
    for holdout_frame_id in holdout_frame_ids:
        assert h2o.get_frame(holdout_frame_id) is not None

    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    stack = H2OStackedEnsembleEstimator(base_models=grid.model_ids)
    stack.train(x=list(range(4)), y=4, training_frame=train)

    predicted = stack.predict(train)
    assert predicted.nrow == train.nrow
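
Example #3 additionally assumes the stacked-ensemble import, which in h2o-py is:

from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

Note the export_cross_validation_predictions=True flag on h2o.save_grid: it
persists the CV holdout-prediction frames so they survive remove_all() and the
stacked ensemble can be trained from the reloaded grid's models.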
Example #4
    def test_frame_reload(self):
        name_node = pyunit_utils.hadoop_namenode()
        work_dir = "hdfs://%s%s" % (name_node, utils.get_workdir())
        dataset = "/datasets/iris_wheader.csv"

        ntrees_opts = [100, 120, 130, 140]
        learn_rate_opts = [0.01, 0.02, 0.03, 0.04]
        grid_size = len(ntrees_opts) * len(learn_rate_opts)
        print("max models %s" % grid_size)
        grid_id = "grid_ft_resume"
        hyper_parameters = {
            "learn_rate": learn_rate_opts,
            "ntrees": ntrees_opts
        }

        cluster_1_name = "grid1-py"
        try:
            cluster_1 = utils.start_cluster(cluster_1_name)
            h2o.connect(url=cluster_1)
            train = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
            grid = H2OGridSearch(H2OGradientBoostingEstimator,
                                 grid_id=grid_id,
                                 hyper_params=hyper_parameters,
                                 recovery_dir=work_dir)
            print("starting initial grid and sleeping...")
            grid.start(x=list(range(4)), y=4, training_frame=train)
            grid_in_progress = None
            times_waited = 0
            while (times_waited < 20) and (grid_in_progress is None or len(
                    grid_in_progress.model_ids) == 0):
                time.sleep(5)  # give it time to train some models
                times_waited += 1
                try:
                    grid_in_progress = h2o.get_grid(grid_id)
                except IndexError:
                    print("no models trained yet")
            print("done sleeping")
            h2o.connection().close()
        finally:
            utils.stop_cluster(cluster_1_name)

        cluster_2_name = "grid2-py"
        try:
            cluster_2 = utils.start_cluster(cluster_2_name)
            h2o.connect(url=cluster_2)
            loaded = h2o.load_grid("%s/%s" % (work_dir, grid_id),
                                   load_params_references=True)
            print("models after first run:")
            for x in sorted(loaded.model_ids):
                print(x)
            loaded.resume()
            print("models after second run:")
            for x in sorted(loaded.model_ids):
                print(x)
            print("Newly grained grid has %d models" % len(loaded.model_ids))
            self.assertEqual(len(loaded.model_ids), grid_size,
                             "The full grid was not trained.")
            h2o.connection().close()
        finally:
            utils.stop_cluster(cluster_2_name)
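
The `utils` module in Example #4 (start_cluster, stop_cluster, get_workdir) is
a Hadoop-specific helper from the h2o-3 test environment, not part of the
public h2o API; those calls are environment-dependent.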
Example #5
def test_resume_with_recovery():
    export_dir = tempfile.mkdtemp()
    grid_id = "resume_with_recovery_gbm"
    print("Using directory %s" % export_dir)
    hyper_parameters = {
        "learn_rate": [0.01, 0.05],
        "ntrees": [100, 110, 120, 130]
    }
    grid_size = 1
    for p in hyper_parameters:
        grid_size *= len(hyper_parameters[p])
    print("Grid size %d" % grid_size)
    print("Starting baseline grid")

    df = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
    df["Angaus"] = df["Angaus"].asfactor()
    df["Weights"] = h2o.H2OFrame.from_python(abs(np.random.randn(df.nrow, 1)).tolist())[0]
    train, calib = df.split_frame(ratios=[.8], destination_frames=["eco_train", "eco_calib"], seed=42)
    params = {
        "distribution": "bernoulli", "min_rows": 10, "max_depth": 5,
        "weights_column": "Weights",
        "calibrate_model": True, 
        "calibration_frame": calib
    }
    recovery_dir_1 = export_dir + "/recovery_1"
    grid = H2OGridSearch(
        H2OGradientBoostingEstimator,
        grid_id=grid_id,
        hyper_params=hyper_parameters,
        recovery_dir=recovery_dir_1
    )
    grid.start(x=list(range(2, train.ncol)), y="Angaus", training_frame=train, **params)
    grid_1_model_count = _wait_for_grid_models(grid, grid_id, 1, grid_size)

    loaded = h2o.load_grid("%s/%s" % (recovery_dir_1, grid_id), load_params_references=True)
    _check_grid_loaded_properly(loaded, train, grid_1_model_count)
    print("Resuming grid")
    recovery_dir_2 = export_dir + "/recovery_2"
    loaded.resume(detach=True, recovery_dir=recovery_dir_2)
    grid_2_model_count = _wait_for_grid_models(loaded, grid_id, len(loaded.model_ids) + 1, grid_size)

    loaded_2 = h2o.load_grid("%s/%s" % (recovery_dir_2, grid_id), load_params_references=True)
    _check_grid_loaded_properly(loaded_2, train, grid_2_model_count)
    print("Resuming grid to finish")
    loaded_2.resume()
    print("Finished grid has %d models" % len(loaded_2.model_ids))
    assert grid_size == len(loaded_2.model_ids)
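
The helpers `_wait_for_grid_models` and `_check_grid_loaded_properly` are not
shown above; a minimal sketch of plausible implementations, modeled on the
polling loop in Example #7 below (an assumption, not the original code):

def _wait_for_grid_models(grid, grid_id, min_models, grid_size):
    # poll until the grid has trained at least `min_models` models, then cancel it
    model_count = 0
    times_waited = 0
    while times_waited < 3000 and model_count < min_models:
        time.sleep(0.1)
        times_waited += 1
        try:
            model_count = len(h2o.get_grid(grid_id).model_ids)
        except IndexError:
            pass  # no models trained yet
    grid.cancel()
    model_count = len(h2o.get_grid(grid_id).model_ids)
    assert model_count < grid_size, "The grid should not have finished yet."
    return model_count

def _check_grid_loaded_properly(loaded, train, expected_model_count):
    # the reloaded grid should exist, keep its models, and resolve its training frame
    assert loaded is not None
    assert len(loaded.model_ids) == expected_model_count
    assert h2o.H2OFrame.get_frame(train.frame_id) is not None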
Example #6
def glrm_grid_user_y():
    export_dir = tempfile.mkdtemp()
    train_data = np.dot(np.random.rand(1000, 10), np.random.rand(10, 100))
    train = h2o.H2OFrame(train_data.tolist(), destination_frame="glrm_train")
    initial_y_data = np.random.rand(10, 100)
    initial_y_h2o = h2o.H2OFrame(initial_y_data.tolist(), destination_frame="glrm_initial_y")
    params = {
        "k": 10,
        "init": "User",
        "user_y": initial_y_h2o,
        "loss": "Quadratic",
        "regularization_x": "OneSparse",
        "regularization_y": "NonNegative"
    }
    hyper_params = {
        "transform": ["NONE", "STANDARDIZE"],
        "gamma_x": [0.1],
    }
    
    # train grid
    grid = H2OGridSearch(
        H2OGeneralizedLowRankEstimator,
        hyper_params=hyper_params
    )
    grid.train(x=train.names, training_frame=train, **params)
    print("first grid")
    print(grid)
    assert len(grid.model_ids) == 2
    archetypes1 = grid.models[0].archetypes()
    archetypes2 = grid.models[1].archetypes()
    grid_path = h2o.save_grid(export_dir, grid.grid_id)
    h2o.remove_all()
    
    # reimport and train some more; recreate both frames under the same
    # destination_frame ids so the user_y reference in `params` stays valid
    train = h2o.H2OFrame(train_data.tolist(), destination_frame="glrm_train")
    initial_y = h2o.H2OFrame(initial_y_data.tolist(), destination_frame="glrm_initial_y")
    grid = h2o.load_grid(grid_path)
    grid.hyper_params["gamma_x"] = [0.1, 1]
    grid.train(x=train.names, training_frame=train, **params)
    print("second grid")
    print(grid)
    assert len(grid.model_ids) == 4
    # the two reloaded models should have been restored unchanged...
    assert grid.models[0].archetypes() == archetypes1
    assert grid.models[1].archetypes() == archetypes2
    # ...and actual training occurred for the new models: rather than a full
    # Python comparison, just check that the first archetype rows are not equal
    assert not(pyunit_utils.equal_two_arrays(grid.models[1].archetypes()[0], grid.models[2].archetypes()[0], throw_error=False))
    assert not(pyunit_utils.equal_two_arrays(grid.models[2].archetypes()[0], grid.models[3].archetypes()[0], throw_error=False))
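
Note: Example #6 sets no seed and compares grid.models by position, so it is
order- and run-sensitive; Example #10 below is the hardened variant of the same
test (fixed seed, models matched by actual_params rather than position).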
Example #7
def grid_ft_resume(train, grid_id, params, hyper_parameters, start_grid):
    print("TESTING %s\n-------------------" % grid_id)
    export_dir = tempfile.mkdtemp()
    print("Using directory %s" % export_dir)
    grid_size = 1
    for p in hyper_parameters:
        grid_size *= len(hyper_parameters[p])
    print("Grid size %d" % grid_size)
    print("Starting baseline grid")
    grid = start_grid(grid_id, export_dir, train, params, hyper_parameters)
    grid_in_progress = None
    times_waited = 0
    while (times_waited < 3000) and (grid_in_progress is None
                                     or len(grid_in_progress.model_ids) == 0):
        time.sleep(0.1)  # give it time to train some models
        times_waited += 1
        try:
            grid_in_progress = h2o.get_grid(grid_id)
        except IndexError:
            if times_waited % 100 == 0:
                print("no models trained yet after %ss" % (times_waited / 10))
    grid.cancel()

    grid = h2o.get_grid(grid_id)
    old_grid_model_count = len(grid.model_ids)
    print("Baseline grid has %d models:" % old_grid_model_count)
    assert old_grid_model_count < grid_size, "The full grid should not have finished yet."
    for x in sorted(grid.model_ids):
        print(x)
    h2o.remove_all()

    loaded = h2o.load_grid("%s/%s" % (export_dir, grid_id),
                           load_params_references=True)
    assert loaded is not None
    assert len(loaded.model_ids) == old_grid_model_count
    loaded_train = h2o.H2OFrame.get_frame(train.frame_id)
    assert loaded_train is not None, "Train frame was not loaded"
    print("Starting final grid")
    loaded.resume()
    print("Newly grained grid has %d models:" % len(loaded.model_ids))
    for x in sorted(loaded.model_ids):
        print(x)
    assert len(loaded.model_ids) == grid_size, "The full grid was not trained."
    h2o.remove_all()
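
`start_grid` in Example #7 is a caller-supplied callback; a plausible GBM
version, modeled on Examples #4 and #5 (an assumption, not the original):

def start_gbm_grid(grid_id, export_dir, train, params, hyper_parameters):
    # launch (without blocking) a recoverable GBM grid
    grid = H2OGridSearch(H2OGradientBoostingEstimator,
                         grid_id=grid_id,
                         hyper_params=hyper_parameters,
                         recovery_dir=export_dir)
    grid.start(x=list(range(4)), y=4, training_frame=train, **params)
    return grid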
Example #8
def grid_resume():
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    # run GBM Grid Search
    hyper_parameters_1 = {"ntrees": [10, 50], "learn_rate": [0.01, 0.1]}
    grid_size_1 = len(hyper_parameters_1["ntrees"]) * len(
        hyper_parameters_1["learn_rate"])
    print("Training GBM grid with the following hyper_parameters:",
          hyper_parameters_1)

    export_dir = tempfile.mkdtemp()
    grid = H2OGridSearch(H2OGradientBoostingEstimator,
                         hyper_params=hyper_parameters_1,
                         export_checkpoints_dir=export_dir)
    grid.train(x=list(range(4)), y=4, training_frame=train)
    grid_id = grid.grid_id
    model_count_1 = len(grid.model_ids)
    print(grid)
    assert len(grid.model_ids) == grid_size_1, "There should be %d models" % grid_size_1
    print("Baseline grid has %d models" % model_count_1)
    h2o.remove_all()

    # start over
    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    grid = h2o.load_grid(export_dir + "/" + grid_id)
    assert len(grid.model_ids) == model_count_1
    hyper_parameters_2 = {"ntrees": [10, 20, 50], "learn_rate": [0.01, 0.1]}
    grid.hyper_params = hyper_parameters_2
    print("Training GBM grid with the following hyper_parameters:",
          hyper_parameters_2)
    grid.train(x=list(range(4)), y=4, training_frame=train)
    grid_size_2 = len(hyper_parameters_2["ntrees"]) * len(
        hyper_parameters_2["learn_rate"])
    print(grid)
    assert len(grid.model_ids) == grid_size_2, "There should be %d models" % grid_size_2
    print("Newly grained grid has %d models" % len(grid.model_ids))
Example #9
def grid_resume():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    hyper_parameters = {
        "learn_rate": [0.1, 0.01, .05],
        "ntrees": [10, 20]
    }
    export_dir = 'hdfs:///user/jenkins/grid_export_py'
    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)
    grid_id = gs.grid_id
    old_grid_model_count = len(gs.model_ids)
    print("Baseline grid has %d models" % old_grid_model_count)
    saved_path = h2o.save_grid(export_dir, grid_id)
    h2o.remove_all()

    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    grid = h2o.load_grid(saved_path)
    assert grid is not None
    assert len(grid.model_ids) == old_grid_model_count
    grid.train(x=list(range(4)), y=4, training_frame=train)
    print("Newly grained grid has %d models" % len(grid.model_ids))
    assert len(grid.model_ids) == old_grid_model_count
Example #10
def glrm_grid_user_y():
    export_dir = tempfile.mkdtemp()
    train_data = np.dot(np.random.rand(1000, 10), np.random.rand(10, 100))
    train = h2o.H2OFrame(train_data.tolist(), destination_frame="glrm_train")
    initial_y_data = np.random.rand(10, 100)
    initial_y_h2o = h2o.H2OFrame(initial_y_data.tolist(),
                                 destination_frame="glrm_initial_y")
    numArchetypes = 10
    params = {
        "k": numArchetypes,
        "init": "User",
        "user_y": initial_y_h2o,
        "loss": "Quadratic",
        "regularization_x": "OneSparse",
        "regularization_y": "NonNegative",
        "seed": 12345
    }
    hyper_params = {
        "transform": ["NONE", "STANDARDIZE"],
        "gamma_x": [0.1],
    }

    # train grid
    grid = H2OGridSearch(H2OGeneralizedLowRankEstimator,
                         hyper_params=hyper_params)
    grid.train(x=train.names, training_frame=train, **params)
    print("first grid")
    print(grid)
    assert len(grid.model_ids) == 2
    if (grid.models[0].actual_params['transform'] == 'STANDARDIZE'):
        archetypes0p1Standardize = grid.models[0].archetypes()
        archetypes0p1None = grid.models[1].archetypes()
    else:
        archetypes0p1Standardize = grid.models[1].archetypes()
        archetypes0p1None = grid.models[0].archetypes()
    grid_path = h2o.save_grid(export_dir, grid.grid_id)
    h2o.remove_all()

    # reimport and train some more; recreate both frames under the same
    # destination_frame ids so the user_y reference in `params` resolves after reload
    train = h2o.H2OFrame(train_data.tolist(), destination_frame="glrm_train")
    initial_y = h2o.H2OFrame(initial_y_data.tolist(),
                             destination_frame="glrm_initial_y")
    grid = h2o.load_grid(grid_path)
    grid.hyper_params["gamma_x"] = [0.1, 1]
    grid.train(x=train.names, training_frame=train, **params)
    print("second grid")
    print(grid)
    assert len(grid.model_ids) == 4
    # check the reloaded models are intact and actual training occurred for
    # the new ones; match models by their actual params, not by position
    for oneGridModel in grid.models:
        gamma_x = oneGridModel.actual_params['gamma_x']
        transform = oneGridModel.actual_params['transform']
        if gamma_x == 0.1 and transform == 'STANDARDIZE':
            assert oneGridModel.archetypes() == archetypes0p1Standardize
        if gamma_x == 0.1 and transform == 'NONE':
            assert oneGridModel.archetypes() == archetypes0p1None
        if gamma_x == 1 and transform == 'STANDARDIZE':
            archetypes1Standardize = oneGridModel.archetypes()
        if gamma_x == 1 and transform == 'NONE':
            archetypes1None = oneGridModel.archetypes()

    # different gamma_x must yield different archetypes for each transform
    archetypesChangedNone = not all(
        pyunit_utils.equal_two_arrays(
            archetypes1None[i], archetypes0p1None[i], throw_error=False)
        for i in range(numArchetypes))
    assert archetypesChangedNone
    archetypesChangedStandardize = not all(
        pyunit_utils.equal_two_arrays(archetypes1Standardize[i],
                                      archetypes0p1Standardize[i],
                                      throw_error=False)
        for i in range(numArchetypes))
    assert archetypesChangedStandardize