Пример #1
0
    def test_frame_reload(self):
        """Save a frame on one cluster and load it back on a second one.

        Phase one starts the "saver-py" cluster, imports the MNIST training
        CSV from HDFS, keeps the frame key and a pandas copy of the data, and
        saves the frame into the work directory. Phase two starts the
        "loader-py" cluster, loads the frame by the remembered key from the
        same directory and converts it to pandas. The test passes when both
        pandas frames compare equal.

        Each cluster is stopped via ``utils.stop_cluster`` in a ``finally``
        clause, so it is cleaned up even when its phase raises.
        """
        hdfs_host = utils.hadoop_namenode()
        export_dir = utils.get_workdir()
        csv_path = "/datasets/mnist/train.csv.gz"
        source_url = "hdfs://%s%s" % (hdfs_host, csv_path)

        # Phase 1: import the data and persist the frame.
        saver_name = "saver-py"
        try:
            saver_url = utils.start_cluster(saver_name)
            h2o.connect(url=saver_url)
            original = h2o.import_file(path=source_url)
            frame_key = original.key
            expected_pd = original.as_data_frame()
            original.save(export_dir)
            h2o.connection().close()
        finally:
            utils.stop_cluster(saver_name)

        # Phase 2: a different cluster restores the frame from the saved files.
        loader_name = "loader-py"
        try:
            loader_url = utils.start_cluster(loader_name)
            h2o.connect(url=loader_url)
            restored = h2o.load_frame(frame_key, export_dir)
            actual_pd = restored.as_data_frame()
            h2o.connection().close()
        finally:
            utils.stop_cluster(loader_name)

        self.assertTrue(expected_pd.equals(actual_pd))
Пример #2
0
    def test_frame_reload(self):
        """Start a GBM grid search, interrupt it, and resume it elsewhere.

        On the "grid1-py" cluster a grid over ``learn_rate`` x ``ntrees``
        (4 x 4 combinations) is started with a recovery directory on HDFS.
        The test polls every 5 seconds, at most 20 times, until the grid
        reports at least one model, and then stops that cluster. On the
        "grid2-py" cluster the grid is loaded from the recovery directory and
        resumed, and the test asserts that the number of models equals the
        full grid size.

        NOTE(review): the method name refers to a frame reload, but the body
        exercises grid recovery -- confirm the name is intended.
        """
        hdfs_host = pyunit_utils.hadoop_namenode()
        recovery_dir = "hdfs://%s%s" % (hdfs_host, utils.get_workdir())
        iris_path = "/datasets/iris_wheader.csv"

        tree_counts = [100, 120, 130, 140]
        learn_rates = [0.01, 0.02, 0.03, 0.04]
        expected_models = len(tree_counts) * len(learn_rates)
        print("max models %s" % expected_models)
        grid_key = "grid_ft_resume"
        search_space = {
            "learn_rate": learn_rates,
            "ntrees": tree_counts
        }
        max_polls = 20

        # Phase 1: kick off the grid and wait until it has produced a model.
        first_cluster = "grid1-py"
        try:
            first_url = utils.start_cluster(first_cluster)
            h2o.connect(url=first_url)
            iris_url = "hdfs://%s%s" % (hdfs_host, iris_path)
            iris = h2o.import_file(path=iris_url)
            search = H2OGridSearch(
                H2OGradientBoostingEstimator,
                grid_id=grid_key,
                hyper_params=search_space,
                recovery_dir=recovery_dir)
            print("starting initial grid and sleeping...")
            search.start(x=list(range(4)), y=4, training_frame=iris)
            partial = None
            for _ in range(max_polls):
                # Stop polling as soon as the grid reports any model.
                if partial is not None and len(partial.model_ids) > 0:
                    break
                time.sleep(5)  # give it time to train some models
                try:
                    partial = h2o.get_grid(grid_key)
                except IndexError:
                    print("no models trained yet")
            print("done sleeping")
            h2o.connection().close()
        finally:
            utils.stop_cluster(first_cluster)

        # Phase 2: restore the grid on a new cluster and let it finish.
        second_cluster = "grid2-py"
        try:
            second_url = utils.start_cluster(second_cluster)
            h2o.connect(url=second_url)
            grid_location = "%s/%s" % (recovery_dir, grid_key)
            restored = h2o.load_grid(grid_location,
                                     load_params_references=True)
            print("models after first run:")
            for model_id in sorted(restored.model_ids):
                print(model_id)
            restored.resume()
            print("models after second run:")
            for model_id in sorted(restored.model_ids):
                print(model_id)
            print("Newly grained grid has %d models" % len(restored.model_ids))
            self.assertEqual(len(restored.model_ids), expected_models,
                             "The full grid was not trained.")
            h2o.connection().close()
        finally:
            utils.stop_cluster(second_cluster)