Example #1
    def test_frame_reload(self):
        name_node = utils.hadoop_namenode()
        work_dir = utils.get_workdir()
        dataset = "/datasets/mnist/train.csv.gz"

        saver_cluster_name = "saver-py"
        try:
            cluster_1 = utils.start_cluster(saver_cluster_name)
            h2o.connect(url=cluster_1)
            df_orig = h2o.import_file(path="hdfs://%s%s" %
                                      (name_node, dataset))
            df_key = df_orig.key
            df_pd_orig = df_orig.as_data_frame()
            df_orig.save(work_dir)
            h2o.connection().close()
        finally:
            utils.stop_cluster(saver_cluster_name)

        loader_cluster_name = "loader-py"
        try:
            cluster_2 = utils.start_cluster(loader_cluster_name)
            h2o.connect(url=cluster_2)
            df_loaded = h2o.load_frame(df_key, work_dir)
            df_pd_loaded = df_loaded.as_data_frame()
            h2o.connection().close()
        finally:
            utils.stop_cluster(loader_cluster_name)

        self.assertTrue(df_pd_orig.equals(df_pd_loaded))
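A note on Example #1: the `utils` helpers only manage the short-lived Hadoop clusters; the frame persistence itself comes down to two calls, `H2OFrame.save` and `h2o.load_frame`. Below is a minimal single-cluster sketch of the same round trip (the dataset URL and the `/tmp` directory are illustrative, not taken from the test):

    import h2o

    h2o.init()
    df = h2o.import_file(
        "https://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
    key = df.key                    # remember the frame id for the reload
    pd_before = df.as_data_frame()  # pandas copy for the final comparison
    df.save("/tmp/h2o-frames")      # persist to a directory (local path or hdfs:// URL)
    h2o.remove(df)                  # drop the original so the reload is meaningful
    restored = h2o.load_frame(key, "/tmp/h2o-frames")
    assert pd_before.equals(restored.as_data_frame())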
Example #2
    def test_frame_reload(self):
        name_node = pyunit_utils.hadoop_namenode()
        work_dir = "hdfs://%s%s" % (name_node, utils.get_workdir())
        dataset = "/datasets/iris_wheader.csv"

        ntrees_opts = [100, 120, 130, 140]
        learn_rate_opts = [0.01, 0.02, 0.03, 0.04]
        grid_size = len(ntrees_opts) * len(learn_rate_opts)
        print("max models %s" % grid_size)
        grid_id = "grid_ft_resume"
        hyper_parameters = {
            "learn_rate": learn_rate_opts,
            "ntrees": ntrees_opts
        }

        cluster_1_name = "grid1-py"
        try:
            cluster_1 = utils.start_cluster(cluster_1_name)
            h2o.connect(url=cluster_1)
            train = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
            grid = H2OGridSearch(H2OGradientBoostingEstimator,
                                 grid_id=grid_id,
                                 hyper_params=hyper_parameters,
                                 recovery_dir=work_dir)
            print("starting initial grid and sleeping...")
            grid.start(x=list(range(4)), y=4, training_frame=train)
            grid_in_progress = None
            times_waited = 0
            while (times_waited < 20) and (grid_in_progress is None or
                                           len(grid_in_progress.model_ids) == 0):
                time.sleep(5)  # give it time to train some models
                times_waited += 1
                try:
                    grid_in_progress = h2o.get_grid(grid_id)
                except IndexError:
                    print("no models trained yet")
            print("done sleeping")
            h2o.connection().close()
        finally:
            utils.stop_cluster(cluster_1_name)

        cluster_2_name = "grid2-py"
        try:
            cluster_2 = utils.start_cluster(cluster_2_name)
            h2o.connect(url=cluster_2)
            loaded = h2o.load_grid("%s/%s" % (work_dir, grid_id),
                                   load_params_references=True)
            print("models after first run:")
            for x in sorted(loaded.model_ids):
                print(x)
            loaded.resume()
            print("models after second run:")
            for x in sorted(loaded.model_ids):
                print(x)
            print("Newly grained grid has %d models" % len(loaded.model_ids))
            self.assertEqual(len(loaded.model_ids), grid_size,
                             "The full grid was not trained.")
            h2o.connection().close()
        finally:
            utils.stop_cluster(cluster_2_name)
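The two clusters in Example #2 only simulate losing the original cluster mid-search: `recovery_dir` makes the grid checkpoint itself to `<recovery_dir>/<grid_id>`, and `h2o.load_grid(..., load_params_references=True)` plus `resume()` pick the search back up. A minimal sketch of that workflow (paths and hyper-parameter values are illustrative; on a real run the reload would happen on a fresh cluster):

    import h2o
    from h2o.grid.grid_search import H2OGridSearch
    from h2o.estimators.gbm import H2OGradientBoostingEstimator

    h2o.init()
    train = h2o.import_file(
        "https://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
    grid = H2OGridSearch(H2OGradientBoostingEstimator,
                         grid_id="grid_sketch",
                         hyper_params={"ntrees": [5, 10], "learn_rate": [0.1, 0.2]},
                         recovery_dir="/tmp/h2o-recovery")  # checkpoint location
    grid.train(x=list(range(4)), y=4, training_frame=train)

    # After a cluster restart, reconnect and continue from the checkpoint:
    loaded = h2o.load_grid("/tmp/h2o-recovery/grid_sketch",
                           load_params_references=True)  # also restores frame refs
    loaded.resume()  # trains whatever models are still missing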
Example #3
    def test_auto_recovery(self):
        name_node = pyunit_utils.hadoop_namenode()
        dataset = "/datasets/iris_wheader.csv"

        ntrees_opts = [100, 120, 130, 140]
        learn_rate_opts = [0.01, 0.02, 0.03, 0.04]
        grid_size = len(ntrees_opts) * len(learn_rate_opts)
        print("max models %s" % grid_size)
        grid_id = "grid_ft_auto_recover"
        hyper_parameters = {
            "learn_rate": learn_rate_opts,
            "ntrees": ntrees_opts
        }

        cluster_1_name = "grid-auto-1-py"
        try:
            cluster_1 = utils.start_cluster(cluster_1_name,
                                            enable_auto_recovery=True,
                                            clean_auto_recovery=True)
            print("initial cluster started at %s" % cluster_1)
            h2o.connect(url=cluster_1)
            train = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
            grid = H2OGridSearch(H2OGradientBoostingEstimator,
                                 grid_id=grid_id,
                                 hyper_params=hyper_parameters)
            bg_train_thread = threading.Thread(target=self._training_thread,
                                               kwargs={
                                                   "grid": grid,
                                                   "train": train
                                               })
            bg_train_thread.start()
            phase_1_models = self._wait_for_model_to_build(grid_id)
            self._print_models("Initial models", phase_1_models)
            assert len(phase_1_models) > 0
            self._check_training_error()
        finally:
            utils.stop_cluster(cluster_1_name)

        cluster_2_name = "grid-auto-2-py"
        try:
            cluster_2 = utils.start_cluster(cluster_2_name,
                                            enable_auto_recovery=True)
            print("cluster resumed at %s, should unblock background thread" %
                  cluster_2)
            phase_2_models = self._wait_for_model_to_build(
                grid_id,
                len(phase_1_models) + 1)
            self._print_models("Recovery #1 models", phase_2_models)
            assert len(phase_2_models) > len(phase_1_models)
            self._check_training_error()
        finally:
            utils.stop_cluster(cluster_2_name)

        cluster_3_name = "grid-auto-3-py"
        try:
            cluster_3 = utils.start_cluster(cluster_3_name,
                                            enable_auto_recovery=True)
            print("cluster resumed at %s, waiting for training to finish" %
                  cluster_3)
            bg_train_thread.join()
            print("models after final run:")
            for x in sorted(grid.model_ids):
                print(x)
            print("Finished grained grid has %d models" % len(grid.model_ids))
            self.assertEqual(len(grid.model_ids), grid_size,
                             "The full grid was not trained.")
            self._check_training_error()
            h2o.connection().close()
        finally:
            utils.stop_cluster(cluster_3_name)
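Example #3 relies on helper methods (`_training_thread`, `_wait_for_model_to_build`, `_print_models`, `_check_training_error`) that are not part of the snippet. A plausible sketch of the two that drive the background training, assuming the thread records any exception in a hypothetical `_training_error` field:

    def _training_thread(self, grid, train):
        # Runs the blocking grid search off the main thread; with auto-recovery
        # enabled, training survives the cluster restarts staged by the test.
        try:
            grid.train(x=list(range(4)), y=4, training_frame=train)
            self._training_error = None
        except Exception as e:
            self._training_error = e  # surfaced by _check_training_error()

    def _check_training_error(self):
        # Hypothetical counterpart: re-raise if the background thread failed.
        if getattr(self, "_training_error", None) is not None:
            raise self._training_error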