Example #1
    def testExperimentTagTruncation(self):
        ray.init(num_cpus=2)

        def train(config, reporter):
            reporter(timesteps_total=1)

        trial_executor = RayTrialExecutor()
        register_trainable("f1", train)

        experiments = {
            "foo": {
                "run": "f1",
                "config": {
                    "a" * 50: tune.sample_from(lambda spec: 5.0 / 7),
                    "b" * 50: tune.sample_from(lambda spec: "long" * 40),
                },
            }
        }

        for name, spec in experiments.items():
            trial_generator = BasicVariantGenerator()
            trial_generator.add_configurations({name: spec})
            while not trial_generator.is_finished():
                trial = trial_generator.next_trial()
                if not trial:
                    break
                trial_executor.start_trial(trial)
                self.assertLessEqual(len(os.path.basename(trial.logdir)), 200)
                trial_executor.stop_trial(trial)
Example #2
    def testExperimentTagTruncation(self):
        ray.init()

        def train(config, reporter):
            reporter(timesteps_total=1)

        trial_executor = RayTrialExecutor()
        register_trainable("f1", train)

        experiments = {
            "foo": {
                "run": "f1",
                "config": {
                    "a" * 50: lambda spec: 5.0 / 7,
                    "b" * 50: lambda spec: "long" * 40
                },
            }
        }

        for name, spec in experiments.items():
            trial_generator = BasicVariantGenerator()
            trial_generator.add_configurations({name: spec})
            for trial in trial_generator.next_trials():
                trial_executor.start_trial(trial)
                self.assertLessEqual(len(trial.logdir), 200)
                trial_executor.stop_trial(trial)
Example #3
    def testExperimentTagTruncation(self):
        ray.init(num_cpus=2)
        trainable_cls = AdaptDLTrainableCreator(_train_simple, num_workers=1)
        trial_executor = RayTrialExecutor()
        experiments = {
            "foo": {
                "run": trainable_cls.__name__,
                "config": {
                    "a" * 50: tune.sample_from(lambda spec: 5.0 / 7),
                    "b" * 50: tune.sample_from(lambda spec: "long" * 40)
                },
            }
        }

        for name, spec in experiments.items():
            trial_generator = BasicVariantGenerator()
            trial_generator.add_configurations({name: spec})
            while not trial_generator.is_finished():
                trial = trial_generator.next_trial()
                if not trial:
                    break
                trial_executor.start_trial(trial)
                assert trial.status == Trial.RUNNING
                assert len(os.path.basename(trial.logdir)) <= 200
                trial_executor.stop_trial(trial)
                assert trial.status == Trial.TERMINATED
Example #4
 def _add_trials(self, name, spec):
     """Add trial by invoking TrialRunner."""
     resource = {}
     resource["trials"] = []
     trial_generator = BasicVariantGenerator()
     trial_generator.add_configurations({name: spec})
     for trial in trial_generator.next_trials():
         # `runner` refers to the TrialRunner instance available in the
         # enclosing scope; it is not defined inside this method.
         runner.add_trial(trial)
         resource["trials"].append(self._trial_info(trial))
     return resource
Example #6
 def generate_trials(spec, name):
     suggester = BasicVariantGenerator()
     suggester.add_configurations({name: spec})
     trials = []
     while not suggester.is_finished():
         trial = suggester.next_trial()
         if trial:
             trials.append(trial)
         else:
             break
     return trials
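
A minimal driver for the helper above (not part of the original source): the spec format mirrors the earlier examples, "f1" is assumed to be a trainable registered as in Example #1, and the exact trial count depends on how num_samples interacts with grid_search in the installed Ray version.

    # Sketch: calling generate_trials() with a small grid. Assumes Ray Tune is
    # installed and that generate_trials() and its BasicVariantGenerator import
    # are already in scope.
    from ray import tune

    spec = {
        "run": "f1",  # registered earlier via register_trainable("f1", train)
        "num_samples": 2,
        "config": {"lr": tune.grid_search([0.01, 0.1])},
    }
    trials = generate_trials(spec, "grid_demo")
    # Each grid value is typically repeated num_samples times, so this should
    # yield 2 * 2 = 4 Trial objects ready for a TrialRunner.
    for t in trials:
        print(t.trial_id, t.config)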
Example #7
File: web_server.py  Project: zivzone/ray
 def _add_trials(self, name, spec):
     """Add trial by invoking TrialRunner."""
     resource = {}
     resource["trials"] = []
     trial_generator = BasicVariantGenerator()
     trial_generator.add_configurations({name: spec})
     while not trial_generator.is_finished():
         trial = trial_generator.next_trial()
         if not trial:
             break
         # `runner` is the TrialRunner captured from the enclosing
         # web_server scope, not a local variable of this method.
         runner.add_trial(trial)
         resource["trials"].append(self._trial_info(trial))
     return resource
Example #8
    def testBasicVariantLimiter(self):
        search_alg = BasicVariantGenerator(max_concurrent=2)

        experiment_spec = {
            "run": "__fake",
            "num_samples": 5,
            "stop": {
                "training_iteration": 1
            }
        }
        search_alg.add_configurations({"test": experiment_spec})

        trial1 = search_alg.next_trial()
        self.assertTrue(trial1)

        trial2 = search_alg.next_trial()
        self.assertTrue(trial2)

        # Returns None because of limiting
        trial3 = search_alg.next_trial()
        self.assertFalse(trial3)

        # Finish trial, now trial 3 should be created
        search_alg.on_trial_complete(trial1.trial_id, None, False)
        trial3 = search_alg.next_trial()
        self.assertTrue(trial3)

        trial4 = search_alg.next_trial()
        self.assertFalse(trial4)

        search_alg.on_trial_complete(trial2.trial_id, None, False)
        search_alg.on_trial_complete(trial3.trial_id, None, False)

        trial4 = search_alg.next_trial()
        self.assertTrue(trial4)

        trial5 = search_alg.next_trial()
        self.assertTrue(trial5)

        search_alg.on_trial_complete(trial4.trial_id, None, False)

        # Should also be None because search is finished
        trial6 = search_alg.next_trial()
        self.assertFalse(trial6)
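
Outside of a unit test, the same throttling can be attached to a real run. This is only a sketch and assumes the Ray 1.x-era tune.run signature that accepts search_alg directly (newer releases configure the search algorithm through TuneConfig):

    # Sketch: cap BasicVariantGenerator at two unfinished trials at a time.
    from ray import tune
    from ray.tune.suggest.basic_variant import BasicVariantGenerator

    def trainable(config):
        tune.report(score=config["x"])

    analysis = tune.run(
        trainable,
        config={"x": tune.uniform(0.0, 1.0)},
        num_samples=5,
        search_alg=BasicVariantGenerator(max_concurrent=2),
    )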
Example #9
    def testSpecifyAlgorithm(self):
        """Tests run_experiments works without specifying experiment."""
        def train(config, reporter):
            for i in range(100):
                reporter(timesteps_total=i)

        register_trainable("f1", train)

        alg = BasicVariantGenerator()
        alg.add_configurations(
            {"foo": {
                "run": "f1",
                "config": {
                    "script_min_iter_time_s": 0
                }
            }})
        trials = run_experiments(search_alg=alg)
        for trial in trials:
            self.assertEqual(trial.status, Trial.TERMINATED)
            self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99)
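
For comparison, the same experiment can be launched by handing the spec to run_experiments directly; when no search_alg is given, run_experiments builds the default BasicVariantGenerator itself (see the full source in Example #13):

    # Sketch: equivalent launch without a pre-built search algorithm.
    trials = run_experiments({
        "foo": {
            "run": "f1",
            "config": {"script_min_iter_time_s": 0},
        }
    })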
Example #10
    def testQueueFilling(self):
        os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

        ray.init(num_cpus=4)

        def f1(config):
            for i in range(10):
                yield i

        tune.register_trainable("f1", f1)

        search_alg = BasicVariantGenerator()
        search_alg.add_configurations(
            {
                "foo": {
                    "run": "f1",
                    "num_samples": 100,
                    "config": {
                        "a": tune.sample_from(lambda spec: 5.0 / 7),
                        "b": tune.sample_from(lambda spec: "long" * 40),
                    },
                    "resources_per_trial": {"cpu": 2},
                }
            }
        )

        runner = TrialRunner(search_alg=search_alg)

        runner.step()
        runner.step()
        runner.step()
        self.assertEqual(len(runner._trials), 3)

        runner.step()
        self.assertEqual(len(runner._trials), 3)

        self.assertEqual(runner._trials[0].status, Trial.RUNNING)
        self.assertEqual(runner._trials[1].status, Trial.RUNNING)
        self.assertEqual(runner._trials[2].status, Trial.PENDING)
Example #11
        def execute_command(self, args):
            def get_trial():
                trial = runner.get_trial(args["trial_id"])
                if trial is None:
                    error = "Trial ({}) not found.".format(args["trial_id"])
                    raise TuneManagerError(error)
                else:
                    return trial

            command = args["command"]
            response = {}
            try:
                if command == TuneClient.GET_LIST:
                    response["trials"] = [
                        self.trial_info(t) for t in runner.get_trials()
                    ]
                elif command == TuneClient.GET_TRIAL:
                    trial = get_trial()
                    response["trial_info"] = self.trial_info(trial)
                elif command == TuneClient.STOP:
                    trial = get_trial()
                    runner.request_stop_trial(trial)
                elif command == TuneClient.ADD:
                    name = args["name"]
                    spec = args["spec"]
                    trial_generator = BasicVariantGenerator()
                    trial_generator.add_configurations({name: spec})
                    for trial in trial_generator.next_trials():
                        runner.add_trial(trial)
                else:
                    raise TuneManagerError("Unknown command.")
                status = True
            except TuneError as e:
                status = False
                response["message"] = str(e)

            return status, response
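
The TuneClient.ADD branch above is normally reached from the client side. The sketch below assumes the TuneClient helper from ray.tune.web_server and a TuneServer already listening locally; the constructor arguments are illustrative placeholders, not a definitive signature:

    # Hypothetical client call that exercises the ADD branch handled above.
    from ray.tune.web_server import TuneClient

    client = TuneClient("localhost", 4321)  # address/port are placeholders
    spec = {"run": "f1", "config": {"lr": 0.01}}
    client.add_trial("my_experiment", spec)  # dispatched to execute_command()
    print(client.get_all_trials())           # corresponds to TuneClient.GET_LIST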
Example #13
File: tune.py  Project: jamescasbon/ray
def run_experiments(experiments,
                    search_alg=None,
                    scheduler=None,
                    with_server=False,
                    server_port=TuneServer.DEFAULT_PORT,
                    verbose=True,
                    resume=False,
                    queue_trials=False,
                    trial_executor=None,
                    raise_on_failed_trial=True):
    """Runs and blocks until all trials finish.

    Args:
        experiments (Experiment | list | dict): Experiments to run. Will be
            passed to `search_alg` via `add_configurations`.
        search_alg (SearchAlgorithm): Search Algorithm. Defaults to
            BasicVariantGenerator.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, and HyperBand.
        with_server (bool): Starts a background Tune server. Needed for
            using the Client API.
        server_port (int): Port number for launching TuneServer.
        verbose (bool): How much output should be printed for each trial.
        resume (bool|"prompt"): If checkpoint exists, the experiment will
            resume from there. If resume is "prompt", Tune will prompt if
            checkpoint detected.
        queue_trials (bool): Whether to queue trials when the cluster does
            not currently have enough resources to launch one. This should
            be set to True when running on an autoscaling cluster to enable
            automatic scale-up.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if any trial ends in
            the ERROR state when the experiments complete.

    Examples:
        >>> experiment_spec = Experiment("experiment", my_func)
        >>> run_experiments(experiments=experiment_spec)

        >>> experiment_spec = {"experiment": {"run": my_func}}
        >>> run_experiments(experiments=experiment_spec)

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     scheduler=MedianStoppingRule(...))

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     search_alg=SearchAlgorithm(),
        >>>     scheduler=MedianStoppingRule(...))

    Returns:
        List of Trial objects, holding data for each executed trial.

    """
    # It is important to do this here because it converts the experiments
    # into their canonical (schematized) form and performs the implicit
    # trainable registration.
    experiments = convert_to_experiment_list(experiments)
    checkpoint_dir = _find_checkpoint_dir(experiments)

    runner = None
    restore = False

    if os.path.exists(
            os.path.join(checkpoint_dir, TrialRunner.CKPT_FILE_NAME)):
        if resume == "prompt":
            msg = ("Found incomplete experiment at {}. "
                   "Would you like to resume it?".format(checkpoint_dir))
            restore = click.confirm(msg, default=False)
            if restore:
                logger.info("Tip: to always resume, "
                            "pass resume=True to run_experiments()")
            else:
                logger.info("Tip: to always start a new experiment, "
                            "pass resume=False to run_experiments()")
        elif resume:
            restore = True
        else:
            logger.info(
                "Tip: to resume incomplete experiments, "
                "pass resume='prompt' or resume=True to run_experiments()")
    else:
        logger.info(
            "Did not find checkpoint file in {}.".format(checkpoint_dir))

    if restore:
        runner = try_restore_runner(checkpoint_dir, search_alg, scheduler,
                                    trial_executor)
    else:
        logger.info("Starting a new experiment.")

    if not runner:
        if scheduler is None:
            scheduler = FIFOScheduler()

        if search_alg is None:
            search_alg = BasicVariantGenerator()

        search_alg.add_configurations(experiments)

        runner = TrialRunner(
            search_alg,
            scheduler=scheduler,
            metadata_checkpoint_dir=checkpoint_dir,
            launch_web_server=with_server,
            server_port=server_port,
            verbose=verbose,
            queue_trials=queue_trials,
            trial_executor=trial_executor)

    print(runner.debug_string(max_debug=99999))

    last_debug = 0
    while not runner.is_finished():
        runner.step()
        if time.time() - last_debug > DEBUG_PRINT_INTERVAL:
            print(runner.debug_string())
            last_debug = time.time()

    print(runner.debug_string(max_debug=99999))

    wait_for_log_sync()

    errored_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            errored_trials += [trial]

    if errored_trials:
        if raise_on_failed_trial:
            raise TuneError("Trials did not complete", errored_trials)
        else:
            logger.error("Trials did not complete: %s", errored_trials)

    return runner.get_trials()
Example #14
 def generate_trials(self, spec, name):
     suggester = BasicVariantGenerator()
     suggester.add_configurations({name: spec})
     return suggester.next_trials()
Example #16
def run_experiments(experiments,
                    search_alg=None,
                    scheduler=None,
                    with_server=False,
                    server_port=TuneServer.DEFAULT_PORT,
                    verbose=2,
                    resume=False,
                    queue_trials=False,
                    trial_executor=None,
                    raise_on_failed_trial=True):
    """Runs and blocks until all trials finish.

    Args:
        experiments (Experiment | list | dict): Experiments to run. Will be
            passed to `search_alg` via `add_configurations`.
        search_alg (SearchAlgorithm): Search Algorithm. Defaults to
            BasicVariantGenerator.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, and HyperBand.
        with_server (bool): Starts a background Tune server. Needed for
            using the Client API.
        server_port (int): Port number for launching TuneServer.
        verbose (int): 0, 1, or 2. Verbosity mode. 0 = silent,
            1 = only status updates, 2 = status and trial results.
        resume (bool|"prompt"): If checkpoint exists, the experiment will
            resume from there. If resume is "prompt", Tune will prompt if
            checkpoint detected.
        queue_trials (bool): Whether to queue trials when the cluster does
            not currently have enough resources to launch one. This should
            be set to True when running on an autoscaling cluster to enable
            automatic scale-up.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if any trial ends in
            the ERROR state when the experiments complete.

    Examples:
        >>> experiment_spec = Experiment("experiment", my_func)
        >>> run_experiments(experiments=experiment_spec)

        >>> experiment_spec = {"experiment": {"run": my_func}}
        >>> run_experiments(experiments=experiment_spec)

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     scheduler=MedianStoppingRule(...))

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     search_alg=SearchAlgorithm(),
        >>>     scheduler=MedianStoppingRule(...))

    Returns:
        List of Trial objects, holding data for each executed trial.

    """
    # It is important to do this here because it converts the experiments
    # into their canonical (schematized) form and performs the implicit
    # trainable registration.
    experiments = convert_to_experiment_list(experiments)
    checkpoint_dir = _find_checkpoint_dir(experiments)

    runner = None
    restore = False

    if TrialRunner.checkpoint_exists(checkpoint_dir):
        if resume == "prompt":
            msg = ("Found incomplete experiment at {}. "
                   "Would you like to resume it?".format(checkpoint_dir))
            restore = click.confirm(msg, default=False)
            if restore:
                logger.info("Tip: to always resume, "
                            "pass resume=True to run_experiments()")
            else:
                logger.info("Tip: to always start a new experiment, "
                            "pass resume=False to run_experiments()")
        elif resume:
            restore = True
        else:
            logger.info(
                "Tip: to resume incomplete experiments, "
                "pass resume='prompt' or resume=True to run_experiments()")
    else:
        logger.info(
            "Did not find checkpoint file in {}.".format(checkpoint_dir))

    if restore:
        runner = try_restore_runner(checkpoint_dir, search_alg, scheduler,
                                    trial_executor)
    else:
        logger.info("Starting a new experiment.")

    if not runner:
        if scheduler is None:
            scheduler = FIFOScheduler()

        if search_alg is None:
            search_alg = BasicVariantGenerator()

        search_alg.add_configurations(experiments)

        runner = TrialRunner(search_alg,
                             scheduler=scheduler,
                             metadata_checkpoint_dir=checkpoint_dir,
                             launch_web_server=with_server,
                             server_port=server_port,
                             verbose=bool(verbose > 1),
                             queue_trials=queue_trials,
                             trial_executor=trial_executor)

    if verbose:
        print(runner.debug_string(max_debug=99999))

    last_debug = 0
    while not runner.is_finished():
        runner.step()
        if time.time() - last_debug > DEBUG_PRINT_INTERVAL:
            if verbose:
                print(runner.debug_string())
            last_debug = time.time()

    if verbose:
        print(runner.debug_string(max_debug=99999))

    wait_for_log_sync()

    errored_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            errored_trials += [trial]

    if errored_trials:
        if raise_on_failed_trial:
            raise TuneError("Trials did not complete", errored_trials)
        else:
            logger.error("Trials did not complete: %s", errored_trials)

    return runner.get_trials()
Example #17
def run_experiments(experiments=None,
                    search_alg=None,
                    scheduler=None,
                    with_server=False,
                    server_port=TuneServer.DEFAULT_PORT,
                    verbose=True,
                    queue_trials=False,
                    trial_executor=None,
                    raise_on_failed_trial=True):
    """Runs and blocks until all trials finish.

    Args:
        experiments (Experiment | list | dict): Experiments to run. Will be
            passed to `search_alg` via `add_configurations`.
        search_alg (SearchAlgorithm): Search Algorithm. Defaults to
            BasicVariantGenerator.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, and HyperBand.
        with_server (bool): Starts a background Tune server. Needed for
            using the Client API.
        server_port (int): Port number for launching TuneServer.
        verbose (bool): How much output should be printed for each trial.
        queue_trials (bool): Whether to queue trials when the cluster does
            not currently have enough resources to launch one. This should
            be set to True when running on an autoscaling cluster to enable
            automatic scale-up.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if any trial ends in
            the ERROR state when the experiments complete.

    Examples:
        >>> experiment_spec = Experiment("experiment", my_func)
        >>> run_experiments(experiments=experiment_spec)

        >>> experiment_spec = {"experiment": {"run": my_func}}
        >>> run_experiments(experiments=experiment_spec)

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     scheduler=MedianStoppingRule(...))

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     search_alg=SearchAlgorithm(),
        >>>     scheduler=MedianStoppingRule(...))

    Returns:
        List of Trial objects, holding data for each executed trial.

    """

    if scheduler is None:
        scheduler = FIFOScheduler()

    if search_alg is None:
        search_alg = BasicVariantGenerator()

    search_alg.add_configurations(experiments)

    runner = TrialRunner(search_alg,
                         scheduler=scheduler,
                         launch_web_server=with_server,
                         server_port=server_port,
                         verbose=verbose,
                         queue_trials=queue_trials,
                         trial_executor=trial_executor)

    logger.info(runner.debug_string(max_debug=99999))

    last_debug = 0
    while not runner.is_finished():
        runner.step()
        if time.time() - last_debug > DEBUG_PRINT_INTERVAL:
            logger.info(runner.debug_string())
            last_debug = time.time()

    logger.info(runner.debug_string(max_debug=99999))

    wait_for_log_sync()

    errored_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            errored_trials += [trial]

    if errored_trials:
        if raise_on_failed_trial:
            raise TuneError("Trials did not complete", errored_trials)
        else:
            logger.error("Trials did not complete: %s", errored_trials)

    return runner.get_trials()