Example #1
    def save(self, trial, storage=Checkpoint.PERSISTENT, result=None):
        """Saves the trial's state to a checkpoint asynchronously.

        Args:
            trial (Trial): The trial to be saved.
            storage (str): Where to store the checkpoint. Defaults to
                PERSISTENT.
            result (dict): The state of this trial as a dictionary to be saved.
                If result is None, the trial's last result will be used.

        Returns:
             Checkpoint object, or None if an Exception occurs.
        """
        result = result or trial.last_result
        with self._change_working_directory(trial):
            if storage == Checkpoint.MEMORY:
                value = trial.runner.save_to_object.remote()
                checkpoint = Checkpoint(storage, value, result)
                trial.on_checkpoint(checkpoint)
            else:
                value = trial.runner.save.remote()
                checkpoint = Checkpoint(storage, value, result)
                trial.saving_to = checkpoint
                self._running[value] = trial
        return checkpoint
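The important detail in Example #1 is that `save.remote()` is fire-and-forget: the executor stashes the returned ObjectRef in `self._running` and resolves it later. A minimal sketch of that pattern with plain Ray; `ToyTrainable` and the bookkeeping dict are illustrative, not Ray Tune's API:

import ray

ray.init(ignore_reinit_error=True)

@ray.remote
class ToyTrainable:
    def __init__(self):
        self.step = 0

    def train(self):
        self.step += 1

    def save(self):
        # Ray Tune would persist a real checkpoint here; we return state.
        return {"step": self.step}

runner = ToyTrainable.remote()
ray.get(runner.train.remote())

# save.remote() returns an ObjectRef immediately; the driver records a
# future -> trial mapping and keeps scheduling other work meanwhile.
pending = runner.save.remote()
in_flight = {pending: "trial_1"}

ready, _ = ray.wait(list(in_flight))  # resolve later, when convenient
print(in_flight[ready[0]], "->", ray.get(ready[0]))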
Example #2
    def save(
        self, trial, storage=Checkpoint.PERSISTENT, result: Optional[Dict] = None
    ) -> Checkpoint:
        """Saves the trial's state to a checkpoint asynchronously.

        Args:
            trial (Trial): The trial to be saved.
            storage (str): Where to store the checkpoint. Defaults to
                PERSISTENT.
            result (dict): The state of this trial as a dictionary to be saved.
                If result is None, the trial's last result will be used.

        Returns:
             Checkpoint object, or None if an Exception occurs.
        """
        logger.info(f"saving trial {trial}")
        result = result or trial.last_result
        with self._change_working_directory(trial):
            if storage == Checkpoint.MEMORY:
                value = trial.runner.save_to_object.remote()
                checkpoint = Checkpoint(storage, value, result)
                trial.on_checkpoint(checkpoint)
            else:
                value = trial.runner.save.remote()
                checkpoint = Checkpoint(storage, value, result)
                trial.saving_to = checkpoint
                self._futures[value] = (ExecutorEventType.SAVING_RESULT, trial)
        return checkpoint
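Relative to Example #1, this version tags each pending future with an `ExecutorEventType`, so one event loop can tell save results apart from training results when futures resolve. The same bookkeeping idea, sketched with standard-library futures instead of Ray ObjectRefs (all names below are illustrative):

import enum
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait

class EventType(enum.Enum):
    SAVING_RESULT = 1
    TRAINING_RESULT = 2

def fake_save(trial):
    return "checkpoint-for-" + trial

def fake_train(trial):
    return "metrics-for-" + trial

pool = ThreadPoolExecutor(max_workers=2)
futures = {
    pool.submit(fake_save, "trial_1"): (EventType.SAVING_RESULT, "trial_1"),
    pool.submit(fake_train, "trial_2"): (EventType.TRAINING_RESULT, "trial_2"),
}

while futures:
    done, _ = wait(list(futures), return_when=FIRST_COMPLETED)
    for fut in done:
        event, trial = futures.pop(fut)
        # Dispatch on what the resolved future means, not just its value.
        print(event.name, trial, fut.result())

pool.shutdown()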
Example #3
    def save(self, trial, storage=Checkpoint.DISK, result=None):
        """Saves the trial's state to a checkpoint."""
        result = result or trial.last_result

        if storage == Checkpoint.MEMORY:
            value = trial.runner.save_to_object.remote()
            checkpoint = Checkpoint(storage, value, result)
        else:
            with warn_if_slow("save_checkpoint_to_disk"):
                value = ray.get(trial.runner.save.remote())
                checkpoint = Checkpoint(storage, value, result)

        with warn_if_slow("on_checkpoint", DEFAULT_GET_TIMEOUT) as profile:
            try:
                trial.on_checkpoint(checkpoint)
            except Exception:
                logger.exception("Trial %s: Error handling checkpoint %s",
                                 trial, checkpoint.value)
                return None
        if profile.too_slow and trial.sync_on_checkpoint:
            logger.warning(
                "Consider turning off forced head-worker trial checkpoint "
                "syncs by setting sync_on_checkpoint=False. Note that this "
                "might result in faulty trial restoration for some worker "
                "failure modes.")
        return checkpoint.value
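`warn_if_slow` is a timing guard from Ray Tune's utilities. A minimal reconstruction, assuming only the behaviour these snippets rely on (a context manager that yields a profile object with a `too_slow` flag and logs a warning past a threshold); the real helper may differ:

import contextlib
import logging
import time

logger = logging.getLogger(__name__)

class _Profile:
    """Result object exposing the too_slow flag the caller checks."""
    too_slow = False

@contextlib.contextmanager
def warn_if_slow(name, threshold_s=0.5):
    profile = _Profile()
    start = time.monotonic()
    try:
        yield profile
    finally:
        elapsed = time.monotonic() - start
        if elapsed > threshold_s:
            profile.too_slow = True
            logger.warning("%s took %.2fs (threshold %.2fs)",
                           name, elapsed, threshold_s)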
Example #4
    def _exploit_trial(self, trial_executor: RayTrialExecutor, trial: Trial,
                       trial_to_clone: Trial):
        """
        Transfers perturbed state from trial_to_clone -> trial.
        If specified, also logs the updated hyperparam state.
        """

        trial_state = self._trials_states_dict[trial]
        new_state = self._trials_states_dict[trial_to_clone]
        if not new_state.last_checkpoint:
            logger.info(
                "[pbt]: no checkpoint for trial. Skip exploit for Trial {}".
                format(trial))
            return
        new_config = explore(trial_to_clone.config, self._hyperparam_mutations,
                             self._hyperparam_mutate_probability,
                             self._explore_func)
        logger.info(
            "[exploit] transferring weights from trial {} (score {}) -> {} (score {})"
            .format(trial_to_clone, new_state.last_score, trial,
                    trial_state.last_score))

        if self._log_config:
            self._log_config_on_step(trial_state, new_state, trial,
                                     trial_to_clone, new_config)

        new_tag = make_experiment_tag(trial_state.orig_tag, new_config,
                                      self._hyperparam_mutations)
        reset_successful = trial_executor.reset_trial(trial, new_config,
                                                      new_tag)

        if reset_successful:
            trial_executor.restore(
                trial, Checkpoint.from_object(new_state.last_checkpoint))
        else:
            trial_executor.stop_trial(trial, stop_logger=False)
            trial.config = new_config
            trial.experiment_tag = new_tag
            trial_executor.start_trial(
                trial, Checkpoint.from_object(new_state.last_checkpoint))

        # TODO: move to Exploiter
        new_state.num_steps = 0
        trial_state.num_steps = 0
        new_state.num_explorations = 0
        trial_state.num_explorations += 1
        self._num_explorations += 1
        # Transfer over the last perturbation time as well
        trial_state.last_perturbation_time = new_state.last_perturbation_time
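The reset-then-restore branch exists because tearing down and restarting a trial's actor is expensive (see the TODO in Examples #5 and #6 below). Reduced to its skeleton, with stub classes standing in for the real executor and trial, the control flow is:

class StubExecutor:
    # Stand-in for RayTrialExecutor; only the control flow matters here.
    def reset_trial(self, trial, config, tag):
        return False  # pretend the in-place reset failed

    def restore(self, trial, checkpoint):
        print("restore from checkpoint")

    def stop_trial(self, trial, stop_logger=True):
        print("stop trial")

    def start_trial(self, trial, checkpoint):
        print("start trial from checkpoint")

class StubTrial:
    config = None
    experiment_tag = None

def exploit(executor, trial, new_config, new_tag, checkpoint):
    if executor.reset_trial(trial, new_config, new_tag):
        # Cheap path: the live actor accepted the new config in place.
        executor.restore(trial, checkpoint)
    else:
        # Expensive fallback: full teardown, then restart from checkpoint.
        executor.stop_trial(trial, stop_logger=False)
        trial.config = new_config
        trial.experiment_tag = new_tag
        executor.start_trial(trial, checkpoint)

exploit(StubExecutor(), StubTrial(), {"lr": 0.02}, "new-tag", checkpoint=None)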
Example #5
    def _exploit(self, trial_executor, trial, trial_to_clone):
        """Transfers perturbed state from trial_to_clone -> trial."""

        trial_state = self._trial_state[trial]
        new_state = self._trial_state[trial_to_clone]
        if not new_state.last_checkpoint:
            print("[pbt] warn: no checkpoint for trial, skip exploit", trial)
            return
        new_config = explore(trial_to_clone.config, self._hyperparam_mutations,
                             self._resample_probability,
                             self._custom_explore_fn)
        print("[exploit] transferring weights from trial "
              "{} (score {}) -> {} (score {})".format(trial_to_clone,
                                                      new_state.last_score,
                                                      trial,
                                                      trial_state.last_score))
        # TODO(ekl) restarting the trial is expensive. We should implement a
        # lighter way reset() method that can alter the trial config.
        trial_executor.stop_trial(trial, stop_logger=False)
        trial.config = new_config
        trial.experiment_tag = make_experiment_tag(trial_state.orig_tag,
                                                   new_config,
                                                   self._hyperparam_mutations)
        trial_executor.start_trial(
            trial, Checkpoint.from_object(new_state.last_checkpoint))
        self._num_perturbations += 1
        # Transfer over the last perturbation time as well
        trial_state.last_perturbation_time = new_state.last_perturbation_time
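The heavy lifting happens inside `explore()`, which is imported from the PBT module rather than shown in these snippets. A simplified stand-in that captures the usual PBT behaviour (resample with probability `resample_probability`, otherwise perturb numeric values by a 0.8/1.2 factor); not Ray Tune's actual implementation:

import random

def explore(config, mutations, resample_probability, custom_explore_fn=None):
    """Toy PBT explore: resample or perturb each mutated hyperparameter."""
    new_config = dict(config)
    for key, spec in mutations.items():
        resample = random.random() < resample_probability
        if resample or not isinstance(config[key], (int, float)):
            # Draw fresh from a distribution (callable) or a value list.
            new_config[key] = spec() if callable(spec) else random.choice(spec)
        else:
            # Perturb continuous values by a factor of 0.8 or 1.2.
            new_config[key] = config[key] * random.choice([0.8, 1.2])
    if custom_explore_fn is not None:
        new_config = custom_explore_fn(new_config)
    return new_config

print(explore({"lr": 0.01, "momentum": 0.9},
              {"lr": lambda: random.uniform(1e-4, 1e-1),
               "momentum": [0.8, 0.9, 0.99]},
              resample_probability=0.25))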
Example #6
    def _exploit(self, trial_executor, trial, trial_to_clone):
        """Transfers perturbed state from trial_to_clone -> trial."""

        trial_state = self._trial_state[trial]
        new_state = self._trial_state[trial_to_clone]
        if not new_state.last_checkpoint:
            logger.warning("[pbt]: no checkpoint for trial."
                           " Skip exploit for Trial {}".format(trial))
            return
        new_config = explore(trial_to_clone.config, self._hyperparam_mutations,
                             self._resample_probability,
                             self._custom_explore_fn)
        logger.warning("[exploit] transferring weights from trial "
                       "{} (score {}) -> {} (score {})".format(
                           trial_to_clone, new_state.last_score, trial,
                           trial_state.last_score))
        # TODO(ekl) restarting the trial is expensive. We should implement a
        # lighter way reset() method that can alter the trial config.
        new_tag = make_experiment_tag(trial_state.orig_tag, new_config,
                                      self._hyperparam_mutations)
        reset_successful = trial_executor.reset_trial(trial, new_config,
                                                      new_tag)
        if not reset_successful:
            trial_executor.stop_trial(trial, stop_logger=False)
            trial.config = new_config
            trial.experiment_tag = new_tag
            trial_executor.start_trial(
                trial, Checkpoint.from_object(new_state.last_checkpoint))

        self._num_perturbations += 1
        # Transfer over the last perturbation time as well
        trial_state.last_perturbation_time = new_state.last_perturbation_time
Example #7
    def save(self, trial, storage=Checkpoint.PERSISTENT, result=None):
        """Saves the trial's state to a checkpoint.

        Args:
            trial (Trial): The state of this trial to be saved.
            storage (str): Where to store the checkpoint. Defaults to
                PERSISTENT.
            result (dict): The state of this trial as a dictionary to be saved.
                If result is None, the trial's last result will be used.

        Returns:
             Checkpoint future, or None if an Exception occurs.
        """
        result = result or trial.last_result

        with self._change_working_directory(trial):
            if storage == Checkpoint.MEMORY:
                value = trial.runner.save_to_object.remote()
                checkpoint = Checkpoint(storage, value, result)
            else:
                with warn_if_slow("save_checkpoint_to_storage"):
                    # TODO(ujvl): Make this asynchronous.
                    value = ray.get(trial.runner.save.remote())
                    checkpoint = Checkpoint(storage, value, result)
        with warn_if_slow("on_checkpoint", DEFAULT_GET_TIMEOUT) as profile:
            try:
                trial.on_checkpoint(checkpoint)
            except Exception:
                logger.exception("Trial %s: Error handling checkpoint %s",
                                 trial, checkpoint.value)
                return None
        if profile.too_slow and trial.sync_on_checkpoint:
            logger.warning(
                "Consider turning off forced head-worker trial checkpoint "
                "syncs by setting sync_on_checkpoint=False. Note that this "
                "might result in faulty trial restoration for some worker "
                "failure modes.")
        return checkpoint.value
Example #8
    def checkpoint(self):
        return Checkpoint(Checkpoint.MEMORY, "None", {})
Example #9
    def save(self, trial, type=Checkpoint.PERSISTENT, result=None):
        return Checkpoint(Checkpoint.PERSISTENT, trial.trainable_name, result)
Example #10
File: pb2.py Project: os-popt/PB2
    def _exploit(self, trial_executor, trial, trial_to_clone):
        """Transfers perturbed state from trial_to_clone -> trial.

        If specified, also logs the updated hyperparam state.

        """

        trial_state = self._trial_state[trial]
        new_state = self._trial_state[trial_to_clone]

        if not new_state.last_checkpoint:
            logger.info("[pbt]: no checkpoint for trial."
                        " Skip exploit for Trial {}".format(trial))
            return

        # If we are at a new timestep, we don't want to penalise trials
        # that are still running.
        if self.data['T'].max() > self.latest:
            self.current = None

        print("\n\n\n\n Copying: \n{} \n with:{} \n\n".format(
            str(trial), str(trial_to_clone)))
        new_config, lengthscale, mindist, meandist, data = explore(
            self.data, self.bounds, self.current, trial_to_clone, trial,
            trial_to_clone.config, self._hyperparam_mutations,
            self._resample_probability)

        # important to replace the old values, since we are copying across
        self.data = data.copy()

        # If the config being selected is at a point we have already evaluated,
        # append it to `current`, the set of points in the current batch.

        new = []
        for key in self._hyperparam_mutations.keys():
            new.append(new_config[key])

        new = np.array(new)
        new = new.reshape(1, new.size)
        if self.data['T'].max() > self.latest:
            self.latest = self.data['T'].max()
            self.current = new.copy()
        else:
            self.current = np.concatenate((self.current, new), axis=0)
            print("\n\n\n\n\n Currently Evaluating \n\n\n\n\n")
            print(self.current)
            print("\n\n\n\n\n")

        # log the lengthscale
        self.meta['timesteps'].append(self.data['T'].values[-1])
        self.meta['lengthscales'].append(lengthscale)
        self.meta['closest'].append(mindist)
        self.meta['meandist'].append(meandist)
        meta = pd.DataFrame({
            'timesteps': self.meta['timesteps'],
            'lengthscales': self.meta['lengthscales'],
            'closest': self.meta['closest'],
            'meandist': self.meta['meandist']
        })
        meta.to_csv('meta_data.csv')

        logger.info("[exploit] transferring weights from trial "
                    "{} (score {}) -> {} (score {})".format(
                        trial_to_clone, new_state.last_score, trial,
                        trial_state.last_score))

        if self._log_config:
            self._log_config_on_step(trial_state, new_state, trial,
                                     trial_to_clone, new_config)

        new_tag = make_experiment_tag(trial_state.orig_tag, new_config,
                                      self._hyperparam_mutations)
        reset_successful = trial_executor.reset_trial(trial, new_config,
                                                      new_tag)
        if reset_successful:
            trial_executor.restore(
                trial, Checkpoint.from_object(new_state.last_checkpoint))
        else:
            trial_executor.stop_trial(trial, stop_logger=False)
            trial.config = new_config
            trial.experiment_tag = new_tag
            trial_executor.start_trial(
                trial, Checkpoint.from_object(new_state.last_checkpoint))

        self._num_perturbations += 1
        # Transfer over the last perturbation time as well
        trial_state.last_perturbation_time = new_state.last_perturbation_time
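The numpy juggling mid-method tracks which configs belong to the batch being evaluated at the latest timestep, resetting the batch whenever a new timestep appears in the data. Isolated, the bookkeeping looks roughly like this (class and attribute names are illustrative, not PB2's API):

import numpy as np

class BatchTracker:
    # Mirrors the self.current / self.latest bookkeeping above.
    def __init__(self):
        self.current = None  # configs evaluated at the latest timestep
        self.latest = 0      # most recent timestep seen

    def record(self, values, timestep):
        row = np.asarray(values, dtype=float).reshape(1, -1)
        if timestep > self.latest:
            # New timestep: start a fresh batch.
            self.latest = timestep
            self.current = row.copy()
        else:
            # Same timestep: append this config to the batch.
            self.current = np.concatenate((self.current, row), axis=0)

tracker = BatchTracker()
tracker.record([0.01, 0.9], timestep=1)
tracker.record([0.02, 0.8], timestep=1)
print(tracker.current)  # two rows: both configs from timestep 1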