def _final_msg_callback(self, msg: dict) -> None:
        """Final message callback.

        Logs trial results and registers executor as idle.

        :param msg: The final executor message from the message queue.
        """
        trial = self.get_trial(msg["trial_id"])
        logs = msg.get("logs")
        if logs is not None:
            with self.log_lock:
                self.executor_logs += logs

        # finalize the trial object
        with trial.lock:
            trial.status = Trial.FINALIZED
            trial.final_metric = msg["data"]
            trial.duration = util.seconds_to_milliseconds(time.time() -
                                                          trial.start)

        # move trial to the finalized ones
        self._final_store.append(trial)
        self._trial_store.pop(trial.trial_id)

        # update result dictionary
        self._update_result(trial)
        # keep for later in case tqdm doesn't work
        self.maggy_log = self._update_maggy_log()
        self.log(self.maggy_log)

        EnvSing.get_instance().dump(
            trial.to_json(),
            self.log_dir + "/" + trial.trial_id + "/trial.json",
        )

        # assign a new trial: the controller returns a Trial to schedule,
        # "IDLE" if no trial is ready yet, or None if the experiment is done
        trial = self.controller_get_next(trial)
        if trial is None:
            self.server.reservations.assign_trial(msg["partition_id"], None)
            self.experiment_done = True
        elif trial == "IDLE":
            self.add_message({
                "type": "IDLE",
                "partition_id": msg["partition_id"],
                "idle_start": time.time(),
            })
            self.server.reservations.assign_trial(msg["partition_id"], None)
        else:
            with trial.lock:
                trial.start = time.time()
                trial.status = Trial.SCHEDULED
                self.server.reservations.assign_trial(msg["partition_id"],
                                                      trial.trial_id)
                self.add_trial(trial)
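
For reference, a hypothetical sketch of the message shape this callback consumes, inferred purely from the keys accessed above ("trial_id", "data", "logs", "partition_id"); the real executor message may carry additional fields:

    msg = {
        "type": "FINAL",            # assumption: tag used by the message queue
        "trial_id": "a1b2c3",       # hypothetical trial identifier
        "data": 0.93,               # final metric reported by the trial
        "logs": "epoch 10/10 ...",  # optional captured executor output
        "partition_id": 3,          # executor partition that ran the trial
    }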
Example #2
    def finalize(self, job_end: float) -> dict:
        """Saves a summary of the experiment to a dict and logs it in the DFS.

        :param job_end: Time of the job end.

        :returns: The experiment summary dict.
        """
        self.job_end = job_end
        self.duration = util.seconds_to_milliseconds(self.job_end -
                                                     self.job_start)
        duration_str = util.time_diff(self.job_start, self.job_end)
        results = self.prep_results(duration_str)
        print(results)
        self.log(results)
        EnvSing.get_instance().dump(
            json.dumps(self.result, default=util.json_default_numpy),
            self.log_dir + "/result.json",
        )
        EnvSing.get_instance().dump(self.json(), self.log_dir + "/maggy.json")
        return self.result_dict
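
A minimal usage sketch for finalize, assuming `driver` is an instance of the driver class these methods belong to (the name is illustrative, not taken from the snippet):

    import time

    # ... after run_experiment has processed all trials:
    summary = driver.finalize(job_end=time.time())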
Example #3
def lagom(train_fn: Callable, config: LagomConfig) -> dict:
    """Launches a maggy experiment, which depending on 'config' can either
    be a hyperparameter optimization, an ablation study experiment or distributed
    training. Given a search space, objective and a model training procedure `train_fn`
    (black-box function), an experiment is the whole process of finding the
    best hyperparameter combination in the search space, optimizing the
    black-box function. Currently maggy supports random search and a median
    stopping rule.
    **lagom** is a Swedish word meaning "just the right amount".

    :param train_fn: User defined experiment containing the model training.
    :param config: An experiment configuration. For more information, see config.

    :returns: The experiment results as a dict.
    """
    global APP_ID
    global RUNNING
    global RUN_ID
    job_start = time.time()
    try:
        if RUNNING:
            raise RuntimeError("An experiment is currently running.")
        RUNNING = True
        spark_context = util.find_spark().sparkContext
        APP_ID = str(spark_context.applicationId)
        APP_ID, RUN_ID = util.register_environment(APP_ID, RUN_ID)
        EnvSing.get_instance().set_app_id(APP_ID)
        driver = lagom_driver(config, APP_ID, RUN_ID)
        return driver.run_experiment(train_fn, config)
    except:  # noqa: E722
        _exception_handler(
            util.seconds_to_milliseconds(time.time() - job_start))
        raise
    finally:
        # Clean up spark jobs
        RUN_ID += 1
        RUNNING = False
        util.find_spark().sparkContext.setJobGroup("", "")
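
A hypothetical end-to-end sketch of calling lagom. Only the call `lagom(train_fn=..., config=...)` comes from the snippet above; the import paths, the concrete config class, and the training-function signature are assumptions that vary with the maggy version and experiment type:

    from maggy import lagom                   # assumed import path
    from maggy.config import LagomConfig      # assumed; a concrete subclass is used in practice

    def train():
        # user-defined black-box training procedure; returns the metric
        # the experiment optimizes (the signature depends on the config)
        return 0.0

    config: LagomConfig = ...                 # build a concrete experiment config here
    result = lagom(train_fn=train, config=config)
    print(result)                             # the experiment summary dict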