def _final_msg_callback(self, msg: dict) -> None:
    """Final message callback. Logs trial results and registers executor as idle.

    Invoked when an executor reports the final metric for its trial. The
    trial is finalized and archived, results/logs are updated, and the
    executor's reservation is either released (experiment done / idle) or
    assigned the next trial produced by the controller.

    :param msg: The final executor message from the message queue. Must
        carry ``trial_id``, ``data`` (the final metric) and
        ``partition_id``; may carry ``logs``.
    """
    trial = self.get_trial(msg["trial_id"])
    # Executor logs are optional in the message; append them under the
    # log lock since other threads read/extend ``executor_logs`` too.
    logs = msg.get("logs", None)
    if logs is not None:
        with self.log_lock:
            self.executor_logs = self.executor_logs + logs
    # finalize the trial object
    with trial.lock:
        trial.status = Trial.FINALIZED
        trial.final_metric = msg["data"]
        trial.duration = util.seconds_to_milliseconds(time.time() - trial.start)
    # move trial to the finalized ones
    self._final_store.append(trial)
    self._trial_store.pop(trial.trial_id)
    # update result dictionary
    self._update_result(trial)
    # keep for later in case tqdm doesn't work
    self.maggy_log = self._update_maggy_log()
    self.log(self.maggy_log)
    # Persist the finalized trial to the experiment's log directory
    # (one JSON file per trial, keyed by trial id).
    EnvSing.get_instance().dump(
        trial.to_json(),
        self.log_dir + "/" + trial.trial_id + "/trial.json",
    )
    # assign new trial
    # controller_get_next returns None when the experiment is finished,
    # the sentinel string "IDLE" when no trial is ready yet, or the next
    # Trial object to schedule — NOTE(review): inferred from the three
    # branches below; confirm against the controller implementation.
    trial = self.controller_get_next(trial)
    if trial is None:
        # No more trials: release the executor and mark the experiment done.
        self.server.reservations.assign_trial(msg["partition_id"], None)
        self.experiment_done = True
    elif trial == "IDLE":
        # Nothing to run right now: record the idle period and release
        # the executor's reservation until new work arrives.
        self.add_message({
            "type": "IDLE",
            "partition_id": msg["partition_id"],
            "idle_start": time.time(),
        })
        self.server.reservations.assign_trial(msg["partition_id"], None)
    else:
        # Schedule the next trial on the now-free executor.
        with trial.lock:
            trial.start = time.time()
            trial.status = Trial.SCHEDULED
            self.server.reservations.assign_trial(msg["partition_id"], trial.trial_id)
            self.add_trial(trial)
def finalize(self, job_end: float) -> dict:
    """Saves a summary of the experiment to a dict and logs it in the DFS.

    Records the end time and total duration, prints and logs the prepared
    results summary, and dumps ``result.json`` plus ``maggy.json`` into
    the experiment's log directory.

    :param job_end: Time of the job end.

    :returns: The experiment summary dict.
    """
    self.job_end = job_end
    self.duration = util.seconds_to_milliseconds(self.job_end - self.job_start)
    # Human-readable elapsed time for the printed/logged summary.
    elapsed = util.time_diff(self.job_start, self.job_end)
    summary = self.prep_results(elapsed)
    print(summary)
    self.log(summary)
    # Persist the raw result dict and the experiment metadata to the DFS.
    EnvSing.get_instance().dump(
        json.dumps(self.result, default=util.json_default_numpy),
        f"{self.log_dir}/result.json",
    )
    EnvSing.get_instance().dump(self.json(), f"{self.log_dir}/maggy.json")
    return self.result_dict
def lagom(train_fn: Callable, config: LagomConfig) -> dict:
    """Launches a maggy experiment, which depending on 'config' can either
    be a hyperparameter optimization, an ablation study experiment or
    distributed training. Given a search space, objective and a model
    training procedure `train_fn` (black-box function), an experiment is
    the whole process of finding the best hyperparameter combination in
    the search space, optimizing the black-box function. Currently maggy
    supports random search and a median stopping rule.

    **lagom** is a Swedish word meaning "just the right amount".

    :param train_fn: User defined experiment containing the model training.
    :param config: An experiment configuration. For more information, see config.

    :returns: The experiment results as a dict.

    :raises RuntimeError: If another experiment is already running.
    """
    global APP_ID
    global RUNNING
    global RUN_ID
    job_start = time.time()
    # Guard BEFORE the try/finally: previously this check sat inside the
    # try, so rejecting a concurrent call still executed the finally —
    # resetting RUNNING (disabling the guard for the experiment that is
    # actually still running) and bumping RUN_ID for a run that never
    # started. Raising here leaves the running experiment's state intact.
    if RUNNING:
        raise RuntimeError("An experiment is currently running.")
    RUNNING = True
    try:
        spark_context = util.find_spark().sparkContext
        APP_ID = str(spark_context.applicationId)
        APP_ID, RUN_ID = util.register_environment(APP_ID, RUN_ID)
        EnvSing.get_instance().set_app_id(APP_ID)
        driver = lagom_driver(config, APP_ID, RUN_ID)
        return driver.run_experiment(train_fn, config)
    except:  # noqa: E722
        # Deliberate bare except: report the failure duration for any
        # exception type (including KeyboardInterrupt), then re-raise.
        _exception_handler(
            util.seconds_to_milliseconds(time.time() - job_start))
        raise
    finally:
        # Clean up spark jobs
        RUN_ID += 1
        RUNNING = False
        util.find_spark().sparkContext.setJobGroup("", "")