def __init__(self, config: HyperparameterOptConfig, app_id: int, run_id: int):
    """Validate the config and wire up the optimization controller.

    After the base-class initialization, the trial stores and bookkeeping
    fields are created. For an ``AblationConfig`` the method returns at
    that point; otherwise the server, searchspace and controller are built
    and the controller is handed references to the driver's stores.

    :param config: Experiment config.
    :param app_id: Maggy application ID.
    :param run_id: Maggy run ID.
    :raises ValueError: If ``config.direction`` is not the string 'min' or
        'max' (compared case-insensitively).
    """
    super().__init__(config, app_id, run_id)
    self._final_store = []
    self._trial_store = {}
    self.experiment_done = False
    self.maggy_log = ""
    self.job_end = None
    self.duration = None

    # Interrupt init for AblationDriver: nothing below applies to it.
    if isinstance(config, AblationConfig):
        return

    self.num_trials = config.num_trials
    available_executors = util.num_executors(self.spark_context)
    # Never use more executors than there are trials to run.
    self.num_executors = min(available_executors, self.num_trials)
    self.server = OptimizationServer(self.num_executors, config.__class__)
    self.searchspace = self._init_searchspace(config.searchspace)
    self.controller = self._init_controller(config.optimizer, self.searchspace)

    # If the optimizer has a pruner, the pruner determines the trial count.
    pruner = self.controller.pruner
    if pruner:
        self.num_trials = pruner.num_trials()

    # For grid search the trial count follows from the user's searchspace.
    if isinstance(self.controller, GridSearch):
        self.num_trials = self.controller.get_num_trials(config.searchspace)

    self.earlystop_check = self._init_earlystop_check(config.es_policy)
    self.es_interval = config.es_interval
    self.es_min = config.es_min

    # Reject anything that is not the string 'min' or 'max' up front.
    direction = config.direction
    if not isinstance(direction, str) or direction.lower() not in ("min", "max"):
        raise ValueError(
            "The experiment's direction should be a string (either 'min' or 'max') "
            "but it is {0} (of type '{1}').".format(
                str(direction), type(direction).__name__
            )
        )
    self.direction = direction.lower()

    self.result = {"best_val": "n.a.", "num_trials": 0, "early_stopped": 0}

    # Hand the controller its settings and references to the driver's
    # stores, then let it initialize itself against the log directory.
    controller = self.controller
    controller.num_trials = self.num_trials
    controller.searchspace = self.searchspace
    controller.trial_store = self._trial_store
    controller.final_store = self._final_store
    controller.direction = self.direction
    controller._initialize(exp_dir=self.log_dir)
def __init__(self, config: LagomConfig, app_id: int, run_id: int):
    """Initialize identifiers, the driver secret, the message queue and logging.

    :param config: Experiment config.
    :param app_id: Maggy application ID.
    :param run_id: Maggy run ID.
    """
    global DRIVER_SECRET

    self.config = config
    self.app_id = app_id
    self.run_id = run_id
    self.name = config.name
    self.description = config.description

    spark_session = util.find_spark()
    self.spark_context = spark_session.sparkContext
    self.num_executors = util.num_executors(self.spark_context)
    self.hb_interval = config.hb_interval
    self.server_addr = None
    self.job_start = None

    # Generate the module-level secret only if none is set yet; otherwise
    # the existing one is reused.
    if not DRIVER_SECRET:
        DRIVER_SECRET = self._generate_secret(self.SECRET_BYTES)
    self._secret = DRIVER_SECRET

    # Logging related initialization
    self._message_q = queue.Queue()
    self.message_callbacks = {}
    self._register_msg_callbacks()
    self.worker_done = False
    self.executor_logs = ""
    self.log_lock = threading.RLock()

    env = EnvSing.get_instance()
    self.log_dir = env.get_logdir(app_id, run_id)
    log_file = self.log_dir + "/maggy.log"
    # Make sure the log file exists before opening a handle on it.
    if not env.exists(log_file):
        env.dump("", log_file)
    self.log_file_handle = env.open_file(log_file, flags="w")

    self.exception = None
    self.result = None
    self.result_dict = {}
    self.main_metric_key = None
def lagom(
    map_fun,
    name="no-name",
    experiment_type="optimization",
    searchspace=None,
    optimizer=None,
    direction="max",
    num_trials=1,
    ablation_study=None,
    ablator=None,
    optimization_key="metric",
    hb_interval=1,
    es_policy="median",
    es_interval=300,
    es_min=10,
    description="",
):
    """Launches a maggy experiment, which depending on `experiment_type` can
    either be a hyperparameter optimization or an ablation study experiment.

    Given a search space, objective and a model training procedure `map_fun`
    (black-box function), an experiment is the whole process of finding the
    best hyperparameter combination in the search space, optimizing the
    black-box function. Currently maggy supports random search and a median
    stopping rule.

    **lagom** is a Swedish word meaning "just the right amount".

    :param map_fun: User defined experiment containing the model training.
    :type map_fun: function
    :param name: A user defined experiment identifier.
    :type name: str
    :param experiment_type: Type of Maggy experiment, either 'optimization'
        (default) or 'ablation'.
    :type experiment_type: str
    :param searchspace: A maggy Searchspace object from which samples are
        drawn.
    :type searchspace: Searchspace
    :param optimizer: The optimizer is the part generating new trials.
    :type optimizer: str, AbstractOptimizer
    :param direction: If set to 'max' the highest value returned will
        correspond to the best solution, if set to 'min' the opposite is true.
    :type direction: str
    :param num_trials: The number of trials to evaluate given the search
        space, each containing a different hyperparameter combination.
    :type num_trials: int
    :param ablation_study: Ablation study object. Can be None for the
        optimization experiment type.
    :type ablation_study: AblationStudy
    :param ablator: Ablator to use for experiment type 'ablation'.
    :type ablator: str, AbstractAblator
    :param optimization_key: Name of the metric to be optimized.
    :type optimization_key: str, optional
    :param hb_interval: The heartbeat interval in seconds from trial executor
        to experiment driver, defaults to 1.
    :type hb_interval: int, optional
    :param es_policy: The early stopping policy, defaults to 'median'.
    :type es_policy: str, optional
    :param es_interval: Frequency interval in seconds to check currently
        running trials for early stopping, defaults to 300.
    :type es_interval: int, optional
    :param es_min: Minimum number of trials finalized before checking for
        early stopping, defaults to 10.
    :type es_min: int, optional
    :param description: A longer description of the experiment.
    :type description: str, optional
    :raises RuntimeError: An experiment is currently running, or
        `experiment_type` is neither 'optimization' nor 'ablation'.
    :raises AssertionError: `num_trials` is not greater than zero for an
        optimization experiment.
    :return: The result of the driver's ``finalize`` call: a dictionary
        indicating the best trial and best hyperparameter combination with
        its performance metric.
    :rtype: dict
    """
    global running
    global app_id
    global experiment_json
    global run_id

    if running:
        raise RuntimeError("An experiment is currently running.")

    job_start = time.time()
    sc = hopsutil._find_spark().sparkContext
    exp_driver = None

    try:
        app_id = str(sc.applicationId)
        app_id, run_id = util._validate_ml_id(app_id, run_id)

        # start run
        running = True
        experiment_utils._set_ml_id(app_id, run_id)

        # create experiment dir
        experiment_utils._create_experiment_dir(app_id, run_id)

        # Resolved once: app_id and run_id are not modified again until the
        # cleanup in the finally clause.
        log_dir = experiment_utils._get_logdir(app_id, run_id)
        tensorboard._register(log_dir)

        num_executors = util.num_executors(sc)

        # start experiment driver
        if experiment_type == "optimization":
            # NOTE: kept as an assert so the exception type seen by callers
            # is unchanged; be aware it is skipped when running with -O.
            assert num_trials > 0, "number of trials should be greater than zero"

            tensorboard._write_hparams_config(log_dir, searchspace)

            # No point in using more executors than there are trials.
            num_executors = min(num_executors, num_trials)

            exp_driver = experimentdriver.ExperimentDriver(
                "optimization",
                searchspace=searchspace,
                optimizer=optimizer,
                direction=direction,
                num_trials=num_trials,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                es_policy=es_policy,
                es_interval=es_interval,
                es_min=es_min,
                description=description,
                log_dir=log_dir,
            )
            exp_function = exp_driver.optimizer.name()
        elif experiment_type == "ablation":
            exp_driver = experimentdriver.ExperimentDriver(
                "ablation",
                ablation_study=ablation_study,
                ablator=ablator,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                description=description,
                log_dir=log_dir,
            )
            # using exp_driver.num_executors since
            # it has been set using ablator.get_number_of_trials()
            # in experiment.py
            num_executors = min(num_executors, exp_driver.num_executors)
            exp_function = exp_driver.ablator.name()
        else:
            # Nothing was started, so the cleanup below must not try to
            # stop a driver.
            running = False
            raise RuntimeError(
                "Unknown experiment_type: "
                "should be either 'optimization' or 'ablation', "
                "But it is '{0}'".format(str(experiment_type))
            )

        nodeRDD = sc.parallelize(range(num_executors), num_executors)

        # Do provenance after initializing exp_driver, because exp_driver does
        # the type checks for optimizer and searchspace
        sc.setJobGroup(os.environ["ML_ID"], "{0} | {1}".format(name, exp_function))

        experiment_json = experiment_utils._populate_experiment(
            name,
            exp_function,
            "MAGGY",
            exp_driver.searchspace.json(),
            description,
            app_id,
            direction,
            optimization_key,
        )

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, "CREATE"
        )

        util._log(
            "Started Maggy Experiment: {0}, {1}, run {2}".format(name, app_id, run_id)
        )

        exp_driver.init(job_start)

        server_addr = exp_driver.server_addr

        # Force execution on executor, since GPU is located on executor
        nodeRDD.foreachPartition(
            trialexecutor._prepare_func(
                app_id,
                run_id,
                experiment_type,
                map_fun,
                server_addr,
                hb_interval,
                exp_driver._secret,
                optimization_key,
                log_dir,
            )
        )
        job_end = time.time()

        result = exp_driver.finalize(job_end)
        best_logdir = log_dir + "/" + result["best_id"]

        util._finalize_experiment(
            experiment_json,
            float(result["best_val"]),
            app_id,
            run_id,
            "FINISHED",
            exp_driver.duration,
            log_dir,
            best_logdir,
            optimization_key,
        )

        util._log("Finished Experiment")

        return result

    except:  # noqa: E722
        # Catch-all on purpose: the failure is reported and then always
        # re-raised, never swallowed.
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - job_start)
        )
        # Prefer the exception recorded by the driver, if there is one, over
        # the one that surfaced here.
        if exp_driver:
            if exp_driver.exception:
                raise exp_driver.exception
        raise
    finally:
        # grace period to send last logs to sparkmagic
        # sparkmagic hb poll interval is 5 seconds, therefore wait 6 seconds
        time.sleep(6)
        # cleanup spark jobs
        if running and exp_driver is not None:
            exp_driver.stop()
        run_id += 1
        running = False
        sc.setJobGroup("", "")
        # Deliberately no `return` here: returning from a finally clause
        # would discard the exception re-raised above. The success path
        # returns from inside the try block.