def _wrapper_fun(iter):
    """Run one random-search trial of ``map_fun`` on a Spark executor.

    Args:
        iter: Iterator over the elements of this partition. The last element
            it yields is used as the executor number.

    Returns:
        None. The value returned by ``map_fun`` is handed to
        ``experiment_utils._handle_return``.
    """
    # The last value yielded by the partition iterator identifies the executor.
    # NOTE(review): an empty partition leaves executor_num unbound.
    for i in iter:
        executor_num = i

    experiment_utils._set_ml_id(app_id, run_id)

    # The monitoring thread is always created because cleanup receives it
    # either way, but it is only started when a GPU is reported.
    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t.start()

    # try/finally: any exception propagates unchanged to the caller, and the
    # cleanup below runs on success and on failure alike.
    try:
        if args_dict:
            param_string, params, args = experiment_utils.build_parameters(
                map_fun, executor_num, args_dict)
            hdfs_exec_logdir, hdfs_appid_logdir = \
                experiment_utils._create_experiment_subdirectories(
                    app_id, run_id, param_string, 'random_search',
                    params=params)
            logfile = experiment_utils._init_logger(hdfs_exec_logdir)
            tensorboard._register(
                hdfs_exec_logdir, hdfs_appid_logdir, executor_num,
                local_logdir=local_logdir)

            print(devices._get_gpu_info())
            print('-------------------------------------------------------')
            print('Started running task ' + param_string)

            task_start = time.time()
            retval = map_fun(*args)
            task_end = time.time()

            experiment_utils._handle_return(
                retval, hdfs_exec_logdir, optimization_key, logfile)

            time_str = ('Finished task ' + param_string + ' - took ' +
                        experiment_utils._time_diff(task_start, task_end))
            print(time_str)
            print('Returning metric ' + str(retval))
            print('-------------------------------------------------------')
    finally:
        experiment_utils._cleanup(tensorboard, t)
def _wrapper_fun(iter):
    """Evaluate one differential-evolution candidate on a Spark executor.

    A value returned by ``_get_return_file`` for this parameter combination is
    decoded from JSON and used instead of calling ``map_fun`` (presumably a
    result stored by an earlier evaluation -- confirm against
    ``_get_return_file``).

    Args:
        :iter: Iterator over the elements of this partition. The last element
            it yields is used as the executor number.

    Returns:
        None. The metric is handed to ``experiment_utils._handle_return``.
    """
    global local_logdir_bool

    # The last value yielded by the partition iterator identifies the executor.
    for i in iter:
        executor_num = i

    experiment_utils._set_ml_id(app_id, run_id)

    # The monitoring thread is always created because cleanup receives it
    # either way, but it is only started when a GPU is reported.
    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t.start()

    # try/finally: any exception propagates unchanged to the caller, and the
    # cleanup below runs on success and on failure alike.
    try:
        if args_dict:
            param_string, params, args = experiment_utils.build_parameters(
                map_fun, executor_num, args_dict)

            val = _get_return_file(param_string, app_id, generation_id, run_id)

            hdfs_exec_logdir, hdfs_appid_logdir = \
                experiment_utils._create_experiment_subdirectories(
                    app_id, run_id, param_string, 'differential_evolution',
                    sub_type='generation.' + str(generation_id),
                    params=params)
            logfile = experiment_utils._init_logger(hdfs_exec_logdir)
            tensorboard._register(
                hdfs_exec_logdir, hdfs_appid_logdir, executor_num,
                local_logdir=local_logdir_bool)

            print(devices._get_gpu_info())
            print('-------------------------------------------------------')
            print('Started running task ' + param_string)

            if val is not None:
                val = json.loads(val)

            task_start = time.time()
            # Only call the user function when no usable value was loaded.
            if val is None:
                val = map_fun(*args)
            task_end = time.time()

            time_str = ('Finished task ' + param_string + ' - took ' +
                        experiment_utils._time_diff(task_start, task_end))
            print(time_str)

            experiment_utils._handle_return(
                val, hdfs_exec_logdir, opt_key, logfile)

            print('Returning metric ' + str(val))
            print('-------------------------------------------------------')
    finally:
        experiment_utils._cleanup(tensorboard, t)
def _start_run():
    """Flag an experiment as running and register its ML id.

    Sets the module-level ``running`` flag to ``True`` and then passes the
    module-level ``app_id`` and ``run_id`` to ``experiment_utils._set_ml_id``.
    """
    global running, app_id, run_id

    # The flag is raised before the id is registered, so it stays set even if
    # the registration call raises.
    running = True
    experiment_utils._set_ml_id(app_id, run_id)
def _wrapper_fun(iter):
    """
    Wraps the user supplied training function in order to be passed to the
    Spark Executors.

    The executor registers with the experiment driver over RPC, then keeps
    requesting trial suggestions and running ``map_fun`` on them until the
    client reports that it is done. While the wrapper runs, the builtin
    ``print`` is replaced by a version that also forwards the text to the
    executor's reporter; the original ``print`` is restored on exit.

    Args:
        iter: Partition iterator supplied by Spark; not read by this function.

    Returns:
        None.
    """
    experiment_utils._set_ml_id(app_id, run_id)

    # get task context information to determine executor identifier
    partition_id, task_attempt = util.get_partition_attempt_id()

    client = rpc.Client(server_addr, partition_id, task_attempt, hb_interval, secret)
    log_file = (
        log_dir + "/executor_" + str(partition_id) + "_" + str(task_attempt) + ".log"
    )

    # save the builtin print
    original_print = __builtin__.print

    reporter = Reporter(log_file, partition_id, task_attempt, original_print)

    def maggy_print(*args, **kwargs):
        """Maggy custom print() function."""
        original_print(*args, **kwargs)
        reporter.log(" ".join(str(x) for x in args), True)

    # override the builtin print
    __builtin__.print = maggy_print

    try:
        client_addr = client.client_addr
        host_port = client_addr[0] + ":" + str(client_addr[1])

        exec_spec = {
            "partition_id": partition_id,
            "task_attempt": task_attempt,
            "host_port": host_port,
            "trial_id": None,
        }

        reporter.log("Registering with experiment driver", False)
        client.register(exec_spec)

        client.start_heartbeat(reporter)

        # blocking
        trial_id, parameters = client.get_suggestion(reporter)

        while not client.done:
            if experiment_type == "ablation":
                # The ablation markers are metadata, not arguments of map_fun:
                # strip them from the call parameters. A missing marker falls
                # back to the string "None" instead of raising KeyError.
                hparams = {
                    "ablated_feature": parameters.pop("ablated_feature", "None"),
                    "ablated_layer": parameters.pop("ablated_layer", "None"),
                }
            else:
                hparams = parameters

            tb_logdir = log_dir + "/" + trial_id
            trial_log_file = tb_logdir + "/output.log"
            reporter.set_trial_id(trial_id)

            # If trial is repeated, delete trial directory, except log file
            if hopshdfs.exists(tb_logdir):
                util._clean_dir(tb_logdir, [trial_log_file])
            else:
                hopshdfs.mkdir(tb_logdir)

            reporter.init_logger(trial_log_file)
            tensorboard._register(tb_logdir)
            hopshdfs.dump(
                json.dumps(hparams, default=util.json_default_numpy),
                tb_logdir + "/.hparams.json",
            )

            try:
                reporter.log("Starting Trial: {}".format(trial_id), False)
                reporter.log("Trial Configuration: {}".format(parameters), False)

                if experiment_type == "optimization":
                    tensorboard._write_hparams(parameters, trial_id)

                # Pass the reporter only to training functions that declare it.
                sig = inspect.signature(map_fun)
                if sig.parameters.get("reporter", None):
                    retval = map_fun(**parameters, reporter=reporter)
                else:
                    retval = map_fun(**parameters)

                if experiment_type == "optimization":
                    tensorboard._write_session_end()

                retval = util._handle_return_val(
                    retval, tb_logdir, optimization_key, trial_log_file
                )

            except exceptions.EarlyStopException as e:
                # An early-stopped trial reports the metric carried by the
                # exception instead of a return value.
                retval = e.metric
                reporter.log("Early Stopped Trial.", False)

            reporter.log("Finished Trial: {}".format(trial_id), False)
            reporter.log("Final Metric: {}".format(retval), False)

            client.finalize_metric(retval, reporter)

            # blocking
            trial_id, parameters = client.get_suggestion(reporter)

    except:  # noqa: E722
        # Deliberately broad: record the traceback in the executor log before
        # the error propagates.
        reporter.log(traceback.format_exc(), False)
        raise
    finally:
        # Undo the monkey patch first, so the process-wide print never keeps
        # pointing at a reporter whose logger has been closed.
        __builtin__.print = original_print
        reporter.close_logger()
        client.stop()
        client.close()
def _wrapper_fun(iter):
    """Join the all-reduce cluster reservation and run ``map_fun``.

    The executor reserves a free local port, registers ``host:port`` with the
    reservation server, waits for the cluster description and derives its own
    task entry (chief, worker or evaluator) from it. With more than one
    executor the description is exported as ``TF_CONFIG`` before ``map_fun``
    is called.

    Args:
        iter: Iterator over the elements of this partition. The last element
            it yields is used as the executor number.

    Returns:
        None. On the chief, the value returned by ``map_fun`` is handed to
        ``experiment_utils._handle_return_simple``.
    """
    # The last value yielded by the partition iterator identifies the executor.
    for i in iter:
        executor_num = i

    experiment_utils._set_ml_id(app_id, run_id)

    # The monitoring thread is always created because cleanup receives it
    # either way, but it is only started when a GPU is reported.
    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t.start()

    try:
        host = experiment_utils._get_ip_address()

        # Binding to port 0 lets the OS pick a free port. The socket stays
        # open until the reservations are back and is closed, together with
        # the reservation client, even when the reservation fails.
        tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        client = None
        try:
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]

            client = allreduce_reservation.Client(server_addr)
            host_port = host + ":" + str(port)

            client.register({"worker": host_port, "index": executor_num})
            cluster = client.await_reservations()
        finally:
            tmp_socket.close()
            if client is not None:
                client.close()

        task_index = experiment_utils._find_index(host_port, cluster)

        # An index of -1 is treated as "this executor is the chief".
        if task_index == -1:
            cluster["task"] = {"type": "chief", "index": 0}
        else:
            cluster["task"] = {"type": "worker", "index": task_index}

        if evaluator:
            # The last worker in the list is moved to the evaluator role.
            last_worker_index = len(cluster["cluster"]["worker"]) - 1
            evaluator_node = cluster["cluster"]["worker"][last_worker_index]
            cluster["cluster"]["evaluator"] = [evaluator_node]
            del cluster["cluster"]["worker"][last_worker_index]
            if evaluator_node == host_port:
                cluster["task"] = {"type": "evaluator", "index": 0}

        print('TF_CONFIG: {} '.format(cluster))

        if num_executors > 1:
            os.environ["TF_CONFIG"] = json.dumps(cluster)

        is_chief = (cluster["task"]["type"] == "chief")
        is_evaluator = (cluster["task"]["type"] == "evaluator")

        if is_chief:
            logdir = experiment_utils._get_logdir(app_id, run_id)
            tensorboard._register(
                logdir, logdir, executor_num, local_logdir=local_logdir)
        elif is_evaluator:
            logdir = experiment_utils._get_logdir(app_id, run_id)
            tensorboard.events_logdir = logdir

        logfile = experiment_utils._init_logger(
            experiment_utils._get_logdir(app_id, run_id),
            role=cluster["task"]["type"],
            index=cluster["task"]["index"])

        print(devices._get_gpu_info())
        print('-------------------------------------------------------')
        print('Started running task')
        task_start = time.time()

        retval = map_fun()

        # Only the chief's return value is handed on.
        if is_chief:
            experiment_utils._handle_return_simple(
                retval, experiment_utils._get_logdir(app_id, run_id), logfile)

        task_end = time.time()
        time_str = ('Finished task - took ' +
                    experiment_utils._time_diff(task_start, task_end))
        print(time_str)
        print('-------------------------------------------------------')
    finally:
        experiment_utils._cleanup(tensorboard, t)
def set_ml_id(self, app_id, run_id):
    """Forward to ``experiment_utils._set_ml_id`` and return its result.

    Args:
        app_id: Application id, passed through unchanged.
        run_id: Run id, passed through unchanged.

    Returns:
        Whatever ``experiment_utils._set_ml_id`` returns.
    """
    ml_id_result = experiment_utils._set_ml_id(app_id, run_id)
    return ml_id_result
def lagom(
    map_fun,
    name="no-name",
    experiment_type="optimization",
    searchspace=None,
    optimizer=None,
    direction="max",
    num_trials=1,
    ablation_study=None,
    ablator=None,
    optimization_key="metric",
    hb_interval=1,
    es_policy="median",
    es_interval=300,
    es_min=10,
    description="",
):
    """Launches a maggy experiment, which depending on `experiment_type` can
    either be a hyperparameter optimization or an ablation study experiment.
    Given a search space, objective and a model training procedure `map_fun`
    (black-box function), an experiment is the whole process of finding the
    best hyperparameter combination in the search space, optimizing the
    black-box function. Currently maggy supports random search and a median
    stopping rule.

    **lagom** is a Swedish word meaning "just the right amount".

    :param map_fun: User defined experiment containing the model training.
    :type map_fun: function
    :param name: A user defined experiment identifier.
    :type name: str
    :param experiment_type: Type of Maggy experiment, either 'optimization'
        (default) or 'ablation'.
    :type experiment_type: str
    :param searchspace: A maggy Searchspace object from which samples are
        drawn.
    :type searchspace: Searchspace
    :param optimizer: The optimizer is the part generating new trials.
    :type optimizer: str, AbstractOptimizer
    :param direction: If set to 'max' the highest value returned will
        correspond to the best solution, if set to 'min' the opposite is true.
    :type direction: str
    :param num_trials: The number of trials to evaluate given the search
        space, each containing a different hyperparameter combination.
    :type num_trials: int
    :param ablation_study: Ablation study object. Can be None for optimization
        experiment type.
    :type ablation_study: AblationStudy
    :param ablator: Ablator to use for experiment type 'ablation'.
    :type ablator: str, AbstractAblator
    :param optimization_key: Name of the metric to be optimized
    :type optimization_key: str, optional
    :param hb_interval: The heartbeat interval in seconds from trial executor
        to experiment driver, defaults to 1
    :type hb_interval: int, optional
    :param es_policy: The earlystopping policy, defaults to 'median'
    :type es_policy: str, optional
    :param es_interval: Frequency interval in seconds to check currently
        running trials for early stopping, defaults to 300
    :type es_interval: int, optional
    :param es_min: Minimum number of trials finalized before checking for
        early stopping, defaults to 10
    :type es_min: int, optional
    :param description: A longer description of the experiment.
    :type description: str, optional
    :raises RuntimeError: An experiment is currently running, or
        `experiment_type` is neither 'optimization' nor 'ablation'.
    :return: A dictionary indicating the best trial and best hyperparameter
        combination with its performance metric
    :rtype: dict
    """
    global running
    global app_id
    global experiment_json
    global run_id

    if running:
        raise RuntimeError("An experiment is currently running.")

    job_start = time.time()
    sc = hopsutil._find_spark().sparkContext
    exp_driver = None

    try:
        app_id = str(sc.applicationId)

        app_id, run_id = util._validate_ml_id(app_id, run_id)

        # start run
        running = True
        experiment_utils._set_ml_id(app_id, run_id)

        # create experiment dir
        experiment_utils._create_experiment_dir(app_id, run_id)

        tensorboard._register(experiment_utils._get_logdir(app_id, run_id))

        num_executors = util.num_executors(sc)

        # start experiment driver
        if experiment_type == "optimization":

            assert num_trials > 0, "number of trials should be greater than zero"
            tensorboard._write_hparams_config(
                experiment_utils._get_logdir(app_id, run_id), searchspace
            )

            # never occupy more executors than there are trials to run
            if num_executors > num_trials:
                num_executors = num_trials

            exp_driver = experimentdriver.ExperimentDriver(
                "optimization",
                searchspace=searchspace,
                optimizer=optimizer,
                direction=direction,
                num_trials=num_trials,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                es_policy=es_policy,
                es_interval=es_interval,
                es_min=es_min,
                description=description,
                log_dir=experiment_utils._get_logdir(app_id, run_id),
            )

            exp_function = exp_driver.optimizer.name()

        elif experiment_type == "ablation":
            exp_driver = experimentdriver.ExperimentDriver(
                "ablation",
                ablation_study=ablation_study,
                ablator=ablator,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                description=description,
                log_dir=experiment_utils._get_logdir(app_id, run_id),
            )
            # using exp_driver.num_executor since
            # it has been set using ablator.get_number_of_trials()
            # in experiment.py
            if num_executors > exp_driver.num_executors:
                num_executors = exp_driver.num_executors

            exp_function = exp_driver.ablator.name()
        else:
            running = False
            raise RuntimeError(
                "Unknown experiment_type:"
                "should be either 'optimization' or 'ablation', "
                "But it is '{0}'".format(str(experiment_type))
            )

        nodeRDD = sc.parallelize(range(num_executors), num_executors)

        # Do provenance after initializing exp_driver, because exp_driver does
        # the type checks for optimizer and searchspace
        sc.setJobGroup(os.environ["ML_ID"], "{0} | {1}".format(name, exp_function))

        experiment_json = experiment_utils._populate_experiment(
            name,
            exp_function,
            "MAGGY",
            exp_driver.searchspace.json(),
            description,
            app_id,
            direction,
            optimization_key,
        )

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, "CREATE"
        )

        util._log(
            "Started Maggy Experiment: {0}, {1}, run {2}".format(name, app_id, run_id)
        )

        exp_driver.init(job_start)

        server_addr = exp_driver.server_addr

        # Force execution on executor, since GPU is located on executor
        nodeRDD.foreachPartition(
            trialexecutor._prepare_func(
                app_id,
                run_id,
                experiment_type,
                map_fun,
                server_addr,
                hb_interval,
                exp_driver._secret,
                optimization_key,
                experiment_utils._get_logdir(app_id, run_id),
            )
        )
        job_end = time.time()

        result = exp_driver.finalize(job_end)
        best_logdir = (
            experiment_utils._get_logdir(app_id, run_id) + "/" + result["best_id"]
        )

        util._finalize_experiment(
            experiment_json,
            float(result["best_val"]),
            app_id,
            run_id,
            "FINISHED",
            exp_driver.duration,
            experiment_utils._get_logdir(app_id, run_id),
            best_logdir,
            optimization_key,
        )

        util._log("Finished Experiment")

        # The only way out of this function without an exception.
        return result

    except:  # noqa: E722
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - job_start)
        )
        # Prefer the exception recorded by the driver over the one that
        # surfaced here; otherwise re-raise the current one.
        if exp_driver and exp_driver.exception:
            raise exp_driver.exception
        raise
    finally:
        # grace period to send last logs to sparkmagic
        # sparkmagic hb poll interval is 5 seconds, therefore wait 6 seconds
        time.sleep(6)
        # cleanup spark jobs
        if running and exp_driver is not None:
            exp_driver.stop()
        run_id += 1
        running = False
        sc.setJobGroup("", "")
def _wrapper_fun(iter):
    """Join the parameter-server cluster reservation and run ``map_fun``.

    Executors whose number is below ``num_ps`` register as parameter servers,
    all others as workers. Once the reservation server returns the cluster,
    the executor looks up its role, exports the cluster description as
    ``TF_CONFIG`` and calls ``map_fun``. A parameter server runs ``map_fun``
    in a separate thread and waits until all workers have finished.

    Args:
        iter: Iterator over the elements of this partition. The last element
            it yields is used as the executor number.

    Returns:
        None. On the chief, the value returned by ``map_fun`` is handed to
        ``experiment_utils._handle_return_simple``.
    """
    # The last value yielded by the partition iterator identifies the executor.
    for i in iter:
        executor_num = i

    experiment_utils._set_ml_id(app_id, run_id)

    # The monitoring thread is always created because cleanup receives it
    # either way, but it is only started when a GPU is reported.
    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t.start()

    # role stays None until the reservation completes; the finally block
    # below reads it to decide whether to report "worker finished".
    role = None

    client = parameter_server_reservation.Client(server_addr)

    try:
        host = experiment_utils._get_ip_address()

        # Binding to port 0 lets the OS pick a free port. The socket stays
        # open until the reservations are back and is closed even when the
        # reservation fails.
        tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]
            host_port = host + ":" + str(port)

            exec_spec = {
                "task_type": "ps" if executor_num < num_ps else "worker",
                "host_port": host_port,
                "gpus_present": devices.get_num_gpus() > 0,
            }

            client.register(exec_spec)
            cluster = client.await_reservations()
        finally:
            tmp_socket.close()

        role, index = experiment_utils._find_task_and_index(host_port, cluster)

        cluster_spec = {
            "cluster": cluster,
            "task": {"type": role, "index": index},
        }

        if evaluator:
            # The last worker in the list is moved to the evaluator role.
            last_worker_index = len(cluster_spec["cluster"]["worker"]) - 1
            evaluator_node = cluster_spec["cluster"]["worker"][last_worker_index]
            cluster_spec["cluster"]["evaluator"] = [evaluator_node]
            del cluster_spec["cluster"]["worker"][last_worker_index]
            if evaluator_node == host_port:
                role = "evaluator"
                cluster_spec["task"] = {"type": "evaluator", "index": 0}

        print('TF_CONFIG: {} '.format(cluster_spec))
        os.environ["TF_CONFIG"] = json.dumps(cluster_spec)

        logfile = experiment_utils._init_logger(
            experiment_utils._get_logdir(app_id, run_id),
            role=role,
            index=cluster_spec["task"]["index"])

        dist_logdir = experiment_utils._get_logdir(app_id, run_id) + '/logdir'

        # The task entry lives on cluster_spec; the raw reservation mapping
        # in `cluster` is never given a "task" key by this function.
        is_chief = (cluster_spec["task"]["type"] == "chief")
        if is_chief:
            hdfs.mkdir(dist_logdir)
            tensorboard._register(
                dist_logdir, experiment_utils._get_logdir(app_id, run_id),
                executor_num, local_logdir=local_logdir)
        else:
            tensorboard.events_logdir = dist_logdir

        print(devices._get_gpu_info())
        print('-------------------------------------------------------')
        print('Started running task')
        task_start = time.time()

        retval = None
        if role == "ps":
            # The parameter server runs in its own thread while this thread
            # waits for the workers to report that they are finished.
            ps_thread = threading.Thread(target=map_fun)
            ps_thread.start()
            client.await_all_workers_finished()
        else:
            retval = map_fun()

        # Only the chief's return value is handed on.
        if role == "chief":
            experiment_utils._handle_return_simple(
                retval, experiment_utils._get_logdir(app_id, run_id), logfile)

        task_end = time.time()
        time_str = ('Finished task - took ' +
                    experiment_utils._time_diff(task_start, task_end))
        print(time_str)
        print('-------------------------------------------------------')
    finally:
        # Each step runs even if the previous one raises, so the client is
        # always closed and the cleanup is never skipped.
        try:
            if role != "ps":
                client.register_worker_finished()
        finally:
            try:
                client.close()
            finally:
                experiment_utils._cleanup(tensorboard, t)