def _run(sc, map_fun, run_id, local_logdir=False, name="no-name", evaluator=False):
    """
    Run the experiment using ParameterServerStrategy.

    Args:
        sc: SparkContext
        map_fun: the function containing the training code
        run_id: the run id of this experiment
        local_logdir: True to keep *tensorboard.logdir()* in the local filesystem
        name: name of the experiment
        evaluator: whether to run one of the workers as an evaluator

    Returns:
        The experiment logdir and the return value of the chief, if any.
    """
    app_id = str(sc.applicationId)

    num_executions = util.num_executors()

    # Each TF task should be run on one executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    # Make the SparkUI intuitive by grouping jobs
    sc.setJobGroup(
        os.environ['ML_ID'],
        "{} | ParameterServerStrategy - Distributed Training".format(name))

    server = parameter_server_reservation.Server(num_executions)
    server_addr = server.start()

    num_ps = util.num_param_servers()

    # Force execution on the executors, since the GPUs are located there
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, local_logdir, server_addr,
                      num_ps, evaluator))

    logdir = experiment_utils._get_logdir(app_id, run_id)

    print('Finished Experiment \n')

    path_to_return = logdir + '/.outputs.json'
    if pydoop.hdfs.path.exists(path_to_return):
        # the context manager closes the file, no explicit close() needed
        with pydoop.hdfs.open(path_to_return, "r") as fi:
            contents = fi.read()
        return logdir, json.loads(contents)

    return logdir, None
def _run(sc, map_fun, run_id, args_dict, samples, direction=Direction.MAX,
         local_logdir=False, name="no-name", optimization_key=None):
    """
    Run random search over the given hyperparameter boundaries.

    Args:
        sc: SparkContext
        map_fun: the function containing the experiment code
        run_id: the run id of this experiment
        args_dict: dict mapping each hyperparameter name to [lower_bound, upper_bound]
        samples: number of random samples to evaluate
        direction: Direction.MAX or Direction.MIN
        local_logdir: True to keep *tensorboard.logdir()* in the local filesystem
        name: name of the experiment
        optimization_key: key of the metric to optimize when a dict is returned

    Returns:
        The best experiment directory, its parameters, the best metric value
        and the corresponding return dict.
    """
    app_id = str(sc.applicationId)

    arg_lists = list(args_dict.values())
    for i in range(len(arg_lists)):
        if len(arg_lists[i]) != 2:
            raise ValueError(
                'Boundary list must contain exactly two elements, '
                '[lower_bound, upper_bound] for each hyperparameter')

    hp_names = args_dict.keys()

    random_dict = {}
    for hp in hp_names:
        lower_bound = args_dict[hp][0]
        upper_bound = args_dict[hp][1]

        assert lower_bound < upper_bound, "lower bound: " + str(
            lower_bound) + " must be less than upper bound: " + str(upper_bound)

        random_values = []

        if type(lower_bound) is int and type(upper_bound) is int:
            for i in range(samples):
                random_values.append(random.randint(lower_bound, upper_bound))
        elif (type(lower_bound) is float or type(lower_bound) is int) and (
                type(upper_bound) is float or type(upper_bound) is int):
            for i in range(samples):
                random_values.append(random.uniform(lower_bound, upper_bound))
        else:
            raise ValueError('Only float and int are currently supported')

        random_dict[hp] = random_values

    random_dict, new_samples = _remove_duplicates(random_dict, samples)

    sc.setJobGroup(os.environ['ML_ID'], "{} | Random Search".format(name))

    # Each TF task should be run on one executor
    nodeRDD = sc.parallelize(range(new_samples), new_samples)

    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, random_dict, local_logdir,
                      optimization_key))

    arg_count = six.get_function_code(map_fun).co_argcount
    arg_names = six.get_function_code(map_fun).co_varnames
    exp_dir = experiment_utils._get_logdir(app_id, run_id)

    max_val, max_hp, min_val, min_hp, avg, max_return_dict, min_return_dict = \
        experiment_utils._get_best(random_dict, new_samples, arg_names,
                                   arg_count, exp_dir, optimization_key)

    param_combination = ""
    best_val = ""
    return_dict = {}

    if direction.upper() == Direction.MAX:
        param_combination = max_hp
        best_val = str(max_val)
        return_dict = max_return_dict
    elif direction.upper() == Direction.MIN:
        param_combination = min_hp
        best_val = str(min_val)
        return_dict = min_return_dict

    print('Finished Experiment \n')

    best_dir = exp_dir + '/' + param_combination

    return best_dir, experiment_utils._get_params_dict(best_dir), best_val, \
        return_dict
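# Illustrative sketch (not part of the library): how the sampling loop above
# turns a boundary dict into per-trial values. Integer bounds yield randint
# draws, float (or mixed) bounds yield uniform draws. The helper name is
# hypothetical.
import random

def _sample_boundaries(boundary_dict, samples):
    sampled = {}
    for hp, (lo, hi) in boundary_dict.items():
        if type(lo) is int and type(hi) is int:
            sampled[hp] = [random.randint(lo, hi) for _ in range(samples)]
        else:
            sampled[hp] = [random.uniform(lo, hi) for _ in range(samples)]
    return sampled

# e.g. _sample_boundaries({'learning_rate': [0.1, 0.3], 'layers': [2, 9]}, 3)
# -> {'learning_rate': [0.17..., 0.29..., 0.11...], 'layers': [4, 2, 7]}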
def mirrored(map_fun, name='no-name', local_logdir=False, description=None,
             evaluator=False):
    """
    *Distributed Training*

    Example usage:

    >>> from hops import experiment
    >>> def mirrored_training():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...MirroredStrategy()...
    >>> experiment.mirrored(mirrored_training, local_logdir=True)

    Args:
        :map_fun: contains the code where you are using MirroredStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :evaluator: whether to run one of the workers as an evaluator

    Returns:
        HDFS path in your project where the experiment is stored and return value from the process running as chief
    """
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    num_workers = util.num_executors()
    if evaluator:
        assert num_workers > 2, "number of workers must be at least 3 if evaluator is set to True"

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = experiment_utils._populate_experiment(
            name, 'mirrored', 'DISTRIBUTED_TRAINING', None, description,
            app_id, None, None)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = mirrored_impl._run(sc, map_fun, run_id,
                                                 local_logdir=local_logdir,
                                                 name=name,
                                                 evaluator=evaluator)
        duration = experiment_utils._seconds_to_milliseconds(time.time() - start)

        experiment_utils._finalize_experiment(experiment_json, None, app_id,
                                              run_id, 'FINISHED', duration,
                                              logdir, None, None)

        return logdir, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
def collective_all_reduce(map_fun, name='no-name', local_logdir=False,
                          description=None, evaluator=False):
    """
    *Distributed Training*

    Sets up the cluster to run CollectiveAllReduceStrategy.

    TF_CONFIG is exported in the background and does not need to be set by the user themselves.

    Example usage:

    >>> from hops import experiment
    >>> def distributed_training():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...CollectiveAllReduceStrategy(num_gpus_per_worker=devices.get_num_gpus())...
    >>> experiment.collective_all_reduce(distributed_training, local_logdir=True)

    Args:
        :map_fun: the function containing code to run CollectiveAllReduceStrategy
        :name: the name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :evaluator: whether to run one of the workers as an evaluator

    Returns:
        HDFS path in your project where the experiment is stored and return value from the process running as chief
    """
    num_ps = util.num_param_servers()
    num_executors = util.num_executors()

    assert num_ps == 0, "number of parameter servers should be 0"
    assert num_executors > 1, "number of workers (executors) should be greater than 1"
    if evaluator:
        assert num_executors > 2, "number of workers must be at least 3 if evaluator is set to True"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = experiment_utils._populate_experiment(
            name, 'collective_all_reduce', 'DISTRIBUTED_TRAINING', None,
            description, app_id, None, None)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = allreduce_impl._run(sc, map_fun, run_id,
                                                  local_logdir=local_logdir,
                                                  name=name,
                                                  evaluator=evaluator)
        duration = experiment_utils._seconds_to_milliseconds(time.time() - start)

        experiment_utils._finalize_experiment(experiment_json, None, app_id,
                                              run_id, 'FINISHED', duration,
                                              logdir, None, None)

        return logdir, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
def launch(map_fun, args_dict=None, name='no-name', local_logdir=False,
           description=None, metric_key=None):
    """
    *Experiment* or *Parallel Experiment*

    Run an Experiment contained in *map_fun* one time with no arguments or multiple times with different arguments if *args_dict* is specified.

    Example usage:

    >>> from hops import experiment
    >>> def train_nn():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>> experiment.launch(train_nn)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> def train_nn():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> experiment.launch(train_nn)

    Args:
        :map_fun: The function to run
        :args_dict: If specified, the function is run multiple times with different arguments: {'a':[1,2], 'b':[5,3]} would run the function twice with arguments (1,5) and (2,3), provided that the function signature contains two arguments like *def func(a,b):*
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: A longer description for the experiment
        :metric_key: If returning a dict with multiple return values, this key should match the name of the key in the dict for the metric you want to associate with the experiment

    Returns:
        HDFS path in your project where the experiment is stored and the return dict from the experiment
    """
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError(
            "An experiment is currently running. Please call experiment.end() to stop it.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = None
        if args_dict:
            experiment_json = experiment_utils._populate_experiment(
                name, 'launch', 'EXPERIMENT', json.dumps(args_dict),
                description, app_id, None, None)
        else:
            experiment_json = experiment_utils._populate_experiment(
                name, 'launch', 'EXPERIMENT', None, description, app_id,
                None, None)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = launcher._run(sc, map_fun, run_id, args_dict,
                                            local_logdir)
        duration = experiment_utils._seconds_to_milliseconds(time.time() - start)

        metric = experiment_utils._get_metric(return_dict, metric_key)

        experiment_utils._finalize_experiment(experiment_json, metric, app_id,
                                              run_id, 'FINISHED', duration,
                                              logdir, None, None)

        return logdir, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
def differential_evolution(objective_function, boundary_dict,
                           direction=Direction.MAX, generations=4,
                           population=6, mutation=0.5, crossover=0.7,
                           name='no-name', local_logdir=False,
                           description=None, optimization_key='metric'):
    """
    *Parallel Experiment*

    Run differential evolution to explore a given search space for each hyperparameter and figure out the best hyperparameter combination.
    The function is treated as a blackbox that returns a metric for some given hyperparameter combination.
    The returned metric is used to evaluate how 'good' the hyperparameter combination was.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    import tensorflow
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction=Direction.MAX)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction=Direction.MAX, optimization_key='accuracy')

    Args:
        :objective_function: the function to run, must return a metric
        :boundary_dict: a dict where each key corresponds to an argument of *objective_function* and the corresponding value should be a list of two elements. The first element being the lower bound for the parameter and the second element the upper bound.
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minimize the returned metric
        :generations: number of generations
        :population: size of population
        :mutation: mutation rate to explore more different hyperparameters
        :crossover: how fast to adapt the population to the best in each generation
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics
    """
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        diff_evo_impl.run_id = run_id

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = experiment_utils._populate_experiment(
            name, 'differential_evolution', 'PARALLEL_EXPERIMENTS',
            json.dumps(boundary_dict), description, app_id, direction,
            optimization_key)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, best_param, best_metric, return_dict = diff_evo_impl._run(
            objective_function, boundary_dict, direction=direction,
            generations=generations, population=population,
            mutation=mutation, crossover=crossover,
            cleanup_generations=False, local_logdir=local_logdir,
            name=name, optimization_key=optimization_key)

        duration = experiment_utils._seconds_to_milliseconds(time.time() - start)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_utils._get_logdir(app_id, run_id), logdir,
            optimization_key)

        return logdir, best_param, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
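# Illustrative sketch (not part of the library): the classic DE/rand/1/bin
# step that the *mutation* and *crossover* parameters above control. For each
# population member x, three distinct others a, b, c are combined into a
# mutant, which is then mixed with x gene-by-gene. The helper is hypothetical
# and not the library's internal implementation.
import random

def _de_candidate(x, a, b, c, mutation=0.5, crossover=0.7):
    # mutant vector: a + F * (b - c), with F the mutation factor
    mutant = [ai + mutation * (bi - ci) for ai, bi, ci in zip(a, b, c)]
    # binomial crossover: take each mutant gene with probability `crossover`
    return [m if random.random() < crossover else xi
            for m, xi in zip(mutant, x)]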
def _wrapper_fun(iter): """ Args: iter: Returns: """ for i in iter: executor_num = i experiment_utils._set_ml_id(app_id, run_id) t = threading.Thread(target=devices._print_periodic_gpu_utilization) if devices.get_num_gpus() > 0: t.start() is_chief = False logdir = None tb_hdfs_path = None try: host = experiment_utils._get_ip_address() tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) tmp_socket.bind(('', 0)) port = tmp_socket.getsockname()[1] client = allreduce_reservation.Client(server_addr) host_port = host + ":" + str(port) client.register({"worker": host_port, "index": executor_num}) cluster = client.await_reservations() tmp_socket.close() client.close() task_index = experiment_utils._find_index(host_port, cluster) if task_index == -1: cluster["task"] = {"type": "chief", "index": 0} else: cluster["task"] = {"type": "worker", "index": task_index} evaluator_node = None if evaluator: last_worker_index = len(cluster["cluster"]["worker"]) - 1 evaluator_node = cluster["cluster"]["worker"][ last_worker_index] cluster["cluster"]["evaluator"] = [evaluator_node] del cluster["cluster"]["worker"][last_worker_index] if evaluator_node == host_port: cluster["task"] = {"type": "evaluator", "index": 0} print('TF_CONFIG: {} '.format(cluster)) if num_executors > 1: os.environ["TF_CONFIG"] = json.dumps(cluster) is_chief = (cluster["task"]["type"] == "chief") is_evaluator = (cluster["task"]["type"] == "evaluator") if is_chief: logdir = experiment_utils._get_logdir(app_id, run_id) tb_hdfs_path, tb_pid = tensorboard._register( logdir, logdir, executor_num, local_logdir=local_logdir) elif is_evaluator: logdir = experiment_utils._get_logdir(app_id, run_id) tensorboard.events_logdir = logdir logfile = experiment_utils._init_logger( experiment_utils._get_logdir(app_id, run_id), role=cluster["task"]["type"], index=cluster["task"]["index"]) print(devices._get_gpu_info()) print('-------------------------------------------------------') print('Started running task') task_start = time.time() retval = map_fun() if is_chief: experiment_utils._handle_return_simple( retval, experiment_utils._get_logdir(app_id, run_id), logfile) task_end = time.time() time_str = 'Finished task - took ' + experiment_utils._time_diff( task_start, task_end) print(time_str) print('-------------------------------------------------------') except: raise finally: experiment_utils._cleanup(tensorboard, t)
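# Illustrative sketch (not part of the library): the shape of the TF_CONFIG
# that the wrapper above assembles for CollectiveAllReduceStrategy. With
# three executors and evaluator=True, the last worker is carved out as the
# evaluator and the reservation whose index resolves to -1 becomes chief.
# The exact cluster keys and host/port values here are assumptions.
import json
import os

example_tf_config = {
    "cluster": {
        "chief": ["10.0.0.1:43001"],
        "worker": ["10.0.0.2:43002"],
        "evaluator": ["10.0.0.3:43003"],
    },
    "task": {"type": "worker", "index": 0},
}
os.environ["TF_CONFIG"] = json.dumps(example_tf_config)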
def _wrapper_fun(iter):
    """
    Wraps the user code to run on each executor: registers with the parameter
    server reservation service, builds the cluster spec and exports TF_CONFIG.

    Args:
        iter: the partition iterator, containing this executor's number

    Returns:

    """
    for i in iter:
        executor_num = i

    experiment_utils._set_ml_id(app_id, run_id)

    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t.start()

    role = None
    logdir = None
    tb_hdfs_path = None

    client = parameter_server_reservation.Client(server_addr)

    try:
        host = experiment_utils._get_ip_address()

        tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        tmp_socket.bind(('', 0))
        port = tmp_socket.getsockname()[1]
        host_port = host + ":" + str(port)

        exec_spec = {}
        if executor_num < num_ps:
            exec_spec["task_type"] = "ps"
        else:
            exec_spec["task_type"] = "worker"
        exec_spec["host_port"] = host_port
        exec_spec["gpus_present"] = devices.get_num_gpus() > 0

        client.register(exec_spec)

        cluster = client.await_reservations()

        tmp_socket.close()

        role, index = experiment_utils._find_task_and_index(host_port, cluster)

        cluster_spec = {}
        cluster_spec["cluster"] = cluster
        cluster_spec["task"] = {"type": role, "index": index}

        evaluator_node = None
        if evaluator:
            last_worker_index = len(cluster_spec["cluster"]["worker"]) - 1
            evaluator_node = cluster_spec["cluster"]["worker"][last_worker_index]
            cluster_spec["cluster"]["evaluator"] = [evaluator_node]
            del cluster_spec["cluster"]["worker"][last_worker_index]
            if evaluator_node == host_port:
                role = "evaluator"
                cluster_spec["task"] = {"type": "evaluator", "index": 0}

        print('TF_CONFIG: {} '.format(cluster_spec))
        os.environ["TF_CONFIG"] = json.dumps(cluster_spec)

        logfile = experiment_utils._init_logger(
            experiment_utils._get_logdir(app_id, run_id),
            role=role,
            index=cluster_spec["task"]["index"])

        dist_logdir = experiment_utils._get_logdir(app_id, run_id) + '/logdir'

        is_chief = (cluster_spec["task"]["type"] == "chief")
        if is_chief:
            hdfs.mkdir(dist_logdir)
            tensorboard._register(dist_logdir,
                                  experiment_utils._get_logdir(app_id, run_id),
                                  executor_num,
                                  local_logdir=local_logdir)
        else:
            tensorboard.events_logdir = dist_logdir

        print(devices._get_gpu_info())
        print('-------------------------------------------------------')
        print('Started running task')
        task_start = time.time()

        retval = None
        if role == "ps":
            ps_thread = threading.Thread(target=lambda: map_fun())
            ps_thread.start()
            client.await_all_workers_finished()
        else:
            retval = map_fun()

        if role == "chief":
            experiment_utils._handle_return_simple(
                retval, experiment_utils._get_logdir(app_id, run_id), logfile)

        task_end = time.time()
        time_str = 'Finished task - took ' + experiment_utils._time_diff(
            task_start, task_end)
        print(time_str)
        print('-------------------------------------------------------')
    except:
        raise
    finally:
        if role != "ps":
            client.register_worker_finished()
        client.close()
        experiment_utils._cleanup(tensorboard, t)
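# Illustrative sketch (not part of the library): the shape of the cluster
# spec assembled above for ParameterServerStrategy. Executors with
# executor_num < num_ps register as parameter servers, the rest as
# chief/workers. Host/port values and exact cluster keys are assumptions.
import json
import os

example_cluster_spec = {
    "cluster": {
        "chief": ["10.0.0.1:44001"],
        "worker": ["10.0.0.2:44002", "10.0.0.3:44003"],
        "ps": ["10.0.0.4:44004", "10.0.0.5:44005"],
    },
    "task": {"type": "ps", "index": 0},
}
os.environ["TF_CONFIG"] = json.dumps(example_cluster_spec)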
def log_searchspace(self, app_id, run_id, searchspace): tensorboard._write_hparams_config( experiment_utils._get_logdir(app_id, run_id), searchspace )
def init_ml_tracking(self, app_id, run_id): tensorboard._register(experiment_utils._get_logdir(app_id, run_id))
def _run(sc, map_fun, run_id, args_dict, direction=Direction.MAX,
         local_logdir=False, name="no-name", optimization_key=None):
    """
    Run the wrapper function with each hyperparameter combination as specified by the dictionary

    Args:
        sc: SparkContext
        map_fun: the function containing the experiment code
        run_id: the run id of this experiment
        args_dict: dict mapping each argument name to a list of values to test
        direction: Direction.MAX or Direction.MIN
        local_logdir: True to keep *tensorboard.logdir()* in the local filesystem
        name: name of the experiment
        optimization_key: key of the metric to optimize when a dict is returned

    Returns:
        The best experiment directory, its parameters, the best metric value
        and the corresponding return dict.
    """
    app_id = str(sc.applicationId)
    num_executions = 1

    if direction.upper() != Direction.MAX and direction.upper() != Direction.MIN:
        raise ValueError('Invalid direction ' + direction +
                         ', must be Direction.MAX or Direction.MIN')

    arg_lists = list(args_dict.values())
    currentLen = len(arg_lists[0])
    for i in range(len(arg_lists)):
        if currentLen != len(arg_lists[i]):
            raise ValueError('Length of each function argument list must be equal')
        num_executions = len(arg_lists[i])

    # Each TF task should be run on one executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    # Make the SparkUI intuitive by grouping jobs
    sc.setJobGroup(os.environ['ML_ID'], "{} | Grid Search".format(name))

    # Force execution on the executors, since the GPUs are located there
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, args_dict, local_logdir,
                      optimization_key))

    arg_count = six.get_function_code(map_fun).co_argcount
    arg_names = six.get_function_code(map_fun).co_varnames
    exp_dir = experiment_utils._get_logdir(app_id, run_id)

    max_val, max_hp, min_val, min_hp, avg, max_return_dict, min_return_dict = \
        experiment_utils._get_best(args_dict, num_executions, arg_names,
                                   arg_count, exp_dir, optimization_key)

    param_combination = ""
    best_val = ""
    return_dict = {}

    if direction.upper() == Direction.MAX:
        param_combination = max_hp
        best_val = str(max_val)
        return_dict = max_return_dict
    elif direction.upper() == Direction.MIN:
        param_combination = min_hp
        best_val = str(min_val)
        return_dict = min_return_dict

    print('Finished Experiment \n')

    best_dir = exp_dir + '/' + param_combination

    return best_dir, experiment_utils._get_params_dict(best_dir), best_val, \
        return_dict
def lagom( map_fun, name="no-name", experiment_type="optimization", searchspace=None, optimizer=None, direction="max", num_trials=1, ablation_study=None, ablator=None, optimization_key="metric", hb_interval=1, es_policy="median", es_interval=300, es_min=10, description="", ): """Launches a maggy experiment, which depending on `experiment_type` can either be a hyperparameter optimization or an ablation study experiment. Given a search space, objective and a model training procedure `map_fun` (black-box function), an experiment is the whole process of finding the best hyperparameter combination in the search space, optimizing the black-box function. Currently maggy supports random search and a median stopping rule. **lagom** is a Swedish word meaning "just the right amount". :param map_fun: User defined experiment containing the model training. :type map_fun: function :param name: A user defined experiment identifier. :type name: str :param experiment_type: Type of Maggy experiment, either 'optimization' (default) or 'ablation'. :type experiment_type: str :param searchspace: A maggy Searchspace object from which samples are drawn. :type searchspace: Searchspace :param optimizer: The optimizer is the part generating new trials. :type optimizer: str, AbstractOptimizer :param direction: If set to ‘max’ the highest value returned will correspond to the best solution, if set to ‘min’ the opposite is true. :type direction: str :param num_trials: the number of trials to evaluate given the search space, each containing a different hyperparameter combination :type num_trials: int :param ablation_study: Ablation study object. Can be None for optimization experiment type. :type ablation_study: AblationStudy :param ablator: Ablator to use for experiment type 'ablation'. :type ablator: str, AbstractAblator :param optimization_key: Name of the metric to be optimized :type optimization_key: str, optional :param hb_interval: The heartbeat interval in seconds from trial executor to experiment driver, defaults to 1 :type hb_interval: int, optional :param es_policy: The earlystopping policy, defaults to 'median' :type es_policy: str, optional :param es_interval: Frequency interval in seconds to check currently running trials for early stopping, defaults to 300 :type es_interval: int, optional :param es_min: Minimum number of trials finalized before checking for early stopping, defaults to 10 :type es_min: int, optional :param description: A longer description of the experiment. :type description: str, optional :raises RuntimeError: An experiment is currently running. 
    :return: A dictionary indicating the best trial and best hyperparameter
        combination with its performance metric
    :rtype: dict
    """
    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    job_start = time.time()
    sc = hopsutil._find_spark().sparkContext
    exp_driver = None

    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)
        app_id, run_id = util._validate_ml_id(app_id, run_id)

        # start run
        running = True
        experiment_utils._set_ml_id(app_id, run_id)

        # create experiment dir
        experiment_utils._create_experiment_dir(app_id, run_id)

        tensorboard._register(experiment_utils._get_logdir(app_id, run_id))

        num_executors = util.num_executors(sc)

        # start experiment driver
        if experiment_type == "optimization":

            assert num_trials > 0, "number of trials should be greater than zero"
            tensorboard._write_hparams_config(
                experiment_utils._get_logdir(app_id, run_id), searchspace
            )

            if num_executors > num_trials:
                num_executors = num_trials

            exp_driver = experimentdriver.ExperimentDriver(
                "optimization",
                searchspace=searchspace,
                optimizer=optimizer,
                direction=direction,
                num_trials=num_trials,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                es_policy=es_policy,
                es_interval=es_interval,
                es_min=es_min,
                description=description,
                log_dir=experiment_utils._get_logdir(app_id, run_id),
            )

            exp_function = exp_driver.optimizer.name()

        elif experiment_type == "ablation":
            exp_driver = experimentdriver.ExperimentDriver(
                "ablation",
                ablation_study=ablation_study,
                ablator=ablator,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                description=description,
                log_dir=experiment_utils._get_logdir(app_id, run_id),
            )
            # using exp_driver.num_executors since
            # it has been set using ablator.get_number_of_trials()
            # in experiment.py
            if num_executors > exp_driver.num_executors:
                num_executors = exp_driver.num_executors

            exp_function = exp_driver.ablator.name()
        else:
            running = False
            raise RuntimeError(
                "Unknown experiment_type: "
                "should be either 'optimization' or 'ablation', "
                "but it is '{0}'".format(str(experiment_type))
            )

        nodeRDD = sc.parallelize(range(num_executors), num_executors)

        # Do provenance after initializing exp_driver, because exp_driver does
        # the type checks for optimizer and searchspace
        sc.setJobGroup(os.environ["ML_ID"], "{0} | {1}".format(name, exp_function))

        experiment_json = experiment_utils._populate_experiment(
            name,
            exp_function,
            "MAGGY",
            exp_driver.searchspace.json(),
            description,
            app_id,
            direction,
            optimization_key,
        )

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, "CREATE"
        )

        util._log(
            "Started Maggy Experiment: {0}, {1}, run {2}".format(name, app_id, run_id)
        )

        exp_driver.init(job_start)

        server_addr = exp_driver.server_addr

        # Force execution on executor, since GPU is located on executor
        nodeRDD.foreachPartition(
            trialexecutor._prepare_func(
                app_id,
                run_id,
                experiment_type,
                map_fun,
                server_addr,
                hb_interval,
                exp_driver._secret,
                optimization_key,
                experiment_utils._get_logdir(app_id, run_id),
            )
        )
        job_end = time.time()

        result = exp_driver.finalize(job_end)
        best_logdir = (
            experiment_utils._get_logdir(app_id, run_id) + "/" + result["best_id"]
        )

        util._finalize_experiment(
            experiment_json,
            float(result["best_val"]),
            app_id,
            run_id,
            "FINISHED",
            exp_driver.duration,
            experiment_utils._get_logdir(app_id, run_id),
            best_logdir,
            optimization_key,
        )

        util._log("Finished Experiment")
    except:  # noqa: E722
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - job_start)
        )
        if exp_driver:
            if exp_driver.exception:
                raise exp_driver.exception
        raise
    finally:
        # grace period to send last logs to sparkmagic
        # sparkmagic hb poll intervall is 5 seconds, therefore wait 6 seconds
        time.sleep(6)
        # cleanup spark jobs
        if running and exp_driver is not None:
            exp_driver.stop()
        run_id += 1
        running = False
        sc.setJobGroup("", "")

    # return outside the finally block so exceptions are not swallowed;
    # this is only reached on the success path, where result is defined
    return result
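# Illustrative usage sketch (not part of the library), assuming the standard
# maggy API: a small hyperparameter optimization run with lagom. The training
# function body and its metric are placeholders.
from maggy import experiment, Searchspace

sp = Searchspace(kernel=('INTEGER', [2, 8]), pool=('INTEGER', [2, 8]))

def train_fn(kernel, pool, reporter):
    # train and evaluate a model here; the returned value is the metric
    # that the optimizer maximizes (direction='max')
    reporter.log('evaluating kernel={}, pool={}'.format(kernel, pool))
    return float(kernel + pool)  # placeholder metric

result = experiment.lagom(train_fn, name='demo', searchspace=sp,
                          optimizer='randomsearch', direction='max',
                          num_trials=4)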
def _run(sc, map_fun, run_id, args_dict=None, local_logdir=False, name="no-name"):
    """
    Run the experiment once, or once per provided argument combination.

    Args:
        sc: SparkContext
        map_fun: the function containing the experiment code
        run_id: the run id of this experiment
        args_dict: optional dict mapping argument names to lists of values
        local_logdir: True to keep *tensorboard.logdir()* in the local filesystem
        name: name of the experiment

    Returns:
        The experiment logdir and, for a single execution, the return dict
        loaded from .outputs.json if it exists.
    """
    app_id = str(sc.applicationId)

    if args_dict is None:
        num_executions = 1
    else:
        arg_lists = list(args_dict.values())
        currentLen = len(arg_lists[0])
        for i in range(len(arg_lists)):
            if currentLen != len(arg_lists[i]):
                raise ValueError('Length of each function argument list must be equal')
            num_executions = len(arg_lists[i])

    sc.setJobGroup(os.environ['ML_ID'],
                   "{} | Launcher running experiment".format(name))

    # Each TF task should be run on one executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    # Force execution on the executors, since the GPUs are located there
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))

    print('Finished Experiment \n')

    # For a single run, return the contents of .outputs.json if it exists
    if args_dict is None:
        path_to_return = experiment_utils._get_logdir(
            app_id, run_id) + '/.outputs.json'
        if hdfs.exists(path_to_return):
            return_json = hdfs.load(path_to_return)
            return_dict = json.loads(return_json)
            return experiment_utils._get_logdir(app_id, run_id), return_dict
        else:
            return experiment_utils._get_logdir(app_id, run_id), None
    elif num_executions == 1:
        arg_count = six.get_function_code(map_fun).co_argcount
        arg_names = six.get_function_code(map_fun).co_varnames
        argIndex = 0
        param_string = ''
        while arg_count > 0:
            param_name = arg_names[argIndex]
            param_val = args_dict[param_name][0]
            param_string += str(param_name) + '=' + str(param_val) + '&'
            arg_count -= 1
            argIndex += 1
        param_string = param_string[:-1]
        path_to_return = experiment_utils._get_logdir(
            app_id, run_id) + '/' + param_string + '/.outputs.json'
        if hdfs.exists(path_to_return):
            return_json = hdfs.load(path_to_return)
            return_dict = json.loads(return_json)
            return experiment_utils._get_logdir(app_id, run_id), return_dict
        else:
            return experiment_utils._get_logdir(app_id, run_id), None
    else:
        return experiment_utils._get_logdir(app_id, run_id), None
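# Illustrative sketch (not part of the library): how the launcher above maps a
# single-run args_dict onto the experiment subdirectory that holds
# .outputs.json. This mirrors the param_string loop; the helper name is
# hypothetical.
def _param_string(args_dict, arg_names, arg_count):
    parts = []
    for i in range(arg_count):
        name = arg_names[i]
        parts.append(str(name) + '=' + str(args_dict[name][0]))
    return '&'.join(parts)

# e.g. _param_string({'learning_rate': [0.1], 'dropout': [0.5]},
#                    ('learning_rate', 'dropout'), 2)
# -> 'learning_rate=0.1&dropout=0.5'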
def _wrapper_fun(iter): """ Args: iter: Returns: """ for i in iter: executor_num = i experiment_utils._set_ml_id(app_id, run_id) tb_hdfs_path = '' hdfs_exec_logdir = experiment_utils._get_logdir(app_id, run_id) t = threading.Thread(target=devices._print_periodic_gpu_utilization) if devices.get_num_gpus() > 0: t.start() try: #Arguments if args_dict: param_string, params, args = experiment_utils.build_parameters( map_fun, executor_num, args_dict) hdfs_exec_logdir, hdfs_appid_logdir = experiment_utils._create_experiment_subdirectories( app_id, run_id, param_string, 'grid_search', params=params) logfile = experiment_utils._init_logger(hdfs_exec_logdir) tb_hdfs_path, tb_pid = tensorboard._register( hdfs_exec_logdir, hdfs_appid_logdir, executor_num, local_logdir=local_logdir) print(devices._get_gpu_info()) print( '-------------------------------------------------------') print('Started running task ' + param_string) task_start = time.time() retval = map_fun(*args) task_end = time.time() experiment_utils._handle_return_simple(retval, hdfs_exec_logdir, logfile) time_str = 'Finished task ' + param_string + ' - took ' + experiment_utils._time_diff( task_start, task_end) print(time_str) print( '-------------------------------------------------------') else: tb_hdfs_path, tb_pid = tensorboard._register( hdfs_exec_logdir, hdfs_exec_logdir, executor_num, local_logdir=local_logdir) logfile = experiment_utils._init_logger(hdfs_exec_logdir) print(devices._get_gpu_info()) print( '-------------------------------------------------------') print('Started running task') task_start = time.time() retval = map_fun() task_end = time.time() experiment_utils._handle_return_simple(retval, hdfs_exec_logdir, logfile) time_str = 'Finished task - took ' + experiment_utils._time_diff( task_start, task_end) print(time_str) print( '-------------------------------------------------------') except: raise finally: experiment_utils._cleanup(tensorboard, t)
def random_search(train_fn, boundary_dict, direction=Direction.MAX, samples=10,
                  name='no-name', local_logdir=False, description=None,
                  optimization_key='metric'):
    """
    *Parallel Experiment*

    Run an Experiment contained in *train_fn* for a configured number of random samples controlled by the *samples* parameter. Each hyperparameter is contained in *boundary_dict* with the key corresponding to the name of the hyperparameter and a list containing two elements defining the lower and upper bound.
    The experiment must return a metric corresponding to how 'good' the given hyperparameter combination is.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.random_search(train_nn, boundary_dict, direction=Direction.MAX)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.random_search(train_nn, boundary_dict, direction=Direction.MAX, optimization_key='accuracy')

    Args:
        :train_fn: The function to run
        :boundary_dict: dict containing hyperparameter name and corresponding boundaries, each experiment randomizes a value in the boundary range.
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minimize the returned metric
        :samples: the number of random samples to evaluate for each hyperparameter given the boundaries, for example samples=3 would result in 3 hyperparameter combinations in total to evaluate
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: A longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics
    """
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        experiment_json = experiment_utils._populate_experiment(
            name, 'random_search', 'PARALLEL_EXPERIMENTS',
            json.dumps(boundary_dict), description, app_id, direction,
            optimization_key)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, best_param, best_metric, return_dict = r_search_impl._run(
            sc, train_fn, run_id, boundary_dict, samples,
            direction=direction, local_logdir=local_logdir,
            optimization_key=optimization_key)

        duration = experiment_utils._seconds_to_milliseconds(time.time() - start)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_utils._get_logdir(app_id, run_id), logdir,
            optimization_key)

        return logdir, best_param, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
def grid_search(train_fn, grid_dict, direction=Direction.MAX, name='no-name',
                local_logdir=False, description=None,
                optimization_key='metric'):
    """
    *Parallel Experiment*

    Run grid search to explore a predefined set of hyperparameter combinations.
    The function is treated as a blackbox that returns a metric for some given hyperparameter combination.
    The returned metric is used to evaluate how 'good' the hyperparameter combination was.

    Example usage:

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.grid_search(train_nn, grid_dict, direction=Direction.MAX)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1,0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.grid_search(train_nn, grid_dict, direction=Direction.MAX, optimization_key='accuracy')

    Args:
        :train_fn: the function to run, must return a metric
        :grid_dict: a dict with a key for each argument with a corresponding value being a list containing the hyperparameters to test, internally all possible combinations will be generated and run as separate Experiments
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minimize the returned metric
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics
    """
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        experiment_json = experiment_utils._populate_experiment(
            name, 'grid_search', 'PARALLEL_EXPERIMENTS', json.dumps(grid_dict),
            description, app_id, direction, optimization_key)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        grid_params = experiment_utils.grid_params(grid_dict)

        logdir, best_param, best_metric, return_dict = grid_search_impl._run(
            sc, train_fn, run_id, grid_params, direction=direction,
            local_logdir=local_logdir, name=name,
            optimization_key=optimization_key)

        duration = experiment_utils._seconds_to_milliseconds(time.time() - start)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_utils._get_logdir(app_id, run_id), logdir,
            optimization_key)

        return logdir, best_param, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
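# Illustrative sketch (not part of the library): what "all possible
# combinations will be generated" means for grid search. A hypothetical
# stand-in for experiment_utils.grid_params built on itertools.product.
import itertools

def _expand_grid(grid_dict):
    names = list(grid_dict.keys())
    combos = list(itertools.product(*[grid_dict[n] for n in names]))
    # one column per hyperparameter, one entry per combination
    return {n: [c[i] for c in combos] for i, n in enumerate(names)}

# e.g. _expand_grid({'learning_rate': [0.1, 0.3], 'layers': [2, 9]})
# -> {'learning_rate': [0.1, 0.1, 0.3, 0.3], 'layers': [2, 9, 2, 9]}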
def get_logdir(self, app_id, run_id): return experiment_utils._get_logdir(app_id, run_id)