def _launch(sc, map_fun, args_dict=None, local_logdir=False, name="no-name"):
    """Run *map_fun* on Spark executors, once per hyperparameter combination.

    Args:
        :sc: SparkContext used to schedule the work.
        :map_fun: the wrapper/TensorFlow function to run on each executor.
        :args_dict: (optional) dict mapping argument names to equal-length
            value lists; one execution per list index. ``None`` means a
            single execution with no arguments.
        :local_logdir: whether logs are written to the executor-local logdir.
        :name: experiment name used for the Spark job group.

    Returns:
        (metric, hdfs_run_dir) — ``metric`` is the float read from the run's
        ``metric`` file when ``args_dict`` is ``None`` and the file exists,
        otherwise ``None``.

    Raises:
        ValueError: if the argument value lists differ in length.
    """
    global run_id
    app_id = str(sc.applicationId)

    if args_dict is None:
        num_executions = 1
    else:
        arg_lists = list(args_dict.values())
        expected_len = len(arg_lists[0])
        # Every argument list must line up index-by-index.
        for arg_list in arg_lists:
            if len(arg_list) != expected_len:
                raise ValueError('Length of each function argument list must be equal')
        num_executions = expected_len

    sc.setJobGroup("Launcher", "{} | Running experiment".format(name))
    # Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    # Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))

    print('Finished Experiment \n')

    run_dir = hopshdfs._get_experiments_dir() + '/' + app_id + '/launcher/run.' + str(run_id)
    if args_dict is None:
        path_to_metric = _get_logdir(app_id) + '/metric'
        if pydoop.hdfs.path.exists(path_to_metric):
            # 'with' closes the file on exit; no explicit close needed.
            with pydoop.hdfs.open(path_to_metric, "r") as fi:
                metric = float(fi.read())
            return metric, run_dir
    return None, run_dir
def visualize(hdfs_root_logdir):
    """Visualize all TensorBoard events for a given path in HopsFS.

    This is intended for use after running TensorFlow jobs to visualize them
    all in the same TensorBoard. tflauncher.launch returns the path in HopsFS
    which should be handed as argument for this method to visualize all runs.

    Args:
        :hdfs_root_logdir: the path in HopsFS to enter as the logdir for TensorBoard
    """
    sc = util._find_spark().sparkContext
    app_id = str(sc.applicationId)

    pypath = os.getenv("PYSPARK_PYTHON")

    # Always start from a clean local events directory.
    logdir = os.getcwd() + '/tensorboard_events/'
    if os.path.exists(logdir):
        shutil.rmtree(logdir)
    os.makedirs(logdir)

    # Find a free port by binding to port 0 and reading back the OS-assigned one.
    tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tb_socket.bind(('', 0))
    tb_addr, tb_port = tb_socket.getsockname()
    tb_path = util._find_tensorboard()
    tb_socket.close()

    tb_env = os.environ.copy()
    tb_env['CUDA_VISIBLE_DEVICES'] = ''  # TensorBoard itself needs no GPU
    tb_env['LC_ALL'] = 'C'

    tb_proc = subprocess.Popen(
        [pypath, tb_path,
         "--logdir=%s" % logdir,
         "--port=%d" % tb_port,
         "--host=%s" % "0.0.0.0"],
        env=tb_env,
        preexec_fn=util._on_executor_exit('SIGTERM'))

    host = socket.gethostname()
    tb_url = "http://{0}:{1}".format(host, tb_port)
    tb_endpoint = hopshdfs._get_experiments_dir(
    ) + "/" + app_id + "/TensorBoard.visualize"
    # dump tb host:port to hdfs
    pydoop.hdfs.dump(tb_url, tb_endpoint, user=hopshdfs.project_user())

    # Pull every non-.log entry under the root logdir down to the local dir.
    handle = hopshdfs.get()
    hdfs_logdir_entries = handle.list_directory(hdfs_root_logdir)
    for entry in hdfs_logdir_entries:
        file_name, extension = splitext(entry['name'])
        if extension != '.log':
            pydoop.hdfs.get(entry['name'], logdir)

    # Block until TensorBoard exits, then surface its output.
    tb_proc.wait()
    stdout, stderr = tb_proc.communicate()
    print(stdout)
    print(stderr)
def _evolutionary_launch(spark_session, map_fun, args_dict, name="no-name"):
    """Run the wrapper function with each hyperparameter combination as specified by the dictionary

    Args:
        :spark_session: SparkSession object
        :map_fun: The TensorFlow function to run
        :args_dict: (optional) A dictionary containing hyperparameter values to insert as arguments for each TensorFlow job
    """
    global generation_id
    global run_id

    spark_context = spark_session.sparkContext
    app_id = str(spark_context.applicationId)

    # One execution per hyperparameter combination; each runs on 1 executor.
    num_executions = len(list(args_dict.values())[0])
    node_rdd = spark_context.parallelize(range(num_executions), num_executions)

    # Make SparkUI intuitive by grouping jobs
    spark_context.setJobGroup(
        "Differential Evolution ",
        "{} | Hyperparameter Optimization, generation: {}".format(
            name, generation_id))

    # Force execution on executor, since GPU is located on executor
    node_rdd.foreachPartition(
        _prepare_func(app_id, generation_id, map_fun, args_dict, run_id))
    generation_id += 1

    return hopshdfs._get_experiments_dir() + '/' + app_id + "/"
def _get_logdir(app_id):
    """Return the HDFS log directory for the current run of this module.

    Args:
        :app_id: the Spark application id

    Returns:
        Path of the form ``<experiments_dir>/<app_id>/begin/run.<run_id>``.
    """
    global run_id
    return "{}/{}/begin/run.{}".format(
        hopshdfs._get_experiments_dir(), app_id, run_id)
def _get_logdir(app_id):
    """Return the HDFS log directory for the current collective-all-reduce run.

    Args:
        :app_id: the Spark application id

    Returns:
        Path of the form ``<experiments_dir>/<app_id>/collective_all_reduce/run.<run_id>``.
    """
    global run_id
    return "{}/{}/collective_all_reduce/run.{}".format(
        hopshdfs._get_experiments_dir(), app_id, run_id)
def _get_logdir(app_id):
    """Return the HDFS log directory for the current differential-evolution run.

    Args:
        :app_id: the Spark application id

    Returns:
        Path of the form ``<experiments_dir>/<app_id>/differential_evolution/run.<run_id>``.
    """
    global run_id
    return "{}/{}/differential_evolution/run.{}".format(
        hopshdfs._get_experiments_dir(), app_id, run_id)
def _launch(sc, map_fun, args_dict=None, local_logdir=False, name="no-name"):
    """Run *map_fun* once on a single executor under MirroredStrategy.

    Args:
        :sc: SparkContext used to schedule the work.
        :map_fun: the wrapper/TensorFlow function to run.
        :args_dict: (optional) arguments forwarded to the wrapper function.
        :local_logdir: whether logs are written to the executor-local logdir.
        :name: experiment name used for the Spark job group.

    Returns:
        (metric, hdfs_run_dir) — ``metric`` is the float read from the run's
        ``metric`` file, or ``None`` when that file does not exist.
    """
    global run_id
    app_id = str(sc.applicationId)

    # MirroredStrategy uses the multiple devices of a single executor.
    num_executions = 1

    sc.setJobGroup("MirroredStrategy", "{} | Running on multiple devices".format(name))
    # Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    # Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))

    print('Finished Experiment \n')

    run_dir = hopshdfs._get_experiments_dir() + '/' + app_id + '/mirrored/run.' + str(run_id)
    path_to_metric = _get_logdir(app_id) + '/metric'
    if pydoop.hdfs.path.exists(path_to_metric):
        # 'with' closes the file on exit; no explicit close needed.
        with pydoop.hdfs.open(path_to_metric, "r") as fi:
            metric = float(fi.read())
        return metric, run_dir
    return None, run_dir
def _get_metric(param_string, app_id, generation_id, run_id):
    """Look up the metric produced for a parameter combination in any earlier generation.

    Args:
        :param_string: serialized hyperparameter combination (used as a directory name).
        :app_id: the Spark application id.
        :generation_id: number of generations to scan (exclusive upper bound).
        :run_id: id of the differential-evolution run.

    Returns:
        The metric as a float, or ``None`` if no earlier generation produced one.
    """
    handle = hopshdfs.get()
    # Scan generations oldest-first: the combination may already have been evaluated.
    for generation in range(generation_id):
        possible_result_path = hopshdfs._get_experiments_dir() + '/' + app_id + \
            '/differential_evolution/run.' + str(run_id) + \
            '/generation.' + str(generation) + '/' + param_string + '/metric'
        if handle.exists(possible_result_path):
            # 'with' closes the file on exit; no explicit close needed.
            with pydoop.hdfs.open(possible_result_path, "r") as fi:
                metric = float(fi.read())
            return metric
    return None
def _grid_launch(sc, map_fun, args_dict, direction='max', local_logdir=False, name="no-name"):
    """Run the wrapper function with each hyperparameter combination as specified by the dictionary

    Args:
        :sc: SparkContext used to schedule the work.
        :map_fun: the wrapper/TensorFlow function to run for every combination.
        :args_dict: dict mapping argument names to equal-length value lists.
        :direction: 'max' or 'min' — which metric extreme counts as best.
        :local_logdir: whether logs are written to the executor-local logdir.
        :name: experiment name used for the Spark job group.

    Returns:
        (hdfs_runid_dir, best_param_combination, best_metric_as_str)

    Raises:
        ValueError: if *direction* is invalid or argument lists differ in length.
    """
    global run_id
    app_id = str(sc.applicationId)

    if direction not in ('max', 'min'):
        raise ValueError('Invalid direction ' + direction + ', must be max or min')

    arg_lists = list(args_dict.values())
    expected_len = len(arg_lists[0])
    # Every argument list must line up index-by-index.
    for arg_list in arg_lists:
        if len(arg_list) != expected_len:
            raise ValueError('Length of each function argument list must be equal')
    num_executions = expected_len

    # Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    # Make SparkUI intuitive by grouping jobs
    sc.setJobGroup("Grid Search", "{} | Hyperparameter Optimization".format(name))

    # Force execution on executor, since GPU is located on executor
    job_start = datetime.datetime.now()
    nodeRDD.foreachPartition(_prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))
    job_end = datetime.datetime.now()

    job_time_str = util._time_diff(job_start, job_end)

    arg_count = six.get_function_code(map_fun).co_argcount
    arg_names = six.get_function_code(map_fun).co_varnames
    hdfs_appid_dir = hopshdfs._get_experiments_dir() + '/' + app_id
    hdfs_runid_dir = _get_logdir(app_id)

    max_val, max_hp, min_val, min_hp, avg = _get_best(
        args_dict, num_executions, arg_names, arg_count, hdfs_appid_dir, run_id)

    # Direction only swaps which extreme is "best" — format the report once.
    if direction == 'max':
        best_hp, best_metric, worst_hp, worst_metric = max_hp, max_val, min_hp, min_val
    else:
        best_hp, best_metric, worst_hp, worst_metric = min_hp, min_val, max_hp, max_val
    param_combination = best_hp
    best_val = str(best_metric)

    results = '\n------ Grid Search results ------ direction(' + direction + ') \n' \
        'BEST combination ' + best_hp + ' -- metric ' + str(best_metric) + '\n' \
        'WORST combination ' + worst_hp + ' -- metric ' + str(worst_metric) + '\n' \
        'AVERAGE metric -- ' + str(avg) + '\n' \
        'Total job time ' + job_time_str + '\n'
    _write_result(hdfs_runid_dir, results)
    print(results)

    print('Finished Experiment \n')

    return hdfs_runid_dir, param_combination, best_val
def _search(spark, function, search_dict, direction='max', generations=10, popsize=10,
            mutation=0.5, crossover=0.7, cleanup_generations=False, local_logdir=False,
            name="no-name"):
    """Set up module-level state and run a differential-evolution hyperparameter search.

    Args:
        :spark: SparkSession object.
        :function: objective function whose arguments are the hyperparameters.
        :search_dict: dict mapping argument names to [lower_bound, upper_bound]
            (floats/ints) or [category1, category2] (strings).
        :direction: 'max' or 'min'.
        :generations: number of generations to evolve.
        :popsize: population size per generation.
        :mutation: mutation factor.
        :crossover: crossover probability.
        :cleanup_generations: whether generation directories are cleaned up.
        :local_logdir: whether logs are written to the executor-local logdir.
        :name: experiment name.

    Returns:
        (root_dir, best_param, best_metric)

    Raises:
        ValueError: if any boundary list does not have exactly two elements.
    """
    global run_id
    global local_logdir_bool
    local_logdir_bool = local_logdir

    global spark_session
    spark_session = spark

    global objective_function
    objective_function = function

    global cleanup
    cleanup = cleanup_generations

    argcount = six.get_function_code(function).co_argcount
    arg_names = six.get_function_code(function).co_varnames

    app_id = spark.sparkContext.applicationId

    for bounds in list(search_dict.values()):
        if len(bounds) != 2:
            raise ValueError(
                'Boundary list must contain exactly two elements, [lower_bound, upper_bound] for float/int '
                'or [category1, category2] in the case of strings')

    # Pair names with bounds in the objective function's argument order.
    ordered_arr = [(arg_names[i], search_dict[arg_names[i]]) for i in range(argcount)]
    ordered_dict = OrderedDict(ordered_arr)

    bounds_list = []
    types_list = []
    for entry in ordered_dict:
        bounds_list.append((ordered_dict[entry][0], ordered_dict[entry][1]))
        # Type of the lower bound decides the hyperparameter's type.
        if isinstance(ordered_dict[entry][0], int):
            types_list.append('int')
        elif isinstance(ordered_dict[entry][0], float):
            types_list.append('float')
        else:
            types_list.append('cat')

    global diff_evo
    diff_evo = DifferentialEvolution(_execute_all, bounds_list, types_list, ordered_dict,
                                     direction=direction, generations=generations,
                                     popsize=popsize, crossover=crossover,
                                     mutation=mutation, name=name)

    root_dir = hopshdfs._get_experiments_dir() + "/" + str(
        app_id) + "/differential_evolution/run." + str(run_id)
    best_param, best_metric = diff_evo._solve(root_dir)

    print('Finished Experiment \n')

    return str(root_dir), best_param, best_metric
def _launch(sc, map_fun, args_dict, samples, direction='max', local_logdir=False, name="no-name"):
    """Run a random hyperparameter search and report the best combination.

    Args:
        :sc: SparkContext used to schedule the work.
        :map_fun: the wrapper/TensorFlow function to run for every sample.
        :args_dict: dict mapping argument names to [lower_bound, upper_bound].
        :samples: number of random combinations to draw.
        :direction: 'max' or 'min' — which metric extreme counts as best.
        :local_logdir: whether logs are written to the executor-local logdir.
        :name: experiment name used for the Spark job group.

    Returns:
        (hdfs_runid_dir, best_param_combination, best_metric_as_str)

    Raises:
        ValueError: if bounds are malformed, inverted, or of unsupported type.
    """
    global run_id
    app_id = str(sc.applicationId)

    for bounds in list(args_dict.values()):
        if len(bounds) != 2:
            raise ValueError(
                'Boundary list must contain exactly two elements, [lower_bound, upper_bound] for each hyperparameter')

    random_dict = {}
    for hp in args_dict.keys():
        lower_bound = args_dict[hp][0]
        upper_bound = args_dict[hp][1]

        # Explicit raise instead of assert: asserts vanish under `python -O`.
        if not lower_bound < upper_bound:
            raise ValueError("lower bound: " + str(lower_bound) +
                             " must be less than upper bound: " + str(upper_bound))

        # `type(...) is` (not isinstance) so bool bounds still fall through to the error.
        if type(lower_bound) is int and type(upper_bound) is int:
            random_values = [random.randint(lower_bound, upper_bound) for _ in range(samples)]
        elif type(lower_bound) is float and type(upper_bound) is float:
            random_values = [random.uniform(lower_bound, upper_bound) for _ in range(samples)]
        else:
            raise ValueError('Only float and int is currently supported')

        random_dict[hp] = random_values

    random_dict, new_samples = _remove_duplicates(random_dict, samples)

    sc.setJobGroup("Random Search", "{} | Hyperparameter Optimization".format(name))
    # Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(new_samples), new_samples)

    job_start = datetime.datetime.now()
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, random_dict, local_logdir))
    job_end = datetime.datetime.now()

    job_time_str = util._time_diff(job_start, job_end)

    arg_count = six.get_function_code(map_fun).co_argcount
    arg_names = six.get_function_code(map_fun).co_varnames
    hdfs_appid_dir = hopshdfs._get_experiments_dir() + '/' + app_id
    hdfs_runid_dir = _get_logdir(app_id)

    max_val, max_hp, min_val, min_hp, avg = _get_best(
        random_dict, new_samples, arg_names, arg_count, hdfs_appid_dir, run_id)

    param_combination = ""
    best_val = ""
    if direction in ('max', 'min'):
        # Direction only swaps which extreme is "best" — format the report once.
        if direction == 'max':
            best_hp, best_metric, worst_hp, worst_metric = max_hp, max_val, min_hp, min_val
        else:
            best_hp, best_metric, worst_hp, worst_metric = min_hp, min_val, max_hp, max_val
        param_combination = best_hp
        best_val = str(best_metric)

        results = '\n------ Random Search results ------ direction(' + direction + ') \n' \
            'BEST combination ' + best_hp + ' -- metric ' + str(best_metric) + '\n' \
            'WORST combination ' + worst_hp + ' -- metric ' + str(worst_metric) + '\n' \
            'AVERAGE metric -- ' + str(avg) + '\n' \
            'Total job time ' + job_time_str + '\n'
        _write_result(hdfs_runid_dir, results)
        print(results)

    print('Finished Experiment \n')

    return hdfs_runid_dir, param_combination, best_val