def launch(sc, map_fun, args_dict=None, local_logdir=False):
    """ Run the wrapper function with each hyperparameter combination as specified by the dictionary

    Args:
        :sc: SparkContext object
        :map_fun: The TensorFlow function to run
        :args_dict: (optional) A dictionary containing hyperparameter values to insert as arguments for each TensorFlow job
        :local_logdir: whether to store the TensorBoard logdir on the executor's local filesystem instead of HopsFS
    """
    global run_id
    app_id = str(sc.applicationId)

    if args_dict is None:
        num_executions = 1
    else:
        arg_lists = list(args_dict.values())
        currentLen = len(arg_lists[0])
        for i in range(len(arg_lists)):
            if currentLen != len(arg_lists[i]):
                raise ValueError('Length of each function argument list must be equal')
            num_executions = len(arg_lists[i])

    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(_prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))

    print('Finished Experiment \n')

    if args_dict is None:
        path_to_metric = get_logdir(app_id) + '/metric'
        if pydoop.hdfs.path.exists(path_to_metric):
            with pydoop.hdfs.open(path_to_metric, "r") as fi:
                metric = float(fi.read())
            return metric, hopshdfs.get_experiments_dir() + '/' + app_id + '/launcher/run.' + str(run_id)

    return None, hopshdfs.get_experiments_dir() + '/' + app_id + '/launcher/run.' + str(run_id)
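
# Illustrative usage sketch for launch(). `sc` is assumed to be an existing
# SparkContext and `train_fn` a user-defined TensorFlow function whose argument
# names match the keys of args_dict -- both names are hypothetical, not part of
# this module.
#
#   # Run train_fn with two hyperparameter combinations (one Spark task each):
#   args_dict = {'learning_rate': [0.001, 0.01], 'dropout': [0.4, 0.6]}
#   _, logdir = launch(sc, train_fn, args_dict)
#
#   # Or run it once without hyperparameters; the metric written to HDFS by
#   # the wrapped function (if any) is read back and returned:
#   metric, logdir = launch(sc, train_fn)
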
def _evolutionary_launch(spark_session, map_fun, args_dict=None):
    """ Run the wrapper function with each hyperparameter combination as specified by the dictionary

    Args:
        :spark_session: SparkSession object
        :map_fun: The TensorFlow function to run
        :args_dict: (optional) A dictionary containing hyperparameter values to insert as arguments for each TensorFlow job
    """
    sc = spark_session.sparkContext
    app_id = str(sc.applicationId)

    if args_dict is None:
        num_executions = 1
    else:
        arg_lists = list(args_dict.values())
        currentLen = len(arg_lists[0])
        for i in range(len(arg_lists)):
            if currentLen != len(arg_lists[i]):
                raise ValueError('Length of each function argument list must be equal')
            num_executions = len(arg_lists[i])

    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Force execution on executor, since GPU is located on executor
    global generation_id
    global run_id
    nodeRDD.foreachPartition(_prepare_func(app_id, generation_id, map_fun, args_dict, run_id))

    generation_id += 1

    return hopshdfs.get_experiments_dir() + '/' + app_id + "/"
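
# Sketch of how _evolutionary_launch appears to be driven (internal API,
# assumption based on this file only): the differential evolution loop calls it
# once per generation with a dictionary mapping the objective function's
# argument names to that generation's candidate values; the module-level
# generation_id counter is incremented on each call. Names below are
# illustrative.
#
#   args_dict = {'learning_rate': [0.001, 0.005, 0.01], 'dropout': [0.3, 0.4, 0.5]}
#   experiments_dir = _evolutionary_launch(spark_session, train_fn, args_dict)
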
def visualize(spark_session, hdfs_root_logdir):
    """ Visualize all TensorBoard events for a given path in HopsFS. This is intended for use after running TensorFlow jobs
    to visualize them all in the same TensorBoard. tflauncher.launch returns the path in HopsFS which should be handed as
    argument for this method to visualize all runs.

    Args:
        :spark_session: SparkSession object
        :hdfs_root_logdir: the path in HopsFS to enter as the logdir for TensorBoard
    """
    sc = spark_session.sparkContext
    app_id = str(sc.applicationId)

    pypath = os.getenv("PYSPARK_PYTHON")

    logdir = os.getcwd() + '/tensorboard_events/'
    if os.path.exists(logdir):
        shutil.rmtree(logdir)
    os.makedirs(logdir)

    #find free port
    tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tb_socket.bind(('', 0))
    tb_addr, tb_port = tb_socket.getsockname()

    tb_path = util.find_tensorboard()

    tb_socket.close()

    tb_env = os.environ.copy()
    tb_env['CUDA_VISIBLE_DEVICES'] = ''

    tb_proc = subprocess.Popen(
        [pypath, tb_path, "--logdir=%s" % logdir, "--port=%d" % tb_port],
        env=tb_env,
        preexec_fn=util.on_executor_exit('SIGTERM'))

    host = socket.gethostname()
    tb_url = "http://{0}:{1}".format(host, tb_port)
    tb_endpoint = hopshdfs.get_experiments_dir() + "/" + app_id + "/TensorBoard.driver"
    #dump tb host:port to hdfs
    pydoop.hdfs.dump(tb_url, tb_endpoint, user=hopshdfs.project_user())

    handle = hopshdfs.get()
    hdfs_logdir_entries = handle.list_directory(hdfs_root_logdir)
    for entry in hdfs_logdir_entries:
        file_name, extension = splitext(entry['name'])
        if extension != '.log':
            pydoop.hdfs.get(entry['name'], logdir)

    tb_proc.wait()
    stdout, stderr = tb_proc.communicate()
    print(stdout)
    print(stderr)
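
# Illustrative usage sketch for visualize(). `spark` and `hdfs_logdir` are
# placeholders for an existing SparkSession and the HopsFS experiments path
# returned by one of the launch functions above.
#
#   visualize(spark, hdfs_logdir)
#
# Note that the call blocks until the spawned TensorBoard process exits.
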
def _get_metric(param_string, app_id, generation_id, run_id):
    """ Look up the metric for a previously evaluated hyperparameter combination in any earlier generation """
    project_path = hopshdfs.project_path()
    handle = hopshdfs.get()
    for i in range(generation_id):
        possible_result_path = hopshdfs.get_experiments_dir() + '/' + app_id + '/differential_evolution/run.' \
            + str(run_id) + '/generation.' + str(i) + '/' + param_string + '/metric'
        if handle.exists(possible_result_path):
            with pydoop.hdfs.open(possible_result_path, "r") as fi:
                metric = float(fi.read())
            return metric

    return None
def get_logdir(app_id):
    global run_id
    return hopshdfs.get_experiments_dir() + "/" + app_id + "/differential_evolution/run." + str(run_id)
def _search(spark, function, search_dict, direction='max', generations=10, popsize=10, mutation=0.5, crossover=0.7, cleanup_generations=False, local_logdir=False):
    """ Run differential evolution over the hyperparameters of the objective function

    Args:
        :spark: SparkSession object
        :function: the objective function whose arguments are the hyperparameters to optimize
        :search_dict: a dictionary mapping each argument name of the objective function to its search bounds
        :direction: 'max' or 'min', whether to maximize or minimize the returned metric
        :generations: number of generations to evolve
        :popsize: population size per generation
        :mutation: mutation factor
        :crossover: crossover probability
        :cleanup_generations: whether to clean up the directories of earlier generations
        :local_logdir: whether to store the TensorBoard logdir on the executor's local filesystem instead of HopsFS
    """
    global run_id
    global local_logdir_bool
    local_logdir_bool = local_logdir

    global spark_session
    spark_session = spark

    global objective_function
    objective_function = function

    global cleanup
    cleanup = cleanup_generations

    argcount = six.get_function_code(function).co_argcount
    arg_names = six.get_function_code(function).co_varnames

    ordered_arr = []

    app_id = spark.sparkContext.applicationId

    argIndex = 0
    while argcount != 0:
        ordered_arr.append((arg_names[argIndex], search_dict[arg_names[argIndex]]))
        argcount = argcount - 1
        argIndex = argIndex + 1

    ordered_dict = OrderedDict(ordered_arr)

    bounds_list = []
    types_list = []

    for entry in ordered_dict:
        bounds_list.append((ordered_dict[entry][0], ordered_dict[entry][1]))

        if isinstance(ordered_dict[entry][0], int):
            types_list.append('int')
        elif isinstance(ordered_dict[entry][0], float):
            types_list.append('float')
        else:
            types_list.append('cat')

    global diff_evo
    diff_evo = DifferentialEvolution(execute_all, bounds_list, types_list, ordered_dict,
                                     direction=direction, generations=generations, popsize=popsize,
                                     crossover=crossover, mutation=mutation)

    root_dir = hopshdfs.get_experiments_dir() + "/" + str(app_id) + "/differential_evolution/run." + str(run_id)

    best_param, best_metric = diff_evo.solve(root_dir)

    print('Finished Experiment \n')

    return str(root_dir), best_param, best_metric
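
# Illustrative sketch of how _search is typically invoked by the public entry
# point of this module (`train_fn` and the values below are assumptions, not
# part of this file). The search dictionary maps each argument of the objective
# function to its (lower, upper) bounds, and the returned tuple contains the
# HDFS run directory, the best hyperparameter combination and the best metric.
#
#   def train_fn(learning_rate, dropout):
#       ...
#       return accuracy
#
#   search_dict = {'learning_rate': [0.001, 0.1], 'dropout': [0.1, 0.7]}
#   log_dir, best_param, best_metric = _search(spark, train_fn, search_dict,
#                                              direction='max', generations=5,
#                                              popsize=10)
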
def get_logdir(app_id):
    global run_id
    return hopshdfs.get_experiments_dir() + '/' + app_id + '/tensorflowonspark/run.' + str(run_id)
def get_logdir(app_id):
    global run_id
    return hopshdfs.get_experiments_dir() + '/' + app_id + '/grid_search/run.' + str(run_id)
def _grid_launch(sc, map_fun, args_dict, direction='max', local_logdir=False):
    """ Run the wrapper function with each hyperparameter combination as specified by the dictionary

    Args:
        :sc: SparkContext object
        :map_fun: The TensorFlow function to run
        :args_dict: A dictionary containing hyperparameter values to insert as arguments for each TensorFlow job
        :direction: 'max' or 'min', whether the best metric is the largest or the smallest one
        :local_logdir: whether to store the TensorBoard logdir on the executor's local filesystem instead of HopsFS
    """
    global run_id

    app_id = str(sc.applicationId)
    num_executions = 1

    if direction != 'max' and direction != 'min':
        raise ValueError('Invalid direction ' + direction + ', must be max or min')

    arg_lists = list(args_dict.values())
    currentLen = len(arg_lists[0])
    for i in range(len(arg_lists)):
        if currentLen != len(arg_lists[i]):
            raise ValueError('Length of each function argument list must be equal')
        num_executions = len(arg_lists[i])

    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Force execution on executor, since GPU is located on executor
    job_start = datetime.datetime.now()
    nodeRDD.foreachPartition(_prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))
    job_end = datetime.datetime.now()

    job_time_str = util.time_diff(job_start, job_end)

    arg_count = six.get_function_code(map_fun).co_argcount
    arg_names = six.get_function_code(map_fun).co_varnames
    hdfs_appid_dir = hopshdfs.get_experiments_dir() + '/' + app_id
    hdfs_runid_dir = hdfs_appid_dir + '/grid_search/run.' + str(run_id)

    max_val, max_hp, min_val, min_hp, avg = _get_best(args_dict, num_executions, arg_names, arg_count, hdfs_appid_dir, run_id)

    param_combination = ""
    best_val = ""

    if direction == 'max':
        param_combination = max_hp
        best_val = str(max_val)
        results = '\n------ Grid search results ------ direction(' + direction + ') \n' \
                  'BEST combination ' + max_hp + ' -- metric ' + str(max_val) + '\n' \
                  'WORST combination ' + min_hp + ' -- metric ' + str(min_val) + '\n' \
                  'AVERAGE metric -- ' + str(avg) + '\n' \
                  'Total job time ' + job_time_str + '\n'
        write_result(hdfs_runid_dir, results)
        print(results)
    elif direction == 'min':
        param_combination = min_hp
        best_val = str(min_val)
        results = '\n------ Grid search results ------ direction(' + direction + ') \n' \
                  'BEST combination ' + min_hp + ' -- metric ' + str(min_val) + '\n' \
                  'WORST combination ' + max_hp + ' -- metric ' + str(max_val) + '\n' \
                  'AVERAGE metric -- ' + str(avg) + '\n' \
                  'Total job time ' + job_time_str + '\n'
        write_result(hdfs_runid_dir, results)
        print(results)

    print('Finished Experiment \n')

    return hdfs_runid_dir, param_combination, best_val
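
# Illustrative usage sketch for _grid_launch(). `sc` is an existing SparkContext
# and `train_fn` a user-defined TensorFlow function (hypothetical names). In
# this launcher every argument list must have the same length and the i-th
# elements of all lists are run together as the i-th combination; any cartesian
# expansion of the grid is presumably done by the caller.
#
#   args_dict = {'learning_rate': [0.001, 0.01, 0.1], 'dropout': [0.3, 0.4, 0.5]}
#   run_dir, best_combination, best_val = _grid_launch(sc, train_fn, args_dict,
#                                                      direction='max')
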
def get_logdir(app_id):
    global run_id
    return hopshdfs.get_experiments_dir() + '/' + app_id + '/launcher/run.' + str(run_id)