def visualize(hdfs_root_logdir):
    """
    Visualize all TensorBoard events for a given path in HopsFS.

    This is intended for use after running TensorFlow jobs to visualize them
    all in the same TensorBoard. tflauncher.launch returns the path in HopsFS
    which should be handed as argument for this method to visualize all runs.

    A TensorBoard process is started on a free local port against a freshly
    created ``tensorboard_events`` directory under the current working
    directory. Its URL is written to
    ``<experiments dir>/<application id>/TensorBoard.visualize`` in HDFS and
    every entry of ``hdfs_root_logdir`` whose name does not have a ``.log``
    extension is copied into the local directory. The call then blocks until
    the TensorBoard process exits.

    Args:
        :hdfs_root_logdir: the path in HopsFS to enter as the logdir for TensorBoard

    Returns:
        None
    """
    sc = util._find_spark().sparkContext
    app_id = str(sc.applicationId)

    pypath = os.getenv("PYSPARK_PYTHON")

    # Always start from an empty local event directory.
    logdir = os.getcwd() + '/tensorboard_events/'
    if os.path.exists(logdir):
        shutil.rmtree(logdir)
    os.makedirs(logdir)

    # Find a free port by binding to port 0. The socket is released in a
    # ``finally`` so it is not leaked when locating TensorBoard fails.
    tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        tb_socket.bind(('', 0))
        tb_port = tb_socket.getsockname()[1]
        tb_path = util._find_tensorboard()
    finally:
        tb_socket.close()

    # TensorBoard runs with a copy of the current environment in which
    # CUDA_VISIBLE_DEVICES is emptied and LC_ALL is forced to 'C'.
    tb_env = os.environ.copy()
    tb_env['CUDA_VISIBLE_DEVICES'] = ''
    tb_env['LC_ALL'] = 'C'

    tb_proc = subprocess.Popen(
        [
            pypath,
            tb_path,
            "--logdir=%s" % logdir,
            "--port=%d" % tb_port,
            "--host=%s" % "0.0.0.0",
        ],
        env=tb_env,
        preexec_fn=util._on_executor_exit('SIGTERM'))

    host = socket.gethostname()
    tb_url = "http://{0}:{1}".format(host, tb_port)
    tb_endpoint = hopshdfs._get_experiments_dir() + "/" + app_id + "/TensorBoard.visualize"

    # dump tb host:port to hdfs
    pydoop.hdfs.dump(tb_url, tb_endpoint, user=hopshdfs.project_user())

    # Copy everything except *.log entries next to the running TensorBoard.
    handle = hopshdfs.get()
    for entry in handle.list_directory(hdfs_root_logdir):
        extension = os.path.splitext(entry['name'])[1]
        if extension != '.log':
            pydoop.hdfs.get(entry['name'], logdir)

    # communicate() itself waits for the process to terminate, so no separate
    # wait() is needed. No pipes were requested above, hence both values are
    # None; they are still printed to keep the output unchanged.
    stdout, stderr = tb_proc.communicate()
    print(stdout)
    print(stderr)
def get_portable_runner_config(sdk_worker_parallelism=1, worker_threads=100, pre_optimize="all", execution_mode_for_batch="BATCH_FORCED"):
    """
    Instantiate a list of pipeline configuration options for the PortableRunner.

    The list combines fixed flags (``--runner=PortableRunner``,
    ``--environment_type=PROCESS``) with values read from ``hopsfs``, the
    module-level ``jobserver_host`` / ``jobserver_port`` and
    ``get_sdk_worker()``, followed by the four tunables below.

    Args:
        sdk_worker_parallelism: value appended to ``--sdk_worker_parallelism=``
        worker_threads: value appended to the ``worker_threads`` experiment flag
        pre_optimize: value appended to ``--experiments=pre_optimize=``
        execution_mode_for_batch: value appended to ``--execution_mode_for_batch=``

    Returns:
        a list of pipeline configuration options for the PortableRunner.
    """
    return [
        '--runner=PortableRunner',
        '--hdfs_host=' + str(hopsfs.get_webhdfs_host()),
        '--hdfs_port=' + str(hopsfs.get_webhdfs_port()),
        # NOTE(review): the text between '--hdfs_user=' and '--job_endpoint='
        # is masked ("******") in the reviewed copy of this file, so the value
        # of the hdfs_user option cannot be seen here and the line below is
        # not valid Python as it stands - restore it from the unredacted source.
        '--hdfs_user='******'--job_endpoint=' + jobserver_host + ":" + str(jobserver_port),
        '--environment_type=PROCESS',
        '--environment_config=' + '{"command":"' + get_sdk_worker() + '"}',
        '--sdk_worker_parallelism=' + str(sdk_worker_parallelism),
        # NOTE(review): unlike the pre_optimize entry below there is no '='
        # between the experiment name and its value, so the default produces
        # '--experiments=worker_threads100' - confirm whether
        # 'worker_threads=' was intended.
        '--experiments=worker_threads' + str(worker_threads),
        '--experiments=pre_optimize=' + pre_optimize,
        '--execution_mode_for_batch=' + execution_mode_for_batch
    ]
def _wrapper_fun(iter):
    """
    Run ``map_fun`` once on this executor for the 'launcher' experiment type.

    When ``args_dict`` is non-empty, one value per positional parameter of
    ``map_fun`` is looked up as ``args_dict[<parameter name>][executor_num]``
    and the function is called with those values; its return value is
    ignored. Otherwise ``map_fun`` is called without arguments and a truthy
    return value is handed to ``_handle_return``.

    Args:
        iter: iterable whose last element is used as the executor number

    Returns:
        None
    """
    for executor_num in iter:
        pass

    tb_pid = 0
    tb_hdfs_path = ''
    hdfs_exec_logdir = ''
    separator = '-------------------------------------------------------'

    gpu_monitor = threading.Thread(target=devices._print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        gpu_monitor.start()

    def _stop_gpu_monitor():
        # Signal the monitoring thread to stop and wait for it.
        if devices.get_num_gpus() > 0:
            gpu_monitor.do_run = False
            gpu_monitor.join()

    try:
        has_args = bool(args_dict)
        if has_args:
            # Positional parameter names of map_fun, in declaration order.
            fn_code = six.get_function_code(map_fun)
            arg_names = fn_code.co_varnames[:fn_code.co_argcount]
            args = [args_dict[arg_name][executor_num] for arg_name in arg_names]
            # "name=value" pairs joined by '.', e.g. "lr=0.1.dropout=0.5".
            param_string = '.'.join(
                str(arg_name) + '=' + str(arg_val)
                for arg_name, arg_val in zip(arg_names, args))
            task_suffix = ' ' + param_string
        else:
            args = []
            param_string = None
            task_suffix = ''

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
            app_id, run_id, param_string, 'launcher')
        pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
        hopshdfs._init_logger()
        tb_hdfs_path, tb_pid = tensorboard._register(
            hdfs_exec_logdir, hdfs_appid_logdir, executor_num, local_logdir=local_logdir)

        gpu_str = '\nChecking for GPUs in the environment' + devices._get_gpu_info()
        hopshdfs.log(gpu_str)
        print(gpu_str)
        print(separator)
        print('Started running task' + task_suffix + '\n')
        hopshdfs.log('Started running task' + task_suffix)

        task_start = datetime.datetime.now()
        retval = map_fun(*args)
        task_end = datetime.datetime.now()

        # Only the argument-less form forwards the return value.
        if not has_args and retval:
            _handle_return(retval, hdfs_exec_logdir)

        time_str = 'Finished task' + task_suffix + ' - took ' + util._time_diff(
            task_start, task_end)
        print('\n' + time_str)
        print(separator)
        hopshdfs.log(time_str)
    except:
        #Always do cleanup
        _cleanup(tb_hdfs_path)
        _stop_gpu_monitor()
        raise
    finally:
        # Best effort: failing to persist local TensorBoard data must not
        # prevent the cleanup below.
        try:
            if local_logdir:
                util._store_local_tensorboard(
                    tensorboard.local_logdir_path, hdfs_exec_logdir)
        except:
            pass

        _cleanup(tb_hdfs_path)
        _stop_gpu_monitor()
def _wrapper_fun(iter):
    """
    Register this executor with the all-reduce reservation server, export the
    resulting cluster as ``TF_CONFIG`` and run ``map_fun``.

    Only the executor whose task index is 0 creates the HDFS experiment
    directories, registers TensorBoard, writes to the HDFS log and forwards a
    truthy return value of ``map_fun`` to ``_handle_return``.

    Args:
        iter: iterable whose last element is used as the executor number

    Returns:
        None
    """
    for executor_num in iter:
        pass

    tb_hdfs_path = ''
    hdfs_exec_logdir = ''
    separator = '-------------------------------------------------------'

    gpu_monitor = threading.Thread(target=devices._print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        gpu_monitor.start()

    def _stop_gpu_monitor():
        # Signal the monitoring thread to stop and wait for it.
        if devices.get_num_gpus() > 0:
            gpu_monitor.do_run = False
            gpu_monitor.join()

    task_index = None
    is_chief = False

    try:
        host = util._get_ip_address()

        # Bind to port 0 to obtain a free port; the socket stays open until
        # the reservations have been collected.
        tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        tmp_socket.bind(('', 0))
        port = tmp_socket.getsockname()[1]

        client = allreduce_reservation.Client(server_addr)
        host_port = host + ":" + str(port)

        client.register({"worker": host_port, "index": executor_num})
        cluster = client.await_reservations()
        tmp_socket.close()
        client.close()

        task_index = _find_index(host_port, cluster)
        is_chief = task_index == 0

        cluster["task"] = {"type": "worker", "index": task_index}
        os.environ["TF_CONFIG"] = json.dumps(cluster)

        if is_chief:
            hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                app_id, run_id, None, 'collective_all_reduce')
            pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
            hopshdfs._init_logger()
            tb_hdfs_path, tb_pid = tensorboard._register(
                hdfs_exec_logdir, hdfs_appid_logdir, executor_num, local_logdir=local_logdir)

        gpu_str = '\nChecking for GPUs in the environment' + devices._get_gpu_info()
        if is_chief:
            hopshdfs.log(gpu_str)
        print(gpu_str)
        print(separator)
        print('Started running task \n')
        if is_chief:
            hopshdfs.log('Started running task')

        task_start = datetime.datetime.now()
        retval = map_fun()
        if is_chief and retval:
            _handle_return(retval, hdfs_exec_logdir)
        task_end = datetime.datetime.now()

        time_str = 'Finished task - took ' + util._time_diff(task_start, task_end)
        print('\n' + time_str)
        print(separator)
        if is_chief:
            hopshdfs.log(time_str)
    except:
        #Always do cleanup
        _cleanup(tb_hdfs_path)
        _stop_gpu_monitor()
        raise
    finally:
        if is_chief and local_logdir:
            util._store_local_tensorboard(
                tensorboard.local_logdir_path, hdfs_exec_logdir)

        _cleanup(tb_hdfs_path)
        _stop_gpu_monitor()
def _wrapper_fun(iter):
    """
    Evaluate one hyperparameter combination of a differential-evolution
    generation on this executor and store the resulting metric in HDFS.

    Nothing is executed unless ``args_dict`` is truthy. The value of each
    positional parameter of ``map_fun`` is read from
    ``args_dict[<parameter name>][executor_num]``. If ``_get_metric`` returns
    a truthy value for the parameter string, ``map_fun`` is not called and
    that value is used instead. The metric is written as
    ``str(float(val))`` to ``<hdfs_exec_logdir>/metric``.

    Args:
        iter: iterable whose last element is used as the executor number

    Raises:
        ValueError: if the metric cannot be converted with ``int()``
    """
    for i in iter:
        executor_num = i

    tb_pid = 0
    tb_hdfs_path = ''
    hdfs_exec_logdir = ''

    t = threading.Thread(target=devices.print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t.start()

    global local_logdir_bool

    try:
        #Arguments
        if args_dict:
            argcount = six.get_function_code(map_fun).co_argcount
            names = six.get_function_code(map_fun).co_varnames

            args = []
            argIndex = 0
            param_string = ''
            while argcount > 0:
                #Get args for executor and run function
                param_name = names[argIndex]
                param_val = args_dict[param_name][executor_num]
                param_string += str(param_name) + '=' + str(
                    param_val) + '.'
                args.append(param_val)
                argcount -= 1
                argIndex += 1
            # Drop the trailing '.' left by the loop above.
            param_string = param_string[:-1]

            # presumably the metric stored by an earlier run of the same
            # combination - verify against _get_metric
            val = _get_metric(param_string, app_id, generation_id, run_id)
            hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
                app_id,
                run_id,
                param_string,
                'differential_evolution',
                sub_type='generation.' + str(generation_id))
            pydoop.hdfs.dump('',
                             os.environ['EXEC_LOGFILE'],
                             user=hopshdfs.project_user())
            hopshdfs.init_logger()
            tb_hdfs_path, tb_pid = tensorboard.register(
                hdfs_exec_logdir,
                hdfs_appid_logdir,
                executor_num,
                local_logdir=local_logdir_bool)
            gpu_str = '\nChecking for GPUs in the environment' + devices.get_gpu_info(
            )
            hopshdfs.log(gpu_str)
            print(gpu_str)
            print(
                '-------------------------------------------------------')
            print('Started running task ' + param_string + '\n')
            if val:
                print('Reading returned metric from previous run: ' +
                      str(val))
            hopshdfs.log('Started running task ' + param_string)
            task_start = datetime.datetime.now()
            # NOTE(review): a falsy value (e.g. a metric of 0) also takes
            # this branch and re-runs map_fun - confirm that is intended.
            if not val:
                val = map_fun(*args)
            task_end = datetime.datetime.now()
            time_str = 'Finished task ' + param_string + ' - took ' + util.time_diff(
                task_start, task_end)
            print('\n' + time_str)
            hopshdfs.log(time_str)

            # The conversion only validates that val is numeric; castval
            # itself is not used afterwards.
            try:
                castval = int(val)
            except:
                raise ValueError(
                    'Your function needs to return a metric (number) which should be maximized or minimized'
                )

            metric_file = hdfs_exec_logdir + '/metric'
            fs_handle = hopshdfs.get_fs()
            # Try the 'mode' keyword first and fall back to 'flags';
            # presumably this covers different pydoop versions - TODO confirm.
            try:
                fd = fs_handle.open_file(metric_file, mode='w')
            except:
                fd = fs_handle.open_file(metric_file, flags='w')
            fd.write(str(float(val)).encode())
            fd.flush()
            fd.close()
            print('Returning metric ' + str(val))
            print(
                '-------------------------------------------------------')
    except:
        #Always do cleanup
        if tb_hdfs_path:
            _cleanup(tb_hdfs_path)
        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join()
        raise
    finally:
        # Runs on success and, after the handler above, on failure as well.
        if local_logdir_bool:
            local_tb = tensorboard.local_logdir_path
            util.store_local_tensorboard(local_tb, hdfs_exec_logdir)
        hopshdfs.log('Finished running')
        if tb_hdfs_path:
            _cleanup(tb_hdfs_path)
        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join()
def _register(hdfs_exec_dir, endpoint_dir, exec_num, local_logdir=False):
    """
    Start a TensorBoard process for one executor and publish its URL in HDFS.

    A previously registered TensorBoard (``tb_pid != 0``) is sent a ``kill``
    first and the module state is reset through ``_reset_global()``. The new
    process listens on a free port on ``0.0.0.0`` and reads either a freshly
    created ``<cwd>/local_logdir/`` directory (``local_logdir=True``) or
    ``hdfs_exec_dir``. The URL ``http://<hostname>:<port>`` is written to
    ``<endpoint_dir>/TensorBoard.task<exec_num>``.

    The function updates the module globals ``tb_pid``, ``events_logdir``,
    ``local_logdir_bool``, ``pypath``, ``tb_port``, ``tb_path``,
    ``local_logdir_path``, ``tb_url`` and ``endpoint``.

    Args:
        hdfs_exec_dir: HDFS directory used as TensorBoard logdir when
            ``local_logdir`` is False
        endpoint_dir: HDFS directory in which the endpoint file is written
        exec_num: executor number, used as suffix of the endpoint file name
        local_logdir: True to let TensorBoard read from the local filesystem

    Returns:
        tuple ``(endpoint, tb_pid)``: HDFS path of the endpoint file and the
        pid of the TensorBoard process
    """
    global tb_pid, events_logdir, local_logdir_bool
    global pypath, tb_port, tb_path, local_logdir_path, tb_url, endpoint

    if tb_pid != 0:
        subprocess.Popen(["kill", str(tb_pid)])

    _reset_global()

    events_logdir = hdfs_exec_dir
    local_logdir_bool = local_logdir

    # presumably _reset_global() sets tb_pid back to 0 so that this branch is
    # taken on every call - verify against _reset_global
    if tb_pid == 0:
        pypath = os.getenv("PYSPARK_PYTHON")

        # Find a free port by binding to port 0. The socket is released in a
        # ``finally`` so it is not leaked when locating TensorBoard fails.
        tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            tb_socket.bind(('', 0))
            tb_port = tb_socket.getsockname()[1]
            tb_path = experiment_utils._find_tensorboard()
        finally:
            tb_socket.close()

        tb_env = _init_tb_env()

        if local_logdir:
            # Always start from an empty local directory.
            local_logdir_path = os.getcwd() + '/local_logdir'
            if os.path.exists(local_logdir_path):
                shutil.rmtree(local_logdir_path)
            os.makedirs(local_logdir_path)
            local_logdir_path = local_logdir_path + '/'
            tb_logdir = local_logdir_path
        else:
            tb_logdir = events_logdir

        tb_proc = subprocess.Popen(
            [
                pypath,
                tb_path,
                "--logdir=%s" % tb_logdir,
                "--port=%d" % tb_port,
                "--host=%s" % "0.0.0.0",
            ],
            env=tb_env,
            preexec_fn=util._on_executor_exit('SIGTERM'))

        tb_pid = tb_proc.pid

        host = socket.gethostname()
        tb_url = "http://{0}:{1}".format(host, tb_port)
        endpoint = endpoint_dir + "/TensorBoard.task" + str(exec_num)

    # dump tb host:port to hdfs
    pydoop.hdfs.dump(tb_url, endpoint, user=hopshdfs.project_user())

    return endpoint, tb_pid
def begin(name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    Start a custom Experiment, at the end of the experiment call *end(metric)*.

    *IMPORTANT* - This call should not be combined with other functions in the experiment module, other than *end*.
    Other experiment functions such as *grid_search* manages the *begin* and *end* functions internally

    Example usage:

    >>> from hops import experiment
    >>> experiment.begin(name='calculate pi')
    >>> # Code to calculate pi
    >>> pi = calc_pi()
    >>> experiment.end(pi)

    Args:
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: A longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    Raises:
        RuntimeError: if an experiment is already running. Any exception
            raised during setup is re-raised after ``_exception_handler()``
            has been called.
    """
    global running
    global app_id
    global experiment_json
    global elastic_id
    global run_id
    global driver_tensorboard_hdfs_path

    if running:
        # The counterpart of begin() is end(), so point the user at it.
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    try:
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        run_id = run_id + 1

        versioned_path = util._version_resources(versioned_resources, _get_logdir(app_id))

        # Cleared before it is rebuilt so that no description of an earlier
        # run remains in the global if _populate_experiment raises.
        experiment_json = None
        experiment_json = util._populate_experiment(sc, name, 'experiment', 'begin', _get_logdir(app_id), None, versioned_path, description)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(app_id, run_id, None, 'begin')

        pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())

        hopshdfs._init_logger()

        driver_tensorboard_hdfs_path, _ = tensorboard._register(hdfs_exec_logdir, hdfs_appid_logdir, 0, local_logdir=local_logdir)
    except:
        _exception_handler()
        raise

    return driver_tensorboard_hdfs_path
def _wrapper_fun(iter):
    """
    Register this executor with the parameter-server reservation server,
    export the resulting cluster as ``TF_CONFIG`` and run ``map_fun``.

    Executors numbered below ``num_ps`` register as "ps", all others as
    "worker". A "ps" executor runs ``map_fun`` in a background thread and
    blocks until the reservation client reports that all workers finished;
    every other role runs ``map_fun`` directly. Only the "chief" creates the
    HDFS experiment directories, registers TensorBoard, writes to the HDFS
    log and forwards a truthy return value to ``_handle_return``.

    Args:
        iter: iterable whose last element is used as the executor number

    Returns:
        None
    """
    for executor_num in iter:
        pass

    tb_hdfs_path = ''
    hdfs_exec_logdir = ''
    separator = '-------------------------------------------------------'

    gpu_monitor = threading.Thread(target=devices._print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        gpu_monitor.start()

    def _stop_gpu_monitor():
        # Signal the monitoring thread to stop and wait for it.
        if devices.get_num_gpus() > 0:
            gpu_monitor.do_run = False
            gpu_monitor.join()

    role = None
    is_chief = False

    client = parameter_server_reservation.Client(server_addr)

    try:
        host = util._get_ip_address()

        # Bind to port 0 to obtain a free port; the socket stays open until
        # the reservations have been collected.
        tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        tmp_socket.bind(('', 0))
        port = tmp_socket.getsockname()[1]
        host_port = host + ":" + str(port)

        exec_spec = {
            "task_type": "ps" if executor_num < num_ps else "worker",
            "host_port": host_port,
            "gpus_present": devices.get_num_gpus() > 0,
        }

        client.register(exec_spec)
        cluster = client.await_reservations()
        tmp_socket.close()

        role, index = _find_task_and_index(host_port, cluster)
        is_chief = role == "chief"

        cluster_spec = {
            "cluster": cluster,
            "task": {"type": role, "index": index},
        }
        print(cluster_spec)
        os.environ["TF_CONFIG"] = json.dumps(cluster_spec)

        if is_chief:
            hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                app_id, run_id, None, 'parameter_server')
            pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
            hopshdfs._init_logger()
            tb_hdfs_path, tb_pid = tensorboard._register(
                hdfs_exec_logdir, hdfs_appid_logdir, executor_num, local_logdir=local_logdir)

        gpu_str = '\nChecking for GPUs in the environment' + devices._get_gpu_info()
        if is_chief:
            hopshdfs.log(gpu_str)
        print(gpu_str)
        print(separator)
        print('Started running task \n')
        if is_chief:
            hopshdfs.log('Started running task')

        task_start = datetime.datetime.now()

        retval = None
        if role == "ps":
            # The parameter server runs map_fun in the background; this
            # thread only waits for the workers to report completion.
            ps_thread = threading.Thread(target=map_fun)
            ps_thread.start()
            print("waiting for workers")
            client.await_all_workers_finished()
            print("waiting finished")
        else:
            retval = map_fun()

        if is_chief and retval:
            _handle_return(retval, hdfs_exec_logdir)

        task_end = datetime.datetime.now()
        time_str = 'Finished task - took ' + util._time_diff(task_start, task_end)
        print('\n' + time_str)
        print(separator)
        if is_chief:
            hopshdfs.log(time_str)
    except:
        _cleanup(tb_hdfs_path)
        _stop_gpu_monitor()
        raise
    finally:
        if is_chief and local_logdir:
            util._store_local_tensorboard(
                tensorboard.local_logdir_path, hdfs_exec_logdir)

        # Best effort: a failure while signing off must not prevent the
        # cleanup below.
        try:
            if role in ("worker", "chief"):
                client.register_worker_finished()
            client.close()
        except:
            pass

        _cleanup(tb_hdfs_path)
        _stop_gpu_monitor()
def _wrapper_fun(iter):
    """
    Download a notebook from HDFS, convert it to a Python script and run it
    under ``mpirun`` (Horovod) on this executor.

    Steps:
      1. copy the notebook at ``nb_path`` into the current working directory
      2. convert it with ``jupyter nbconvert --to python``
      3. mark the generated script as executable
      4. run it with ``mpirun -np $MPI_NP``, sending stdout and stderr of
         mpirun to ``<cwd>/mpirun.log``

    TensorBoard is registered only on executor 0. When ``local_logdir`` is
    set, the local TensorBoard directory is uploaded to the HDFS experiment
    directory after mpirun has finished.

    Args:
        iter: iterable whose last element is used as the executor number

    Returns:
        None

    Raises:
        Exception: if mpirun exits with a non-zero return code
    """
    for i in iter:
        executor_num = i

    hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
        app_id, run_id, None, 'horovod')

    tb_pid = 0
    tb_hdfs_path = ''

    pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
    hopshdfs.init_logger()
    hopshdfs.log('Starting Spark executor with arguments')
    if executor_num == 0:
        tb_hdfs_path, tb_pid = tensorboard.register(
            hdfs_exec_logdir, hdfs_appid_logdir, 0, local_logdir=local_logdir)

    gpu_str = '\n\nChecking for GPUs in the environment\n' + devices.get_gpu_info()
    hopshdfs.log(gpu_str)
    print(gpu_str)

    # 1. Download notebook file
    fs_handle = hopshdfs.get_fs()

    # Try the 'flags' keyword first and fall back to 'mode'; presumably this
    # covers different pydoop versions - TODO confirm.
    try:
        fd = fs_handle.open_file(nb_path, flags='r')
    except:
        fd = fs_handle.open_file(nb_path, mode='r')

    # Close the HDFS handle even when reading fails.
    try:
        notebook = ''.join(fd)
    finally:
        fd.close()

    path, filename = os.path.split(nb_path)
    with open(filename, "w+") as f_nb:
        f_nb.write(notebook)

    # 2. Convert notebook to py file
    jupyter_runnable = os.path.abspath(
        os.path.join(os.environ['PYSPARK_PYTHON'], os.pardir)) + '/jupyter'
    # An argument list (no shell) keeps notebook names containing spaces or
    # shell metacharacters intact.
    conversion = subprocess.Popen(
        [jupyter_runnable, 'nbconvert', '--to', 'python', filename],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    # communicate() drains both pipes while waiting. Calling wait() first can
    # deadlock once the child fills a pipe buffer.
    stdout, stderr = conversion.communicate()
    print(stdout)
    print(stderr)

    # 3. Make py file runnable
    # Only the last extension is replaced, so a notebook named "a.b.ipynb"
    # maps to "a.b.py" instead of "a.py".
    py_runnable = os.getcwd() + '/' + os.path.splitext(filename)[0] + '.py'
    st = os.stat(py_runnable)
    os.chmod(py_runnable, st.st_mode | stat.S_IEXEC)

    t_gpus = threading.Thread(target=devices.print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t_gpus.start()

    mpi_logfile_path = os.getcwd() + '/mpirun.log'
    if os.path.exists(mpi_logfile_path):
        os.remove(mpi_logfile_path)

    # 4. Run allreduce
    mpi_np = os.environ['MPI_NP']
    mpi_cmd = ('HOROVOD_TIMELINE=' + tensorboard.logdir() + '/timeline.json' +
               ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() +
               ' mpirun -np ' + str(mpi_np) +
               ' -bind-to none -map-by slot ' +
               ' -x HOROVOD_TIMELINE ' +
               ' -x TENSORBOARD_LOGDIR ' +
               ' -x NCCL_DEBUG=INFO ' +
               os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable)

    # The log file handle is closed as soon as mpirun has exited.
    with open(mpi_logfile_path, 'w') as mpi_logfile:
        mpi = subprocess.Popen(
            mpi_cmd,
            shell=True,
            stdout=mpi_logfile,
            stderr=mpi_logfile,
            preexec_fn=util.on_executor_exit('SIGTERM'))

        t_log = threading.Thread(target=print_log)
        t_log.start()

        mpi.wait()

    if devices.get_num_gpus() > 0:
        t_gpus.do_run = False
        t_gpus.join()

    return_code = mpi.returncode

    if local_logdir:
        local_tb = tensorboard.local_logdir_path
        pydoop.hdfs.put(local_tb, hdfs_exec_logdir)

    # Cleanup is identical for success and failure; only the outcome differs.
    cleanup(tb_hdfs_path)
    t_log.do_run = False
    t_log.join()

    if return_code != 0:
        raise Exception('mpirun FAILED, look in the logs for the error')

    hopshdfs.kill_logger()
def begin(spark, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    Start an experiment

    Versions the given resources, stores the experiment description through
    ``util.put_elastic``, creates the HDFS directories for the run and
    registers the driver TensorBoard. The HDFS path of the TensorBoard
    endpoint is kept in the module global ``driver_tensorboard_hdfs_path``.

    Args:
        :spark: SparkSession object
        :name: (optional) name of the job
        :local_logdir: forwarded to ``tensorboard.register`` as ``local_logdir``
        :versioned_resources: forwarded to ``util.version_resources``
        :description: forwarded to ``util.populate_experiment``

    Returns:
        None

    Raises:
        RuntimeError: if an experiment is already running. Any exception
            raised during setup is re-raised after ``exception_handler()``
            has been called.
    """
    global running
    global app_id
    global experiment_json
    global elastic_id
    global run_id
    global driver_tensorboard_hdfs_path

    if running:
        raise RuntimeError(
            "An experiment is currently running. Please call experiment.stop() to stop it."
        )

    try:
        running = True

        sc = spark.sparkContext
        app_id = str(sc.applicationId)

        run_id = run_id + 1

        # Resources are versioned exactly once; the returned path is what
        # gets recorded in the experiment description.
        versioned_path = util.version_resources(versioned_resources, get_logdir(app_id))

        # Cleared before it is rebuilt so that no description of an earlier
        # run remains in the global if populate_experiment raises.
        experiment_json = None
        experiment_json = util.populate_experiment(
            sc, name, 'experiment', 'begin', get_logdir(app_id), None,
            versioned_path, description)

        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
            app_id, run_id, None, 'begin')

        pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())

        hopshdfs.init_logger()

        driver_tensorboard_hdfs_path, _ = tensorboard.register(
            hdfs_exec_logdir,
            hdfs_appid_logdir,
            0,
            local_logdir=local_logdir,
            tensorboard_driver=True)
    except:
        exception_handler()
        raise

    return
def _wrapper_fun(iter):
    """
    Run ``map_fun`` once on this executor for the 'mirrored' experiment type.

    The executor creates its HDFS experiment directories, registers
    TensorBoard, calls ``map_fun`` without arguments and hands a truthy
    return value to ``_handle_return``.

    Args:
        iter: iterable whose last element is used as the executor number

    Returns:
        None
    """
    for executor_num in iter:
        pass

    tb_pid = 0
    tb_hdfs_path = ''
    hdfs_exec_logdir = ''
    separator = '-------------------------------------------------------'

    gpu_monitor = threading.Thread(target=devices._print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        gpu_monitor.start()

    def _stop_gpu_monitor():
        # Signal the monitoring thread to stop and wait for it.
        if devices.get_num_gpus() > 0:
            gpu_monitor.do_run = False
            gpu_monitor.join()

    try:
        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
            app_id, run_id, None, 'mirrored')
        pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
        hopshdfs._init_logger()
        tb_hdfs_path, tb_pid = tensorboard._register(
            hdfs_exec_logdir, hdfs_appid_logdir, executor_num, local_logdir=local_logdir)

        gpu_str = '\nChecking for GPUs in the environment' + devices._get_gpu_info()
        hopshdfs.log(gpu_str)
        print(gpu_str)
        print(separator)
        print('Started running task\n')
        hopshdfs.log('Started running task')

        task_start = datetime.datetime.now()
        retval = map_fun()
        task_end = datetime.datetime.now()

        if retval:
            _handle_return(retval, hdfs_exec_logdir)

        time_str = 'Finished task - took ' + util._time_diff(task_start, task_end)
        print('\n' + time_str)
        print(separator)
        hopshdfs.log(time_str)
    except:
        #Always do cleanup
        _cleanup(tb_hdfs_path)
        _stop_gpu_monitor()
        raise
    finally:
        # Best effort: failing to persist local TensorBoard data must not
        # prevent the cleanup below.
        try:
            if local_logdir:
                util._store_local_tensorboard(
                    tensorboard.local_logdir_path, hdfs_exec_logdir)
        except:
            pass

        _cleanup(tb_hdfs_path)
        _stop_gpu_monitor()
def register(hdfs_exec_dir, endpoint_dir, exec_num, local_logdir=False, tensorboard_driver=False):
    """
    Start a TensorBoard process and publish its URL in HDFS.

    A previously registered TensorBoard (``tb_pid != 0``) is sent a ``kill``
    first and the module state is reset through ``_reset_global()``. The new
    process listens on a free port and reads either a freshly created
    ``<cwd>/local_logdir`` directory (``local_logdir=True``) or
    ``hdfs_exec_dir``. The URL ``http://<hostname>:<port>`` is written to
    ``<endpoint_dir>/TensorBoard.driver`` when ``tensorboard_driver`` is set
    and to ``<endpoint_dir>/TensorBoard.task<exec_num>`` otherwise.

    The function updates the module globals ``tb_pid``, ``events_logdir``,
    ``local_logdir_bool``, ``pypath``, ``tb_port``, ``tb_path``,
    ``local_logdir_path``, ``tb_url`` and ``endpoint``.

    Args:
        hdfs_exec_dir: HDFS directory used as TensorBoard logdir when
            ``local_logdir`` is False
        endpoint_dir: HDFS directory in which the endpoint file is written
        exec_num: executor number, used as suffix of the task endpoint file
        local_logdir: True to let TensorBoard read from the local filesystem
        tensorboard_driver: True to register the endpoint as the driver's

    Returns:
        tuple ``(endpoint, tb_pid)``: HDFS path of the endpoint file and the
        pid of the TensorBoard process
    """
    global tb_pid, events_logdir, local_logdir_bool
    global pypath, tb_port, tb_path, local_logdir_path, tb_url, endpoint

    if tb_pid != 0:
        subprocess.Popen(["kill", str(tb_pid)])

    _reset_global()

    events_logdir = hdfs_exec_dir
    local_logdir_bool = local_logdir

    # presumably _reset_global() sets tb_pid back to 0 so that this branch is
    # taken on every call - verify against _reset_global
    if tb_pid == 0:
        pypath = os.getenv("PYSPARK_PYTHON")

        # Find a free port by binding to port 0. The socket is released in a
        # ``finally`` so it is not leaked when locating TensorBoard fails.
        tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            tb_socket.bind(('', 0))
            tb_port = tb_socket.getsockname()[1]
            tb_path = util.find_tensorboard()
        finally:
            tb_socket.close()

        # TensorBoard runs with a copy of the current environment in which
        # CUDA_VISIBLE_DEVICES is emptied.
        tb_env = os.environ.copy()
        tb_env['CUDA_VISIBLE_DEVICES'] = ''

        if local_logdir:
            # Always start from an empty local directory.
            local_logdir_path = os.getcwd() + '/local_logdir'
            if os.path.exists(local_logdir_path):
                shutil.rmtree(local_logdir_path)
            os.makedirs(local_logdir_path)
            tb_logdir = local_logdir_path
        else:
            tb_logdir = events_logdir

        tb_proc = subprocess.Popen(
            [
                pypath,
                tb_path,
                "--logdir=%s" % tb_logdir,
                "--port=%d" % tb_port,
            ],
            env=tb_env,
            preexec_fn=util.on_executor_exit('SIGTERM'))

        tb_pid = tb_proc.pid

        host = socket.gethostname()
        tb_url = "http://{0}:{1}".format(host, tb_port)

        if tensorboard_driver:
            endpoint = endpoint_dir + "/TensorBoard.driver"
        else:
            endpoint = endpoint_dir + "/TensorBoard.task" + str(exec_num)

    # dump tb host:port to hdfs
    pydoop.hdfs.dump(tb_url, endpoint, user=hopshdfs.project_user())

    return endpoint, tb_pid