def _wrapper_fun(iter):
    """Run one random-search trial of ``map_fun`` on this executor.

    Uses names that are not defined in this function (``app_id``,
    ``run_id``, ``map_fun``, ``args_dict``, ``local_logdir``,
    ``optimization_key``). When ``args_dict`` is falsy nothing is run; only
    the ML id is set and the cleanup helper is invoked.

    Args:
        iter: Iterable of executor numbers. The last value it yields is
            used as the executor number for this trial.

    Returns:
        None.
    """
    # Only the last yielded value survives the loop.
    # NOTE(review): presumably the iterable carries exactly one executor
    # number -- confirm against the caller.
    for executor_num in iter:
        pass

    experiment_utils._set_ml_id(app_id, run_id)

    gpu_monitor = threading.Thread(
        target=devices._print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        gpu_monitor.start()

    banner = '-------------------------------------------------------'

    try:
        if args_dict:
            param_string, params, call_args = \
                experiment_utils.build_parameters(
                    map_fun, executor_num, args_dict)

            exec_logdir, appid_logdir = \
                experiment_utils._create_experiment_subdirectories(
                    app_id, run_id, param_string, 'random_search',
                    params=params)
            logfile = experiment_utils._init_logger(exec_logdir)

            # The returned pair is not used further; it is still unpacked
            # so that the shape of the return value is checked as before.
            _tb_endpoint, _tb_pid = tensorboard._register(
                exec_logdir, appid_logdir, executor_num,
                local_logdir=local_logdir)

            print(devices._get_gpu_info())
            print(banner)
            print('Started running task ' + param_string)

            started_at = time.time()
            metric = map_fun(*call_args)
            finished_at = time.time()

            experiment_utils._handle_return(
                metric, exec_logdir, optimization_key, logfile)

            elapsed = experiment_utils._time_diff(started_at, finished_at)
            print('Finished task ' + param_string + ' - took ' + elapsed)
            print('Returning metric ' + str(metric))
            print(banner)
    finally:
        # Runs on success and on failure; any exception still propagates.
        experiment_utils._cleanup(tensorboard, gpu_monitor)
def _wrapper_fun(iter):
    """Run, or reuse, one differential-evolution trial on this executor.

    Uses names that are not defined in this function (``app_id``,
    ``run_id``, ``generation_id``, ``map_fun``, ``args_dict``, ``opt_key``)
    plus the module-level ``local_logdir_bool``. If ``_get_return_file``
    yields a value it is JSON-decoded and used as the result; ``map_fun`` is
    only called when no result is available after that step.

    Args:
        iter: Iterable of executor numbers. The last value it yields is
            used as the executor number for this trial.

    Returns:
        None.
    """
    global local_logdir_bool

    # Only the last yielded value survives the loop.
    # NOTE(review): presumably the iterable carries exactly one executor
    # number -- confirm against the caller.
    for executor_num in iter:
        pass

    experiment_utils._set_ml_id(app_id, run_id)

    gpu_monitor = threading.Thread(
        target=devices._print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        gpu_monitor.start()

    banner = '-------------------------------------------------------'

    try:
        if args_dict:
            param_string, params, call_args = \
                experiment_utils.build_parameters(
                    map_fun, executor_num, args_dict)

            result = _get_return_file(
                param_string, app_id, generation_id, run_id)

            exec_logdir, appid_logdir = \
                experiment_utils._create_experiment_subdirectories(
                    app_id, run_id, param_string, 'differential_evolution',
                    sub_type='generation.' + str(generation_id),
                    params=params)
            logfile = experiment_utils._init_logger(exec_logdir)

            # The returned pair is not used further; it is still unpacked
            # so that the shape of the return value is checked as before.
            _tb_endpoint, _tb_pid = tensorboard._register(
                exec_logdir, appid_logdir, executor_num,
                local_logdir=local_logdir_bool)

            print(devices._get_gpu_info())
            print(banner)
            print('Started running task ' + param_string)

            # These two checks are deliberately not an if/else: a stored
            # value that decodes to None still triggers a fresh run.
            if result is not None:
                result = json.loads(result)

            started_at = time.time()
            if result is None:
                result = map_fun(*call_args)
            finished_at = time.time()

            elapsed = experiment_utils._time_diff(started_at, finished_at)
            print('Finished task ' + param_string + ' - took ' + elapsed)

            experiment_utils._handle_return(
                result, exec_logdir, opt_key, logfile)

            print('Returning metric ' + str(result))
            print(banner)
    finally:
        # Runs on success and on failure; any exception still propagates.
        experiment_utils._cleanup(tensorboard, gpu_monitor)
def _wrapper_fun(iter):
    """Run ``map_fun`` once on this executor, with or without arguments.

    Uses names that are not defined in this function (``app_id``,
    ``run_id``, ``map_fun``, ``args_dict``, ``local_logdir``). When
    ``args_dict`` is truthy, one value per positional parameter of
    ``map_fun`` is taken from ``args_dict[name][executor_num]``; otherwise
    ``map_fun`` is called without arguments.

    Cleanup (best-effort TensorBoard copy, ``_cleanup``, stopping the GPU
    monitor thread) runs exactly once, whether or not the task raised.

    Args:
        iter: Iterable of executor numbers. The last value it yields is
            used as ``executor_num``.

    Returns:
        None.

    Raises:
        Any exception raised during setup or by ``map_fun`` is propagated
        after cleanup.
    """
    # Only the last yielded value survives the loop.
    # NOTE(review): presumably the iterable carries exactly one executor
    # number -- confirm against the caller.
    for i in iter:
        executor_num = i

    tb_pid = 0
    tb_hdfs_path = ''
    hdfs_exec_logdir = ''

    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    # Remember whether the thread was started: join() on a thread that was
    # never started raises RuntimeError.
    monitor_started = devices.get_num_gpus() > 0
    if monitor_started:
        t.start()

    separator = '-------------------------------------------------------'
    has_args = bool(args_dict)

    try:
        if has_args:
            # Positional parameter names of map_fun, in declaration order.
            code = six.get_function_code(map_fun)
            arg_names = code.co_varnames[:code.co_argcount]
            args = [args_dict[name][executor_num] for name in arg_names]
            # "name=value" pairs joined by '.', e.g. "lr=0.1.batch=32".
            param_string = '.'.join(
                str(name) + '=' + str(value)
                for name, value in zip(arg_names, args))
            task_label = 'task ' + param_string
        else:
            args = []
            param_string = None
            task_label = 'task'

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
            app_id, run_id, param_string, 'launcher')
        pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'],
                         user=hopshdfs.project_user())
        hopshdfs._init_logger()
        tb_hdfs_path, tb_pid = tensorboard._register(
            hdfs_exec_logdir, hdfs_appid_logdir, executor_num,
            local_logdir=local_logdir)

        gpu_str = ('\nChecking for GPUs in the environment'
                   + devices._get_gpu_info())
        hopshdfs.log(gpu_str)
        print(gpu_str)
        print(separator)
        print('Started running ' + task_label + '\n')
        hopshdfs.log('Started running ' + task_label)

        task_start = datetime.datetime.now()
        retval = map_fun(*args)
        task_end = datetime.datetime.now()

        # The return value is only persisted for the argument-less call,
        # exactly as before.
        # NOTE(review): confirm that discarding the result of
        # map_fun(*args) is intentional.
        if not has_args and retval:
            _handle_return(retval, hdfs_exec_logdir)

        time_str = ('Finished ' + task_label + ' - took '
                    + util._time_diff(task_start, task_end))
        print('\n' + time_str)
        print(separator)
        hopshdfs.log(time_str)
    finally:
        # Single cleanup path. Previously an ``except`` clause ran the same
        # cleanup first, so on failure it was executed twice.
        try:
            if local_logdir:
                util._store_local_tensorboard(
                    tensorboard.local_logdir_path, hdfs_exec_logdir)
        except Exception:
            # Best effort: failing to copy TensorBoard data must not hide
            # the task outcome.
            pass
        try:
            _cleanup(tb_hdfs_path)
        finally:
            # Stop the monitor thread even if _cleanup itself fails.
            if monitor_started:
                # presumably polled by the monitor loop to exit -- confirm
                t.do_run = False
                t.join()
def _wrapper_fun(iter):
    """Join the all-reduce cluster and run ``map_fun`` on this executor.

    Uses names that are not defined in this function (``app_id``,
    ``run_id``, ``map_fun``, ``server_addr``, ``local_logdir``). The
    executor registers ``host:port`` with the reservation server, waits for
    the cluster description, writes it to ``TF_CONFIG`` with its own task
    entry and then calls ``map_fun``. Log directories, the logger and
    TensorBoard are only set up by the executor whose task index is 0.

    Cleanup (TensorBoard copy on task 0, ``_cleanup``, stopping the GPU
    monitor thread) runs exactly once, whether or not the task raised.

    Args:
        iter: Iterable of executor numbers. The last value it yields is
            used as ``executor_num``.

    Returns:
        None.

    Raises:
        Any exception raised during setup or by ``map_fun`` is propagated
        after cleanup.
    """
    # Only the last yielded value survives the loop.
    # NOTE(review): presumably the iterable carries exactly one executor
    # number -- confirm against the caller.
    for i in iter:
        executor_num = i

    tb_hdfs_path = ''
    hdfs_exec_logdir = ''

    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    # Remember whether the thread was started: join() on a thread that was
    # never started raises RuntimeError.
    monitor_started = devices.get_num_gpus() > 0
    if monitor_started:
        t.start()

    task_index = None
    separator = '-------------------------------------------------------'

    try:
        host = util._get_ip_address()

        # Bind to port 0 so the OS picks a free port; the socket is kept
        # open until the reservation round has finished.
        tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]
            host_port = host + ":" + str(port)

            client = allreduce_reservation.Client(server_addr)
            try:
                client.register({"worker": host_port,
                                 "index": executor_num})
                cluster = client.await_reservations()
            finally:
                # Close the client even when registration fails.
                client.close()
        finally:
            tmp_socket.close()

        task_index = _find_index(host_port, cluster)
        cluster["task"] = {"type": "worker", "index": task_index}
        os.environ["TF_CONFIG"] = json.dumps(cluster)

        is_first = task_index == 0

        if is_first:
            hdfs_exec_logdir, hdfs_appid_logdir = \
                hopshdfs._create_directories(
                    app_id, run_id, None, 'collective_all_reduce')
            pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'],
                             user=hopshdfs.project_user())
            hopshdfs._init_logger()
            tb_hdfs_path, tb_pid = tensorboard._register(
                hdfs_exec_logdir, hdfs_appid_logdir, executor_num,
                local_logdir=local_logdir)

        gpu_str = ('\nChecking for GPUs in the environment'
                   + devices._get_gpu_info())
        if is_first:
            hopshdfs.log(gpu_str)
        print(gpu_str)
        print(separator)
        print('Started running task \n')
        if is_first:
            hopshdfs.log('Started running task')

        task_start = datetime.datetime.now()
        retval = map_fun()

        if is_first and retval:
            _handle_return(retval, hdfs_exec_logdir)

        task_end = datetime.datetime.now()
        time_str = ('Finished task - took '
                    + util._time_diff(task_start, task_end))
        print('\n' + time_str)
        print(separator)
        if is_first:
            hopshdfs.log(time_str)
    finally:
        # Single cleanup path. Previously an ``except`` clause ran the same
        # cleanup first, so on failure it was executed twice.
        try:
            if task_index == 0 and local_logdir:
                util._store_local_tensorboard(
                    tensorboard.local_logdir_path, hdfs_exec_logdir)
        finally:
            # A failing TensorBoard copy still propagates, but it no longer
            # skips _cleanup or leaves the monitor thread running.
            try:
                _cleanup(tb_hdfs_path)
            finally:
                if monitor_started:
                    # presumably polled by the monitor loop to exit
                    # -- confirm
                    t.do_run = False
                    t.join()
def _wrapper_fun(iter):
    """Join the all-reduce cluster and run ``map_fun`` on this executor.

    Uses names that are not defined in this function (``app_id``,
    ``run_id``, ``map_fun``, ``server_addr``, ``evaluator``,
    ``num_executors``, ``local_logdir``). The executor registers
    ``host:port`` with the reservation server and receives the cluster
    description. It becomes "chief" when ``_find_index`` returns -1 and
    "worker" otherwise. When ``evaluator`` is truthy, the last listed worker
    is moved into a separate "evaluator" entry. ``TF_CONFIG`` is only
    exported when ``num_executors > 1``.

    Args:
        iter: Iterable of executor numbers. The last value it yields is
            used as ``executor_num``.

    Returns:
        None.

    Raises:
        Any exception raised during setup or by ``map_fun`` is propagated
        after ``experiment_utils._cleanup`` has run.
    """
    # Only the last yielded value survives the loop.
    # NOTE(review): presumably the iterable carries exactly one executor
    # number -- confirm against the caller.
    for i in iter:
        executor_num = i

    experiment_utils._set_ml_id(app_id, run_id)

    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t.start()

    is_chief = False
    separator = '-------------------------------------------------------'

    try:
        host = experiment_utils._get_ip_address()

        # Bind to port 0 so the OS picks a free port; the socket is kept
        # open until the reservation round has finished. Both the socket
        # and the client are now released even if registration fails.
        tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]
            host_port = host + ":" + str(port)

            client = allreduce_reservation.Client(server_addr)
            try:
                client.register({"worker": host_port,
                                 "index": executor_num})
                cluster = client.await_reservations()
            finally:
                client.close()
        finally:
            tmp_socket.close()

        task_index = experiment_utils._find_index(host_port, cluster)
        if task_index == -1:
            cluster["task"] = {"type": "chief", "index": 0}
        else:
            cluster["task"] = {"type": "worker", "index": task_index}

        if evaluator:
            # Move the last worker into its own "evaluator" entry.
            workers = cluster["cluster"]["worker"]
            evaluator_node = workers[-1]
            cluster["cluster"]["evaluator"] = [evaluator_node]
            del workers[-1]
            if evaluator_node == host_port:
                cluster["task"] = {"type": "evaluator", "index": 0}

        print('TF_CONFIG: {} '.format(cluster))

        if num_executors > 1:
            os.environ["TF_CONFIG"] = json.dumps(cluster)

        task_type = cluster["task"]["type"]
        is_chief = (task_type == "chief")
        is_evaluator = (task_type == "evaluator")

        if is_chief:
            logdir = experiment_utils._get_logdir(app_id, run_id)
            tb_hdfs_path, tb_pid = tensorboard._register(
                logdir, logdir, executor_num, local_logdir=local_logdir)
        elif is_evaluator:
            logdir = experiment_utils._get_logdir(app_id, run_id)
            tensorboard.events_logdir = logdir

        logfile = experiment_utils._init_logger(
            experiment_utils._get_logdir(app_id, run_id),
            role=task_type, index=cluster["task"]["index"])

        print(devices._get_gpu_info())
        print(separator)
        print('Started running task')
        task_start = time.time()

        retval = map_fun()

        # Only the chief persists the return value.
        if is_chief:
            experiment_utils._handle_return_simple(
                retval, experiment_utils._get_logdir(app_id, run_id),
                logfile)

        task_end = time.time()
        time_str = ('Finished task - took '
                    + experiment_utils._time_diff(task_start, task_end))
        print(time_str)
        print(separator)
    finally:
        # Runs on success and on failure; any exception still propagates.
        experiment_utils._cleanup(tensorboard, t)
def _wrapper_fun(iter):
    """Join the parameter-server cluster and run ``map_fun`` here.

    Uses names that are not defined in this function (``app_id``,
    ``run_id``, ``map_fun``, ``server_addr``, ``num_ps``,
    ``local_logdir``). Executors whose number is below ``num_ps`` register
    as "ps", the others as "worker". The role and index actually assigned
    come from ``_find_task_and_index`` and are exported through
    ``TF_CONFIG``. A "ps" executor runs ``map_fun`` in a background thread
    and blocks until all workers report completion; every other role calls
    ``map_fun`` directly. Only the "chief" sets up log directories, the
    logger and TensorBoard.

    Cleanup runs exactly once, whether or not the task raised. Reporting
    completion to the reservation server, ``_cleanup`` and stopping the GPU
    monitor thread are all attempted even if an earlier cleanup step fails.

    Args:
        iter: Iterable of executor numbers. The last value it yields is
            used as ``executor_num``.

    Returns:
        None.

    Raises:
        Any exception raised during setup or by ``map_fun`` is propagated
        after cleanup.
    """
    # Only the last yielded value survives the loop.
    # NOTE(review): presumably the iterable carries exactly one executor
    # number -- confirm against the caller.
    for i in iter:
        executor_num = i

    tb_hdfs_path = ''
    hdfs_exec_logdir = ''

    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    # Remember whether the thread was started: join() on a thread that was
    # never started raises RuntimeError.
    monitor_started = devices.get_num_gpus() > 0
    if monitor_started:
        t.start()

    role = None
    separator = '-------------------------------------------------------'

    client = parameter_server_reservation.Client(server_addr)

    try:
        host = util._get_ip_address()

        # Bind to port 0 so the OS picks a free port; the socket is kept
        # open until the reservation round has finished and is now closed
        # even if registration fails.
        tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]
            host_port = host + ":" + str(port)

            exec_spec = {
                "task_type": "ps" if executor_num < num_ps else "worker",
                "host_port": host_port,
                "gpus_present": devices.get_num_gpus() > 0,
            }

            client.register(exec_spec)
            cluster = client.await_reservations()
        finally:
            tmp_socket.close()

        role, index = _find_task_and_index(host_port, cluster)

        cluster_spec = {
            "cluster": cluster,
            "task": {"type": role, "index": index},
        }
        print(cluster_spec)
        os.environ["TF_CONFIG"] = json.dumps(cluster_spec)

        is_chief = role == "chief"

        if is_chief:
            hdfs_exec_logdir, hdfs_appid_logdir = \
                hopshdfs._create_directories(
                    app_id, run_id, None, 'parameter_server')
            pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'],
                             user=hopshdfs.project_user())
            hopshdfs._init_logger()
            tb_hdfs_path, tb_pid = tensorboard._register(
                hdfs_exec_logdir, hdfs_appid_logdir, executor_num,
                local_logdir=local_logdir)

        gpu_str = ('\nChecking for GPUs in the environment'
                   + devices._get_gpu_info())
        if is_chief:
            hopshdfs.log(gpu_str)
        print(gpu_str)
        print(separator)
        print('Started running task \n')
        if is_chief:
            hopshdfs.log('Started running task')

        task_start = datetime.datetime.now()

        retval = None
        if role == "ps":
            # map_fun runs in the background on a parameter server; this
            # thread waits until every worker has reported completion.
            ps_thread = threading.Thread(target=lambda: map_fun())
            ps_thread.start()
            print("waiting for workers")
            client.await_all_workers_finished()
            print("waiting finished")
        else:
            retval = map_fun()

        if is_chief and retval:
            _handle_return(retval, hdfs_exec_logdir)

        task_end = datetime.datetime.now()
        time_str = ('Finished task - took '
                    + util._time_diff(task_start, task_end))
        print('\n' + time_str)
        print(separator)
        if is_chief:
            hopshdfs.log(time_str)
    finally:
        # Single cleanup path. Previously an ``except`` clause ran
        # _cleanup and the thread stop first, so on failure they were
        # executed twice.
        try:
            if role == "chief" and local_logdir:
                util._store_local_tensorboard(
                    tensorboard.local_logdir_path, hdfs_exec_logdir)
        finally:
            # A failing TensorBoard copy still propagates, but it must not
            # prevent this executor from reporting completion, which the
            # "ps" role waits for in await_all_workers_finished().
            try:
                if role in ("worker", "chief"):
                    client.register_worker_finished()
                client.close()
            except Exception:
                # Best effort, as before.
                pass
            try:
                _cleanup(tb_hdfs_path)
            finally:
                if monitor_started:
                    # presumably polled by the monitor loop to exit
                    # -- confirm
                    t.do_run = False
                    t.join()
def _wrapper_fun(iter):
    """Run, or reuse, one differential-evolution trial on this executor.

    Uses names that are not defined in this function (``app_id``,
    ``run_id``, ``generation_id``, ``map_fun``, ``args_dict``) plus the
    module-level ``local_logdir_bool``. One value per positional parameter
    of ``map_fun`` is taken from ``args_dict[name][executor_num]``. If
    ``_get_metric`` returns a truthy value for the resulting parameter
    string, that value is reused; otherwise ``map_fun`` is called. The
    metric is written as a float to ``<exec logdir>/metric``. When
    ``args_dict`` is falsy no trial is run.

    Cleanup runs exactly once, whether or not the task raised.

    Args:
        iter: Iterable of executor numbers. The last value it yields is
            used as ``executor_num``.

    Returns:
        None.

    Raises:
        ValueError: If the metric cannot be converted with ``int()``.
        Any other exception raised during setup or by ``map_fun`` is
        propagated after cleanup.
    """
    global local_logdir_bool

    # Only the last yielded value survives the loop.
    # NOTE(review): presumably the iterable carries exactly one executor
    # number -- confirm against the caller.
    for i in iter:
        executor_num = i

    tb_pid = 0
    tb_hdfs_path = ''
    hdfs_exec_logdir = ''

    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    # Remember whether the thread was started: join() on a thread that was
    # never started raises RuntimeError.
    monitor_started = devices.get_num_gpus() > 0
    if monitor_started:
        t.start()

    separator = '-------------------------------------------------------'

    try:
        if args_dict:
            # Positional parameter names of map_fun, in declaration order.
            code = six.get_function_code(map_fun)
            arg_names = code.co_varnames[:code.co_argcount]
            args = [args_dict[name][executor_num] for name in arg_names]
            # "name=value" pairs joined by '.', e.g. "lr=0.1.batch=32".
            param_string = '.'.join(
                str(name) + '=' + str(value)
                for name, value in zip(arg_names, args))

            val = _get_metric(param_string, app_id, generation_id, run_id)

            hdfs_exec_logdir, hdfs_appid_logdir = \
                hopshdfs._create_directories(
                    app_id, run_id, param_string, 'differential_evolution',
                    sub_type='generation.' + str(generation_id))
            pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'],
                             user=hopshdfs.project_user())
            hopshdfs._init_logger()
            tb_hdfs_path, tb_pid = tensorboard._register(
                hdfs_exec_logdir, hdfs_appid_logdir, executor_num,
                local_logdir=local_logdir_bool)

            gpu_str = ('\nChecking for GPUs in the environment'
                       + devices._get_gpu_info())
            hopshdfs.log(gpu_str)
            print(gpu_str)
            print(separator)
            print('Started running task ' + param_string + '\n')
            if val:
                print('Reading returned metric from previous run: '
                      + str(val))
            hopshdfs.log('Started running task ' + param_string)

            task_start = datetime.datetime.now()
            # NOTE(review): a falsy stored metric (e.g. 0) is treated as
            # "no previous result" and the trial is re-run -- confirm this
            # is intended.
            if not val:
                val = map_fun(*args)
            task_end = datetime.datetime.now()

            time_str = ('Finished task ' + param_string + ' - took '
                        + util._time_diff(task_start, task_end))
            print('\n' + time_str)
            hopshdfs.log(time_str)

            # Validate that the metric is numeric. ``except Exception``
            # rather than a bare ``except`` so that KeyboardInterrupt and
            # SystemExit are not rewritten into ValueError.
            try:
                int(val)
            except Exception:
                raise ValueError(
                    'Your function needs to return a metric (number) which should be maximized or minimized'
                )

            metric_file = hdfs_exec_logdir + '/metric'
            fs_handle = hopshdfs.get_fs()
            # The keyword naming the open mode differs between the two
            # attempts; presumably this covers different pydoop versions
            # -- confirm.
            try:
                fd = fs_handle.open_file(metric_file, mode='w')
            except Exception:
                fd = fs_handle.open_file(metric_file, flags='w')
            try:
                fd.write(str(float(val)).encode())
                fd.flush()
            finally:
                # Release the handle even if the write fails.
                fd.close()

            print('Returning metric ' + str(val))
            print(separator)
    finally:
        # Single cleanup path. Previously an ``except`` clause ran
        # _cleanup and the thread stop first, so on failure they were
        # executed twice.
        try:
            if local_logdir_bool:
                util._store_local_tensorboard(
                    tensorboard.local_logdir_path, hdfs_exec_logdir)
            hopshdfs.log('Finished running')
        finally:
            # A failure above still propagates, but it no longer skips
            # _cleanup or leaves the monitor thread running.
            try:
                if tb_hdfs_path:
                    _cleanup(tb_hdfs_path)
            finally:
                if monitor_started:
                    # presumably polled by the monitor loop to exit
                    # -- confirm
                    t.do_run = False
                    # Bounded wait so a stuck monitor cannot block the
                    # executor indefinitely.
                    t.join(20)
def _wrapper_fun(iter):
    """Run ``map_fun`` once on this executor for a 'mirrored' experiment.

    Uses names that are not defined in this function (``app_id``,
    ``run_id``, ``map_fun``, ``local_logdir``). It creates the experiment
    directories, initialises the logger, registers TensorBoard, calls
    ``map_fun()`` and passes a truthy return value to ``_handle_return``.

    Cleanup (best-effort TensorBoard copy, ``_cleanup``, stopping the GPU
    monitor thread) runs exactly once, whether or not the task raised.

    Args:
        iter: Iterable of executor numbers. The last value it yields is
            used as ``executor_num``.

    Returns:
        None.

    Raises:
        Any exception raised during setup or by ``map_fun`` is propagated
        after cleanup.
    """
    # Only the last yielded value survives the loop.
    # NOTE(review): presumably the iterable carries exactly one executor
    # number -- confirm against the caller.
    for i in iter:
        executor_num = i

    tb_pid = 0
    tb_hdfs_path = ''
    hdfs_exec_logdir = ''

    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    # Remember whether the thread was started: join() on a thread that was
    # never started raises RuntimeError.
    monitor_started = devices.get_num_gpus() > 0
    if monitor_started:
        t.start()

    separator = '-------------------------------------------------------'

    try:
        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
            app_id, run_id, None, 'mirrored')
        pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'],
                         user=hopshdfs.project_user())
        hopshdfs._init_logger()
        tb_hdfs_path, tb_pid = tensorboard._register(
            hdfs_exec_logdir, hdfs_appid_logdir, executor_num,
            local_logdir=local_logdir)

        gpu_str = ('\nChecking for GPUs in the environment'
                   + devices._get_gpu_info())
        hopshdfs.log(gpu_str)
        print(gpu_str)
        print(separator)
        print('Started running task\n')
        hopshdfs.log('Started running task')

        task_start = datetime.datetime.now()
        retval = map_fun()
        task_end = datetime.datetime.now()

        if retval:
            _handle_return(retval, hdfs_exec_logdir)

        time_str = ('Finished task - took '
                    + util._time_diff(task_start, task_end))
        print('\n' + time_str)
        print(separator)
        hopshdfs.log(time_str)
    finally:
        # Single cleanup path. Previously an ``except`` clause ran the same
        # cleanup first, so on failure it was executed twice.
        try:
            if local_logdir:
                util._store_local_tensorboard(
                    tensorboard.local_logdir_path, hdfs_exec_logdir)
        except Exception:
            # Best effort: failing to copy TensorBoard data must not hide
            # the task outcome.
            pass
        try:
            _cleanup(tb_hdfs_path)
        finally:
            # Stop the monitor thread even if _cleanup itself fails.
            if monitor_started:
                # presumably polled by the monitor loop to exit -- confirm
                t.do_run = False
                t.join()
def _wrapper_fun(iter):
    """Join the parameter-server cluster and run ``map_fun`` here.

    Uses names that are not defined in this function (``app_id``,
    ``run_id``, ``map_fun``, ``server_addr``, ``num_ps``, ``evaluator``,
    ``local_logdir``). Executors whose number is below ``num_ps`` register
    as "ps", the others as "worker". The role and index actually assigned
    come from ``experiment_utils._find_task_and_index``. When ``evaluator``
    is truthy, the last listed worker is moved into a separate "evaluator"
    entry. The resulting spec is exported through ``TF_CONFIG``. A "ps"
    executor runs ``map_fun`` in a background thread and blocks until all
    workers report completion; every other role calls ``map_fun`` directly.

    Args:
        iter: Iterable of executor numbers. The last value it yields is
            used as ``executor_num``.

    Returns:
        None.

    Raises:
        Any exception raised during setup or by ``map_fun`` is propagated
        after cleanup.
    """
    # Only the last yielded value survives the loop.
    # NOTE(review): presumably the iterable carries exactly one executor
    # number -- confirm against the caller.
    for i in iter:
        executor_num = i

    experiment_utils._set_ml_id(app_id, run_id)

    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t.start()

    role = None
    separator = '-------------------------------------------------------'

    client = parameter_server_reservation.Client(server_addr)

    try:
        host = experiment_utils._get_ip_address()

        # Bind to port 0 so the OS picks a free port; the socket is kept
        # open until the reservation round has finished and is now closed
        # even if registration fails.
        tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]
            host_port = host + ":" + str(port)

            exec_spec = {
                "task_type": "ps" if executor_num < num_ps else "worker",
                "host_port": host_port,
                "gpus_present": devices.get_num_gpus() > 0,
            }

            client.register(exec_spec)
            cluster = client.await_reservations()
        finally:
            tmp_socket.close()

        role, index = experiment_utils._find_task_and_index(
            host_port, cluster)

        cluster_spec = {
            "cluster": cluster,
            "task": {"type": role, "index": index},
        }

        if evaluator:
            # Move the last worker into its own "evaluator" entry.
            workers = cluster_spec["cluster"]["worker"]
            evaluator_node = workers[-1]
            cluster_spec["cluster"]["evaluator"] = [evaluator_node]
            del workers[-1]
            if evaluator_node == host_port:
                role = "evaluator"
                cluster_spec["task"] = {"type": "evaluator", "index": 0}

        print('TF_CONFIG: {} '.format(cluster_spec))
        os.environ["TF_CONFIG"] = json.dumps(cluster_spec)

        logfile = experiment_utils._init_logger(
            experiment_utils._get_logdir(app_id, run_id),
            role=role, index=cluster_spec["task"]["index"])

        dist_logdir = experiment_utils._get_logdir(app_id, run_id) + '/logdir'

        # The task entry is stored in cluster_spec. ``cluster`` is the
        # object placed under cluster_spec["cluster"] and is never given a
        # "task" key in this function, so the previous lookup of
        # cluster["task"] could not work.
        is_chief = (cluster_spec["task"]["type"] == "chief")
        if is_chief:
            hdfs.mkdir(dist_logdir)
            tensorboard._register(
                dist_logdir, experiment_utils._get_logdir(app_id, run_id),
                executor_num, local_logdir=local_logdir)
        else:
            tensorboard.events_logdir = dist_logdir

        print(devices._get_gpu_info())
        print(separator)
        print('Started running task')
        task_start = time.time()

        retval = None
        if role == "ps":
            # map_fun runs in the background on a parameter server; this
            # thread waits until every worker has reported completion.
            ps_thread = threading.Thread(target=lambda: map_fun())
            ps_thread.start()
            client.await_all_workers_finished()
        else:
            retval = map_fun()

        if role == "chief":
            experiment_utils._handle_return_simple(
                retval, experiment_utils._get_logdir(app_id, run_id),
                logfile)

        task_end = time.time()
        time_str = ('Finished task - took '
                    + experiment_utils._time_diff(task_start, task_end))
        print(time_str)
        print(separator)
    finally:
        # Each step runs even if the previous one raises, so a failed
        # completion report can no longer leave the client open or skip
        # experiment_utils._cleanup.
        try:
            try:
                if role != "ps":
                    client.register_worker_finished()
            finally:
                client.close()
        finally:
            experiment_utils._cleanup(tensorboard, t)