def start(self):
    """
    Launch the reservation listener on a daemon thread.

    A TCP socket is bound to an ephemeral port on all interfaces and a
    background thread multiplexes it, together with every accepted client
    connection, until ``self.done`` becomes true.

    Returns:
        address of the Server as a tuple of (host, port)
    """
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    listener.bind(('', 0))
    listener.listen(10)

    # hostname may not be resolvable but IP address probably will be
    addr = (experiment_utils._get_ip_address(), listener.getsockname()[1])

    def _listen(self, sock):
        """
        Serve the listening socket and its clients until ``self.done`` is set.

        Args:
            self: server providing ``done``, ``receive`` and ``_handle_message``
            sock: the listening socket to watch for new connections

        Returns:
            None
        """
        watched = [sock]

        while not self.done:
            # the 60 second timeout bounds how long a change of ``self.done``
            # can go unnoticed while no socket is readable
            readable, _writable, _errored = select.select(watched, [], [], 60)
            for ready in readable:
                if ready == listener:
                    conn, peer = ready.accept()
                    watched.append(conn)
                    logging.debug("client connected from {0}".format(peer))
                    continue

                try:
                    msg = self.receive(ready)
                    self._handle_message(ready, msg)
                except Exception as err:
                    # a failing client is logged, closed and no longer watched
                    logging.debug(err)
                    ready.close()
                    watched.remove(ready)

        listener.close()

    worker = threading.Thread(target=_listen, args=(self, listener))
    worker.daemon = True
    worker.start()

    return addr
def __init__(self, server_addr, partition_id, task_attempt, hb_interval, secret):
    """
    Connect to the server and record this client's identity.

    Two separate TCP connections are opened to ``server_addr``: ``self.sock``
    for the main thread and ``self.hb_sock`` for the heartbeat thread.

    Args:
        server_addr: address passed to ``socket.connect`` for both sockets
        partition_id: stored as ``self.partition_id``
        task_attempt: stored as ``self.task_attempt``
        hb_interval: stored as ``self.hb_interval``
        secret: stored as ``self._secret``

    Raises:
        Exception: whatever connecting or the address lookup raises; every
            socket opened so far is closed before the error propagates.
    """
    self.server_addr = server_addr
    self.done = False
    self.partition_id = partition_id
    self.task_attempt = task_attempt
    self.hb_interval = hb_interval
    self._secret = secret

    self.sock = None
    self.hb_sock = None
    try:
        # socket for main thread
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.sock.connect(server_addr)
        # socket for heartbeat thread
        self.hb_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.hb_sock.connect(server_addr)
        self.client_addr = (
            experiment_utils._get_ip_address(),
            self.sock.getsockname()[1],
        )
    except Exception:
        # do not leak half-initialised connections when construction fails
        for conn in (self.hb_sock, self.sock):
            if conn is not None:
                conn.close()
        raise
def _wrapper_fun(iter):
    """
    Run ``map_fun`` on this executor as one member of an all-reduce cluster.

    The executor reserves a free local port, registers ``host:port`` with the
    reservation server at ``server_addr`` and waits for the complete cluster
    description. It then derives its own task entry (chief, worker or
    evaluator) and, when ``num_executors > 1``, exports the description as
    ``TF_CONFIG`` before calling ``map_fun``.

    ``app_id``, ``run_id``, ``server_addr``, ``evaluator``, ``num_executors``,
    ``local_logdir`` and ``map_fun`` are taken from the enclosing scope.

    Args:
        iter: iterable of executor numbers; its last element becomes this
            executor's number.

    Returns:
        None
    """
    for i in iter:
        executor_num = i

    experiment_utils._set_ml_id(app_id, run_id)

    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t.start()

    try:
        host = experiment_utils._get_ip_address()

        # The temporary socket only holds on to a free port while the
        # reservation round-trip is in flight.
        tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        client = None
        try:
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]

            client = allreduce_reservation.Client(server_addr)
            host_port = host + ":" + str(port)

            client.register({"worker": host_port, "index": executor_num})
            cluster = client.await_reservations()
        finally:
            # release both even when registration or waiting fails
            tmp_socket.close()
            if client is not None:
                client.close()

        task_index = experiment_utils._find_index(host_port, cluster)

        if task_index == -1:
            cluster["task"] = {"type": "chief", "index": 0}
        else:
            cluster["task"] = {"type": "worker", "index": task_index}

        if evaluator:
            # the last worker entry is moved into a dedicated evaluator list
            workers = cluster["cluster"]["worker"]
            evaluator_node = workers.pop()
            cluster["cluster"]["evaluator"] = [evaluator_node]
            if evaluator_node == host_port:
                cluster["task"] = {"type": "evaluator", "index": 0}

        print('TF_CONFIG: {} '.format(cluster))

        if num_executors > 1:
            os.environ["TF_CONFIG"] = json.dumps(cluster)

        task_type = cluster["task"]["type"]
        is_chief = task_type == "chief"
        is_evaluator = task_type == "evaluator"

        if is_chief:
            logdir = experiment_utils._get_logdir(app_id, run_id)
            _tb_hdfs_path, _tb_pid = tensorboard._register(
                logdir, logdir, executor_num, local_logdir=local_logdir)
        elif is_evaluator:
            tensorboard.events_logdir = experiment_utils._get_logdir(
                app_id, run_id)

        logfile = experiment_utils._init_logger(
            experiment_utils._get_logdir(app_id, run_id),
            role=task_type,
            index=cluster["task"]["index"])

        print(devices._get_gpu_info())
        print('-------------------------------------------------------')
        print('Started running task')
        task_start = time.time()

        retval = map_fun()

        # only the chief's return value is handed to the return handler
        if is_chief:
            experiment_utils._handle_return_simple(
                retval, experiment_utils._get_logdir(app_id, run_id), logfile)

        task_end = time.time()
        time_str = 'Finished task - took ' + experiment_utils._time_diff(
            task_start, task_end)
        print(time_str)
        print('-------------------------------------------------------')
    finally:
        experiment_utils._cleanup(tensorboard, t)
def get_ip_address(self):
    """
    Return whatever ``experiment_utils._get_ip_address()`` reports.

    Returns:
        the value returned by ``experiment_utils._get_ip_address()``
    """
    ip_address = experiment_utils._get_ip_address()
    return ip_address
def start(self, exp_driver):
    """
    Start listener in a background thread.

    While the module-level ``server_host_port`` is still unset, the socket is
    bound to an ephemeral port, the resulting address is cached in
    ``server_host_port`` and the driver is registered with Hopsworks through
    a REST call. A failed registration is only reported, never raised. When
    ``server_host_port`` is already set, the socket is bound to that cached
    address and registration is skipped.

    Args:
        exp_driver: driver object. Its ``_secret`` authenticates incoming
            messages, its ``_log`` receives diagnostics, and it is handed on
            to ``self._handle_message``.

    Returns:
        address of the Server as a tuple of (host, port)
    """
    global server_host_port

    server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

    if not server_host_port:
        server_sock.bind(("", 0))
        # hostname may not be resolvable but IP address probably will be
        host = experiment_utils._get_ip_address()
        port = server_sock.getsockname()[1]
        server_host_port = (host, port)

        # register this driver with Hopsworks
        sc = hopsutil._find_spark().sparkContext
        app_id = str(sc.applicationId)

        method = hopsconstants.HTTP_CONFIG.HTTP_POST
        resource_url = (
            hopsconstants.DELIMITERS.SLASH_DELIMITER
            + hopsconstants.REST_CONFIG.HOPSWORKS_REST_RESOURCE
            + hopsconstants.DELIMITERS.SLASH_DELIMITER
            + "maggy"
            + hopsconstants.DELIMITERS.SLASH_DELIMITER
            + "drivers"
        )
        json_embeddable = json.dumps({
            "hostIp": host,
            "port": port,
            "appId": app_id,
            "secret": exp_driver._secret,
        })
        headers = {
            hopsconstants.HTTP_CONFIG.HTTP_CONTENT_TYPE:
                hopsconstants.HTTP_CONFIG.HTTP_APPLICATION_JSON
        }

        try:
            response = hopsutil.send_request(
                method, resource_url, data=json_embeddable, headers=headers)
            if (response.status_code // 100) != 2:
                print("No connection to Hopsworks for logging.")
                exp_driver._log("No connection to Hopsworks for logging.")
        except Exception as e:
            # registration is best effort: the server still starts without it
            print("Connection failed to Hopsworks. No logging.")
            exp_driver._log(e)
            exp_driver._log("Connection failed to Hopsworks. No logging.")
    else:
        server_sock.bind(server_host_port)

    server_sock.listen(10)

    def _listen(self, sock, driver):
        """
        Serve ``sock`` and its accepted clients until ``self.done`` is set.

        Args:
            self: server providing ``done``, ``receive`` and ``_handle_message``
            sock: the listening socket
            driver: driver whose secret is checked and which handles messages
        """
        connections = [sock]

        while not self.done:
            read_socks, _, _ = select.select(connections, [], [], 60)
            for ready in read_socks:
                if ready == sock:
                    client_sock, _client_addr = ready.accept()
                    connections.append(client_sock)
                    continue

                try:
                    msg = self.receive(ready)
                    # raise exception if secret does not match
                    # so client socket gets closed
                    if not secrets.compare_digest(
                            msg["secret"], driver._secret):
                        # Secrets are credentials: neither the server's nor
                        # the received one may be written to the log.
                        driver._log(
                            "ERROR: wrong secret received, "
                            "closing client connection")
                        raise Exception("secret mismatch")
                    self._handle_message(ready, msg, driver)
                except Exception:
                    # Deliberately quiet: any failure on a client connection
                    # simply drops that client.
                    ready.close()
                    connections.remove(ready)

        sock.close()

    t = threading.Thread(target=_listen, args=(self, server_sock, exp_driver))
    t.daemon = True
    t.start()

    return server_host_port
def _wrapper_fun(iter):
    """
    Run ``map_fun`` on this executor as a parameter-server cluster member.

    The executor reserves a free local port and registers itself as ``ps``
    (executor numbers below ``num_ps``) or ``worker`` with the reservation
    server. It waits for the cluster description, exports it together with
    its own task entry as ``TF_CONFIG`` and then runs ``map_fun``. A ``ps``
    executor runs ``map_fun`` on a separate thread and blocks until all
    workers have reported that they finished.

    ``app_id``, ``run_id``, ``server_addr``, ``num_ps``, ``evaluator``,
    ``local_logdir`` and ``map_fun`` are taken from the enclosing scope.

    Args:
        iter: iterable of executor numbers; its last element becomes this
            executor's number.

    Returns:
        None
    """
    for i in iter:
        executor_num = i

    experiment_utils._set_ml_id(app_id, run_id)

    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t.start()

    # stays None when the failure happens before the role is known
    role = None

    client = parameter_server_reservation.Client(server_addr)

    try:
        host = experiment_utils._get_ip_address()

        # The temporary socket only holds on to a free port while the
        # reservation round-trip is in flight.
        tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]
            host_port = host + ":" + str(port)

            exec_spec = {
                "task_type": "ps" if executor_num < num_ps else "worker",
                "host_port": host_port,
                "gpus_present": devices.get_num_gpus() > 0,
            }

            client.register(exec_spec)
            cluster = client.await_reservations()
        finally:
            # release the port even when registration or waiting fails
            tmp_socket.close()

        role, index = experiment_utils._find_task_and_index(host_port, cluster)

        cluster_spec = {
            "cluster": cluster,
            "task": {"type": role, "index": index},
        }

        if evaluator:
            # the last worker entry is moved into a dedicated evaluator list
            workers = cluster_spec["cluster"]["worker"]
            evaluator_node = workers.pop()
            cluster_spec["cluster"]["evaluator"] = [evaluator_node]
            if evaluator_node == host_port:
                role = "evaluator"
                cluster_spec["task"] = {"type": "evaluator", "index": 0}

        print('TF_CONFIG: {} '.format(cluster_spec))

        os.environ["TF_CONFIG"] = json.dumps(cluster_spec)

        logfile = experiment_utils._init_logger(
            experiment_utils._get_logdir(app_id, run_id),
            role=role,
            index=cluster_spec["task"]["index"])

        dist_logdir = experiment_utils._get_logdir(app_id, run_id) + '/logdir'

        # The task entry is stored on cluster_spec; this block never adds a
        # "task" key to the raw cluster mapping.
        is_chief = (cluster_spec["task"]["type"] == "chief")
        if is_chief:
            hdfs.mkdir(dist_logdir)
            tensorboard._register(
                dist_logdir,
                experiment_utils._get_logdir(app_id, run_id),
                executor_num,
                local_logdir=local_logdir)
        else:
            tensorboard.events_logdir = dist_logdir

        print(devices._get_gpu_info())
        print('-------------------------------------------------------')
        print('Started running task')
        task_start = time.time()

        retval = None
        if role == "ps":
            # map_fun runs on its own thread while this thread waits for
            # every worker to report completion
            ps_thread = threading.Thread(target=map_fun)
            ps_thread.start()
            client.await_all_workers_finished()
        else:
            retval = map_fun()

        if role == "chief":
            experiment_utils._handle_return_simple(
                retval, experiment_utils._get_logdir(app_id, run_id), logfile)

        task_end = time.time()
        time_str = 'Finished task - took ' + experiment_utils._time_diff(
            task_start, task_end)
        print(time_str)
        print('-------------------------------------------------------')
    finally:
        try:
            if role != "ps":
                client.register_worker_finished()
        finally:
            # always release the client and run cleanup, even if the
            # completion report itself fails
            client.close()
            experiment_utils._cleanup(tensorboard, t)