def start(self):
        """
        Start the listener in a background daemon thread.

        A TCP socket is bound to an OS-assigned port on all interfaces and
        served by the thread until ``self.done`` becomes true.

        Returns:
            address of the Server as a tuple of (host, port)
        """
        server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        server_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        server_sock.bind(('', 0))
        server_sock.listen(10)

        # hostname may not be resolvable but IP address probably will be
        host = experiment_utils._get_ip_address()
        port = server_sock.getsockname()[1]
        addr = (host, port)

        def _listen(self, sock):
            """
            Accept clients and dispatch their messages until ``self.done``.

            Args:
                self: the server instance; provides ``done``, ``receive``
                    and ``_handle_message``.
                sock: the listening socket.

            Returns:
                None
            """
            connections = [sock]

            try:
                while not self.done:
                    # The 60s timeout makes the loop re-check ``self.done``
                    # even when no socket becomes readable.
                    readable, _, _ = select.select(connections, [], [], 60)
                    for ready in readable:
                        if ready == server_sock:
                            client_sock, client_addr = ready.accept()
                            connections.append(client_sock)
                            logging.debug(
                                "client connected from {0}".format(
                                    client_addr))
                        else:
                            try:
                                msg = self.receive(ready)
                                self._handle_message(ready, msg)
                            except Exception as e:
                                # Any failure on a client connection drops
                                # that client only; the server keeps running.
                                logging.debug(e)
                                ready.close()
                                connections.remove(ready)
            finally:
                # Close the listening socket and every client still
                # connected, also when the loop exits through an exception.
                for conn in connections:
                    try:
                        conn.close()
                    except OSError as e:
                        logging.debug(e)

        t = threading.Thread(target=_listen, args=(self, server_sock))
        t.daemon = True
        t.start()

        return addr
示例#2
0
文件: rpc.py 项目: carlee0/maggy
 def __init__(self, server_addr, partition_id, task_attempt, hb_interval,
              secret):
     """
     Open the main and heartbeat connections to the server.

     Args:
         server_addr: address passed to ``connect`` for both sockets.
         partition_id: stored as ``self.partition_id``.
         task_attempt: stored as ``self.task_attempt``.
         hb_interval: stored as ``self.hb_interval``.
         secret: stored as ``self._secret``.

     Raises:
         Whatever ``connect`` or the IP lookup raises; sockets opened so
         far are closed before the exception propagates.
     """
     # socket for main thread
     self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     # socket for heartbeat thread
     self.hb_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     try:
         self.sock.connect(server_addr)
         self.hb_sock.connect(server_addr)
         self.client_addr = (
             experiment_utils._get_ip_address(),
             self.sock.getsockname()[1],
         )
     except BaseException:
         # Construction failed: nobody else will ever close these sockets.
         self.hb_sock.close()
         self.sock.close()
         raise
     self.server_addr = server_addr
     self.done = False
     self.partition_id = partition_id
     self.task_attempt = task_attempt
     self.hb_interval = hb_interval
     self._secret = secret
示例#3
0
    def _wrapper_fun(iter):
        """
        Run ``map_fun`` on one executor of an all-reduce cluster.

        Registers the executor with the reservation server, waits for the
        cluster layout, derives this executor's task (chief, worker or
        evaluator), exports ``TF_CONFIG`` when more than one executor is
        used, and runs ``map_fun``. Only the chief handles the return value.

        Args:
            iter: iterable whose last element is used as the executor number.

        Returns:
            None
        """

        for i in iter:
            executor_num = i

        experiment_utils._set_ml_id(app_id, run_id)

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        try:
            host = experiment_utils._get_ip_address()

            # The socket stays bound until the reservations are complete
            # (presumably to keep the advertised port from being reused).
            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            client = None
            try:
                tmp_socket.bind(('', 0))
                port = tmp_socket.getsockname()[1]

                client = allreduce_reservation.Client(server_addr)
                host_port = host + ":" + str(port)

                client.register({"worker": host_port, "index": executor_num})
                cluster = client.await_reservations()
            finally:
                # Release the socket and the client even when registration
                # or waiting for reservations fails.
                tmp_socket.close()
                if client is not None:
                    client.close()

            task_index = experiment_utils._find_index(host_port, cluster)

            # An index of -1 marks this executor as the chief.
            if task_index == -1:
                cluster["task"] = {"type": "chief", "index": 0}
            else:
                cluster["task"] = {"type": "worker", "index": task_index}

            evaluator_node = None
            if evaluator:
                # The last worker is moved out of the worker list and
                # becomes the evaluator.
                last_worker_index = len(cluster["cluster"]["worker"]) - 1
                evaluator_node = cluster["cluster"]["worker"][
                    last_worker_index]
                cluster["cluster"]["evaluator"] = [evaluator_node]
                del cluster["cluster"]["worker"][last_worker_index]
                if evaluator_node == host_port:
                    cluster["task"] = {"type": "evaluator", "index": 0}

            print('TF_CONFIG: {} '.format(cluster))

            if num_executors > 1:
                os.environ["TF_CONFIG"] = json.dumps(cluster)

            is_chief = (cluster["task"]["type"] == "chief")

            is_evaluator = (cluster["task"]["type"] == "evaluator")

            if is_chief:
                logdir = experiment_utils._get_logdir(app_id, run_id)
                tb_hdfs_path, tb_pid = tensorboard._register(
                    logdir, logdir, executor_num, local_logdir=local_logdir)
            elif is_evaluator:
                logdir = experiment_utils._get_logdir(app_id, run_id)
                tensorboard.events_logdir = logdir

            logfile = experiment_utils._init_logger(
                experiment_utils._get_logdir(app_id, run_id),
                role=cluster["task"]["type"],
                index=cluster["task"]["index"])

            print(devices._get_gpu_info())
            print('-------------------------------------------------------')
            print('Started running task')
            task_start = time.time()
            retval = map_fun()

            if is_chief:
                experiment_utils._handle_return_simple(
                    retval, experiment_utils._get_logdir(app_id, run_id),
                    logfile)

            task_end = time.time()
            time_str = 'Finished task - took ' + experiment_utils._time_diff(
                task_start, task_end)
            print(time_str)
            print('-------------------------------------------------------')
        finally:
            experiment_utils._cleanup(tensorboard, t)
示例#4
0
 def get_ip_address(self):
     """Return the address reported by ``experiment_utils._get_ip_address``."""
     ip_address = experiment_utils._get_ip_address()
     return ip_address
示例#5
0
文件: rpc.py 项目: carlee0/maggy
    def start(self, exp_driver):
        """
        Start listener in a background thread.

        On the first call (module-level ``server_host_port`` not yet set)
        the socket is bound to an OS-assigned port and the driver is
        registered with Hopsworks. On later calls the socket is bound to
        the previously stored address.

        Args:
            exp_driver: the experiment driver. Its ``_secret`` is sent to
                Hopsworks and used to authenticate incoming messages, and
                its ``_log`` method is used for log output.

        Returns:
            address of the Server as a tuple of (host, port)
        """
        global server_host_port

        server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        server_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        if not server_host_port:
            server_sock.bind(("", 0))
            # hostname may not be resolvable but IP address probably will be
            host = experiment_utils._get_ip_address()
            port = server_sock.getsockname()[1]
            server_host_port = (host, port)

            # register this driver with Hopsworks
            sc = hopsutil._find_spark().sparkContext
            app_id = str(sc.applicationId)

            method = hopsconstants.HTTP_CONFIG.HTTP_POST
            resource_url = (hopsconstants.DELIMITERS.SLASH_DELIMITER +
                            hopsconstants.REST_CONFIG.HOPSWORKS_REST_RESOURCE +
                            hopsconstants.DELIMITERS.SLASH_DELIMITER +
                            "maggy" +
                            hopsconstants.DELIMITERS.SLASH_DELIMITER +
                            "drivers")
            json_contents = {
                "hostIp": host,
                "port": port,
                "appId": app_id,
                "secret": exp_driver._secret,
            }
            json_embeddable = json.dumps(json_contents)
            headers = {
                hopsconstants.HTTP_CONFIG.HTTP_CONTENT_TYPE:
                hopsconstants.HTTP_CONFIG.HTTP_APPLICATION_JSON
            }

            # Registration is best-effort: a failure only disables logging
            # through Hopsworks, the server is started regardless.
            try:
                response = hopsutil.send_request(method,
                                                 resource_url,
                                                 data=json_embeddable,
                                                 headers=headers)

                if (response.status_code // 100) != 2:
                    print("No connection to Hopsworks for logging.")
                    exp_driver._log("No connection to Hopsworks for logging.")
            except Exception as e:
                print("Connection failed to Hopsworks. No logging.")
                exp_driver._log(e)
                exp_driver._log("Connection failed to Hopsworks. No logging.")
        else:
            server_sock.bind(server_host_port)
        server_sock.listen(10)

        def _listen(self, sock, driver):
            """
            Accept clients and dispatch authenticated messages.

            Runs until ``self.done`` is true. A client whose message fails
            to be received, carries a wrong secret, or fails in the handler
            is disconnected.
            """
            connections = [sock]

            try:
                while not self.done:
                    # The 60s timeout makes the loop re-check ``self.done``
                    # even when no socket becomes readable.
                    readable, _, _ = select.select(connections, [], [], 60)
                    for ready in readable:
                        if ready == server_sock:
                            client_sock, _ = ready.accept()
                            connections.append(client_sock)
                        else:
                            try:
                                msg = self.receive(ready)

                                # raise exception if secret does not match
                                # so client socket gets closed
                                if not secrets.compare_digest(
                                        msg["secret"], driver._secret):
                                    # The server's own secret is deliberately
                                    # not logged: it is the credential that
                                    # authenticates clients.
                                    driver._log(
                                        "ERROR: wrong secret {}".format(
                                            msg["secret"]))
                                    raise Exception

                                self._handle_message(ready, msg, driver)
                            except Exception:
                                # Any failure drops this client only; the
                                # server keeps serving the others.
                                ready.close()
                                connections.remove(ready)
            finally:
                # Close the listening socket and every client still
                # connected, also when the loop exits through an exception.
                for conn in connections:
                    try:
                        conn.close()
                    except OSError:
                        # best-effort shutdown; keep closing the rest
                        pass

        t = threading.Thread(target=_listen,
                             args=(self, server_sock, exp_driver))
        t.daemon = True
        t.start()

        return server_host_port
    def _wrapper_fun(iter):
        """
        Run ``map_fun`` on one executor of a parameter-server cluster.

        Registers the executor as "ps" or "worker" with the reservation
        server, waits for the cluster layout, exports ``TF_CONFIG`` and
        runs ``map_fun``. A parameter server runs ``map_fun`` in a separate
        thread and waits until all workers have finished; every other role
        runs it directly. Only the chief handles the return value.

        Args:
            iter: iterable whose last element is used as the executor number.

        Returns:
            None
        """

        for i in iter:
            executor_num = i

        experiment_utils._set_ml_id(app_id, run_id)

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        # Read in the ``finally`` clause, so it must exist even when the
        # task fails before the role is known.
        role = None

        client = parameter_server_reservation.Client(server_addr)

        try:
            host = experiment_utils._get_ip_address()

            # The socket stays bound until the reservations are complete
            # (presumably to keep the advertised port from being reused).
            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                tmp_socket.bind(('', 0))
                port = tmp_socket.getsockname()[1]
                host_port = host + ":" + str(port)

                # The first ``num_ps`` executors act as parameter servers.
                exec_spec = {
                    "task_type": "ps" if executor_num < num_ps else "worker",
                    "host_port": host_port,
                    "gpus_present": devices.get_num_gpus() > 0,
                }

                client.register(exec_spec)

                cluster = client.await_reservations()
            finally:
                # Release the socket even when registration or waiting for
                # reservations fails.
                tmp_socket.close()

            role, index = experiment_utils._find_task_and_index(
                host_port, cluster)

            cluster_spec = {
                "cluster": cluster,
                "task": {"type": role, "index": index},
            }

            evaluator_node = None
            if evaluator:
                # The last worker is moved out of the worker list and
                # becomes the evaluator.
                workers = cluster_spec["cluster"]["worker"]
                last_worker_index = len(workers) - 1
                evaluator_node = workers[last_worker_index]
                cluster_spec["cluster"]["evaluator"] = [evaluator_node]
                del workers[last_worker_index]
                if evaluator_node == host_port:
                    role = "evaluator"
                    cluster_spec["task"] = {"type": "evaluator", "index": 0}

            print('TF_CONFIG: {} '.format(cluster_spec))
            os.environ["TF_CONFIG"] = json.dumps(cluster_spec)

            logfile = experiment_utils._init_logger(
                experiment_utils._get_logdir(app_id, run_id),
                role=role,
                index=cluster_spec["task"]["index"])

            dist_logdir = (experiment_utils._get_logdir(app_id, run_id) +
                           '/logdir')

            # The task entry lives in ``cluster_spec``; ``cluster`` is the
            # role -> addresses mapping and is not given a "task" key here.
            is_chief = (cluster_spec["task"]["type"] == "chief")
            if is_chief:
                hdfs.mkdir(dist_logdir)
                tensorboard._register(
                    dist_logdir,
                    experiment_utils._get_logdir(app_id, run_id),
                    executor_num,
                    local_logdir=local_logdir)
            else:
                tensorboard.events_logdir = dist_logdir

            print(devices._get_gpu_info())
            print('-------------------------------------------------------')
            print('Started running task')
            task_start = time.time()

            retval = None
            if role == "ps":
                # The parameter server runs in its own thread while this
                # one blocks until every worker reports completion.
                ps_thread = threading.Thread(target=map_fun)
                ps_thread.start()
                client.await_all_workers_finished()
            else:
                retval = map_fun()

            if role == "chief":
                experiment_utils._handle_return_simple(
                    retval, experiment_utils._get_logdir(app_id, run_id),
                    logfile)

            task_end = time.time()
            time_str = 'Finished task - took ' + experiment_utils._time_diff(
                task_start, task_end)
            print(time_str)
            print('-------------------------------------------------------')
        finally:
            # Every non-ps executor reports completion, also on failure.
            if role != "ps":
                client.register_worker_finished()
            client.close()
            experiment_utils._cleanup(tensorboard, t)