Exemplo n.º 1
0
    def _start_analytical_engine(self):
        """Launch the analytical engine process and wait for its RPC port.

        The command line is the prefix returned by
        ``ResolveMPICmdPrefix.resolve`` followed by ``ANALYTICAL_ENGINE_PATH``
        and its flags. The spawned process is stored in
        ``self._analytical_engine_process`` and the endpoint (first entry of
        ``self._hosts`` plus a port from ``get_free_port``) is stored in
        ``self._analytical_engine_endpoint``.

        Raises:
            RuntimeError: If the process exits before the RPC port is taken,
                or if ``self._timeout_seconds`` is set and elapses first.
        """
        rmcp = ResolveMPICmdPrefix()
        cmd, mpi_env = rmcp.resolve(self._num_workers, self._hosts)

        # the first host of the comma separated list provides the endpoint
        master = self._hosts.split(",")[0]
        rpc_port = get_free_port(master)
        self._analytical_engine_endpoint = f"{master}:{rpc_port}"

        cmd.append(ANALYTICAL_ENGINE_PATH)
        cmd.extend(["--host", "0.0.0.0"])
        cmd.extend(["--port", str(rpc_port)])
        cmd.extend(["--vineyard_shared_mem", self._shared_mem])

        # the glog level travels as a flag for openmpi and as an environment
        # variable otherwise
        if rmcp.openmpi():
            cmd.extend(["-v", str(self._glog_level)])
        else:
            mpi_env["GLOG_v"] = str(self._glog_level)

        if self._vineyard_socket:
            cmd.extend(["--vineyard_socket", self._vineyard_socket])

        env = os.environ.copy()
        env.update(mpi_env)

        logger.info("Launch analytical engine with command: %s", " ".join(cmd))

        process = subprocess.Popen(
            cmd,
            start_new_session=True,
            cwd=os.getcwd(),
            env=env,
            encoding="utf-8",
            errors="replace",
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            bufsize=1,
        )

        logger.info("Server is initializing analytical engine.")
        # keep the watchers reachable through the process object
        process.stdout_watcher = PipeWatcher(process.stdout, sys.stdout)
        process.stderr_watcher = PipeWatcher(process.stderr, sys.stderr)

        self._analytical_engine_process = process

        start_time = time.time()

        while is_free_port(rpc_port):
            # A process that already exited will never take the port: fail
            # right away instead of waiting for the timeout (or forever when
            # no timeout is configured).
            exit_code = process.poll()
            if exit_code is not None:
                raise RuntimeError(
                    "Launch analytical engine failed: process exited with "
                    "code {0}.".format(exit_code))
            time.sleep(1)
            if (self._timeout_seconds
                    and self._timeout_seconds + start_time < time.time()):
                raise RuntimeError(
                    "Launch analytical engine failed due to timeout.")
        logger.info("Analytical engine is ready, endpoint is %s",
                    self._analytical_engine_endpoint)
Exemplo n.º 2
0
    def create_learning_instance(self, object_id, handle, config):
        """Spawn one ``gscoordinator.learning`` process per worker.

        Args:
            object_id: Key under which the spawned processes are recorded in
                ``self._learning_instance_processes``.
            handle: Base64 encoded JSON string; its ``"server"`` entry is
                replaced with the generated server list before it is encoded
                again and handed to the child processes.
            config: Passed to every child process as a command line argument.

        Returns:
            list: One ``"localhost:<port>"`` string per worker.
        """
        # decode the handle, inject the server list and encode it again
        decoded = base64.b64decode(handle.encode("utf-8")).decode("utf-8")
        handle_dict = json.loads(decoded)

        server_list = [
            f"localhost:{str(get_free_port('localhost'))}"
            for _ in range(self._num_workers)
        ]
        handle_dict["server"] = ",".join(server_list)
        encoded_handle = base64.b64encode(
            json.dumps(handle_dict).encode("utf-8")).decode("utf-8")

        # put the directory above this file in front of PYTHONPATH
        env = os.environ.copy()
        coordinator_dir = os.path.join(os.path.dirname(__file__), "..")
        inherited_path = env.get("PYTHONPATH")
        if inherited_path is None:
            env["PYTHONPATH"] = coordinator_dir
        else:
            env["PYTHONPATH"] = coordinator_dir + os.pathsep + inherited_path

        # register the list first so it is recorded even if a launch fails
        launched = []
        self._learning_instance_processes[object_id] = launched
        for worker_index in range(self._num_workers):
            cmd = [sys.executable, "-m", "gscoordinator.learning"]
            cmd += [encoded_handle, config, str(worker_index)]
            logger.debug("launching learning server: %s", " ".join(cmd))

            proc = subprocess.Popen(
                cmd,
                env=env,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                encoding="utf-8",
                errors="replace",
                universal_newlines=True,
                bufsize=1,
            )
            # keep the watcher reachable through the process object
            proc.stdout_watcher = PipeWatcher(
                proc.stdout,
                sys.stdout,
                suppressed=(not logger.isEnabledFor(logging.DEBUG)),
            )
            launched.append(proc)

        return server_list
Exemplo n.º 3
0
    def _create_vineyard(self):
        """Start a vineyardd process unless a socket is already configured.

        Does nothing when ``self._vineyard_socket`` is already set. Otherwise
        a vineyardd process is spawned, recorded in
        ``self._vineyardd_process``, and the method blocks until the ipc
        socket path exists on disk.

        Raises:
            RuntimeError: If vineyardd exits before the socket appears, or if
                ``self._timeout_seconds`` is set and elapses first.
        """
        if self._vineyard_socket:
            return

        ts = get_timestamp()
        vineyard_socket = f"{self._vineyard_socket_prefix}{ts}"
        # prefer port 9600 and fall back to any free port
        self._vineyard_rpc_port = 9600 if is_free_port(
            9600) else get_free_port()

        cmd = self._find_vineyardd()
        cmd.extend(["--socket", vineyard_socket])
        cmd.extend(["--rpc_socket_port", str(self._vineyard_rpc_port)])
        cmd.extend(["--size", self._shared_mem])
        cmd.extend(["-etcd_endpoint", self._etcd_endpoint])
        cmd.extend(["-etcd_prefix", f"vineyard.gsa.{ts}"])
        env = os.environ.copy()
        env["GLOG_v"] = str(self._glog_level)

        logger.info("Launch vineyardd with command: %s", " ".join(cmd))

        process = subprocess.Popen(
            cmd,
            start_new_session=True,
            cwd=os.getcwd(),
            env=env,
            encoding="utf-8",
            errors="replace",
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
            bufsize=1,
        )

        logger.info("Server is initializing vineyardd.")
        # keep the watcher reachable through the process object
        process.stdout_watcher = PipeWatcher(
            process.stdout,
            sys.stdout,
            suppressed=(not logger.isEnabledFor(logging.DEBUG)),
        )

        self._vineyard_socket = vineyard_socket
        self._vineyardd_process = process

        start_time = time.time()
        while not os.path.exists(self._vineyard_socket):
            # A process that already exited will never create the socket:
            # fail right away instead of waiting for the timeout (or forever
            # when no timeout is configured).
            exit_code = process.poll()
            if exit_code is not None:
                raise RuntimeError(
                    "Launch vineyardd failed: process exited with "
                    "code {0}.".format(exit_code))
            time.sleep(1)
            if (self._timeout_seconds
                    and self._timeout_seconds + start_time < time.time()):
                raise RuntimeError(
                    "Launch vineyardd failed due to timeout.")
        logger.info("Vineyardd is ready, ipc socket is %s",
                    self._vineyard_socket)
Exemplo n.º 4
0
    def _create_interactive_engine_service(self):
        """Launch the zetcd proxy and wait until its zookeeper port is taken.

        The spawned process is stored in ``self._zetcd_process``. The method
        blocks until ``self._zookeeper_port`` on this host's resolved address
        is no longer free.

        Raises:
            RuntimeError: If the ``zetcd`` executable is not on ``PATH``, if
                the process exits before the port is taken, or if the
                configured ``timeout_seconds`` elapses first.
        """
        # launch zetcd proxy
        logger.info("Launching zetcd proxy service ...")
        zetcd_exec = shutil.which("zetcd")
        if not zetcd_exec:
            raise RuntimeError("zetcd command not found.")
        etcd_endpoints = self._get_etcd_endpoints()
        cmd = [
            zetcd_exec,
            "--zkaddr",
            "0.0.0.0:{}".format(self._zookeeper_port),
            "--endpoints",
            "{}".format(",".join(etcd_endpoints)),
        ]
        logger.info("zetcd cmd {}".format(" ".join(cmd)))

        self._zetcd_process = subprocess.Popen(
            cmd,
            start_new_session=True,
            cwd=os.getcwd(),
            env=os.environ.copy(),
            encoding="utf-8",
            errors="replace",
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
            bufsize=1,
        )
        # keep the watcher reachable through the process object
        self._zetcd_process.stdout_watcher = PipeWatcher(
            self._zetcd_process.stdout, sys.stdout, drop=True)

        # resolve the local address once instead of on every poll
        host = socket.gethostbyname(socket.gethostname())

        start_time = time.time()
        while is_free_port(self._zookeeper_port, host):
            # A process that already exited will never take the port: fail
            # right away instead of waiting for the timeout (or forever when
            # no timeout is configured).
            exit_code = self._zetcd_process.poll()
            if exit_code is not None:
                raise RuntimeError(
                    "Launch zetcd service failed: process exited with "
                    "code {0}.".format(exit_code))
            time.sleep(1)
            if (self._saved_locals["timeout_seconds"]
                    and self._saved_locals["timeout_seconds"] + start_time <
                    time.time()):
                raise RuntimeError("Launch zetcd service failed.")
        logger.info("ZEtcd is ready, endpoint is {0}:{1}".format(
            host, self._zookeeper_port))
Exemplo n.º 5
0
    def _launch_coordinator(self):
        """Spawn the ``gscoordinator`` module as a child process.

        Picks a free port when ``self._port`` is ``None``, records the
        endpoint in ``self._coordinator_endpoint`` and stores the spawned
        process in ``self._proc``.

        Raises:
            RuntimeError: If an explicitly configured port is not free.
        """
        if self._port is None:
            self._port = get_free_port()
        elif not is_free_port(self._port):
            # an explicitly requested port must not be in use already
            raise RuntimeError("Port {} already used.".format(self._port))

        self._coordinator_endpoint = "{}:{}".format(self._hosts[0], self._port)

        cmd = [sys.executable, "-m", "gscoordinator"]
        cmd += ["--num_workers", str(self._num_workers)]
        cmd += ["--hosts", ",".join(self._hosts)]
        cmd += ["--log_level", f"{gs_config.log_level}"]
        cmd += ["--timeout_seconds", f"{self._timeout_seconds}"]
        cmd += ["--port", str(self._port)]
        cmd += ["--cluster_type", self.type()]
        cmd += ["--instance_id", self._instance_id]

        # optional flags are only passed when configured
        if self._etcd_addrs is not None:
            cmd += ["--etcd_addrs", self._etcd_addrs]

        if self._vineyard_shared_mem is not None:
            cmd += ["--vineyard_shared_mem", self._vineyard_shared_mem]

        if self._vineyard_socket:
            cmd += ["--vineyard_socket", f"{self._vineyard_socket}"]

        logger.info("Initializing coordinator with command: %s", " ".join(cmd))

        env = os.environ.copy()
        env["PYTHONUNBUFFERED"] = "TRUE"
        # put the parent directory of the graphscope package in front of
        # PYTHONPATH
        package_parent = os.path.join(
            os.path.dirname(graphscope.__file__), "..")
        inherited_path = env.get("PYTHONPATH")
        if inherited_path is None:
            env["PYTHONPATH"] = package_parent
        else:
            env["PYTHONPATH"] = package_parent + os.pathsep + inherited_path

        # `start_new_session=True` puts the child into a new process group so
        # it does not receive the parent's signals. Inside a notebook the
        # child must still receive the signal sent when the kernel is
        # restarted or stopped, so no new session is created there.
        process = subprocess.Popen(
            cmd,
            start_new_session=not in_notebook(),
            cwd=COORDINATOR_HOME,
            env=env,
            encoding="utf-8",
            errors="replace",
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            bufsize=1,
        )

        def _loading_line(line):
            # matches lines containing both "Loading" and "it/s]"
            return "Loading" in line and "it/s]" in line

        stdout_watcher = PipeWatcher(process.stdout, sys.stdout)
        if not gs_config.show_log:
            stdout_watcher.add_filter(_loading_line)
        process.stdout_watcher = stdout_watcher
        process.stderr_watcher = PipeWatcher(process.stderr, sys.stderr)
        self._proc = process
Exemplo n.º 6
0
    def create_learning_instance(self, object_id, handle, config):
        """Launch one learning server per pod through ``kubectl exec``.

        Args:
            object_id: Key under which the spawned processes are recorded in
                ``self._learning_instance_processes``; also passed to the
                graphlearn service helpers.
            handle: Base64 encoded JSON string; its ``"server"`` entry is
                replaced with the ``pod_name:port`` list before it is encoded
                again and handed to the servers.
            config: Passed to every server as a command line argument.

        Returns:
            Whatever ``self._parse_graphlearn_service_endpoint(object_id)``
            returns.
        """
        pod_count = len(self._pod_name_list)
        first_port = self._learning_engine_ports_usage

        # allocate service for ports
        self._create_graphlearn_service(object_id, first_port, pod_count)

        # prepare arguments: every pod gets the next port from first_port on
        handle = json.loads(
            base64.b64decode(handle.encode("utf-8")).decode("utf-8"))
        handle["server"] = ",".join(
            "%s:%s" % (pod_name, port)
            for port, pod_name in enumerate(self._pod_name_list, first_port))
        handle = base64.b64encode(
            json.dumps(handle).encode("utf-8")).decode("utf-8")

        # launch the server
        self._learning_instance_processes[object_id] = []
        for pod_index, pod in enumerate(self._pod_name_list):
            cmd = [
                "kubectl",
                "-n",
                self._saved_locals["namespace"],
                "exec",
                "-it",
                "-c",
                self._engine_container_name,
                pod,
                "--",
                "python3",
                # NOTE: "-m" and the module name are separate argv items; a
                # missing comma here would silently concatenate them.
                "-m",
                "gscoordinator.learning",
                handle,
                config,
                str(pod_index),
            ]
            # use the module logger, consistent with the level check below
            logger.debug("launching learning server: %s", " ".join(cmd))
            proc = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                encoding="utf-8",
                errors="replace",
                universal_newlines=True,
                bufsize=1,
            )
            stdout_watcher = PipeWatcher(
                proc.stdout,
                sys.stdout,
                drop=True,
                suppressed=(not logger.isEnabledFor(logging.DEBUG)),
            )
            # keep the watcher reachable through the process object
            setattr(proc, "stdout_watcher", stdout_watcher)
            self._learning_instance_processes[object_id].append(proc)

        # update the port usage record
        self._learning_engine_ports_usage += pod_count

        # parse the service hosts and ports
        return self._parse_graphlearn_service_endpoint(object_id)
Exemplo n.º 7
0
    def _launch_analytical_engine_locally(self):
        """Copy a hostfile to every pod and spawn the analytical engine.

        Writes one ``"<pod_ip> <pod_name>"`` line per pod to a ``kube_hosts``
        file in the temp directory, copies it with ``kubectl cp`` to
        ``/tmp/hosts_of_nodes`` in each pod's engine container, then launches
        the engine command and stores the process in
        ``self._analytical_engine_process``.

        Raises:
            subprocess.CalledProcessError: If a ``kubectl cp`` call exits
                with a non-zero status.
        """
        logger.info("Starting GAE rpc service on {} ...".format(
            str(self._analytical_engine_endpoint)))

        # generate the hostfile ...
        hostfile = os.path.join(get_tempdir(), "kube_hosts")
        with open(hostfile, "w") as out:
            for idx, ip in enumerate(self._pod_ip_list):
                out.write("{} {}\n".format(ip, self._pod_name_list[idx]))

        # ... and distribute it to the engine container of every pod
        for pod in self._pod_name_list:
            copy_cmd = [
                shutil.which("kubectl"),
                "-n",
                self._saved_locals["namespace"],
                "cp",
                hostfile,
                "{}:/tmp/hosts_of_nodes".format(pod),
                "-c",
                self._engine_container_name,
            ]
            subprocess.check_call(copy_cmd)

        # assemble the engine command line
        rmcp = ResolveMPICmdPrefix(rsh_agent=True)
        cmd, mpi_env = rmcp.resolve(self._num_workers,
                                    ",".join(self._pod_name_list))

        cmd.append(ANALYTICAL_ENGINE_PATH)
        cmd.extend(["--host", "0.0.0.0"])
        cmd.extend(["--port", str(self._random_analytical_engine_rpc_port)])
        cmd.extend([
            "--vineyard_shared_mem", self._saved_locals["vineyard_shared_mem"]
        ])

        # the glog level travels as a flag for openmpi and as an environment
        # variable otherwise
        if not rmcp.openmpi():
            mpi_env["GLOG_v"] = str(self._glog_level)
        else:
            cmd.extend(["-v", str(self._glog_level)])

        socket_path = os.path.join(get_tempdir(), "vineyard_workspace",
                                   "vineyard.sock")
        cmd.extend(["--vineyard_socket", socket_path])
        logger.info("Analytical engine launching command: {}".format(
            " ".join(cmd)))

        env = os.environ.copy()
        env.update(mpi_env)

        engine = subprocess.Popen(
            cmd,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding="utf-8",
            errors="replace",
            universal_newlines=True,
            bufsize=1,
        )
        self._analytical_engine_process = engine

        # keep the watchers reachable through the process object
        out_watcher = PipeWatcher(engine.stdout, sys.stdout, drop=True)
        err_watcher = PipeWatcher(engine.stderr, sys.stderr, drop=True)
        engine.stdout_watcher = out_watcher
        engine.stderr_watcher = err_watcher