Exemplo n.º 1
0
    def start(self):
        """Launch graphscope instance on kubernetes cluster.

        Creates the namespace, the role and binding, and the services, waits
        for the services to become ready, then resolves the coordinator
        endpoint and caches it on ``self._coordinator_endpoint``.

        Raises:
            K8sError: If any launch step raised. The original exception is
                chained as the cause. Before raising, the coordinator failure
                status is dumped and ``stop()`` is called to clean up.

        Returns:
            str: Coordinator service endpoint.
        """
        try:
            self._create_namespace()
            self._create_role_and_binding()

            self._create_services()
            # NOTE(review): presumably gives the cluster a moment to register
            # the new services before polling them -- confirm.
            time.sleep(1)
            self._waiting_for_services_ready()
            self._coordinator_endpoint = self._get_coordinator_endpoint()
            logger.info(
                "Coordinator pod start successful with address %s, connecting to service ...",
                self._coordinator_endpoint,
            )
        except Exception as e:
            time.sleep(1)
            self._dump_coordinator_failed_status()
            self.stop()
            raise K8sError(
                "Error when launching Coordinator on kubernetes cluster"
            ) from e
        # Hand the endpoint back to the caller, as the docstring promises;
        # previously the method fell off the end and returned None.
        return self._coordinator_endpoint
Exemplo n.º 2
0
def wait_for_deployment_complete(api_client, namespace, name, timeout_seconds=60):
    """Poll a deployment until it is rolled out, has failed, or the wait times out.

    Every two seconds the deployment status is read. The deployment counts as
    complete when its updated, total and available replica counts all equal
    ``spec.replicas`` and the observed generation has caught up with
    ``metadata.generation``. Otherwise the pods matching the deployment's
    ``match_labels`` selector are listed, and any container that is not ready
    and has already restarted is treated as a failed start.

    Args:
        api_client: Client object handed to ``kube_client.CoreV1Api`` and
            ``kube_client.AppsV1Api``.
        namespace: Namespace the deployment lives in.
        name: Name of the deployment.
        timeout_seconds: How long to keep polling before giving up.

    Returns:
        True once the deployment is complete.

    Raises:
        K8sError: A container of the deployment is not ready and has a
            restart count greater than zero.
        TimeoutError: The deployment did not complete within
            ``timeout_seconds``.
    """
    core_api = kube_client.CoreV1Api(api_client)
    app_api = kube_client.AppsV1Api(api_client)
    began_at = time.time()
    while time.time() - began_at < timeout_seconds:
        time.sleep(2)
        deployment = app_api.read_namespaced_deployment_status(
            namespace=namespace, name=name
        )
        status = deployment.status
        desired = deployment.spec.replicas
        rolled_out = (
            status.updated_replicas == desired
            and status.replicas == desired
            and status.available_replicas == desired
            and status.observed_generation >= deployment.metadata.generation
        )
        if rolled_out:
            return True

        # Not complete yet: inspect the deployment's pods for crashing containers.
        label_selector = ",".join(
            key + "=" + value
            for key, value in deployment.spec.selector.match_labels.items()
        )
        pod_list = core_api.list_namespaced_pod(
            namespace=namespace, label_selector=label_selector
        )
        has_crashed_container = any(
            not container.ready and container.restart_count > 0
            for pod in pod_list.items
            if pod.status.container_statuses is not None
            for container in pod.status.container_statuses
        )
        if has_crashed_container:
            raise K8sError("Deployment {} start failed.".format(name))
    raise TimeoutError("Waiting timeout for deployment {}".format(name))
Exemplo n.º 3
0
    def start(self):
        """Bring up the graphscope instance on the kubernetes cluster.

        Runs the launch steps in order: namespace, role and binding,
        services, wait for the services to be ready, then look up the
        coordinator endpoint.

        Returns:
            The coordinator endpoint as produced by
            ``self._get_coordinator_endpoint()``.

        Raises:
            K8sError: Any launch step raised. The failure is logged, the
                coordinator status is dumped and ``stop()`` is called first;
                the original exception is chained as the cause.
        """
        try:
            self._create_namespace()
            self._create_role_and_binding()
            self._create_services()
            self._waiting_for_services_ready()
            endpoint = self._get_coordinator_endpoint()
        except Exception as launch_error:
            time.sleep(1)
            logger.error("Error when launching Coordinator on kubernetes cluster.")
            self._dump_coordinator_status()
            self.stop()
            raise K8sError(
                "Error when launching Coordinator on kubernetes cluster"
            ) from launch_error
        else:
            return endpoint
Exemplo n.º 4
0
    def _stream_event_impl(self, simple=False):
        """Poll kubernetes events of ``self._pod_name`` and forward new ones.

        Once per second, until ``self._stopped`` becomes true, the events
        whose involved object is the pod are listed. Each message not seen
        before is put on ``self._lines`` and logged at info level.

        Args:
            simple: Passed through to the logger as ``extra={"simple": simple}``.

        Raises:
            K8sError: An event with reason ``"Failed"`` was observed.
        """
        field_selector = "involvedObject.name=" + self._pod_name

        # A set gives constant-time duplicate detection; each poll returns the
        # events already seen, so this check runs for every event every second.
        seen_messages = set()
        while not self._stopped:
            time.sleep(1)
            try:
                events = self._core_api.list_namespaced_event(
                    namespace=self._namespace,
                    field_selector=field_selector,
                    timeout_seconds=2,
                )
            except K8SApiException:
                # Best effort: a failed listing is skipped and retried on the
                # next iteration instead of aborting the stream.
                continue

            for event in events.items:
                msg = "{0}: {1}".format(self._pod_name, event.message)
                if msg in seen_messages:
                    continue
                seen_messages.add(msg)
                self._lines.put(msg)
                logger.info(msg, extra={"simple": simple})
                if event.reason == "Failed":
                    raise K8sError("Kubernetes event error: {}".format(msg))
Exemplo n.º 5
0
    def _connect(self):
        """Obtain a coordinator endpoint and open the RPC session to it.

        The endpoint is chosen from ``self._config_params``, first match wins:

        1. ``addr`` is set: connect to that existing coordinator.
        2. ``enable_k8s`` is truthy: build a ``KubernetesCluster`` from the
           ``k8s_*`` settings and start it.
        3. ``hosts`` is a non-empty list and ``num_workers`` is positive:
           launch a coordinator via ``_launch_coordinator_on_local``.

        Afterwards the method waits for the gRPC service to be ready,
        connects, stores the session id, engine config and pod name list on
        the instance, and registers the session in ``_session_dict``.

        Returns:
            tuple: ``(proc, endpoint)``. ``proc`` is None unless the
            coordinator was launched through ``_launch_coordinator_on_local``.

        Raises:
            K8sError: ``enable_k8s`` is set but the etcd or gs image is None.
            RuntimeError: None of the three modes applies.
            Exception: Whatever the gRPC connect step raised, re-raised after
                a still-running launched process has been asked to terminate.
        """
        params = self._config_params
        proc = None

        if params["addr"] is not None:
            # Reuse a coordinator that is already running.
            self._session_type = types_pb2.HOSTS
            endpoint = params["addr"]
        elif params["enable_k8s"]:
            if params["k8s_etcd_image"] is None or params["k8s_gs_image"] is None:
                raise K8sError("None image found.")
            api_client = kube_config.new_client_from_config(
                **params["k8s_client_config"]
            )
            self._session_type = types_pb2.K8S
            self._k8s_cluster = KubernetesCluster(
                api_client=api_client,
                namespace=params["k8s_namespace"],
                service_type=params["k8s_service_type"],
                num_workers=params["num_workers"],
                gs_image=params["k8s_gs_image"],
                etcd_image=params["k8s_etcd_image"],
                gie_graph_manager_image=params["k8s_gie_graph_manager_image"],
                zookeeper_image=params["k8s_zookeeper_image"],
                image_pull_policy=params["k8s_image_pull_policy"],
                image_pull_secrets=params["k8s_image_pull_secrets"],
                vineyard_cpu=params["k8s_vineyard_cpu"],
                vineyard_mem=params["k8s_vineyard_mem"],
                vineyard_shared_mem=params["k8s_vineyard_shared_mem"],
                etcd_cpu=params["k8s_etcd_cpu"],
                etcd_mem=params["k8s_etcd_mem"],
                zookeeper_cpu=params["k8s_zookeeper_cpu"],
                zookeeper_mem=params["k8s_zookeeper_mem"],
                gie_graph_manager_cpu=params["k8s_gie_graph_manager_cpu"],
                gie_graph_manager_mem=params["k8s_gie_graph_manager_mem"],
                engine_cpu=params["k8s_engine_cpu"],
                engine_mem=params["k8s_engine_mem"],
                coordinator_cpu=float(params["k8s_coordinator_cpu"]),
                coordinator_mem=params["k8s_coordinator_mem"],
                volumes=params["k8s_volumes"],
                waiting_for_delete=params["k8s_waiting_for_delete"],
                timeout_seconds=params["timeout_seconds"],
            )
            endpoint = self._k8s_cluster.start()
            # Record the namespace the cluster ended up with when none was given.
            if params["k8s_namespace"] is None:
                params["k8s_namespace"] = self._k8s_cluster.get_namespace()
        elif (
            isinstance(params["hosts"], list)
            and len(params["hosts"]) != 0
            and params["num_workers"] > 0
        ):
            # Launch a coordinator for the configured hosts.
            proc, endpoint = _launch_coordinator_on_local(params)
            self._session_type = types_pb2.HOSTS
        else:
            raise RuntimeError("Session initialize failed.")

        # Block until the coordinator's gRPC service answers.
        self._grpc_client = GRPCClient(endpoint)
        self._grpc_client.waiting_service_ready(
            timeout_seconds=params["timeout_seconds"],
            enable_k8s=params["enable_k8s"],
        )

        # Open the session on the RPC server.
        try:
            connection = self._grpc_client.connect()
            self._session_id, self._engine_config, self._pod_name_list = connection
            _session_dict[self._session_id] = self
        except Exception:
            still_running = proc is not None and proc.poll() is None
            if still_running:
                # Best-effort cleanup; the connect failure is what gets reported.
                try:
                    proc.terminate()
                except:  # noqa: E722
                    pass
            raise

        return proc, endpoint
Exemplo n.º 6
0
def get_service_endpoints(api_client, namespace, name, type, timeout_seconds=60):
    """Resolve the reachable endpoints of a kubernetes service.

    Args:
        api_client: ApiClient
            Kubernetes ApiClient object used to build the core API.
        namespace: str
            Namespace the service belongs to.
        name: str
            Name of the service.
        type: str
            Service type; one of NodePort, LoadBalancer or ClusterIP.
        timeout_seconds: int
            Maximum time to wait for a load balancer ingress to show up.
            Only consulted for the LoadBalancer type.

    Raises:
        TimeoutError: No load balancer ingress appeared within
            ``timeout_seconds`` (e.g. the cloud provider does not support
            the LoadBalancer service type).
        K8sError: ``type`` is not one of the supported service types, or no
            address or no port could be determined for the service.

    Returns: A list of endpoint strings, one per address/port combination.
        NodePort yields <host_ip>:<node_port> for the hosts of the selected pods,
        LoadBalancer yields <ingress hostname or ip>:<port>, and
        ClusterIP yields <cluster_ip>:<port>.
    """
    start_time = time.time()

    core_api = kube_client.CoreV1Api(api_client)
    svc = core_api.read_namespaced_service(name=name, namespace=namespace)

    # Pods backing the service, looked up through the service's label selector.
    label_selector = ",".join(
        key + "=" + value for key, value in svc.spec.selector.items()
    )
    pods = core_api.list_namespaced_pod(
        namespace=namespace, label_selector=label_selector
    )

    if type == "NodePort":
        ips = [pod.status.host_ip for pod in pods.items]
        ports = [svc_port.node_port for svc_port in svc.spec.ports]
    elif type == "LoadBalancer":
        # Re-read the service until an ingress has been assigned.
        while True:
            svc = core_api.read_namespaced_service(name=name, namespace=namespace)
            ingresses = svc.status.load_balancer.ingress
            if ingresses is not None:
                ips = [
                    entry.hostname if entry.hostname is not None else entry.ip
                    for entry in ingresses
                ]
                ports = [svc_port.port for svc_port in svc.spec.ports]
                break
            time.sleep(1)
            if time.time() - start_time > timeout_seconds:
                raise TimeoutError("LoadBalancer service type is not supported yet.")
    elif type == "ClusterIP":
        ips = [svc.spec.cluster_ip]
        ports = [svc_port.port for svc_port in svc.spec.ports]
    else:
        raise K8sError("Service type {0} is not supported yet".format(type))

    if not ips or not ports:
        raise K8sError("Get {0} service {1} failed.".format(type, name))

    # Every address combined with every port.
    return ["{0}:{1}".format(ip, port) for ip in ips for port in ports]