Example #1
def clean_pod_template(pod_template):
    """ Normalize pod template and check for type errors """
    if isinstance(pod_template, str):
        msg = ('Expected a kubernetes.client.V1Pod object, got %s. '
               'If trying to pass a yaml filename then use '
               'KubeCluster.from_yaml')
        raise TypeError(msg % pod_template)

    if isinstance(pod_template, dict):
        msg = ('Expected a kubernetes.client.V1Pod object, got %s. '
               'If trying to pass a dictionary specification then use '
               'KubeCluster.from_dict')
        raise TypeError(msg % str(pod_template))

    pod_template = copy.deepcopy(pod_template)

    # Make sure metadata / labels / env objects exist, so they can be modified
    # later without a lot of `is None` checks
    if pod_template.metadata is None:
        pod_template.metadata = client.V1ObjectMeta()
    if pod_template.metadata.labels is None:
        pod_template.metadata.labels = {}

    if pod_template.spec.containers[0].env is None:
        pod_template.spec.containers[0].env = []

    # add default tolerations
    tolerations = [
        client.V1Toleration(
            key='k8s.dask.org/dedicated',
            operator='Equal',
            value='worker',
            effect='NoSchedule',
        ),
        # GKE currently does not permit creating taints on a node pool
        # with a `/` in the key field
        client.V1Toleration(
            key='k8s.dask.org_dedicated',
            operator='Equal',
            value='worker',
            effect='NoSchedule',
        ),
    ]

    if pod_template.spec.tolerations is None:
        pod_template.spec.tolerations = tolerations
    else:
        pod_template.spec.tolerations.extend(tolerations)

    return pod_template
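
A minimal usage sketch of the helper above, assuming `copy` and `kubernetes.client as client` are imported as in the snippet; the assertions reflect what the normalization guarantees:

from kubernetes import client

pod = client.V1Pod(
    spec=client.V1PodSpec(
        containers=[client.V1Container(name='dask-worker',
                                       image='daskdev/dask:latest')]))
pod = clean_pod_template(pod)
assert pod.metadata.labels == {}        # metadata/labels now exist
assert pod.spec.containers[0].env == []
assert len(pod.spec.tolerations) == 2   # the two default dask tolerations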
Example #2
    def test_pod_spec(self):
        cluster_spec = ClusterSpec(cluster_spec_json=test_spec)
        pod = create_test_pod("test_spec")
        pod = cluster_spec.patch_pod(pod, "other")

        self.assertEqual(
            pod.metadata.labels["elasticdl.org/app-name"], "elasticdl"
        )
        self.assertEqual(pod.metadata.labels["elasticdl.org/site"], "hangzhou")
        self.assertEqual(
            pod.metadata.annotations["tag.elasticdl.org/optimization"],
            "enabled",
        )
        expected_tolerations = [
            client.V1Toleration(
                effect="NoSchedule",
                key="elasticdl.org/logic-pool",
                operator="Equal",
                value="ElasticDL",
            )
        ]
        self.assertEqual(pod.spec.tolerations, expected_tolerations)
        match_expressions = [
            client.V1NodeSelectorRequirement(
                key="elasticdl.org/logic-pool",
                operator="In",
                values=["ElasticDL"],
            )
        ]

        expected_affinity = client.V1Affinity(
            node_affinity=client.V1NodeAffinity(
                required_during_scheduling_ignored_during_execution=(
                    client.V1NodeSelector(
                        node_selector_terms=[
                            client.V1NodeSelectorTerm(
                                match_expressions=match_expressions
                            )
                        ]
                    )
                )
            )
        )
        self.assertEqual(pod.spec.affinity, expected_affinity)

        expected_env = []
        expected_env.append(client.V1EnvVar(name="LOG_ENABLED", value="true"))
        self.assertEqual(pod.spec.containers[0].env, expected_env)

        pod = create_test_pod("test_spec")
        pod = cluster_spec.patch_pod(pod, PodType.MASTER)
        self.assertEqual(pod.metadata.labels["elasticdl.org/xyz"], "Sun")

        pod = create_test_pod("test_spec")
        pod = cluster_spec.patch_pod(pod, PodType.WORKER)
        self.assertEqual(pod.metadata.labels["elasticdl.org/xyz"], "Earth")

        pod = create_test_pod("test_spec")
        pod = cluster_spec.patch_pod(pod, PodType.PS)
        self.assertEqual(pod.metadata.labels["elasticdl.org/xyz"], "Moon")
Example #3
    def _create_deployment(self):
        REPLICAS = 1

        container_port = k8s.V1ContainerPort(
            name=self.uid[-14:],
            container_port=int(os.getenv("OPENVAS_OMP_PORT", 9390)))
        resources = k8s.V1ResourceRequirements(
            limits={
                "cpu": KubernetesDeployer.CONTAINER_USE_CPU_LIMIT,
                "memory": KubernetesDeployer.CONTAINER_USE_MEMORY_LIMIT,
            })
        readiness_probe = k8s.V1Probe(
            _exec=k8s.V1ExecAction(
                command=KubernetesDeployer.OPENVAS_HEALTHCHECK_COMMAND),
            initial_delay_seconds=300,
            period_seconds=30,
        )
        liveness_probe = k8s.V1Probe(
            tcp_socket=k8s.V1TCPSocketAction(
                port=container_port.container_port),
            initial_delay_seconds=180,
            period_seconds=30,
            failure_threshold=3,
            timeout_seconds=5,
        )
        container = k8s.V1Container(
            image=KubernetesDeployer.OPENVAS_CONTAINER_IMAGE,
            name=self.uid,
            image_pull_policy="IfNotPresent",
            ports=[container_port],
            resources=resources,
            readiness_probe=readiness_probe,
            liveness_probe=liveness_probe,
        )
        toleration = k8s.V1Toleration(effect="NoSchedule",
                                      key="Scanners",
                                      operator="Exists")
        pod_spec = k8s.V1PodSpec(containers=[container],
                                 tolerations=[toleration])
        pod_metadata = k8s.V1ObjectMeta(
            name=self.uid,
            labels={"app.kubernetes.io/name": self.uid},
            annotations={
                "cluster-autoscaler.kubernetes.io/safe-to-evict": "false"
            },
        )
        pod_template = k8s.V1PodTemplateSpec(spec=pod_spec,
                                             metadata=pod_metadata)
        selector = k8s.V1LabelSelector(
            match_labels={"app.kubernetes.io/name": self.uid})
        deployment_spec = k8s.V1DeploymentSpec(replicas=REPLICAS,
                                               selector=selector,
                                               template=pod_template)
        deployment_metadata = k8s.V1ObjectMeta(
            name=self.uid, labels={"app.kubernetes.io/name": self.uid})
        deployment = k8s.V1Deployment(spec=deployment_spec,
                                      metadata=deployment_metadata)
        return k8s.AppsV1Api(self.client).create_namespaced_deployment(
            self.namespace, deployment)
Example #4
def create_toleration(toleration_data):
    toleration = client.V1Toleration()

    if "effect" in toleration_data:
        toleration.effect = toleration_data["effect"]
    if "key" in toleration_data:
        toleration.key = toleration_data["key"]
    if "operator" in toleration_data:
        toleration.operator = toleration_data["operator"]
    if "value" in toleration_data:
        toleration.value = toleration_data["value"]
    if "toleration_seconds" in toleration_data:
        toleration.toleration_seconds = int(
            toleration_data["toleration_seconds"])

    return toleration
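
A hedged usage sketch, assuming `client` is `kubernetes.client` as above; the input dict is hypothetical and its keys mirror the fields the helper handles:

data = {"key": "dedicated", "operator": "Equal", "value": "gpu",
        "effect": "NoSchedule", "toleration_seconds": "300"}
toleration = create_toleration(data)
assert toleration.toleration_seconds == 300  # string coerced to int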
Example #5
    def _create_deployment(self):
        REPLICAS = 1

        container_port = k8s.V1ContainerPort(
            name=self.uid[-14:],
            container_port=int(os.getenv("OPENVAS_OMP_PORT", 9390)))
        resources = k8s.V1ResourceRequirements(
            limits={
                "cpu": KubernetesDeployer.CONTAINER_USE_CPU_LIMIT,
                "memory": KubernetesDeployer.CONTAINER_USE_MEMORY_LIMIT,
            })
        container = k8s.V1Container(
            image=KubernetesDeployer.OPENVAS_CONTAINER_IMAGE,
            name=self.uid,
            image_pull_policy="IfNotPresent",
            ports=[container_port],
            resources=resources,
        )
        toleration = k8s.V1Toleration(effect="NoSchedule",
                                      key="Scanners",
                                      operator="Exists")
        pod_spec = k8s.V1PodSpec(containers=[container],
                                 tolerations=[toleration])
        pod_metadata = k8s.V1ObjectMeta(
            name=self.uid, labels={"app.kubernetes.io/name": self.uid})
        pod_template = k8s.V1PodTemplateSpec(spec=pod_spec,
                                             metadata=pod_metadata)
        selector = k8s.V1LabelSelector(
            match_labels={"app.kubernetes.io/name": self.uid})
        deployment_spec = k8s.V1DeploymentSpec(replicas=REPLICAS,
                                               selector=selector,
                                               template=pod_template)
        deployment_metadata = k8s.V1ObjectMeta(
            name=self.uid, labels={"app.kubernetes.io/name": self.uid})
        deployment = k8s.V1Deployment(spec=deployment_spec,
                                      metadata=deployment_metadata)
        return k8s.AppsV1Api(self.client).create_namespaced_deployment(
            self.namespace, deployment)
Example #6
    def submit(self):
        """
        Submit a build pod to create the image for the repository.

        Progress of the build can be monitored by listening for items in
        the Queue passed to the constructor as `q`.
        """
        volume_mounts = [
            client.V1VolumeMount(mount_path="/var/run/docker.sock",
                                 name="docker-socket")
        ]
        docker_socket_path = urlparse(self.docker_host).path
        volumes = [
            client.V1Volume(
                name="docker-socket",
                host_path=client.V1HostPathVolumeSource(
                    path=docker_socket_path, type="Socket"),
            )
        ]

        if self.push_secret:
            volume_mounts.append(
                client.V1VolumeMount(mount_path="/root/.docker",
                                     name="docker-config"))
            volumes.append(
                client.V1Volume(
                    name="docker-config",
                    secret=client.V1SecretVolumeSource(
                        secret_name=self.push_secret),
                ))

        env = []
        if self.git_credentials:
            env.append(
                client.V1EnvVar(name="GIT_CREDENTIAL_ENV",
                                value=self.git_credentials))

        self.pod = client.V1Pod(
            metadata=client.V1ObjectMeta(
                name=self.name,
                labels={
                    "name": self.name,
                    "component": self._component_label,
                },
                annotations={
                    "binder-repo": self.repo_url,
                },
            ),
            spec=client.V1PodSpec(
                containers=[
                    client.V1Container(
                        image=self.build_image,
                        name="builder",
                        args=self.get_cmd(),
                        volume_mounts=volume_mounts,
                        resources=client.V1ResourceRequirements(
                            limits={"memory": self.memory_limit},
                            requests={"memory": self.memory_request},
                        ),
                        env=env,
                    )
                ],
                tolerations=[
                    client.V1Toleration(
                        key="hub.jupyter.org/dedicated",
                        operator="Equal",
                        value="user",
                        effect="NoSchedule",
                    ),
                    # GKE currently does not permit creating taints on a node pool
                    # with a `/` in the key field
                    client.V1Toleration(
                        key="hub.jupyter.org_dedicated",
                        operator="Equal",
                        value="user",
                        effect="NoSchedule",
                    ),
                ],
                node_selector=self.node_selector,
                volumes=volumes,
                restart_policy="Never",
                affinity=self.get_affinity(),
            ),
        )

        try:
            _ = self.api.create_namespaced_pod(
                self.namespace,
                self.pod,
                _request_timeout=KUBE_REQUEST_TIMEOUT,
            )
        except client.rest.ApiException as e:
            if e.status == 409:
                # Someone else created it!
                app_log.info("Build %s already running", self.name)
                pass
            else:
                raise
        else:
            app_log.info("Started build %s", self.name)

        app_log.info("Watching build pod %s", self.name)
        while not self.stop_event.is_set():
            w = watch.Watch()
            try:
                for f in w.stream(
                        self.api.list_namespaced_pod,
                        self.namespace,
                        label_selector=f"name={self.name}",
                        timeout_seconds=30,
                        _request_timeout=KUBE_REQUEST_TIMEOUT,
                ):
                    if f["type"] == "DELETED":
                        # Assume this is a successful completion
                        self.progress(
                            ProgressEvent.Kind.BUILD_STATUS_CHANGE,
                            ProgressEvent.BuildStatus.COMPLETED,
                        )
                        return
                    self.pod = f["object"]
                    if not self.stop_event.is_set():
                        # Account for all the phases kubernetes pods can be in
                        # Pending, Running, Succeeded, Failed, Unknown
                        # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
                        phase = self.pod.status.phase
                        if phase == "Pending":
                            self.progress(
                                ProgressEvent.Kind.BUILD_STATUS_CHANGE,
                                ProgressEvent.BuildStatus.PENDING,
                            )
                        elif phase == "Running":
                            self.progress(
                                ProgressEvent.Kind.BUILD_STATUS_CHANGE,
                                ProgressEvent.BuildStatus.RUNNING,
                            )
                        elif phase == "Succeeded":
                            # Do nothing! We will clean this up, and send a 'Completed' progress event
                            # when the pod has been deleted
                            pass
                        elif phase == "Failed":
                            self.progress(
                                ProgressEvent.Kind.BUILD_STATUS_CHANGE,
                                ProgressEvent.BuildStatus.FAILED,
                            )
                        elif phase == "Unknown":
                            self.progress(
                                ProgressEvent.Kind.BUILD_STATUS_CHANGE,
                                ProgressEvent.BuildStatus.UNKNOWN,
                            )
                        else:
                            # This shouldn't happen, unless k8s introduces new Phase types
                            warnings.warn(
                                f"Found unknown phase {phase} when building {self.name}"
                            )

                    if self.pod.status.phase == "Succeeded":
                        self.cleanup()
                    elif self.pod.status.phase == "Failed":
                        self.cleanup()
            except Exception:
                app_log.exception("Error in watch stream for %s", self.name)
                raise
            finally:
                w.stop()
            if self.stop_event.is_set():
                app_log.info("Stopping watch of %s", self.name)
                return
Example #7
    def submit(self):
        """Submit a build pod to create the image for the repository."""
        volume_mounts = [
            client.V1VolumeMount(mount_path="/var/run/docker.sock",
                                 name="docker-socket")
        ]
        docker_socket_path = urlparse(self.docker_host).path
        volumes = [
            client.V1Volume(name="docker-socket",
                            host_path=client.V1HostPathVolumeSource(
                                path=docker_socket_path, type='Socket'))
        ]

        if self.push_secret:
            volume_mounts.append(
                client.V1VolumeMount(mount_path="/root/.docker",
                                     name='docker-push-secret'))
            volumes.append(
                client.V1Volume(name='docker-push-secret',
                                secret=client.V1SecretVolumeSource(
                                    secret_name=self.push_secret)))

        env = []
        if self.git_credentials:
            env.append(
                client.V1EnvVar(name='GIT_CREDENTIAL_ENV',
                                value=self.git_credentials))

        self.pod = client.V1Pod(
            metadata=client.V1ObjectMeta(
                name=self.name,
                labels={
                    "name": self.name,
                    "component": self._component_label,
                },
                annotations={
                    "binder-repo": self.repo_url,
                },
            ),
            spec=client.V1PodSpec(
                containers=[
                    client.V1Container(
                        image=self.build_image,
                        name="builder",
                        args=self.get_cmd(),
                        volume_mounts=volume_mounts,
                        resources=client.V1ResourceRequirements(
                            limits={'memory': self.memory_limit},
                            requests={'memory': self.memory_request},
                        ),
                        env=env)
                ],
                tolerations=[
                    client.V1Toleration(
                        key='hub.jupyter.org/dedicated',
                        operator='Equal',
                        value='user',
                        effect='NoSchedule',
                    ),
                    # GKE currently does not permit creating taints on a node pool
                    # with a `/` in the key field
                    client.V1Toleration(
                        key='hub.jupyter.org_dedicated',
                        operator='Equal',
                        value='user',
                        effect='NoSchedule',
                    ),
                ],
                node_selector=self.node_selector,
                volumes=volumes,
                restart_policy="Never",
                affinity=self.get_affinity()))

        try:
            _ = self.api.create_namespaced_pod(
                self.namespace,
                self.pod,
                _request_timeout=KUBE_REQUEST_TIMEOUT,
            )
        except client.rest.ApiException as e:
            if e.status == 409:
                # Someone else created it!
                app_log.info("Build %s already running", self.name)
                pass
            else:
                raise
        else:
            app_log.info("Started build %s", self.name)

        app_log.info("Watching build pod %s", self.name)
        while not self.stop_event.is_set():
            w = watch.Watch()
            try:
                for f in w.stream(
                        self.api.list_namespaced_pod,
                        self.namespace,
                        label_selector="name={}".format(self.name),
                        timeout_seconds=30,
                        _request_timeout=KUBE_REQUEST_TIMEOUT,
                ):
                    if f['type'] == 'DELETED':
                        self.progress('pod.phasechange', 'Deleted')
                        return
                    self.pod = f['object']
                    if not self.stop_event.is_set():
                        self.progress('pod.phasechange', self.pod.status.phase)
                    if self.pod.status.phase == 'Succeeded':
                        self.cleanup()
                    elif self.pod.status.phase == 'Failed':
                        self.cleanup()
            except Exception:
                app_log.exception("Error in watch stream for %s", self.name)
                raise
            finally:
                w.stop()
            if self.stop_event.is_set():
                app_log.info("Stopping watch of %s", self.name)
                return
Example #8
def clean_pod_template(pod_template, match_node_purpose="prefer", pod_type="worker"):
    """ Normalize pod template and check for type errors """
    if isinstance(pod_template, str):
        msg = (
            "Expected a kubernetes.client.V1Pod object, got %s"
            "If trying to pass a yaml filename then use "
            "KubeCluster.from_yaml"
        )
        raise TypeError(msg % pod_template)

    if isinstance(pod_template, dict):
        msg = (
            "Expected a kubernetes.client.V1Pod object, got %s"
            "If trying to pass a dictionary specification then use "
            "KubeCluster.from_dict"
        )
        raise TypeError(msg % str(pod_template))

    pod_template = copy.deepcopy(pod_template)

    # Make sure metadata / labels / env objects exist, so they can be modified
    # later without a lot of `is None` checks
    if pod_template.metadata is None:
        pod_template.metadata = client.V1ObjectMeta()
    if pod_template.metadata.labels is None:
        pod_template.metadata.labels = {}

    if pod_template.spec.containers[0].env is None:
        pod_template.spec.containers[0].env = []

    # add default tolerations
    tolerations = [
        client.V1Toleration(
            key="k8s.dask.org/dedicated",
            operator="Equal",
            value=pod_type,
            effect="NoSchedule",
        ),
        # GKE currently does not permit creating taints on a node pool
        # with a `/` in the key field
        client.V1Toleration(
            key="k8s.dask.org_dedicated",
            operator="Equal",
            value=pod_type,
            effect="NoSchedule",
        ),
    ]

    if pod_template.spec.tolerations is None:
        pod_template.spec.tolerations = tolerations
    else:
        pod_template.spec.tolerations.extend(tolerations)

    # add default node affinity to k8s.dask.org/node-purpose=worker
    if match_node_purpose != "ignore":
        # for readability
        affinity = pod_template.spec.affinity

        if affinity is None:
            affinity = client.V1Affinity()
        if affinity.node_affinity is None:
            affinity.node_affinity = client.V1NodeAffinity()

        # a common object for both a preferred and a required node affinity
        node_selector_term = client.V1NodeSelectorTerm(
            match_expressions=[
                client.V1NodeSelectorRequirement(
                    key="k8s.dask.org/node-purpose", operator="In", values=[pod_type]
                )
            ]
        )

        if match_node_purpose == "require":
            if (
                affinity.node_affinity.required_during_scheduling_ignored_during_execution
                is None
            ):
                affinity.node_affinity.required_during_scheduling_ignored_during_execution = client.V1NodeSelector(
                    node_selector_terms=[]
                )
            affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms.append(
                node_selector_term
            )
        elif match_node_purpose == "prefer":
            if (
                affinity.node_affinity.preferred_during_scheduling_ignored_during_execution
                is None
            ):
                affinity.node_affinity.preferred_during_scheduling_ignored_during_execution = (
                    []
                )
            preferred_scheduling_terms = [
                client.V1PreferredSchedulingTerm(
                    preference=node_selector_term, weight=100
                )
            ]
            affinity.node_affinity.preferred_during_scheduling_ignored_during_execution.extend(
                preferred_scheduling_terms
            )
        else:
            raise ValueError(
                'Attribute must be one of "ignore", "prefer", or "require".'
            )
        pod_template.spec.affinity = affinity

    return pod_template
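
A hedged sketch of exercising the extended signature, assuming `pod` is a V1Pod with at least one container as in the sketch after Example #1; "require" pins pods to nodes labelled k8s.dask.org/node-purpose=<pod_type>:

pod = clean_pod_template(pod, match_node_purpose="require", pod_type="scheduler")
terms = (pod.spec.affinity.node_affinity
         .required_during_scheduling_ignored_during_execution
         .node_selector_terms)
assert terms[-1].match_expressions[0].values == ["scheduler"]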
Example #9
def update_deploy_v2():
    data = json.loads(request.get_data().decode('UTF-8'))
    current_app.logger.debug("接受到的数据:{}".format(data))
    namespace = handle_input(data.get('namespace'))
    deploy_name = handle_input(data.get('deploy_name'))
    action = handle_input(data.get('action'))

    image = None
    replicas = None
    toleration = None
    pod_anti_affinity = None
    pod_affinity = None
    node_affinity = None
    labels = None
    if action == "add_pod_anti_affinity":
        print("正在运行{}操作".format(action))
        affinity = handle_input(data.get('pod_anti_affinity'))
        affinity_type = handle_input(affinity.get('type'))

        labelSelector = handle_input(affinity.get('labelSelector'))
        key = handle_input(affinity.get('key'))
        value = handle_input(affinity.get('value'))

        topologyKey = handle_input(affinity.get('topologyKey'))
        if affinity_type == "required":
            if labelSelector == "matchExpressions":
                if not isinstance(value, list):
                    value = [value]
                operator = handle_input(affinity.get('operator'))
                if operator != 'In' and operator != 'NotIn':
                    value = None
                print(value)
                label_selector = client.V1LabelSelector(match_expressions=[
                    client.V1LabelSelectorRequirement(
                        key=key, operator=operator, values=value)
                ])
            elif labelSelector == "matchLabels":
                if isinstance(value, list):
                    return jsonify(
                        {"error": "values cannot be a list in {} mode".format(labelSelector)})
                label_selector = client.V1LabelSelector(
                    match_labels={key: value})
            else:
                return jsonify(
                    {"error": "Unsupported labelSelector: {}".format(labelSelector)})
            pod_anti_affinity = client.V1PodAntiAffinity(
                required_during_scheduling_ignored_during_execution=[
                    client.V1PodAffinityTerm(label_selector=label_selector,
                                             topology_key=topologyKey)
                ])
            print("添加的互斥调度为:{}".format(pod_anti_affinity))
        elif affinity_type == "preferred":
            weight = string_to_int(handle_input(affinity.get('weight')))
            if weight is None:
                return jsonify(
                    {"error": "weight is required for the {} type".format(affinity_type)})

            if labelSelector == "matchExpressions":
                if not isinstance(value, list):
                    value = [value]

                operator = handle_input(affinity.get('operator'))
                if operator != 'In' and operator != 'NotIn':
                    value = None
                label_selector = client.V1LabelSelector(match_expressions=[
                    client.V1LabelSelectorRequirement(
                        key=key, operator=operator, values=value)
                ])
            elif labelSelector == "matchLabels":
                if isinstance(value, list):
                    return jsonify(
                        {"error": "values cannot be a list in {} mode".format(labelSelector)})
                label_selector = client.V1LabelSelector(
                    match_labels={key: value})
            else:
                return jsonify(
                    {"error": "Unsupported labelSelector: {}".format(labelSelector)})
            pod_anti_affinity = client.V1PodAntiAffinity(
                preferred_during_scheduling_ignored_during_execution=[
                    client.V1WeightedPodAffinityTerm(
                        pod_affinity_term=client.V1PodAffinityTerm(
                            label_selector=label_selector,
                            topology_key=topologyKey),
                        weight=weight)
                ])
            print("添加的互斥调度为:{}".format(pod_anti_affinity))
        else:
            return jsonify({"error": "不支持{}这种调度".format(affinity_type)})
    elif action == "delete_pod_anti_affinity":
        print("正在运行{}操作".format(action))
        pass
    elif action == "add_node_affinity":
        current_app.logger.debug("正在运行{}操作".format(action))
        affinity = handle_input(data.get('node_affinity'))
        node_affinity_type = handle_input(affinity.get('type'))

        nodeSelector = handle_input(affinity.get('nodeSelector'))
        key = handle_input(affinity.get('key'))
        value = handle_input(affinity.get('value'))
        operator = handle_input(affinity.get('operator'))
        values = []
        if operator in ('Exists', 'DoesNotExist'):
            values = None
        else:
            if not isinstance(value, list):
                values.append(value)
            else:
                values = value

        if node_affinity_type == "preferred":
            weight = string_to_int(handle_input(affinity.get('weight')))
            if weight is None:
                return simple_error_handle(
                    "weight is required for the {} type".format(node_affinity_type))
            preferred_term = []
            if nodeSelector == "matchExpressions":
                match_expressions = []
                expression = client.V1NodeSelectorRequirement(
                    key=key,
                    operator=operator,
                    values=values,
                )
                match_expressions.append(expression)
                preference = client.V1NodeSelectorTerm(
                    match_expressions=match_expressions)
            # nodeSelector == "matchFields"
            else:
                match_fields = []
                field = client.V1NodeSelectorRequirement(
                    key=key,
                    operator=operator,
                    values=values,
                )
                match_fields.append(field)
                preference = client.V1NodeSelectorTerm(
                    match_fields=match_fields)
            term = client.V1PreferredSchedulingTerm(
                weight=weight,
                preference=preference,
            )
            preferred_term.append(term)
            node_affinity = client.V1NodeAffinity(
                # append the preferred terms directly
                preferred_during_scheduling_ignored_during_execution=preferred_term)
        elif node_affinity_type == "required":
            current_app.logger.debug(
                "node_affinity_type:{}".format(node_affinity_type))
            node_selector_terms = []
            if nodeSelector == "matchExpressions":
                match_expressions = []
                expression = client.V1NodeSelectorRequirement(
                    key=key,
                    operator=operator,
                    values=values,
                )
                match_expressions.append(expression)
                term = client.V1NodeSelectorTerm(
                    match_expressions=match_expressions)
            else:
                match_fields = []
                field = client.V1NodeSelectorRequirement(
                    key=key,
                    operator=operator,
                    values=values,
                )
                match_fields.append(field)

                term = client.V1NodeSelectorTerm(match_fields=match_fields)
            node_selector_terms.append(term)
            node_affinity = client.V1NodeAffinity(
                required_during_scheduling_ignored_during_execution=(
                    client.V1NodeSelector(
                        node_selector_terms=node_selector_terms)))
        else:
            return simple_error_handle("不支持{}这种调度".format(node_affinity_type))
    elif action == "delete_node_affinity":
        print("正在运行{}操作".format(action))
        pass
    elif action == "add_toleration":
        print("正在运行{}操作".format(action))
        t = handle_input(data.get("toleration"))
        print(type(t), t)

        effect = t.get('effect')
        key = t.get('key')
        operator = t.get('operator')
        value = t.get('value')
        toleration_seconds = handle_toleraion_seconds(
            t.get('toleration_seconds'))
        print("toleration_seconds:{}".format(toleration_seconds))
        toleration = client.V1Toleration(effect=effect,
                                         key=key,
                                         operator=operator,
                                         toleration_seconds=toleration_seconds,
                                         value=value)
        print(toleration)
        if not toleration:
            msg = "{}需要提供toleration(effect,key,operator,value,)".format(action)
            return jsonify({"error": msg})
    elif action == "delete_toleration":
        print("正在运行{}操作".format(action))
        t = handle_input(data.get("toleration"))
        effect = handle_toleration_item(t.get('effect'))
        key = handle_toleration_item(t.get('key'))
        operator = handle_toleration_item(t.get('operator'))
        value = handle_toleration_item(t.get('value'))
        toleration_seconds = handle_toleraion_seconds(
            t.get('toleration_seconds'))
        print("toleration_seconds:{}".format(toleration_seconds))

        # if (effect != None and key != None and operator != None):
        toleration = client.V1Toleration(effect=effect,
                                         key=key,
                                         operator=operator,
                                         toleration_seconds=toleration_seconds,
                                         value=value)
        if not toleration:
            msg = "{}需要提供toleration(effect,key,operator,value,)".format(action)
            return jsonify({"error": msg})
    elif action == "add_pod_affinity":
        pass
    elif action == "delete_pod_affinity":
        pass
    elif action == "update_replicas":
        replicas = handle_input(data.get('replicas'))
        if not replicas:
            msg = "{}需要提供replicas".format(action)
            return jsonify({"error": msg})
    elif action == "update_image":
        project = handle_input(data.get('project'))
        env = handle_input(data.get('env'))
        imageRepo = handle_input(data.get('imageRepo'))
        imageName = handle_input(data.get('imageName'))
        imageTag = handle_input(data.get('imageTag'))
        if (imageRepo is not None and project is not None and env is not None
                and imageName is not None and imageTag is not None):
            image = "{}/{}-{}/{}:{}".format(imageRepo, project, env, imageName,
                                            imageTag)
        print("image值{}".format(image))
        if not image:
            msg = "{}需要提供image".format(action)
            return jsonify({"error": msg})
    elif action == "add_labels":
        pass
    elif action == "delete_labels":
        pass
    else:
        msg = "暂时不支持{}操作".format(action)
        print(msg)
        return jsonify({"error": msg})
    return update_deployment_v2(
        deploy_name=deploy_name, namespace=namespace, action=action,
        image=image, replicas=replicas, toleration=toleration,
        node_affinity=node_affinity, pod_anti_affinity=pod_anti_affinity,
        pod_affinity=pod_affinity, labels=labels)
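
For reference, a hedged sketch of the JSON body the add_toleration branch above expects; the field names are taken from the handler, the values are hypothetical:

payload = {
    "namespace": "default",
    "deploy_name": "myapp",
    "action": "add_toleration",
    "toleration": {
        "effect": "NoSchedule",
        "key": "dedicated",
        "operator": "Equal",
        "value": "gpu",
        "toleration_seconds": "300",
    },
}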
Example #10
def generate_pod():
    metadata = client.V1ObjectMeta(
        name="platform-app-958795556-2nqgj",
        namespace="production",
        generate_name="platform-app-958795556-",
        labels={
            "app": "platform",
            "chart": "platform",
            "component": "app",
            "heritage": "Helm",
            "pod-template-hash": "958795556",
            "release": "platform-production",
            "version": "1.0.3",
        },
        owner_references=[
            client.V1OwnerReference(
                api_version="apps/v1",
                kind="ReplicaSet",
                name="platform-app-958795556",
                uid="35ba938b-681d-11eb-a74a-16e1a04d726b",
                controller=True,
                block_owner_deletion=True,
            )
        ],
    )

    container = client.V1Container(
        name="app",
        image="platform.azurecr.io/app:master",
        image_pull_policy="Always",
        termination_message_policy="File",
        termination_message_path="/dev/termination-log",
        env=[],
        resources=client.V1ResourceRequirements(
            limits={
                "cpu": "1200m",
                "memory": "1Gi"
            },
            requests={
                "cpu": "1",
                "memory": "768Mi"
            },
        ),
        ports=[client.V1ContainerPort(container_port=3000, protocol="TCP")],
        volume_mounts=[
            client.V1VolumeMount(
                name="default-token-2cg25",
                read_only=True,
                mount_path="/var/run/secrets/kubernetes.io/serviceaccount",
            )
        ],
        liveness_probe=client.V1Probe(
            initial_delay_seconds=10,
            timeout_seconds=5,
            period_seconds=10,
            success_threshold=1,
            failure_threshold=6,
            http_get=client.V1HTTPGetAction(path="/health/liveness",
                                            port=3000,
                                            scheme="HTTP"),
        ),
        readiness_probe=client.V1Probe(
            initial_delay_seconds=10,
            timeout_seconds=5,
            period_seconds=10,
            success_threshold=2,
            failure_threshold=6,
            http_get=client.V1HTTPGetAction(path="/health/readness",
                                            port=3000,
                                            scheme="HTTP"),
        ),
    )

    spec = client.V1PodSpec(
        containers=[container],
        volumes=[
            client.V1Volume(
                name="default-token-2cg25",
                secret=client.V1SecretVolumeSource(
                    secret_name="default-token-2cg25", default_mode=420),
            )
        ],
        restart_policy="Always",
        termination_grace_period_seconds=30,
        dns_policy="ClusterFirst",
        service_account_name="default",
        service_account="default",
        node_name="aks-agentpool-26722002-vmss00039t",
        security_context=client.V1PodSecurityContext(run_as_user=1000,
                                                     fs_group=1000),
        scheduler_name="default-scheduler",
        tolerations=[
            client.V1Toleration(
                key="node.kubernetes.io/not-ready",
                operator="Exists",
                effect="NoExecute",
                toleration_seconds=300,
            ),
            client.V1Toleration(
                key="node.kubernetes.io/unreachable",
                operator="Exists",
                effect="NoExecute",
                toleration_seconds=300,
            ),
        ],
        priority=0,
        enable_service_links=True,
    )

    return client.V1Pod(metadata=metadata, spec=spec)
Example #11
def submit_job(args, command=None):
    container_image = args.container
    container_name = args.name

    body = client.V1Job(api_version="batch/v1", kind="Job", metadata=client.V1ObjectMeta(name=container_name))
    body.status = client.V1JobStatus()
    template = client.V1PodTemplate()

    labels = {
        'hugin-job': "1",
        'hugin-job-name': f'{container_name}'
    }
    template.template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels=labels)
    )

    tolerations = []
    env = []
    if args.environment:
        for env_spec in args.environment:
            env_name, env_value = env_spec.split("=", 1)
            env.append(client.V1EnvVar(name=env_name, value=env_value))

    container_args = dict(
        name=f"container-{container_name}",
        image=container_image,
        env=env,
    )

    if args.gpu:
        tolerations.append(client.V1Toleration(
            key='nvidia.com/gpu', operator='Exists', effect='NoSchedule'))
        container_args['resources'] = client.V1ResourceRequirements(
            limits={"nvidia.com/gpu": 1})
    if command or args.command:
        container_args['command'] = command if command else args.command

    container = client.V1Container(**container_args)
    pull_secrets = []
    if args.pull_secret is not None:
        pull_secrets.append(client.V1LocalObjectReference(name=args.pull_secret))
    pod_args = dict(containers=[container],
                    restart_policy='Never',
                    image_pull_secrets=pull_secrets)


    if tolerations:
        pod_args['tolerations'] = tolerations

    if args.node_selector is not None:
        parts = args.node_selector.split("=", 1)
        if len(parts) == 2:
            affinity = client.V1Affinity(
                node_affinity=client.V1NodeAffinity(
                    required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
                        node_selector_terms=[client.V1NodeSelectorTerm(
                            match_expressions=[client.V1NodeSelectorRequirement(
                                key=parts[0], operator='In', values=[parts[1]])]
                        )]
                    )
                )
            )
            pod_args['affinity'] = affinity

    template.template.spec = client.V1PodSpec(**pod_args)
    body.spec = client.V1JobSpec(ttl_seconds_after_finished=1800, template=template.template)
    try:
        api_response = batch_v1.create_namespaced_job("default", body, pretty=True)
        #print (api_response)
    except client.exceptions.ApiException as e:
        logging.critical(f"Failed to start job: {e.reason}")
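
A hedged invocation sketch; the attribute names on `args` mirror those the function reads, and it assumes the module has already created `batch_v1 = client.BatchV1Api()` from a loaded kubeconfig:

import argparse

args = argparse.Namespace(
    container="daskdev/dask:latest", name="demo-job",
    environment=["LOG_LEVEL=debug"], gpu=False, command=None,
    pull_secret=None, node_selector="pool=batch")
submit_job(args)  # creates the batch/v1 Job "demo-job" in the default namespace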
Example #12
class K8sJobEngine():

    def __init__(self):
        self.log = get_logger(self.__class__.__name__)
        if kalytical_config.kalytical_endpoint is None:
            # This is the API endpoint we send back to the pod for a callback/interaction during pipeline running. It may be behind a load balancer/DNS - i.e. it can't communicate with local host)
            raise ConfigException(
                "Config is missing parameter for kalytical API endpoint!")
        try:
            # Defaults to the service account assigned to the pod
            config.load_incluster_config()
        except ConfigException:
            self.log.warning(
                f"Could not load kube configuration from pod! Attempting to configure client with local kubeconfig={config.KUBE_CONFIG_DEFAULT_LOCATION}")
            config.load_kube_config()
        self._k8s_core_client = client.CoreV1Api()

        self._running_job_list = None

    @staticmethod
    def _get_default_container_args() -> list:
        return kalytical_config.k8spodengine_default_container_args

    @staticmethod
    def _get_default_container_uri() -> str:
        return kalytical_config.default_pipeline_image_uri

    async def submit_job(self, header_model: PipelineHeaderModel, exec_uuid: str, source_uuid: str = None, retry_count: int = 0) -> RunningPipelineModel:
        self.log.info(
            f"Attempting to submit pod for pipeline_uuid={header_model.pipeline_uuid}")
        job_pod = self.marshall_k8s_pod(
            header_model=header_model, exec_uuid=exec_uuid, source_uuid=source_uuid, retry_count=retry_count)
        # TODO Handle cases where pod create fails - i.e. resource starvation
        pod_resp = self._k8s_core_client.create_namespaced_pod(
            namespace=kalytical_config.k8spodengine_k8s_namespace, body=job_pod)

        return self.unmarshall_pod(pod_obj=pod_resp)

    def marshall_k8s_pod(self, header_model: PipelineHeaderModel, exec_uuid: str, source_uuid: str = None, retry_count: int = 0) -> client.V1Pod:
        common_job_name = '-'.join(
            [exec_uuid, header_model.pipeline_uuid, str(retry_count)])
        if 'pipeline_args' in header_model.engine_args.keys():
            pipeline_args = header_model.engine_args['pipeline_args']
        else:
            pipeline_args = self._get_default_container_args()

        if 'pipeline_command' in header_model.engine_args.keys():
            pipeline_command = header_model.engine_args['pipeline_command']
        else:
            pipeline_command = self._get_default_container_command()

        if 'pipeline_image' in header_model.engine_args.keys():
            pipeline_image = header_model.engine_args['pipeline_image']
        else:
            pipeline_image = self._get_default_container_uri()

        container_spec = client.V1Container(
            name=common_job_name,
            image=pipeline_image,
            args=pipeline_args,
            command=pipeline_command,
            env=[
                client.V1EnvVar(name="PIPELINE_UUID",
                                value=header_model.pipeline_uuid),
                client.V1EnvVar(name="SOURCE_UUID", value=json.dumps(
                    json.dumps(source_uuid))),
                client.V1EnvVar(name="EXEC_UUID", value=exec_uuid),
                client.V1EnvVar(name="RETRY_COUNT", value=str(retry_count)),
                client.V1EnvVar(name="MQ_CALLBACK_URL",
                                value=kalytical_config.mq_url),
                client.V1EnvVar(name="KALYTICAL_AUTH_SECRET",
                                value=kalytical_config.api_secret),
                client.V1EnvVar(name="KALYTICAL_API_ENDPOINT",
                                value=kalytical_config.api_endpoint)

            ],

            resources=client.V1ResourceRequirements(
                limits={'cpu': header_model.engine_args['cpu_count'],
                        'memory': header_model.engine_args['memory_gi']}))

        # TODO Tolerations and selectors might not work for a generic use case
        pod_spec = client.V1PodSpec(
            service_account_name=kalytical_config.k8spodengine_svc_account_name,
            node_selector={
                "kalytical.k8s.node/workload": "pipeline",
                "beta.kubernetes.io/instance-type":
                    header_model.engine_args['instance_type']},
            tolerations=[client.V1Toleration(
                key="node.kubernetes.io/pipeline",
                operator="Exists",
                effect='NoSchedule')],
            security_context=client.V1PodSecurityContext(fs_group=100),
            restart_policy='Never',
            containers=[container_spec])

        return client.V1Pod(
            spec=pod_spec,
            metadata=client.V1ObjectMeta(
                name=common_job_name,
                labels={"pod_source": "kalytical",
                        "exec_uuid": exec_uuid,
                        "pipeline_uuid": header_model.pipeline_uuid}))
Example #13
def make_pod_spec(
    image,
    labels={},
    threads_per_worker=1,
    env={},
    extra_container_config={},
    extra_pod_config={},
    memory_limit=None,
    memory_request=None,
    cpu_limit=None,
    cpu_request=None,
):
    """
    Create generic pod template from input parameters

    Examples
    --------
    >>> make_pod_spec(image='daskdev/dask:latest', memory_limit='4G', memory_request='4G')
    """
    args = [
        'dask-worker',
        '$(DASK_SCHEDULER_ADDRESS)',
        '--nthreads',
        str(threads_per_worker),
        '--death-timeout',
        '60',
    ]
    if memory_limit:
        args.extend(['--memory-limit', str(memory_limit)])
    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(labels=labels),
        spec=client.V1PodSpec(
            restart_policy='Never',
            containers=[
                client.V1Container(
                    name='dask-worker',
                    image=image,
                    args=args,
                    env=[
                        client.V1EnvVar(name=k, value=v)
                        for k, v in env.items()
                    ],
                )
            ],
            tolerations=[
                client.V1Toleration(
                    key='k8s.dask.org/dedicated',
                    operator='Equal',
                    value='worker',
                    effect='NoSchedule',
                ),
                # GKE currently does not permit creating taints on a node pool
                # with a `/` in the key field
                client.V1Toleration(
                    key='k8s.dask.org_dedicated',
                    operator='Equal',
                    value='worker',
                    effect='NoSchedule',
                ),
            ]))

    resources = client.V1ResourceRequirements(limits={}, requests={})

    if cpu_request:
        resources.requests['cpu'] = cpu_request
    if memory_request:
        resources.requests['memory'] = memory_request

    if cpu_limit:
        resources.limits['cpu'] = cpu_limit
    if memory_limit:
        resources.limits['memory'] = memory_limit

    pod.spec.containers[0].resources = resources

    for key, value in extra_container_config.items():
        _set_k8s_attribute(pod.spec.containers[0], key, value)

    for key, value in extra_pod_config.items():
        _set_k8s_attribute(pod.spec, key, value)
    return pod
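
A hedged usage sketch building on the docstring example; the EXTRA_PIP_PACKAGES variable is illustrative, not required by the function:

pod = make_pod_spec(
    image='daskdev/dask:latest',
    memory_limit='4G', memory_request='4G',
    cpu_limit='1', cpu_request='1',
    env={'EXTRA_PIP_PACKAGES': 'git+https://github.com/dask/distributed'})
print(pod.spec.containers[0].resources.limits)  # {'cpu': '1', 'memory': '4G'}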
Example #14
    def from_runs(cls, id: str, runs: List[Run]):
        k8s_name = 'tensorboard-' + id
        run_names_hash = K8STensorboardInstance.generate_run_names_hash(runs)

        volume_mounts = []

        for run in runs:
            mount = k8s.V1VolumeMount(
                name=cls.EXPERIMENTS_OUTPUT_VOLUME_NAME,
                mount_path=os.path.join(
                    cls.TENSORBOARD_CONTAINER_MOUNT_PATH_PREFIX, run.owner,
                    run.name),
                sub_path=os.path.join(run.owner, run.name))
            volume_mounts.append(mount)

        deployment_labels = {
            'name': k8s_name,
            'type': 'nauta-tensorboard',
            'nauta_app_name': 'tensorboard',
            'id': id,
            'runs-hash': run_names_hash
        }

        tensorboard_command = [
            "tensorboard", "--logdir",
            cls.TENSORBOARD_CONTAINER_MOUNT_PATH_PREFIX, "--port", "6006",
            "--host", "127.0.0.1"
        ]

        nauta_config = NautaPlatformConfig.incluster_init()

        tensorboard_image = nauta_config.get_tensorboard_image()
        tensorboard_proxy_image = nauta_config.get_activity_proxy_image()

        deployment = k8s.V1Deployment(
            api_version='apps/v1',
            kind='Deployment',
            metadata=k8s.V1ObjectMeta(name=k8s_name, labels=deployment_labels),
            spec=k8s.V1DeploymentSpec(
                replicas=1,
                selector=k8s.V1LabelSelector(match_labels=deployment_labels),
                template=k8s.V1PodTemplateSpec(
                    metadata=k8s.V1ObjectMeta(labels=deployment_labels),
                    spec=k8s.V1PodSpec(
                        tolerations=[
                            k8s.V1Toleration(key='master',
                                             operator='Exists',
                                             effect='NoSchedule')
                        ],
                        affinity=k8s.V1Affinity(
                            node_affinity=k8s.V1NodeAffinity(
                                required_during_scheduling_ignored_during_execution=k8s.V1NodeSelector(
                                    node_selector_terms=[
                                        k8s.V1NodeSelectorTerm(
                                            match_expressions=[
                                                k8s.V1NodeSelectorRequirement(
                                                    key="master",
                                                    operator="In",
                                                    values=["True"])
                                            ])
                                    ]))),
                        containers=[
                            k8s.V1Container(name='app',
                                            image=tensorboard_image,
                                            command=tensorboard_command,
                                            volume_mounts=volume_mounts),
                            k8s.V1Container(
                                name='proxy',
                                image=tensorboard_proxy_image,
                                ports=[k8s.V1ContainerPort(container_port=80)],
                                readiness_probe=k8s.V1Probe(
                                    period_seconds=5,
                                    http_get=k8s.V1HTTPGetAction(
                                        path='/healthz', port=80)))
                        ],
                        volumes=[
                            k8s.V1Volume(
                                name=cls.EXPERIMENTS_OUTPUT_VOLUME_NAME,
                                persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(
                                    claim_name=cls.EXPERIMENTS_OUTPUT_VOLUME_NAME,
                                    read_only=True))
                        ]))))

        service = k8s.V1Service(
            api_version='v1',
            kind='Service',
            metadata=k8s.V1ObjectMeta(name=k8s_name,
                                      labels={
                                          'name': k8s_name,
                                          'type': 'nauta-tensorboard',
                                          'nauta_app_name': 'tensorboard',
                                          'id': id
                                      }),
            spec=k8s.V1ServiceSpec(
                type='ClusterIP',
                ports=[k8s.V1ServicePort(name='web', port=80, target_port=80)],
                selector={
                    'name': k8s_name,
                    'type': 'nauta-tensorboard',
                    'nauta_app_name': 'tensorboard',
                    'id': id
                }))

        ingress = k8s.V1beta1Ingress(
            api_version='extensions/v1beta1',
            kind='Ingress',
            metadata=k8s.V1ObjectMeta(
                name=k8s_name,
                labels={
                    'name': k8s_name,
                    'type': 'nauta-tensorboard',
                    'nauta_app_name': 'tensorboard',
                    'id': id
                },
                annotations={
                    'nauta.ingress.kubernetes.io/rewrite-target': '/',
                    'kubernetes.io/ingress.class': 'nauta-ingress'
                }),
            spec=k8s.V1beta1IngressSpec(rules=[
                k8s.V1beta1IngressRule(
                    host='localhost',
                    http=k8s.V1beta1HTTPIngressRuleValue(paths=[
                        k8s.V1beta1HTTPIngressPath(
                            path='/tb/' + id + "/",
                            backend=k8s.V1beta1IngressBackend(
                                service_name=k8s_name, service_port=80))
                    ]))
            ]))

        return cls(deployment=deployment, service=service, ingress=ingress)