Example No. 1
    def manifest(self, namespace, name, app_type, target, **kwargs):
        min_replicas = kwargs.get('min')
        max_replicas = kwargs.get('max')
        cpu_percent = kwargs.get('cpu_percent')

        if min_replicas < 1:
            raise KubeException('min replicas needs to be 1 or higher')

        if max_replicas < min_replicas:
            raise KubeException(
                'max replicas can not be smaller than min replicas')

        labels = {
            'app': namespace,
            'type': app_type,
            'heritage': 'drycc',
        }

        manifest = {
            'kind': 'HorizontalPodAutoscaler',
            'apiVersion': self.api_version,
            'metadata': {
                'name': name,
                'namespace': namespace,
                'labels': labels,
            },
            'spec': {
                'minReplicas': min_replicas,
                'maxReplicas': max_replicas,
            }
        }

        if self.version() >= parse("1.3.0"):
            manifest['spec']['targetCPUUtilizationPercentage'] = cpu_percent

            manifest['spec']['scaleTargetRef'] = {
                'apiVersion': target['apiVersion'],
                # only works with Deployments, RS and RC
                'kind': target['kind'],
                'name': target['metadata']['name'],
            }
        elif self.version() <= parse("1.2.0"):
            # the API changed between versions
            manifest['spec']['cpuUtilization'] = {
                'targetPercentage': cpu_percent
            }

            manifest['spec']['scaleRef'] = {
                # only works with Deployments, RS and RC
                'kind': target['kind'],
                'name': target['metadata']['name'],
                # the resource of the above which does the scale action
                'subresource': 'scale',
            }

        return manifest
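
A minimal usage sketch of the builder above, assuming the class is exposed as an `hpa` attribute on a scheduler client object (that attribute, the `scheduler` object and the Deployment target below are illustrative, not taken from this listing):

    # hypothetical target: an existing Deployment to autoscale
    target = {
        'apiVersion': 'apps/v1',
        'kind': 'Deployment',
        'metadata': {'name': 'myapp-web'},
    }
    hpa = scheduler.hpa.manifest(
        'myapp', 'myapp-web', 'web', target,
        min=1, max=4, cpu_percent=80)
    # on clusters at or above 1.3.0 the spec carries
    # targetCPUUtilizationPercentage and scaleTargetRef
    assert hpa['spec']['maxReplicas'] == 4
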
Example No. 2
    def _handle_not_ready_pods(self, namespace, labels):
        """
        Detects if any pod is in the Running phase but not Ready and handles
        any potential issues around that mainly failed healthcheks
        """
        pods = self.get(namespace, labels=labels).json()['items']
        if not pods:
            pods = []
        for pod in pods:
            # only care about pods that are in running phase
            if pod['status']['phase'] != 'Running':
                continue

            name = '{}-{}'.format(pod['metadata']['labels']['app'],
                                  pod['metadata']['labels']['type'])  # noqa
            # find the right container in case there are many on the pod
            container = self.find_container(name,
                                            pod['status']['containerStatuses'])
            # containerStatuses[].ready is a JSON boolean, not the string 'true'
            if container is None or container['ready']:
                continue

            for event in self.events(pod):
                if event['reason'] == 'Unhealthy':
                    # strip out whitespaces on either side
                    message = "\n".join(
                        [x.strip() for x in event['message'].split("\n")])
                    raise KubeException(message)
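
The `find_container` helper is not part of this listing; a plausible sketch of what it does, assuming containers are named after the same `<app>-<type>` pair used to build `name` above (the implementation is an assumption):

    def find_container(self, name, containers):
        """Return the containerStatuses entry matching `name`, else None."""
        for container in containers:
            if container['name'] == name:
                return container
        return None
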
Example No. 3
    def _handle_pod_errors(self, pod, reason, message):
        """
        Handle potential pod errors based on the Pending
        reason passed into the function

        Images, FailedScheduling and others are needed
        """
        # image error reported on the container level
        container_errors = [
            'Pending',  # often an indication of deeper inspection is needed
            'ErrImagePull',
            'ImagePullBackOff',
            'RegistryUnavailable',
            'ErrImageInspect',
        ]
        # Image event reason mapping
        event_errors = {
            "Failed": "FailedToPullImage",
            "InspectFailed": "FailedToInspectImage",
            "ErrImageNeverPull": "ErrImageNeverPullPolicy",
            # Not including this one for now as the message is not useful
            # "BackOff": "BackOffPullImage",
        }
        # We want to be able to ignore pod scheduling errors as they might be temporary
        if not os.environ.get("DEIS_IGNORE_SCHEDULING_FAILURE", False):
            # FailedScheduling usually relates to resource limits
            event_errors["FailedScheduling"] = "FailedScheduling"

        # Nicer error than the one from the event.
        # Often this becomes ImagePullBackOff before we can introspect, though.
        if reason == 'ErrImagePull':
            raise KubeException(message)

        # collect all error messages of worth
        messages = []
        if reason in container_errors:
            for event in self.events(pod):
                if event['reason'] in event_errors.keys():
                    # only show a given error once
                    event_errors.pop(event['reason'])
                    # strip out whitespaces on either side
                    message = "\n".join(
                        [x.strip() for x in event['message'].split("\n")])
                    messages.append(message)

        if messages:
            raise KubeException("\n".join(messages))
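
The `self.events(pod)` helper used here is not shown either; a hedged sketch of how it could be built on top of the namespaced events endpoint, mirroring the field selector used in the ReplicaSet examples further down (the method body and the `self.ns.events` call are assumptions):

    def events(self, pod):
        """Sketch: list events whose involvedObject is this pod."""
        fields = {
            'involvedObject.kind': 'Pod',
            'involvedObject.name': pod['metadata']['name'],
            'involvedObject.namespace': pod['metadata']['namespace'],
            'involvedObject.uid': pod['metadata']['uid'],
        }
        response = self.ns.events(pod['metadata']['namespace'], fields=fields)
        return response.json().get('items', [])
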
Example No. 4
    def deploy(self, namespace, name, image, entrypoint, command,
               **kwargs):  # noqa
        """Deploy Deployment depending on what's requested"""
        app_type = kwargs.get('app_type')
        version = kwargs.get('version')
        spec_annotations = {}

        # If an RC already exists then stop processing of the deploy
        try:
            # construct old school RC name
            rc_name = '{}-{}-{}'.format(namespace, version, app_type)
            self.rc.get(namespace, rc_name)
            self.log(namespace,
                     'RC {} already exists. Stopping deploy'.format(rc_name))
            return
        except KubeHTTPException:
            # if RC doesn't exist then let the app continue
            pass

        # create a deployment if missing, otherwise update to trigger a release
        try:
            # labels that represent the pod(s)
            labels = {
                'app': namespace,
                'version': version,
                'type': app_type,
                'heritage': 'deis',
            }
            # this depends on the deployment object having the latest information
            deployment = self.deployment.get(namespace, name).json()
            # a hack to carry the spec annotations on the deployment object
            # over to the next release; spec_annotations stays {} when none exist
            template_metadata = deployment['spec']['template']['metadata']
            if 'annotations' in template_metadata:
                spec_annotations = template_metadata['annotations']
            if deployment['spec']['template']['metadata']['labels'] == labels:
                self.log(
                    namespace,
                    'Deployment {} with release {} already exists. Stopping deploy'
                    .format(name, version))  # noqa
                return
        except KubeException:
            # create the initial deployment object (and the first revision)
            self.deployment.create(namespace, name, image, entrypoint, command,
                                   spec_annotations, **kwargs)
        else:
            try:
                # kick off a new revision of the deployment
                self.deployment.update(namespace, name, image, entrypoint,
                                       command, spec_annotations, **kwargs)
            except KubeException as e:
                raise KubeException(
                    'There was a problem while deploying {} of {}-{}. '
                    "Additional information:\n{}".format(
                        version, namespace, app_type, str(e))) from e
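
A hedged usage sketch of the method above; the `scheduler` object, image reference and keyword arguments are illustrative only:

    # hypothetical call: roll out version v23 of the web process
    scheduler.deploy(
        'myapp', 'myapp-web', 'registry.example.com/myapp:v23',
        entrypoint='/bin/sh', command=['-c', 'gunicorn app.wsgi'],
        app_type='web', version='v23')
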
Example No. 5
    def run(self, namespace, name, image, entrypoint, command, **kwargs):
        """Run a one-off command."""
        self.log(
            namespace, 'run {}, img {}, entrypoint {}, cmd "{}"'.format(
                name, image, entrypoint, command))

        # run pods never restart
        kwargs['restartPolicy'] = 'Never'
        kwargs['command'] = entrypoint
        kwargs['args'] = command

        self.pod.create(namespace, name, image, **kwargs)

        try:
            # give the pod 20 minutes to execute (after it reaches the Ready state)
            # this is a fairly arbitrary limit but the gunicorn workers / LBs
            # will enforce a timeout of around 20 minutes anyway.
            # TODO: Revisit in the future so it can run longer
            state = 'up'  # pod is still running
            waited = 0
            timeout = 1200  # 20 minutes
            while state == 'up' and waited < timeout:
                pod = self.pod.get(namespace, name).json()
                state = str(self.pod.state(pod))
                # default data
                exit_code = 0

                waited += 1
                time.sleep(1)

            if state == 'down':  # run finished successfully
                exit_code = 0  # successful run
            elif state == 'crashed':  # run failed
                pod_state = pod['status']['containerStatuses'][0]['state']
                exit_code = pod_state['terminated']['exitCode']

            # timed out!
            if waited == timeout:
                raise KubeException('Timed out (20 mins) while running')

            # check if it is possible to get logs
            state = self.pod.state(self.pod.get(namespace, name).json())
            # States below up do not have logs
            if not isinstance(state, PodState) or state < PodState.up:
                return exit_code, 'Could not get logs. Pod is in state {}'.format(
                    str(state))

            # grab log information
            log = self.pod.logs(namespace, name)
            log.encoding = 'utf-8'  # defaults to "ISO-8859-1" otherwise...

            return exit_code, log.text
        finally:
            # cleanup
            self.pod.delete(namespace, name)
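
The `state < PodState.up` comparison only works if PodState is an ordered enum. A minimal sketch of that assumption; the member names and values here are illustrative and not taken from the listing:

    import enum

    class PodState(enum.IntEnum):
        """Illustrative ordering; anything below `up` has no logs to fetch."""
        initializing = 1
        creating = 2
        starting = 3
        up = 4
        down = 5
        crashed = 6

        def __str__(self):
            # allows the str(state) == 'down' style checks used above
            return self.name
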
Example No. 6
    def http_delete(self, path, **kwargs):
        """
        Make a DELETE request to the k8s server.
        """
        try:
            url = urljoin(self.url, path)
            response = self.session.delete(url, **kwargs)
        except requests.exceptions.ConnectionError as err:
            # reraise as KubeException, but log stacktrace.
            message = "There was a problem deleting data from " \
                      "the Kubernetes API server. URL: {}".format(url)
            logger.error(message)
            raise KubeException(message) from err

        return response
Example No. 7
    def http_get(self, path, params=None, **kwargs):
        """
        Make a GET request to the k8s server.
        """
        try:
            url = urljoin(self.url, path)
            response = self.session.get(url, params=params, **kwargs)
        except requests.exceptions.ConnectionError as err:
            # reraise as KubeException, but log stacktrace.
            message = "There was a problem retrieving data from " \
                      "the Kubernetes API server. URL: {}, params: {}".format(url, params)
            logger.error(message)
            raise KubeException(message) from err

        return response
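
A short usage sketch; the `client` object, API path and label selector are illustrative values, not part of this listing:

    # hypothetical: list the pods of one application via a label selector
    response = client.http_get(
        '/api/v1/namespaces/myapp/pods',
        params={'labelSelector': 'app=myapp,type=web'})
    response.raise_for_status()
    pods = response.json().get('items', [])
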
Example No. 8
    def http_put(self, path, data=None, **kwargs):
        """
        Make a PUT request to the k8s server.
        """
        try:
            url = urljoin(self.url, path)
            response = self.session.put(url, data=data, **kwargs)
        except requests.exceptions.ConnectionError as err:
            # reraise as KubeException, but log stacktrace.
            message = "There was a problem putting data to " \
                      "the Kubernetes API server. URL: {}, " \
                      "data: {}".format(url, data)
            logger.error(message)
            raise KubeException(message) from err

        return response
Example No. 9
 def _check_for_failed_events(self, namespace, labels):
     """
     Request for new ReplicaSet of Deployment and search for failed events involved by that RS
     Raises: KubeException when RS have events with FailedCreate reason
     """
     response = self.rs.get(namespace, labels=labels)
     data = response.json()
     fields = {
         'involvedObject.kind': 'ReplicaSet',
         'involvedObject.name': data['items'][0]['metadata']['name'],
         'involvedObject.namespace': namespace,
         'involvedObject.uid': data['items'][0]['metadata']['uid'],
     }
     events_list = self.ns.events(namespace, fields=fields).json()
     events = events_list.get('items', [])
     if events is not None and len(events) != 0:
         for event in events:
             if event['reason'] == 'FailedCreate':
                 log = self._get_formatted_messages(events)
                 self.log(namespace, log)
                 raise KubeException(log)
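
`_get_formatted_messages` is not included in this listing; a plausible sketch, assuming it simply joins each event's reason and message into one log string:

    def _get_formatted_messages(self, events):
        """Sketch: flatten event reasons and messages into a single string."""
        return "\n".join(
            '{}: {}'.format(event['reason'], event['message'].strip())
            for event in events)
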
Example No. 10
    def http_patch(self, path, data=None, **kwargs):
        """
        Make a PATCH request to the k8s server.
        """
        try:
            url = urljoin(self.url, path)
            # accepted media types include:
            # application/json-patch+json,
            # application/merge-patch+json,
            # application/apply-patch+yaml
            # self.session.headers["Content-Type"] = "application/json-patch+json"
            response = self.session.patch(url, data=data, **kwargs)
        except requests.exceptions.ConnectionError as err:
            # reraise as KubeException, but log stacktrace.
            message = "There was a problem patching data to " \
                      "the Kubernetes API server. URL: {}, " \
                      "data: {}".format(url, data)
            logger.error(message)
            raise KubeException(message) from err

        return response
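
The commented-out header line hints at the media types the API server accepts for PATCH. With `requests`, the content type can also be passed per call instead of mutating the session; a sketch with an illustrative path and body:

    import json

    # hypothetical merge-patch that scales a Deployment to 3 replicas
    body = json.dumps({'spec': {'replicas': 3}})
    response = client.http_patch(
        '/apis/apps/v1/namespaces/myapp/deployments/myapp-web',
        data=body,
        headers={'Content-Type': 'application/merge-patch+json'})
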
Example No. 11
 def _check_for_failed_events(self, namespace, labels):
     """
     Request for new ReplicaSet of Deployment and search for failed events involved by that RS
     Raises: KubeException when RS have events with FailedCreate reason
     """
     max_retries = 3
     retry_sleep_sec = 3.0
     for try_ in range(max_retries):
         response = self.rs.get(namespace, labels=labels)
         data = response.json()
         try:
             fields = {
                 'involvedObject.kind': 'ReplicaSet',
                 'involvedObject.name':
                 data['items'][0]['metadata']['name'],
                 'involvedObject.namespace': namespace,
                 'involvedObject.uid': data['items'][0]['metadata']['uid'],
             }
         except Exception as e:
             if try_ + 1 < max_retries:
                 self.log(
                     namespace,
                     "Got an empty ReplicaSet list. Trying one more time. {}"
                     .format(json.dumps(labels)))
                 time.sleep(retry_sleep_sec)
                 continue
             self.log(
                 namespace, "Did not find the ReplicaSet for {}".format(
                     json.dumps(labels)), "WARN")
             raise e
         events_list = self.ns.events(namespace, fields=fields).json()
         events = events_list.get('items', [])
         if events is not None and len(events) != 0:
             for event in events:
                 if event['reason'] == 'FailedCreate':
                     log = self._get_formatted_messages(events)
                     self.log(namespace, log)
                     raise KubeException(log)
Example No. 12
    def manifest(self, namespace, name, data, secret_type='Opaque', labels={}):
        secret_types = ['Opaque', 'kubernetes.io/dockerconfigjson']
        if secret_type not in secret_types:
            raise KubeException(
                '{} is not a supported secret type. '
                'Use one of the following: {}'.format(
                    secret_type, ', '.join(secret_types)))

        manifest = {
            'kind': 'Secret',
            'apiVersion': 'v1',
            'metadata': {
                'name': name,
                'namespace': namespace,
                'labels': {
                    'app': namespace,
                    'heritage': 'drycc'
                }
            },
            'type': secret_type,
            'data': {}
        }

        # add in any additional label info
        manifest['metadata']['labels'].update(labels)

        for key, value in data.items():
            if value is None:
                manifest['data'].update({key: ''})
                continue

            value = value if isinstance(value, bytes) else bytes(
                str(value), 'UTF-8')
            item = base64.b64encode(value).decode(encoding='UTF-8')
            manifest['data'].update({key: item})

        return manifest
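
A quick usage sketch showing the base64 encoding the builder applies; the `scheduler.secret` attribute and the data values are illustrative:

    import base64

    secret = scheduler.secret.manifest(
        'myapp', 'database-creds', {'password': 's3cr3t', 'port': 5432})
    # values arrive base64-encoded, ready for the Secret API
    assert base64.b64decode(secret['data']['password']) == b's3cr3t'
    assert base64.b64decode(secret['data']['port']) == b'5432'
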
Example No. 13
    def run(self, namespace, name, image, entrypoint, command, **kwargs):
        """Run a one-off command."""
        self.log(namespace, 'run {}, img {}, entrypoint {}, cmd "{}"'.format(
            name, image, entrypoint, command)
        )

        # force the app_type
        kwargs['app_type'] = 'run'
        # run pods never restart
        kwargs['restartPolicy'] = 'Never'
        kwargs['command'] = entrypoint
        kwargs['args'] = command

        # create application config and build the pod manifest
        self.set_application_config(namespace, kwargs.get('envs', {}), kwargs.get('version'))
        manifest = self.pod.manifest(namespace, name, image, **kwargs)

        url = self.pods.api("/namespaces/{}/pods", namespace)
        response = self.http_post(url, json=manifest)
        if self.unhealthy(response.status_code):
            raise KubeHTTPException(response, 'create Pod in Namespace "{}"', namespace)

        # wait for run pod to start - use the same function as scale
        labels = manifest['metadata']['labels']
        containers = manifest['spec']['containers']
        self.pods.wait_until_ready(
            namespace,
            containers,
            labels,
            desired=1,
            timeout=kwargs.get('deploy_timeout')
        )

        try:
            # give the pod 20 minutes to execute (after it reaches the Ready state)
            # this is a fairly arbitrary limit but the gunicorn workers / LBs
            # will enforce a timeout of around 20 minutes anyway.
            # TODO: Revisit in the future so it can run longer
            state = 'up'  # pod is still running
            waited = 0
            timeout = 1200  # 20 minutes
            while state == 'up' and waited < timeout:
                response = self.pod.get(namespace, name)
                pod = response.json()
                state = str(self.pod.state(pod))
                # default data
                exit_code = 0

                waited += 1
                time.sleep(1)

            if state == 'down':  # run finished successfully
                exit_code = 0  # successful run
            elif state == 'crashed':  # run failed
                pod_state = pod['status']['containerStatuses'][0]['state']
                exit_code = pod_state['terminated']['exitCode']

            # timed out!
            if waited == timeout:
                raise KubeException('Timed out (20 mins) while running')

            # check if it is possible to get logs
            state = self.pod.state(self.pod.get(namespace, name).json())
            # States below up do not have logs
            if not isinstance(state, PodState) or state < PodState.up:
                return exit_code, 'Could not get logs. Pod is in state {}'.format(str(state))

            # grab log information
            log = self.pod.logs(namespace, name)
            log.encoding = 'utf-8'  # defaults to "ISO-8859-1" otherwise...

            return exit_code, log.text
        finally:
            # cleanup
            self.pod.delete(namespace, name)