Пример #1
0
    def validate_running_pod(self) -> None:
        """
        validate_running_pod

        Check whether the Pod we previously started still exists. If not,
        assume the job was killed without being processed by the
        operator (or was never started) and clean up. Mark as failed.

        If the Pod exists, report the phase recorded against the item.

        Returns:
        - None if no pod is expected (the 'pod' status is unset)

        Raises:
        - ProcessingComplete if a pod is expected but cannot be found
          (raised after the status has been cleaned up)
        - ProcessingComplete describing the recorded phase if a pod is
          expected and exists
        """
        # TODO: what if a pod is running, but the operator doesn't expect one?
        curpod = self.get_status('pod')
        curitem = self.get_status('currently_running')
        if not curpod:
            return None

        try:
            pod = Pod.objects(
                self.api, namespace=self.namespace).get_by_name(curpod).obj
        except pykube.exceptions.ObjectDoesNotExist:
            self.info(f'pod {curpod} missing/deleted, cleaning up')
            self.set_status('currently_running')
            self.set_status('pod')
            self.set_status('state', 'missing')
            self.items.mark_failed(curitem)
            self.items.set_item_status(curitem, 'pod_detail')
            raise ProcessingComplete(
                info='Cleaned up missing/deleted item')

        # The live pod phase is only logged; decisions below are based on
        # the phase recorded against the item.
        podphase = pod.get('status', {}).get('phase', 'unknown')
        self.info(f'validated that pod {curpod} is '
                  f'still running (phase={podphase})')

        recorded_phase = self.items.status(curitem, 'podphase', 'unknown')

        # valid phases are Pending, Running, Succeeded, Failed, Unknown
        # 'started' is the phase the pods start with when created by
        # operator.
        if recorded_phase in ('started', 'Pending', 'Running', 'Failed'):
            self.info(f'item {curitem} status for '
                      f'{curpod}: {recorded_phase}')
            # Pure f-string: combining an f-string with %-formatting breaks
            # when the item name itself contains a '%' character.
            raise ProcessingComplete(
                message=f'item {curitem} {recorded_phase.lower()}')

        if recorded_phase == 'Succeeded':
            self.info(f'item {curitem} podphase={recorded_phase} but '
                      f'not yet acknowledged: {curpod}')
            raise ProcessingComplete(message=f'item {curitem} succeeded, '
                                     'awaiting acknowledgement')

        raise ProcessingComplete(
            error=f'item {curitem} unexpected state: '
            f'recorded_phase={recorded_phase}, '
            f'status={str(self.status)}',
            message=f'item {curitem} unexpected state')
Пример #2
0
 def get_kubeobj(self, reason: str = None):
     """Get the kube object for the overseen object.

     Args:
         reason: optional short description of why the object is being
             retrieved (e.g. 'delete it'). It is only used in the error
             text when the object cannot be found.

     Returns:
         The result of ``get_by_name(self.name)`` on a query of
         ``self.my_pykube_objtype``. The query is restricted to
         ``self.namespace`` when set, otherwise ``pykube.all`` is used.

     Raises:
         ProcessingComplete: if ``self.my_pykube_objtype`` is not set, or
             if the object does not exist.
     """
     namespace = self.namespace if self.namespace else pykube.all
     if self.my_pykube_objtype is None:
         raise ProcessingComplete(
             message='inheriting class must set self.my_pykube_objtype')
     try:
         return (self.my_pykube_objtype.objects(
             self.api, namespace=namespace).get_by_name(self.name))
     except pykube.exceptions.ObjectDoesNotExist as exc:
         # Build the optional part separately: a conditional expression
         # binds more loosely than '+', so writing it inline silently drops
         # either the exception text or the object name from the message.
         purpose = f' to {reason}' if reason else ''
         raise ProcessingComplete(
             error=f'cannot find Object {self.name}{purpose}: {exc}',
             message=f'cannot retrieve "{self.name}" object') from exc
Пример #3
0
    def validate_expected_pod_is_running(self) -> None:
        """
        validate_expected_pod_is_running

        Look up the pod named in the `pod` status (the pod we expect to
        exist for the `currently_running` item) and report on it.

        If the pod cannot be found, assume the job was killed without
        being processed by the operator (or was never started): clear the
        `currently_running` and `pod` statuses, set `state` to 'missing'
        and mark the item as failed.

        If the pod is found, log its phase and note any difference from
        the phase recorded against the item.

        Raises:
        - ProcessingComplete in every case:
            - Cleaned up missing/deleted item
            - Pod exists and is in state: <state>
        """
        expected_pod = self.get_status('pod')
        running_item = self.get_status('currently_running')

        try:
            pod_obj = (Pod.objects(self.api, namespace=self.namespace)
                       .get_by_name(expected_pod)
                       .obj)
        except pykube.exceptions.ObjectDoesNotExist:
            self.info(f'pod {expected_pod} missing/deleted, cleaning up')
            # forget the pod/item pairing, then flag the group as 'missing'
            for status_key in ('currently_running', 'pod'):
                self.set_status(status_key)
            self.set_status('state', 'missing')
            self.items.mark_failed(running_item)
            self.items.set_item_status(running_item, 'pod_detail')
            raise ProcessingComplete(
                message=f'item {running_item} failed during validation',
                info='Cleaned up missing/deleted item')

        pod_status = pod_obj.get('status', {})
        live_phase = pod_status.get('phase', 'unknown')
        self.info(f'validated that pod {expected_pod} exists '
                  f'(phase={live_phase})')
        stored_phase = self.items.status(running_item, 'podphase', 'unknown')

        # A difference between the two phases means the pod phase handlers
        # have not yet caught up and updated the oaatgroup. Only record
        # the fact; nothing is corrected here.
        # (valid phases are Pending, Running, Succeeded, Failed, Unknown;
        # 'started' is the phase pods start with when created by the
        # operator.)
        if live_phase != stored_phase:
            self.info(f'mismatch in phase for pod {expected_pod}: '
                      f'pod={live_phase}, oaatgroup={stored_phase}')

        raise ProcessingComplete(
            message=f'Pod {expected_pod} exists and is in state {live_phase}')
Пример #4
0
    def validate_state(self) -> None:
        """
        validate_state

        The "pod" and "currently_running" statuses must either both be
        unset (None) or both be set. Anything else is an inconsistent
        state, which should only arise in unusual situations such as the
        oaat-operator being killed while starting a pod.

        TODO: currently just resets both to None, effectively ignoring
        the result of a running pod. Ideally, we should validate the
        status of the pod and clean up.

        Returns:
        - None when the two statuses agree

        Raises:
        - ProcessingComplete when they disagree (after both are reset)
        """
        pod_name = self.get_status('pod')
        item_name = self.get_status('currently_running')

        # consistent: both unset, or both set
        if (pod_name is None) == (item_name is None):
            return None

        for status_key in ('currently_running', 'pod'):
            self.set_status(status_key)

        detail = (f'inconsistent state detected. '
                  f'pod ({pod_name}) is inconsistent '
                  f'with currently_running ({item_name})')
        raise ProcessingComplete(state='inconsistent state',
                                 message='internal error',
                                 error=detail)
Пример #5
0
 def podspec(self) -> dict:
     """Retrieve Pod specification from this OaatType.

     Returns:
         The ``spec.podspec`` mapping of this OaatType, as stored in
         ``self.obj`` (not a copy).

     Raises:
         ProcessingComplete: if the OaatType is not valid, or if its
             definition has no spec, a type other than "pod", no
             podspec, a "containers" entry, no "container" entry, or a
             "restartPolicy".
     """
     if not self.valid:
         raise ProcessingComplete(message='OaatType invalid',
                                  error=f'cannot find OaatType {self.name}')

     def definition_error(detail):
         # every remaining failure shares the same user-facing message
         return ProcessingComplete(message='error in OaatType definition',
                                   error=detail)

     type_spec = self.obj.get('spec')
     if type_spec is None:
         raise definition_error('missing spec in OaatType definition')

     if type_spec.get('type', '') not in ('pod',):
         raise definition_error('spec.type must be "pod"')

     pod_spec = type_spec.get('podspec')
     if not pod_spec:
         raise definition_error('spec.podspec is missing')

     if pod_spec.get('containers'):
         raise definition_error(
             'currently only support a single container, '
             'please do not use "spec.podspec.containers"')

     if not pod_spec.get('container'):
         raise definition_error('spec.podspec.container is missing')

     if pod_spec.get('restartPolicy'):
         raise definition_error(
             'for spec.type="pod", you cannot specify '
             'a restartPolicy')

     return pod_spec
Пример #6
0
 def delete(self) -> None:
     """Delete the overseen kube object.

     The object is fetched with ``get_kubeobj`` and deleted with a
     'Background' propagation policy.

     Raises:
         ProcessingComplete: if the delete fails with a
             ``pykube.exceptions.KubernetesError`` (``get_kubeobj`` is
             called outside the ``try`` and its errors propagate as-is).
     """
     target = self.get_kubeobj('delete it')
     try:
         target.delete(propagation_policy='Background')
         self.debug(f'delete of {self.name} successful')
     except pykube.exceptions.KubernetesError as err:
         detail = f'cannot delete Object {self.name}: {err}'
         summary = f'cannot delete "{self.name}" object'
         raise ProcessingComplete(error=detail, message=summary)
Пример #7
0
    def run_item(self, item_name) -> 'Pod':
        """
        run_item

        Execute an item job Pod with the spec details from the appropriate
        OaatType object.

        The single container spec is taken from the OaatType podspec, an
        OAAT_ITEM environment variable is added, and every occurrence of
        '%%oaat_item%%' in the container's command, args and env values
        is replaced with the item name.

        Args:
        - item_name: name of the item to run

        Returns:
        - the Pod object that was created

        Raises:
        - ProcessingComplete if the pod could not be created (the item is
          marked as failed first)
        """
        import copy

        # TODO: check oaatType
        # Work on a private copy: the edits below (removing 'container',
        # appending to 'env', substituting the item name) must not leak
        # back into the OaatType definition that podspec() hands out,
        # otherwise later runs see an already-modified spec.
        spec = copy.deepcopy(self.oaattype.podspec())
        contspec = spec.pop('container')
        contspec.setdefault('env', []).append({
            'name': 'OAAT_ITEM',
            'value': item_name
        })
        for key in ('command', 'args'):
            if key in contspec:
                contspec[key] = [
                    part.replace('%%oaat_item%%', item_name)
                    for part in contspec[key]
                ]
        for env in contspec['env']:
            env['value'] = (env.get('value',
                                    '').replace('%%oaat_item%%', item_name))

        # TODO: currently only supports a single container. Do we want
        # multi-container?
        doc = {
            'apiVersion': 'v1',
            'kind': 'Pod',
            'metadata': {
                'generateName': self.name + '-' + item_name + '-',
                'labels': {
                    'parent-name': self.name,
                    'oaat-name': item_name,
                    'app': 'oaat-operator'
                }
            },
            'spec': {
                'containers': [contspec],
                **spec, 'restartPolicy': 'Never'
            },
        }

        kopf.adopt(doc)
        pod = Pod(self.api, doc)

        try:
            pod.create()
        except pykube.exceptions.KubernetesError as exc:
            self.items.mark_failed(item_name)
            raise ProcessingComplete(
                error=f'could not create pod {doc}: {exc}',
                message=f'error creating pod for {item_name}')
        return pod
Пример #8
0
    def validate_oaat_type(self) -> None:
        """
        validate_oaat_type

        Ensure the group refers to an appropriate OaatType object.

        Returns:
        - None when `self.oaattype.valid` is true

        Raises:
        - ProcessingComplete otherwise, after setting the
          'operator-status' annotation to 'missingOaatType'
        """
        if not self.oaattype.valid:
            self.set_annotation('operator-status', 'missingOaatType')
            raise ProcessingComplete(
                message='error in OaatGroup definition',
                error=f'unknown oaat type {self.oaattypename}')

        self.info('found valid oaat type')
        return None
Пример #9
0
    def get_oaattype(self) -> KubeOaatType:
        """
        Retrieve the OaatType object.

        Returns:
        - None when `self.name` is None
        - otherwise the `.obj` of the KubeOaatType named `self.name` in
          `self.namespace`

        Raises:
        - ProcessingComplete if no such object exists
        """
        if self.name is None:
            return None

        try:
            query = KubeOaatType.objects(self.api, namespace=self.namespace)
            found = query.get_by_name(self.name)
            return found.obj
        except pykube.exceptions.ObjectDoesNotExist as err:
            detail = (f'cannot find OaatType {self.namespace}/{self.name}: '
                      f'{err}')
            raise ProcessingComplete(
                error=detail,
                message=f'error retrieving "{self.name}" OaatType object')
Пример #10
0
    def validate_no_rogue_pods_are_running(self) -> None:
        """
        validate_no_rogue_pods_are_running

        Scan the pods in this namespace for "rogue" pods: pods labelled
        as belonging to this object (`parent-name` equal to `self.name`
        and `app` equal to 'oaat-operator') which are in phase Running or
        Pending, but which are not the pod recorded in the `pod` status.

        Each rogue pod found is logged as a warning.

        Raises:
        - ProcessingComplete if one or more rogue pods were found
        """
        # the expected pod does not change while scanning; look it up once
        expected_pod = self.get_status('pod')
        found_rogue = 0
        for pod in Pod.objects(self.api, namespace=self.namespace).iterator():
            if pod.name == expected_pod:
                continue
            if pod.labels.get('parent-name', '') != self.name:
                continue
            if pod.labels.get('app', '') != 'oaat-operator':
                continue
            # tolerate a pod object without a status section, in the same
            # way the other pod validation methods do
            podphase = pod.obj.get('status', {}).get('phase', 'unknown')
            if podphase in ('Running', 'Pending'):
                self.warning(
                    f'rogue pod {pod.name} found (phase={podphase})')
                found_rogue += 1

        if found_rogue > 0:
            raise ProcessingComplete(
                message='rogue pods running',
                error=f'found {found_rogue} rogue pods running')
Пример #11
0
    def validate_items(self,
                       status_annotation=None,
                       count_annotation=None) -> None:
        """
        validate_items

        Ensure there are oaatItems to process.

        Args:
        - status_annotation: if given, name of the annotation which is set
          to 'missingItems' (no items) or 'active' (items present)
        - count_annotation: if given, name of the annotation which is set
          to the number of items (only when items are present)

        Raises:
        - ProcessingComplete if there are no items
        """
        item_count = len(self.items)

        if item_count == 0:
            if status_annotation:
                self.set_annotation(status_annotation, 'missingItems')
            hint = f'no items found. Please set "oaatItems" in {self.name}'
            raise ProcessingComplete(state='nothing to do',
                                     error='error in OaatGroup definition',
                                     message=hint)

        # there are oaatItems: flag the object as "active" and, when
        # requested, publish how many items it has
        if status_annotation:
            self.set_annotation(status_annotation, 'active')
        if count_annotation:
            self.set_annotation(count_annotation, value=item_count)
Пример #12
0
def create_action(**kwargs):
    """Exercise the Overseer API step by step and report success.

    Each numbered step ([1]..[13]) checks one piece of Overseer
    behaviour. Steps which expect an exception now fail explicitly when
    the exception is not raised, instead of silently passing.

    Args:
        **kwargs: the full set of kopf handler keyword arguments; this
            function itself reads ``kwargs['logger']`` and
            ``kwargs['name']`` and passes everything to ``Overseer``.

    Returns:
        The string 'all overseer tests successful'.

    Raises:
        AssertionError: if any check fails.
    """
    # [1] Overseer should raise ValueError if kwargs are not passed
    try:
        Overseer()
    except ValueError as exc:
        assert re.search('Overseer must be called with full kopf kwargs',
                         str(exc)), exc
        kwargs['logger'].debug('[1] successful')
    else:
        raise AssertionError(
            '[1] Overseer() without kwargs did not raise ValueError')

    pov = Overseer(**kwargs)

    # [2] error
    pov.error('[2] error message')

    # [3] warning
    pov.warning('[3] warning message')

    # [4] info
    pov.info('[4] info message')

    # [5] debug
    pov.debug('[5] debug message')

    # [6] get_status
    assert pov.get_status('unset_status') is None
    assert pov.get_status('unset_status', 'empty') == 'empty'
    # set_status
    pov.set_status('new_status')
    pov.set_status('new_status2', 'new_state')

    # [7] get_label
    assert pov.get_label('nolabel') is None
    assert pov.get_label('nolabel', 'empty') == 'empty'
    assert pov.get_label('testlabel') == 'labelvalue'
    assert pov.get_label('testlabel', 'empty') == 'labelvalue'

    # [8] get_kubeobj without my_pykube_objtype
    try:
        pov.get_kubeobj()
    except ProcessingComplete as exc:
        assert (str(exc) == 'inheriting class must set self.my_pykube_objtype'
                ), exc
        kwargs['logger'].debug('[8] successful')
    else:
        raise AssertionError(
            '[8] get_kubeobj without my_pykube_objtype did not raise '
            'ProcessingComplete')

    # [9] get_kubeobj missing object
    savename = pov.name
    pov.name = 'badname'
    pov.my_pykube_objtype = Pod
    try:
        pov.get_kubeobj()
    except ProcessingComplete as exc:
        assert str(exc) == 'cannot retrieve "badname" object', exc
        kwargs['logger'].debug('[9] successful')
    else:
        raise AssertionError(
            '[9] get_kubeobj for a missing object did not raise '
            'ProcessingComplete')
    pov.name = savename

    # [10] get_kubeobj sunny day
    kobj = pov.get_kubeobj('examine it')
    kwargs['logger'].debug(f'kubeobj.metadata: {kobj.metadata}')
    assert kobj.metadata['name'] == kwargs['name']
    kwargs['logger'].debug('[10] successful')

    # [11] set_annotation
    pov.set_annotation('testannotation')
    pov.set_annotation('new_annotation', 'annotation_value')

    # [12] handle_processing_complete
    try:
        raise ProcessingComplete(state='retstate',
                                 info='retinfo',
                                 error='reterror',
                                 warning='retwarning',
                                 message='retmessage')
    except ProcessingComplete as exc:
        assert (pov.handle_processing_complete(exc).get('message') ==
                'retmessage'), exc
        kwargs['logger'].debug('[12] successful')

    # [13] handle_processing_complete none
    try:
        raise ProcessingComplete()
    except ProcessingComplete as exc:
        assert pov.handle_processing_complete(exc) is None, exc
        kwargs['logger'].debug('[13] successful')

    pov.debug('about to complete')

    return 'all overseer tests successful'
Пример #13
0
    def find_job_to_run(self) -> str:
        """
        find_job_to_run

        Find the best item job to run based on last success and
        failure times.

        Basic algorithm:
        - phase one: choose valid item candidates:
            - start with a list of all possible items to run
            - remove from the list items which have been successful within the
              period in the 'frequency' setting
            - remove from the list items which have failed within the period
              in the 'failureCoolOff' setting
        - phase two: choose the item to run from the valid item candidates:
            - if there is just one item, choose it
            - find the item with the oldest success (or has never succeeded)
            - if there is just one item that is 'oldest', choose it
            - of the items with the oldest success, find the item with the
              oldest failure
            - if there is just one item that has both the oldest success and
              the oldest failure, choose it
            - choose at random (this is likely to occur if no items have
              been run - i.e. first iteration)

        Returns:
        - the name of the item to run

        Raises:
        - ProcessingComplete if there are no items at all, or if no item
          is currently due to run (state is set to 'idle' in that case)
        """
        now = oaatoperator.utility.now()

        # Phase One: Choose valid item candidates
        oaat_items = self.items.list()
        item_status = {item['name']: 'candidate' for item in oaat_items}

        if not oaat_items:
            raise ProcessingComplete(
                message='error in OaatGroup definition',
                error='no items found. please set "oaatItems"')

        self.debug('oaat_items: ' + ', '.join([i['name'] for i in oaat_items]))

        # Filter out items which have been recently successful
        self.debug(f'frequency: {self.freq}s')
        self.debug(f'now: {now}')
        self.debug(f'cool_off: {self.cool_off}')

        candidates = []
        for item in oaat_items:
            if now > item['success'] + self.freq:
                candidates.append(item)
                item_status[item['name']] = (
                    f'not successful within last freq ({self.freq})')
            else:
                item_status[item['name']] = (
                    f'successful within last freq ({self.freq})')

        self.debug('Valid, based on success: ' +
                   ', '.join([i['name'] for i in candidates]))

        # Filter out items which have failed within the cool off period
        if self.cool_off is not None:
            # Every remaining candidate must be tested. Iterate over a
            # snapshot because candidates is modified inside the loop.
            for item in list(candidates):
                in_cool_off = now < item['failure'] + self.cool_off
                self.debug(f'testing {item["name"]} - '
                           f'now: {now}, '
                           f'failure: {item["failure"]}, '
                           f'cool_off: {self.cool_off}, '
                           f'test: {in_cool_off}')
                if in_cool_off:
                    candidates.remove(item)
                    item_status[item['name']] = (
                        f'cool_off ({self.cool_off}) not expired since '
                        f'last failure')

            self.debug('Valid, based on success and failure cool off: ' +
                       ', '.join([i['name'] for i in candidates]))

        self.info('item status (* = candidate):\n' +
                  '\n'.join([('* ' if i in candidates else '- ') +
                             f'{i["name"]} ' + f'{item_status[i["name"]]} ' +
                             f'success={i["success"].isoformat()}, ' +
                             f'failure={i["failure"].isoformat()}, ' +
                             f'numfails={i["numfails"]}' for i in oaat_items]))

        if not candidates:
            self.set_status('state', 'idle')
            raise ProcessingComplete(message='not time to run next item')

        # return single candidate if there is only one left
        if len(candidates) == 1:
            return candidates[0]['name']

        # Phase 2: Choose the item to run from the valid item candidates
        # Get all items which are "oldest"
        oldest_success_time = min(item['success'] for item in candidates)
        oldest_success_items = [
            item for item in candidates
            if item['success'] == oldest_success_time
        ]

        self.debug(f'oldest_items {oldest_success_time}: ' +
                   ', '.join([i['name'] for i in oldest_success_items]))

        if len(oldest_success_items) == 1:
            return oldest_success_items[0]['name']

        # More than one item "equally old" success. Choose based on
        # last failure (but only if there has been a failure for the item)
        failure_items = [
            item for item in oldest_success_items if item['numfails'] > 0
        ]

        if not failure_items:
            # nothing has failed
            remaining_items = oldest_success_items
        else:
            oldest_failure_time = min(
                item['failure'] for item in failure_items)
            self.debug(f'oldest_failure_time: {oldest_failure_time}')
            oldest_failure_items = [
                item for item in oldest_success_items
                if item['failure'] == oldest_failure_time
            ]

            self.debug('oldest_failure_items: ' +
                       ', '.join([i['name'] for i in oldest_failure_items]))

            if len(oldest_failure_items) == 1:
                return oldest_failure_items[0]['name']

            remaining_items = oldest_failure_items

        # more than one "equally old" failure.  Choose at random
        return remaining_items[randrange(
            len(remaining_items))]['name']  # nosec