def mesos_update_to_event(
    mesos_status: addict.Dict,
    task_config: MesosTaskConfig,
) -> Event:
    kwargs = dict(
        raw=mesos_status,
        task_id=task_config.task_id,
        task_config=task_config,
        timestamp=time.time(),
    )
    kwargs.update(MESOS_STATUS_MAP[mesos_status.state])
    return task_event(**kwargs)
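# A minimal usage sketch, assuming a MesosTaskConfig constructed with
# name/image/cmd (these field names are assumptions, not verified against
# the plugin) and an addict.Dict status as delivered by Mesos.
import addict

example_config = MesosTaskConfig(name='example', image='busybox', cmd='/bin/true')
update = addict.Dict(state='TASK_FINISHED')
event = mesos_update_to_event(update, example_config)
# 'TASK_FINISHED' maps to a successful terminal event per MESOS_STATUS_MAP.
assert event.terminal and event.success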
def __handle_deleted_pod_event(self, event: PodEvent) -> None:
    pod = event["object"]
    pod_name = pod.metadata.name
    task_metadata = self.task_metadata[pod_name]
    raw_event = event["raw_object"]

    logger.info(f"Removing {pod_name} from state and emitting 'killed' event.")

    self.task_metadata = self.task_metadata.discard(pod_name)
    self.event_queue.put(
        task_event(
            task_id=pod_name,
            terminal=True,
            success=False,
            timestamp=time.time(),
            raw=raw_event,
            task_config=task_metadata.task_config,
            platform_type="killed",
        )
    )
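# Hypothetical PodEvent shape, inferred from the keys the handler above
# accesses ('object', 'raw_object') plus the watch stream's 'type' field;
# this literal is illustrative, not taken from the plugin.
from kubernetes.client import V1ObjectMeta, V1Pod

example_pod = V1Pod(metadata=V1ObjectMeta(name="example-pod"))
example_event = {
    "type": "DELETED",              # ADDED / MODIFIED / DELETED
    "object": example_pod,          # deserialized V1Pod
    "raw_object": {"kind": "Pod"},  # raw payload from the watch stream
}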
def reconcile(self, task_config: KubernetesTaskConfig) -> None:
    pod_name = task_config.pod_name
    try:
        pod = self.kube_client.get_pod(
            namespace=self.namespace,
            pod_name=pod_name,
        )
    except Exception:
        logger.exception(f"Hit an exception attempting to fetch pod {pod_name}")
        pod = None

    if pod_name not in self.task_metadata:
        self._initialize_existing_task(task_config)

    with self.task_metadata_lock:
        task_metadata = self.task_metadata[pod_name]
        self.task_metadata = self.task_metadata.set(
            pod_name,
            task_metadata.set(task_config=task_config),
        )

        if not pod:
            # Pod has gone away while restarting
            logger.info(
                f"Pod {pod_name} for task {task_config.name} was no longer found. "
                "Marking as LOST"
            )
            self.task_metadata = self.task_metadata.set(
                pod_name,
                task_metadata.set(
                    task_state=KubernetesTaskState.TASK_LOST,
                    task_state_history=task_metadata.task_state_history.append(
                        (KubernetesTaskState.TASK_LOST, time.time()),
                    ),
                ),
            )
            self.event_queue.put(
                task_event(
                    task_id=pod_name,
                    terminal=False,
                    timestamp=time.time(),
                    raw=None,
                    task_config=task_metadata.task_config,
                    platform_type="lost",
                )
            )
        else:
            # Treat like a modified pod
            self.__update_modified_pod(pod=pod, event=None)
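# A sketch of how reconcile might be driven after an executor restart,
# assuming the caller persisted its KubernetesTaskConfig objects somewhere;
# `executor` and `load_persisted_task_configs` are hypothetical names, not
# part of the plugin.
def resume_tasks(executor, load_persisted_task_configs):
    for task_config in load_persisted_task_configs():
        # Each call either re-attaches to a live pod (handled like a
        # MODIFIED event) or emits a non-terminal 'lost' event if the
        # pod disappeared while the executor was down.
        executor.reconcile(task_config)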
def _background_check_task(self, time_now, tasks_to_reconcile, task_id, md):
    if md.task_state != 'TASK_INITED':
        tasks_to_reconcile.append(task_id)

    if md.task_state == 'TASK_INITED':
        # Give up if the task hasn't launched after offer_timeout.
        inited_at = md.task_state_history['TASK_INITED']
        offer_timeout = md.task_config.offer_timeout
        expires_at = inited_at + offer_timeout
        if time_now >= expires_at:
            log.warning(
                f'Task {task_id} has been waiting for offers '
                'for longer than configured timeout '
                f'{offer_timeout}. Giving up and removing the '
                'task from the task queue.'
            )
            # Killing the task also removes it from the queue.
            self.kill_task(task_id)
            # We are not expecting Mesos to send a terminal update
            # for this task, so clean it up manually.
            self.task_metadata = self.task_metadata.discard(task_id)
            self.event_queue.put(
                task_event(
                    task_id=task_id,
                    terminal=True,
                    timestamp=time_now,
                    success=False,
                    message='stop',
                    task_config=md.task_config,
                    raw='Failed due to offer timeout',
                )
            )
            get_metric(metrics.TASK_OFFER_TIMEOUT).count(1)

    # Within the staging timeout, the task is not yet eligible for
    # killing or re-enqueuing.
    in_current_state_since = md.task_state_history[md.task_state]
    if time_now < in_current_state_since + self.task_staging_timeout_s:
        return

    if md.task_state == 'UNKNOWN':
        log.warning(
            f'Re-enqueuing task {task_id} in unknown state for '
            f'longer than {self.task_staging_timeout_s}'
        )
        # Re-enqueue the task.
        self.enqueue_task(md.task_config)
        get_metric(metrics.TASK_FAILED_TO_LAUNCH_COUNT).count(1)
    elif md.task_state == 'TASK_STAGING':
        log.warning(f'Killing stuck task {task_id}')
        self.kill_task(task_id)
        self.task_metadata = self.task_metadata.set(
            task_id,
            md.set(
                task_state='TASK_STUCK',
                task_state_history=md.task_state_history.set(
                    'TASK_STUCK', time_now),
            ),
        )
        self.blacklist_slave(
            agent_id=self.task_metadata[task_id].agent_id,
            timeout=self.slave_blacklist_timeout_s,
        )
        get_metric(metrics.TASK_STUCK_COUNT).count(1)
    elif md.task_state == 'TASK_STUCK':
        t = time.time()
        # 10s since the last iteration, plus the time spent in the
        # current one.
        time_delta = 10 + t - time_now
        # Seconds since the task was put in the TASK_STUCK state.
        time_stuck = t - md.task_state_history['TASK_STUCK']
        # Seconds since `time_stuck` last crossed an hour boundary.
        hour_rolled = time_stuck % 3600
        # If `time_stuck` crossed an hour boundary since the last
        # background check, re-send the kill request.
        if hour_rolled < time_delta:
            hours_stuck = time_stuck // 3600
            log.warning(
                f'Task {task_id} is stuck, waiting for terminal '
                f'state for {hours_stuck}h, sending another kill'
            )
            self.kill_task(task_id)
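# Standalone illustration of the hour-boundary check above: time_stuck % 3600
# resets each time the stuck duration crosses an hour boundary, so it is
# smaller than the ~10s check interval only on the first check after a
# boundary. The numbers below are invented.
CHECK_INTERVAL_S = 10

def should_resend_kill(time_stuck: float, time_delta: float) -> bool:
    # True only on the first background check after `time_stuck` crosses
    # a multiple of 3600 seconds.
    return time_stuck % 3600 < time_delta

assert should_resend_kill(3605.0, CHECK_INTERVAL_S)      # just past the 1h boundary
assert not should_resend_kill(3700.0, CHECK_INTERVAL_S)  # 100s past it, already handled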
import time

from task_processing.interfaces.event import task_event

# https://github.com/apache/mesos/blob/master/include/mesos/mesos.proto
MESOS_STATUS_MAP = {
    'TASK_STARTING': task_event(platform_type='starting', terminal=False),
    'TASK_RUNNING': task_event(platform_type='running', terminal=False),
    'TASK_FINISHED': task_event(platform_type='finished', terminal=True, success=True),
    'TASK_FAILED': task_event(platform_type='failed', terminal=True, success=False),
    'TASK_KILLED': task_event(platform_type='killed', terminal=True, success=False),
    'TASK_LOST': task_event(platform_type='lost', terminal=True, success=False),
    'TASK_STAGING': task_event(platform_type='staging', terminal=False),
    'TASK_ERROR': task_event(platform_type='error', terminal=True, success=False),
    'TASK_KILLING': task_event(platform_type='killing', terminal=False),
    'TASK_DROPPED': task_event(platform_type='dropped', terminal=True, success=False),
    'TASK_UNREACHABLE': task_event(platform_type='unreachable', terminal=False),
    'TASK_GONE': task_event(platform_type='gone', terminal=True, success=False),
}
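# A couple of lookups against the map above, assuming the records returned
# by task_event expose their fields as attributes (which mesos_update_to_event
# also relies on when merging an entry into its kwargs).
finished = MESOS_STATUS_MAP['TASK_FINISHED']
assert finished.terminal and finished.success
assert not MESOS_STATUS_MAP['TASK_RUNNING'].terminal
# An unrecognized state raises KeyError, so callers should only pass states
# defined in the Mesos protobuf linked above.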
def __update_modified_pod(self, pod: V1Pod, event: Optional[PodEvent]) -> None:
    """ Called during reconciliation and normal event handling """
    pod_name = pod.metadata.name
    task_metadata = self.task_metadata[pod_name]
    raw_event = event["raw_object"] if event else None

    if pod.status.phase not in SUPPORTED_POD_MODIFIED_EVENT_PHASES:
        logger.debug(
            f"Got a MODIFIED event for {pod_name} for unhandled phase: "
            f"{pod.status.phase} - ignoring."
        )
        return

    if (
        pod.status.phase in {"Succeeded", "Failed"}
        and task_metadata.task_state is KubernetesTaskState.TASK_PENDING
    ):
        logger.debug(
            f"Adding running event for {pod_name}, Kubernetes appears to have "
            "compacted the Running phase event."
        )
        self.task_metadata = self.task_metadata.set(
            pod_name,
            task_metadata.set(
                node_name=pod.spec.node_name,
                task_state=KubernetesTaskState.TASK_RUNNING,
                task_state_history=task_metadata.task_state_history.append(
                    (KubernetesTaskState.TASK_RUNNING, time.time()),
                ),
            ),
        )
        self.event_queue.put(
            task_event(
                task_id=pod_name,
                terminal=False,
                timestamp=time.time(),
                raw=raw_event,
                task_config=task_metadata.task_config,
                platform_type="running",
            )
        )

    if (
        pod.status.phase == "Succeeded"
        and task_metadata.task_state is not KubernetesTaskState.TASK_FINISHED
    ):
        logger.info(
            f"Removing {pod_name} from state and emitting 'finished' event."
        )
        self.task_metadata = self.task_metadata.discard(pod_name)
        self.event_queue.put(
            task_event(
                task_id=pod_name,
                terminal=True,
                success=True,
                timestamp=time.time(),
                raw=raw_event,
                task_config=task_metadata.task_config,
                platform_type="finished",
            )
        )
        return

    elif (
        pod.status.phase == "Failed"
        and task_metadata.task_state is not KubernetesTaskState.TASK_FAILED
    ):
        logger.info(
            f"Removing {pod_name} from state and emitting 'failed' event."
        )
        self.task_metadata = self.task_metadata.discard(pod_name)
        self.event_queue.put(
            task_event(
                task_id=pod_name,
                terminal=True,
                success=False,
                timestamp=time.time(),
                raw=raw_event,
                task_config=task_metadata.task_config,
                platform_type="failed",
            )
        )
        return

    elif (
        pod.status.phase == "Running"
        and task_metadata.task_state is not KubernetesTaskState.TASK_RUNNING
    ):
        logger.info(
            f"Successfully launched {pod_name}, emitting 'running' event."
        )
        self.task_metadata = self.task_metadata.set(
            pod_name,
            task_metadata.set(
                node_name=pod.spec.node_name,
                task_state=KubernetesTaskState.TASK_RUNNING,
                task_state_history=task_metadata.task_state_history.append(
                    (KubernetesTaskState.TASK_RUNNING, time.time()),
                ),
            ),
        )
        self.event_queue.put(
            task_event(
                task_id=pod_name,
                terminal=False,
                timestamp=time.time(),
                raw=raw_event,
                task_config=task_metadata.task_config,
                platform_type="running",
            )
        )
        return

    # XXX: figure out how to handle this correctly (and when this actually
    # happens - we were unable to cajole k8s into giving us an event with
    # an Unknown phase)
    elif (
        pod.status.phase == "Unknown"
        and task_metadata.task_state is not KubernetesTaskState.TASK_LOST
    ):
        logger.info(
            f"Got a MODIFIED event for {pod_name} with unknown phase, host "
            "likely unexpectedly died"
        )
        self.task_metadata = self.task_metadata.set(
            pod_name,
            task_metadata.set(
                node_name=pod.spec.node_name,
                task_state=KubernetesTaskState.TASK_LOST,
                task_state_history=task_metadata.task_state_history.append(
                    (KubernetesTaskState.TASK_LOST, time.time()),
                ),
            ),
        )
        self.event_queue.put(
            task_event(
                task_id=pod_name,
                terminal=False,
                timestamp=time.time(),
                raw=raw_event,
                task_config=task_metadata.task_config,
                platform_type="lost",
            )
        )
        return

    logger.info(
        f"Ignoring MODIFIED event for {pod_name} as it did not result "
        "in a state transition"
    )
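# Descriptive summary of the transitions implemented above; this dict is
# illustrative only and is not used by the plugin itself.
POD_PHASE_TO_EVENT = {
    "Succeeded": ("finished", True),   # terminal, success=True; pod dropped from state
    "Failed": ("failed", True),        # terminal, success=False; pod dropped from state
    "Running": ("running", False),     # non-terminal; node_name recorded
    "Unknown": ("lost", False),        # non-terminal; host likely died unexpectedly
}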