def status(self):
    """Translate the threaded health checker's state into a StatusResult.

    Returns:
        TASK_FAILED (with the checker's reason) when the checker is not
        healthy, TASK_RUNNING when it is healthy and running, and
        TASK_STARTING (with no reason) when it is healthy but not running.
    """
    checker = self.threaded_health_checker

    # Unhealthy takes precedence over everything else.
    if not checker.healthy:
        return StatusResult(
            'Failed health check! %s' % checker.reason,
            TaskState.Value('TASK_FAILED'))

    if checker.running:
        return StatusResult('Task is healthy.', TaskState.Value('TASK_RUNNING'))

    return StatusResult(None, TaskState.Value('TASK_STARTING'))
def run(self):
    """Poll the status checker until it reports a terminal status.

    On each pass the current status is read from ``self._status_checker``:

    * ``None`` -- nothing to do, sleep and poll again.
    * TASK_RUNNING -- invoke the running callback, but only the first time.
    * TASK_STARTING -- keep polling.
    * anything else -- invoke the unhealthy callback and stop polling.
    """
    finished = False
    while not finished:
        current = self._status_checker.status
        if current is not None:
            log.info('Status manager got %s' % current)
            state = current.status
            if state == TaskState.Value('TASK_RUNNING'):
                # The running callback must fire at most once.
                if not self._running_callback_dispatched:
                    self._running_callback(current)
                    self._running_callback_dispatched = True
            elif state != TaskState.Value('TASK_STARTING'):
                self._unhealthy_callback(current)
                finished = True

        # No sleep after a terminal status: the loop ends immediately.
        if not finished:
            self._clock.sleep(self.POLL_WAIT.as_(Time.SECONDS))
def test_chained_health_interface():
    """Exercise start/stop fan-out and status propagation of a chained checker."""
    # An empty chain, and a chain holding only Healthy(), report no status.
    chain = ChainedStatusChecker([])
    assert chain.status is None

    chain = ChainedStatusChecker([Healthy()])
    assert chain.status is None

    first = EventHealth()
    second = EventHealth()
    members = (first, second)
    chained_si = ChainedStatusChecker([first, second])

    # start() must be forwarded to every member.
    for member in members:
        assert not member.started.is_set()
    chained_si.start()
    for member in members:
        assert member.started.is_set()

    assert chained_si.status is None

    # A failure reported by one member surfaces through the chain.
    failure = StatusResult('derp', TaskState.Value('TASK_FAILED'))
    second.set_status(failure)
    assert chained_si.status == failure
    assert chained_si.status.reason == 'derp'
    assert TaskState.Name(chained_si.status.status) == 'TASK_FAILED'

    # stop() must be forwarded to every member.
    for member in members:
        assert not member.stopped.is_set()
    chained_si.stop()
    for member in members:
        assert member.stopped.is_set()
def test_consecutive_failures(self):
    """A task turns unhealthy only once max_consecutive_failures is exceeded."""
    initial_interval_secs = 2
    interval_secs = 1

    # Scripted results: fail, fail, pass, fail, fail, fail.
    self.append_health_checks(False, num_calls=2)
    self.append_health_checks(True)
    self.append_health_checks(False, num_calls=3)

    checker = HealthChecker(
        self._checker.health,
        interval_secs=interval_secs,
        initial_interval_secs=initial_interval_secs,
        max_consecutive_failures=2,
        clock=self._clock)
    checker.start()

    # 2 consecutive health check failures followed by a successful health check.
    for delay in (initial_interval_secs, interval_secs, interval_secs):
        self._clock.tick(delay)
        assert checker.status is None

    # 3 consecutive health check failures: the first two stay within the limit.
    for _ in range(2):
        self._clock.tick(interval_secs)
        assert checker.status is None

    self._clock.tick(interval_secs)
    thread_yield()
    assert checker.status.status == TaskState.Value('TASK_FAILED')

    checker.stop()
    assert self._checker.health.call_count == 6
def test_initial_interval_whatev(self):
    """With initial_interval_secs=0 a single failed check is reported right after start()."""
    self.append_health_checks(False)

    checker = HealthChecker(
        self._checker.health,
        interval_secs=5,
        initial_interval_secs=0,
        clock=self._clock)
    checker.start()

    failed = TaskState.Value('TASK_FAILED')
    assert checker.status.status == failed

    checker.stop()
    assert self._checker.health.call_count == 1
def test_consecutive_failures(self):
    """A task turns unhealthy only once max_consecutive_failures is exceeded."""
    initial_interval_secs = 2
    interval_secs = 1

    # Scripted results: fail, fail, pass, fail, fail, fail.
    self.append_health_checks(False, num_calls=2)
    self.append_health_checks(True)
    self.append_health_checks(False, num_calls=3)

    hct = HealthChecker(
        self._checker.health,
        interval_secs=interval_secs,
        initial_interval_secs=initial_interval_secs,
        max_consecutive_failures=2,
        clock=self._clock)
    hct.start()
    self._clock.converge(threads=[hct.threaded_health_checker])

    epsilon = 0.001

    def advance(delay):
        # Move the clock just past `delay`, let the checker thread settle and
        # confirm it is waiting on the clock again.
        self._clock.tick(delay + epsilon)
        self._clock.converge(threads=[hct.threaded_health_checker])
        self._clock.assert_waiting(hct.threaded_health_checker, amount=1)

    # (delay, expected consecutive_failures) while the task is still healthy:
    #   - 2 consecutive health check failures followed by a successful health check
    #   - then the first 2 of 3 consecutive health check failures
    healthy_steps = (
        (initial_interval_secs, 1),
        (interval_secs, 2),
        (interval_secs, 0),
        (interval_secs, 1),
        (interval_secs, 2),
    )
    for delay, expected_failures in healthy_steps:
        advance(delay)
        assert hct.status is None
        assert hct.metrics.sample()['consecutive_failures'] == expected_failures

    # The third consecutive failure exceeds the limit.
    advance(interval_secs)
    assert hct.status.status == TaskState.Value('TASK_FAILED')
    assert hct.metrics.sample()['consecutive_failures'] == 3

    hct.stop()
    assert self._checker.health.call_count == 6
def status(self):
    """
    Return the status computed from the statuses of the StatusCheckers.

    The computed status is based on the priority given below (in increasing
    order of priority):

        None             -> healthy (lowest-priority)
        TASK_RUNNING     -> healthy and running
        TASK_STARTING    -> healthy but still in starting
        Otherwise        -> unhealthy (highest-priority)

    When ``self._in_terminal_state()`` is true the stored status is returned
    as-is and the checkers are not polled.

    Returns:
        The stored/computed StatusResult, or None when no checker reported
        anything.

    Raises:
        TypeError: if a checker returns a non-None value that is not a
            StatusResult.
    """
    if self._in_terminal_state():
        return self._status

    starting = TaskState.Value('TASK_STARTING')
    running = TaskState.Value('TASK_RUNNING')

    cur_status = None
    for status_checker in self._status_checkers:
        status_result = status_checker.status
        if status_result is None:
            continue

        log.info('%s reported %s' % (status_checker.__class__.__name__, status_result))
        if not isinstance(status_result, StatusResult):
            raise TypeError(
                'StatusChecker returned something other than a StatusResult: got %s' %
                type(status_result))

        if status_result.status == starting:
            # TASK_STARTING overrides other statuses
            cur_status = status_result
        elif status_result.status == running:
            # TASK_RUNNING needs consensus (None is also included). Compare
            # the task state carried by the current result, not the
            # StatusResult object itself, against the enum value.
            if cur_status is None or cur_status.status == running:
                cur_status = status_result
        else:
            # Any other status leads to a terminal state
            self._status = status_result
            return self._status

    self._status = cur_status
    return self._status
def test_initial_interval_2x(self):
    """No failure is reported after ticks of 6 and 3; it shows up after a further tick of 5."""
    self.append_health_checks(False)

    checker = HealthChecker(self._checker.health, interval_secs=5, clock=self._clock)
    checker.start()
    thread_yield()
    assert checker.status is None

    # Still nothing reported after these two advances.
    for delay in (6, 3):
        self._clock.tick(delay)
        assert checker.status is None

    self._clock.tick(5)
    thread_yield()
    assert checker.status.status == TaskState.Value('TASK_FAILED')

    checker.stop()
    assert self._checker.health.call_count == 1
def test_initial_interval_whatev(self):
    """With initial_interval_secs=0 the failure is visible once the checker thread settles."""
    self.append_health_checks(False, 2)

    checker = HealthChecker(
        self._checker.health,
        interval_secs=5,
        initial_interval_secs=0,
        clock=self._clock)
    checker.start()

    worker = checker.threaded_health_checker
    self._clock.converge(threads=[worker])
    self._clock.assert_waiting(worker, amount=5)

    assert checker.status.status == TaskState.Value('TASK_FAILED')
    checker.stop()

    # Implementation detail: one health check runs in the initializer and one
    # in the run loop. If the implementation ever changes, expect this
    # assertion to break.
    assert self._checker.health.call_count == 2
def test_initial_interval_2x(self):
    """The checker first waits 10 with interval_secs=5 and reports failure only after that."""
    self.append_health_checks(False)

    hct = HealthChecker(self._checker.health, interval_secs=5, clock=self._clock)
    hct.start()

    def settle():
        # Let the checker thread catch up with the fake clock.
        return self._clock.converge(threads=[hct.threaded_health_checker])

    assert settle()
    self._clock.assert_waiting(hct.threaded_health_checker, 10)
    assert hct.status is None

    # 6 + 3 = 9 is still short of the initial wait of 10.
    for delay in (6, 3):
        self._clock.tick(delay)
        assert settle()
        assert hct.status is None

    self._clock.tick(5)
    assert settle()
    assert hct.status.status == TaskState.Value('TASK_FAILED')

    hct.stop()
    assert self._checker.health.call_count == 1
def status(self):
    """Report TASK_FAILED when the threaded health checker is unhealthy.

    Returns:
        None while the checker is healthy; otherwise a StatusResult carrying
        the checker's reason and the TASK_FAILED state.
    """
    checker = self.threaded_health_checker
    if checker.healthy:
        return None

    message = 'Failed health check! %s' % checker.reason
    return StatusResult(message, TaskState.Value('TASK_FAILED'))
def _in_terminal_state(self):
    """Return True when a status is stored and it is neither TASK_RUNNING nor TASK_STARTING."""
    stored = self._status
    if stored is None:
        return False
    if not stored.status != TaskState.Value('TASK_RUNNING'):
        return False
    return stored.status != TaskState.Value('TASK_STARTING')
def status(self):
    """Return the preset status until ``call_count`` reaches 2, then TASK_KILLED.

    Each call made while ``call_count`` is not 2 increments the counter and
    returns ``self._status``; once the counter equals 2 every call returns a
    'Fake reason' TASK_KILLED result and the counter is left alone.
    """
    if self.call_count != 2:
        self.call_count += 1
        return self._status

    return StatusResult('Fake reason', TaskState.Value('TASK_KILLED'))
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import threading import pytest from mesos.interface.mesos_pb2 import TaskState from apache.aurora.executor.common.status_checker import (ChainedStatusChecker, Healthy, StatusChecker, StatusResult) TASK_STARTING = StatusResult(None, TaskState.Value('TASK_STARTING')) TASK_RUNNING = StatusResult(None, TaskState.Value('TASK_RUNNING')) TASK_FAILED = StatusResult(None, TaskState.Value('TASK_FAILED')) class EventHealth(StatusChecker): def __init__(self, status=None): self.started = threading.Event() self.stopped = threading.Event() self._status = status @property def status(self): return self._status def set_status(self, status):
def test_run_with_starting_status(self):
    """Run the shared status scenario with a TASK_STARTING result and 0 as the second argument."""
    starting = StatusResult(None, TaskState.Value('TASK_STARTING'))
    self.do_test_run_with_status(starting, 0)
def status(self):
    """Return None until ``call_count`` reaches 2, then the TASK_KILLED enum value.

    Each call made while ``call_count`` is not 2 increments the counter and
    returns None; once the counter equals 2 the raw TASK_KILLED value is
    returned and the counter is left alone.
    """
    if self.call_count != 2:
        self.call_count += 1
        return None

    return TaskState.Value('TASK_KILLED')
def status(self):
    """Report TASK_KILLED with the stored reason once ``self._killed`` is set.

    Returns:
        None while ``self._killed`` is falsy; otherwise a StatusResult built
        from ``self._reason`` and the TASK_KILLED state.
    """
    if not self._killed:
        return None

    return StatusResult(self._reason, TaskState.Value('TASK_KILLED'))
def test_run_with_running_status(self):
    """Run the shared status scenario with a TASK_RUNNING result and 1 as the second argument."""
    running = StatusResult(None, TaskState.Value('TASK_RUNNING'))
    self.do_test_run_with_status(running, 1)
def running_callback(result):
    # The callback must receive a reason-less TASK_RUNNING result; count each
    # invocation so the caller can check how many times it fired.
    expected = StatusResult(None, TaskState.Value('TASK_RUNNING'))
    assert result == expected
    self.running_callback_called += 1
def unhealthy_callback(result):
    # The callback must receive the 'Fake reason' TASK_KILLED result; record
    # that it fired.
    expected = StatusResult('Fake reason', TaskState.Value('TASK_KILLED'))
    assert result == expected
    self.unhealthy_callback_called = True
def callback(result):
    # The callback must receive the raw TASK_KILLED enum value; record that
    # it fired.
    expected = TaskState.Value('TASK_KILLED')
    assert result == expected
    self.callback_called = True
def __init__(self):
    """Store a fixed TASK_RUNNING result stating that no health check is defined."""
    reason = 'No health-check defined, task is assumed healthy.'
    state = TaskState.Value('TASK_RUNNING')
    self._status = StatusResult(reason, state)