def test_should_restart_bot_not_set(self):
    """No restart is requested when 'periodic_reboot_secs' is absent."""
    # The state intentionally carries no 'periodic_reboot_secs' key.
    bot_state = dict(running_time=0, started_ts=1410989556.174)
    outcome = bot_management.should_restart_bot('id', bot_state)
    self.assertEqual((False, ''), outcome)
def test_should_restart_bot_bad_type(self):
    """A 'periodic_reboot_secs' given as a string does not cause a restart."""
    # The period is the string '100' rather than a number, while the running
    # time (105) is larger than the number that string spells out.
    bot_state = dict(
        periodic_reboot_secs='100',
        running_time=105,
        started_ts=1410989556.174,
    )
    outcome = bot_management.should_restart_bot('id', bot_state)
    self.assertEqual((False, ''), outcome)
def test_should_restart_bot(self):
    """A restart and a non-empty message are returned once overdue.

    The state has running_time (105) above periodic_reboot_secs (100).
    """
    bot_state = dict(
        periodic_reboot_secs=100,
        running_time=105,
        started_ts=1410989556.174,
    )
    verdict = bot_management.should_restart_bot('id', bot_state)
    needs_reboot, message = verdict
    # Both the flag and the explanation must be truthy.
    self.assertTrue(needs_reboot)
    self.assertTrue(message)
def post(self):
    """Answers a bot poll with the next command the bot should execute.

    Missing values are tolerated on purpose. They can be caused by errors on
    the bot, and *a broken bot must never lose the ability to update*: that
    is how it eventually picks up fixed code, which makes recovering the
    fleet after a catastrophic failure much easier.

    The checks run in this order and the first one that matches ends the
    request:
      - bot version differs from the server's expected one: update;
      - bot is quarantined: sleep;
      - bot_management.should_restart_bot() asks for it: restart;
      - no task could be reaped: sleep;
      - otherwise: terminate, or run the reaped task.
    """
    (_ignored, bot_id, bot_version, bot_state, bot_dimensions,
     quarantine_reason) = self._process()
    streak = bot_state.get('sleep_streak', 0)
    is_quarantined = bool(quarantine_reason)

    # The bot is noted in two places: the stats (1 minute resolution) and
    # the list of known bots (through the events recorded below).
    if is_quarantined:
        stats_action = 'bot_inactive'
    else:
        stats_action = 'bot_active'
    stats.add_entry(
        action=stats_action, bot_id=bot_id, dimensions=bot_dimensions)

    def record(kind, task_id=None, task_name=None):
        # Every event carries the same snapshot of the bot; only the kind
        # and the optional task details differ between call sites.
        bot_management.bot_event(
            event_type=kind,
            bot_id=bot_id,
            external_ip=self.request.remote_addr,
            dimensions=bot_dimensions,
            state=bot_state,
            version=bot_version,
            quarantined=is_quarantined,
            task_id=task_id,
            task_name=task_name,
            message=quarantine_reason)

    # The expected version depends on the host, because the host URL is
    # embedded in swarming_bot.zip.
    server_version = bot_code.get_bot_version(self.request.host_url)
    if bot_version != server_version:
        record('request_update')
        self._cmd_update(server_version)
        return

    if is_quarantined:
        record('request_sleep')
        self._cmd_sleep(streak, is_quarantined)
        return

    #
    # From here on the bot runs the right version, so it should be in
    # relatively good shape. Invalid code may still have been pushed to the
    # server, so stay careful.
    #

    # A bot that has been running for too long may need a reboot.
    # Quarantined bots never get here, so they are not rebooted.
    restart_needed, restart_reason = bot_management.should_restart_bot(
        bot_id, bot_state)
    if restart_needed:
        record('request_restart')
        self._cmd_restart(restart_reason)
        return

    # Healthy bot: try to hand it a task.
    try:
        # Reaping is a fairly complex call; exceptions are expected.
        task_request, task_result = task_scheduler.bot_reap_task(
            bot_dimensions, bot_id, bot_version)
        if not task_request:
            # Nothing to run, so the bot is told to sleep a bit.
            record('request_sleep')
            self._cmd_sleep(streak, is_quarantined)
            return

        try:
            # Tricky part: this intentionally runs one transaction after
            # another.
            if not task_request.properties.is_terminate:
                record(
                    'request_task',
                    task_id=task_result.task_id,
                    task_name=task_request.name)
                self._cmd_run(task_request, task_result.key, bot_id)
            else:
                record('bot_terminate', task_id=task_result.task_id)
                self._cmd_terminate(task_result.task_id)
        except:
            # Log anything that fails after a task was reaped, then let it
            # propagate unchanged.
            logging.exception('Dang, exception after reaping')
            raise
    except runtime.DeadlineExceededError:
        # A timeout before a task was assigned is harmless. A timeout after
        # a task was assigned means the bot never received the details
        # needed to run it, so that task will time out (BOT_DIED) and be
        # retried automatically (TODO) when it does.
        # TODO(maruel): Note the task if possible and hand it out on next
        # poll.
        # https://code.google.com/p/swarming/issues/detail?id=130
        self.abort(500, 'Deadline')
def post(self):
    """Processes one poll from a bot and tells it what to do next.

    The handler is deliberately lenient about missing values. Those can come
    from errors on the bot, and *the bot must keep the capacity to update*
    so that it eventually self-updates to working code. This makes it much
    easier to recover the fleet after a catastrophic failure.

    Possible outcomes, tried in order: update (version mismatch), sleep
    (quarantined), restart (bot_management.should_restart_bot()), sleep (no
    task available), then terminate or run for a reaped task.
    """
    (_discarded, bot_id, reported_version, reported_state, dims,
     quarantine_msg) = self._process()
    streak = reported_state.get("sleep_streak", 0)
    in_quarantine = bool(quarantine_msg)

    # Bot existence is noted twice: once for the stats at 1 minute
    # resolution, once for the list of known bots.
    stats_action = "bot_active" if not in_quarantine else "bot_inactive"
    stats.add_entry(action=stats_action, bot_id=bot_id, dimensions=dims)

    def log_event(event_type, task_id=None, task_name=None):
        # Thin wrapper that fills in the per-request bot details so call
        # sites only pass what changes.
        bot_management.bot_event(
            event_type=event_type,
            bot_id=bot_id,
            external_ip=self.request.remote_addr,
            dimensions=dims,
            state=reported_state,
            version=reported_version,
            quarantined=in_quarantine,
            task_id=task_id,
            task_name=task_name,
            message=quarantine_msg,
        )

    # The version is specific to the host since the host URL is embedded in
    # swarming_bot.zip.
    wanted_version = bot_code.get_bot_version(self.request.host_url)
    if reported_version != wanted_version:
        log_event("request_update")
        self._cmd_update(wanted_version)
        return

    if in_quarantine:
        log_event("request_sleep")
        self._cmd_sleep(streak, in_quarantine)
        return

    #
    # The bot now runs the right version and should be in relatively good
    # shape. Invalid code could still have been pushed to the server, so
    # remain diligent.
    #

    # Bots running for too long may need a reboot. Quarantined bots have
    # already returned above and are never rebooted.
    must_restart, why_restart = bot_management.should_restart_bot(
        bot_id, reported_state
    )
    if must_restart:
        log_event("request_restart")
        self._cmd_restart(why_restart)
        return

    # The bot is fine; try to grab a task for it.
    try:
        # A fairly complex call, so exceptions are expected here.
        reaped_request, reaped_result = task_scheduler.bot_reap_task(
            dims, bot_id, reported_version
        )
        if not reaped_request:
            # No task was found: ask the bot to sleep a bit.
            log_event("request_sleep")
            self._cmd_sleep(streak, in_quarantine)
            return

        try:
            # Tricky: a transaction is intentionally run right after
            # another one.
            if not reaped_request.properties.is_terminate:
                log_event(
                    "request_task",
                    task_id=reaped_result.task_id,
                    task_name=reaped_request.name,
                )
                self._cmd_run(reaped_request, reaped_result.key, bot_id)
            else:
                log_event("bot_terminate", task_id=reaped_result.task_id)
                self._cmd_terminate(reaped_result.task_id)
        except:
            # Record the failure that happened after reaping, then re-raise
            # it untouched.
            logging.exception("Dang, exception after reaping")
            raise
    except runtime.DeadlineExceededError:
        # No problem if the timeout hit before a task was assigned. If it
        # hit afterwards, the bot did not get the details required to run
        # the task, so the task will time out (BOT_DIED) and automatically
        # get retried (TODO) at that point.
        # TODO(maruel): Note the task if possible and hand it out on next
        # poll.
        # https://code.google.com/p/swarming/issues/detail?id=130
        self.abort(500, "Deadline")
def post(self):
    """Decides which command a polling bot receives.

    Missing values are accepted very permissively. They may result from
    errors on the bot, and *bots must not be denied the capacity to update*,
    so that a fixed bot code eventually reaches them through self-update.
    Recovering the fleet after a catastrophic failure is much easier that
    way.

    Outcomes, in the order they are tried:
      - server configured to force bots to sleep: sleep;
      - bot version differs from the expected one: update;
      - bot is quarantined: sleep;
      - bot reports a bots.cfg version other than the current one: restart;
      - bot_management.should_restart_bot() asks for it: restart;
      - no task could be reaped: sleep;
      - otherwise: terminate, or run the reaped task.
    """
    if config.settings().force_bots_to_sleep_and_not_run_task:
        # Skip everything and just sleep. The bot is told it is quarantined
        # so it knows it will not run anything anyway, and the large streak
        # makes it sleep for 60s.
        self._cmd_sleep(1000, True)
        return

    poll = self._process()
    streak = poll.state.get('sleep_streak', 0)
    is_quarantined = bool(poll.quarantined_msg)

    # The bot is noted in two places: the stats at 1 minute resolution and
    # the list of known bots.
    if is_quarantined:
        stats_action = 'bot_inactive'
    else:
        stats_action = 'bot_active'
    stats.add_entry(
        action=stats_action,
        bot_id=poll.bot_id,
        dimensions=poll.dimensions)

    def record(kind, task_id=None, task_name=None):
        # All events share the bot details taken from this poll; the kind
        # and the optional task details are the only varying parts.
        bot_management.bot_event(
            event_type=kind,
            bot_id=poll.bot_id,
            external_ip=self.request.remote_addr,
            authenticated_as=auth.get_peer_identity().to_bytes(),
            dimensions=poll.dimensions,
            state=poll.state,
            version=poll.version,
            quarantined=is_quarantined,
            task_id=task_id,
            task_name=task_name,
            message=poll.quarantined_msg)

    # The expected version is host-specific because the host URL is embedded
    # in swarming_bot.zip.
    server_version = bot_code.get_bot_version(self.request.host_url)
    if poll.version != server_version:
        record('request_update')
        self._cmd_update(server_version)
        return

    if is_quarantined:
        record('request_sleep')
        self._cmd_sleep(streak, is_quarantined)
        return

    # When the server-side per-bot config changed, this bot has to restart
    # so that it picks up the new config in /handshake. Only bots that
    # already know about server-side per-bot configs are checked; those
    # bots send the 'bot_group_cfg_version' state attribute.
    reported_cfg_version = poll.state.get('bot_group_cfg_version')
    if (reported_cfg_version
            and reported_cfg_version != poll.bot_group_cfg.version):
        record('request_restart')
        self._cmd_restart('Restarting to pick up new bots.cfg config')
        return

    #
    # At this point the bot runs the right version and should be in
    # relatively good shape. Invalid code may still have been pushed to the
    # server, so stay diligent.
    #

    # A bot running for too long may need a reboot. Quarantined bots were
    # handled above and are not rebooted.
    restart_needed, restart_reason = bot_management.should_restart_bot(
        poll.bot_id, poll.state)
    if restart_needed:
        record('request_restart')
        self._cmd_restart(restart_reason)
        return

    # Healthy bot: try to grab a task for it.
    try:
        # Reaping is a fairly complex call; exceptions are expected.
        task_request, task_result = task_scheduler.bot_reap_task(
            poll.dimensions,
            poll.bot_id,
            poll.version,
            poll.state.get('lease_expiration_ts'))
        if not task_request:
            # Nothing available, so the bot is told to sleep a bit.
            record('request_sleep')
            self._cmd_sleep(streak, is_quarantined)
            return

        try:
            # Tricky part: this intentionally runs one transaction after
            # another.
            if not task_request.properties.is_terminate:
                record(
                    'request_task',
                    task_id=task_result.task_id,
                    task_name=task_request.name)
                self._cmd_run(task_request, task_result.key, poll.bot_id)
            else:
                record('bot_terminate', task_id=task_result.task_id)
                self._cmd_terminate(task_result.task_id)
        except:
            # Log whatever failed after the task was reaped and let it
            # propagate unchanged.
            logging.exception('Dang, exception after reaping')
            raise
    except runtime.DeadlineExceededError:
        # A timeout before a task was assigned causes no problem. After a
        # task was assigned, the bot lacks the details required to run it,
        # so the task will time out (BOT_DIED) and automatically get
        # retried (TODO) when that happens.
        # TODO(maruel): Note the task if possible and hand it out on next
        # poll.
        # https://code.google.com/p/swarming/issues/detail?id=130
        self.abort(500, 'Deadline')