def test_store_bot_config(self):
    # Uploading a new start bot script has to produce a different version
    # hash, so the version is sampled before and after the upload. A third
    # sample uses another host URL and must differ from both.
    localhost = 'http://localhost'
    before_upload = bot_code.get_bot_version(localhost)
    bot_code.store_bot_config('dummy_script')
    after_upload = bot_code.get_bot_version(localhost)
    other_host = bot_code.get_bot_version('http://localhost:8080')

    distinct_pairs = (
        (before_upload, after_upload),
        (before_upload, other_host),
        (after_upload, other_host),
    )
    for left, right in distinct_pairs:
        self.assertNotEqual(left, right)
def test_api_server(self):
    # The client server endpoint must answer with exactly the bot version
    # computed for 'http://localhost'.
    self.set_as_privileged_user()
    response = self.app.get('/swarming/api/v1/client/server')
    body = response.json
    wanted = {'bot_version': bot_code.get_bot_version('http://localhost')}
    self.assertEqual(wanted, body)
def get(self):
    """Renders the bot list page.

    Query parameters read from the request:
      limit: maximum number of bots fetched for the page; defaults to 100.
      cursor: urlsafe datastore cursor to start the page from.
      sort_by: must be in self.ACCEPTABLE_BOTS_SORTS; a leading '-' selects
          descending order. Defaults to '__key__'.

    Aborts with HTTP 400 when limit is not an integer or when sort_by is not
    an accepted value.
    """
    try:
        limit = int(self.request.get('limit', 100))
    except ValueError:
        # Reject garbage the same way an invalid sort_by is rejected below,
        # instead of letting the ValueError escape the handler.
        self.abort(400, 'Invalid limit query parameter')
    cursor = datastore_query.Cursor(urlsafe=self.request.get('cursor'))
    sort_by = self.request.get('sort_by', '__key__')
    if sort_by not in self.ACCEPTABLE_BOTS_SORTS:
        self.abort(400, 'Invalid sort_by query parameter')

    if sort_by[0] == '-':
        order = datastore_query.PropertyOrder(
            sort_by[1:], datastore_query.PropertyOrder.DESCENDING)
    else:
        order = datastore_query.PropertyOrder(
            sort_by, datastore_query.PropertyOrder.ASCENDING)

    now = utils.utcnow()
    # Bots last seen before this cutoff are counted as dead.
    cutoff = now - datetime.timedelta(
        seconds=config.settings().bot_death_timeout_secs)

    # Start every query asynchronously before waiting on any result.
    num_bots_busy_future = bot_management.BotInfo.query(
        bot_management.BotInfo.is_busy == True).count_async()
    num_bots_dead_future = bot_management.BotInfo.query(
        bot_management.BotInfo.last_seen_ts < cutoff).count_async()
    num_bots_quarantined_future = bot_management.BotInfo.query(
        bot_management.BotInfo.quarantined == True).count_async()
    num_bots_total_future = bot_management.BotInfo.query().count_async()
    fetch_future = bot_management.BotInfo.query().order(order).fetch_page_async(
        limit, start_cursor=cursor)

    # TODO(maruel): self.request.host_url should be the default AppEngine url
    # version and not the current one. It is only an issue when
    # version-dot-appid.appspot.com urls are used to access this page.
    version = bot_code.get_bot_version(self.request.host_url)

    bots, cursor, more = fetch_future.get_result()
    # Prefetch the tasks. We don't actually use the value here, it'll be
    # implicitly used by ndb local's cache when refetched by the html template.
    tasks = filter(None, (b.task for b in bots))
    ndb.get_multi(tasks)

    num_bots_busy = num_bots_busy_future.get_result()
    num_bots_dead = num_bots_dead_future.get_result()
    num_bots_quarantined = num_bots_quarantined_future.get_result()
    num_bots_total = num_bots_total_future.get_result()

    params = {
        'bots': bots,
        'current_version': version,
        # Only hand out a cursor when there is another page to fetch.
        'cursor': cursor.urlsafe() if cursor and more else '',
        'is_admin': acl.is_admin(),
        'is_privileged_user': acl.is_privileged_user(),
        'limit': limit,
        'now': now,
        'num_bots_alive': num_bots_total - num_bots_dead,
        'num_bots_busy': num_bots_busy,
        'num_bots_dead': num_bots_dead,
        'num_bots_quarantined': num_bots_quarantined,
        'sort_by': sort_by,
        'sort_options': self.SORT_OPTIONS,
        'xsrf_token': self.generate_xsrf_token(),
    }
    self.response.write(
        template.render('swarming/restricted_botslist.html', params))
def get(self, bot_id):
    """Renders the page of a single bot with its task results and events.

    Args:
      bot_id: id of the bot to display.

    Query parameters read from the request:
      limit: maximum number of task results on the page; defaults to 100.
      cursor: urlsafe datastore cursor for the task results.
    """
    # pagination is currently for tasks, not events.
    limit = int(self.request.get('limit', 100))
    cursor = datastore_query.Cursor(urlsafe=self.request.get('cursor'))
    bot_future = bot_management.get_info_key(bot_id).get_async()
    run_results, cursor, more = task_result.TaskRunResult.query(
        task_result.TaskRunResult.bot_id == bot_id).order(
            -task_result.TaskRunResult.started_ts).fetch_page(
                limit, start_cursor=cursor)
    events_future = bot_management.get_events_query(bot_id).fetch_async(100)
    now = utils.utcnow()
    bot = bot_future.get_result()

    # Calculate the time this bot was idle.
    idle_time = datetime.timedelta()
    run_time = datetime.timedelta()
    if run_results:
        latest = run_results[0]
        run_time = latest.duration_now(now) or datetime.timedelta()
        # Add idle time since last task completed. Do not do this when a cursor
        # is used since it's not representative. .ended_ts may be None (see
        # below), in which case no idle time can be computed for this task.
        # NOTE(review): cursor was rebound to the value returned by the query
        # above; confirm that is the cursor this check is meant to test.
        if (not cursor and latest.state != task_result.State.RUNNING
                and latest.ended_ts):
            idle_time = now - latest.ended_ts

        for index in xrange(1, len(run_results)):
            # .started_ts will always be set by definition but .ended_ts may be None
            # if the task was abandoned. We can't count idle time since the bot may
            # have been busy running *another task*.
            # TODO(maruel): One option is to add a third value "broken_time".
            # Looking at timestamps specifically could help too, e.g. comparing
            # ended_ts of this task vs the next one to see if the bot was assigned
            # two tasks simultaneously.
            if run_results[index].ended_ts:
                idle_time += (
                    run_results[index-1].started_ts - run_results[index].ended_ts)
            duration = run_results[index].duration
            if duration:
                run_time += duration

    params = {
        'bot': bot,
        'bot_id': bot_id,
        'current_version': bot_code.get_bot_version(self.request.host_url),
        # Only hand out a cursor when there is another page to fetch.
        'cursor': cursor.urlsafe() if cursor and more else None,
        'events': events_future.get_result(),
        'idle_time': idle_time,
        'is_admin': acl.is_admin(),
        'limit': limit,
        'now': now,
        'run_results': run_results,
        'run_time': run_time,
        'xsrf_token': self.generate_xsrf_token(),
    }
    self.response.write(
        template.render('swarming/restricted_bot.html', params))
def get(self, version=None):
    """Serves swarming_bot.zip for the host URL of the request.

    Args:
      version: optional bot version requested by the caller. When given, it
          must equal the current bot version for this host, otherwise the
          request is aborted with HTTP 404.
    """
    if not version:
        cache_control = "no-cache, no-store"
    else:
        current = bot_code.get_bot_version(self.request.host_url)
        if current != version:
            # This can happen when the server is rapidly updated.
            logging.error("Requested Swarming bot %s, have %s", version, current)
            self.abort(404)
        # A versioned request gets the public, one hour cache policy.
        cache_control = "public, max-age=3600"

    headers = self.response.headers
    headers["Cache-Control"] = cache_control
    headers["Content-Type"] = "application/octet-stream"
    headers["Content-Disposition"] = 'attachment; filename="swarming_bot.zip"'
    archive = bot_code.get_swarming_bot_zip(self.request.host_url)
    self.response.out.write(archive)
def post(self):
    """Records a 'bot_connected' event and replies with version information.

    The response carries the bot version for the request's host URL, the
    server app version and an XSRF token.
    """
    (_ignored, bot_id, version, state, dimensions,
     quarantined_msg) = self._process()

    event = dict(
        event_type='bot_connected',
        bot_id=bot_id,
        external_ip=self.request.remote_addr,
        dimensions=dimensions,
        state=state,
        version=version,
        quarantined=bool(quarantined_msg),
        task_id='',
        task_name=None,
        message=quarantined_msg)
    bot_management.bot_event(**event)

    data = {}
    data['bot_version'] = bot_code.get_bot_version(self.request.host_url)
    data['server_version'] = utils.get_app_version()
    # This access token will be used to validate each subsequent request.
    data['xsrf_token'] = self.generate_xsrf_token()
    self.send_response(data)
def test_get_bot_version(self):
    # The version must consist of exactly 40 lowercase hexadecimal characters.
    version = bot_code.get_bot_version('http://localhost')
    matched = re.match(r'^[0-9a-f]{40}$', version)
    self.assertTrue(matched, version)
def get(self, bot_id):
    """Renders the page of a single bot with its task results and events.

    When no BotInfo entity is found but bot events exist, a BotInfo is
    rebuilt in memory from the first fetched event so the page can still be
    displayed.

    Args:
      bot_id: id of the bot to display.

    Query parameters read from the request:
      limit: maximum number of task results on the page; defaults to 100.
      cursor: urlsafe datastore cursor for the task results.
    """
    # pagination is currently for tasks, not events.
    limit = int(self.request.get('limit', 100))
    cursor = datastore_query.Cursor(urlsafe=self.request.get('cursor'))
    run_results_future = task_result.TaskRunResult.query(
        task_result.TaskRunResult.bot_id == bot_id).order(
            -task_result.TaskRunResult.started_ts).fetch_page_async(
                limit, start_cursor=cursor)
    bot_future = bot_management.get_info_key(bot_id).get_async()
    events_future = bot_management.get_events_query(
        bot_id, True).fetch_async(100)

    now = utils.utcnow()

    # Calculate the time this bot was idle.
    idle_time = datetime.timedelta()
    run_time = datetime.timedelta()
    run_results, cursor, more = run_results_future.get_result()
    if run_results:
        latest = run_results[0]
        run_time = latest.duration_now(now) or datetime.timedelta()
        # Add idle time since last task completed. Do not do this when a cursor
        # is used since it's not representative. .ended_ts may be None (see
        # below), in which case no idle time can be computed for this task.
        # NOTE(review): cursor was rebound to the value returned by the query
        # above; confirm that is the cursor this check is meant to test.
        if (not cursor and latest.state != task_result.State.RUNNING
                and latest.ended_ts):
            idle_time = now - latest.ended_ts

        for index in xrange(1, len(run_results)):
            # .started_ts will always be set by definition but .ended_ts may be None
            # if the task was abandoned. We can't count idle time since the bot may
            # have been busy running *another task*.
            # TODO(maruel): One option is to add a third value "broken_time".
            # Looking at timestamps specifically could help too, e.g. comparing
            # ended_ts of this task vs the next one to see if the bot was assigned
            # two tasks simultaneously.
            if run_results[index].ended_ts:
                idle_time += (
                    run_results[index-1].started_ts - run_results[index].ended_ts)
            # We are taking the whole time the bot was doing work, not just the
            # duration associated with the task.
            duration = run_results[index].duration_total
            if duration:
                run_time += duration

    events = events_future.get_result()
    bot = bot_future.get_result()
    if not bot and events:
        # If there is not BotInfo, look if there are BotEvent child of this
        # entity. If this is the case, it means the bot was deleted but it's
        # useful to show information about it to the user even if the bot was
        # deleted. For example, it could be an auto-scaled bot.
        bot = bot_management.BotInfo(
            key=bot_management.get_info_key(bot_id),
            dimensions=events[0].dimensions,
            state=events[0].state,
            external_ip=events[0].external_ip,
            version=events[0].version,
            quarantined=events[0].quarantined,
            task_id=events[0].task_id,
            last_seen_ts=events[0].ts)

    params = {
        'bot': bot,
        'bot_id': bot_id,
        'current_version': bot_code.get_bot_version(self.request.host_url),
        # Only hand out a cursor when there is another page to fetch.
        'cursor': cursor.urlsafe() if cursor and more else None,
        'events': events,
        'idle_time': idle_time,
        'is_admin': acl.is_admin(),
        'limit': limit,
        'now': now,
        'run_results': run_results,
        'run_time': run_time,
        'xsrf_token': self.generate_xsrf_token(),
    }
    self.response.write(
        template.render('swarming/restricted_bot.html', params))
def test_api_server(self):
    # A privileged user querying the client server endpoint gets back the
    # bot version computed for "http://localhost" and nothing else.
    self.set_as_privileged_user()
    reply = self.app.get("/swarming/api/v1/client/server")
    received = reply.json
    version = bot_code.get_bot_version("http://localhost")
    self.assertEqual({"bot_version": version}, received)
def post(self):
    """Answers a bot poll with the next command for the bot.

    Missing values are tolerated on purpose. They can be caused by errors on
    the bot, and *we don't want to deny them the capacity to update*: once the
    bot code is fixed, the bot can still self-update to the working code. This
    makes recovering the fleet after a catastrophic failure much easier.

    The checks run in this order, and the first one that matches decides the
    command: update (version mismatch), sleep (quarantined), restart, then
    task reaping (terminate, run, or sleep when there is no task).
    """
    (_unused, bot_id, version, state, dimensions,
     quarantined_msg) = self._process()
    streak = state.get("sleep_streak", 0)
    is_quarantined = bool(quarantined_msg)

    # Note bot existence at two places, one for stats at 1 minute resolution,
    # the other for the list of known bots.
    if is_quarantined:
        stats_action = "bot_inactive"
    else:
        stats_action = "bot_active"
    stats.add_entry(action=stats_action, bot_id=bot_id, dimensions=dimensions)

    def record(event_type, task_id=None, task_name=None):
        # Logs a bot event carrying the values shared by every event of
        # this poll.
        bot_management.bot_event(
            event_type=event_type,
            bot_id=bot_id,
            external_ip=self.request.remote_addr,
            dimensions=dimensions,
            state=state,
            version=version,
            quarantined=is_quarantined,
            task_id=task_id,
            task_name=task_name,
            message=quarantined_msg,
        )

    # Bot version is host-specific because the host URL is embedded in
    # swarming_bot.zip
    wanted_version = bot_code.get_bot_version(self.request.host_url)
    if wanted_version != version:
        record("request_update")
        self._cmd_update(wanted_version)
        return

    if is_quarantined:
        record("request_sleep")
        self._cmd_sleep(streak, is_quarantined)
        return

    #
    # At that point, the bot should be in relatively good shape since it's
    # running the right version. It is still possible that invalid code was
    # pushed to the server, so be diligent about it.
    #

    # Bot may need a reboot if it is running for too long. We do not reboot
    # quarantined bots.
    restart_check = bot_management.should_restart_bot(bot_id, state)
    needs_restart, restart_message = restart_check
    if needs_restart:
        record("request_restart")
        self._cmd_restart(restart_message)
        return

    # The bot is in good shape. Try to grab a task.
    try:
        # This is a fairly complex function call, exceptions are expected.
        request, run_result = task_scheduler.bot_reap_task(
            dimensions, bot_id, version)
        if not request:
            # No task found, tell it to sleep a bit.
            record("request_sleep")
            self._cmd_sleep(streak, is_quarantined)
            return

        try:
            # This part is tricky since it intentionally runs a transaction after
            # another one.
            if not request.properties.is_terminate:
                record(
                    "request_task",
                    task_id=run_result.task_id,
                    task_name=request.name)
                self._cmd_run(request, run_result.key, bot_id)
            else:
                record("bot_terminate", task_id=run_result.task_id)
                self._cmd_terminate(run_result.task_id)
        except:
            # Deliberately catches everything: log, then let it propagate.
            logging.exception("Dang, exception after reaping")
            raise
    except runtime.DeadlineExceededError:
        # If the timeout happened before a task was assigned there is no problems.
        # If the timeout occurred after a task was assigned, that task will
        # timeout (BOT_DIED) since the bot didn't get the details required to
        # run it) and it will automatically get retried (TODO) when the task times
        # out.
        # TODO(maruel): Note the task if possible and hand it out on next poll.
        # https://code.google.com/p/swarming/issues/detail?id=130
        self.abort(500, "Deadline")
def get(self):
    """Replies with the bot version computed for the request's host URL."""
    version = bot_code.get_bot_version(self.request.host_url)
    payload = utils.to_json_encodable({'bot_version': version})
    self.send_response(payload)
def get(self):
    """Logs that an old client called, then replies with the bot version."""
    logging.error('Unexpected old client')
    version = bot_code.get_bot_version(self.request.host_url)
    payload = utils.to_json_encodable({'bot_version': version})
    self.send_response(payload)
def get(self, bot_id):
    """Renders the page of a single bot with its task results and events.

    When no BotInfo entity is found but bot events exist, a BotInfo is
    rebuilt in memory from the first fetched event so the page can still be
    displayed.

    Args:
      bot_id: id of the bot to display.

    Query parameters read from the request:
      limit: maximum number of task results on the page; defaults to 100.
      cursor: urlsafe datastore cursor for the task results.
    """
    # pagination is currently for tasks, not events.
    limit = int(self.request.get('limit', 100))
    cursor = datastore_query.Cursor(urlsafe=self.request.get('cursor'))
    run_results_future = task_result.TaskRunResult.query(
        task_result.TaskRunResult.bot_id == bot_id).order(
            -task_result.TaskRunResult.started_ts).fetch_page_async(
                limit, start_cursor=cursor)
    bot_future = bot_management.get_info_key(bot_id).get_async()
    events_future = bot_management.get_events_query(bot_id).fetch_async(
        100)

    now = utils.utcnow()

    # Calculate the time this bot was idle.
    idle_time = datetime.timedelta()
    run_time = datetime.timedelta()
    run_results, cursor, more = run_results_future.get_result()
    if run_results:
        latest = run_results[0]
        run_time = latest.duration_now(now) or datetime.timedelta()
        # Add idle time since last task completed. Do not do this when a cursor
        # is used since it's not representative. .ended_ts may be None (see
        # below), in which case no idle time can be computed for this task.
        # NOTE(review): cursor was rebound to the value returned by the query
        # above; confirm that is the cursor this check is meant to test.
        if (not cursor and latest.state != task_result.State.RUNNING
                and latest.ended_ts):
            idle_time = now - latest.ended_ts

        for index in xrange(1, len(run_results)):
            # .started_ts will always be set by definition but .ended_ts may be None
            # if the task was abandoned. We can't count idle time since the bot may
            # have been busy running *another task*.
            # TODO(maruel): One option is to add a third value "broken_time".
            # Looking at timestamps specifically could help too, e.g. comparing
            # ended_ts of this task vs the next one to see if the bot was assigned
            # two tasks simultaneously.
            if run_results[index].ended_ts:
                idle_time += (run_results[index - 1].started_ts -
                              run_results[index].ended_ts)
            duration = run_results[index].duration
            if duration:
                run_time += duration

    events = events_future.get_result()
    bot = bot_future.get_result()
    if not bot and events:
        # If there is not BotInfo, look if there are BotEvent child of this
        # entity. If this is the case, it means the bot was deleted but it's
        # useful to show information about it to the user even if the bot was
        # deleted. For example, it could be an auto-scaled bot.
        bot = bot_management.BotInfo(
            key=bot_management.get_info_key(bot_id),
            dimensions=events[0].dimensions,
            state=events[0].state,
            external_ip=events[0].external_ip,
            version=events[0].version,
            quarantined=events[0].quarantined,
            task_id=events[0].task_id,
            last_seen_ts=events[0].ts)

    params = {
        'bot': bot,
        'bot_id': bot_id,
        'current_version': bot_code.get_bot_version(self.request.host_url),
        # Only hand out a cursor when there is another page to fetch.
        'cursor': cursor.urlsafe() if cursor and more else None,
        'events': events,
        'idle_time': idle_time,
        'is_admin': acl.is_admin(),
        'limit': limit,
        'now': now,
        'run_results': run_results,
        'run_time': run_time,
        'xsrf_token': self.generate_xsrf_token(),
    }
    self.response.write(
        template.render('swarming/restricted_bot.html', params))