def events(self, request): """Returns events that happened on a bot.""" logging.info('%s', request) try: now = utils.utcnow() start = message_conversion.epoch_to_datetime(request.start) end = message_conversion.epoch_to_datetime(request.end) order = not (start or end) query = bot_management.get_events_query(request.bot_id, order) if not order: query = query.order( -bot_management.BotEvent.ts, bot_management.BotEvent.key) if start: query = query.filter(bot_management.BotEvent.ts >= start) if end: query = query.filter(bot_management.BotEvent.ts < end) items, cursor = datastore_utils.fetch_page( query, request.limit, request.cursor) except ValueError as e: raise endpoints.BadRequestException( 'Inappropriate filter for bot.events: %s' % e) return swarming_rpcs.BotEvents( cursor=cursor, items=[message_conversion.bot_event_to_rpc(r) for r in items], now=now)
def test_bot_event_poll_sleep(self): now = datetime.datetime(2010, 1, 2, 3, 4, 5, 6) self.mock_now(now) bot_management.bot_event( event_type='request_sleep', bot_id='id1', external_ip='8.8.4.4', dimensions={'id': ['id1'], 'foo': ['bar']}, state={'ram': 65}, version=hashlib.sha1().hexdigest(), quarantined=True, task_id=None, task_name=None) # Assert that BotInfo was updated too. expected = { 'dimensions': {u'foo': [u'bar'], u'id': [u'id1']}, 'external_ip': u'8.8.4.4', 'first_seen_ts': now, 'id': 'id1', 'last_seen_ts': now, 'quarantined': True, 'state': {u'ram': 65}, 'task_id': None, 'task_name': None, 'version': u'da39a3ee5e6b4b0d3255bfef95601890afd80709', } bot_info = bot_management.get_info_key('id1').get() self.assertEqual(expected, bot_info.to_dict()) self.assertEqual(False, bot_info.is_busy) # No BotEvent is registered for 'poll'. self.assertEqual([], bot_management.get_events_query('id1', True).fetch())
def test_bot_event(self): self.mock(random, 'getrandbits', lambda _: 0x88) now = datetime.datetime(2010, 1, 2, 3, 4, 5) self.mock_now(now) token, params = self.get_bot_token() params['event'] = 'bot_rebooting' params['message'] = 'for the best' response = self.post_with_token('/swarming/api/v1/bot/event', params, token) self.assertEqual({}, response) # TODO(maruel): Replace with client api to query last BotEvent. actual = [ e.to_dict() for e in bot_management.get_events_query('bot1', True)] expected = [ { 'dimensions': { u'id': [u'bot1'], u'os': [u'Amiga'], u'pool': [u'default'], }, 'event_type': u'bot_rebooting', 'external_ip': u'192.168.2.2', 'message': u'for the best', 'quarantined': False, 'state': { u'running_time': 1234.0, u'sleep_streak': 0, u'started_ts': 1410990411.111, }, 'task_id': u'', 'ts': now, 'version': self.bot_version, }, { 'dimensions': { u'id': [u'bot1'], u'os': [u'Amiga'], u'pool': [u'default'], }, 'event_type': u'bot_connected', 'external_ip': u'192.168.2.2', 'message': None, 'quarantined': False, 'state': { u'running_time': 1234.0, u'sleep_streak': 0, u'started_ts': 1410990411.111, }, 'task_id': u'', 'ts': now, 'version': u'123', }, ] self.assertEqual(expected, actual)
def get(self, bot_id): # pagination is currently for tasks, not events. limit = int(self.request.get('limit', 100)) cursor = datastore_query.Cursor(urlsafe=self.request.get('cursor')) bot_future = bot_management.get_info_key(bot_id).get_async() run_results, cursor, more = task_result.TaskRunResult.query( task_result.TaskRunResult.bot_id == bot_id).order( -task_result.TaskRunResult.started_ts).fetch_page( limit, start_cursor=cursor) events_future = bot_management.get_events_query(bot_id).fetch_async(100) now = utils.utcnow() bot = bot_future.get_result() # Calculate the time this bot was idle. idle_time = datetime.timedelta() run_time = datetime.timedelta() if run_results: run_time = run_results[0].duration_now(now) or datetime.timedelta() if not cursor and run_results[0].state != task_result.State.RUNNING: # Add idle time since last task completed. Do not do this when a cursor # is used since it's not representative. idle_time = now - run_results[0].ended_ts for index in xrange(1, len(run_results)): # .started_ts will always be set by definition but .ended_ts may be None # if the task was abandoned. We can't count idle time since the bot may # have been busy running *another task*. # TODO(maruel): One option is to add a third value "broken_time". # Looking at timestamps specifically could help too, e.g. comparing # ended_ts of this task vs the next one to see if the bot was assigned # two tasks simultaneously. if run_results[index].ended_ts: idle_time += ( run_results[index-1].started_ts - run_results[index].ended_ts) duration = run_results[index].duration if duration: run_time += duration params = { 'bot': bot, 'bot_id': bot_id, 'current_version': bot_code.get_bot_version(self.request.host_url), 'cursor': cursor.urlsafe() if cursor and more else None, 'events': events_future.get_result(), 'idle_time': idle_time, 'is_admin': acl.is_admin(), 'limit': limit, 'now': now, 'run_results': run_results, 'run_time': run_time, 'xsrf_token': self.generate_xsrf_token(), } self.response.write( template.render('swarming/restricted_bot.html', params))
def test_bot_event_busy(self): now = datetime.datetime(2010, 1, 2, 3, 4, 5, 6) self.mock_now(now) bot_management.bot_event( event_type='request_task', bot_id='id1', external_ip='8.8.4.4', dimensions={'id': ['id1'], 'foo': ['bar']}, state={'ram': 65}, version=hashlib.sha1().hexdigest(), quarantined=False, task_id='12311', task_name='yo') expected = { 'dimensions': {u'foo': [u'bar'], u'id': [u'id1']}, 'external_ip': u'8.8.4.4', 'first_seen_ts': now, 'id': 'id1', 'last_seen_ts': now, 'quarantined': False, 'state': {u'ram': 65}, 'task_id': u'12311', 'task_name': u'yo', 'version': u'da39a3ee5e6b4b0d3255bfef95601890afd80709', } bot_info = bot_management.get_info_key('id1').get() self.assertEqual(expected, bot_info.to_dict()) self.assertEqual(True, bot_info.is_busy) expected = [ { 'dimensions': {u'foo': [u'bar'], u'id': [u'id1']}, 'event_type': u'request_task', 'external_ip': u'8.8.4.4', 'message': None, 'quarantined': False, 'state': {u'ram': 65}, 'task_id': u'12311', 'ts': now, 'version': u'da39a3ee5e6b4b0d3255bfef95601890afd80709', }, ] self.assertEqual( expected, [e.to_dict() for e in bot_management.get_events_query('id1', True)])
def test_get_events_query(self): now = datetime.datetime(2010, 1, 2, 3, 4, 5, 6) self.mock_now(now) bot_management.bot_event(event_type='bot_connected', bot_id='id1', external_ip='8.8.4.4', authenticated_as='bot:id1.domain', dimensions={ 'id': ['id1'], 'foo': ['bar'] }, state={'ram': 65}, version=hashlib.sha1().hexdigest(), quarantined=False, task_id=None, task_name=None) expected = [ { 'authenticated_as': 'bot:id1.domain', 'dimensions': { u'foo': [u'bar'], u'id': [u'id1'] }, 'event_type': u'bot_connected', 'external_ip': u'8.8.4.4', 'lease_id': None, 'lease_expiration_ts': None, 'message': None, 'quarantined': False, 'state': { u'ram': 65 }, 'task_id': None, 'ts': now, 'version': u'da39a3ee5e6b4b0d3255bfef95601890afd80709', }, ] self.assertEqual(expected, [ i.to_dict() for i in bot_management.get_events_query('id1', True) ])
def test_get_events_query(self): now = datetime.datetime(2010, 1, 2, 3, 4, 5, 6) self.mock_now(now) bot_management.bot_event( event_type='bot_connected', bot_id='id1', external_ip='8.8.4.4', dimensions={'id': ['id1'], 'foo': ['bar']}, state={'ram': 65}, version=hashlib.sha1().hexdigest(), quarantined=False, task_id=None, task_name=None) expected = [ { 'dimensions': {u'foo': [u'bar'], u'id': [u'id1']}, 'event_type': u'bot_connected', 'external_ip': u'8.8.4.4', 'message': None, 'quarantined': False, 'state': {u'ram': 65}, 'task_id': None, 'ts': now, 'version': u'da39a3ee5e6b4b0d3255bfef95601890afd80709', }, ] self.assertEqual( expected, [i.to_dict() for i in bot_management.get_events_query('id1')])
def get(self, request): """Returns information about a known bot. This includes its state and dimensions, and if it is currently running a task. """ logging.debug('%s', request) bot_id = request.bot_id bot = bot_management.get_info_key(bot_id).get() deleted = False if not bot: # If there is not BotInfo, look if there are BotEvent child of this # entity. If this is the case, it means the bot was deleted but it's # useful to show information about it to the user even if the bot was # deleted. For example, it could be an machine-provider bot. events = bot_management.get_events_query(bot_id, True).fetch(1) if not events: raise endpoints.NotFoundException('%s not found.' % bot_id) bot = bot_management.BotInfo( key=bot_management.get_info_key(bot_id), dimensions_flat=task_queues.dimensions_to_flat( events[0].dimensions), state=events[0].state, external_ip=events[0].external_ip, authenticated_as=events[0].authenticated_as, version=events[0].version, quarantined=events[0].quarantined, maintenance_msg=events[0].maintenance_msg, task_id=events[0].task_id, last_seen_ts=events[0].ts, lease_id=events[0].lease_id, lease_expiration_ts=events[0].lease_expiration_ts, machine_type=events[0].machine_type, machine_lease=events[0].machine_lease) deleted = True return message_conversion.bot_info_to_rpc(bot, deleted=deleted)
def get(self, bot_id): # pagination is currently for tasks, not events. limit = int(self.request.get('limit', 100)) cursor = datastore_query.Cursor(urlsafe=self.request.get('cursor')) run_results_future = task_result.TaskRunResult.query( task_result.TaskRunResult.bot_id == bot_id).order( -task_result.TaskRunResult.started_ts).fetch_page_async( limit, start_cursor=cursor) bot_future = bot_management.get_info_key(bot_id).get_async() events_future = bot_management.get_events_query( bot_id, True).fetch_async(100) now = utils.utcnow() # Calculate the time this bot was idle. idle_time = datetime.timedelta() run_time = datetime.timedelta() run_results, cursor, more = run_results_future.get_result() if run_results: run_time = run_results[0].duration_now(now) or datetime.timedelta() if not cursor and run_results[0].state != task_result.State.RUNNING: # Add idle time since last task completed. Do not do this when a cursor # is used since it's not representative. idle_time = now - run_results[0].ended_ts for index in xrange(1, len(run_results)): # .started_ts will always be set by definition but .ended_ts may be None # if the task was abandoned. We can't count idle time since the bot may # have been busy running *another task*. # TODO(maruel): One option is to add a third value "broken_time". # Looking at timestamps specifically could help too, e.g. comparing # ended_ts of this task vs the next one to see if the bot was assigned # two tasks simultaneously. if run_results[index].ended_ts: idle_time += ( run_results[index-1].started_ts - run_results[index].ended_ts) # We are taking the whole time the bot was doing work, not just the # duration associated with the task. duration = run_results[index].duration_as_seen_by_server if duration: run_time += duration events = events_future.get_result() bot = bot_future.get_result() if not bot and events: # If there is not BotInfo, look if there are BotEvent child of this # entity. If this is the case, it means the bot was deleted but it's # useful to show information about it to the user even if the bot was # deleted. For example, it could be an auto-scaled bot. bot = bot_management.BotInfo( key=bot_management.get_info_key(bot_id), dimensions_flat=bot_management.dimensions_to_flat( events[0].dimensions), state=events[0].state, external_ip=events[0].external_ip, authenticated_as=events[0].authenticated_as, version=events[0].version, quarantined=events[0].quarantined, task_id=events[0].task_id, last_seen_ts=events[0].ts) params = { 'bot': bot, 'bot_id': bot_id, 'current_version': bot_code.get_bot_version(self.request.host_url), 'cursor': cursor.urlsafe() if cursor and more else None, 'events': events, 'idle_time': idle_time, 'is_admin': acl.is_admin(), 'limit': limit, 'now': now, 'run_results': run_results, 'run_time': run_time, 'try_link': '/bot?id=%s' % bot_id, 'xsrf_token': self.generate_xsrf_token(), } self.response.write( template.render('swarming/restricted_bot.html', params))
def get(self, bot_id): # pagination is currently for tasks, not events. limit = int(self.request.get('limit', 100)) cursor = datastore_query.Cursor(urlsafe=self.request.get('cursor')) run_results_future = task_result.TaskRunResult.query( task_result.TaskRunResult.bot_id == bot_id).order( -task_result.TaskRunResult.started_ts).fetch_page_async( limit, start_cursor=cursor) bot_future = bot_management.get_info_key(bot_id).get_async() events_future = bot_management.get_events_query( bot_id, True).fetch_async(100) now = utils.utcnow() # Calculate the time this bot was idle. idle_time = datetime.timedelta() run_time = datetime.timedelta() run_results, cursor, more = run_results_future.get_result() if run_results: run_time = run_results[0].duration_now(now) or datetime.timedelta() if not cursor and run_results[0].state != task_result.State.RUNNING: # Add idle time since last task completed. Do not do this when a cursor # is used since it's not representative. idle_time = now - run_results[0].ended_ts for index in xrange(1, len(run_results)): # .started_ts will always be set by definition but .ended_ts may be None # if the task was abandoned. We can't count idle time since the bot may # have been busy running *another task*. # TODO(maruel): One option is to add a third value "broken_time". # Looking at timestamps specifically could help too, e.g. comparing # ended_ts of this task vs the next one to see if the bot was assigned # two tasks simultaneously. if run_results[index].ended_ts: idle_time += ( run_results[index-1].started_ts - run_results[index].ended_ts) # We are taking the whole time the bot was doing work, not just the # duration associated with the task. duration = run_results[index].duration_total if duration: run_time += duration events = events_future.get_result() bot = bot_future.get_result() if not bot and events: # If there is not BotInfo, look if there are BotEvent child of this # entity. If this is the case, it means the bot was deleted but it's # useful to show information about it to the user even if the bot was # deleted. For example, it could be an auto-scaled bot. bot = bot_management.BotInfo( key=bot_management.get_info_key(bot_id), dimensions=events[0].dimensions, state=events[0].state, external_ip=events[0].external_ip, version=events[0].version, quarantined=events[0].quarantined, task_id=events[0].task_id, last_seen_ts=events[0].ts) params = { 'bot': bot, 'bot_id': bot_id, 'current_version': bot_code.get_bot_version(self.request.host_url), 'cursor': cursor.urlsafe() if cursor and more else None, 'events': events, 'idle_time': idle_time, 'is_admin': acl.is_admin(), 'limit': limit, 'now': now, 'run_results': run_results, 'run_time': run_time, 'xsrf_token': self.generate_xsrf_token(), } self.response.write( template.render('swarming/restricted_bot.html', params))
def check_for_connection(machine_lease): """Checks for a bot_connected event. Args: machine_lease: MachineLease instance. """ assert machine_lease.instruction_ts # Technically this query is wrong because it looks at events in reverse # chronological order. The connection time we find here is actually the # most recent connection when we want the earliest. However, this function # is only called for new bots and stops being called once the connection # time is recorded, so the connection time we record should end up being the # first connection anyways. Iterating in the correct order would require # building a new, large index. for event in bot_management.get_events_query(machine_lease.bot_id, True): # We don't want to find a bot_connected event from before we sent the # connection instruction (e.g. in the event of hostname reuse), so do not # look at events from before the connection instruction was sent. if event.ts < machine_lease.instruction_ts: break if event.event_type == 'bot_connected': logging.info( 'Bot connected:\nKey: %s\nHostname: %s\nTime: %s', machine_lease.key, machine_lease.hostname, event.ts, ) associate_connection_ts(machine_lease.key, event.ts) ts_mon_metrics.on_machine_connected_time( (event.ts - machine_lease.instruction_ts).total_seconds(), fields={ 'machine_type': machine_lease.machine_type.id(), }, ) return # The bot hasn't connected yet. If it's dead or missing, release the lease. # At this point we have sent the connection instruction so the bot could still # connect after we release the lease but before Machine Provider actually # deletes the bot. Therefore we also schedule a termination task if releasing # the bot. That way, if the bot connects, it will just shut itself down. bot_info = bot_management.get_info_key(machine_lease.hostname).get() if not bot_info: logging.error( 'BotInfo missing:\nKey: %s\nHostname: %s', machine_lease.key, machine_lease.hostname, ) task_scheduler.schedule_request( task_request.create_termination_task(machine_lease.hostname), None, ) if release(machine_lease): clear_lease_request(machine_lease.key, machine_lease.client_request_id) return if bot_info.is_dead: logging.warning( 'Bot failed to connect in time:\nKey: %s\nHostname: %s', machine_lease.key, machine_lease.hostname, ) task_scheduler.schedule_request( task_request.create_termination_task(machine_lease.hostname), None, ) if release(machine_lease): cleanup_bot(machine_lease)
def test_bot_event(self): self.mock(random, 'getrandbits', lambda _: 0x88) now = datetime.datetime(2010, 1, 2, 3, 4, 5) self.mock_now(now) params = self.do_handshake() for e in handlers_bot.BotEventHandler.ALLOWED_EVENTS: if e == 'bot_error': # This one is tested specifically since it also logs an error message. continue params['event'] = e params['message'] = 'for the best' response = self.post_json('/swarming/api/v1/bot/event', params) self.assertEqual({}, response) # TODO(maruel): Replace with client api to query last BotEvent. actual = [ e.to_dict() for e in bot_management.get_events_query('bot1', True)] expected = [ { 'authenticated_as': u'bot:whitelisted-ip', 'dimensions': { u'id': [u'bot1'], u'os': [u'Amiga'], u'pool': [u'default'], }, 'event_type': unicode(e), 'external_ip': u'192.168.2.2', 'lease_id': None, 'lease_expiration_ts': None, 'message': u'for the best', 'quarantined': False, 'state': { u'bot_group_cfg_version': u'default', u'running_time': 1234.0, u'sleep_streak': 0, u'started_ts': 1410990411.111, }, 'task_id': u'', 'ts': now, 'version': self.bot_version, } for e in reversed(handlers_bot.BotEventHandler.ALLOWED_EVENTS) if e != 'bot_error' ] expected.append( { 'authenticated_as': u'bot:whitelisted-ip', 'dimensions': { u'id': [u'bot1'], u'os': [u'Amiga'], u'pool': [u'default'], }, 'event_type': u'bot_connected', 'external_ip': u'192.168.2.2', 'lease_id': None, 'lease_expiration_ts': None, 'message': None, 'quarantined': False, 'state': { u'running_time': 1234.0, u'sleep_streak': 0, u'started_ts': 1410990411.111, }, 'task_id': u'', 'ts': now, 'version': u'123', }) self.assertEqual(expected, actual)
def test_get_events_query(self): _bot_event(event_type='bot_connected') expected = [_gen_bot_event(event_type=u'bot_connected')] self.assertEqual(expected, [ i.to_dict() for i in bot_management.get_events_query('id1', True) ])
def test_cron_update_bot_info(self): # Create two bots, one becomes dead, updating the cron job fixes composite. timeout = bot_management.config.settings().bot_death_timeout_secs def check(dead, alive): q = bot_management.filter_availability( bot_management.BotInfo.query(), quarantined=None, in_maintenance=None, is_dead=True, is_busy=None) self.assertEqual(dead, [t.to_dict() for t in q]) q = bot_management.filter_availability( bot_management.BotInfo.query(), quarantined=None, in_maintenance=None, is_dead=False, is_busy=None) self.assertEqual(alive, [t.to_dict() for t in q]) _bot_event(event_type='request_sleep') # One second before the timeout value. then = self.mock_now(self.now, timeout - 1) _bot_event(event_type='request_sleep', bot_id='id2', external_ip='8.8.4.4', authenticated_as='bot:id2.domain', dimensions={ 'id': ['id2'], 'foo': ['bar'] }) bot1_alive = _gen_bot_info(first_seen_ts=self.now, last_seen_ts=self.now) bot1_dead = _gen_bot_info( first_seen_ts=self.now, last_seen_ts=self.now, composite=[ bot_management.BotInfo.NOT_IN_MAINTENANCE, bot_management.BotInfo.DEAD, bot_management.BotInfo.HEALTHY, bot_management.BotInfo.IDLE, ], is_dead=True) bot2_alive = _gen_bot_info(authenticated_as=u'bot:id2.domain', dimensions={ u'foo': [u'bar'], u'id': [u'id2'] }, first_seen_ts=then, id='id2', last_seen_ts=then) check([], [bot1_alive, bot2_alive]) self.assertEqual(0, bot_management.cron_update_bot_info()) check([], [bot1_alive, bot2_alive]) # Just stale enough to trigger the dead logic. then = self.mock_now(self.now, timeout) # The cron job didn't run yet, so it still has ALIVE bit. check([], [bot1_alive, bot2_alive]) self.assertEqual(1, bot_management.cron_update_bot_info()) # The cron job ran, so it's now correct. check([bot1_dead], [bot2_alive]) # the last event should be bot_missing events = list(bot_management.get_events_query('id1', order=True)) event = events[0] bq_event = swarming_pb2.BotEvent() event.to_proto(bq_event) self.assertEqual(event.event_type, 'bot_missing') self.assertEqual(event.last_seen_ts, bot1_dead['last_seen_ts']) self.assertEqual(bq_event.event, swarming_pb2.BOT_MISSING) self.assertEqual(bq_event.bot.status, swarming_pb2.MISSING) last_seen_ts = timestamp_pb2.Timestamp() last_seen_ts.FromDatetime(bot1_dead['last_seen_ts']) self.assertEqual(bq_event.bot.info.last_seen_ts, last_seen_ts)