예제 #1
0
    def delete(self, request):
        """Deletes the bot corresponding to a provided bot_id.

        At that point, the bot will not appear in the list of bots, but it is
        still possible to get information about the bot if its bot id is known,
        as historical data is not deleted.

        It is meant to remove from the DB the presence of a bot that was
        retired, e.g. the VM was shut down already. Use 'terminate' instead if
        the bot is still alive.

        Args:
          request: request message; only its `bot_id` field is read here.

        Returns:
          swarming_rpcs.DeletedResponse with deleted=True.

        Raises:
          Whatever get_or_raise() raises when there is no BotInfo for the bot
          id (a 404, per the comment at the call site).
        """
        logging.debug('%s', request)
        bot_info_key = bot_management.get_info_key(request.bot_id)
        bot_info = get_or_raise(
            bot_info_key)  # raises 404 if there is no such bot
        cleaned = False
        if bot_info.machine_lease:
            ml = lease_management.MachineLease.get_by_id(
                bot_info.machine_lease)
            # get_by_id() returns None when the referenced lease entity no
            # longer exists. Do not hand None to release(); fall through to the
            # regular task queue cleanup below so the bot can still be deleted.
            if ml is not None and lease_management.release(ml):
                lease_management.cleanup_bot(ml)
                cleaned = True
        if not cleaned:
            # BotRoot is parent to BotInfo. It is important to note that the
            # bot is not there anymore, so it is not a member of any task
            # queue.
            task_queues.cleanup_after_bot(bot_info_key.parent())
        bot_info_key.delete()
        return swarming_rpcs.DeletedResponse(deleted=True)
예제 #2
0
def cleanup_bot(machine_lease):
    """Removes the entities left behind by a bot whose machine is going away.

    Args:
      machine_lease: lease entity; its `hostname` is passed to the
          bot_management key helpers, and its `key` and `client_request_id`
          identify the lease request to clear.
    """
    hostname = machine_lease.hostname
    # Unregister the bot from the task queues before dropping its BotInfo.
    task_queues.cleanup_after_bot(bot_management.get_root_key(hostname))
    info_key = bot_management.get_info_key(hostname)
    info_key.delete()
    clear_lease_request(machine_lease.key, machine_lease.client_request_id)
예제 #3
0
 def test_cleanup_after_bot(self):
     """Checks which entity kinds remain after cleanup_after_bot('bot1')."""
     self.assertEqual(0, _assert_bot())
     self._assert_task()
     task_queues.cleanup_after_bot('bot1')
     # Expected number of entities per kind once the cleanup ran. BotInfo is
     # deleted separately, so it is expected to still be there.
     expected_counts = (
         (1, bot_management.BotInfo),
         (0, task_queues.BotDimensions),
         (0, task_queues.BotTaskDimensions),
         (1, task_queues.TaskDimensions),
     )
     for expected, model in expected_counts:
         self.assert_count(expected, model)
예제 #4
0
def cron_update_bot_info():
    """Refreshes BotInfo.composite for dead bots.

    Queries every BotInfo whose last_seen_ts is at or before the death cutoff
    (now minus config.settings().bot_death_timeout_secs). Each one whose
    composite still contains ALIVE, or does not contain DEAD, is unregistered
    from the task queues and re-saved in a transaction so that its composite
    is recomputed. At most 5 transactions are kept in flight.

    Returns:
      Number of bots that were re-saved as dead.
    """
    dt = datetime.timedelta(seconds=config.settings().bot_death_timeout_secs)
    cutoff = utils.utcnow() - dt

    @ndb.tasklet
    def run(bot_key):
        """Re-saves the bot if it is still stale; yields 1 if saved, else 0."""
        bot = yield bot_key.get_async()
        # Re-check inside the transaction: the bot may have polled since the
        # query result was read.
        if (bot and bot.last_seen_ts <= cutoff
                and (BotInfo.ALIVE in bot.composite
                     or BotInfo.DEAD not in bot.composite)):
            # Updating it recomputes composite.
            # TODO(maruel): BotEvent.
            yield bot.put_async()
            logging.info('DEAD: %s', bot.id)
            raise ndb.Return(1)
        raise ndb.Return(0)

    # The assumption here is that a cron job can churn through all the entities
    # fast enough. The number of dead bot is expected to be <10k. In practice the
    # average runtime is around 8 seconds.
    dead = 0
    seen = 0
    failed = 0
    try:
        futures = []
        for b in BotInfo.query(BotInfo.last_seen_ts <= cutoff):
            seen += 1
            if BotInfo.ALIVE in b.composite or BotInfo.DEAD not in b.composite:
                k = b.key
                # Unregister the bot from task queues since it can't reap anything.
                task_queues.cleanup_after_bot(k.parent())
                # Retry more often than the default 1. We do not want to throw too much
                # in the logs and there should be plenty of time to do the retries.
                #
                # The key is bound as a default argument on purpose: the
                # callback may be invoked (or re-invoked on retry) after this
                # loop has rebound `k`, and a plain closure would then operate
                # on the wrong bot.
                f = datastore_utils.transaction_async(lambda k=k: run(k),
                                                      retries=5)
                futures.append(f)
                if len(futures) >= 5:
                    # Throttle: wait for at least one transaction, then harvest
                    # every completed one. Iterate backwards so pop(i) does not
                    # shift the indexes still to be visited.
                    ndb.Future.wait_any(futures)
                    for i in xrange(len(futures) - 1, -1, -1):
                        if futures[i].done():
                            try:
                                dead += futures.pop(i).get_result()
                            except datastore_utils.CommitError:
                                logging.warning('Failed to commit a Tx')
                                failed += 1
        # Drain whatever is still in flight.
        for f in futures:
            try:
                dead += f.get_result()
            except datastore_utils.CommitError:
                logging.warning('Failed to commit a Tx')
                failed += 1
    finally:
        logging.debug('Seen %d bots, updated %d bots, failed %d tx', seen,
                      dead, failed)
    return dead
예제 #5
0
def bot_event(event_type, bot_id, external_ip, authenticated_as, dimensions,
              state, version, quarantined, maintenance_msg, task_id, task_name,
              **kwargs):
    """Records when a bot has queried for work.

    The sheer fact this event is happening means the bot is alive (not dead),
    so this is good. It may be quarantined though, and in this case, it will be
    evicted from the task queues.

    If it's declaring maintenance, it will not be evicted from the task queues,
    as maintenance is supposed to be temporary and expected to complete within
    a reasonable time frame.

    Whatever the outcome (including an exception while saving), the event type
    and the current second are appended to a per-bot, per-minute memcache entry
    in the 'BotEvents' namespace.

    Arguments:
    - event_type: event type, one of BotEvent.ALLOWED_EVENTS.
    - bot_id: bot id. When falsy, nothing is done at all.
    - external_ip: IP address as seen by the HTTP handler.
    - authenticated_as: bot identity as seen by the HTTP handler.
    - dimensions: Bot's dimensions as self-reported. If not provided, keep
          previous value.
    - state: ephemeral state of the bot. It is expected to change constantly.
          If not provided, keep previous value.
    - version: swarming_bot.zip version as self-reported. Used to spot if a bot
          failed to update promptly. If None, keep previous value.
    - quarantined: bool to determine if the bot was declared quarantined. If
          None, keep previous value.
    - maintenance_msg: string describing why the bot is in maintenance. Always
          overwrites the stored value.
    - task_id: packed task id if relevant. Set to '' to zap the stored value;
          None keeps the previous value.
    - task_name: task name if relevant. Only stored when truthy.
          NOTE(review): nothing in this function clears a previously stored
          task_name, even when task_id is zapped.
    - kwargs: optional values to add to BotEvent relevant to event_type.

    Returns:
      ndb.Key to BotEvent entity if one was added, None otherwise (empty bot_id,
      or a 'request_sleep' / 'task_update' event).
    """
    if not bot_id:
        return

    # Retrieve the previous BotInfo and update it.
    info_key = get_info_key(bot_id)
    bot_info = info_key.get()
    if not bot_info:
        bot_info = BotInfo(key=info_key)
    now = utils.utcnow()
    bot_info.last_seen_ts = now
    bot_info.external_ip = external_ip
    bot_info.authenticated_as = authenticated_as
    bot_info.maintenance_msg = maintenance_msg
    if dimensions:
        bot_info.dimensions_flat = task_queues.dimensions_to_flat(dimensions)
    if state:
        bot_info.state = state
    if quarantined is not None:
        bot_info.quarantined = quarantined
    if task_id is not None:
        bot_info.task_id = task_id
    if task_name:
        bot_info.task_name = task_name
    if version is not None:
        bot_info.version = version

    if quarantined:
        # Make sure it is not in the queue since it can't reap anything.
        task_queues.cleanup_after_bot(info_key.parent())

    try:
        if event_type in ('request_sleep', 'task_update'):
            # Handle this specifically. It's not much of an event worth saving a
            # BotEvent for but it's worth updating BotInfo. The only reason BotInfo is
            # GET is to keep first_seen_ts. It's not necessary to use a transaction
            # here since no BotEvent is being added, only last_seen_ts is really
            # updated.
            bot_info.put()
            return

        # The event is built from the merged BotInfo values, so fields that the
        # caller left out carry over the previously stored value.
        event = BotEvent(parent=get_root_key(bot_id),
                         event_type=event_type,
                         external_ip=external_ip,
                         authenticated_as=authenticated_as,
                         dimensions_flat=bot_info.dimensions_flat,
                         quarantined=bot_info.quarantined,
                         maintenance_msg=bot_info.maintenance_msg,
                         state=bot_info.state,
                         task_id=bot_info.task_id,
                         version=bot_info.version,
                         **kwargs)

        if event_type in ('task_completed', 'task_error', 'task_killed'):
            # Special case to keep the task_id in the event but not in the summary.
            bot_info.task_id = ''

        datastore_utils.store_new_version(event, BotRoot, [bot_info])
        return event.key
    finally:
        # Store the event in memcache to accelerate monitoring.
        # key is at minute resolution, because that's the monitoring precision.
        key = '%s:%s' % (bot_id, now.strftime('%Y-%m-%dT%H:%M'))
        m = memcache.Client()
        # Try add() first; when it does not succeed, append to the existing
        # value with a compare-and-set and start over if the CAS loses.
        # NOTE(review): the loop has no retry bound; if add() keeps failing
        # while get() keeps returning None it never exits. Confirm this cannot
        # happen with the memcache backend in use.
        while True:
            data = [event_type, now.second]
            if m.add(key, data, time=3600, namespace='BotEvents'):
                break
            prev_val = m.get(key, for_cas=True, namespace='BotEvents')
            if prev_val is None:
                continue
            data = prev_val + [event_type, now.second]
            # Keep the data for one hour. If the cron job cannot reap it within 1h,
            # it's probably broken.
            if m.cas(key, data, time=3600, namespace='BotEvents'):
                break
예제 #6
0
def cleanup_bot(machine_lease):
    """Removes the entities left behind by a bot whose machine is going away.

    Args:
      machine_lease: lease entity; its `hostname` identifies the bot, and its
          `key` and `client_request_id` identify the lease request to clear.
    """
    hostname = machine_lease.hostname
    # Take the bot out of the task queues before dropping its BotInfo.
    task_queues.cleanup_after_bot(hostname)
    bot_info_key = bot_management.get_info_key(hostname)
    bot_info_key.delete()
    clear_lease_request(machine_lease.key, machine_lease.client_request_id)
예제 #7
0
def bot_event(event_type, bot_id, external_ip, authenticated_as, dimensions,
              state, version, quarantined, maintenance_msg, task_id, task_name,
              **kwargs):
    """Updates the BotInfo summary of a bot and, usually, logs a BotEvent.

    Being called at all means the bot is alive, so last_seen_ts is always
    refreshed. A quarantined bot is additionally evicted from the task queues.
    A bot declaring maintenance is not evicted, since maintenance is expected
    to be temporary.

    'request_sleep' and 'task_update' events only refresh BotInfo; every other
    event type also stores a BotEvent together with the updated BotInfo.

    Arguments:
    - event_type: event type, one of BotEvent.ALLOWED_EVENTS.
    - bot_id: bot id. When falsy, nothing is done.
    - external_ip: IP address as seen by the HTTP handler.
    - authenticated_as: bot identity as seen by the HTTP handler.
    - dimensions: bot's self-reported dimensions. Previous value is kept when
          falsy.
    - state: ephemeral bot state. Previous value is kept when falsy.
    - version: self-reported swarming_bot.zip version. Previous value is kept
          when None.
    - quarantined: whether the bot declared itself quarantined. Previous value
          is kept when None.
    - maintenance_msg: why the bot is in maintenance. Always overwrites the
          stored value.
    - task_id: packed task id if relevant. '' zaps the stored value, None keeps
          it.
    - task_name: task name if relevant. Only stored when truthy.
    - kwargs: extra values forwarded to BotEvent. The following ones are also
      copied onto BotInfo when not None:
      - lease_id: ID assigned by Machine Provider for this bot.
      - lease_expiration_ts: UTC seconds from epoch when the Machine Provider
            lease expires.
      - machine_type: ID of the lease_management.MachineType this Machine
            Provider bot was leased for.
      - machine_lease: ID of the lease_management.MachineLease corresponding
            to this bot.

    Returns:
      None.
    """
    if not bot_id:
        return

    # Load the stored summary, or start a new one for a bot never seen before.
    key = get_info_key(bot_id)
    info = key.get() or BotInfo(key=key)

    # Values that are unconditionally overwritten on every event.
    info.last_seen_ts = utils.utcnow()
    info.external_ip = external_ip
    info.authenticated_as = authenticated_as
    info.maintenance_msg = maintenance_msg

    # Values that keep what is stored when the caller did not provide them.
    if dimensions:
        info.dimensions_flat = task_queues.dimensions_to_flat(dimensions)
    if state:
        info.state = state
    if quarantined is not None:
        info.quarantined = quarantined
    if task_id is not None:
        info.task_id = task_id
    if task_name:
        info.task_name = task_name
    if version is not None:
        info.version = version

    # Machine Provider details travel in kwargs; mirror them on the summary.
    lease_fields = ('lease_id', 'lease_expiration_ts', 'machine_type',
                    'machine_lease')
    for field in lease_fields:
        value = kwargs.get(field)
        if value is not None:
            setattr(info, field, value)

    if quarantined:
        # A quarantined bot can't reap anything, keep it out of the queues.
        task_queues.cleanup_after_bot(key.parent())

    summary_only = event_type in ('request_sleep', 'task_update')
    if summary_only:
        # Not worth a BotEvent, but BotInfo still has to be refreshed. The
        # entity was fetched only to preserve first_seen_ts, and no transaction
        # is needed since nothing but BotInfo is written.
        info.put()
        return

    event = BotEvent(parent=get_root_key(bot_id),
                     event_type=event_type,
                     external_ip=external_ip,
                     authenticated_as=authenticated_as,
                     dimensions_flat=info.dimensions_flat,
                     quarantined=info.quarantined,
                     maintenance_msg=info.maintenance_msg,
                     state=info.state,
                     task_id=info.task_id,
                     version=info.version,
                     **kwargs)

    task_is_over = event_type in ('task_completed', 'task_error',
                                  'task_killed')
    if task_is_over:
        # The event (built above) keeps the task_id; the summary drops it.
        info.task_id = ''

    datastore_utils.store_new_version(event, BotRoot, [info])
예제 #8
0
def cron_update_bot_info():
    """Refreshes BotInfo.composite for dead bots.

    Walks BotInfo.yield_dead_bots(). Each bot that is still flagged alive, or
    not yet flagged dead, is unregistered from the task queues and re-saved in
    a transaction so its composite is updated. A 'bot_missing' event is then
    recorded for every bot that was actually re-saved. At most 5 transactions
    are kept in flight.

    Returns:
      Number of bots whose status was changed to dead.
    """
    @ndb.tasklet
    def run(bot_key):
        """Re-saves the bot if it should be dead; yields its key, or None."""
        bot = yield bot_key.get_async()
        # Re-check inside the transaction: the bot may have polled since the
        # query result was read.
        if bot and bot.should_be_dead and (bot.is_alive or not bot.is_dead):
            # bot composite get updated in _pre_put_hook
            yield bot.put_async()
            logging.info('Changing Bot status to DEAD: %s', bot.id)
            raise ndb.Return(bot_key)
        raise ndb.Return(None)

    def tx_result(future, stats):
        """Folds one finished transaction into stats; emits bot_missing."""
        try:
            bot_key = future.get_result()
            if bot_key:
                stats['dead'] += 1
                bot = bot_key.get()
                logging.info('Sending bot_missing event: %s', bot.id)
                bot_event(event_type='bot_missing',
                          bot_id=bot.id,
                          message=None,
                          external_ip=None,
                          authenticated_as=None,
                          dimensions=None,
                          state=None,
                          version=None,
                          quarantined=None,
                          maintenance_msg=None,
                          task_id=None,
                          task_name=None,
                          register_dimensions=False,
                          last_seen_ts=bot.last_seen_ts)
        except datastore_utils.CommitError:
            logging.warning('Failed to commit a Tx')
            stats['failed'] += 1

    # The assumption here is that a cron job can churn through all the entities
    # fast enough. The number of dead bot is expected to be <10k. In practice the
    # average runtime is around 8 seconds.
    cron_stats = {
        'dead': 0,
        'seen': 0,
        'failed': 0,
    }
    try:
        futures = []
        for b in BotInfo.yield_dead_bots():
            cron_stats['seen'] += 1
            if b.is_alive or not b.is_dead:
                k = b.key
                # Unregister the bot from task queues since it can't reap anything.
                task_queues.cleanup_after_bot(k.parent())
                # Retry more often than the default 1. We do not want to throw too much
                # in the logs and there should be plenty of time to do the retries.
                #
                # The key is bound as a default argument on purpose: the
                # callback may be invoked (or re-invoked on retry) after this
                # loop has rebound `k`, and a plain closure would then operate
                # on the wrong bot.
                f = datastore_utils.transaction_async(lambda k=k: run(k),
                                                      retries=5)
                futures.append(f)
                if len(futures) >= 5:
                    # Throttle: wait for at least one transaction, then harvest
                    # every completed one. Iterate backwards so pop(i) does not
                    # shift the indexes still to be visited.
                    ndb.Future.wait_any(futures)
                    for i in range(len(futures) - 1, -1, -1):
                        if futures[i].done():
                            f = futures.pop(i)
                            tx_result(f, cron_stats)
        # Drain whatever is still in flight.
        for f in futures:
            tx_result(f, cron_stats)
    finally:
        logging.debug('Seen %d bots, updated %d dead bots, failed %d tx',
                      cron_stats['seen'], cron_stats['dead'],
                      cron_stats['failed'])
    return cron_stats['dead']
예제 #9
0
def bot_event(event_type, bot_id, external_ip, authenticated_as, dimensions,
              state, version, quarantined, maintenance_msg, task_id, task_name,
              register_dimensions, **kwargs):
    """Records when a bot has queried for work.

    This event happening usually means the bot is alive (not dead), except for
    'bot_missing' event which is created by server. It may be quarantined, and
    in this case, it will be evicted from the task queues.

    If it's declaring maintenance, it will not be evicted from the task queues,
    as maintenance is supposed to be temporary and expected to complete within
    a reasonable time frame.

    Whatever the outcome (including an exception while saving), the event type
    and the current second are appended to a per-bot, per-minute memcache entry
    in the 'BotEvents' namespace.

    Arguments:
    - event_type: event type, one of BotEvent.ALLOWED_EVENTS.
    - bot_id: bot id. When falsy, nothing is done at all.
    - external_ip: IP address as seen by the HTTP handler. Not stored on
          BotInfo for 'bot_missing'.
    - authenticated_as: bot identity as seen by the HTTP handler. Not stored on
          BotInfo for 'bot_missing'.
    - dimensions: Bot's dimensions as self-reported. If not provided, keep
          previous value.
    - state: ephemeral state of the bot. It is expected to change constantly.
          If not provided, keep previous value.
    - version: swarming_bot.zip version as self-reported. Used to spot if a bot
          failed to update promptly. If None, keep previous value.
    - quarantined: bool to determine if the bot was declared quarantined. If
          None, keep previous value.
    - maintenance_msg: string describing why the bot is in maintenance. Not
          stored on BotInfo for 'bot_missing'.
    - task_id: packed task id if relevant. None keeps the stored value. The
          stored value is cleared for task_completed, task_error, task_killed,
          request_sleep and bot_missing events.
    - task_name: task name if relevant. Only stored when truthy; cleared for
          the same event types as task_id.
    - register_dimensions: bool to specify whether to register dimensions to
          BotInfo.
    - kwargs: optional values to add to BotEvent relevant to event_type.

    Returns:
      ndb.Key to BotEvent entity if one was added, None otherwise.
    """
    if not bot_id:
        return

    # Retrieve the previous BotInfo and update it.
    info_key = get_info_key(bot_id)
    bot_info = info_key.get()
    if not bot_info:
        bot_info = BotInfo(key=info_key)
        # Register only id and pool dimensions at the first handshake.
        # NOTE(review): dimensions is passed unchecked here; confirm that
        # bot_dimensions_to_flat() tolerates None / empty input.
        dimensions_flat = task_queues.bot_dimensions_to_flat(dimensions)
        bot_info.dimensions_flat = [
            d for d in dimensions_flat
            if d.startswith('id:') or d.startswith('pool:')
        ]

    now = utils.utcnow()
    # bot_missing event is created by a server, not a bot.
    # So it shouldn't update last_seen_ts, external_ip, authenticated_as,
    # maintenance_msg.
    # If the last_seen_ts gets updated, it would change the bot composite
    # to alive. And if it clears maintenance_msg, it would change the composite
    # to NOT_IN_MAINTENANCE and lose the message.
    if event_type != 'bot_missing':
        bot_info.last_seen_ts = now
        bot_info.external_ip = external_ip
        bot_info.authenticated_as = authenticated_as
        bot_info.maintenance_msg = maintenance_msg
    # dimensions_flat stays [] unless the caller provided dimensions; it is
    # reused below to choose the dimensions recorded on the BotEvent.
    dimensions_updated = False
    dimensions_flat = []
    if dimensions:
        dimensions_flat = task_queues.bot_dimensions_to_flat(dimensions)
        if register_dimensions and bot_info.dimensions_flat != dimensions_flat:
            logging.debug('bot_event: Updating dimensions. from: %s, to: %s',
                          bot_info.dimensions_flat, dimensions_flat)
            bot_info.dimensions_flat = dimensions_flat
            dimensions_updated = True
    if state:
        bot_info.state = state
    if quarantined is not None:
        bot_info.quarantined = quarantined
    if task_id is not None:
        bot_info.task_id = task_id
    # Remove the task from the BotInfo summary in the following cases
    # 1) When the task finishes (event_type=task_XXX)
    #    In these cases, the BotEvent shall have the task
    #    since the event still refers to it
    # 2) When the bot is polling (event_type=request_sleep)
    #    The bot has already finished the previous task.
    #    But it could have forgotten to remove the task from the BotInfo.
    #    So ensure the task is removed.
    # 3) When the bot is missing
    #    We assume it can't process assigned task anymore.
    if event_type in ('task_completed', 'task_error', 'task_killed',
                      'request_sleep', 'bot_missing'):
        bot_info.task_id = None
        bot_info.task_name = None
    # NOTE(review): a truthy task_name is applied after the clearing above, so
    # it survives even for the event types listed there. Confirm this is
    # intended.
    if task_name:
        bot_info.task_name = task_name
    if version is not None:
        bot_info.version = version

    if quarantined:
        # Make sure it is not in the queue since it can't reap anything.
        task_queues.cleanup_after_bot(info_key.parent())

    try:
        # Decide whether saving the event.
        # It's not much of an event worth saving a BotEvent for but it's worth
        # updating BotInfo. The only reason BotInfo is GET is to keep first_seen_ts.
        # It's not necessary to use a transaction here since no BotEvent is being
        # added, only last_seen_ts is really updated.
        # crbug.com/1015365: It's useful saving BotEvent when dimensions updates.
        # crbug.com/952984: It needs to save BotEvent when quarantined.
        skip_save_event = (not dimensions_updated and not quarantined
                           and event_type in ('request_sleep', 'task_update'))
        if skip_save_event:
            bot_info.put()
            return

        # When it's a 'bot_*' or 'request_*' event, use the dimensions provided
        # by the bot.
        # When it's a 'task_*' event, use BotInfo.dimensions_flat since dimensions
        # aren't provided by the bot.
        event_dimensions_flat = dimensions_flat or bot_info.dimensions_flat

        # The caller-provided task_id wins over the summary, so the event still
        # refers to the task even when the summary was just cleared above.
        event = BotEvent(parent=get_root_key(bot_id),
                         event_type=event_type,
                         external_ip=external_ip,
                         authenticated_as=authenticated_as,
                         dimensions_flat=event_dimensions_flat,
                         quarantined=bot_info.quarantined,
                         maintenance_msg=bot_info.maintenance_msg,
                         state=bot_info.state,
                         task_id=task_id or bot_info.task_id,
                         version=bot_info.version,
                         **kwargs)

        datastore_utils.store_new_version(event, BotRoot, [bot_info])
        return event.key
    finally:
        # Store the event in memcache to accelerate monitoring.
        # key is at minute resolution, because that's the monitoring precision.
        key = '%s:%s' % (bot_id, now.strftime('%Y-%m-%dT%H:%M'))
        m = memcache.Client()
        # Try add() first; when it does not succeed, append to the existing
        # value with a compare-and-set and start over if the CAS loses.
        # NOTE(review): the loop has no retry bound; if add() keeps failing
        # while get() keeps returning None it never exits. Confirm this cannot
        # happen with the memcache backend in use.
        while True:
            data = [event_type, now.second]
            if m.add(key, data, time=3600, namespace='BotEvents'):
                break
            prev_val = m.get(key, for_cas=True, namespace='BotEvents')
            if prev_val is None:
                continue
            data = prev_val + [event_type, now.second]
            # Keep the data for one hour. If the cron job cannot reap it within 1h,
            # it's probably broken.
            if m.cas(key, data, time=3600, namespace='BotEvents'):
                break