Exemplo n.º 1
0
def config_for_task(request):
  """Looks up the ExternalSchedulerConfig that applies to a task request.

  The pool is taken from the first task slice only. The dimensions used for
  the lookup are the flattened dimensions (as produced by
  task_queues.dimensions_to_flat) that every task slice has in common.

  Arguments:
    request: a task_request.TaskRequest instance.

  Returns:
    The result of _config_for_dimensions() for the pool config and the common
    dimensions, or None when the first slice has no pool, the pool has no
    config, or the pool config lists no external schedulers.
  """
  first_slice = request.task_slice(0)
  pool_name = first_slice.properties.pool
  if not pool_name:
    return None

  cfg = pools_config.get_pool_config(pool_name)
  if not (cfg and cfg.external_schedulers):
    return None

  # Start from the first slice and keep only what later slices also request.
  shared = set(
      task_queues.dimensions_to_flat(first_slice.properties.dimensions))
  for index in range(1, request.num_task_slices):
    later_props = request.task_slice(index).properties
    shared &= set(task_queues.dimensions_to_flat(later_props.dimensions))

  return _config_for_dimensions(cfg, shared)
Exemplo n.º 2
0
    def get(self, request):
        """Returns information about a known bot.

        This includes its state and dimensions, and if it is currently
        running a task. When no BotInfo entity exists but at least one
        BotEvent does, a BotInfo is rebuilt from the first event returned by
        the events query and the bot is reported as deleted.

        Raises:
          endpoints.NotFoundException: neither a BotInfo nor any BotEvent
              exists for request.bot_id.
        """
        logging.debug('%s', request)
        bot_id = request.bot_id

        existing = bot_management.get_info_key(bot_id).get()
        if existing:
            return message_conversion.bot_info_to_rpc(existing, deleted=False)

        # No BotInfo: the bot may have been deleted while its BotEvent
        # children are still around. It is still useful to show what is known
        # about it, so fall back on the first event the query returns.
        found = bot_management.get_events_query(bot_id, True).fetch(1)
        if not found:
            raise endpoints.NotFoundException('%s not found.' % bot_id)

        last_event = found[0]
        rebuilt = bot_management.BotInfo(
            key=bot_management.get_info_key(bot_id),
            dimensions_flat=task_queues.dimensions_to_flat(
                last_event.dimensions),
            state=last_event.state,
            external_ip=last_event.external_ip,
            authenticated_as=last_event.authenticated_as,
            version=last_event.version,
            quarantined=last_event.quarantined,
            maintenance_msg=last_event.maintenance_msg,
            task_id=last_event.task_id,
            last_seen_ts=last_event.ts)
        return message_conversion.bot_info_to_rpc(rebuilt, deleted=True)
Exemplo n.º 3
0
def has_capacity(dimensions):
    """Tells whether a bot is likely to be able to service these dimensions.

    The cached answer from task_queues.probably_has_capacity() is used when
    there is one. Otherwise BotInfo entities are queried for one matching
    every flattened dimension; a hit is recorded through
    task_queues.set_has_capacity().

    Must not be called from within an ndb transaction.

    Returns:
      The cached value when available, True when a matching BotInfo exists,
      and _FAKE_CAPACITY when nothing was found.
    """
    assert not ndb.in_transaction()

    # Fast path: a cached answer, positive or negative, is final.
    cached = task_queues.probably_has_capacity(dimensions)
    if cached is not None:
        return cached

    # Slow path: a datastore query, which is only eventually consistent.
    query = BotInfo.query()
    flat = task_queues.dimensions_to_flat(dimensions)
    for flat_dimension in flat:
        query = query.filter(BotInfo.dimensions_flat == flat_dimension)

    if not query.count(limit=1):
        logging.error('HAS NO CAPACITY: %s', flat)
        # TODO(maruel): https://crbug.com/839173
        return _FAKE_CAPACITY

    logging.info('Found capacity via BotInfo: %s', flat)
    # Remember the positive answer so the next lookup takes the fast path.
    task_queues.set_has_capacity(dimensions)
    return True
Exemplo n.º 4
0
def has_capacity(dimensions):
    """Returns True if there's a reasonable chance for this task request
    dimensions set to be serviced by a bot alive.

    Lookup order:
      1. task_queues.probably_has_capacity(); any non-None answer is returned
         as is.
      2. A BotInfo entity matching every flattened dimension.
      3. A BotEvent entity matching every flattened dimension whose ts is
         more recent than bot_death_timeout_secs ago, i.e. a bot that existed
         recently.

    A hit in step 2 or 3 is recorded with task_queues.set_has_capacity() for
    bot_death_timeout_secs seconds.

    Must not be called from within an ndb transaction.

    Returns:
      The cached value from step 1 when available, True on a datastore hit,
      False otherwise.
    """
    assert not ndb.in_transaction()
    # Look at the fast path.
    cap = task_queues.probably_has_capacity(dimensions)
    if cap is not None:
        return cap

    # Flatten once; both queries below and all log lines use the same value.
    flat = task_queues.dimensions_to_flat(dimensions)

    # Do a query. That's slower and it's eventually consistent.
    q = BotInfo.query()
    for f in flat:
        q = q.filter(BotInfo.dimensions_flat == f)

    # Add it to the 'quick cache' to improve performance. This cache is kept
    # for the same duration as how long bots are considered still alive
    # without a ping. Useful if there's a single bot in the fleet for these
    # dimensions and it takes a long time to reboot. This is the case with
    # Android with slow initialization and some baremetal bots (thanks SCSI
    # firmware!).
    seconds = config.settings().bot_death_timeout_secs

    if q.count(limit=1):
        logging.info('Found capacity via BotInfo: %s', flat)
        task_queues.set_has_capacity(dimensions, seconds)
        return True

    # Search a bit harder. In this case, we're looking for BotEvent which
    # would be a bot that used to exist recently.
    cutoff = utils.utcnow() - datetime.timedelta(seconds=seconds)
    q = BotEvent.query(BotEvent.ts > cutoff)
    for f in flat:
        q = q.filter(BotEvent.dimensions_flat == f)
    if q.count(limit=1):
        logging.info('Found capacity via BotEvent: %s', flat)
        task_queues.set_has_capacity(dimensions, seconds)
        return True

    logging.warning('HAS NO CAPACITY: %s', flat)
    return False
Exemplo n.º 5
0
def _gen_bot_info(key_id, last_seen_ts, **kwargs):
    """Builds a bot_management.BotInfo with defaults, for use as a fixture.

    Defaults are a BotRoot/BotInfo key derived from key_id, the given
    last_seen_ts, Linux/Ubuntu 'os' dimensions plus a 'bot_id' dimension, and
    an empty state. Any keyword argument overrides the matching default. The
    'dimensions' entry is never passed through as is: it is flattened with
    task_queues.dimensions_to_flat() and stored as 'dimensions_flat'.
    """
    fields = dict(
        key=ndb.Key('BotRoot', key_id, 'BotInfo', 'info'),
        last_seen_ts=last_seen_ts,
        dimensions={'os': ['Linux', 'Ubuntu'], 'bot_id': [key_id]},
        state={},
    )
    fields.update(kwargs)
    dimensions = fields.pop('dimensions')
    fields['dimensions_flat'] = task_queues.dimensions_to_flat(dimensions)
    return bot_management.BotInfo(**fields)
Exemplo n.º 6
0
def config_for_bot(bot_dimensions):
  """Looks up the ExternalSchedulerConfig that applies to a bot.

  Arguments:
  - bot_dimensions: The dimensions of the bot as a dictionary in
          {string key: list of string values} format.

  Returns:
    The result of _config_for_dimensions() for the bot's pool config and the
    set of the bot's flattened dimensions: the
    pools_config.ExternalSchedulerConfig to use for this bot if it exists, or
    None otherwise.
  """
  return _config_for_dimensions(
      _bot_pool_cfg(bot_dimensions),
      set(task_queues.dimensions_to_flat(bot_dimensions)))
Exemplo n.º 7
0
 def test_dimensions_to_flat_long_unicode(self):
   """Non-ASCII values above 256 characters are cut and get an ellipsis.

   A 256 character value must come back untouched, while a 257 character
   value must come back as its first 256 characters followed by u'…'.
   """
   key = u'a' * 64
   actual = task_queues.dimensions_to_flat(
       {
         key: [
           # Ok.
           u'⌛' * 256,
           # Too long. Must be the same character as in the expectation
           # below.
           u'⛔' * 257,
         ],
       })
   expected = [
       key + u':' + u'⌛' * 256,
       key + u':' + u'⛔' * 256 + u'…',
   ]
   self.assertEqual(expected, actual)
Exemplo n.º 8
0
 def test_dimensions_to_flat_long_ascii(self):
   """ASCII values above 256 characters are cut and get an ellipsis."""
   name = u'a' * 64
   too_long = u'b' * 257
   fits = u'c' * 256
   flat = task_queues.dimensions_to_flat({name: [too_long, fits]})
   self.assertEqual(
       [
         # The 257 character value loses its last character and gains u'…'.
         name + u':' + u'b' * 256 + u'…',
         # The 256 character value is untouched.
         name + u':' + fits,
       ],
       flat)
Exemplo n.º 9
0
 def test_dimensions_to_flat_long_unicode_non_BMP(self):
   """Values made of non-BMP characters are cut at the build's length limit.

   For non-BMP characters, the length is effectively halved for now.
   """
   name = u'a' * 64
   # Python considers emoji in the supplemental plane to have length 2 on
   # UCS2 builds, and length 1 on UCS4 builds.
   limit = 128 if sys.maxunicode < 65536 else 256
   boom = u'💥'
   grimace = u'😬'
   flat = task_queues.dimensions_to_flat(
       {
         name: [
           # One character too many.
           boom * (limit+1),
           # Exactly at the limit.
           grimace * limit,
         ],
       })
   self.assertEqual(
       [
         name + u':' + boom * limit + u'…',
         name + u':' + grimace * limit,
       ],
       flat)
Exemplo n.º 10
0
def assign_task(es_cfg, bot_dimensions):
  """Asks the external scheduler for a task for one idle bot.

  The bot id is read from the first value of the u'id' dimension. The request
  carries the bot id, its flattened dimensions, the scheduler id and the
  current time.

  Arguments:
    es_cfg: pools_config.ExternalSchedulerConfig instance.
    bot_dimensions: dimensions {string key: list of string values}

  Returns:
    (Task id string, slice number) tuple or (None, None) if no task
    to assign.
  """
  bot_id = bot_dimensions[u'id'][0]
  logging.debug('Using external scheduler address: %s id: %s for bot %s',
                es_cfg.address, es_cfg.id, bot_id)

  request_pb = plugin_pb2.AssignTasksRequest()
  bot_pb = request_pb.idle_bots.add()
  bot_pb.bot_id = bot_id
  bot_pb.dimensions.extend(task_queues.dimensions_to_flat(bot_dimensions))
  request_pb.scheduler_id = es_cfg.id
  request_pb.time.GetCurrentTime()

  # TODO(akeshet): Catch or handle errors appropriately.
  response = _get_client(es_cfg.address).AssignTasks(
      request_pb, credentials=_creds())

  if not (response and response.assignments):
    return None, None

  # Exactly one bot was offered, so exactly one assignment for that bot is
  # expected back.
  assert len(response.assignments) == 1
  assignment = response.assignments[0]
  assert assignment.bot_id == bot_id

  return assignment.task_id, assignment.slice_number
Exemplo n.º 11
0
def bot_event(event_type, bot_id, external_ip, authenticated_as, dimensions,
              state, version, quarantined, maintenance_msg, task_id, task_name,
              **kwargs):
    """Records when a bot has queried for work.

    The sheer fact this event is happening means the bot is alive (not dead),
    so this is good. It may be quarantined though, and in this case, it will
    be evicted from the task queues.

    If it's declaring maintenance, it will not be evicted from the task
    queues, as maintenance is supposed to be temporary and expected to
    complete within a reasonable time frame.

    The BotInfo entity for the bot is always updated (created if missing).
    For 'request_sleep' and 'task_update' events only BotInfo is stored; for
    every other event type a BotEvent is stored together with BotInfo. Once
    BotInfo has been loaded, the event type and the current second are also
    appended to a per-bot, per-minute memcache entry in the 'BotEvents'
    namespace, whether or not the datastore write succeeded.

    Arguments:
    - event_type: event type, one of BotEvent.ALLOWED_EVENTS.
    - bot_id: bot id. When falsy, nothing is done.
    - external_ip: IP address as seen by the HTTP handler.
    - authenticated_as: bot identity as seen by the HTTP handler.
    - dimensions: Bot's dimensions as self-reported. If not provided, keep
          previous value.
    - state: ephemeral state of the bot. It is expected to change constantly.
          If not provided, keep previous value.
    - version: swarming_bot.zip version as self-reported. Used to spot if a
          bot failed to update promptly. If not provided, keep previous value.
    - quarantined: bool to determine if the bot was declared quarantined. If
          None, keep previous value.
    - maintenance_msg: string describing why the bot is in maintenance.
    - task_id: packed task id if relevant. Set to '' to zap the stored value.
          If None, keep previous value.
    - task_name: task name if relevant. Only stored when truthy.
    - kwargs: optional values to add to BotEvent relevant to event_type.

    Returns:
      ndb.Key to BotEvent entity if one was added, None otherwise.
    """
    if not bot_id:
        return

    # Retrieve the previous BotInfo and update it.
    info_key = get_info_key(bot_id)
    bot_info = info_key.get()
    if not bot_info:
        bot_info = BotInfo(key=info_key)
    now = utils.utcnow()
    # These four are overwritten unconditionally on every event.
    bot_info.last_seen_ts = now
    bot_info.external_ip = external_ip
    bot_info.authenticated_as = authenticated_as
    bot_info.maintenance_msg = maintenance_msg
    # The remaining fields keep their previous value when the caller did not
    # provide one.
    if dimensions:
        bot_info.dimensions_flat = task_queues.dimensions_to_flat(dimensions)
    if state:
        bot_info.state = state
    if quarantined is not None:
        bot_info.quarantined = quarantined
    if task_id is not None:
        bot_info.task_id = task_id
    if task_name:
        bot_info.task_name = task_name
    if version is not None:
        bot_info.version = version

    if quarantined:
        # Make sure it is not in the queue since it can't reap anything.
        task_queues.cleanup_after_bot(info_key.parent())

    try:
        if event_type in ('request_sleep', 'task_update'):
            # Handle this specifically. It's not much of an event worth saving
            # a BotEvent for but it's worth updating BotInfo. The only reason
            # BotInfo is GET is to keep first_seen_ts. It's not necessary to
            # use a transaction here since no BotEvent is being added, only
            # last_seen_ts is really updated.
            bot_info.put()
            return

        # The event snapshots the values now held by bot_info, i.e. including
        # values carried over from a previous event.
        event = BotEvent(parent=get_root_key(bot_id),
                         event_type=event_type,
                         external_ip=external_ip,
                         authenticated_as=authenticated_as,
                         dimensions_flat=bot_info.dimensions_flat,
                         quarantined=bot_info.quarantined,
                         maintenance_msg=bot_info.maintenance_msg,
                         state=bot_info.state,
                         task_id=bot_info.task_id,
                         version=bot_info.version,
                         **kwargs)

        if event_type in ('task_completed', 'task_error', 'task_killed'):
            # Special case to keep the task_id in the event but not in the
            # summary. This must stay after the BotEvent was created above.
            bot_info.task_id = ''

        datastore_utils.store_new_version(event, BotRoot, [bot_info])
        return event.key
    finally:
        # Store the event in memcache to accelerate monitoring.
        # key is at minute resolution, because that's the monitoring precision.
        # This runs for both return paths above and also when the datastore
        # write raised.
        key = '%s:%s' % (bot_id, now.strftime('%Y-%m-%dT%H:%M'))
        m = memcache.Client()
        # Append [event_type, second] to the list stored under key: create the
        # entry if it is missing, otherwise compare-and-swap the extended
        # list.
        # NOTE(review): there is no retry limit; if add() keeps failing and
        # get() keeps returning None this loop never exits -- confirm this
        # cannot happen when memcache is unavailable.
        while True:
            data = [event_type, now.second]
            if m.add(key, data, time=3600, namespace='BotEvents'):
                break
            prev_val = m.get(key, for_cas=True, namespace='BotEvents')
            if prev_val is None:
                # The entry vanished between add() and get(); try again.
                continue
            data = prev_val + [event_type, now.second]
            # Keep the data for one hour. If the cron job cannot reap it within 1h,
            # it's probably broken.
            if m.cas(key, data, time=3600, namespace='BotEvents'):
                break
Exemplo n.º 12
0
def bot_event(event_type, bot_id, external_ip, authenticated_as, dimensions,
              state, version, quarantined, maintenance_msg, task_id, task_name,
              **kwargs):
    """Records when a bot has queried for work.

    An event means the bot is alive. A quarantined bot is additionally removed
    from the task queues; a bot declaring maintenance is not, since
    maintenance is expected to be temporary.

    The BotInfo entity for the bot is always updated (created if missing).
    For 'request_sleep' and 'task_update' events only BotInfo is stored; for
    every other event type a BotEvent is stored together with BotInfo.

    Arguments:
    - event_type: event type, one of BotEvent.ALLOWED_EVENTS.
    - bot_id: bot id. When falsy, nothing is done.
    - external_ip: IP address as seen by the HTTP handler.
    - authenticated_as: bot identity as seen by the HTTP handler.
    - dimensions: Bot's dimensions as self-reported. If not provided, keep
          previous value.
    - state: ephemeral state of the bot. It is expected to change constantly.
          If not provided, keep previous value.
    - version: swarming_bot.zip version as self-reported. Used to spot if a
          bot failed to update promptly. If not provided, keep previous value.
    - quarantined: bool to determine if the bot was declared quarantined. If
          None, keep previous value.
    - maintenance_msg: string describing why the bot is in maintenance.
    - task_id: packed task id if relevant. Set to '' to zap the stored value.
          If None, keep previous value.
    - task_name: task name if relevant. Only stored when truthy.
    - kwargs: optional values to add to BotEvent relevant to event_type. The
          following ones are also copied to BotInfo when not None:
      - lease_id: ID assigned by Machine Provider for this bot.
      - lease_expiration_ts: UTC seconds from epoch when Machine Provider
            lease expires.
      - machine_type: ID of the lease_management.MachineType this Machine
            Provider bot was leased for.
      - machine_lease: ID of the lease_management entity corresponding to
            this bot.
    """
    if not bot_id:
        return

    # Load the stored BotInfo, or start a fresh one, then bring it up to date.
    info_key = get_info_key(bot_id)
    bot_info = info_key.get() or BotInfo(key=info_key)

    # Always overwritten.
    bot_info.last_seen_ts = utils.utcnow()
    bot_info.external_ip = external_ip
    bot_info.authenticated_as = authenticated_as
    bot_info.maintenance_msg = maintenance_msg

    # Overwritten only when the caller supplied a value.
    if dimensions:
        bot_info.dimensions_flat = task_queues.dimensions_to_flat(dimensions)
    if state:
        bot_info.state = state
    if quarantined is not None:
        bot_info.quarantined = quarantined
    if task_id is not None:
        bot_info.task_id = task_id
    if task_name:
        bot_info.task_name = task_name
    if version is not None:
        bot_info.version = version
    for lease_field in ('lease_id', 'lease_expiration_ts', 'machine_type',
                        'machine_lease'):
        lease_value = kwargs.get(lease_field)
        if lease_value is not None:
            setattr(bot_info, lease_field, lease_value)

    if quarantined:
        # A quarantined bot cannot reap anything; take it out of the queues.
        task_queues.cleanup_after_bot(info_key.parent())

    if event_type in ('request_sleep', 'task_update'):
        # Not worth a BotEvent, but BotInfo still has to be refreshed. BotInfo
        # was only read to keep first_seen_ts. No transaction is needed since
        # no BotEvent is added and only last_seen_ts really changes.
        bot_info.put()
        return

    new_event = BotEvent(parent=get_root_key(bot_id),
                         event_type=event_type,
                         external_ip=external_ip,
                         authenticated_as=authenticated_as,
                         dimensions_flat=bot_info.dimensions_flat,
                         quarantined=bot_info.quarantined,
                         maintenance_msg=bot_info.maintenance_msg,
                         state=bot_info.state,
                         task_id=bot_info.task_id,
                         version=bot_info.version,
                         **kwargs)

    if event_type in ('task_completed', 'task_error', 'task_killed'):
        # The event keeps the task_id; the summary entity drops it.
        bot_info.task_id = ''

    datastore_utils.store_new_version(new_event, BotRoot, [bot_info])
Exemplo n.º 13
0
def notify_requests(es_cfg, requests, use_tq, is_callback, batch_mode=False):
  """Notifies the external scheduler of the state of one or more tasks.

  One notification is built per (request, result) pair, carrying the task id,
  tags, creation time, the flattened dimensions of every task slice, the task
  state and, when the result has one, the bot id.

  Arguments:
    - es_cfg: pools_config.ExternalSchedulerConfig for external scheduler to
        notify.
    - requests:
      A list of (task_request.TaskRequest,
                 task_result.TaskResultSummary or task_result.TaskRunResult)
      tuples.
    - use_tq: If true, make this call on a task queue (within the current
              datastore transaction). If false, the call is made right away.
    - is_callback: If true, indicates that this notification was in response
                   to a external-scheduler-requested callback. The value is
                   copied into the request sent to the scheduler.
    - batch_mode: If true, the notifications will be sent in a batched mode
                  along with others, to reduce traffic to external scheduler.
                  Only valid when use_tq and global config's
                  enable_batch_es_notifications are true.

  Returns: Nothing.

  Raises:
    datastore_utils.CommitError: the notification task could not be enqueued.
  """
  logging.debug(
      'notify_requests(es_cfg=(%s,%s), requests=%s, use_tq=%s, '
      'is_callback=%s, batch_mode=%s)',
      es_cfg.address, es_cfg.id, [task.task_id for task, _ in requests],
      use_tq, is_callback, batch_mode)

  notify_pb = plugin_pb2.NotifyTasksRequest()
  notify_pb.is_callback = is_callback

  for task_request, result in requests:
    entry = notify_pb.notifications.add()
    # TODO(akeshet): This time should possibly come from the read time from
    # datastore, rather than the local server clock.
    entry.time.FromDatetime(utils.utcnow())
    entry.task.id = task_request.task_id
    entry.task.tags.extend(task_request.tags)
    entry.task.enqueued_time.FromDatetime(task_request.created_ts)
    for index in range(task_request.num_task_slices):
      slice_props = task_request.task_slice(index).properties
      flat = task_queues.dimensions_to_flat(slice_props.dimensions)
      entry.task.slices.add().dimensions.extend(flat)

    # The state is obtained by converting the result to its proto form.
    result_pb = swarming_pb2.TaskResult()
    result.to_proto(result_pb)
    entry.task.state = result_pb.state
    if result.bot_id:
      # TODO(akeshet): We should only actually set this is state is running.
      entry.task.bot_id = result.bot_id

  notify_pb.scheduler_id = es_cfg.id

  if not use_tq:
    # Ignore return value, the response proto is empty.
    notify_request_now(es_cfg.address, notify_pb)
    return

  request_json = json_format.MessageToJson(notify_pb)

  if batch_mode and config.settings().enable_batch_es_notifications:
    # Batched mode: park the notification on a pull queue so it is sent along
    # with others, reducing traffic to the external scheduler.
    body = json.dumps({'es_host': es_cfg.address, 'request_json': request_json})
    pull_task = taskqueue.Task(payload=body, method='PULL')
    added = pull_task.add(
        queue_name='es-notify-tasks-batch',
        transactional=ndb.in_transaction())
    if not added:
      raise datastore_utils.CommitError('Failed to enqueue task')
    kick_stats = taskqueue.QueueStatistics.fetch('es-notify-kick')
    # Add a kicker task if there are fewer than 10 minutes worth.
    if kick_stats.tasks < 600:
      kicked = utils.enqueue_task(
          '/internal/taskqueue/important/external_scheduler/notify-kick',
          'es-notify-kick',
          transactional=ndb.in_transaction())
      if not kicked:
        logging.info('Failed to add a notify-kick for request.')
    return

  # Unbatched mode: one push task per call.
  pushed = utils.enqueue_task(
      '/internal/taskqueue/important/external_scheduler/notify-tasks',
      'es-notify-tasks',
      params={'es_host': es_cfg.address, 'request_json': request_json},
      transactional=ndb.in_transaction())
  if not pushed:
    raise datastore_utils.CommitError('Failed to enqueue task')
Exemplo n.º 14
0
 def test_dimensions_to_flat_duplicate_value(self):
   """A value listed twice for a key is expected only once in the output."""
   duplicated = {u'a': [u'c', u'c']}
   self.assertEqual([u'a:c'], task_queues.dimensions_to_flat(duplicated))
Exemplo n.º 15
0
 def test_dimensions_to_flat(self):
   """Dimensions are expected back as a sorted list of u'key:value'."""
   dimensions = {u'a': [u'c', u'bee'], u'cee': [u'zee']}
   expected = [u'a:bee', u'a:c', u'cee:zee']
   self.assertEqual(expected, task_queues.dimensions_to_flat(dimensions))
Exemplo n.º 16
0
 def test_dimensions_to_flat(self):
   """Dimensions are expected back as a sorted list of 'key:value'."""
   expected = ['a:bee', 'a:c', 'cee:zee']
   actual = task_queues.dimensions_to_flat({'a': ['c', 'bee'], 'cee': ['zee']})
   self.assertEqual(expected, actual)