Example #1
def cron_update_bot_info():
    """Refreshes BotInfo.composite for dead bots."""
    dt = datetime.timedelta(seconds=config.settings().bot_death_timeout_secs)
    cutoff = utils.utcnow() - dt

    @ndb.tasklet
    def run(bot_key):
        bot = yield bot_key.get_async()
        if (bot and bot.last_seen_ts <= cutoff
                and (BotInfo.ALIVE in bot.composite
                     or BotInfo.DEAD not in bot.composite)):
            # Updating it recomputes composite.
            # TODO(maruel): BotEvent.
            yield bot.put_async()
            logging.info('DEAD: %s', bot.id)
            raise ndb.Return(1)
        raise ndb.Return(0)

    # The assumption here is that a cron job can churn through all the entities
    # fast enough. The number of dead bots is expected to be <10k. In practice
    # the average runtime is around 8 seconds.
    dead = 0
    seen = 0
    failed = 0
    try:
        futures = []
        for b in BotInfo.query(BotInfo.last_seen_ts <= cutoff):
            seen += 1
            if BotInfo.ALIVE in b.composite or BotInfo.DEAD not in b.composite:
                # Make sure the variable is not aliased.
                k = b.key
                # Unregister the bot from task queues since it can't reap anything.
                task_queues.cleanup_after_bot(k.parent())
                # Retry more often than the default of 1: we do not want to spam
                # the logs with transient failures, and there should be plenty
                # of time to do the retries.
                f = datastore_utils.transaction_async(lambda: run(k),
                                                      retries=5)
                futures.append(f)
                if len(futures) >= 5:
                    ndb.Future.wait_any(futures)
                    for i in xrange(len(futures) - 1, -1, -1):
                        if futures[i].done():
                            try:
                                dead += futures.pop(i).get_result()
                            except datastore_utils.CommitError:
                                logging.warning('Failed to commit a Tx')
                                failed += 1
        for f in futures:
            try:
                dead += f.get_result()
            except datastore_utils.CommitError:
                logging.warning('Failed to commit a Tx')
                failed += 1
    finally:
        logging.debug('Seen %d bots, updated %d bots, failed %d tx', seen,
                      dead, failed)
    return dead
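
A minimal sketch of the bounded-concurrency pattern used above: keep at most a
handful of transaction futures in flight, block until at least one settles with
ndb.Future.wait_any, then drain whichever ones are done. It assumes the App
Engine ndb library; process_keys and work_async are illustrative names, not
part of the original module.

from google.appengine.ext import ndb

MAX_IN_FLIGHT = 5

def process_keys(keys, work_async):
    """Runs work_async(key) for every key with at most MAX_IN_FLIGHT pending."""
    done_count = 0
    futures = []
    for key in keys:
        futures.append(work_async(key))
        if len(futures) >= MAX_IN_FLIGHT:
            # Block until at least one future settles, then pop the done ones.
            ndb.Future.wait_any(futures)
            for i in xrange(len(futures) - 1, -1, -1):
                if futures[i].done():
                    done_count += futures.pop(i).get_result()
    # Drain whatever is still pending once the input is exhausted.
    for f in futures:
        done_count += f.get_result()
    return done_count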
Example #2
@ndb.tasklet
def _remove_old_entity_async(key, now):
    """Removes a stale TaskDimensions or BotTaskDimensions instance.

    Returns:
      key if it was deleted.
    """
    obj = yield key.get_async()
    if not obj or obj.valid_until_ts >= now:
        raise ndb.Return(None)

    @ndb.tasklet
    def tx():
        obj = yield key.get_async()
        if obj and obj.valid_until_ts < now:
            yield key.delete_async()
            raise ndb.Return(key)

    res = yield datastore_utils.transaction_async(
        tx, propagation=ndb.TransactionOptions.INDEPENDENT)
    raise ndb.Return(res)
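
A hedged sketch of how such a tasklet could be fanned out over stale rows with
a keys-only query: ndb's Query.map_async runs the callback for every key and
waits for the futures the callback returns. The TaskDimensions query and the
helper name _cleanup_old_entities_async are illustrative assumptions, not code
from the original module.

from google.appengine.ext import ndb

@ndb.tasklet
def _cleanup_old_entities_async(now):
    """Deletes stale TaskDimensions rows; returns the keys actually deleted."""
    q = TaskDimensions.query(TaskDimensions.valid_until_ts < now)
    # keys_only avoids fetching full entities; the per-key transaction inside
    # _remove_old_entity_async re-reads the row before deleting it.
    deleted = yield q.map_async(
        lambda key: _remove_old_entity_async(key, now), keys_only=True)
    raise ndb.Return([k for k in deleted if k])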
Example #3
def schedule_request(request):
    """Creates and stores all the entities to schedule a new task request.

  The number of entities created is 3: TaskRequest, TaskResultSummary and
  TaskToRun.

  The TaskRequest is saved first as a DB transaction, then TaskResultSummary and
  TaskToRun are saved as a single DB RPC. The Search index is also updated
  in-between.

  Arguments:
  - request: is in the TaskRequest entity saved in the DB.

  Returns:
    TaskResultSummary. TaskToRun is not returned.
  """
    dupe_future = None
    if request.properties.idempotent:
        # Find a previously run task that is also idempotent and completed. Start a
        # query to fetch items that can be used to dedupe the task. See the comment
        # for this property for more details.
        #
        # Do not use "cls.created_ts > oldest" here because this would require a
        # composite index. It's unnecessary because TaskRequest.key is mostly
        # equivalent to decreasing TaskRequest.created_ts, ordering by key works as
        # well and doesn't require a composite index.
        cls = task_result.TaskResultSummary
        h = request.properties.properties_hash
        dupe_future = cls.query(cls.properties_hash == h).order(cls.key).get_async()

    # At this point, the request is in the DB but not yet in a state where it can
    # be triggered or is visible. Index it right away so it is searchable. If any
    # of the remaining calls in this function fail, the TaskRequest and Search
    # Document will simply point to an incomplete task, which will be ignored.
    #
    # Create the TaskToRun and TaskResultSummary entities but do not save them
    # yet. TaskRunResult will be created once a bot starts the task.
    task = task_to_run.new_task_to_run(request)
    result_summary = task_result.new_result_summary(request)

    # Do not specify a doc_id, as they are guaranteed to be monotonically
    # increasing and searches are done in reverse order, which fits exactly the
    # created_ts ordering. This is useful because DateField is precise to the date
    # (!) and NumberField is signed 32 bits so the best it could do with EPOCH is
    # second resolution up to year 2038.
    index = search.Index(name="requests")
    packed = task_pack.pack_result_summary_key(result_summary.key)
    doc = search.Document(
        fields=[search.TextField(name="name", value=request.name), search.AtomField(name="id", value=packed)]
    )
    # Even if it fails here, we're still fine, as the task is not "alive" yet.
    search_future = index.put_async([doc])

    now = utils.utcnow()

    if dupe_future:
        # Reuse the results!
        dupe_summary = dupe_future.get_result()
        # Refuse tasks older than X days. This is due to the isolate server dropping
        # files. https://code.google.com/p/swarming/issues/detail?id=197
        oldest = now - datetime.timedelta(seconds=config.settings().reusable_task_age_secs)
        if dupe_summary and dupe_summary.created_ts > oldest:
            # If there's a bug, commenting out this block is sufficient to disable the
            # functionality.
            # Setting task.queue_number to None removes it from the scheduling.
            task.queue_number = None
            _copy_entity(dupe_summary, result_summary, ("created_ts", "name", "user", "tags"))
            result_summary.properties_hash = None
            result_summary.try_number = 0
            result_summary.cost_saved_usd = result_summary.cost_usd
            # Only zap after.
            result_summary.costs_usd = []
            result_summary.deduped_from = task_pack.pack_run_result_key(dupe_summary.run_result_key)

    # Get parent task details if applicable.
    parent_task_keys = None
    if request.parent_task_id:
        parent_run_key = task_pack.unpack_run_result_key(request.parent_task_id)
        parent_task_keys = [parent_run_key, task_pack.run_result_key_to_result_summary_key(parent_run_key)]

    result_summary.modified_ts = now

    # Storing these entities makes this task live. It is important at this point
    # that the HTTP handler returns as fast as possible, otherwise the task will
    # be run but the client will not know about it.
    def run():
        ndb.put_multi([result_summary, task])

    def run_parent():
        # This one is slower.
        items = ndb.get_multi(parent_task_keys)
        k = result_summary.task_id
        for item in items:
            item.children_task_ids.append(k)
            item.modified_ts = now
        ndb.put_multi(items)

    # Raising will abort to the caller.
    futures = [datastore_utils.transaction_async(run)]
    if parent_task_keys:
        futures.append(datastore_utils.transaction_async(run_parent))

    try:
        search_future.get_result()
    except search.Error:
        # Do not abort the task, for now search is best effort.
        logging.exception("Put failed")

    for future in futures:
        # Check for failures, it would raise in this case, aborting the call.
        future.get_result()

    stats.add_task_entry(
        "task_enqueued", result_summary.key, dimensions=request.properties.dimensions, user=request.user
    )
    return result_summary
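
_copy_entity is referenced above but not shown in this listing. Judging from
the call site, it copies the deduped summary's fields onto the fresh
result_summary while skipping the fields that must keep the new task's values
(created_ts, name, user, tags). A minimal sketch of such a helper, with assumed
name and semantics, could look like this:

def _copy_entity(from_, to, skip=()):
    """Copies the ndb properties of `from_` onto `to`, except those in `skip`."""
    # _properties maps property names to Property objects on any ndb.Model.
    for prop_name in from_._properties:
        if prop_name in skip:
            continue
        setattr(to, prop_name, getattr(from_, prop_name))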
Example #4
def schedule_request(request):
  """Creates and stores all the entities to schedule a new task request.

  The number of entities created is 3: TaskRequest, TaskResultSummary and
  TaskToRun.

  The TaskRequest is saved first in its own DB transaction, then
  TaskResultSummary and TaskToRun are saved as a single DB RPC. The Search
  index is also updated in-between.

  Arguments:
  - request: the TaskRequest entity already saved in the DB.

  Returns:
    TaskResultSummary. TaskToRun is not returned.
  """
  dupe_future = None
  if request.properties.idempotent:
    # Find a previously run task that is also idempotent and completed. Start a
    # query to fetch items that can be used to dedupe the task. See the comment
    # for this property for more details.
    #
    # Do not use "cls.created_ts > oldest" here because this would require a
    # composite index. It's unnecessary because TaskRequest.key is mostly
    # equivalent to decreasing TaskRequest.created_ts, ordering by key works as
    # well and doesn't require a composite index.
    cls = task_result.TaskResultSummary
    h = request.properties.properties_hash
    dupe_future = cls.query(cls.properties_hash==h).order(cls.key).get_async()

  # At this point, the request is in the DB but not yet in a state where it can
  # be triggered or is visible. Index it right away so it is searchable. If any
  # of the remaining calls in this function fail, the TaskRequest and Search
  # Document will simply point to an incomplete task, which will be ignored.
  #
  # Create the TaskToRun and TaskResultSummary entities but do not save them
  # yet. TaskRunResult will be created once a bot starts the task.
  task = task_to_run.new_task_to_run(request)
  result_summary = task_result.new_result_summary(request)

  # Do not specify a doc_id, as they are guaranteed to be monotonically
  # increasing and searches are done in reverse order, which fits exactly the
  # created_ts ordering. This is useful because DateField is precise to the date
  # (!) and NumberField is signed 32 bits so the best it could do with EPOCH is
  # second resolution up to year 2038.
  index = search.Index(name='requests')
  packed = task_pack.pack_result_summary_key(result_summary.key)
  doc = search.Document(
      fields=[
        search.TextField(name='name', value=request.name),
        search.AtomField(name='id', value=packed),
      ])
  # Even if it fails here, we're still fine, as the task is not "alive" yet.
  search_future = index.put_async([doc])

  now = utils.utcnow()

  if dupe_future:
    # Reuse the results!
    dupe_summary = dupe_future.get_result()
    # Refuse tasks older than X days. This is due to the isolate server dropping
    # files. https://code.google.com/p/swarming/issues/detail?id=197
    oldest = now - datetime.timedelta(
        seconds=config.settings().reusable_task_age_secs)
    if dupe_summary and dupe_summary.created_ts > oldest:
      # If there's a bug, commenting out this block is sufficient to disable the
      # functionality.
      # Setting task.queue_number to None removes it from the scheduling.
      task.queue_number = None
      _copy_entity(dupe_summary, result_summary, ('created_ts', 'name', 'user'))
      result_summary.properties_hash = None
      result_summary.try_number = 0
      result_summary.cost_saved_usd = result_summary.cost_usd
      # Only zap after.
      result_summary.costs_usd = []
      result_summary.deduped_from = task_pack.pack_run_result_key(
          dupe_summary.run_result_key)

  # Get parent task details if applicable.
  parent_task_keys = None
  if request.parent_task_id:
    parent_run_key = task_pack.unpack_run_result_key(request.parent_task_id)
    parent_task_keys = [
      parent_run_key,
      task_pack.run_result_key_to_result_summary_key(parent_run_key),
    ]

  result_summary.modified_ts = now

  # Storing these entities makes this task live. It is important at this point
  # that the HTTP handler returns as fast as possible, otherwise the task will
  # be run but the client will not know about it.
  def run():
    ndb.put_multi([result_summary, task])

  def run_parent():
    # This one is slower.
    items = ndb.get_multi(parent_task_keys)
    k = result_summary.task_id
    for item in items:
      item.children_task_ids.append(k)
      item.modified_ts = now
    ndb.put_multi(items)

  # Raising will abort to the caller.
  futures = [datastore_utils.transaction_async(run)]
  if parent_task_keys:
    futures.append(datastore_utils.transaction_async(run_parent))

  try:
    search_future.get_result()
  except search.Error:
    # Do not abort the task, for now search is best effort.
    logging.exception('Put failed')

  for future in futures:
    # Check for failures, it would raise in this case, aborting the call.
    future.get_result()

  stats.add_task_entry(
      'task_enqueued', result_summary.key,
      dimensions=request.properties.dimensions,
      user=request.user)
  return result_summary
Example #5
def cron_update_bot_info():
    """Refreshes BotInfo.composite for dead bots."""
    @ndb.tasklet
    def run(bot_key):
        bot = yield bot_key.get_async()
        if bot and bot.should_be_dead and (bot.is_alive or not bot.is_dead):
            # The bot's composite gets updated in _pre_put_hook.
            yield bot.put_async()
            logging.info('Changing Bot status to DEAD: %s', bot.id)
            raise ndb.Return(bot_key)
        raise ndb.Return(None)

    def tx_result(future, stats):
        try:
            bot_key = future.get_result()
            if bot_key:
                stats['dead'] += 1
                bot = bot_key.get()
                logging.info('Sending bot_missing event: %s', bot.id)
                bot_event(event_type='bot_missing',
                          bot_id=bot.id,
                          message=None,
                          external_ip=None,
                          authenticated_as=None,
                          dimensions=None,
                          state=None,
                          version=None,
                          quarantined=None,
                          maintenance_msg=None,
                          task_id=None,
                          task_name=None,
                          register_dimensions=False,
                          last_seen_ts=bot.last_seen_ts)
        except datastore_utils.CommitError:
            logging.warning('Failed to commit a Tx')
            stats['failed'] += 1

    # The assumption here is that a cron job can churn through all the entities
    # fast enough. The number of dead bots is expected to be <10k. In practice
    # the average runtime is around 8 seconds.
    cron_stats = {
        'dead': 0,
        'seen': 0,
        'failed': 0,
    }
    try:
        futures = []
        for b in BotInfo.yield_dead_bots():
            cron_stats['seen'] += 1
            if b.is_alive or not b.is_dead:
                # Make sure the variable is not aliased.
                k = b.key
                # Unregister the bot from task queues since it can't reap anything.
                task_queues.cleanup_after_bot(k.parent())
                # Retry more often than the default of 1: we do not want to spam
                # the logs with transient failures, and there should be plenty
                # of time to do the retries.
                f = datastore_utils.transaction_async(lambda: run(k),
                                                      retries=5)
                futures.append(f)
                if len(futures) >= 5:
                    ndb.Future.wait_any(futures)
                    for i in range(len(futures) - 1, -1, -1):
                        if futures[i].done():
                            f = futures.pop(i)
                            tx_result(f, cron_stats)
        for f in futures:
            tx_result(f, cron_stats)
    finally:
        logging.debug('Seen %d bots, updated %d dead bots, failed %d tx',
                      cron_stats['seen'], cron_stats['dead'],
                      cron_stats['failed'])
    return cron_stats['dead']
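
BotInfo.yield_dead_bots and the should_be_dead / is_alive / is_dead properties
used above are not part of this listing. One plausible shape for them, reusing
the death cutoff from Example #1, is sketched below; the ALIVE/DEAD values and
the model fields are assumptions, not the actual definitions (config and utils
come from the surrounding Swarming modules, as in Example #1).

import datetime

from google.appengine.ext import ndb

class BotInfo(ndb.Model):
    # Assumed marker values stored in the composite list.
    ALIVE = 'alive'
    DEAD = 'dead'

    last_seen_ts = ndb.DateTimeProperty()
    composite = ndb.StringProperty(repeated=True)

    @property
    def should_be_dead(self):
        # Same cutoff as the explicit query in Example #1.
        dt = datetime.timedelta(seconds=config.settings().bot_death_timeout_secs)
        return self.last_seen_ts <= utils.utcnow() - dt

    @property
    def is_alive(self):
        return self.ALIVE in self.composite

    @property
    def is_dead(self):
        return self.DEAD in self.composite

    @classmethod
    def yield_dead_bots(cls):
        """Queries BotInfo entities whose last_seen_ts is past the cutoff."""
        dt = datetime.timedelta(seconds=config.settings().bot_death_timeout_secs)
        return cls.query(cls.last_seen_ts <= utils.utcnow() - dt)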