Example #1
0
def _update_stats(run_result, bot_id, request, completed):
    """Updates stats after a bot task update notification."""
    if completed:
        runtime_ms = 0
        if run_result.duration_as_seen_by_server:
            runtime_ms = _secs_to_ms(
                run_result.duration_as_seen_by_server.total_seconds())
        pending_ms = 0
        if run_result.started_ts:
            pending_ms = _secs_to_ms(
                (run_result.started_ts - request.created_ts).total_seconds())
        stats.add_run_entry('run_completed',
                            run_result.key,
                            bot_id=bot_id,
                            dimensions=request.properties.dimensions,
                            runtime_ms=runtime_ms,
                            user=request.user)
        stats.add_task_entry('task_completed',
                             task_pack.request_key_to_result_summary_key(
                                 request.key),
                             dimensions=request.properties.dimensions,
                             pending_ms=pending_ms,
                             user=request.user)
    else:
        stats.add_run_entry('run_updated',
                            run_result.key,
                            bot_id=bot_id,
                            dimensions=request.properties.dimensions)
Example #2
0
def _update_stats(run_result, bot_id, request, completed):
    """Updates stats after a bot task update notification."""
    if completed:
        runtime_ms = 0
        if run_result.duration_total:
            runtime_ms = _secs_to_ms(run_result.duration_total.total_seconds())
        pending_ms = 0
        if run_result.started_ts:
            pending_ms = _secs_to_ms((run_result.started_ts - request.created_ts).total_seconds())
        stats.add_run_entry(
            "run_completed",
            run_result.key,
            bot_id=bot_id,
            dimensions=request.properties.dimensions,
            runtime_ms=runtime_ms,
            user=request.user,
        )
        stats.add_task_entry(
            "task_completed",
            task_pack.request_key_to_result_summary_key(request.key),
            dimensions=request.properties.dimensions,
            pending_ms=pending_ms,
            user=request.user,
        )
    else:
        stats.add_run_entry("run_updated", run_result.key, bot_id=bot_id, dimensions=request.properties.dimensions)
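Both _update_stats variants above rely on a _secs_to_ms helper that is not shown in these examples. The sketch below illustrates what such a conversion could look like; only the helper's name and its call sites come from the examples, and the exact rounding behavior is an assumption.

def _secs_to_ms(secs):
    # Hypothetical sketch: converts a float number of seconds (e.g. the value
    # returned by timedelta.total_seconds()) into integer milliseconds. The
    # real helper in the original module may round or truncate differently.
    return int(round(secs * 1000))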
Example #3
0
def cron_abort_expired_task_to_run():
  """Aborts expired TaskToRun requests to execute a TaskRequest on a bot.

  Three reasons can cause this situation:
  - Higher throughput of incoming task requests than the rate at which task
    requests are completed, e.g. there aren't enough bots to run all the tasks
    that come in at the current rate. That's normal overflow and must be
    handled accordingly.
  - No connected bot satisfies the requested dimensions. This is trickier:
    either there is a typo in the dimensions, or all the bots died and the
    admins must reconnect them.
  - The server has internal failures causing it to fail to either distribute
    the tasks or properly receive results from the bots.
  """
  killed = 0
  skipped = 0
  try:
    for to_run in task_to_run.yield_expired_task_to_run():
      request = to_run.request_key.get()
      if _expire_task(to_run.key, request):
        killed += 1
        stats.add_task_entry(
            'task_request_expired',
            task_pack.request_key_to_result_summary_key(request.key),
            dimensions=request.properties.dimensions,
            user=request.user)
      else:
        # It's not a big deal, the bot will continue running.
        skipped += 1
  finally:
    # TODO(maruel): Use stats_framework.
    logging.info('Killed %d task, skipped %d', killed, skipped)
  return killed
Example #4
0
def cron_abort_expired_task_to_run():
  """Aborts expired TaskToRun requests to execute a TaskRequest on a bot.

  Three reasons can cause this situation:
  - Higher throughput of incoming task requests than the rate at which task
    requests are completed, e.g. there aren't enough bots to run all the tasks
    that come in at the current rate. That's normal overflow and must be
    handled accordingly.
  - No connected bot satisfies the requested dimensions. This is trickier:
    either there is a typo in the dimensions, or all the bots died and the
    admins must reconnect them.
  - The server has internal failures causing it to fail to either distribute
    the tasks or properly receive results from the bots.
  """
  killed = 0
  skipped = 0
  try:
    for to_run in task_to_run.yield_expired_task_to_run():
      request = to_run.request_key.get()
      if _expire_task(to_run.key, request):
        killed += 1
        stats.add_task_entry(
            'task_request_expired',
            task_pack.request_key_to_result_summary_key(request.key),
            dimensions=request.properties.dimensions,
            user=request.user)
      else:
        # It's not a big deal, the bot will continue running.
        skipped += 1
  finally:
    # TODO(maruel): Use stats_framework.
    logging.info('Killed %d task, skipped %d', killed, skipped)
  return killed
Example #5
0
def cron_abort_expired_task_to_run(host):
    """Aborts expired TaskToRun requests to execute a TaskRequest on a bot.

  Three reasons can cause this situation:
  - Higher throughput of incoming task requests than the rate at which task
    requests are completed, e.g. there aren't enough bots to run all the tasks
    that come in at the current rate. That's normal overflow and must be
    handled accordingly.
  - No connected bot satisfies the requested dimensions. This is trickier:
    either there is a typo in the dimensions, or all the bots died and the
    admins must reconnect them.
  - The server has internal failures causing it to fail to either distribute
    the tasks or properly receive results from the bots.

  Returns:
    Packed task ids of the aborted tasks.
  """
    killed = []
    skipped = 0
    try:
        for to_run in task_to_run.yield_expired_task_to_run():
            request = to_run.request_key.get()
            if _expire_task(to_run.key, request):
                # TODO(maruel): Know which try it is.
                killed.append(request)
                ts_mon_metrics.tasks_expired.increment(
                    fields=ts_mon_metrics.extract_job_fields(request.tags))
                stats.add_task_entry(
                    'task_request_expired',
                    task_pack.request_key_to_result_summary_key(request.key),
                    dimensions=request.properties.dimensions,
                    user=request.user)
            else:
                # It's not a big deal, the bot will continue running.
                skipped += 1
    finally:
        if killed:
            logging.warning(
                'EXPIRED!\n%d tasks:\n%s', len(killed),
                '\n'.join('  %s/user/task/%s  %s' %
                          (host, i.task_id, i.properties.dimensions)
                          for i in killed))
        # TODO(maruel): Use stats_framework.
        logging.info('Killed %d task, skipped %d', len(killed), skipped)
    return [i.task_id for i in killed]
Example #6
0
def cron_abort_expired_task_to_run(host):
    """Aborts expired TaskToRun requests to execute a TaskRequest on a bot.

  Three reasons can cause this situation:
  - Higher throughput of incoming task requests than the rate at which task
    requests are completed, e.g. there aren't enough bots to run all the tasks
    that come in at the current rate. That's normal overflow and must be
    handled accordingly.
  - No connected bot satisfies the requested dimensions. This is trickier:
    either there is a typo in the dimensions, or all the bots died and the
    admins must reconnect them.
  - The server has internal failures causing it to fail to either distribute
    the tasks or properly receive results from the bots.

  Returns:
    Packed task ids of the aborted tasks.
  """
    killed = []
    skipped = 0
    try:
        for to_run in task_to_run.yield_expired_task_to_run():
            request = to_run.request_key.get()
            if _expire_task(to_run.key, request):
                # TODO(maruel): Know which try it is.
                killed.append(request.task_id)
                stats.add_task_entry(
                    "task_request_expired",
                    task_pack.request_key_to_result_summary_key(request.key),
                    dimensions=request.properties.dimensions,
                    user=request.user,
                )
            else:
                # It's not a big deal, the bot will continue running.
                skipped += 1
    finally:
        if killed:
            logging.error(
                "EXPIRED!\n%d tasks:\n%s",
                len(killed),
                "\n".join("  https://%s/user/task/%s" % (host, i) for i in killed),
            )
        # TODO(maruel): Use stats_framework.
        logging.info("Killed %d task, skipped %d", len(killed), skipped)
    return killed
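The four cron_abort_expired_task_to_run variants above are meant to be driven by a cron endpoint. The sketch below shows one plausible way to wire the host-taking variant into a webapp2 handler; the handler class, the route it would be mounted on, and the use of host_url are illustrative assumptions and are not taken from the examples.

import logging

import webapp2


class CronAbortExpiredTaskToRunHandler(webapp2.RequestHandler):
    # Hypothetical wiring: only cron_abort_expired_task_to_run() comes from the
    # examples above; everything else here is an assumption for illustration.
    def get(self):
        # The host-taking variants return the packed task ids of the tasks that
        # were aborted, so the count can be reported directly.
        expired = cron_abort_expired_task_to_run(self.request.host_url)
        self.response.write('Aborted %d expired tasks\n' % len(expired))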
Example #7
0
def schedule_request(request):
    """Creates and stores all the entities to schedule a new task request.

  The number of entities created is 3: TaskRequest, TaskResultSummary and
  TaskToRun.

  The TaskRequest is saved first as a DB transaction, then TaskResultSummary and
  TaskToRun are saved as a single DB RPC. The Search index is also updated
  in-between.

  Arguments:
  - request: the TaskRequest entity already saved in the DB.

  Returns:
    TaskResultSummary. TaskToRun is not returned.
  """
    dupe_future = None
    if request.properties.idempotent:
        # Find a previously run task that is also idempotent and completed. Start a
        # query to fetch items that can be used to dedupe the task. See the comment
        # for this property for more details.
        #
        # Do not use "cls.created_ts > oldest" here because this would require a
        # composite index. It's unnecessary because TaskRequest.key is mostly
        # equivalent to decreasing TaskRequest.created_ts, ordering by key works as
        # well and doesn't require a composite index.
        cls = task_result.TaskResultSummary
        h = request.properties.properties_hash
        dupe_future = cls.query(cls.properties_hash == h).order(cls.key).get_async()

    # At this point, the request is now in the DB but not yet in a mode where it
    # can be triggered or visible. Index it right away so it is searchable. If
    # any of the remaining calls in this function fail, the TaskRequest and
    # Search Document will simply point to an incomplete task, which will be
    # ignored.
    #
    # Creates the entities TaskToRun and TaskResultSummary but does not save
    # them yet. TaskRunResult will be created once a bot starts it.
    task = task_to_run.new_task_to_run(request)
    result_summary = task_result.new_result_summary(request)

    # Do not specify a doc_id, as they are guaranteed to be monotonically
    # increasing and searches are done in reverse order, which fits exactly the
    # created_ts ordering. This is useful because DateField is precise to the date
    # (!) and NumberField is signed 32 bits so the best it could do with EPOCH is
    # second resolution up to year 2038.
    index = search.Index(name="requests")
    packed = task_pack.pack_result_summary_key(result_summary.key)
    doc = search.Document(
        fields=[search.TextField(name="name", value=request.name), search.AtomField(name="id", value=packed)]
    )
    # Even if it fails here, we're still fine, as the task is not "alive" yet.
    search_future = index.put_async([doc])

    now = utils.utcnow()

    if dupe_future:
        # Reuse the results!
        dupe_summary = dupe_future.get_result()
        # Refuse tasks older than X days. This is due to the isolate server dropping
        # files. https://code.google.com/p/swarming/issues/detail?id=197
        oldest = now - datetime.timedelta(seconds=config.settings().reusable_task_age_secs)
        if dupe_summary and dupe_summary.created_ts > oldest:
            # If there's a bug, commenting out this block is sufficient to disable the
            # functionality.
            # Setting task.queue_number to None removes it from the scheduling.
            task.queue_number = None
            _copy_entity(dupe_summary, result_summary, ("created_ts", "name", "user", "tags"))
            result_summary.properties_hash = None
            result_summary.try_number = 0
            result_summary.cost_saved_usd = result_summary.cost_usd
            # Only zap after.
            result_summary.costs_usd = []
            result_summary.deduped_from = task_pack.pack_run_result_key(dupe_summary.run_result_key)

    # Get parent task details if applicable.
    parent_task_keys = None
    if request.parent_task_id:
        parent_run_key = task_pack.unpack_run_result_key(request.parent_task_id)
        parent_task_keys = [parent_run_key, task_pack.run_result_key_to_result_summary_key(parent_run_key)]

    result_summary.modified_ts = now

    # Storing these entities makes this task live. It is important at this point
    # that the HTTP handler returns as fast as possible, otherwise the task will
    # be run but the client will not know about it.
    def run():
        ndb.put_multi([result_summary, task])

    def run_parent():
        # This one is slower.
        items = ndb.get_multi(parent_task_keys)
        k = result_summary.task_id
        for item in items:
            item.children_task_ids.append(k)
            item.modified_ts = now
        ndb.put_multi(items)

    # Raising will abort to the caller.
    futures = [datastore_utils.transaction_async(run)]
    if parent_task_keys:
        futures.append(datastore_utils.transaction_async(run_parent))

    try:
        search_future.get_result()
    except search.Error:
        # Do not abort the task, for now search is best effort.
        logging.exception("Put failed")

    for future in futures:
        # Check for failures, it would raise in this case, aborting the call.
        future.get_result()

    stats.add_task_entry(
        "task_enqueued", result_summary.key, dimensions=request.properties.dimensions, user=request.user
    )
    return result_summary
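This variant and the one in Example #9 copy selected fields from the deduplicated summary through a _copy_entity helper that is not included here (Example #8 uses an analogous _copy_summary). The sketch below is one rough way such a helper could work; the property iteration is an assumption, not the original implementation.

def _copy_entity(src, dst, skip_list):
    # Hypothetical sketch: copies every ndb property of src into dst except the
    # ones named in skip_list. Assumes plain ndb.Model instances of the same
    # kind whose datastore property names match their attribute names; the real
    # helper may need to special-case computed or auto-updated properties.
    assert type(src) == type(dst)
    kwargs = {
        k: getattr(src, k) for k in src._properties if k not in skip_list
    }
    dst.populate(**kwargs)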
Example #8
0
def schedule_request(request, check_acls=True):
    """Creates and stores all the entities to schedule a new task request.

  Checks ACLs first. Raises auth.AuthorizationError if the caller is not
  authorized to post this request.

  The number of entities created is 3: TaskRequest, TaskToRun and
  TaskResultSummary.

  All 3 entities in the same entity group (TaskRequest, TaskToRun,
  TaskResultSummary) are saved as a DB transaction.

  Arguments:
  - request: TaskRequest entity to be saved in the DB. Its key must not be set
             and the entity must not be saved in the DB yet.
  - check_acls: Whether the request should check ACLs.

  Returns:
    TaskResultSummary. TaskToRun is not returned.
  """
    assert isinstance(request, task_request.TaskRequest), request
    assert not request.key, request.key

    # Raises AuthorizationError with helpful message if the request.authorized
    # can't use some of the requested dimensions.
    if check_acls:
        _check_dimension_acls(request)

    now = utils.utcnow()
    request.key = task_request.new_request_key()
    task = task_to_run.new_task_to_run(request)
    result_summary = task_result.new_result_summary(request)
    result_summary.modified_ts = now

    def get_new_keys():
        # Warning: this assumes knowledge about the hierarchy of each entity.
        key = task_request.new_request_key()
        # ndb.Key instances are immutable, so rebuild the child keys under the
        # new parent instead of trying to mutate them in place.
        task.key = ndb.Key(task.key.kind(), task.key.id(), parent=key)
        old = result_summary.task_id
        result_summary.key = ndb.Key(
            result_summary.key.kind(), result_summary.key.id(), parent=key)
        logging.info('%s conflicted, using %s', old, result_summary.task_id)
        return key

    deduped = False
    if request.properties.idempotent:
        dupe_summary = _find_dupe_task(now, request.properties_hash)
        if dupe_summary:
            # Setting task.queue_number to None removes it from the scheduling.
            task.queue_number = None
            _copy_summary(
                dupe_summary, result_summary,
                ('created_ts', 'modified_ts', 'name', 'user', 'tags'))
            # Zap irrelevant properties. PerformanceStats is also not copied over,
            # since it's not relevant.
            result_summary.properties_hash = None
            result_summary.try_number = 0
            result_summary.cost_saved_usd = result_summary.cost_usd
            # Only zap after.
            result_summary.costs_usd = []
            result_summary.deduped_from = task_pack.pack_run_result_key(
                dupe_summary.run_result_key)
            # In this code path, there's not much to do as the task will not be run,
            # previous results are returned. We still need to store all the entities
            # correctly.
            datastore_utils.insert(request,
                                   get_new_keys,
                                   extra=[task, result_summary])
            logging.debug('New request %s reusing %s', result_summary.task_id,
                          dupe_summary.task_id)
            deduped = True

    if not deduped:
        # Storing these entities makes this task live. It is important at this point
        # that the HTTP handler returns as fast as possible, otherwise the task will
        # be run but the client will not know about it.
        datastore_utils.insert(request,
                               get_new_keys,
                               extra=[task, result_summary])
        logging.debug('New request %s', result_summary.task_id)

    # Get parent task details if applicable.
    if request.parent_task_id:
        parent_run_key = task_pack.unpack_run_result_key(
            request.parent_task_id)
        parent_task_keys = [
            parent_run_key,
            task_pack.run_result_key_to_result_summary_key(parent_run_key),
        ]

        def run_parent():
            # This one is slower.
            items = ndb.get_multi(parent_task_keys)
            k = result_summary.task_id
            for item in items:
                item.children_task_ids.append(k)
                item.modified_ts = now
            ndb.put_multi(items)

        # Raising will abort to the caller. There's a risk that for tasks with
        # parent tasks, the task will be lost due to this transaction.
        # TODO(maruel): An option is to update the parent task as part of a cron
        # job, which would remove this code from the critical path.
        datastore_utils.transaction(run_parent)

    stats.add_task_entry('task_enqueued',
                         result_summary.key,
                         dimensions=request.properties.dimensions,
                         user=request.user)
    ts_mon_metrics.update_jobs_requested_metrics(result_summary, deduped)
    return result_summary
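Unlike the other two schedule_request variants, this one delegates the dedup lookup to a _find_dupe_task helper. The sketch below reconstructs it from the inline query and age check used in Examples #7 and #9; everything beyond what the call site shows is an assumption.

def _find_dupe_task(now, h):
    # Sketch reconstructed from the inline dedup logic in the other
    # schedule_request variants; the real helper may differ in details.
    cls = task_result.TaskResultSummary
    # Ordering by key approximates decreasing created_ts without requiring a
    # composite index (see the comments in Examples #7 and #9).
    dupe_summary = cls.query(cls.properties_hash == h).order(cls.key).get()
    if not dupe_summary:
        return None
    # Refuse results older than the configured reusable age, since the isolate
    # server may have dropped the files by then.
    oldest = now - datetime.timedelta(
        seconds=config.settings().reusable_task_age_secs)
    if dupe_summary.created_ts <= oldest:
        return None
    return dupe_summary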
Example #9
0
def schedule_request(request):
  """Creates and stores all the entities to schedule a new task request.

  The number of entities created is 3: TaskRequest, TaskResultSummary and
  TaskToRun.

  The TaskRequest is saved first as a DB transaction, then TaskResultSummary and
  TaskToRun are saved as a single DB RPC. The Search index is also updated
  in-between.

  Arguments:
  - request: the TaskRequest entity already saved in the DB.

  Returns:
    TaskResultSummary. TaskToRun is not returned.
  """
  dupe_future = None
  if request.properties.idempotent:
    # Find a previously run task that is also idempotent and completed. Start a
    # query to fetch items that can be used to dedupe the task. See the comment
    # for this property for more details.
    #
    # Do not use "cls.created_ts > oldest" here because this would require a
    # composite index. It's unnecessary because TaskRequest.key is mostly
    # equivalent to decreasing TaskRequest.created_ts, ordering by key works as
    # well and doesn't require a composite index.
    cls = task_result.TaskResultSummary
    h = request.properties.properties_hash
    dupe_future = cls.query(cls.properties_hash==h).order(cls.key).get_async()

  # At this point, the request is now in the DB but not yet in a mode where it
  # can be triggered or visible. Index it right away so it is searchable. If
  # any of the remaining calls in this function fail, the TaskRequest and
  # Search Document will simply point to an incomplete task, which will be
  # ignored.
  #
  # Creates the entities TaskToRun and TaskResultSummary but does not save
  # them yet. TaskRunResult will be created once a bot starts it.
  task = task_to_run.new_task_to_run(request)
  result_summary = task_result.new_result_summary(request)

  # Do not specify a doc_id, as they are guaranteed to be monotonically
  # increasing and searches are done in reverse order, which fits exactly the
  # created_ts ordering. This is useful because DateField is precise to the date
  # (!) and NumberField is signed 32 bits so the best it could do with EPOCH is
  # second resolution up to year 2038.
  index = search.Index(name='requests')
  packed = task_pack.pack_result_summary_key(result_summary.key)
  doc = search.Document(
      fields=[
        search.TextField(name='name', value=request.name),
        search.AtomField(name='id', value=packed),
      ])
  # Even if it fails here, we're still fine, as the task is not "alive" yet.
  search_future = index.put_async([doc])

  now = utils.utcnow()

  if dupe_future:
    # Reuse the results!
    dupe_summary = dupe_future.get_result()
    # Refuse tasks older than X days. This is due to the isolate server dropping
    # files. https://code.google.com/p/swarming/issues/detail?id=197
    oldest = now - datetime.timedelta(
        seconds=config.settings().reusable_task_age_secs)
    if dupe_summary and dupe_summary.created_ts > oldest:
      # If there's a bug, commenting out this block is sufficient to disable the
      # functionality.
      # Setting task.queue_number to None removes it from the scheduling.
      task.queue_number = None
      _copy_entity(dupe_summary, result_summary, ('created_ts', 'name', 'user'))
      result_summary.properties_hash = None
      result_summary.try_number = 0
      result_summary.cost_saved_usd = result_summary.cost_usd
      # Only zap after.
      result_summary.costs_usd = []
      result_summary.deduped_from = task_pack.pack_run_result_key(
          dupe_summary.run_result_key)

  # Get parent task details if applicable.
  parent_task_keys = None
  if request.parent_task_id:
    parent_run_key = task_pack.unpack_run_result_key(request.parent_task_id)
    parent_task_keys = [
      parent_run_key,
      task_pack.run_result_key_to_result_summary_key(parent_run_key),
    ]

  result_summary.modified_ts = now

  # Storing these entities makes this task live. It is important at this point
  # that the HTTP handler returns as fast as possible, otherwise the task will
  # be run but the client will not know about it.
  def run():
    ndb.put_multi([result_summary, task])

  def run_parent():
    # This one is slower.
    items = ndb.get_multi(parent_task_keys)
    k = result_summary.task_id
    for item in items:
      item.children_task_ids.append(k)
      item.modified_ts = now
    ndb.put_multi(items)

  # Raising will abort to the caller.
  futures = [datastore_utils.transaction_async(run)]
  if parent_task_keys:
    futures.append(datastore_utils.transaction_async(run_parent))

  try:
    search_future.get_result()
  except search.Error:
    # Do not abort the task, for now search is best effort.
    logging.exception('Put failed')

  for future in futures:
    # Check for failures, it would raise in this case, aborting the call.
    future.get_result()

  stats.add_task_entry(
      'task_enqueued', result_summary.key,
      dimensions=request.properties.dimensions,
      user=request.user)
  return result_summary