Пример #1
0
  def get_request_and_result(self, task_id):
    """Loads the TaskRequest and the result entity designated by 'task_id'.

    'task_id' is first parsed as a TaskResultSummary id; if that raises
    ValueError, it is parsed as a TaskRunResult id instead.

    Aborts the request with 404 when 'task_id' cannot be parsed or when either
    entity is missing, and with 403 when request.has_access is falsy.

    Returns:
      tuple(TaskRequest, result): result is either a TaskRunResult or a
                                  TaskResultSummary.
    """
    try:
      result_key = task_pack.unpack_result_summary_key(task_id)
      request_key = task_pack.result_summary_key_to_request_key(result_key)
    except ValueError:
      try:
        result_key = task_pack.unpack_run_result_key(task_id)
        summary_key = task_pack.run_result_key_to_result_summary_key(
            result_key)
        request_key = task_pack.result_summary_key_to_request_key(summary_key)
      except ValueError:
        self.abort(404, 'Invalid key format.')
    # Fetch both entities with a single batch call.
    request, result = ndb.get_multi((request_key, result_key))
    if not (request and result):
      self.abort(404, '%s not found.' % result_key.id())
    if not request.has_access:
      self.abort(403, '%s is not accessible.' % result_key.id())
    return request, result
Пример #2
0
def cancel_task(result_summary_key):
  """Tries to move a task to the CANCELED state.

  The TaskToRun and TaskResultSummary entities are loaded and saved inside a
  single transaction.

  Returns:
    tuple(bool, bool): whether the task was canceled, and whether its state
    was RUNNING when it was loaded. When the transaction raises CommitError,
    an error message str is returned instead of the tuple.
  """
  request = task_pack.result_summary_key_to_request_key(
      result_summary_key).get()
  to_run_key = task_to_run.request_to_task_to_run_key(request)
  now = utils.utcnow()

  def run():
    to_run, summary = ndb.get_multi((to_run_key, result_summary_key))
    was_running = summary.state == task_result.State.RUNNING
    if summary.can_be_canceled:
      to_run.queue_number = None
      summary.state = task_result.State.CANCELED
      summary.abandoned_ts = now
      summary.modified_ts = now
      ndb.put_multi((to_run, summary))
      return True, was_running
    return False, was_running

  try:
    ok, was_running = datastore_utils.transaction(run)
  except datastore_utils.CommitError as e:
    packed = task_pack.pack_result_summary_key(result_summary_key)
    return 'Failed killing task %s: %s' % (packed, e)
  # Record the TaskToRun in the negative lookup cache.
  task_to_run.set_lookup_cache(to_run_key, False)
  # TODO(maruel): Add stats.
  return ok, was_running
Пример #3
0
  def requests(self, request):
    """Returns tasks requests based on the filters.

    This endpoint is slightly slower than 'list'. Use 'list' or 'count' when
    possible.

    Raises:
      endpoints.BadRequestException: if include_performance_stats is set, if
          building or running the query raises ValueError, or if the
          datastore raises NeedIndexError or BadArgumentError.
    """
    logging.info('%s', request)
    if request.include_performance_stats:
      # The message names this endpoint (tasks/requests), consistent with the
      # filter error reported below.
      raise endpoints.BadRequestException(
          'Can\'t set include_performance_stats for tasks/requests')
    now = utils.utcnow()
    try:
      # Get the TaskResultSummary keys, then fetch the corresponding
      # TaskRequest entities.
      keys, cursor = datastore_utils.fetch_page(
          self._query_from_request(request),
          request.limit, request.cursor, keys_only=True)
      items = ndb.get_multi(
          task_pack.result_summary_key_to_request_key(k) for k in keys)
    except ValueError as e:
      raise endpoints.BadRequestException(
          'Inappropriate filter for tasks/requests: %s' % e)
    except datastore_errors.NeedIndexError as e:
      logging.error('%s', e)
      raise endpoints.BadRequestException(
          'Requires new index, ask admin to create one.')
    except datastore_errors.BadArgumentError as e:
      logging.error('%s', e)
      raise endpoints.BadRequestException(
          'This combination is unsupported, sorry.')
    return swarming_rpcs.TaskRequests(
        cursor=cursor,
        items=[message_conversion.task_request_to_rpc(i) for i in items],
        now=now)
Пример #4
0
 def request(self, request):
   """Returns the converted TaskRequest matching request.task_id."""
   _unused, summary_key = get_result_key(request.task_id)
   task_request_entity = get_or_raise(
       task_pack.result_summary_key_to_request_key(summary_key))
   as_dict = utils.to_json_encodable(task_request_entity)
   return message_conversion.task_request_from_dict(as_dict)
Пример #5
0
def make_request(request, is_bot_or_admin):
    """Fills in derived fields of 'request' and stores it via _put_request().

    When parent_task_id is set, values are taken from the parent TaskRequest:
    - priority: capped at parent.priority - 1, never below 0
    - user: overridden by parent.user

    Raises:
      ValueError: parent_task_id doesn't resolve to a stored entity.

    Returns:
      The same 'request' object that was passed in.
    """
    assert request.__class__ is TaskRequest
    if request.parent_task_id:
        parent_run_key = task_pack.unpack_run_result_key(
            request.parent_task_id)
        parent_summary_key = task_pack.run_result_key_to_result_summary_key(
            parent_run_key)
        parent = task_pack.result_summary_key_to_request_key(
            parent_summary_key).get()
        if not parent:
            raise ValueError("parent_task_id is not a valid task")
        capped = min(request.priority, parent.priority - 1)
        request.priority = max(capped, 0)
        # The parent's user replaces whatever was set on the request.
        request.user = parent.user

    # Priorities below 100 are only kept for bots and admins; for anyone else
    # the priority is silently reset to 100.
    if request.priority < 100:
        if not is_bot_or_admin:
            request.priority = 100

    request.authenticated = auth.get_current_identity()
    if not request.properties.is_terminate:
        if request.properties.grace_period_secs is None:
            request.properties.grace_period_secs = 30
    if request.properties.idempotent is None:
        request.properties.idempotent = False
    _put_request(request)
    return request
Пример #6
0
 def request(self, request):
     """Returns the RPC form of the TaskRequest matching request.task_id."""
     logging.info('%s', request)
     _unused, summary_key = get_result_key(request.task_id)
     task_request_entity = get_or_raise(
         task_pack.result_summary_key_to_request_key(summary_key))
     return message_conversion.task_request_to_rpc(task_request_entity)
Пример #7
0
def cancel_task(result_summary_key):
    """Tries to move a task to the CANCELED state.

    The TaskToRun and TaskResultSummary entities are loaded and saved inside a
    single transaction, which also calls _maybe_pubsub_notify_via_tq().

    Returns:
      tuple(bool, bool): whether the task was canceled, and whether its state
      was RUNNING when it was loaded. When the transaction raises CommitError,
      an error message str is returned instead of the tuple.
    """
    request_key = task_pack.result_summary_key_to_request_key(
        result_summary_key)
    request = request_key.get()
    to_run_key = task_to_run.request_to_task_to_run_key(request)
    now = utils.utcnow()

    def run():
        to_run, summary = ndb.get_multi((to_run_key, result_summary_key))
        was_running = summary.state == task_result.State.RUNNING
        if summary.can_be_canceled:
            to_run.queue_number = None
            summary.state = task_result.State.CANCELED
            summary.abandoned_ts = now
            summary.modified_ts = now

            # Start the writes, issue the notification, then wait on the
            # writes.
            pending = ndb.put_multi_async((to_run, summary))
            _maybe_pubsub_notify_via_tq(summary, request)
            for future in pending:
                future.check_success()
            return True, was_running
        return False, was_running

    try:
        ok, was_running = datastore_utils.transaction(run)
    except datastore_utils.CommitError as e:
        packed = task_pack.pack_result_summary_key(result_summary_key)
        return "Failed killing task %s: %s" % (packed, e)
    # Record the TaskToRun in the negative lookup cache.
    task_to_run.set_lookup_cache(to_run_key, False)
    # TODO(maruel): Add stats.
    return ok, was_running
Пример #8
0
 def test_result_summary_key_to_request_key(self):
     """Round-trips a request key through its result summary key."""
     expected = task_pack.unpack_request_key('11')
     summary_key = task_pack.request_key_to_result_summary_key(expected)
     round_tripped = task_pack.result_summary_key_to_request_key(summary_key)
     self.assertEqual(expected, round_tripped)
Пример #9
0
def init_new_request(request, allow_high_priority):
  """Prepares a new TaskRequest in place; the entity is not stored here.

  Sets priority, user, authenticated identity, property defaults, service
  account, tags and properties_hash on 'request'.

  When parent_task_id is set, values are taken from the parent TaskRequest:
  - priority: capped at parent.priority - 1, never below 0
  - user: overridden by parent.user

  Raises:
    ValueError: parent_task_id doesn't resolve to a stored entity.
    auth.AuthorizationError: service_account_token is set to something other
        than 'none' or 'bot'.
  """
  assert request.__class__ is TaskRequest, request
  if request.parent_task_id:
    parent_run_key = task_pack.unpack_run_result_key(request.parent_task_id)
    parent_summary_key = task_pack.run_result_key_to_result_summary_key(
        parent_run_key)
    parent = task_pack.result_summary_key_to_request_key(
        parent_summary_key).get()
    if not parent:
      raise ValueError('parent_task_id is not a valid task')
    capped = min(request.priority, parent.priority - 1)
    request.priority = max(capped, 0)
    # The parent's user replaces whatever was set on the request.
    request.user = parent.user

  # A priority below 100 is kept only when high priority is allowed or the
  # request is a terminate request; otherwise it is silently reset to 100.
  if (request.priority < 100 and not allow_high_priority and
      not request.properties.is_terminate):
    request.priority = 100

  request.authenticated = auth.get_current_identity()
  if not request.properties.is_terminate:
    if request.properties.grace_period_secs is None:
      request.properties.grace_period_secs = 30
  if request.properties.idempotent is None:
    request.properties.idempotent = False

  request.service_account = 'none'
  if request.service_account_token and request.service_account_token != 'none':
    if request.service_account_token != 'bot':
      # TODO(vadimsh): Check the token signature, verify it can be used by the
      # current user, extract service account email.
      raise auth.AuthorizationError('service_account_token is not implemented')
    request.service_account = 'bot'

  # Add the derived tags, then deduplicate and sort the whole list.
  request.tags.extend([
      'priority:%s' % request.priority,
      'user:%s' % request.user,
      'service_account:%s' % request.service_account,
  ])
  request.tags.extend(
      '%s:%s' % (name, value)
      for name, value in request.properties.dimensions.iteritems())
  request.tags = sorted(set(request.tags))

  # Only idempotent requests get a hash of their properties.
  properties_hash = None
  if request.properties.idempotent:
    serialized = utils.encode_to_json(request.properties)
    properties_hash = request.HASHING_ALGO(serialized).digest()
  request.properties_hash = properties_hash
Пример #10
0
  def post(self, task_id):
    """Schedules a clone of the request behind 'task_id' and redirects to it.

    'task_id' is parsed as a TaskResultSummary id first, then as a
    TaskRunResult id. Aborts with 404 if it cannot be parsed or if no
    TaskRequest is stored for it.
    """
    try:
      summary_key = task_pack.unpack_result_summary_key(task_id)
      request_key = task_pack.result_summary_key_to_request_key(summary_key)
    except ValueError:
      try:
        run_key = task_pack.unpack_run_result_key(task_id)
        summary_key = task_pack.run_result_key_to_result_summary_key(run_key)
        request_key = task_pack.result_summary_key_to_request_key(summary_key)
      except (NotImplementedError, ValueError):
        self.abort(404, 'Invalid key format.')

    original_request = request_key.get()
    if not original_request:
      self.abort(404, 'Invalid request key.')
    # A retry reuses the original task request, but with new parameters.
    clone = task_request.make_request_clone(original_request)
    new_summary = task_scheduler.schedule_request(clone)
    self.redirect('/user/task/%s' % new_summary.task_id)
Пример #11
0
  def post(self, task_id):
    """Schedules a clone of the request behind 'task_id' and redirects to it.

    'task_id' is parsed as a TaskResultSummary id first, then as a
    TaskRunResult id. Aborts with 404 if it cannot be parsed or if no
    TaskRequest is stored for it.
    """
    try:
      summary_key = task_pack.unpack_result_summary_key(task_id)
      request_key = task_pack.result_summary_key_to_request_key(summary_key)
    except ValueError:
      try:
        run_key = task_pack.unpack_run_result_key(task_id)
        summary_key = task_pack.run_result_key_to_result_summary_key(run_key)
        request_key = task_pack.result_summary_key_to_request_key(summary_key)
      except (NotImplementedError, ValueError):
        self.abort(404, 'Invalid key format.')

    original_request = request_key.get()
    if not original_request:
      self.abort(404, 'Invalid request key.')
    # A retry reuses the original task request, but with new parameters.
    clone = task_request.make_request_clone(original_request)
    new_summary = task_scheduler.schedule_request(clone)
    self.redirect('/user/task/%s' % new_summary.task_id)
Пример #12
0
def bot_kill_task(run_result_key, bot_id):
    """Marks a task's run result as BOT_DIED with internal_failure set.

    The TaskRunResult and TaskResultSummary are updated in one transaction.

    Returns:
      str error message, or None when there is no error to report.
    """
    result_summary_key = task_pack.run_result_key_to_result_summary_key(
        run_result_key)
    request_key = task_pack.result_summary_key_to_request_key(
        result_summary_key)
    request = request_key.get()
    server_version = utils.get_app_version()
    now = utils.utcnow()
    packed = task_pack.pack_run_result_key(run_result_key)

    def run():
        run_result, summary = ndb.get_multi(
            (run_result_key, result_summary_key))
        if bot_id and run_result.bot_id != bot_id:
            error = 'Bot %s sent task kill for task %s owned by bot %s' % (
                bot_id, packed, run_result.bot_id)
            return None, error

        if run_result.state == task_result.State.BOT_DIED:
            # Already flagged as BOT_DIED; nothing to update.
            return None, None

        run_result.signal_server_version(server_version)
        run_result.state = task_result.State.BOT_DIED
        run_result.internal_failure = True
        run_result.abandoned_ts = now
        run_result.modified_ts = now
        summary.set_from_run_result(run_result, None)

        # Start the writes, issue the notification, then wait on the writes.
        pending = ndb.put_multi_async((run_result, summary))
        _maybe_pubsub_notify_via_tq(summary, request)
        for future in pending:
            future.check_success()

        return run_result, None

    try:
        killed, msg = datastore_utils.transaction(run)
    except datastore_utils.CommitError as e:
        # At worst, the task will be tagged as BOT_DIED after BOT_PING_TOLERANCE
        # seconds passed on the next cron_handle_bot_died cron job.
        return 'Failed killing task %s: %s' % (packed, e)

    if killed:
        stats.add_run_entry(
            'run_bot_died',
            killed.key,
            bot_id=killed.bot_id,
            dimensions=request.properties.dimensions,
            user=request.user)
    return msg
Пример #13
0
def bot_kill_task(run_result_key, bot_id):
    """Marks a task's run result as BOT_DIED with internal_failure set.

    The TaskRunResult and TaskResultSummary are updated in one transaction.

    Returns:
      str error message, or None when there is no error to report.
    """
    result_summary_key = task_pack.run_result_key_to_result_summary_key(
        run_result_key)
    request_key = task_pack.result_summary_key_to_request_key(
        result_summary_key)
    request = request_key.get()
    server_version = utils.get_app_version()
    now = utils.utcnow()
    packed = task_pack.pack_run_result_key(run_result_key)

    def run():
        run_result, summary = ndb.get_multi(
            (run_result_key, result_summary_key))
        if bot_id and run_result.bot_id != bot_id:
            error = "Bot %s sent task kill for task %s owned by bot %s" % (
                bot_id, packed, run_result.bot_id)
            return None, error

        if run_result.state == task_result.State.BOT_DIED:
            # Already flagged as BOT_DIED; nothing to update.
            return None, None

        run_result.signal_server_version(server_version)
        run_result.state = task_result.State.BOT_DIED
        run_result.internal_failure = True
        run_result.abandoned_ts = now
        run_result.modified_ts = now
        summary.set_from_run_result(run_result, None)

        # Start the writes, issue the notification, then wait on the writes.
        pending = ndb.put_multi_async((run_result, summary))
        _maybe_pubsub_notify_via_tq(summary, request)
        for future in pending:
            future.check_success()

        return run_result, None

    try:
        killed, msg = datastore_utils.transaction(run)
    except datastore_utils.CommitError as e:
        # At worst, the task will be tagged as BOT_DIED after BOT_PING_TOLERANCE
        # seconds passed on the next cron_handle_bot_died cron job.
        return "Failed killing task %s: %s" % (packed, e)

    if killed:
        stats.add_run_entry(
            "run_bot_died",
            killed.key,
            bot_id=killed.bot_id,
            dimensions=request.properties.dimensions,
            user=request.user,
        )
    return msg
Пример #14
0
def make_request(request, is_bot_or_admin):
    """Fills in derived fields of 'request' and stores it via _put_request().

    When parent_task_id is set, values are taken from the parent TaskRequest:
    - priority: capped at parent.priority - 1, never below 0
    - user: overridden by parent.user

    Raises:
      ValueError: parent_task_id doesn't resolve to a stored entity.

    Returns:
      The same 'request' object that was passed in.
    """
    assert request.__class__ is TaskRequest
    if request.parent_task_id:
        parent_run_key = task_pack.unpack_run_result_key(
            request.parent_task_id)
        parent_summary_key = task_pack.run_result_key_to_result_summary_key(
            parent_run_key)
        parent = task_pack.result_summary_key_to_request_key(
            parent_summary_key).get()
        if not parent:
            raise ValueError('parent_task_id is not a valid task')
        capped = min(request.priority, parent.priority - 1)
        request.priority = max(capped, 0)
        # The parent's user replaces whatever was set on the request.
        request.user = parent.user

    # Priorities below 100 are only kept for bots and admins; for anyone else
    # the priority is silently reset to 100.
    if request.priority < 100:
        if not is_bot_or_admin:
            request.priority = 100

    request.authenticated = auth.get_current_identity()
    if not request.properties.is_terminate:
        if request.properties.grace_period_secs is None:
            request.properties.grace_period_secs = 30
    if request.properties.idempotent is None:
        request.properties.idempotent = False
    _put_request(request)
    return request
Пример #15
0
 def post(self):
   """Cancels every task whose TaskResultSummary is PENDING.

   Writes the number of canceled tasks, the number that were running and a
   status line to the response.
   """
   # There's two ways to query, either with TaskToRun.queue_number or with
   # TaskResultSummary.state.
   pending_query = task_result.TaskResultSummary.query(
       task_result.TaskResultSummary.state == task_result.State.PENDING)
   canceled_count = 0
   running_count = 0
   status = ''
   try:
     for summary_key in pending_query.iter(keys_only=True):
       request_key = task_pack.result_summary_key_to_request_key(summary_key)
       task_request_entity = request_key.get()
       ok, ran = task_scheduler.cancel_task(task_request_entity, summary_key)
       if ok:
         canceled_count += 1
       if ran:
         running_count += 1
     status = 'Success'
   except runtime.DeadlineExceededError:
     status = 'Deadline exceeded'
   report = 'Canceled %d tasks.\n%d tasks were running.\n%s' % (
       canceled_count, running_count, status)
   self.response.write(report)
Пример #16
0
    def requests(self, request):
        """Returns tasks requests based on the filters.

        This endpoint is slightly slower than 'list'. Use 'list' or 'count'
        when possible.

        Raises:
          endpoints.BadRequestException: if include_performance_stats is set,
              if building or running the query raises ValueError, or if the
              datastore raises NeedIndexError or BadArgumentError.
        """
        logging.debug('%s', request)
        if request.include_performance_stats:
            # The message names this endpoint (tasks/requests), consistent
            # with the filter error reported below.
            raise endpoints.BadRequestException(
                'Can\'t set include_performance_stats for tasks/requests')
        now = utils.utcnow()
        try:
            # Get the TaskResultSummary keys, then fetch the corresponding
            # TaskRequest entities.
            keys, cursor = datastore_utils.fetch_page(
                self._query_from_request(request),
                request.limit,
                request.cursor,
                keys_only=True)
            items = ndb.get_multi(
                task_pack.result_summary_key_to_request_key(k) for k in keys)
        except ValueError as e:
            raise endpoints.BadRequestException(
                'Inappropriate filter for tasks/requests: %s' % e)
        except datastore_errors.NeedIndexError as e:
            logging.error('%s', e)
            raise endpoints.BadRequestException(
                'Requires new index, ask admin to create one.')
        except datastore_errors.BadArgumentError as e:
            logging.error('%s', e)
            raise endpoints.BadRequestException(
                'This combination is unsupported, sorry.')
        return swarming_rpcs.TaskRequests(
            cursor=cursor,
            items=[message_conversion.task_request_to_rpc(i) for i in items],
            now=now)
Пример #17
0
def _handle_dead_bot(run_result_key):
  """Handles TaskRunResult where its bot has stopped showing sign of life.

  Transactionally updates the entities depending on the state of this task. The
  task may be retried automatically, canceled or left alone.

  Returns:
    True if the task was retried, False if the task was killed, None if no
    action was done. None is also returned when the transaction raises
    CommitError, and when only the stale run result was marked as BOT_DIED
    without touching the result summary.
  """
  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  # Start fetching the request while the other values are computed.
  request_future = request_key.get_async()
  now = utils.utcnow()
  server_version = utils.get_app_version()
  packed = task_pack.pack_run_result_key(run_result_key)
  request = request_future.get_result()

  def run():
    """Returns task_is_retried: True, False or None.

    Every exit path returns a single value (never a tuple) so the caller can
    rely on its truthiness.

    1x GET, 1x GETs 2~3x PUT.
    """
    run_result = run_result_key.get()
    if run_result.state != task_result.State.RUNNING:
      # It was updated already or not updating last. Likely DB index was stale.
      return None
    if run_result.modified_ts > now - task_result.BOT_PING_TOLERANCE:
      # The query index IS stale.
      return None

    current_task_slice = run_result.current_task_slice
    run_result.signal_server_version(server_version)
    old_modified = run_result.modified_ts
    run_result.modified_ts = now

    result_summary = result_summary_key.get()
    orig_summary_state = result_summary.state
    if result_summary.try_number != run_result.try_number:
      # Not updating correct run_result, cancel it without touching
      # result_summary.
      to_put = (run_result,)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      task_is_retried = None
    elif (result_summary.try_number == 1 and now < request.expiration_ts and
          (request.task_slice(current_task_slice).properties.idempotent or
            run_result.started_ts == old_modified)):
      # Retry it. It fits:
      # - first try
      # - not yet expired
      # - One of:
      #   - idempotent
      #   - task hadn't got any ping at all from task_runner.run_command()
      # TODO(maruel): Allow increasing the current_task_slice value.
      # Create a second TaskToRun with the same TaskSlice.
      to_run = task_to_run.new_task_to_run(request, 2, current_task_slice)
      to_put = (run_result, result_summary, to_run)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      # Do not sync data from run_result to result_summary, since the task is
      # being retried.
      result_summary.reset_to_pending()
      result_summary.modified_ts = now
      task_is_retried = True
    else:
      # Kill it as BOT_DIED, there was more than one try, the task expired in
      # the meantime or it wasn't idempotent.
      to_put = (run_result, result_summary)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      result_summary.set_from_run_result(run_result, request)
      task_is_retried = False

    futures = ndb.put_multi_async(to_put)
    # Only notify when the summary state actually changed.
    if orig_summary_state != result_summary.state:
      _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()

    return task_is_retried

  try:
    task_is_retried = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    task_is_retried = None
  if task_is_retried:
    logging.info('Retried %s', packed)
  elif task_is_retried is False:
    logging.debug('Ignored %s', packed)
  return task_is_retried
Пример #18
0
def bot_update_task(run_result_key, bot_id, output, output_chunk_start,
                    exit_code, duration, hard_timeout, io_timeout, cost_usd,
                    outputs_ref, cipd_pins, performance_stats):
    """Updates a TaskRunResult and TaskResultSummary, along TaskOutput.

    The arguments are validated first, then the entities are loaded, modified
    and stored inside a single transaction. After a successful transaction,
    the PubSub notification, stats and metrics helpers are called.

    Arguments:
    - run_result_key: ndb.Key to TaskRunResult.
    - bot_id: Self advertised bot id to ensure it's the one expected.
    - output: Data to append to this command output.
    - output_chunk_start: Index of output in the stdout stream.
    - exit_code: Mark that this task completed.
    - duration: Time spent in seconds for this task, excluding overheads.
    - hard_timeout: Bool set if an hard timeout occurred.
    - io_timeout: Bool set if an I/O timeout occurred.
    - cost_usd: Cost in $USD of this task up to now.
    - outputs_ref: task_request.FilesRef instance or None.
    - cipd_pins: None or task_result.CipdPins
    - performance_stats: task_result.PerformanceStats instance or None. Can
          only be set when the task is completing.

    Invalid states, these are flat out refused:
    - A command is updated after it had an exit code assigned to.

    Raises:
      ValueError: cost_usd or duration is negative, exactly one of duration
          and exit_code is None, or performance_stats is set without duration.

    Returns:
      TaskRunResult.state or None in case of failure.
    """
    assert output_chunk_start is None or isinstance(output_chunk_start, int)
    assert output is None or isinstance(output, str)
    if cost_usd is not None and cost_usd < 0.:
        raise ValueError('cost_usd must be None or greater or equal than 0')
    if duration is not None and duration < 0.:
        raise ValueError('duration must be None or greater or equal than 0')
    # duration and exit_code must be both set or both None.
    if (duration is None) != (exit_code is None):
        raise ValueError(
            'had unexpected duration; expected iff a command completes\n'
            'duration: %r; exit: %r' % (duration, exit_code))
    if performance_stats and duration is None:
        raise ValueError('duration must be set when performance_stats is set\n'
                         'duration: %s; performance_stats: %s' %
                         (duration, performance_stats))

    packed = task_pack.pack_run_result_key(run_result_key)
    # Only the length of the output is logged, not its content.
    logging.debug(
        'bot_update_task(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
        packed, bot_id,
        len(output) if output else output, output_chunk_start, exit_code,
        duration, hard_timeout, io_timeout, cost_usd, outputs_ref, cipd_pins,
        performance_stats)

    result_summary_key = task_pack.run_result_key_to_result_summary_key(
        run_result_key)
    request_key = task_pack.result_summary_key_to_request_key(
        result_summary_key)
    # The TaskRequest is fetched asynchronously, outside of the transaction.
    request_future = request_key.get_async()
    server_version = utils.get_app_version()
    request = request_future.get_result()
    now = utils.utcnow()

    def run():
        """Returns tuple(TaskResultSummary, TaskRunResult, str(error)).

        On error, the first two items are None and the third is the message.
        On success, the third item is None.

        Any error is returned as a string to be passed to logging.error()
        instead of logging inside the transaction for performance.
        """
        # 2 consecutive GETs, one PUT.
        run_result_future = run_result_key.get_async()
        result_summary_future = result_summary_key.get_async()
        run_result = run_result_future.get_result()
        if not run_result:
            result_summary_future.wait()
            return None, None, 'is missing'

        if run_result.bot_id != bot_id:
            result_summary_future.wait()
            return None, None, (
                'expected bot (%s) but had update from bot %s' %
                (run_result.bot_id, bot_id))

        # NOTE(review): unlike the other early returns, this one does not wait
        # on result_summary_future before returning - confirm it is intended.
        if not run_result.started_ts:
            return None, None, 'TaskRunResult is broken; %s' % (
                run_result.to_dict())

        # Assumptions:
        # - duration and exit_code are both set or not set.
        # - same for run_result.
        if exit_code is not None:
            if run_result.exit_code is not None:
                # This happens as an HTTP request is retried when the DB write succeeded
                # but it still returned HTTP 500.
                if run_result.exit_code != exit_code:
                    result_summary_future.wait()
                    return None, None, 'got 2 different exit_code; %s then %s' % (
                        run_result.exit_code, exit_code)
                if run_result.duration != duration:
                    result_summary_future.wait()
                    return None, None, 'got 2 different durations; %s then %s' % (
                        run_result.duration, duration)
            else:
                run_result.duration = duration
                run_result.exit_code = exit_code

        if outputs_ref:
            run_result.outputs_ref = outputs_ref

        if cipd_pins:
            run_result.cipd_pins = cipd_pins

        # The state is only changed while the run is in one of the running
        # states; a timeout takes precedence over a normal completion.
        if run_result.state in task_result.State.STATES_RUNNING:
            if hard_timeout or io_timeout:
                run_result.state = task_result.State.TIMED_OUT
                run_result.completed_ts = now
            elif run_result.exit_code is not None:
                run_result.state = task_result.State.COMPLETED
                run_result.completed_ts = now

        run_result.signal_server_version(server_version)
        run_result.validate(request)
        to_put = [run_result]
        if output:
            # This does 1 multi GETs. This also modifies run_result in place.
            to_put.extend(
                run_result.append_output(output, output_chunk_start or 0))
        if performance_stats:
            performance_stats.key = task_pack.run_result_key_to_performance_stats_key(
                run_result.key)
            to_put.append(performance_stats)

        # NOTE(review): cost_usd may be None here (the check at the top allows
        # it); max() with None presumably relies on Python 2 comparison rules
        # - confirm before porting.
        run_result.cost_usd = max(cost_usd, run_result.cost_usd or 0.)
        run_result.modified_ts = now

        result_summary = result_summary_future.get_result()
        if (result_summary.try_number
                and result_summary.try_number > run_result.try_number):
            # The situation where a shard is retried but the bot running the previous
            # try somehow reappears and reports success, the result must still show
            # the last try's result. We still need to update cost_usd manually.
            result_summary.costs_usd[run_result.try_number -
                                     1] = run_result.cost_usd
            result_summary.modified_ts = now
        else:
            result_summary.set_from_run_result(run_result, request)

        result_summary.validate(request)
        to_put.append(result_summary)
        ndb.put_multi(to_put)

        return result_summary, run_result, None

    try:
        smry, run_result, error = datastore_utils.transaction(run)
    except datastore_utils.CommitError as e:
        logging.info('Got commit error: %s', e)
        # It is important that the caller correctly surface this error.
        return None
    # Exactly one of error and run_result is expected to be set.
    assert bool(error) != bool(run_result), (error, run_result)
    if error:
        logging.error('Task %s %s', packed, error)
        return None
    # Caller must retry if PubSub enqueue fails.
    task_completed = run_result.state != task_result.State.RUNNING
    if not _maybe_pubsub_notify_now(smry, request):
        return None
    _update_stats(run_result, bot_id, request, task_completed)
    if task_completed:
        ts_mon_metrics.update_jobs_completed_metrics(smry)
    return run_result.state
Пример #19
0
def bot_update_task(
    run_result_key, bot_id, output, output_chunk_start, exit_code, duration,
    hard_timeout, io_timeout, cost_usd, outputs_ref, cipd_pins,
    performance_stats):
  """Records a progress report sent by a bot for the task it is running.

  The arguments are validated, then the entities are updated in a single
  transaction by _bot_update_tx().

  Arguments:
  - run_result_key: ndb.Key to TaskRunResult.
  - bot_id: Self advertised bot id to ensure it's the one expected.
  - output: Data to append to this command output.
  - output_chunk_start: Index of output in the stdout stream.
  - exit_code: Mark that this task completed.
  - duration: Time spent in seconds for this task, excluding overheads.
  - hard_timeout: Bool set if a hard timeout occurred.
  - io_timeout: Bool set if an I/O timeout occurred.
  - cost_usd: Cost in $USD of this task up to now.
  - outputs_ref: task_request.FilesRef instance or None.
  - cipd_pins: None or task_result.CipdPins
  - performance_stats: task_result.PerformanceStats instance or None. Can only
        be set when the task is completing.

  Raises:
    ValueError if cost_usd or duration is negative, if exactly one of duration
    and exit_code is provided, or if performance_stats is provided without
    duration.

  Returns:
    TaskRunResult.state (State.KILLED when the run result is flagged as
    killing) or None in case of failure.
  """
  assert output_chunk_start is None or isinstance(output_chunk_start, int)
  assert output is None or isinstance(output, str)
  if cost_usd is not None and cost_usd < 0.:
    raise ValueError('cost_usd must be None or greater or equal than 0')
  if duration is not None and duration < 0.:
    raise ValueError('duration must be None or greater or equal than 0')
  # duration and exit_code are reported together or not at all.
  has_duration = duration is not None
  has_exit_code = exit_code is not None
  if has_duration != has_exit_code:
    raise ValueError(
        'had unexpected duration; expected iff a command completes\n'
        'duration: %r; exit: %r' % (duration, exit_code))
  if performance_stats and not has_duration:
    raise ValueError(
        'duration must be set when performance_stats is set\n'
        'duration: %s; performance_stats: %s' %
        (duration, performance_stats))

  packed = task_pack.pack_run_result_key(run_result_key)
  # Only the size of the output is logged, never its content.
  output_size = len(output) if output else output
  logging.debug(
      'bot_update_task(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
      packed, bot_id, output_size, output_chunk_start, exit_code, duration,
      hard_timeout, io_timeout, cost_usd, outputs_ref, cipd_pins,
      performance_stats)

  summary_key = task_pack.run_result_key_to_result_summary_key(run_result_key)
  # Start fetching the request, then do unrelated work before waiting on it.
  pending_request = task_pack.result_summary_key_to_request_key(
      summary_key).get_async()
  server_version = utils.get_app_version()
  request = pending_request.get_result()
  now = utils.utcnow()

  def _update_in_tx():
    return _bot_update_tx(
        run_result_key, bot_id, output, output_chunk_start, exit_code,
        duration, hard_timeout, io_timeout, cost_usd, outputs_ref, cipd_pins,
        performance_stats, now, summary_key, server_version, request)

  try:
    summary, run_result, error = datastore_utils.transaction(_update_in_tx)
  except datastore_utils.CommitError as err:
    logging.info('Got commit error: %s', err)
    # It is important that the caller correctly surface this error as the bot
    # will retry on HTTP 500.
    return None

  if summary and summary.state != task_result.State.RUNNING:
    # Take no chance and explicitly clear the ndb memcache entry. A very rare
    # race condition was observed where a stale version of the entities is kept
    # in memcache.
    stale_keys = [summary_key, run_result_key]
    ndb.get_context()._clear_memcache(stale_keys).check_success()

  # Exactly one of error and run_result is expected to be set.
  assert bool(error) != bool(run_result), (error, run_result)
  if error:
    logging.error('Task %s %s', packed, error)
    return None

  # Caller must retry if PubSub enqueue fails.
  if not _maybe_pubsub_notify_now(summary, request):
    return None

  if summary.state not in task_result.State.STATES_RUNNING:
    event_mon_metrics.send_task_event(summary)
    ts_mon_metrics.on_task_completed(summary)

  # Hack a bit to tell the bot what it needs to hear (see handler_bot.py). It's
  # kind of an ugly hack but the other option is to return the whole run_result.
  return (
      task_result.State.KILLED if run_result.killing else run_result.state)
Пример #20
0
    def get(self, task_id):
        """Renders the 'swarming/user_task.html' page for a task.

        'task_id' is first parsed as a TaskResultSummary id and, if that
        fails, as a TaskRunResult id.

        Aborts with:
          404 if 'task_id' cannot be parsed either way, or if no result entity
              exists for the key.
          403 if the caller is not a privileged user.
        """
        try:
            key = task_pack.unpack_result_summary_key(task_id)
            request_key = task_pack.result_summary_key_to_request_key(key)
        except ValueError:
            # Not a summary id; fall back to interpreting it as a run result id.
            try:
                key = task_pack.unpack_run_result_key(task_id)
                request_key = task_pack.result_summary_key_to_request_key(
                    task_pack.run_result_key_to_result_summary_key(key)
                )
            except (NotImplementedError, ValueError):
                self.abort(404, "Invalid key format.")

        # 'result' can be either a TaskRunResult or TaskResultSummary.
        # Both fetches are started before either one is waited on.
        result_future = key.get_async()
        request_future = request_key.get_async()
        result = result_future.get_result()
        if not result:
            self.abort(404, "Invalid key.")

        if not acl.is_privileged_user():
            self.abort(403, "Implement access control based on the user")

        # NOTE(review): 'request' is used below without a None check; presumably
        # a result never exists without its TaskRequest - confirm.
        request = request_future.get_result()
        parent_task_future = None
        if request.parent_task_id:
            parent_key = task_pack.unpack_run_result_key(request.parent_task_id)
            parent_task_future = parent_key.get_async()
        # One async fetch per child task; they are all collected further down.
        children_tasks_futures = [task_pack.unpack_result_summary_key(c).get_async() for c in result.children_task_ids]

        bot_id = result.bot_id
        following_task_future = None
        previous_task_future = None
        # The neighbouring tasks are only looked up once this task has started.
        if result.started_ts:
            # Use a shortcut name because it becomes unwieldy otherwise.
            cls = task_result.TaskRunResult

            # Note that the links will be to the TaskRunResult, not to
            # TaskResultSummary.
            # First run on the same bot that started after this one.
            following_task_future = (
                cls.query(cls.bot_id == bot_id, cls.started_ts > result.started_ts).order(cls.started_ts).get_async()
            )
            # Last run on the same bot that started before this one.
            previous_task_future = (
                cls.query(cls.bot_id == bot_id, cls.started_ts < result.started_ts).order(-cls.started_ts).get_async()
            )

        bot_future = bot_management.get_info_key(bot_id).get_async() if bot_id else None

        # Everything was started above; from here on only wait for the results.
        following_task = None
        if following_task_future:
            following_task = following_task_future.get_result()

        previous_task = None
        if previous_task_future:
            previous_task = previous_task_future.get_result()

        parent_task = None
        if parent_task_future:
            parent_task = parent_task_future.get_result()
        children_tasks = [c.get_result() for c in children_tasks_futures]

        # Values handed to the template.
        params = {
            "bot": bot_future.get_result() if bot_future else None,
            "children_tasks": children_tasks,
            "is_admin": acl.is_admin(),
            "is_gae_admin": users.is_current_user_admin(),
            "is_privileged_user": acl.is_privileged_user(),
            "following_task": following_task,
            "full_appid": os.environ["APPLICATION_ID"],
            "host_url": self.request.host_url,
            "is_running": result.state == task_result.State.RUNNING,
            "now": utils.utcnow(),
            "parent_task": parent_task,
            "previous_task": previous_task,
            "request": request,
            "task": result,
            "xsrf_token": self.generate_xsrf_token(),
        }
        self.response.write(template.render("swarming/user_task.html", params))
Пример #21
0
def _handle_dead_bot(run_result_key):
    """Handles TaskRunResult where its bot has stopped showing sign of life.

    Transactionally updates the entities depending on the state of this task.
    The task may be retried automatically, canceled or left alone.

    Arguments:
      run_result_key: ndb.Key to the TaskRunResult to examine.

    Returns:
      True if the task was retried, False if the task was killed, None if the
      TaskResultSummary was left untouched. None covers the stale-index cases,
      a failed commit, and the case where the run result does not belong to
      the summary's current try (that run result is still marked BOT_DIED).
    """
    result_summary_key = task_pack.run_result_key_to_result_summary_key(
        run_result_key)
    request_key = task_pack.result_summary_key_to_request_key(
        result_summary_key)
    # Start fetching the request and do unrelated work before waiting on it.
    request_future = request_key.get_async()
    now = utils.utcnow()
    server_version = utils.get_app_version()
    packed = task_pack.pack_run_result_key(run_result_key)
    request = request_future.get_result()
    to_run_key = task_to_run.request_to_task_to_run_key(request)

    def run():
        """Returns tuple(task_is_retried or None, bot_id)."""
        # Do one GET, one PUT at the end.
        run_result, result_summary, to_run = ndb.get_multi(
            (run_result_key, result_summary_key, to_run_key))
        if run_result.state != task_result.State.RUNNING:
            # It was updated already or not updating last. Likely DB index was stale.
            return None, run_result.bot_id
        if run_result.modified_ts > now - task_result.BOT_PING_TOLERANCE:
            # The query index IS stale.
            return None, run_result.bot_id

        run_result.signal_server_version(server_version)
        run_result.modified_ts = now

        # Every branch below marks run_result as BOT_DIED; they differ in what
        # happens to result_summary and to_run.
        notify = False
        if result_summary.try_number != run_result.try_number:
            # Not updating correct run_result, cancel it without touching
            # result_summary.
            to_put = (run_result, )
            run_result.state = task_result.State.BOT_DIED
            run_result.internal_failure = True
            run_result.abandoned_ts = now
            task_is_retried = None
        elif result_summary.try_number == 1 and now < request.expiration_ts:
            # Retry it.
            to_put = (run_result, result_summary, to_run)
            to_run.queue_number = task_to_run.gen_queue_number(request)
            run_result.state = task_result.State.BOT_DIED
            run_result.internal_failure = True
            run_result.abandoned_ts = now
            # Do not sync data from run_result to result_summary, since the task is
            # being retried.
            result_summary.reset_to_pending()
            result_summary.modified_ts = now
            task_is_retried = True
        else:
            # Cancel it, there was more than one try or the task expired in the
            # meantime.
            to_put = (run_result, result_summary)
            run_result.state = task_result.State.BOT_DIED
            run_result.internal_failure = True
            run_result.abandoned_ts = now
            result_summary.set_from_run_result(run_result, request)
            notify = True
            task_is_retried = False

        # The PubSub notification is requested while the puts are in flight.
        futures = ndb.put_multi_async(to_put)
        if notify:
            _maybe_pubsub_notify_via_tq(result_summary, request)
        for f in futures:
            f.check_success()

        return task_is_retried, run_result.bot_id

    try:
        task_is_retried, bot_id = datastore_utils.transaction(run)
    except datastore_utils.CommitError:
        # A failed commit is handled like "nothing was done".
        task_is_retried, bot_id = None, None
    if task_is_retried is not None:
        task_to_run.set_lookup_cache(to_run_key, task_is_retried)
        if not task_is_retried:
            # NOTE(review): bot_id is run_result.bot_id as returned by run();
            # indexing it with [0] passes only its first element. Presumably
            # the whole id was intended - confirm against stats.add_run_entry.
            stats.add_run_entry('run_bot_died',
                                run_result_key,
                                bot_id=bot_id[0],
                                dimensions=request.properties.dimensions,
                                user=request.user)
        else:
            logging.info('Retried %s', packed)
    else:
        logging.info('Ignored %s', packed)
    return task_is_retried
Пример #22
0
def bot_update_task(
    run_result_key,
    bot_id,
    output,
    output_chunk_start,
    exit_code,
    duration,
    hard_timeout,
    io_timeout,
    cost_usd,
    outputs_ref,
):
    """Updates a TaskRunResult and TaskResultSummary, along TaskOutput.

    Arguments:
    - run_result_key: ndb.Key to TaskRunResult.
    - bot_id: Self advertised bot id to ensure it's the one expected.
    - output: Data to append to this command output.
    - output_chunk_start: Index of output in the stdout stream.
    - exit_code: Mark that this command is terminated.
    - duration: Time spent in seconds for this command.
    - hard_timeout: Bool set if a hard timeout occurred.
    - io_timeout: Bool set if an I/O timeout occurred.
    - cost_usd: Cost in $USD of this task up to now.
    - outputs_ref: Serialized FilesRef instance or None.

    Updates refused inside the transaction (missing entity, wrong bot, broken
    TaskRunResult, conflicting exit code or duration, duration without exit
    code or the reverse) are logged as errors and reported as (True, False).

    Raises:
      ValueError if cost_usd is negative.

    Returns:
      tuple(bool, bool); first is False when the transaction could not be
      committed or the PubSub notification failed and True otherwise, second
      is if the task completed.
    """
    assert output_chunk_start is None or isinstance(output_chunk_start, int)
    assert output is None or isinstance(output, str)
    if cost_usd is not None and cost_usd < 0.0:
        raise ValueError("cost_usd must be None or greater or equal than 0")

    packed = task_pack.pack_run_result_key(run_result_key)
    # Only the size of the output is logged, not its content.
    logging.debug(
        "bot_update_task(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
        packed,
        bot_id,
        len(output) if output else output,
        output_chunk_start,
        exit_code,
        duration,
        hard_timeout,
        io_timeout,
        cost_usd,
        outputs_ref,
    )

    result_summary_key = task_pack.run_result_key_to_result_summary_key(run_result_key)
    request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
    # Start fetching the request and do unrelated work before waiting on it.
    request_future = request_key.get_async()
    server_version = utils.get_app_version()
    request = request_future.get_result()
    now = utils.utcnow()

    def run():
        """Returns tuple(result_summary, run_result, task_completed, error).

        When the update is refused, the first two items are None,
        task_completed is False and error is a message; otherwise error is
        None.
        """
        # 2 consecutive GETs, one PUT.
        run_result_future = run_result_key.get_async()
        result_summary_future = result_summary_key.get_async()
        run_result = run_result_future.get_result()
        if not run_result:
            result_summary_future.wait()
            return None, None, False, "is missing"

        if run_result.bot_id != bot_id:
            result_summary_future.wait()
            return None, None, False, ("expected bot (%s) but had update from bot %s" % (run_result.bot_id, bot_id))

        # NOTE(review): unlike the other early returns, this one does not wait
        # on result_summary_future - confirm whether that is intentional.
        if not run_result.started_ts:
            return None, None, False, "TaskRunResult is broken; %s" % (run_result.to_dict())

        # This happens as an HTTP request is retried when the DB write succeeded but
        # it still returned HTTP 500.
        if run_result.exit_code is not None and exit_code is not None:
            if run_result.exit_code != exit_code:
                result_summary_future.wait()
                return None, None, False, "got 2 different exit_code; %s then %s" % (run_result.exit_code, exit_code)

        if run_result.durations and duration is not None:
            if run_result.durations[0] != duration:
                result_summary_future.wait()
                return None, None, False, "got 2 different durations; %s then %s" % (run_result.durations[0], duration)

        # duration and exit_code must be reported together or not at all.
        if (duration is None) != (exit_code is None):
            result_summary_future.wait()
            return (
                None,
                None,
                False,
                (
                    "had unexpected duration; expected iff a command completes\n"
                    "duration: %s vs %s; exit: %s vs %s"
                    % (run_result.durations, duration, run_result.exit_code, exit_code)
                ),
            )

        # If the command completed. Check if the value wasn't set already.
        if duration is not None and not run_result.durations:
            run_result.durations.append(duration)
        if exit_code is not None and run_result.exit_code is None:
            run_result.exit_codes.append(exit_code)

        if outputs_ref:
            run_result.outputs_ref = task_request.FilesRef(**outputs_ref)

        task_completed = run_result.exit_code is not None
        # The state only changes while the run result is in a running state; a
        # timeout takes precedence over a plain completion.
        if run_result.state in task_result.State.STATES_RUNNING:
            if hard_timeout or io_timeout:
                run_result.state = task_result.State.TIMED_OUT
                run_result.completed_ts = now
            elif task_completed:
                run_result.state = task_result.State.COMPLETED
                run_result.completed_ts = now

        run_result.signal_server_version(server_version)
        to_put = [run_result]
        if output:
            # This does 1 multi GETs. This also modifies run_result in place.
            to_put.extend(run_result.append_output(0, output, output_chunk_start or 0))

        # The recorded cost never decreases.
        # NOTE(review): cost_usd may be None here; max() with None relies on
        # Python 2 ordering rules - confirm before porting.
        run_result.cost_usd = max(cost_usd, run_result.cost_usd or 0.0)
        run_result.modified_ts = now

        result_summary = result_summary_future.get_result()
        if result_summary.try_number and result_summary.try_number > run_result.try_number:
            # The situation where a shard is retried but the bot running the previous
            # try somehow reappears and reports success, the result must still show
            # the last try's result. We still need to update cost_usd manually.
            result_summary.costs_usd[run_result.try_number - 1] = run_result.cost_usd
            result_summary.modified_ts = now
        else:
            result_summary.set_from_run_result(run_result, request)

        to_put.append(result_summary)
        ndb.put_multi(to_put)

        return result_summary, run_result, task_completed, None

    try:
        smry, run_result, task_completed, error = datastore_utils.transaction(run)
    except datastore_utils.CommitError as e:
        logging.info("Got commit error: %s", e)
        # It is important that the caller correctly surface this error.
        return False, False

    # run_result is None when the transaction refused the update.
    if run_result:
        # Caller must retry if PubSub enqueue fails.
        if not _maybe_pubsub_notify_now(smry, request):
            return False, False
        _update_stats(run_result, bot_id, request, task_completed)
    if error:
        logging.error("Task %s %s", packed, error)
    return True, task_completed
Пример #23
0
 def get(self, task_id):
     """Sends the TaskRequest behind 'task_id' in JSON-encodable form.

     Logs 'Unexpected old client' at error level on every call.
     """
     logging.error('Unexpected old client')
     _, result_summary_key = self.get_result_key(task_id)
     task_request = task_pack.result_summary_key_to_request_key(
         result_summary_key).get()
     payload = utils.to_json_encodable(task_request)
     self.send_response(payload)
Пример #24
0
def get_task_account_token(task_id, bot_id, scopes):
    """Mints an access token for the service account a task runs as.

    Authorization checks are assumed to have been made already.

    Tasks whose service account is 'none' or 'bot' get no token: that string
    is returned together with None. Otherwise the account must be a service
    account email and a token is minted for it:
      - with the task's grant token through _mint_oauth_token_via_grant()
        when the task has one (legacy path);
      - through _mint_service_account_token() using the task's realm
        otherwise.

    Args:
      task_id: ID of the task.
      bot_id: ID of the bot that executes the task, for logs.
      scopes: list of requested OAuth scopes.

    Returns:
      (<service account email> or 'bot' or 'none', AccessToken or None).

    Raises:
      PermissionError if the token server forbids the usage.
      MisconfigurationError if the service account is misconfigured.
      InternalError if the RPC fails unexpectedly.
    """
    # Resolve the TaskRequest key from the packed run result id.
    try:
        run_result_key = task_pack.unpack_run_result_key(task_id)
        summary_key = task_pack.run_result_key_to_result_summary_key(
            run_result_key)
        task_request_key = task_pack.result_summary_key_to_request_key(
            summary_key)
    except ValueError as exc:
        logging.error('Unexpectedly bad task_id: %s', exc)
        raise MisconfigurationError('Bad task_id: %s' % task_id)

    task_request = task_request_key.get()
    if not task_request:
        raise MisconfigurationError('No such task request: %s' % task_id)

    # 'none' or 'bot' cases are handled by the bot locally, no token for them.
    if task_request.service_account in ('none', 'bot'):
        return task_request.service_account, None

    # Anything else has to be a service account email. Double check this.
    is_email = service_accounts_utils.is_service_account(
        task_request.service_account)
    if not is_email:
        raise MisconfigurationError('Not a service account email: %s' %
                                    task_request.service_account)

    # Additional information for Token Server's logs.
    audit_tags = [
        'swarming:bot_id:%s' % bot_id,
        'swarming:task_id:%s' % task_id,
        'swarming:task_name:%s' % task_request.name,
    ]

    if task_request.service_account_token:
        # Use grant token to grab the real OAuth token. Note that the bot
        # caches the resulting OAuth token internally, so we don't bother to
        # cache it here.
        access_token, expiry = _mint_oauth_token_via_grant(
            task_request.service_account_token, scopes, audit_tags)
    else:
        # The grant token can be empty only when the task has a realm and the
        # service account was authorized via realm ACLs. Use the
        # MintServiceAccountToken RPC for such tasks.
        assert task_request.realm
        # Re-check if the service account is still allowed to run in the
        # realm, because it may have changed since the last check.
        pool_cfg = pools_config.get_pool_config(task_request.pool)
        realms.check_tasks_act_as(task_request, pool_cfg, enforce=True)
        access_token, expiry = _mint_service_account_token(
            task_request.service_account, task_request.realm, scopes,
            audit_tags)

    # Log and return the token.
    expiry_ts = int(utils.datetime_to_timestamp(expiry) / 1e6)
    token = AccessToken(access_token, expiry_ts)
    _check_and_log_token('task associated', task_request.service_account,
                         token)
    return task_request.service_account, token
Пример #25
0
 def request(self, request):
     """Returns the task request corresponding to a task ID."""
     _, result_summary_key = get_result_key(request.task_id)
     task_request = get_or_raise(
         task_pack.result_summary_key_to_request_key(result_summary_key))
     as_dict = task_request.to_dict()
     return message_conversion.task_request_from_dict(as_dict)
Пример #26
0
 def get(self, task_id):
   """Sends the TaskRequest behind 'task_id' in JSON-encodable form."""
   _, result_summary_key = self.get_result_key(task_id)
   task_request = task_pack.result_summary_key_to_request_key(
       result_summary_key).get()
   payload = utils.to_json_encodable(task_request)
   self.send_response(payload)
Пример #27
0
  def get(self, task_id):
    """Renders the 'swarming/user_task.html' page for a task.

    'task_id' is first parsed as a TaskResultSummary id and, if that fails, as
    a TaskRunResult id.

    Aborts with:
      404 if 'task_id' cannot be parsed either way, or if no result entity
          exists for the key.
      403 if the caller is not a privileged user.
    """
    try:
      key = task_pack.unpack_result_summary_key(task_id)
      request_key = task_pack.result_summary_key_to_request_key(key)
    except ValueError:
      # Not a summary id; fall back to interpreting it as a run result id.
      try:
        key = task_pack.unpack_run_result_key(task_id)
        request_key = task_pack.result_summary_key_to_request_key(
            task_pack.run_result_key_to_result_summary_key(key))
      except (NotImplementedError, ValueError):
        self.abort(404, 'Invalid key format.')

    # 'result' can be either a TaskRunResult or TaskResultSummary.
    # Both fetches are started before either one is waited on.
    result_future = key.get_async()
    request_future = request_key.get_async()
    result = result_future.get_result()
    if not result:
      self.abort(404, 'Invalid key.')

    if not acl.is_privileged_user():
      self.abort(403, 'Implement access control based on the user')

    # NOTE(review): 'request' is used below without a None check; presumably a
    # result never exists without its TaskRequest - confirm.
    request = request_future.get_result()
    parent_task_future = None
    if request.parent_task_id:
      parent_key = task_pack.unpack_run_result_key(request.parent_task_id)
      parent_task_future = parent_key.get_async()
    # One async fetch per child task; they are all collected further down.
    children_tasks_futures = [
      task_pack.unpack_result_summary_key(c).get_async()
      for c in result.children_task_ids
    ]

    bot_id = result.bot_id
    following_task_future = None
    previous_task_future = None
    # The neighbouring tasks are only looked up once this task has started.
    if result.started_ts:
      # Use a shortcut name because it becomes unwieldy otherwise.
      cls = task_result.TaskRunResult

      # Note that the links will be to the TaskRunResult, not to
      # TaskResultSummary.
      # First run on the same bot that started after this one.
      following_task_future = cls.query(
          cls.bot_id == bot_id,
          cls.started_ts > result.started_ts,
          ).order(cls.started_ts).get_async()
      # Last run on the same bot that started before this one.
      previous_task_future = cls.query(
          cls.bot_id == bot_id,
          cls.started_ts < result.started_ts,
          ).order(-cls.started_ts).get_async()

    bot_future = (
        bot_management.get_info_key(bot_id).get_async() if bot_id else None)

    # Everything was started above; from here on only wait for the results.
    following_task = None
    if following_task_future:
      following_task = following_task_future.get_result()

    previous_task = None
    if previous_task_future:
      previous_task = previous_task_future.get_result()

    parent_task = None
    if parent_task_future:
      parent_task = parent_task_future.get_result()
    children_tasks = [c.get_result() for c in children_tasks_futures]

    # Values handed to the template.
    params = {
      'bot': bot_future.get_result() if bot_future else None,
      'children_tasks': children_tasks,
      'is_admin': acl.is_admin(),
      'is_gae_admin': users.is_current_user_admin(),
      'is_privileged_user': acl.is_privileged_user(),
      'following_task': following_task,
      'full_appid': os.environ['APPLICATION_ID'],
      'host_url': self.request.host_url,
      'is_running': result.state == task_result.State.RUNNING,
      'now': utils.utcnow(),
      'parent_task': parent_task,
      'previous_task': previous_task,
      'request': request,
      'task': result,
      'xsrf_token': self.generate_xsrf_token(),
    }
    self.response.write(template.render('swarming/user_task.html', params))
Пример #28
0
 def request_key(self):
     """Derives the ndb.Key of the TaskRequest from this entity's own key."""
     own_key = self.key
     return task_pack.result_summary_key_to_request_key(own_key)
Пример #29
0
def bot_update_task(
    run_result_key, bot_id, output, output_chunk_start,
    exit_code, duration, hard_timeout, io_timeout, cost_usd, outputs_ref):
  """Updates a TaskRunResult and TaskResultSummary, along TaskOutput.

  Arguments:
  - run_result_key: ndb.Key to TaskRunResult.
  - bot_id: Self advertised bot id to ensure it's the one expected.
  - output: Data to append to this command output.
  - output_chunk_start: Index of output in the stdout stream.
  - exit_code: Mark that this command is terminated.
  - duration: Time spent in seconds for this command.
  - hard_timeout: Bool set if a hard timeout occurred.
  - io_timeout: Bool set if an I/O timeout occurred.
  - cost_usd: Cost in $USD of this task up to now.
  - outputs_ref: Serialized FilesRef instance or None.

  Updates refused inside the transaction (missing entity, wrong bot,
  conflicting exit code, duration without exit code or the reverse) are logged
  as errors and reported as (True, False).

  A request that is retried with the exit code that is already recorded is
  accepted and does not record the exit code and duration a second time.

  Raises:
    ValueError if cost_usd is negative.

  Returns:
    tuple(bool, bool); first is False only when the transaction could not be
    committed, second is if the task completed.
  """
  assert output_chunk_start is None or isinstance(output_chunk_start, int)
  assert output is None or isinstance(output, str)
  if cost_usd is not None and cost_usd < 0.:
    raise ValueError('cost_usd must be None or greater or equal than 0')

  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  # Start fetching the request and do unrelated work before waiting on it.
  request_future = request_key.get_async()
  server_version = utils.get_app_version()
  packed = task_pack.pack_run_result_key(run_result_key)
  request = request_future.get_result()
  now = utils.utcnow()

  def run():
    """Returns tuple(run_result or None, task_completed, error or None)."""
    # 2 consecutive GETs, one PUT.
    run_result_future = run_result_key.get_async()
    result_summary_future = result_summary_key.get_async()
    run_result = run_result_future.get_result()
    if not run_result:
      result_summary_future.wait()
      return None, False, 'is missing'

    if run_result.bot_id != bot_id:
      result_summary_future.wait()
      return None, False, 'expected bot (%s) but had update from bot %s' % (
          run_result.bot_id, bot_id)

    # This happens as an HTTP request is retried when the DB write succeeded but
    # it still returned HTTP 500.
    if run_result.exit_codes and exit_code is not None:
      if run_result.exit_codes[0] != exit_code:
        result_summary_future.wait()
        return None, False, 'got 2 different exit_codes; %d then %d' % (
            run_result.exit_codes[0], exit_code)

    # duration and exit_code must be reported together or not at all.
    if (duration is None) != (exit_code is None):
      result_summary_future.wait()
      return None, False, (
          'had unexpected duration; expected iff a command completes; index %d'
          % len(run_result.exit_codes))

    if exit_code is not None and not run_result.exit_codes:
      # The command completed. The values are recorded only once: a retried
      # request that carries the same exit code must not append a second entry,
      # otherwise the task would no longer be seen as completed below.
      run_result.durations.append(duration)
      run_result.exit_codes.append(exit_code)

    if outputs_ref:
      run_result.outputs_ref = task_request.FilesRef(**outputs_ref)

    task_completed = len(run_result.exit_codes) == 1
    # The state only changes while the run result is in a running state; a
    # timeout takes precedence over a plain completion.
    if run_result.state in task_result.State.STATES_RUNNING:
      if hard_timeout or io_timeout:
        run_result.state = task_result.State.TIMED_OUT
        run_result.completed_ts = now
      elif task_completed:
        run_result.state = task_result.State.COMPLETED
        run_result.completed_ts = now

    run_result.signal_server_version(server_version)
    to_put = [run_result]
    if output:
      # This does 1 multi GETs. This also modifies run_result in place.
      to_put.extend(
          run_result.append_output(0, output, output_chunk_start or 0))

    # The recorded cost never decreases. cost_usd may be None, which is treated
    # as 0 instead of relying on how None compares with a float.
    run_result.cost_usd = max(cost_usd or 0., run_result.cost_usd or 0.)
    run_result.modified_ts = now

    result_summary = result_summary_future.get_result()
    if (result_summary.try_number and
        result_summary.try_number > run_result.try_number):
      # The situation where a shard is retried but the bot running the previous
      # try somehow reappears and reports success, the result must still show
      # the last try's result. We still need to update cost_usd manually.
      result_summary.costs_usd[run_result.try_number-1] = run_result.cost_usd
      result_summary.modified_ts = now
    else:
      result_summary.set_from_run_result(run_result, request)

    to_put.append(result_summary)
    ndb.put_multi(to_put)
    return run_result, task_completed, None

  try:
    run_result, task_completed, error = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    # It is important that the caller correctly surface this error.
    return False, False

  # run_result is None when the transaction refused the update.
  if run_result:
    _update_stats(run_result, bot_id, request, task_completed)
  if error:
    logging.error('Task %s %s', packed, error)
  return True, task_completed
Пример #30
0
 def request_key(self):
   """Derives the ndb.Key of the TaskRequest from this entity's own key."""
   own_key = self.key
   return task_pack.result_summary_key_to_request_key(own_key)
Пример #31
0
def get_task_account_token(task_id, bot_id, scopes):
    """Produces an access token for the service account tied to a task.

    Authorization is expected to have been checked by the caller already.

    Args:
      task_id: packed run result ID of the task.
      bot_id: ID of the bot executing the task; only used in audit tags.
      scopes: list of requested OAuth scopes.

    Returns:
      A tuple (account, token):
        - ('none', None) or ('bot', None) when the task request's
          service_account is one of these two literal values; no token is
          minted in that case.
        - (<service account email>, AccessToken) otherwise.

    Raises:
      MisconfigurationError: task_id can't be unpacked, the task request
        doesn't exist, service_account is not a service account email, or the
        task request carries no service account token.
      PermissionError: if the token server forbids the usage.
      InternalError: if the RPC fails unexpectedly.
    """
    # Walk run result key -> result summary key -> request key.
    try:
        run_result_key = task_pack.unpack_run_result_key(task_id)
        summary_key = task_pack.run_result_key_to_result_summary_key(
            run_result_key)
        request_key = task_pack.result_summary_key_to_request_key(summary_key)
    except ValueError as exc:
        logging.error('Unexpectedly bad task_id: %s', exc)
        raise MisconfigurationError('Bad task_id: %s' % task_id)

    task_request = request_key.get()
    if not task_request:
        raise MisconfigurationError('No such task request: %s' % task_id)

    account = task_request.service_account

    # The bot deals with 'none' and 'bot' on its own; nothing to mint here.
    if account in ('none', 'bot'):
        return account, None

    # Anything else has to be a service account email.
    if not is_service_account(account):
        raise MisconfigurationError('Not a service account email: %s' %
                                    account)

    # The grant is expected to be stored on the request ahead of time.
    grant = task_request.service_account_token
    if not grant:
        raise MisconfigurationError(
            'The task request %s has no associated service account token' %
            task_id)

    # Extra context passed along with the minting request.
    audit_tags = [
        'swarming:bot_id:%s' % bot_id,
        'swarming:task_id:%s' % task_id,
        'swarming:task_name:%s' % task_request.name,
    ]

    # Exchange the grant for the actual OAuth token. No caching is done here.
    access_token, expiry = _mint_oauth_token_via_grant(
        grant, scopes, audit_tags)

    expiry_secs = int(utils.datetime_to_timestamp(expiry) / 1e6)
    token = AccessToken(access_token, expiry_secs)
    _check_and_log_token('task associated', account, token)
    return account, token
Пример #32
0
 def test_result_summary_key_to_request_key(self):
   """Round-trips a request key through its result summary key."""
   expected = task_pack.unpack_request_key('11')
   summary_key = task_pack.request_key_to_result_summary_key(expected)
   self.assertEqual(
       expected, task_pack.result_summary_key_to_request_key(summary_key))
Пример #33
0
def _handle_dead_bot(run_result_key):
    """Handles TaskRunResult where its bot has stopped showing sign of life.

    Transactionally updates the entities depending on the state of this task.
    The task may be retried automatically, canceled or left alone.

    Args:
      run_result_key: ndb.Key of the TaskRunResult whose bot went silent.

    Returns:
      True if the task was retried, False if the task was killed, None if no
      action was done (this includes the case where the transaction failed to
      commit).
    """
    result_summary_key = task_pack.run_result_key_to_result_summary_key(run_result_key)
    request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
    # Kick off the request fetch first; the cheap local values below are
    # computed before the result is waited on.
    request_future = request_key.get_async()
    now = utils.utcnow()
    server_version = utils.get_app_version()
    packed = task_pack.pack_run_result_key(run_result_key)
    request = request_future.get_result()
    to_run_key = task_to_run.request_to_task_to_run_key(request)

    def run():
        """Returns tuple(task_is_retried or None, bot_id)."""
        # Do one GET, one PUT at the end.
        run_result, result_summary, to_run = ndb.get_multi(
            (run_result_key, result_summary_key, to_run_key))
        if run_result.state != task_result.State.RUNNING:
            # It was updated already or not updating last. Likely DB index was
            # stale.
            return None, run_result.bot_id

        run_result.signal_server_version(server_version)
        run_result.modified_ts = now
        # Every remaining path marks this run as dead; only what happens to
        # result_summary and to_run differs.
        run_result.state = task_result.State.BOT_DIED
        run_result.internal_failure = True
        run_result.abandoned_ts = now

        notify = False
        if result_summary.try_number != run_result.try_number:
            # Not updating correct run_result, cancel it without touching
            # result_summary.
            to_put = (run_result,)
            task_is_retried = None
        elif result_summary.try_number == 1 and now < request.expiration_ts:
            # Retry it.
            to_put = (run_result, result_summary, to_run)
            to_run.queue_number = task_to_run.gen_queue_number(request)
            # Do not sync data from run_result to result_summary, since the
            # task is being retried.
            result_summary.reset_to_pending()
            result_summary.modified_ts = now
            task_is_retried = True
        else:
            # Cancel it, there was more than one try or the task expired in the
            # meantime.
            to_put = (run_result, result_summary)
            result_summary.set_from_run_result(run_result, request)
            notify = True
            task_is_retried = False

        # Start the writes, do the notification while they are in flight, then
        # make sure every write succeeded.
        futures = ndb.put_multi_async(to_put)
        if notify:
            _maybe_pubsub_notify_via_tq(result_summary, request)
        for f in futures:
            f.check_success()

        return task_is_retried, run_result.bot_id

    try:
        task_is_retried, bot_id = datastore_utils.transaction(run)
    except datastore_utils.CommitError:
        task_is_retried, bot_id = None, None
    if task_is_retried is not None:
        task_to_run.set_lookup_cache(to_run_key, task_is_retried)
        if not task_is_retried:
            # bot_id is the whole bot ID returned by run(); indexing it with
            # [0] would record only its first character.
            stats.add_run_entry(
                "run_bot_died",
                run_result_key,
                bot_id=bot_id,
                dimensions=request.properties.dimensions,
                user=request.user,
            )
        else:
            logging.info("Retried %s", packed)
    else:
        logging.info("Ignored %s", packed)
    return task_is_retried
Пример #34
0
  def get(self, task_id):
    """Renders the task page for a result summary ID or a run result ID.

    Aborts with 404 when task_id can't be unpacked or the result entity is
    missing, and with 403 when the caller is not a privileged user.
    """
    try:
      key = task_pack.unpack_result_summary_key(task_id)
      request_key = task_pack.result_summary_key_to_request_key(key)
    except ValueError:
      # Not a result summary ID; try it as a run result ID instead.
      try:
        key = task_pack.unpack_run_result_key(task_id)
        summary_key = task_pack.run_result_key_to_result_summary_key(key)
        request_key = task_pack.result_summary_key_to_request_key(summary_key)
      except (NotImplementedError, ValueError):
        self.abort(404, 'Invalid key format.')

    # Both fetches are started before either one is waited on. 'result' is
    # either a TaskRunResult or a TaskResultSummary.
    result_future = key.get_async()
    request_future = request_key.get_async()
    result = result_future.get_result()
    if not result:
      self.abort(404, 'Invalid key.')

    if not acl.is_privileged_user():
      self.abort(403, 'Implement access control based on the user')

    request = request_future.get_result()
    parent_future = (
        task_pack.unpack_run_result_key(request.parent_task_id).get_async()
        if request.parent_task_id else None)
    children_futures = [
      task_pack.unpack_result_summary_key(child_id).get_async()
      for child_id in result.children_task_ids
    ]

    bot_id = result.bot_id
    next_future = None
    prev_future = None
    if result.started_ts:
      # Neighbouring tasks on the same bot. These are TaskRunResult entities,
      # not TaskResultSummary.
      run_cls = task_result.TaskRunResult
      later_runs = run_cls.query(
          run_cls.bot_id == bot_id,
          run_cls.started_ts > result.started_ts)
      next_future = later_runs.order(run_cls.started_ts).get_async()
      earlier_runs = run_cls.query(
          run_cls.bot_id == bot_id,
          run_cls.started_ts < result.started_ts)
      prev_future = earlier_runs.order(-run_cls.started_ts).get_async()

    bot_future = None
    if bot_id:
      bot_future = bot_management.get_info_key(bot_id).get_async()

    # Resolve everything that was started above.
    following_task = next_future.get_result() if next_future else None
    previous_task = prev_future.get_result() if prev_future else None
    parent_task = parent_future.get_result() if parent_future else None
    children_tasks = [future.get_result() for future in children_futures]

    params = {
      'bot': bot_future.get_result() if bot_future else None,
      'children_tasks': children_tasks,
      'is_admin': acl.is_admin(),
      'is_gae_admin': users.is_current_user_admin(),
      'is_privileged_user': acl.is_privileged_user(),
      'following_task': following_task,
      'full_appid': os.environ['APPLICATION_ID'],
      'host_url': self.request.host_url,
      'is_running': result.state == task_result.State.RUNNING,
      'now': utils.utcnow(),
      'parent_task': parent_task,
      'previous_task': previous_task,
      'request': request,
      'task': result,
      'xsrf_token': self.generate_xsrf_token(),
    }
    page = template.render('swarming/user_task.html', params)
    self.response.write(page)
Пример #35
0
 def get(self, task_id):
   """Sends the task's request entity in JSON-encodable form.

   Every call is logged as an error ('Unexpected old client').
   """
   logging.error('Unexpected old client')
   _unused, summary_key = self.get_result_key(task_id)
   task_request = task_pack.result_summary_key_to_request_key(
       summary_key).get()
   self.send_response(utils.to_json_encodable(task_request))
Пример #36
0
 def get(self, task_id):
   """Sends the task's request entity in JSON-encodable form."""
   _unused, summary_key = self.get_result_key(task_id)
   task_request = task_pack.result_summary_key_to_request_key(
       summary_key).get()
   self.send_response(utils.to_json_encodable(task_request))
Пример #37
0
def make_request(data):
  """Builds and stores a TaskRequest from a plain dict description.

  Argument:
  - data: dict with the following keys (entries marked * are optional):
    - name
    - parent_task_id*
    - properties
      - commands
      - data
      - dimensions
      - env
      - execution_timeout_secs
      - grace_period_secs* (30 when absent)
      - idempotent* (False when absent)
      - io_timeout_secs
    - priority
    - scheduling_expiration_secs
    - tags
    - user

  When parent_task_id is set, the parent's request is loaded and:
  - priority: becomes min(priority, parent.priority - 1), floored at 0
  - user: replaced by parent.user

  Raises:
    ValueError: parent_task_id doesn't map to an existing task request.
    datastore_errors.BadValueError: more than one command was given.

  Returns:
    The newly created TaskRequest.
  """
  # Reject unknown or missing keys early, both at top level and in properties.
  _assert_keys(_EXPECTED_DATA_KEYS, _REQUIRED_DATA_KEYS, data, 'request keys')
  props = data['properties']
  _assert_keys(
      _EXPECTED_PROPERTIES_KEYS, _REQUIRED_PROPERTIES_KEYS, props,
      'request properties keys')

  parent_task_id = data.get('parent_task_id') or None
  if parent_task_id:
    # Work on a copy so the caller's dict is left untouched.
    data = data.copy()
    parent_request_key = task_pack.result_summary_key_to_request_key(
        task_pack.run_result_key_to_result_summary_key(
            task_pack.unpack_run_result_key(parent_task_id)))
    parent = parent_request_key.get()
    if not parent:
      raise ValueError('parent_task_id is not a valid task')
    capped_priority = min(data['priority'], parent.priority - 1)
    data['priority'] = max(capped_priority, 0)
    # The parent's user wins over whatever was supplied.
    data['user'] = parent.user

  # Checked here rather than in a validator so that previously stored task
  # requests can still be loaded.
  commands = props.get('commands') or []
  if len(commands) > 1:
    raise datastore_errors.BadValueError('Only one command is supported')

  # TaskProperties is responsible for making the values deterministic.
  properties = TaskProperties(
      commands=props['commands'],
      data=props['data'],
      dimensions=props['dimensions'],
      env=props['env'],
      execution_timeout_secs=props['execution_timeout_secs'],
      grace_period_secs=props.get('grace_period_secs', 30),
      idempotent=props.get('idempotent', False),
      io_timeout_secs=props['io_timeout_secs'])

  now = utils.utcnow()
  time_to_expiry = datetime.timedelta(
      seconds=data['scheduling_expiration_secs'])
  expiration_ts = now + time_to_expiry

  request = TaskRequest(
      authenticated=auth.get_current_identity(),
      created_ts=now,
      expiration_ts=expiration_ts,
      name=data['name'],
      parent_task_id=parent_task_id,
      priority=data['priority'],
      properties=properties,
      tags=data['tags'],
      user=data['user'] or '')
  _put_request(request)
  return request
Пример #38
0
 def request(self, request):
   """Looks up the task request for request.task_id and converts it to RPC."""
   logging.info('%s', request)
   _unused, summary_key = get_result_key(request.task_id)
   task_request = get_or_raise(
       task_pack.result_summary_key_to_request_key(summary_key))
   return message_conversion.task_request_to_rpc(task_request)