예제 #1
0
def cancel_task(result_summary_key):
    """Cancel the task behind `result_summary_key` when its state allows it.

    The TaskToRun and the TaskResultSummary are read and rewritten within one
    datastore transaction. The pubsub notification is enqueued while the two
    writes are still in flight.

    Returns:
      tuple(ok, was_running) when the transaction went through. `ok` is False
      when the summary reports that it cannot be canceled; `was_running` tells
      whether the summary was in RUNNING state when it was read.
      When the transaction raises CommitError, an error message string is
      returned instead of the tuple.
    """
    request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
    request = request_key.get()
    to_run_key = task_to_run.request_to_task_to_run_key(request)
    now = utils.utcnow()

    def run():
        to_run, summary = ndb.get_multi((to_run_key, result_summary_key))
        was_running = summary.state == task_result.State.RUNNING
        if summary.can_be_canceled:
            # Dequeue the TaskToRun and flag the summary as canceled.
            to_run.queue_number = None
            summary.state = task_result.State.CANCELED
            summary.abandoned_ts = now
            summary.modified_ts = now

            # Start both writes, enqueue the notification, then make sure the
            # writes succeeded before leaving the transaction.
            pending = ndb.put_multi_async((to_run, summary))
            _maybe_pubsub_notify_via_tq(summary, request)
            for put_future in pending:
                put_future.check_success()
            return True, was_running
        return False, was_running

    try:
        outcome = datastore_utils.transaction(run)
    except datastore_utils.CommitError as e:
        packed = task_pack.pack_result_summary_key(result_summary_key)
        return "Failed killing task %s: %s" % (packed, e)
    ok, was_running = outcome
    # Add it to the negative cache.
    task_to_run.set_lookup_cache(to_run_key, False)
    # TODO(maruel): Add stats.
    return ok, was_running
예제 #2
0
def cancel_task(result_summary_key):
  """Cancels the task identified by `result_summary_key` when permitted.

  Both the TaskToRun and the TaskResultSummary are fetched and stored inside a
  single datastore transaction.

  Returns:
    tuple(ok, was_running) when the transaction went through; `ok` is False if
    the summary says it cannot be canceled. An error message string is returned
    instead when the transaction raises CommitError.
  """
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  request = request_key.get()
  to_run_key = task_to_run.request_to_task_to_run_key(request)
  now = utils.utcnow()

  def run():
    to_run, summary = ndb.get_multi((to_run_key, result_summary_key))
    was_running = summary.state == task_result.State.RUNNING
    if summary.can_be_canceled:
      # Unschedule the TaskToRun and record the cancellation on the summary.
      to_run.queue_number = None
      summary.state = task_result.State.CANCELED
      summary.abandoned_ts = now
      summary.modified_ts = now
      ndb.put_multi((to_run, summary))
      return True, was_running
    return False, was_running

  try:
    outcome = datastore_utils.transaction(run)
  except datastore_utils.CommitError as e:
    packed = task_pack.pack_result_summary_key(result_summary_key)
    return 'Failed killing task %s: %s' % (packed, e)
  ok, was_running = outcome
  # Add it to the negative cache.
  task_to_run.set_lookup_cache(to_run_key, False)
  # TODO(maruel): Add stats.
  return ok, was_running
예제 #3
0
def _expire_task(to_run_key, request):
    """Expires a TaskResultSummary and unschedules the TaskToRun.

    Args:
      to_run_key: key of the TaskToRun to dequeue.
      request: the request entity; its key is used to derive the
          TaskResultSummary key.

    Returns:
      True when the task was expired inside the transaction.
      False when the transaction found the TaskToRun missing or not reapable,
      or when the transaction raised CommitError.
      None when the check done before the transaction already showed that the
      TaskToRun is missing or not reapable.
    """
    # Look if the TaskToRun is reapable once before doing the check inside the
    # transaction. This reduces the likelihood of failing this check inside the
    # transaction, which is an order of magnitude more costly.
    # A missing entity is treated like a non reapable one, as the transaction
    # below does, instead of failing with AttributeError on None.
    precheck = to_run_key.get()
    if not precheck or not precheck.is_reapable:
        logging.info('Not reapable anymore')
        return None

    result_summary_key = task_pack.request_key_to_result_summary_key(
        request.key)
    now = utils.utcnow()

    def run():
        # 2 concurrent GET, one PUT. Optionally with an additional serialized GET.
        to_run_future = to_run_key.get_async()
        result_summary_future = result_summary_key.get_async()
        to_run = to_run_future.get_result()
        if not to_run or not to_run.is_reapable:
            # Let the second GET finish before leaving the transaction.
            result_summary_future.wait()
            return False

        to_run.queue_number = None
        result_summary = result_summary_future.get_result()
        if result_summary.try_number:
            # It's a retry that is being expired. Keep the old state. That requires an
            # additional pipelined GET but that shouldn't be the common case.
            run_result = result_summary.run_result_key.get()
            result_summary.set_from_run_result(run_result, request)
        else:
            result_summary.state = task_result.State.EXPIRED
        result_summary.abandoned_ts = now
        result_summary.modified_ts = now

        # Start the writes, enqueue the notification, then verify the writes.
        futures = ndb.put_multi_async((to_run, result_summary))
        _maybe_pubsub_notify_via_tq(result_summary, request)
        for f in futures:
            f.check_success()

        return True

    # It'll be caught by next cron job execution in case of failure.
    try:
        success = datastore_utils.transaction(run)
    except datastore_utils.CommitError:
        success = False
    if success:
        task_to_run.set_lookup_cache(to_run_key, False)
        logging.info('Expired %s',
                     task_pack.pack_result_summary_key(result_summary_key))
    return success
예제 #4
0
def _expire_task(to_run_key, request):
    """Expires a TaskResultSummary and unschedules the TaskToRun.

    Args:
      to_run_key: key of the TaskToRun to dequeue.
      request: the request entity; its key is used to derive the
          TaskResultSummary key.

    Returns:
      True when the task was expired inside the transaction.
      False when the transaction found the TaskToRun missing or not reapable,
      or when the transaction raised CommitError.
      None when the check done before the transaction already showed that the
      TaskToRun is missing or not reapable.
    """
    # Look if the TaskToRun is reapable once before doing the check inside the
    # transaction. This reduces the likelihood of failing this check inside the
    # transaction, which is an order of magnitude more costly.
    # A missing entity is handled the same way as inside the transaction rather
    # than raising AttributeError on None.
    precheck = to_run_key.get()
    if not precheck or not precheck.is_reapable:
        logging.info("Not reapable anymore")
        return None

    result_summary_key = task_pack.request_key_to_result_summary_key(request.key)
    now = utils.utcnow()

    def run():
        # 2 concurrent GET, one PUT. Optionally with an additional serialized GET.
        to_run_future = to_run_key.get_async()
        result_summary_future = result_summary_key.get_async()
        to_run = to_run_future.get_result()
        if not to_run or not to_run.is_reapable:
            # Let the second GET finish before leaving the transaction.
            result_summary_future.wait()
            return False

        to_run.queue_number = None
        result_summary = result_summary_future.get_result()
        if result_summary.try_number:
            # It's a retry that is being expired. Keep the old state. That requires an
            # additional pipelined GET but that shouldn't be the common case.
            run_result = result_summary.run_result_key.get()
            result_summary.set_from_run_result(run_result, request)
        else:
            result_summary.state = task_result.State.EXPIRED
        result_summary.abandoned_ts = now
        result_summary.modified_ts = now

        # Start the writes, enqueue the notification, then verify the writes.
        futures = ndb.put_multi_async((to_run, result_summary))
        _maybe_pubsub_notify_via_tq(result_summary, request)
        for f in futures:
            f.check_success()

        return True

    # It'll be caught by next cron job execution in case of failure.
    try:
        success = datastore_utils.transaction(run)
    except datastore_utils.CommitError:
        success = False
    if success:
        task_to_run.set_lookup_cache(to_run_key, False)
        logging.info("Expired %s", task_pack.pack_result_summary_key(result_summary_key))
    return success
예제 #5
0
 def test_set_lookup_cache(self):
   """Negative cache entries of two TaskToRun of one request are independent."""
   # Create two TaskToRun on the same TaskRequest and assert that affecting one
   # negative cache entry doesn't affect the other.
   request = self.mkreq(1, _gen_request())
   to_run_1 = task_to_run.new_task_to_run(request, 1, 0)
   to_run_1.put()
   to_run_2 = task_to_run.new_task_to_run(request, 2, 0)
   to_run_2.put()

   def is_taken(key):
     return task_to_run._lookup_cache_is_taken_async(key).get_result()

   def expect(taken_1, taken_2):
     # Checks the first entity, then the second one.
     self.assertEqual(taken_1, is_taken(to_run_1.key))
     self.assertEqual(taken_2, is_taken(to_run_2.key))

   # By default, the negative cache is false, i.e. it is safe to reap the task.
   expect(False, False)
   # Mark to_run_1 as safe to reap.
   task_to_run.set_lookup_cache(to_run_1.key, True)
   expect(False, False)
   # Mark to_run_1 as unreapable, i.e. a bot is about to reap it.
   task_to_run.set_lookup_cache(to_run_1.key, False)
   expect(True, False)
   # Flip to_run_1 back to safe to reap.
   task_to_run.set_lookup_cache(to_run_1.key, True)
   expect(False, False)
   # Mark to_run_2 as unreapable, i.e. a bot is about to reap it.
   task_to_run.set_lookup_cache(to_run_2.key, False)
   expect(False, True)
예제 #6
0
def _reap_task(to_run_key, request, bot_id, bot_version, bot_dimensions):
    """Reaps a task for a bot and stores the resulting entities.

    Inside one transaction, the TaskToRun is dequeued, a new run result is
    created for `bot_id` and the TaskResultSummary is synced from it.

    Returns:
      The new run result when the task was reaped. None when the TaskToRun is
      missing or not reapable, when the summary already carries this bot's id,
      or when the transaction raised CommitError.
    """
    assert bot_id, bot_id
    expected_request_key = task_to_run.task_to_run_key_to_request_key(
        to_run_key)
    assert request.key == expected_request_key
    result_summary_key = task_pack.request_key_to_result_summary_key(
        request.key)

    now = utils.utcnow()

    def run():
        # 2 GET, 1 PUT at the end.
        pending_to_run = to_run_key.get_async()
        pending_summary = result_summary_key.get_async()
        to_run = pending_to_run.get_result()
        summary = pending_summary.get_result()
        if not to_run:
            logging.error('Missing TaskToRun?\n%s', summary.task_id)
            return None
        if not to_run.is_reapable:
            logging.info('%s is not reapable', summary.task_id)
            return None
        if summary.bot_id == bot_id:
            # This means two things, first it's a retry, second it's that the first
            # try failed and the retry is being reaped by the same bot. Deny that, as
            # the bot may be deeply broken and could be in a killing spree.
            logging.warning('%s can\'t retry its own internal failure task',
                            summary.task_id)
            return None
        to_run.queue_number = None
        next_try = (summary.try_number or 0) + 1
        run_result = task_result.new_run_result(
            request, next_try, bot_id, bot_version, bot_dimensions)
        run_result.modified_ts = now
        summary.set_from_run_result(run_result, request)
        ndb.put_multi([to_run, run_result, summary])
        return run_result

    # The bot will reap the next available task in case of failure, no big deal.
    try:
        reaped = datastore_utils.transaction(run, retries=0)
    except datastore_utils.CommitError:
        return None
    if reaped:
        task_to_run.set_lookup_cache(to_run_key, False)
    return reaped
예제 #7
0
  def run():
    """Cancels a pending task, or flags a running one to be killed.

    Reads `result_key`, `request`, `now` and `kill_running` from the enclosing
    scope, which is not part of this block.

    Operations: 1 DB GET for the result summary, 1 DB GET for either the
    TaskToRun (pending) or the run result (running), 1 negative cache write
    (pending only), 2 DB PUTs and 1 pubsub notification helper call.

    Returns:
      tuple(ok, was_running). `ok` is False when the summary cannot be
      canceled, or when the task is running and `kill_running` is not set.
    """
    # Need to get the current try number to know which TaskToRun to fetch.
    result_summary = result_key.get()
    was_running = result_summary.state == task_result.State.RUNNING
    if not result_summary.can_be_canceled:
      return False, was_running

    entities = [result_summary]
    if not was_running:
      # PENDING.
      result_summary.state = task_result.State.CANCELED
      to_run_key = task_to_run.request_to_task_to_run_key(
          request,
          result_summary.try_number or 1,
          result_summary.current_task_slice or 0)
      to_run_future = to_run_key.get_async()

      # Add it to the negative cache.
      task_to_run.set_lookup_cache(to_run_key, False)

      to_run = to_run_future.get_result()
      entities.append(to_run)
      to_run.queue_number = None
    else:
      if not kill_running:
        # Deny canceling a task that started.
        return False, was_running
      # RUNNING.
      run_result = result_summary.run_result_key.get()
      # Do not change state to KILLED yet. Instead, use a 2 phase commit:
      # - set killing to True
      # - on next bot report, tell it to kill the task
      # - once the bot reports the task as terminated, set state to KILLED
      run_result.killing = True
      run_result.abandoned_ts = now
      run_result.modified_ts = now
      # Queue the entity exactly once; it used to be appended twice, which
      # sent the same entity to the datastore two times in one batch.
      entities.append(run_result)
    result_summary.abandoned_ts = now
    result_summary.modified_ts = now

    # Start the writes, enqueue the notification, then verify the writes.
    futures = ndb.put_multi_async(entities)
    _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()
    return True, was_running
예제 #8
0
def _reap_task(to_run_key, request, bot_id, bot_version, bot_dimensions):
    """Reaps a task for a bot and stores the resulting entities.

    Inside one transaction, the TaskToRun is dequeued, a new run result is
    created for `bot_id` and the TaskResultSummary is synced from it.

    Returns:
      The new run result when the task was reaped. None when the TaskToRun is
      missing or not reapable, when the summary already carries this bot's id,
      or when the transaction raised CommitError.
    """
    assert bot_id, bot_id
    expected_request_key = task_to_run.task_to_run_key_to_request_key(to_run_key)
    assert request.key == expected_request_key
    result_summary_key = task_pack.request_key_to_result_summary_key(request.key)

    now = utils.utcnow()

    def run():
        # 2 GET, 1 PUT at the end.
        pending_to_run = to_run_key.get_async()
        pending_summary = result_summary_key.get_async()
        to_run = pending_to_run.get_result()
        if not (to_run and to_run.is_reapable):
            # Let the second GET finish before leaving the transaction.
            pending_summary.wait()
            return None
        summary = pending_summary.get_result()
        if summary.bot_id == bot_id:
            # This means two things, first it's a retry, second it's that the first
            # try failed and the retry is being reaped by the same bot. Deny that, as
            # the bot may be deeply broken and could be in a killing spree.
            return None
        to_run.queue_number = None
        next_try = (summary.try_number or 0) + 1
        run_result = task_result.new_run_result(
            request, next_try, bot_id, bot_version, bot_dimensions
        )
        run_result.modified_ts = now
        summary.set_from_run_result(run_result, request)
        ndb.put_multi([to_run, run_result, summary])
        return run_result

    # The bot will reap the next available task in case of failure, no big deal.
    try:
        reaped = datastore_utils.transaction(run, retries=0)
    except datastore_utils.CommitError:
        return None
    if reaped:
        task_to_run.set_lookup_cache(to_run_key, False)
    return reaped
예제 #9
0
def cancel_task(request, result_key):
    """Cancels a task if possible.

    Ensures that the associated TaskToRun is canceled and updates the
    TaskResultSummary accordingly. A key of kind 'TaskRunResult' is first
    converted to the key of its TaskResultSummary.

    Warning: ACL check must have been done before.

    Returns:
      tuple(ok, was_running) when the transaction went through; `ok` is False
      if the summary says it cannot be canceled. An error message string is
      returned instead when the transaction raises CommitError.
    """
    to_run_key = task_to_run.request_to_task_to_run_key(request)
    summary_key = result_key
    if summary_key.kind() == 'TaskRunResult':
        summary_key = task_pack.run_result_key_to_result_summary_key(summary_key)
    now = utils.utcnow()

    def run():
        to_run, summary = ndb.get_multi((to_run_key, summary_key))
        was_running = summary.state == task_result.State.RUNNING
        if summary.can_be_canceled:
            # Dequeue the TaskToRun and flag the summary as canceled.
            to_run.queue_number = None
            summary.state = task_result.State.CANCELED
            summary.abandoned_ts = now
            summary.modified_ts = now

            # Start both writes, enqueue the notification, then make sure the
            # writes succeeded before leaving the transaction.
            pending = ndb.put_multi_async((to_run, summary))
            _maybe_pubsub_notify_via_tq(summary, request)
            for put_future in pending:
                put_future.check_success()
            return True, was_running
        return False, was_running

    try:
        outcome = datastore_utils.transaction(run)
    except datastore_utils.CommitError as e:
        packed = task_pack.pack_result_summary_key(summary_key)
        return 'Failed killing task %s: %s' % (packed, e)
    ok, was_running = outcome
    # Add it to the negative cache.
    task_to_run.set_lookup_cache(to_run_key, False)
    # TODO(maruel): Add stats.
    return ok, was_running
예제 #10
0
 def test_set_lookup_cache(self):
     """Checks that set_lookup_cache() flips what the lookup cache reports."""
     dimensions = {u"OS": u"Windows-3.1.1"}
     to_run = _gen_new_task_to_run(properties=dict(dimensions=dimensions))

     def is_taken():
         return task_to_run._lookup_cache_is_taken(to_run.key)

     # Nothing was stored yet: the entry is reported as not taken.
     self.assertEqual(False, is_taken())
     # Storing True keeps it reported as not taken.
     task_to_run.set_lookup_cache(to_run.key, True)
     self.assertEqual(False, is_taken())
     # Storing False makes it reported as taken.
     task_to_run.set_lookup_cache(to_run.key, False)
     self.assertEqual(True, is_taken())
     # Storing True again reverts it.
     task_to_run.set_lookup_cache(to_run.key, True)
     self.assertEqual(False, is_taken())
예제 #11
0
 def test_set_lookup_cache(self):
     """Checks that set_lookup_cache() flips what the lookup cache reports."""
     properties = dict(dimensions={u'OS': u'Windows-3.1.1'})
     to_run = _gen_new_task_to_run(properties=properties)

     # Nothing was stored yet: the entry is reported as not taken.
     self.assertEqual(False, task_to_run._lookup_cache_is_taken(to_run.key))

     # Each step stores a value, then checks what the lookup reports: storing
     # True yields "not taken", storing False yields "taken".
     steps = ((True, False), (False, True), (True, False))
     for stored, expected_taken in steps:
         task_to_run.set_lookup_cache(to_run.key, stored)
         self.assertEqual(
             expected_taken, task_to_run._lookup_cache_is_taken(to_run.key))
예제 #12
0
 def test_set_lookup_cache(self):
   """Checks that set_lookup_cache() flips what the lookup cache reports."""
   dimensions = {u'OS': u'Windows-3.1.1', u'pool': u'default'}
   to_run = _gen_new_task_to_run(properties={'dimensions': dimensions})

   def is_taken():
     return task_to_run._lookup_cache_is_taken(to_run.key)

   # Nothing was stored yet: the entry is reported as not taken.
   self.assertEqual(False, is_taken())
   # Storing True keeps it reported as not taken.
   task_to_run.set_lookup_cache(to_run.key, True)
   self.assertEqual(False, is_taken())
   # Storing False makes it reported as taken.
   task_to_run.set_lookup_cache(to_run.key, False)
   self.assertEqual(True, is_taken())
   # Storing True again reverts it.
   task_to_run.set_lookup_cache(to_run.key, True)
   self.assertEqual(False, is_taken())
예제 #13
0
 def test_set_lookup_cache(self):
     """Checks that set_lookup_cache() flips what the lookup cache reports."""
     dimensions = {u'OS': u'Windows-3.1.1', u'pool': u'default'}
     to_run = _gen_new_task_to_run(properties={'dimensions': dimensions})

     # Nothing was stored yet: the entry is reported as not taken.
     self.assertEqual(False, task_to_run._lookup_cache_is_taken(to_run.key))

     # Each step stores a value, then checks what the lookup reports: storing
     # True yields "not taken", storing False yields "taken".
     steps = ((True, False), (False, True), (True, False))
     for stored, expected_taken in steps:
         task_to_run.set_lookup_cache(to_run.key, stored)
         self.assertEqual(
             expected_taken, task_to_run._lookup_cache_is_taken(to_run.key))
예제 #14
0
def _reap_task(bot_dimensions, bot_version, to_run_key, request):
  """Reaps a task and insert the results entity.

  The bot id is taken from the first value of the u'id' dimension. The
  TaskToRun is put in the negative cache before the transaction runs; when
  set_lookup_cache() returns a falsy value, nothing is attempted.

  Args:
    bot_dimensions: dimensions of the bot; must contain a u'id' entry.
    bot_version: version of the bot, forwarded to the new run result.
    to_run_key: key of the TaskToRun to reap.
    request: request entity that `to_run_key` belongs to.

  Returns:
    (TaskRunResult, SecretBytes) if successful, (None, None) otherwise.
    SecretBytes is None when the task slice has no secret bytes.
  """
  assert request.key == task_to_run.task_to_run_key_to_request_key(to_run_key)
  result_summary_key = task_pack.request_key_to_result_summary_key(request.key)
  bot_id = bot_dimensions[u'id'][0]

  now = utils.utcnow()
  # Log before the task id in case the function fails in a bad state where the
  # DB TX ran but the reply never comes to the bot. This is the worst case as
  # this leads to a task that results in BOT_DIED without ever starting. This
  # case is specifically handled in cron_handle_bot_died().
  logging.info(
      '_reap_task(%s)', task_pack.pack_result_summary_key(result_summary_key))

  def run():
    # 3 GET, 1 PUT at the end.
    to_run_future = to_run_key.get_async()
    result_summary_future = result_summary_key.get_async()
    to_run = to_run_future.get_result()
    if not to_run:
      # Must be checked before to_run.task_slice_index is read below, otherwise
      # a missing entity raises AttributeError instead of being reported.
      result_summary = result_summary_future.get_result()
      logging.error('Missing TaskToRun?\n%s', result_summary.task_id)
      return None, None
    t = request.task_slice(to_run.task_slice_index)
    if t.properties.has_secret_bytes:
      secret_bytes_future = request.secret_bytes_key.get_async()
    result_summary = result_summary_future.get_result()
    orig_summary_state = result_summary.state
    secret_bytes = None
    if t.properties.has_secret_bytes:
      secret_bytes = secret_bytes_future.get_result()
    if not to_run.is_reapable:
      logging.info('%s is not reapable', result_summary.task_id)
      return None, None
    if result_summary.bot_id == bot_id:
      # This means two things, first it's a retry, second it's that the first
      # try failed and the retry is being reaped by the same bot. Deny that, as
      # the bot may be deeply broken and could be in a killing spree.
      # TODO(maruel): Allow retry for bot locked task using 'id' dimension.
      logging.warning(
          '%s can\'t retry its own internal failure task',
          result_summary.task_id)
      return None, None
    to_run.queue_number = None
    run_result = task_result.new_run_result(
        request, to_run, bot_id, bot_version, bot_dimensions)
    # Upon bot reap, both .started_ts and .modified_ts matches. They differ on
    # the first ping.
    run_result.started_ts = now
    run_result.modified_ts = now
    result_summary.set_from_run_result(run_result, request)
    ndb.put_multi([to_run, run_result, result_summary])
    # Only notify when the summary state actually changed.
    if result_summary.state != orig_summary_state:
      _maybe_pubsub_notify_via_tq(result_summary, request)
    return run_result, secret_bytes

  # Add it to the negative cache *before* running the transaction. This will
  # inhibit concurrently readers to try to reap this task. The downside is if
  # this request fails in the middle of the transaction, the task may stay
  # unreapable for up to 15 seconds.
  if not task_to_run.set_lookup_cache(to_run_key, False):
    logging.debug('hit negative cache')
    return None, None

  try:
    run_result, secret_bytes = datastore_utils.transaction(run, retries=0)
  except datastore_utils.CommitError:
    # The challenge here is that the transaction may have failed because:
    # - The DB had an hickup and the TaskToRun, TaskRunResult and
    #   TaskResultSummary haven't been updated.
    # - The entities had been updated by a concurrent transaction on another
    #   handler so it was not reapable anyway. This does cause exceptions as
    #   both GET returns the TaskToRun.queue_number != None but only one succeed
    #   at the PUT.
    #
    # In the first case, we may want to reset the negative cache, while we don't
    # want to in the later case. The trade off are one of:
    # - negative cache is incorrectly set, so the task is not reapable for 15s
    # - resetting the negative cache would cause even more contention
    #
    # We chose the first one here for now, as the when the DB starts misbehaving
    # and the index becomes stale, it means the DB is *already* not in good
    # shape, so it is preferable to not put more stress on it, and skipping a
    # few tasks for 15s may even actively help the DB to stabilize.
    logging.info('CommitError; reaping failed')
    # The bot will reap the next available task in case of failure, no big deal.
    run_result = None
    secret_bytes = None
  return run_result, secret_bytes
예제 #15
0
def _expire_task(to_run_key, request):
  """Expires a TaskResultSummary and unschedules the TaskToRun.

  This function is only meant to process PENDING tasks.

  If a follow up TaskSlice is available, reenqueue a new TaskToRun instead of
  expiring the TaskResultSummary.

  Args:
    to_run_key: key of the TaskToRun to dequeue.
    request: request entity; provides the task slices and the key used to
        derive the TaskResultSummary key.

  Returns:
    tuple(TaskResultSummary, TaskToRun). The second item is the newly enqueued
    TaskToRun when a following TaskSlice had capacity, None otherwise.
    (None, None) when the TaskToRun is missing or not reapable, or when the
    transaction raised CommitError.
  """
  # Look if the TaskToRun is reapable once before doing the check inside the
  # transaction. This reduces the likelihood of failing this check inside the
  # transaction, which is an order of magnitude more costly.
  # A missing entity is treated like a non reapable one, as the transaction
  # below does, instead of failing with AttributeError on None.
  precheck = to_run_key.get()
  if not precheck or not precheck.is_reapable:
    logging.info('Not reapable anymore')
    return None, None

  result_summary_key = task_pack.request_key_to_result_summary_key(request.key)
  now = utils.utcnow()

  def run():
    # 2 concurrent GET, one PUT. Optionally with an additional serialized GET.
    to_run_future = to_run_key.get_async()
    result_summary_future = result_summary_key.get_async()
    to_run = to_run_future.get_result()
    if not to_run or not to_run.is_reapable:
      # Let the second GET finish before leaving the transaction.
      result_summary_future.get_result()
      return None, None

    # In any case, dequeue the TaskToRun.
    to_run.queue_number = None
    result_summary = result_summary_future.get_result()
    to_put = [to_run, result_summary]
    # Check if there's a TaskSlice fallback that could be reenqueued.
    new_to_run = None
    index = result_summary.current_task_slice+1
    while index < request.num_task_slices:
      dimensions = request.task_slice(index).properties.dimensions
      if _has_capacity(dimensions):
        # Enqueue a new TasktoRun for this next TaskSlice, it has capacity!
        new_to_run = task_to_run.new_task_to_run(request, 1, index)
        result_summary.current_task_slice = index
        to_put.append(new_to_run)
        break
      index += 1

    if not new_to_run:
      # There's no fallback, giving up.
      if result_summary.try_number:
        # It's a retry that is being expired, i.e. the first try had BOT_DIED.
        # Keep the old state. That requires an additional pipelined GET but that
        # shouldn't be the common case.
        run_result = result_summary.run_result_key.get()
        result_summary.set_from_run_result(run_result, request)
      else:
        result_summary.state = task_result.State.EXPIRED
      result_summary.abandoned_ts = now
    result_summary.modified_ts = now

    # Start the writes, enqueue the notification, then verify the writes.
    futures = ndb.put_multi_async(to_put)
    _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()

    return result_summary, new_to_run

  # Add it to the negative cache *before* running the transaction. Either way
  # the task was already reaped or the task is correctly expired and not
  # reapable.
  task_to_run.set_lookup_cache(to_run_key, False)

  # It'll be caught by next cron job execution in case of failure.
  try:
    res, r = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    res = None
    r = None
  if res:
    # NOTE(review): this also runs when a new TaskToRun was reenqueued (r is
    # set), i.e. when the summary was not expired; confirm this is intended.
    logging.info(
        'Expired %s', task_pack.pack_result_summary_key(result_summary_key))
    ts_mon_metrics.on_task_completed(res)
  return res, r
예제 #16
0
def _handle_dead_bot(run_result_key):
    """Handles TaskRunResult where its bot has stopped showing sign of life.

    Transactionally updates the entities depending on the state of this task.
    The task may be retried automatically, canceled or left alone.

    Returns:
      True if the task was retried, False if the task was killed, None if no
      action was done on the summary or the transaction failed to commit.
    """
    result_summary_key = task_pack.run_result_key_to_result_summary_key(run_result_key)
    request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
    # Fetch the request in the background while the other values are computed.
    request_future = request_key.get_async()
    now = utils.utcnow()
    server_version = utils.get_app_version()
    packed = task_pack.pack_run_result_key(run_result_key)
    request = request_future.get_result()
    to_run_key = task_to_run.request_to_task_to_run_key(request)

    def run():
        """Returns tuple(task_is_retried or None, bot_id)."""
        # Do one GET, one PUT at the end.
        keys = (run_result_key, result_summary_key, to_run_key)
        run_result, result_summary, to_run = ndb.get_multi(keys)
        if run_result.state != task_result.State.RUNNING:
            # It was updated already or not updating last. Likely DB index was stale.
            return None, run_result.bot_id

        run_result.signal_server_version(server_version)
        run_result.modified_ts = now

        def mark_bot_died():
            # Flags the run result as an internal failure due to a dead bot.
            run_result.state = task_result.State.BOT_DIED
            run_result.internal_failure = True
            run_result.abandoned_ts = now

        notify = False
        if result_summary.try_number != run_result.try_number:
            # Not updating correct run_result, cancel it without touching
            # result_summary.
            mark_bot_died()
            to_put = (run_result,)
            task_is_retried = None
        elif result_summary.try_number == 1 and now < request.expiration_ts:
            # Retry it.
            to_run.queue_number = task_to_run.gen_queue_number(request)
            mark_bot_died()
            # Do not sync data from run_result to result_summary, since the task is
            # being retried.
            result_summary.reset_to_pending()
            result_summary.modified_ts = now
            to_put = (run_result, result_summary, to_run)
            task_is_retried = True
        else:
            # Cancel it, there was more than one try or the task expired in the
            # meantime.
            mark_bot_died()
            result_summary.set_from_run_result(run_result, request)
            to_put = (run_result, result_summary)
            notify = True
            task_is_retried = False

        # Start the writes, optionally enqueue the notification, then verify
        # the writes.
        pending = ndb.put_multi_async(to_put)
        if notify:
            _maybe_pubsub_notify_via_tq(result_summary, request)
        for put_future in pending:
            put_future.check_success()

        return task_is_retried, run_result.bot_id

    try:
        task_is_retried, bot_id = datastore_utils.transaction(run)
    except datastore_utils.CommitError:
        task_is_retried, bot_id = None, None

    if task_is_retried is None:
        logging.info("Ignored %s", packed)
        return task_is_retried

    # The cache is set with True on retry and with False when killed.
    task_to_run.set_lookup_cache(to_run_key, task_is_retried)
    if task_is_retried:
        logging.info("Retried %s", packed)
    else:
        # NOTE(review): bot_id is run_result.bot_id; [0] keeps only its first
        # item. Confirm that this is what the stats entry expects.
        stats.add_run_entry(
            "run_bot_died",
            run_result_key,
            bot_id=bot_id[0],
            dimensions=request.properties.dimensions,
            user=request.user,
        )
    return task_is_retried
예제 #17
0
def _handle_dead_bot(run_result_key):
    """Handles TaskRunResult where its bot has stopped showing sign of life.

    Transactionally updates the entities depending on the state of this task.
    The task may be retried automatically, canceled or left alone.

    Returns:
      True if the task was retried, False if the task was killed, None if no
      action was done on the summary or the transaction failed to commit.
    """
    result_summary_key = task_pack.run_result_key_to_result_summary_key(
        run_result_key)
    request_key = task_pack.result_summary_key_to_request_key(
        result_summary_key)
    # Fetch the request in the background while the other values are computed.
    request_future = request_key.get_async()
    now = utils.utcnow()
    server_version = utils.get_app_version()
    packed = task_pack.pack_run_result_key(run_result_key)
    request = request_future.get_result()
    to_run_key = task_to_run.request_to_task_to_run_key(request)

    def run():
        """Returns tuple(task_is_retried or None, bot_id)."""
        # Do one GET, one PUT at the end.
        keys = (run_result_key, result_summary_key, to_run_key)
        run_result, result_summary, to_run = ndb.get_multi(keys)
        if run_result.state != task_result.State.RUNNING:
            # It was updated already or not updating last. Likely DB index was stale.
            return None, run_result.bot_id
        ping_deadline = now - task_result.BOT_PING_TOLERANCE
        if run_result.modified_ts > ping_deadline:
            # The query index IS stale.
            return None, run_result.bot_id

        run_result.signal_server_version(server_version)
        run_result.modified_ts = now

        def mark_bot_died():
            # Flags the run result as an internal failure due to a dead bot.
            run_result.state = task_result.State.BOT_DIED
            run_result.internal_failure = True
            run_result.abandoned_ts = now

        notify = False
        if result_summary.try_number != run_result.try_number:
            # Not updating correct run_result, cancel it without touching
            # result_summary.
            mark_bot_died()
            to_put = (run_result, )
            task_is_retried = None
        elif result_summary.try_number == 1 and now < request.expiration_ts:
            # Retry it.
            to_run.queue_number = task_to_run.gen_queue_number(request)
            mark_bot_died()
            # Do not sync data from run_result to result_summary, since the task is
            # being retried.
            result_summary.reset_to_pending()
            result_summary.modified_ts = now
            to_put = (run_result, result_summary, to_run)
            task_is_retried = True
        else:
            # Cancel it, there was more than one try or the task expired in the
            # meantime.
            mark_bot_died()
            result_summary.set_from_run_result(run_result, request)
            to_put = (run_result, result_summary)
            notify = True
            task_is_retried = False

        # Start the writes, optionally enqueue the notification, then verify
        # the writes.
        pending = ndb.put_multi_async(to_put)
        if notify:
            _maybe_pubsub_notify_via_tq(result_summary, request)
        for put_future in pending:
            put_future.check_success()

        return task_is_retried, run_result.bot_id

    try:
        task_is_retried, bot_id = datastore_utils.transaction(run)
    except datastore_utils.CommitError:
        task_is_retried, bot_id = None, None

    if task_is_retried is None:
        logging.info('Ignored %s', packed)
        return task_is_retried

    # The cache is set with True on retry and with False when killed.
    task_to_run.set_lookup_cache(to_run_key, task_is_retried)
    if task_is_retried:
        logging.info('Retried %s', packed)
    else:
        # NOTE(review): bot_id is run_result.bot_id; [0] keeps only its first
        # item. Confirm that this is what the stats entry expects.
        stats.add_run_entry('run_bot_died',
                            run_result_key,
                            bot_id=bot_id[0],
                            dimensions=request.properties.dimensions,
                            user=request.user)
    return task_is_retried