def _reap_task(to_run_key, request, bot_id, bot_version, bot_dimensions):
    """Claims the task behind `to_run_key` for a bot and stores the run entity.

    Both entities are read and all three are written inside a single
    transaction that is attempted once (retries=0).

    Returns:
      The new TaskRunResult on success; None when the task could not be reaped
      (missing, not reapable, refused for this bot, or the commit failed).
    """
    assert bot_id, bot_id
    assert request.key == task_to_run.task_to_run_key_to_request_key(
        to_run_key)
    summary_key = task_pack.request_key_to_result_summary_key(request.key)
    reaped_at = utils.utcnow()

    def run():
        # Two reads issued up front, one batched write at the very end.
        pending_entry = to_run_key.get_async()
        pending_summary = summary_key.get_async()
        entry = pending_entry.get_result()
        summary = pending_summary.get_result()

        if not entry:
            logging.error('Missing TaskToRun?\n%s', summary.task_id)
            return None

        if not entry.is_reapable:
            logging.info('%s is not reapable', summary.task_id)
            return None

        if summary.bot_id == bot_id:
            # The summary already names this bot: this is a retry of a try
            # that failed on the very same bot. Refuse it, since the bot may
            # be badly broken and would keep killing tasks.
            logging.warning(
                '%s can\'t retry its own internal failure task',
                summary.task_id)
            return None

        # Setting queue_number to None takes the entry out of the queue.
        entry.queue_number = None
        attempt = (summary.try_number or 0) + 1
        reaped = task_result.new_run_result(
            request, attempt, bot_id, bot_version, bot_dimensions)
        reaped.modified_ts = reaped_at
        summary.set_from_run_result(reaped, request)
        ndb.put_multi([entry, reaped, summary])
        return reaped

    # A failed commit is not fatal: the bot simply reaps the next available
    # task.
    try:
        outcome = datastore_utils.transaction(run, retries=0)
    except datastore_utils.CommitError:
        return None

    if outcome:
        task_to_run.set_lookup_cache(to_run_key, False)
    return outcome
def _reap_task(to_run_key, request, bot_id, bot_version, bot_dimensions):
    """Claims the task behind `to_run_key` for a bot and stores the run entity.

    The reads and the batched write happen inside a single transaction that is
    attempted once (retries=0).

    Returns:
      The new TaskRunResult on success; None when the task could not be reaped
      (missing, not reapable, refused for this bot, or the commit failed).
    """
    assert bot_id, bot_id
    assert request.key == task_to_run.task_to_run_key_to_request_key(to_run_key)
    summary_key = task_pack.request_key_to_result_summary_key(request.key)
    reaped_at = utils.utcnow()

    def run():
        # Two reads issued up front, one batched write at the very end.
        pending_entry = to_run_key.get_async()
        pending_summary = summary_key.get_async()

        entry = pending_entry.get_result()
        reapable = entry.is_reapable if entry else False
        if not reapable:
            # Still wait on the second read so no future is left outstanding
            # when the transaction callback returns.
            pending_summary.wait()
            return None

        summary = pending_summary.get_result()
        if summary.bot_id == bot_id:
            # The summary already names this bot: this is a retry of a try
            # that failed on the very same bot. Refuse it, since the bot may
            # be badly broken and would keep killing tasks.
            return None

        # Setting queue_number to None takes the entry out of the queue.
        entry.queue_number = None
        attempt = (summary.try_number or 0) + 1
        reaped = task_result.new_run_result(
            request, attempt, bot_id, bot_version, bot_dimensions)
        reaped.modified_ts = reaped_at
        summary.set_from_run_result(reaped, request)
        ndb.put_multi([entry, reaped, summary])
        return reaped

    # A failed commit is not fatal: the bot simply reaps the next available
    # task.
    try:
        outcome = datastore_utils.transaction(run, retries=0)
    except datastore_utils.CommitError:
        return None

    if outcome:
        task_to_run.set_lookup_cache(to_run_key, False)
    return outcome
def test_task_to_run_key_to_request_key(self):
    # Round trip: request -> TaskToRun key -> request key must give back the
    # key of the request we started from.
    req = task_request.make_request(_gen_request(), True)
    to_run_key = task_to_run.request_to_task_to_run_key(req)
    round_tripped = task_to_run.task_to_run_key_to_request_key(to_run_key)
    self.assertEqual(req.key, round_tripped)
def test_task_to_run_key_to_request_key(self):
    # Round trip: request -> TaskToRun key -> request key must give back the
    # key of the request we started from.
    req = self.mkreq(1, _gen_request())
    to_run_key = task_to_run.request_to_task_to_run_key(req, 1, 0)
    round_tripped = task_to_run.task_to_run_key_to_request_key(to_run_key)
    self.assertEqual(req.key, round_tripped)
def test_task_to_run_key_to_request_key(self):
    # Round trip: request -> TaskToRun key -> request key must give back the
    # key of the request we started from.
    req = task_request.make_request(_gen_request_data())
    to_run_key = task_to_run.request_to_task_to_run_key(req)
    round_tripped = task_to_run.task_to_run_key_to_request_key(to_run_key)
    self.assertEqual(req.key, round_tripped)
def _reap_task(bot_dimensions, bot_version, to_run_key, request):
    """Reaps a task and inserts the results entity.

    Args:
      bot_dimensions: bot dimensions; the bot id is read from
          bot_dimensions[u'id'][0].
      bot_version: bot version, passed through to the new run result.
      to_run_key: key of the TaskToRun to reap; must map back to request.key.
      request: the task request that owns to_run_key.

    Returns:
      (TaskRunResult, SecretBytes) if successful, (None, None) otherwise.
      The second item is None when the task slice has no secret bytes.
    """
    assert request.key == task_to_run.task_to_run_key_to_request_key(to_run_key)
    result_summary_key = task_pack.request_key_to_result_summary_key(request.key)
    bot_id = bot_dimensions[u'id'][0]

    now = utils.utcnow()
    # Log the task id up front in case the function fails in a bad state where
    # the DB transaction ran but the reply never reaches the bot. This is the
    # worst case as it leads to a task that results in BOT_DIED without ever
    # starting. This case is specifically handled in cron_handle_bot_died().
    logging.info(
        '_reap_task(%s)', task_pack.pack_result_summary_key(result_summary_key))

    def run():
        # 3 GET, 1 PUT at the end.
        to_run_future = to_run_key.get_async()
        result_summary_future = result_summary_key.get_async()
        to_run = to_run_future.get_result()
        if not to_run:
            # This check has to come before the first dereference of to_run
            # (task_slice_index below); otherwise a missing entity raises
            # AttributeError instead of reporting a clean failure.
            result_summary = result_summary_future.get_result()
            logging.error('Missing TaskToRun?\n%s', result_summary.task_id)
            return None, None

        t = request.task_slice(to_run.task_slice_index)
        # Start the optional third GET before blocking on the summary so the
        # reads overlap.
        secret_bytes_future = None
        if t.properties.has_secret_bytes:
            secret_bytes_future = request.secret_bytes_key.get_async()
        result_summary = result_summary_future.get_result()
        orig_summary_state = result_summary.state
        secret_bytes = None
        if secret_bytes_future is not None:
            secret_bytes = secret_bytes_future.get_result()

        if not to_run.is_reapable:
            logging.info('%s is not reapable', result_summary.task_id)
            return None, None
        if result_summary.bot_id == bot_id:
            # This means two things: first it's a retry, second the first try
            # failed and the retry is being reaped by the same bot. Deny that,
            # as the bot may be deeply broken and could be on a killing spree.
            # TODO(maruel): Allow retry for bot locked task using 'id'
            # dimension.
            logging.warning(
                '%s can\'t retry its own internal failure task',
                result_summary.task_id)
            return None, None

        to_run.queue_number = None
        run_result = task_result.new_run_result(
            request, to_run, bot_id, bot_version, bot_dimensions)
        # Upon bot reap, both .started_ts and .modified_ts match. They differ
        # on the first ping.
        run_result.started_ts = now
        run_result.modified_ts = now
        result_summary.set_from_run_result(run_result, request)
        ndb.put_multi([to_run, run_result, result_summary])
        if result_summary.state != orig_summary_state:
            _maybe_pubsub_notify_via_tq(result_summary, request)
        return run_result, secret_bytes

    # Add it to the negative cache *before* running the transaction. This
    # inhibits concurrent readers from trying to reap this task. The downside
    # is that if this request fails in the middle of the transaction, the task
    # may stay unreapable for up to 15 seconds.
    if not task_to_run.set_lookup_cache(to_run_key, False):
        logging.debug('hit negative cache')
        return None, None

    try:
        run_result, secret_bytes = datastore_utils.transaction(run, retries=0)
    except datastore_utils.CommitError:
        # The challenge here is that the transaction may have failed because:
        # - The DB had a hiccup and the TaskToRun, TaskRunResult and
        #   TaskResultSummary haven't been updated.
        # - The entities had been updated by a concurrent transaction on
        #   another handler so it was not reapable anyway. This does cause
        #   exceptions as both GETs return TaskToRun.queue_number != None but
        #   only one succeeds at the PUT.
        #
        # In the first case, we may want to reset the negative cache, while we
        # don't want to in the latter case. The trade-offs are one of:
        # - negative cache is incorrectly set, so the task is not reapable for
        #   15s
        # - resetting the negative cache would cause even more contention
        #
        # We chose the first one here for now: when the DB starts misbehaving
        # and the index becomes stale, it means the DB is *already* not in
        # good shape, so it is preferable to not put more stress on it, and
        # skipping a few tasks for 15s may even actively help the DB to
        # stabilize.
        logging.info('CommitError; reaping failed')
        # The bot will reap the next available task in case of failure, no big
        # deal.
        run_result = None
        secret_bytes = None
    return run_result, secret_bytes