def test_pack_run_result_key(self):
  request_key = task_pack.unpack_request_key('11')
  result_summary_key = task_pack.request_key_to_result_summary_key(
      request_key)
  run_result_key = task_pack.result_summary_key_to_run_result_key(
      result_summary_key, 1)
  self.assertEqual('111', task_pack.pack_run_result_key(run_result_key))

  with self.assertRaises(AssertionError):
    task_pack.pack_run_result_key(result_summary_key)
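# A hedged sketch, derived from the test above: the packed run result id is
# the packed request id with the try number appended, so the same request with
# try number 2 should pack to '112'. Only functions already exercised by the
# test are used here.
request_key = task_pack.unpack_request_key('11')
summary_key = task_pack.request_key_to_result_summary_key(request_key)
run_key = task_pack.result_summary_key_to_run_result_key(summary_key, 2)
assert task_pack.pack_run_result_key(run_key) == '112'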
def make_task_request(
    self, service_account, service_account_token, try_number=1):
  now = utils.utcnow()
  args = {
    'created_ts': now,
    'manual_tags': [u'tag:1'],
    'name': 'Request with %s' % service_account,
    'priority': 50,
    'task_slices': [
      task_request.TaskSlice(
          expiration_secs=60,
          properties=task_request.TaskProperties(
              command=[u'command1'],
              dimensions_data={u'pool': [u'default']},
              execution_timeout_secs=24*60*60)),
    ],
    'user': '******',
  }
  req = task_request.TaskRequest(**args)
  task_request.init_new_request(req, True)
  req.key = task_request.new_request_key()
  req.service_account = service_account
  req.service_account_token = service_account_token
  req.put()
  summary_key = task_pack.request_key_to_result_summary_key(req.key)
  run_result_key = task_pack.result_summary_key_to_run_result_key(
      summary_key, try_number)
  return task_pack.pack_run_result_key(run_result_key)
def _cmd_run(self, request, run_result_key, bot_id):
  # Only one of 'command' or 'inputs_ref' can be set.
  out = {
    'cmd': 'run',
    'manifest': {
      'bot_id': bot_id,
      'command':
          request.properties.commands[0]
          if request.properties.commands else None,
      'data': request.properties.data,
      'dimensions': request.properties.dimensions,
      'env': request.properties.env,
      'extra_args': request.properties.extra_args,
      'grace_period': request.properties.grace_period_secs,
      'hard_timeout': request.properties.execution_timeout_secs,
      'host': utils.get_versioned_hosturl(),
      'io_timeout': request.properties.io_timeout_secs,
      'inputs_ref': request.properties.inputs_ref,
      'task_id': task_pack.pack_run_result_key(run_result_key),
    },
  }
  self.send_response(utils.to_json_encodable(out))
def cron_handle_bot_died(host):
  """Aborts or retries stale TaskRunResults whose bot stopped sending updates.

  If the task was at its first try, it'll be retried. Otherwise the task will
  be canceled.

  Returns:
  - task IDs killed
  - number of tasks retried
  - number of tasks ignored
  """
  ignored = 0
  killed = []
  retried = 0
  try:
    for run_result_key in task_result.yield_run_result_keys_with_dead_bot():
      result = _handle_dead_bot(run_result_key)
      if result is True:
        retried += 1
      elif result is False:
        killed.append(task_pack.pack_run_result_key(run_result_key))
      else:
        ignored += 1
  finally:
    if killed:
      logging.error(
          'BOT_DIED!\n%d tasks:\n%s',
          len(killed),
          '\n'.join('  %s/user/task/%s' % (host, i) for i in killed))
    logging.info(
        'Killed %d; retried %d; ignored: %d', len(killed), retried, ignored)
  # These are returned primarily for unit testing verification.
  return killed, retried, ignored
def cron_handle_bot_died(host):
  """Aborts or retries stale TaskRunResults whose bot stopped sending updates.

  If the task was at its first try, it'll be retried. Otherwise the task will
  be canceled.
  """
  ignored = 0
  killed = []
  retried = 0
  try:
    for run_result_key in task_result.yield_run_result_keys_with_dead_bot():
      result = _handle_dead_bot(run_result_key)
      if result is True:
        retried += 1
      elif result is False:
        killed.append(task_pack.pack_run_result_key(run_result_key))
      else:
        ignored += 1
  finally:
    if killed:
      logging.error(
          'BOT_DIED!\n%d tasks:\n%s',
          len(killed),
          '\n'.join('  %s/user/task/%s' % (host, i) for i in killed))
    # TODO(maruel): Use stats_framework.
    logging.info(
        'Killed %d; retried %d; ignored: %d', len(killed), retried, ignored)
  return killed, retried, ignored
def cron_handle_bot_died(host):
  """Aborts or retries stale TaskRunResults whose bot stopped sending updates.

  If the task was at its first try, it'll be retried. Otherwise the task will
  be canceled.
  """
  ignored = 0
  killed = []
  retried = 0
  try:
    for run_result_key in task_result.yield_run_result_keys_with_dead_bot():
      result = _handle_dead_bot(run_result_key)
      if result is True:
        retried += 1
      elif result is False:
        killed.append(task_pack.pack_run_result_key(run_result_key))
      else:
        ignored += 1
  finally:
    if killed:
      logging.error(
          "BOT_DIED!\n%d tasks:\n%s",
          len(killed),
          "\n".join("  https://%s/user/task/%s" % (host, i) for i in killed),
      )
    # TODO(maruel): Use stats_framework.
    logging.info(
        "Killed %d; retried %d; ignored: %d", len(killed), retried, ignored)
  return killed, retried, ignored
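# A hedged sketch of how a cron endpoint might invoke cron_handle_bot_died.
# The handler class, route and the task_scheduler module reference are
# assumptions, not from this section; host is only interpolated into the task
# URLs logged above.
import webapp2

class CronBotDiedHandler(webapp2.RequestHandler):
  def get(self):
    # webapp2's request.host_url carries the scheme and host of the request.
    task_scheduler.cron_handle_bot_died(self.request.host_url)
    self.response.out.write('Success.')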
def bot_kill_task(run_result_key, bot_id):
  """Terminates a task that is currently running, as an internal failure.

  Returns:
    An error message (str) if the task could not be terminated, None on
    success.
  """
  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request = task_pack.result_summary_key_to_request_key(
      result_summary_key).get()
  server_version = utils.get_app_version()
  now = utils.utcnow()
  packed = task_pack.pack_run_result_key(run_result_key)

  def run():
    run_result, result_summary = ndb.get_multi(
        (run_result_key, result_summary_key))
    if bot_id and run_result.bot_id != bot_id:
      return None, 'Bot %s sent task kill for task %s owned by bot %s' % (
          bot_id, packed, run_result.bot_id)

    if run_result.state == task_result.State.BOT_DIED:
      # Ignore this failure.
      return None, None

    run_result.signal_server_version(server_version)
    run_result.state = task_result.State.BOT_DIED
    run_result.internal_failure = True
    run_result.abandoned_ts = now
    run_result.modified_ts = now
    result_summary.set_from_run_result(run_result, None)

    futures = ndb.put_multi_async((run_result, result_summary))
    _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()

    return run_result, None

  try:
    run_result, msg = datastore_utils.transaction(run)
  except datastore_utils.CommitError as e:
    # At worst, the task will be tagged as BOT_DIED after BOT_PING_TOLERANCE
    # seconds passed on the next cron_handle_bot_died cron job.
    return 'Failed killing task %s: %s' % (packed, e)

  if run_result:
    stats.add_run_entry(
        'run_bot_died', run_result.key,
        bot_id=run_result.bot_id,
        dimensions=request.properties.dimensions,
        user=request.user)
  return msg
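# A hedged usage sketch, not from the source: killing a running task given its
# packed run id. task_pack.unpack_run_result_key is assumed available (it is
# used with request.parent_task_id elsewhere in this section).
def kill_by_task_id(task_id, bot_id):
  run_result_key = task_pack.unpack_run_result_key(task_id)
  error = bot_kill_task(run_result_key, bot_id)
  if error:
    logging.error('Failed to kill %s: %s', task_id, error)
  return error is None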
def task_result_to_rpc(entity, send_stats):
  """Returns a swarming_rpcs.TaskResult from a task_result.TaskResultSummary
  or task_result.TaskRunResult.
  """
  outputs_ref = (
      _ndb_to_rpc(swarming_rpcs.FilesRef, entity.outputs_ref)
      if entity.outputs_ref else None)
  cipd_pins = None
  if entity.cipd_pins:
    cipd_pins = swarming_rpcs.CipdPins(
        client_package=(
            _ndb_to_rpc(swarming_rpcs.CipdPackage,
                        entity.cipd_pins.client_package)
            if entity.cipd_pins.client_package else None),
        packages=[
            _ndb_to_rpc(swarming_rpcs.CipdPackage, pkg)
            for pkg in entity.cipd_pins.packages
        ] if entity.cipd_pins.packages else None)
  performance_stats = None
  if send_stats and entity.performance_stats.is_valid:
    def op(entity):
      if entity:
        return _ndb_to_rpc(swarming_rpcs.OperationStats, entity)

    performance_stats = _ndb_to_rpc(
        swarming_rpcs.PerformanceStats,
        entity.performance_stats,
        isolated_download=op(entity.performance_stats.isolated_download),
        isolated_upload=op(entity.performance_stats.isolated_upload))
  kwargs = {
    'bot_dimensions':
        _string_list_pairs_from_dict(entity.bot_dimensions or {}),
    'cipd_pins': cipd_pins,
    'outputs_ref': outputs_ref,
    'performance_stats': performance_stats,
    'state': swarming_rpcs.StateField(entity.state),
  }
  if entity.__class__ is task_result.TaskRunResult:
    kwargs['costs_usd'] = []
    if entity.cost_usd is not None:
      kwargs['costs_usd'].append(entity.cost_usd)
    kwargs['tags'] = []
    kwargs['user'] = None
    kwargs['run_id'] = entity.task_id
  else:
    assert entity.__class__ is task_result.TaskResultSummary, entity
    # This returns the right value for deduped tasks too.
    k = entity.run_result_key
    kwargs['run_id'] = task_pack.pack_run_result_key(k) if k else None
  return _ndb_to_rpc(swarming_rpcs.TaskResult, entity, **kwargs)
def _dedupe_result_summary(dupe_summary, result_summary, task_slice_index):
  """Copies the results from dupe_summary into result_summary."""
  # PerformanceStats is not copied over, since it's not relevant; nothing ran.
  _copy_summary(
      dupe_summary, result_summary,
      ('created_ts', 'modified_ts', 'name', 'user', 'tags'))
  # Zap irrelevant properties.
  result_summary.cost_saved_usd = dupe_summary.cost_usd
  result_summary.costs_usd = []
  result_summary.current_task_slice = task_slice_index
  result_summary.deduped_from = task_pack.pack_run_result_key(
      dupe_summary.run_result_key)
  # It is not possible to dedupe against a deduped task, so zap
  # properties_hash.
  result_summary.properties_hash = None
  result_summary.try_number = 0
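# _copy_summary itself is not shown in this section. A plausible sketch, under
# the assumption that the tuple argument is a skip list (the deduped summary
# keeps its own created_ts, name, etc. and copies everything else):
def _copy_summary(src, dst, skip_list):
  # Copy every non-computed datastore property not in skip_list; dst keeps
  # its own key.
  kwargs = {
    name: getattr(src, name)
    for name, prop in src._properties.items()
    if name not in skip_list and not isinstance(prop, ndb.ComputedProperty)
  }
  dst.populate(**kwargs)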
def _cmd_run(self, request, run_result_key, bot_id):
  out = {
    'cmd': 'run',
    'manifest': {
      'bot_id': bot_id,
      'command': request.properties.commands[0],
      'data': request.properties.data,
      'env': request.properties.env,
      'grace_period': request.properties.grace_period_secs,
      'host': utils.get_versioned_hosturl(),
      'hard_timeout': request.properties.execution_timeout_secs,
      'io_timeout': request.properties.io_timeout_secs,
      'task_id': task_pack.pack_run_result_key(run_result_key),
    },
  }
  self.send_response(out)
def _cmd_run(self, request, run_result_key, bot_id):
  cmd = None
  if request.properties.commands:
    cmd = request.properties.commands[0]
  elif request.properties.command:
    cmd = request.properties.command
  out = {
    'cmd': 'run',
    'manifest': {
      'bot_id': bot_id,
      'caches': [c.to_dict() for c in request.properties.caches],
      'cipd_input': {
        'client_package':
            request.properties.cipd_input.client_package.to_dict(),
        'packages': [
          p.to_dict() for p in request.properties.cipd_input.packages
        ],
        'server': request.properties.cipd_input.server,
      } if request.properties.cipd_input else None,
      'command': cmd,
      'dimensions': request.properties.dimensions,
      'env': request.properties.env,
      'extra_args': request.properties.extra_args,
      'grace_period': request.properties.grace_period_secs,
      'hard_timeout': request.properties.execution_timeout_secs,
      'host': utils.get_versioned_hosturl(),
      'io_timeout': request.properties.io_timeout_secs,
      'isolated': {
        'input': request.properties.inputs_ref.isolated,
        'namespace': request.properties.inputs_ref.namespace,
        'server': request.properties.inputs_ref.isolatedserver,
      } if request.properties.inputs_ref else None,
      'service_account': request.service_account,
      'task_id': task_pack.pack_run_result_key(run_result_key),
    },
  }
  self.send_response(utils.to_json_encodable(out))
def _cmd_run(self, request, run_result_key, bot_id):
  cmd = None
  if request.properties.commands:
    cmd = request.properties.commands[0]
  elif request.properties.command:
    cmd = request.properties.command
  out = {
    'cmd': 'run',
    'manifest': {
      'bot_id': bot_id,
      'command': cmd,
      'dimensions': request.properties.dimensions,
      'env': request.properties.env,
      'extra_args': request.properties.extra_args,
      'grace_period': request.properties.grace_period_secs,
      'hard_timeout': request.properties.execution_timeout_secs,
      'host': utils.get_versioned_hosturl(),
      'io_timeout': request.properties.io_timeout_secs,
      'inputs_ref': request.properties.inputs_ref,
      'packages': [{
        'package_name': p.package_name,
        'version': p.version,
      } for p in request.properties.packages],
      'task_id': task_pack.pack_run_result_key(run_result_key),
    },
  }
  self.send_response(utils.to_json_encodable(out))
def bot_update_task(
    run_result_key, bot_id, output, output_chunk_start, exit_code, duration,
    hard_timeout, io_timeout, cost_usd, outputs_ref, cipd_pins,
    performance_stats):
  """Updates a TaskRunResult and TaskResultSummary, along TaskOutputChunk.

  Arguments:
  - run_result_key: ndb.Key to TaskRunResult.
  - bot_id: Self advertised bot id to ensure it's the one expected.
  - output: Data to append to this command output.
  - output_chunk_start: Index of output in the stdout stream.
  - exit_code: Mark that this task completed.
  - duration: Time spent in seconds for this task, excluding overheads.
  - hard_timeout: Bool set if a hard timeout occurred.
  - io_timeout: Bool set if an I/O timeout occurred.
  - cost_usd: Cost in $USD of this task up to now.
  - outputs_ref: task_request.FilesRef instance or None.
  - cipd_pins: None or task_result.CipdPins.
  - performance_stats: task_result.PerformanceStats instance or None. Can only
        be set when the task is completing.

  Invalid states, these are flat out refused:
  - A command is updated after it already had an exit code assigned to it.

  Returns:
    TaskRunResult.state or None in case of failure.
  """
  assert output_chunk_start is None or isinstance(output_chunk_start, int)
  assert output is None or isinstance(output, str)
  if cost_usd is not None and cost_usd < 0.:
    raise ValueError('cost_usd must be None or greater than or equal to 0')
  if duration is not None and duration < 0.:
    raise ValueError('duration must be None or greater than or equal to 0')
  if (duration is None) != (exit_code is None):
    raise ValueError(
        'had unexpected duration; expected iff a command completes\n'
        'duration: %r; exit: %r' % (duration, exit_code))
  if performance_stats and duration is None:
    raise ValueError(
        'duration must be set when performance_stats is set\n'
        'duration: %s; performance_stats: %s' % (duration, performance_stats))

  packed = task_pack.pack_run_result_key(run_result_key)
  logging.debug(
      'bot_update_task(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
      packed, bot_id, len(output) if output else output, output_chunk_start,
      exit_code, duration, hard_timeout, io_timeout, cost_usd, outputs_ref,
      cipd_pins, performance_stats)

  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  request_future = request_key.get_async()
  server_version = utils.get_app_version()
  request = request_future.get_result()
  now = utils.utcnow()

  run = lambda: _bot_update_tx(
      run_result_key, bot_id, output, output_chunk_start, exit_code, duration,
      hard_timeout, io_timeout, cost_usd, outputs_ref, cipd_pins,
      performance_stats, now, result_summary_key, server_version, request)
  try:
    smry, run_result, error = datastore_utils.transaction(run)
  except datastore_utils.CommitError as e:
    logging.info('Got commit error: %s', e)
    # It is important that the caller correctly surface this error as the bot
    # will retry on HTTP 500.
    return None
  if smry and smry.state != task_result.State.RUNNING:
    # Take no chance and explicitly clear the ndb memcache entry. A very rare
    # race condition is observed where a stale version of the entities is kept
    # in memcache.
    ndb.get_context()._clear_memcache(
        [result_summary_key, run_result_key]).check_success()
  assert bool(error) != bool(run_result), (error, run_result)
  if error:
    logging.error('Task %s %s', packed, error)
    return None

  # Caller must retry if PubSub enqueue fails.
  if not _maybe_pubsub_notify_now(smry, request):
    return None
  if smry.state not in task_result.State.STATES_RUNNING:
    event_mon_metrics.send_task_event(smry)
    ts_mon_metrics.on_task_completed(smry)

  # Hack a bit to tell the bot what it needs to hear (see handler_bot.py).
  # It's kind of an ugly hack but the other option is to return the whole
  # run_result.
  if run_result.killing:
    return task_result.State.KILLED
  return run_result.state
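# A minimal usage sketch for bot_update_task above; all values are
# illustrative, the parameters follow the docstring. A None return means the
# update failed and the bot should retry it.
state = bot_update_task(
    run_result_key=run_result_key,
    bot_id='bot1',
    output='hello world\n',
    output_chunk_start=0,
    exit_code=0,
    duration=12.5,  # seconds, excluding overheads
    hard_timeout=False,
    io_timeout=False,
    cost_usd=0.001,
    outputs_ref=None,
    cipd_pins=None,
    performance_stats=None)
assert state is None or state == task_result.State.COMPLETED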
def _handle_dead_bot(run_result_key):
  """Handles a TaskRunResult whose bot has stopped showing signs of life.

  Transactionally updates the entities depending on the state of this task.

  The task may be retried automatically, canceled or left alone.

  Returns:
    True if the task was retried, False if the task was killed, None if no
    action was done.
  """
  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  request_future = request_key.get_async()
  now = utils.utcnow()
  server_version = utils.get_app_version()
  packed = task_pack.pack_run_result_key(run_result_key)
  request = request_future.get_result()
  to_run_key = task_to_run.request_to_task_to_run_key(request)

  def run():
    """Returns tuple(task_is_retried or None, bot_id)."""
    # Do one GET, one PUT at the end.
    run_result, result_summary, to_run = ndb.get_multi(
        (run_result_key, result_summary_key, to_run_key))
    if run_result.state != task_result.State.RUNNING:
      # It was updated already or not updating last. Likely DB index was stale.
      return None, run_result.bot_id

    run_result.signal_server_version(server_version)
    run_result.modified_ts = now

    notify = False
    if result_summary.try_number != run_result.try_number:
      # Not updating the correct run_result; cancel it without touching
      # result_summary.
      to_put = (run_result,)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      task_is_retried = None
    elif result_summary.try_number == 1 and now < request.expiration_ts:
      # Retry it.
      to_put = (run_result, result_summary, to_run)
      to_run.queue_number = task_to_run.gen_queue_number(request)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      # Do not sync data from run_result to result_summary, since the task is
      # being retried.
      result_summary.reset_to_pending()
      result_summary.modified_ts = now
      task_is_retried = True
    else:
      # Cancel it, there was more than one try or the task expired in the
      # meantime.
      to_put = (run_result, result_summary)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      result_summary.set_from_run_result(run_result, request)
      notify = True
      task_is_retried = False

    futures = ndb.put_multi_async(to_put)
    if notify:
      _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()

    return task_is_retried, run_result.bot_id

  try:
    task_is_retried, bot_id = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    task_is_retried, bot_id = None, None
  if task_is_retried is not None:
    task_to_run.set_lookup_cache(to_run_key, task_is_retried)
    if not task_is_retried:
      stats.add_run_entry(
          "run_bot_died",
          run_result_key,
          bot_id=bot_id,
          dimensions=request.properties.dimensions,
          user=request.user,
      )
    else:
      logging.info("Retried %s", packed)
  else:
    logging.info("Ignored %s", packed)
  return task_is_retried
def bot_update_task(run_result_key, bot_id, output, output_chunk_start,
                    exit_code, duration, hard_timeout, io_timeout, cost_usd,
                    outputs_ref, cipd_pins, performance_stats):
  """Updates a TaskRunResult and TaskResultSummary, along TaskOutput.

  Arguments:
  - run_result_key: ndb.Key to TaskRunResult.
  - bot_id: Self advertised bot id to ensure it's the one expected.
  - output: Data to append to this command output.
  - output_chunk_start: Index of output in the stdout stream.
  - exit_code: Mark that this task completed.
  - duration: Time spent in seconds for this task, excluding overheads.
  - hard_timeout: Bool set if a hard timeout occurred.
  - io_timeout: Bool set if an I/O timeout occurred.
  - cost_usd: Cost in $USD of this task up to now.
  - outputs_ref: task_request.FilesRef instance or None.
  - cipd_pins: None or task_result.CipdPins.
  - performance_stats: task_result.PerformanceStats instance or None. Can only
        be set when the task is completing.

  Invalid states, these are flat out refused:
  - A command is updated after it already had an exit code assigned to it.

  Returns:
    TaskRunResult.state or None in case of failure.
  """
  assert output_chunk_start is None or isinstance(output_chunk_start, int)
  assert output is None or isinstance(output, str)
  if cost_usd is not None and cost_usd < 0.:
    raise ValueError('cost_usd must be None or greater than or equal to 0')
  if duration is not None and duration < 0.:
    raise ValueError('duration must be None or greater than or equal to 0')
  if (duration is None) != (exit_code is None):
    raise ValueError(
        'had unexpected duration; expected iff a command completes\n'
        'duration: %r; exit: %r' % (duration, exit_code))
  if performance_stats and duration is None:
    raise ValueError(
        'duration must be set when performance_stats is set\n'
        'duration: %s; performance_stats: %s' % (duration, performance_stats))

  packed = task_pack.pack_run_result_key(run_result_key)
  logging.debug(
      'bot_update_task(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
      packed, bot_id, len(output) if output else output, output_chunk_start,
      exit_code, duration, hard_timeout, io_timeout, cost_usd, outputs_ref,
      cipd_pins, performance_stats)

  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(
      result_summary_key)
  request_future = request_key.get_async()
  server_version = utils.get_app_version()
  request = request_future.get_result()
  now = utils.utcnow()

  def run():
    """Returns tuple(TaskResultSummary, TaskRunResult, str(error)).

    Any error is returned as a string to be passed to logging.error() instead
    of logging inside the transaction for performance.
    """
    # 2 consecutive GETs, one PUT.
    run_result_future = run_result_key.get_async()
    result_summary_future = result_summary_key.get_async()
    run_result = run_result_future.get_result()
    if not run_result:
      result_summary_future.wait()
      return None, None, 'is missing'

    if run_result.bot_id != bot_id:
      result_summary_future.wait()
      return None, None, (
          'expected bot (%s) but had update from bot %s' % (
              run_result.bot_id, bot_id))

    if not run_result.started_ts:
      return None, None, 'TaskRunResult is broken; %s' % (
          run_result.to_dict())

    # Assumptions:
    # - duration and exit_code are both set or not set.
    # - same for run_result.
    if exit_code is not None:
      if run_result.exit_code is not None:
        # This happens when the HTTP request is retried: the DB write
        # succeeded but the handler still returned HTTP 500.
        if run_result.exit_code != exit_code:
          result_summary_future.wait()
          return None, None, 'got 2 different exit_code; %s then %s' % (
              run_result.exit_code, exit_code)
        if run_result.duration != duration:
          result_summary_future.wait()
          return None, None, 'got 2 different durations; %s then %s' % (
              run_result.duration, duration)
      else:
        run_result.duration = duration
        run_result.exit_code = exit_code

    if outputs_ref:
      run_result.outputs_ref = outputs_ref

    if cipd_pins:
      run_result.cipd_pins = cipd_pins

    if run_result.state in task_result.State.STATES_RUNNING:
      if hard_timeout or io_timeout:
        run_result.state = task_result.State.TIMED_OUT
        run_result.completed_ts = now
      elif run_result.exit_code is not None:
        run_result.state = task_result.State.COMPLETED
        run_result.completed_ts = now

    run_result.signal_server_version(server_version)
    run_result.validate(request)
    to_put = [run_result]
    if output:
      # This does 1 multi GETs. This also modifies run_result in place.
      to_put.extend(run_result.append_output(output, output_chunk_start or 0))
    if performance_stats:
      performance_stats.key = (
          task_pack.run_result_key_to_performance_stats_key(run_result.key))
      to_put.append(performance_stats)

    run_result.cost_usd = max(cost_usd, run_result.cost_usd or 0.)
    run_result.modified_ts = now

    result_summary = result_summary_future.get_result()
    if (result_summary.try_number and
        result_summary.try_number > run_result.try_number):
      # The situation where a shard is retried but the bot running the
      # previous try somehow reappears and reports success, the result must
      # still show the last try's result. We still need to update cost_usd
      # manually.
      result_summary.costs_usd[run_result.try_number-1] = run_result.cost_usd
      result_summary.modified_ts = now
    else:
      result_summary.set_from_run_result(run_result, request)

    result_summary.validate(request)
    to_put.append(result_summary)
    ndb.put_multi(to_put)

    return result_summary, run_result, None

  try:
    smry, run_result, error = datastore_utils.transaction(run)
  except datastore_utils.CommitError as e:
    logging.info('Got commit error: %s', e)
    # It is important that the caller correctly surface this error.
    return None
  assert bool(error) != bool(run_result), (error, run_result)
  if error:
    logging.error('Task %s %s', packed, error)
    return None
  # Caller must retry if PubSub enqueue fails.
  task_completed = run_result.state != task_result.State.RUNNING
  if not _maybe_pubsub_notify_now(smry, request):
    return None
  _update_stats(run_result, bot_id, request, task_completed)
  if task_completed:
    ts_mon_metrics.update_jobs_completed_metrics(smry)
  return run_result.state
def test_integration(self):
  # Creates a TaskRequest, along its TaskResultSummary and TaskToRun. Have a
  # bot reap the task, and complete the task. Ensure the resulting
  # TaskResultSummary and TaskRunResult are properly updated.
  request = task_request.make_request(_gen_request(), True)
  result_summary = task_result.new_result_summary(request)
  to_run = task_to_run.new_task_to_run(request)
  result_summary.modified_ts = utils.utcnow()
  ndb.transaction(lambda: ndb.put_multi([result_summary, to_run]))
  expected = {
    'abandoned_ts': None,
    'bot_dimensions': None,
    'bot_id': None,
    'bot_version': None,
    'children_task_ids': [],
    'completed_ts': None,
    'costs_usd': [],
    'cost_saved_usd': None,
    'created_ts': self.now,
    'deduped_from': None,
    'durations': [],
    'exit_codes': [],
    'failure': False,
    'id': '1d69b9f088008810',
    'internal_failure': False,
    'modified_ts': self.now,
    'name': u'Request name',
    'outputs_ref': None,
    'properties_hash': None,
    'server_versions': [],
    'started_ts': None,
    'state': task_result.State.PENDING,
    'try_number': None,
    'tags': [u'priority:50', u'tag:1', u'user:Jesus'],
    'user': u'Jesus',
  }
  self.assertEqual(expected, result_summary.to_dict())

  # Nothing changed 2 secs later except latency.
  self.mock_now(self.now, 2)
  self.assertEqual(expected, result_summary.to_dict())

  # Task is reaped after 2 seconds (4 secs total).
  reap_ts = self.now + datetime.timedelta(seconds=4)
  self.mock_now(reap_ts)
  to_run.queue_number = None
  to_run.put()
  run_result = task_result.new_run_result(request, 1, 'localhost', 'abc', {})
  run_result.modified_ts = utils.utcnow()
  result_summary.set_from_run_result(run_result, request)
  ndb.transaction(lambda: ndb.put_multi((result_summary, run_result)))
  expected = {
    'abandoned_ts': None,
    'bot_dimensions': {},
    'bot_id': u'localhost',
    'bot_version': u'abc',
    'children_task_ids': [],
    'completed_ts': None,
    'costs_usd': [0.],
    'cost_saved_usd': None,
    'created_ts': self.now,
    'deduped_from': None,
    'durations': [],
    'exit_codes': [],
    'failure': False,
    'id': '1d69b9f088008810',
    'internal_failure': False,
    'modified_ts': reap_ts,
    'name': u'Request name',
    'outputs_ref': None,
    'properties_hash': None,
    'server_versions': [u'v1a'],
    'started_ts': reap_ts,
    'state': task_result.State.RUNNING,
    'tags': [u'priority:50', u'tag:1', u'user:Jesus'],
    'try_number': 1,
    'user': u'Jesus',
  }
  self.assertEqual(expected, result_summary.key.get().to_dict())

  # Task completed after 2 seconds (6 secs total), the task has been running
  # for 2 seconds.
  complete_ts = self.now + datetime.timedelta(seconds=6)
  self.mock_now(complete_ts)
  run_result.completed_ts = complete_ts
  run_result.exit_codes.append(0)
  run_result.state = task_result.State.COMPLETED
  run_result.modified_ts = utils.utcnow()
  ndb.transaction(
      lambda: ndb.put_multi(run_result.append_output(0, 'foo', 0)))
  result_summary.set_from_run_result(run_result, request)
  ndb.transaction(lambda: ndb.put_multi((result_summary, run_result)))
  expected = {
    'abandoned_ts': None,
    'bot_dimensions': {},
    'bot_id': u'localhost',
    'bot_version': u'abc',
    'children_task_ids': [],
    'completed_ts': complete_ts,
    'costs_usd': [0.],
    'cost_saved_usd': None,
    'created_ts': self.now,
    'deduped_from': None,
    'durations': [],
    'exit_codes': [0],
    'failure': False,
    'id': '1d69b9f088008810',
    'internal_failure': False,
    'modified_ts': complete_ts,
    'name': u'Request name',
    'outputs_ref': None,
    'properties_hash': None,
    'server_versions': [u'v1a'],
    'started_ts': reap_ts,
    'state': task_result.State.COMPLETED,
    'tags': [u'priority:50', u'tag:1', u'user:Jesus'],
    'try_number': 1,
    'user': u'Jesus',
  }
  self.assertEqual(expected, result_summary.key.get().to_dict())

  self.assertEqual(['foo'], list(result_summary.get_outputs()))
  self.assertEqual(
      datetime.timedelta(seconds=2), result_summary.duration_total)
  self.assertEqual(
      datetime.timedelta(seconds=2),
      result_summary.duration_now(utils.utcnow()))
  self.assertEqual(datetime.timedelta(seconds=4), result_summary.pending)
  self.assertEqual(
      datetime.timedelta(seconds=4),
      result_summary.pending_now(utils.utcnow()))

  self.assertEqual(
      task_pack.pack_result_summary_key(result_summary.key),
      result_summary.task_id)
  self.assertEqual(complete_ts, result_summary.ended_ts)
  self.assertEqual(
      task_pack.pack_run_result_key(run_result.key), run_result.task_id)
  self.assertEqual(complete_ts, run_result.ended_ts)
def schedule_request(request):
  """Creates and stores all the entities to schedule a new task request.

  The number of entities created is 3: TaskRequest, TaskResultSummary and
  TaskToRun.

  The TaskRequest is saved first as a DB transaction, then TaskResultSummary
  and TaskToRun are saved as a single DB RPC. The Search index is also updated
  in-between.

  Arguments:
  - request: the TaskRequest entity already saved in the DB.

  Returns:
    TaskResultSummary. TaskToRun is not returned.
  """
  dupe_future = None
  if request.properties.idempotent:
    # Find a previously run task that is also idempotent and completed. Start
    # a query to fetch items that can be used to dedupe the task. See the
    # comment for this property for more details.
    #
    # Do not use "cls.created_ts > oldest" here because this would require a
    # composite index. It's unnecessary because TaskRequest.key is mostly
    # equivalent to decreasing TaskRequest.created_ts, ordering by key works
    # as well and doesn't require a composite index.
    cls = task_result.TaskResultSummary
    h = request.properties.properties_hash
    dupe_future = cls.query(cls.properties_hash==h).order(cls.key).get_async()

  # At this point, the request is now in the DB but not yet in a mode where it
  # can be triggered or visible. Index it right away so it is searchable. If
  # any of the remaining calls in this function fail, the TaskRequest and
  # Search Document will simply point to an incomplete task, which will be
  # ignored.
  #
  # Creates the entities TaskToRun and TaskResultSummary but do not save them
  # yet. TaskRunResult will be created once a bot starts it.
  task = task_to_run.new_task_to_run(request)
  result_summary = task_result.new_result_summary(request)

  # Do not specify a doc_id, as they are guaranteed to be monotonically
  # increasing and searches are done in reverse order, which fits exactly the
  # created_ts ordering. This is useful because DateField is precise to the
  # date (!) and NumberField is signed 32 bits so the best it could do with
  # EPOCH is second resolution up to year 2038.
  index = search.Index(name='requests')
  packed = task_pack.pack_result_summary_key(result_summary.key)
  doc = search.Document(
      fields=[
        search.TextField(name='name', value=request.name),
        search.AtomField(name='id', value=packed),
      ])
  # Even if it fails here, we're still fine, as the task is not "alive" yet.
  search_future = index.put_async([doc])

  now = utils.utcnow()

  if dupe_future:
    # Reuse the results!
    dupe_summary = dupe_future.get_result()
    # Refuse tasks older than X days. This is due to the isolate server
    # dropping files. https://code.google.com/p/swarming/issues/detail?id=197
    oldest = now - datetime.timedelta(
        seconds=config.settings().reusable_task_age_secs)
    if dupe_summary and dupe_summary.created_ts > oldest:
      # If there's a bug, commenting out this block is sufficient to disable
      # the functionality.
      # Setting task.queue_number to None removes it from the scheduling.
      task.queue_number = None
      _copy_entity(dupe_summary, result_summary, ('created_ts', 'name', 'user'))
      result_summary.properties_hash = None
      result_summary.try_number = 0
      result_summary.cost_saved_usd = result_summary.cost_usd
      # Only zap after.
      result_summary.costs_usd = []
      result_summary.deduped_from = task_pack.pack_run_result_key(
          dupe_summary.run_result_key)

  # Get parent task details if applicable.
  parent_task_keys = None
  if request.parent_task_id:
    parent_run_key = task_pack.unpack_run_result_key(request.parent_task_id)
    parent_task_keys = [
      parent_run_key,
      task_pack.run_result_key_to_result_summary_key(parent_run_key),
    ]

  result_summary.modified_ts = now

  # Storing these entities makes this task live. It is important at this point
  # that the HTTP handler returns as fast as possible, otherwise the task will
  # be run but the client will not know about it.
  def run():
    ndb.put_multi([result_summary, task])

  def run_parent():
    # This one is slower.
    items = ndb.get_multi(parent_task_keys)
    k = result_summary.task_id
    for item in items:
      item.children_task_ids.append(k)
      item.modified_ts = now
    ndb.put_multi(items)

  # Raising will abort to the caller.
  futures = [datastore_utils.transaction_async(run)]
  if parent_task_keys:
    futures.append(datastore_utils.transaction_async(run_parent))

  try:
    search_future.get_result()
  except search.Error:
    # Do not abort the task, for now search is best effort.
    logging.exception('Put failed')

  for future in futures:
    # Check for failures, it would raise in this case, aborting the call.
    future.get_result()

  stats.add_task_entry(
      'task_enqueued', result_summary.key,
      dimensions=request.properties.dimensions,
      user=request.user)
  return result_summary
def schedule_request(request, check_acls=True):
  """Creates and stores all the entities to schedule a new task request.

  Checks ACLs first. Raises auth.AuthorizationError if the caller is not
  authorized to post this request.

  The number of entities created is 3: TaskRequest, TaskToRun and
  TaskResultSummary.

  All 3 entities in the same entity group (TaskRequest, TaskToRun,
  TaskResultSummary) are saved as a DB transaction.

  Arguments:
  - request: TaskRequest entity to be saved in the DB. Its key must not be set
        and the entity must not be saved in the DB yet.
  - check_acls: Whether the request should check ACLs.

  Returns:
    TaskResultSummary. TaskToRun is not returned.
  """
  assert isinstance(request, task_request.TaskRequest), request
  assert not request.key, request.key

  # Raises AuthorizationError with a helpful message if the request author
  # can't use some of the requested dimensions.
  if check_acls:
    _check_dimension_acls(request)

  now = utils.utcnow()
  request.key = task_request.new_request_key()
  task = task_to_run.new_task_to_run(request)
  result_summary = task_result.new_result_summary(request)
  result_summary.modified_ts = now

  def get_new_keys():
    # Warning: this assumes knowledge about the hierarchy of each entity.
    key = task_request.new_request_key()
    # ndb keys are immutable; rebuild the child keys on top of the new root.
    task.key = ndb.Key(task.key.kind(), task.key.id(), parent=key)
    old = result_summary.task_id
    result_summary.key = ndb.Key(
        result_summary.key.kind(), result_summary.key.id(), parent=key)
    logging.info('%s conflicted, using %s', old, result_summary.task_id)
    return key

  deduped = False
  if request.properties.idempotent:
    dupe_summary = _find_dupe_task(now, request.properties_hash)
    if dupe_summary:
      # Setting task.queue_number to None removes it from the scheduling.
      task.queue_number = None
      _copy_summary(
          dupe_summary, result_summary,
          ('created_ts', 'modified_ts', 'name', 'user', 'tags'))
      # Zap irrelevant properties. PerformanceStats is also not copied over,
      # since it's not relevant.
      result_summary.properties_hash = None
      result_summary.try_number = 0
      result_summary.cost_saved_usd = result_summary.cost_usd
      # Only zap after.
      result_summary.costs_usd = []
      result_summary.deduped_from = task_pack.pack_run_result_key(
          dupe_summary.run_result_key)
      # In this code path, there's not much to do as the task will not be run,
      # previous results are returned. We still need to store all the entities
      # correctly.
      datastore_utils.insert(
          request, get_new_keys, extra=[task, result_summary])
      logging.debug(
          'New request %s reusing %s', result_summary.task_id,
          dupe_summary.task_id)
      deduped = True

  if not deduped:
    # Storing these entities makes this task live. It is important at this
    # point that the HTTP handler returns as fast as possible, otherwise the
    # task will be run but the client will not know about it.
    datastore_utils.insert(request, get_new_keys, extra=[task, result_summary])
    logging.debug('New request %s', result_summary.task_id)

  # Get parent task details if applicable.
  if request.parent_task_id:
    parent_run_key = task_pack.unpack_run_result_key(request.parent_task_id)
    parent_task_keys = [
      parent_run_key,
      task_pack.run_result_key_to_result_summary_key(parent_run_key),
    ]

    def run_parent():
      # This one is slower.
      items = ndb.get_multi(parent_task_keys)
      k = result_summary.task_id
      for item in items:
        item.children_task_ids.append(k)
        item.modified_ts = now
      ndb.put_multi(items)

    # Raising will abort to the caller. There's a risk that for tasks with
    # parent tasks, the task will be lost due to this transaction.
    # TODO(maruel): An option is to update the parent task as part of a cron
    # job, which would remove this code from the critical path.
    datastore_utils.transaction(run_parent)

  stats.add_task_entry(
      'task_enqueued', result_summary.key,
      dimensions=request.properties.dimensions,
      user=request.user)
  ts_mon_metrics.update_jobs_requested_metrics(result_summary, deduped)
  return result_summary
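# A hedged usage sketch for schedule_request above. The TaskRequest fields
# follow the task_slices form used by make_task_request earlier in this
# section; adjust to whichever TaskRequest revision is in use. The key is left
# unset, as schedule_request asserts.
request = task_request.TaskRequest(
    created_ts=utils.utcnow(),
    name='example task',
    priority=50,
    task_slices=[
      task_request.TaskSlice(
          expiration_secs=60,
          properties=task_request.TaskProperties(
              command=[u'echo', u'hi'],
              dimensions_data={u'pool': [u'default']},
              execution_timeout_secs=600)),
    ],
    user='example-user')
task_request.init_new_request(request, True)
result_summary = schedule_request(request)  # default check_acls=True
logging.info('Scheduled %s', result_summary.task_id)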
def _handle_dead_bot(run_result_key):
  """Handles a TaskRunResult whose bot has stopped showing signs of life.

  Transactionally updates the entities depending on the state of this task.

  The task may be retried automatically, canceled or left alone.

  Returns:
    True if the task was retried, False if the task was killed, None if no
    action was done.
  """
  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  request_future = request_key.get_async()
  now = utils.utcnow()
  server_version = utils.get_app_version()
  packed = task_pack.pack_run_result_key(run_result_key)
  request = request_future.get_result()

  def run():
    """Returns task_is_retried or None.

    Does 2x GET, 2~3x PUT.
    """
    run_result = run_result_key.get()
    if run_result.state != task_result.State.RUNNING:
      # It was updated already or not updating last. Likely DB index was stale.
      return None
    if run_result.modified_ts > now - task_result.BOT_PING_TOLERANCE:
      # The query index IS stale.
      return None

    current_task_slice = run_result.current_task_slice
    run_result.signal_server_version(server_version)
    old_modified = run_result.modified_ts
    run_result.modified_ts = now

    result_summary = result_summary_key.get()
    orig_summary_state = result_summary.state
    if result_summary.try_number != run_result.try_number:
      # Not updating the correct run_result; cancel it without touching
      # result_summary.
      to_put = (run_result,)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      task_is_retried = None
    elif (result_summary.try_number == 1 and now < request.expiration_ts and
          (request.task_slice(current_task_slice).properties.idempotent or
           run_result.started_ts == old_modified)):
      # Retry it. It fits:
      # - first try
      # - not yet expired
      # - One of:
      #   - idempotent
      #   - task hadn't got any ping at all from task_runner.run_command()
      # TODO(maruel): Allow increasing the current_task_slice value.
      # Create a second TaskToRun with the same TaskSlice.
      to_run = task_to_run.new_task_to_run(request, 2, current_task_slice)
      to_put = (run_result, result_summary, to_run)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      # Do not sync data from run_result to result_summary, since the task is
      # being retried.
      result_summary.reset_to_pending()
      result_summary.modified_ts = now
      task_is_retried = True
    else:
      # Kill it as BOT_DIED, there was more than one try, the task expired in
      # the meantime or it wasn't idempotent.
      to_put = (run_result, result_summary)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      result_summary.set_from_run_result(run_result, request)
      task_is_retried = False

    futures = ndb.put_multi_async(to_put)
    if orig_summary_state != result_summary.state:
      _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()

    return task_is_retried

  try:
    task_is_retried = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    task_is_retried = None
  if task_is_retried:
    logging.info('Retried %s', packed)
  elif task_is_retried is False:
    logging.debug('Ignored %s', packed)
  return task_is_retried
def task_id(self):
  return task_pack.pack_run_result_key(self.key)
def schedule_request(request):
  """Creates and stores all the entities to schedule a new task request.

  The number of entities created is 3: TaskRequest, TaskResultSummary and
  TaskToRun.

  The TaskRequest is saved first as a DB transaction, then TaskResultSummary
  and TaskToRun are saved as a single DB RPC. The Search index is also updated
  in-between.

  Arguments:
  - request: the TaskRequest entity already saved in the DB.

  Returns:
    TaskResultSummary. TaskToRun is not returned.
  """
  dupe_future = None
  if request.properties.idempotent:
    # Find a previously run task that is also idempotent and completed. Start
    # a query to fetch items that can be used to dedupe the task. See the
    # comment for this property for more details.
    #
    # Do not use "cls.created_ts > oldest" here because this would require a
    # composite index. It's unnecessary because TaskRequest.key is mostly
    # equivalent to decreasing TaskRequest.created_ts, ordering by key works
    # as well and doesn't require a composite index.
    cls = task_result.TaskResultSummary
    h = request.properties.properties_hash
    dupe_future = cls.query(cls.properties_hash == h).order(cls.key).get_async()

  # At this point, the request is now in the DB but not yet in a mode where it
  # can be triggered or visible. Index it right away so it is searchable. If
  # any of the remaining calls in this function fail, the TaskRequest and
  # Search Document will simply point to an incomplete task, which will be
  # ignored.
  #
  # Creates the entities TaskToRun and TaskResultSummary but do not save them
  # yet. TaskRunResult will be created once a bot starts it.
  task = task_to_run.new_task_to_run(request)
  result_summary = task_result.new_result_summary(request)

  # Do not specify a doc_id, as they are guaranteed to be monotonically
  # increasing and searches are done in reverse order, which fits exactly the
  # created_ts ordering. This is useful because DateField is precise to the
  # date (!) and NumberField is signed 32 bits so the best it could do with
  # EPOCH is second resolution up to year 2038.
  index = search.Index(name="requests")
  packed = task_pack.pack_result_summary_key(result_summary.key)
  doc = search.Document(
      fields=[
        search.TextField(name="name", value=request.name),
        search.AtomField(name="id", value=packed),
      ])
  # Even if it fails here, we're still fine, as the task is not "alive" yet.
  search_future = index.put_async([doc])

  now = utils.utcnow()

  if dupe_future:
    # Reuse the results!
    dupe_summary = dupe_future.get_result()
    # Refuse tasks older than X days. This is due to the isolate server
    # dropping files. https://code.google.com/p/swarming/issues/detail?id=197
    oldest = now - datetime.timedelta(
        seconds=config.settings().reusable_task_age_secs)
    if dupe_summary and dupe_summary.created_ts > oldest:
      # If there's a bug, commenting out this block is sufficient to disable
      # the functionality.
      # Setting task.queue_number to None removes it from the scheduling.
      task.queue_number = None
      _copy_entity(
          dupe_summary, result_summary,
          ("created_ts", "name", "user", "tags"))
      result_summary.properties_hash = None
      result_summary.try_number = 0
      result_summary.cost_saved_usd = result_summary.cost_usd
      # Only zap after.
      result_summary.costs_usd = []
      result_summary.deduped_from = task_pack.pack_run_result_key(
          dupe_summary.run_result_key)

  # Get parent task details if applicable.
  parent_task_keys = None
  if request.parent_task_id:
    parent_run_key = task_pack.unpack_run_result_key(request.parent_task_id)
    parent_task_keys = [
      parent_run_key,
      task_pack.run_result_key_to_result_summary_key(parent_run_key),
    ]

  result_summary.modified_ts = now

  # Storing these entities makes this task live. It is important at this point
  # that the HTTP handler returns as fast as possible, otherwise the task will
  # be run but the client will not know about it.
  def run():
    ndb.put_multi([result_summary, task])

  def run_parent():
    # This one is slower.
    items = ndb.get_multi(parent_task_keys)
    k = result_summary.task_id
    for item in items:
      item.children_task_ids.append(k)
      item.modified_ts = now
    ndb.put_multi(items)

  # Raising will abort to the caller.
  futures = [datastore_utils.transaction_async(run)]
  if parent_task_keys:
    futures.append(datastore_utils.transaction_async(run_parent))

  try:
    search_future.get_result()
  except search.Error:
    # Do not abort the task, for now search is best effort.
    logging.exception("Put failed")

  for future in futures:
    # Check for failures, it would raise in this case, aborting the call.
    future.get_result()

  stats.add_task_entry(
      "task_enqueued", result_summary.key,
      dimensions=request.properties.dimensions,
      user=request.user)
  return result_summary
def key_packed(self):
  return task_pack.pack_run_result_key(self.key)
def task_id(self):
  """Returns an encoded task id for this TaskToRun.

  Note: this includes the try_number but not the task_slice_index.
  """
  return task_pack.pack_run_result_key(self.run_result_key)
def _cmd_run(self, request, secret_bytes, run_result, bot_id, bot_group_cfg):
  logging.info('Run: %s', request.task_id)
  props = request.task_slice(run_result.current_task_slice).properties
  out = {
    'cmd': 'run',
    'manifest': {
      'bot_id': bot_id,
      'bot_authenticated_as': auth.get_peer_identity().to_bytes(),
      'caches': [c.to_dict() for c in props.caches],
      'cipd_input': {
        'client_package': props.cipd_input.client_package.to_dict(),
        'packages': [p.to_dict() for p in props.cipd_input.packages],
        'server': props.cipd_input.server,
      } if props.cipd_input else None,
      'command': props.command,
      'dimensions': props.dimensions,
      'env': props.env,
      'env_prefixes': props.env_prefixes,
      'extra_args': props.extra_args,
      'grace_period': props.grace_period_secs,
      'hard_timeout': props.execution_timeout_secs,
      'host': utils.get_versioned_hosturl(),
      'io_timeout': props.io_timeout_secs,
      'secret_bytes': (secret_bytes.secret_bytes.encode('base64')
                       if secret_bytes else None),
      'isolated': {
        'input': props.inputs_ref.isolated,
        'namespace': props.inputs_ref.namespace,
        'server': props.inputs_ref.isolatedserver,
      } if props.inputs_ref else None,
      'outputs': props.outputs,
      'relative_cwd': props.relative_cwd,
      'service_accounts': {
        'system': {
          # 'none', 'bot' or email. Bot interprets 'none' and 'bot' locally.
          # When it sees something else, it uses the /oauth_token API endpoint
          # to grab tokens through the server.
          'service_account': bot_group_cfg.system_service_account or 'none',
        },
        'task': {
          # Same here.
          'service_account': request.service_account,
        },
      },
      'task_id': task_pack.pack_run_result_key(run_result.key),
    },
  }
  self.send_response(utils.to_json_encodable(out))
def test_integration(self):
  # Creates a TaskRequest, along its TaskResultSummary and TaskToRun. Have a
  # bot reap the task, and complete the task. Ensure the resulting
  # TaskResultSummary and TaskRunResult are properly updated.
  request = mkreq(_gen_request())
  result_summary = task_result.new_result_summary(request)
  to_run = task_to_run.new_task_to_run(request)
  result_summary.modified_ts = utils.utcnow()
  ndb.transaction(lambda: ndb.put_multi([result_summary, to_run]))
  expected = {
    'abandoned_ts': None,
    'bot_dimensions': None,
    'bot_id': None,
    'bot_version': None,
    'cipd_pins': None,
    'children_task_ids': [],
    'completed_ts': None,
    'costs_usd': [],
    'cost_saved_usd': None,
    'created_ts': self.now,
    'deduped_from': None,
    'duration': None,
    'exit_code': None,
    'failure': False,
    'id': '1d69b9f088008810',
    'internal_failure': False,
    'modified_ts': self.now,
    'name': u'Request name',
    'outputs_ref': None,
    'properties_hash': None,
    'server_versions': [],
    'started_ts': None,
    'state': task_result.State.PENDING,
    'try_number': None,
    'tags': [
      u'pool:default',
      u'priority:50',
      u'service_account:none',
      u'tag:1',
      u'user:Jesus',
    ],
    'user': u'Jesus',
  }
  self.assertEqual(expected, result_summary.to_dict())

  # Nothing changed 2 secs later except latency.
  self.mock_now(self.now, 2)
  self.assertEqual(expected, result_summary.to_dict())

  # Task is reaped after 2 seconds (4 secs total).
  reap_ts = self.now + datetime.timedelta(seconds=4)
  self.mock_now(reap_ts)
  to_run.queue_number = None
  to_run.put()
  run_result = task_result.new_run_result(request, 1, 'localhost', 'abc', {})
  run_result.modified_ts = utils.utcnow()
  result_summary.set_from_run_result(run_result, request)
  ndb.transaction(lambda: ndb.put_multi((result_summary, run_result)))
  expected = {
    'abandoned_ts': None,
    'bot_dimensions': {},
    'bot_id': u'localhost',
    'bot_version': u'abc',
    'cipd_pins': None,
    'children_task_ids': [],
    'completed_ts': None,
    'costs_usd': [0.],
    'cost_saved_usd': None,
    'created_ts': self.now,
    'deduped_from': None,
    'duration': None,
    'exit_code': None,
    'failure': False,
    'id': '1d69b9f088008810',
    'internal_failure': False,
    'modified_ts': reap_ts,
    'name': u'Request name',
    'outputs_ref': None,
    'properties_hash': None,
    'server_versions': [u'v1a'],
    'started_ts': reap_ts,
    'state': task_result.State.RUNNING,
    'tags': [
      u'pool:default',
      u'priority:50',
      u'service_account:none',
      u'tag:1',
      u'user:Jesus',
    ],
    'try_number': 1,
    'user': u'Jesus',
  }
  self.assertEqual(expected, result_summary.key.get().to_dict())

  # Task completed after 2 seconds (6 secs total), the task has been running
  # for 2 seconds.
  complete_ts = self.now + datetime.timedelta(seconds=6)
  self.mock_now(complete_ts)
  run_result.completed_ts = complete_ts
  run_result.duration = 0.1
  run_result.exit_code = 0
  run_result.state = task_result.State.COMPLETED
  run_result.modified_ts = utils.utcnow()
  task_result.PerformanceStats(
      key=task_pack.run_result_key_to_performance_stats_key(run_result.key),
      bot_overhead=0.1,
      isolated_download=task_result.OperationStats(
          duration=0.05,
          initial_number_items=10,
          initial_size=10000,
          items_cold='foo',
          items_hot='bar'),
      isolated_upload=task_result.OperationStats(
          duration=0.01,
          items_cold='foo')).put()
  ndb.transaction(lambda: ndb.put_multi(run_result.append_output('foo', 0)))
  result_summary.set_from_run_result(run_result, request)
  ndb.transaction(lambda: ndb.put_multi((result_summary, run_result)))
  expected = {
    'abandoned_ts': None,
    'bot_dimensions': {},
    'bot_id': u'localhost',
    'bot_version': u'abc',
    'cipd_pins': None,
    'children_task_ids': [],
    'completed_ts': complete_ts,
    'costs_usd': [0.],
    'cost_saved_usd': None,
    'created_ts': self.now,
    'deduped_from': None,
    'duration': 0.1,
    'exit_code': 0,
    'failure': False,
    'id': '1d69b9f088008810',
    'internal_failure': False,
    'modified_ts': complete_ts,
    'name': u'Request name',
    'outputs_ref': None,
    'properties_hash': None,
    'server_versions': [u'v1a'],
    'started_ts': reap_ts,
    'state': task_result.State.COMPLETED,
    'tags': [
      u'pool:default',
      u'priority:50',
      u'service_account:none',
      u'tag:1',
      u'user:Jesus',
    ],
    'try_number': 1,
    'user': u'Jesus',
  }
  self.assertEqual(expected, result_summary.key.get().to_dict())
  expected = {
    'bot_overhead': 0.1,
    'isolated_download': {
      'duration': 0.05,
      'initial_number_items': 10,
      'initial_size': 10000,
      'items_cold': 'foo',
      'items_hot': 'bar',
    },
    'isolated_upload': {
      'duration': 0.01,
      'initial_number_items': None,
      'initial_size': None,
      'items_cold': 'foo',
      'items_hot': None,
    },
    'package_installation': {
      'duration': None,
      'initial_number_items': None,
      'initial_size': None,
      'items_cold': None,
      'items_hot': None,
    },
  }
  self.assertEqual(expected, result_summary.performance_stats.to_dict())
  self.assertEqual('foo', result_summary.get_output())
  self.assertEqual(
      datetime.timedelta(seconds=2),
      result_summary.duration_as_seen_by_server)
  self.assertEqual(
      datetime.timedelta(seconds=0.1),
      result_summary.duration_now(utils.utcnow()))
  self.assertEqual(datetime.timedelta(seconds=4), result_summary.pending)
  self.assertEqual(
      datetime.timedelta(seconds=4),
      result_summary.pending_now(utils.utcnow()))

  self.assertEqual(
      task_pack.pack_result_summary_key(result_summary.key),
      result_summary.task_id)
  self.assertEqual(complete_ts, result_summary.ended_ts)
  self.assertEqual(
      task_pack.pack_run_result_key(run_result.key), run_result.task_id)
  self.assertEqual(complete_ts, run_result.ended_ts)
def bot_update_task(
    run_result_key,
    bot_id,
    output,
    output_chunk_start,
    exit_code,
    duration,
    hard_timeout,
    io_timeout,
    cost_usd,
    outputs_ref,
):
    """Updates a TaskRunResult and TaskResultSummary, along with TaskOutput.

    Arguments:
    - run_result_key: ndb.Key to TaskRunResult.
    - bot_id: Self-advertised bot id, to ensure it's the one expected.
    - output: Data to append to this command's output.
    - output_chunk_start: Index of output in the stdout stream.
    - exit_code: Mark that this command terminated.
    - duration: Time spent in seconds for this command.
    - hard_timeout: Bool set if a hard timeout occurred.
    - io_timeout: Bool set if an I/O timeout occurred.
    - cost_usd: Cost in $USD of this task up to now.
    - outputs_ref: Serialized FilesRef instance or None.

    Invalid states are flat out refused:
    - A command is updated after it already had an exit code assigned.

    Returns:
      tuple(bool, bool); the first is True if the update succeeded, the second
      is True if the task completed.
    """
    assert output_chunk_start is None or isinstance(output_chunk_start, int)
    assert output is None or isinstance(output, str)
    if cost_usd is not None and cost_usd < 0.0:
        raise ValueError("cost_usd must be None or greater than or equal to 0")

    packed = task_pack.pack_run_result_key(run_result_key)
    logging.debug(
        "bot_update_task(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
        packed,
        bot_id,
        len(output) if output else output,
        output_chunk_start,
        exit_code,
        duration,
        hard_timeout,
        io_timeout,
        cost_usd,
        outputs_ref,
    )
    result_summary_key = task_pack.run_result_key_to_result_summary_key(
        run_result_key)
    request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
    request_future = request_key.get_async()
    server_version = utils.get_app_version()
    request = request_future.get_result()
    now = utils.utcnow()

    def run():
        # Two consecutive GETs, one PUT.
        run_result_future = run_result_key.get_async()
        result_summary_future = result_summary_key.get_async()
        run_result = run_result_future.get_result()
        if not run_result:
            result_summary_future.wait()
            return None, None, False, "is missing"

        if run_result.bot_id != bot_id:
            result_summary_future.wait()
            return None, None, False, (
                "expected bot (%s) but had update from bot %s"
                % (run_result.bot_id, bot_id))

        if not run_result.started_ts:
            return None, None, False, (
                "TaskRunResult is broken; %s" % (run_result.to_dict()))

        # This happens when the HTTP request is retried: the DB write succeeded
        # but the handler still returned HTTP 500.
        if run_result.exit_code is not None and exit_code is not None:
            if run_result.exit_code != exit_code:
                result_summary_future.wait()
                return None, None, False, (
                    "got 2 different exit_code; %s then %s"
                    % (run_result.exit_code, exit_code))

        if run_result.durations and duration is not None:
            if run_result.durations[0] != duration:
                result_summary_future.wait()
                return None, None, False, (
                    "got 2 different durations; %s then %s"
                    % (run_result.durations[0], duration))

        if (duration is None) != (exit_code is None):
            result_summary_future.wait()
            return None, None, False, (
                "had unexpected duration; expected iff a command completes\n"
                "duration: %s vs %s; exit: %s vs %s"
                % (run_result.durations, duration,
                   run_result.exit_code, exit_code))

        # If the command completed, record duration and exit_code unless they
        # were already set by an earlier (retried) update.
        if duration is not None and not run_result.durations:
            run_result.durations.append(duration)
        if exit_code is not None and run_result.exit_code is None:
            run_result.exit_codes.append(exit_code)

        if outputs_ref:
            run_result.outputs_ref = task_request.FilesRef(**outputs_ref)

        task_completed = run_result.exit_code is not None
        if run_result.state in task_result.State.STATES_RUNNING:
            if hard_timeout or io_timeout:
                run_result.state = task_result.State.TIMED_OUT
                run_result.completed_ts = now
            elif task_completed:
                run_result.state = task_result.State.COMPLETED
                run_result.completed_ts = now

        run_result.signal_server_version(server_version)
        to_put = [run_result]
        if output:
            # This does one multi-GET. It also modifies run_result in place.
            to_put.extend(
                run_result.append_output(0, output, output_chunk_start or 0))

        run_result.cost_usd = max(cost_usd, run_result.cost_usd or 0.0)
        run_result.modified_ts = now

        result_summary = result_summary_future.get_result()
        if (result_summary.try_number
                and result_summary.try_number > run_result.try_number):
            # A bot running a previous try somehow reappeared and reported
            # completion after the task was retried; the result must keep
            # showing the last try's result. Only cost_usd is updated manually.
            result_summary.costs_usd[run_result.try_number - 1] = (
                run_result.cost_usd)
            result_summary.modified_ts = now
        else:
            result_summary.set_from_run_result(run_result, request)

        to_put.append(result_summary)
        ndb.put_multi(to_put)
        return result_summary, run_result, task_completed, None

    try:
        smry, run_result, task_completed, error = datastore_utils.transaction(run)
    except datastore_utils.CommitError as e:
        logging.info("Got commit error: %s", e)
        # It is important that the caller correctly surfaces this error.
        return False, False
    if run_result:
        # The caller must retry if the PubSub enqueue fails.
        if not _maybe_pubsub_notify_now(smry, request):
            return False, False
        _update_stats(run_result, bot_id, request, task_completed)
    if error:
        logging.error("Task %s %s", packed, error)
    return True, task_completed
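
# A hedged usage sketch for the variant above: in-progress updates pass
# exit_code=None and duration=None together, and the final update sets both.
# The bot id, output strings and costs below are hypothetical values:
def _example_bot_updates(run_result_key):
  # Incremental output flush while the command is still running.
  success, completed = bot_update_task(
      run_result_key, 'bot1', 'partial output\n', 0,
      None, None, False, False, 0.001, None)
  assert success and not completed
  # Final update: exit_code and duration are set together, and more output is
  # appended at the offset where the previous chunk ended.
  success, completed = bot_update_task(
      run_result_key, 'bot1', 'done\n', 15,
      0, 0.1, False, False, 0.002, None)
  assert success and completed
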
def add_run_entry(action, run_result_key, **kwargs):
  """Adds an entry about an action on a TaskRunResult."""
  assert action.startswith('run_'), action
  run_id = task_pack.pack_run_result_key(run_result_key)
  return add_entry(action=action, run_id=run_id, **kwargs)
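
# Usage sketch with hypothetical values: the action must start with 'run_',
# and any extra keyword arguments are forwarded to add_entry() unchanged, as
# _handle_dead_bot() further below does with bot_id, dimensions and user:
def _example_add_run_entry(run_result_key):
  return add_run_entry(
      'run_completed', run_result_key,
      bot_id='bot1', dimensions={'pool': 'default'}, user='user1')
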
def test_integration(self):
    # Creates a TaskRequest, along with its TaskResultSummary and TaskToRun.
    # Has a bot reap the task, then complete it. Ensures the resulting
    # TaskResultSummary and TaskRunResult are properly updated.
    request = task_request.make_request(_gen_request(), True)
    result_summary = task_result.new_result_summary(request)
    to_run = task_to_run.new_task_to_run(request)
    result_summary.modified_ts = utils.utcnow()
    ndb.transaction(lambda: ndb.put_multi([result_summary, to_run]))
    expected = {
        "abandoned_ts": None,
        "bot_dimensions": None,
        "bot_id": None,
        "bot_version": None,
        "children_task_ids": [],
        "completed_ts": None,
        "costs_usd": [],
        "cost_saved_usd": None,
        "created_ts": self.now,
        "deduped_from": None,
        "durations": [],
        "exit_codes": [],
        "failure": False,
        "id": "1d69b9f088008810",
        "internal_failure": False,
        "modified_ts": self.now,
        "name": u"Request name",
        "outputs_ref": None,
        "properties_hash": None,
        "server_versions": [],
        "started_ts": None,
        "state": task_result.State.PENDING,
        "try_number": None,
        "tags": [u"priority:50", u"tag:1", u"user:Jesus"],
        "user": u"Jesus",
    }
    self.assertEqual(expected, result_summary.to_dict())

    # Nothing changed 2 secs later except latency.
    self.mock_now(self.now, 2)
    self.assertEqual(expected, result_summary.to_dict())

    # Task is reaped after 2 seconds (4 secs total).
    reap_ts = self.now + datetime.timedelta(seconds=4)
    self.mock_now(reap_ts)
    to_run.queue_number = None
    to_run.put()
    run_result = task_result.new_run_result(request, 1, "localhost", "abc", {})
    run_result.modified_ts = utils.utcnow()
    result_summary.set_from_run_result(run_result, request)
    ndb.transaction(lambda: ndb.put_multi((result_summary, run_result)))
    expected = {
        "abandoned_ts": None,
        "bot_dimensions": {},
        "bot_id": u"localhost",
        "bot_version": u"abc",
        "children_task_ids": [],
        "completed_ts": None,
        "costs_usd": [0.0],
        "cost_saved_usd": None,
        "created_ts": self.now,
        "deduped_from": None,
        "durations": [],
        "exit_codes": [],
        "failure": False,
        "id": "1d69b9f088008810",
        "internal_failure": False,
        "modified_ts": reap_ts,
        "name": u"Request name",
        "outputs_ref": None,
        "properties_hash": None,
        "server_versions": [u"v1a"],
        "started_ts": reap_ts,
        "state": task_result.State.RUNNING,
        "tags": [u"priority:50", u"tag:1", u"user:Jesus"],
        "try_number": 1,
        "user": u"Jesus",
    }
    self.assertEqual(expected, result_summary.key.get().to_dict())

    # Task completed after 2 more seconds (6 secs total); the task ran for 2
    # seconds.
    complete_ts = self.now + datetime.timedelta(seconds=6)
    self.mock_now(complete_ts)
    run_result.completed_ts = complete_ts
    run_result.exit_codes.append(0)
    run_result.state = task_result.State.COMPLETED
    run_result.modified_ts = utils.utcnow()
    ndb.transaction(
        lambda: ndb.put_multi(run_result.append_output(0, "foo", 0)))
    result_summary.set_from_run_result(run_result, request)
    ndb.transaction(lambda: ndb.put_multi((result_summary, run_result)))
    expected = {
        "abandoned_ts": None,
        "bot_dimensions": {},
        "bot_id": u"localhost",
        "bot_version": u"abc",
        "children_task_ids": [],
        "completed_ts": complete_ts,
        "costs_usd": [0.0],
        "cost_saved_usd": None,
        "created_ts": self.now,
        "deduped_from": None,
        "durations": [],
        "exit_codes": [0],
        "failure": False,
        "id": "1d69b9f088008810",
        "internal_failure": False,
        "modified_ts": complete_ts,
        "name": u"Request name",
        "outputs_ref": None,
        "properties_hash": None,
        "server_versions": [u"v1a"],
        "started_ts": reap_ts,
        "state": task_result.State.COMPLETED,
        "tags": [u"priority:50", u"tag:1", u"user:Jesus"],
        "try_number": 1,
        "user": u"Jesus",
    }
    self.assertEqual(expected, result_summary.key.get().to_dict())
    self.assertEqual(["foo"], list(result_summary.get_outputs()))
    self.assertEqual(datetime.timedelta(seconds=2), result_summary.duration)
    self.assertEqual(
        datetime.timedelta(seconds=2),
        result_summary.duration_now(utils.utcnow()))
    self.assertEqual(datetime.timedelta(seconds=4), result_summary.pending)
    self.assertEqual(
        datetime.timedelta(seconds=4),
        result_summary.pending_now(utils.utcnow()))
    self.assertEqual(
        task_pack.pack_result_summary_key(result_summary.key),
        result_summary.task_id)
    self.assertEqual(complete_ts, result_summary.ended_ts)
    self.assertEqual(
        task_pack.pack_run_result_key(run_result.key), run_result.task_id)
    self.assertEqual(complete_ts, run_result.ended_ts)
def _handle_dead_bot(run_result_key):
  """Handles a TaskRunResult whose bot has stopped showing signs of life.

  Transactionally updates the entities depending on the state of this task.
  The task may be retried automatically, canceled or left alone.

  Returns:
    True if the task was retried, False if the task was killed, None if no
    action was taken.
  """
  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  request_future = request_key.get_async()
  now = utils.utcnow()
  server_version = utils.get_app_version()
  packed = task_pack.pack_run_result_key(run_result_key)
  request = request_future.get_result()
  to_run_key = task_to_run.request_to_task_to_run_key(request)

  def run():
    """Returns tuple(task_is_retried or None, bot_id)."""
    # Do one GET, one PUT at the end.
    run_result, result_summary, to_run = ndb.get_multi(
        (run_result_key, result_summary_key, to_run_key))
    if run_result.state != task_result.State.RUNNING:
      # It was already updated; the DB index is likely stale.
      return None, run_result.bot_id
    if run_result.modified_ts > now - task_result.BOT_PING_TOLERANCE:
      # The query index IS stale.
      return None, run_result.bot_id

    run_result.signal_server_version(server_version)
    run_result.modified_ts = now

    notify = False
    if result_summary.try_number != run_result.try_number:
      # Not updating the right run_result; cancel it without touching
      # result_summary.
      to_put = (run_result,)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      task_is_retried = None
    elif result_summary.try_number == 1 and now < request.expiration_ts:
      # Retry it.
      to_put = (run_result, result_summary, to_run)
      to_run.queue_number = task_to_run.gen_queue_number(request)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      # Do not sync data from run_result to result_summary, since the task is
      # being retried.
      result_summary.reset_to_pending()
      result_summary.modified_ts = now
      task_is_retried = True
    else:
      # Cancel it: there was more than one try, or the task expired in the
      # meantime.
      to_put = (run_result, result_summary)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      result_summary.set_from_run_result(run_result, request)
      notify = True
      task_is_retried = False

    futures = ndb.put_multi_async(to_put)
    if notify:
      _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()

    return task_is_retried, run_result.bot_id

  try:
    task_is_retried, bot_id = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    task_is_retried, bot_id = None, None
  if task_is_retried is not None:
    task_to_run.set_lookup_cache(to_run_key, task_is_retried)
    if not task_is_retried:
      stats.add_run_entry(
          'run_bot_died', run_result_key,
          bot_id=bot_id,
          dimensions=request.properties.dimensions,
          user=request.user)
    else:
      logging.info('Retried %s', packed)
  else:
    logging.info('Ignored %s', packed)
  return task_is_retried
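
# The retry-vs-kill choice in _handle_dead_bot() reduces to a small pure
# function; a sketch of just that decision under the same rules (illustrative
# names, not part of the module):
def _dead_bot_action(summary_try_number, run_try_number, now, expiration_ts):
  if summary_try_number != run_try_number:
    # Stale run: mark only the TaskRunResult BOT_DIED, leave the summary alone.
    return None
  if summary_try_number == 1 and now < expiration_ts:
    # First try and the request has not expired yet: re-enqueue the TaskToRun.
    return True
  # Already retried once, or the request expired in the meantime: kill it.
  return False
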
def test_integration(self):
  # Creates a TaskRequest, along with its TaskResultSummary and TaskToRun. Has
  # a bot reap the task, then complete it. Ensures the resulting
  # TaskResultSummary and TaskRunResult are properly updated.
  request = _gen_request()
  result_summary = task_result.new_result_summary(request)
  to_run = task_to_run.new_task_to_run(request, 1, 0)
  result_summary.modified_ts = utils.utcnow()
  ndb.transaction(lambda: ndb.put_multi([result_summary, to_run]))
  expected = self._gen_summary(modified_ts=self.now)
  self.assertEqual(expected, result_summary.to_dict())

  # Nothing changed 2 secs later except latency.
  self.mock_now(self.now, 2)
  self.assertEqual(expected, result_summary.to_dict())

  # Task is reaped after 2 seconds (4 secs total).
  reap_ts = self.now + datetime.timedelta(seconds=4)
  self.mock_now(reap_ts)
  to_run.queue_number = None
  to_run.put()
  run_result = task_result.new_run_result(
      request, to_run, u'localhost', u'abc', {})
  run_result.started_ts = utils.utcnow()
  run_result.modified_ts = run_result.started_ts
  ndb.transaction(
      lambda: result_summary.set_from_run_result(run_result, request))
  ndb.transaction(lambda: ndb.put_multi((result_summary, run_result)))
  expected = self._gen_summary(
      bot_dimensions={},
      bot_version=u'abc',
      bot_id=u'localhost',
      costs_usd=[0.],
      modified_ts=reap_ts,
      state=task_result.State.RUNNING,
      started_ts=reap_ts,
      try_number=1)
  self.assertEqual(expected, result_summary.key.get().to_dict())

  # Task completed after 2 more seconds (6 secs total); the task ran for 2
  # seconds.
  complete_ts = self.now + datetime.timedelta(seconds=6)
  self.mock_now(complete_ts)
  run_result.completed_ts = complete_ts
  run_result.duration = 0.1
  run_result.exit_code = 0
  run_result.state = task_result.State.COMPLETED
  run_result.modified_ts = utils.utcnow()
  task_result.PerformanceStats(
      key=task_pack.run_result_key_to_performance_stats_key(run_result.key),
      bot_overhead=0.1,
      isolated_download=task_result.OperationStats(
          duration=0.05,
          initial_number_items=10,
          initial_size=10000,
          items_cold=large.pack([1, 2]),
          items_hot=large.pack([3, 4, 5])),
      isolated_upload=task_result.OperationStats(
          duration=0.01,
          items_cold=large.pack([10]))).put()
  ndb.transaction(lambda: ndb.put_multi(run_result.append_output('foo', 0)))
  ndb.transaction(
      lambda: result_summary.set_from_run_result(run_result, request))
  ndb.transaction(lambda: ndb.put_multi((result_summary, run_result)))
  expected = self._gen_summary(
      bot_dimensions={},
      bot_version=u'abc',
      bot_id=u'localhost',
      completed_ts=complete_ts,
      costs_usd=[0.],
      duration=0.1,
      exit_code=0,
      modified_ts=complete_ts,
      state=task_result.State.COMPLETED,
      started_ts=reap_ts,
      try_number=1)
  self.assertEqual(expected, result_summary.key.get().to_dict())
  expected = {
    'bot_overhead': 0.1,
    'isolated_download': {
      'duration': 0.05,
      'initial_number_items': 10,
      'initial_size': 10000,
      'items_cold': large.pack([1, 2]),
      'items_hot': large.pack([3, 4, 5]),
      'num_items_cold': 2,
      'total_bytes_items_cold': 3,
      'num_items_hot': 3,
      'total_bytes_items_hot': 12,
    },
    'isolated_upload': {
      'duration': 0.01,
      'initial_number_items': None,
      'initial_size': None,
      'items_cold': large.pack([10]),
      'items_hot': None,
      'num_items_cold': 1,
      'total_bytes_items_cold': 10,
      'num_items_hot': None,
      'total_bytes_items_hot': None,
    },
    'package_installation': {
      'duration': None,
      'initial_number_items': None,
      'initial_size': None,
      'items_cold': None,
      'items_hot': None,
      'num_items_cold': None,
      'total_bytes_items_cold': None,
      'num_items_hot': None,
      'total_bytes_items_hot': None,
    },
  }
  self.assertEqual(expected, result_summary.performance_stats.to_dict())
  self.assertEqual('foo', result_summary.get_output())
  self.assertEqual(
      datetime.timedelta(seconds=2),
      result_summary.duration_as_seen_by_server)
  self.assertEqual(
      datetime.timedelta(seconds=0.1),
      result_summary.duration_now(utils.utcnow()))
  self.assertEqual(datetime.timedelta(seconds=4), result_summary.pending)
  self.assertEqual(
      datetime.timedelta(seconds=4),
      result_summary.pending_now(utils.utcnow()))
  self.assertEqual(
      task_pack.pack_result_summary_key(result_summary.key),
      result_summary.task_id)
  self.assertEqual(complete_ts, result_summary.ended_ts)
  self.assertEqual(
      task_pack.pack_run_result_key(run_result.key), run_result.task_id)
  self.assertEqual(complete_ts, run_result.ended_ts)
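
# The num_items_*/total_bytes_items_* values expected above follow directly
# from the packed size lists: the count is the length of the unpacked list and
# the total is its sum, e.g. large.pack([3, 4, 5]) -> 3 items, 12 bytes. A
# sketch, assuming large.unpack() is the inverse of large.pack():
def _expected_aggregates(packed_sizes):
  sizes = large.unpack(packed_sizes)
  return len(sizes), sum(sizes)
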
def bot_update_task(
    run_result_key, bot_id, output, output_chunk_start, exit_code, duration,
    hard_timeout, io_timeout, cost_usd, outputs_ref):
  """Updates a TaskRunResult and TaskResultSummary, along with TaskOutput.

  Arguments:
  - run_result_key: ndb.Key to TaskRunResult.
  - bot_id: Self-advertised bot id, to ensure it's the one expected.
  - output: Data to append to this command's output.
  - output_chunk_start: Index of output in the stdout stream.
  - exit_code: Mark that this command terminated.
  - duration: Time spent in seconds for this command.
  - hard_timeout: Bool set if a hard timeout occurred.
  - io_timeout: Bool set if an I/O timeout occurred.
  - cost_usd: Cost in $USD of this task up to now.
  - outputs_ref: Serialized FilesRef instance or None.

  Invalid states are flat out refused:
  - A command is updated after it already had an exit code assigned.

  Returns:
    tuple(bool, bool); the first is True if the update succeeded, the second is
    True if the task completed.
  """
  assert output_chunk_start is None or isinstance(output_chunk_start, int)
  assert output is None or isinstance(output, str)
  if cost_usd is not None and cost_usd < 0.:
    raise ValueError('cost_usd must be None or greater than or equal to 0')
  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  request_future = request_key.get_async()
  server_version = utils.get_app_version()
  packed = task_pack.pack_run_result_key(run_result_key)
  request = request_future.get_result()
  now = utils.utcnow()

  def run():
    # 2 consecutive GETs, one PUT.
    run_result_future = run_result_key.get_async()
    result_summary_future = result_summary_key.get_async()
    run_result = run_result_future.get_result()
    if not run_result:
      result_summary_future.wait()
      return None, False, 'is missing'

    if run_result.bot_id != bot_id:
      result_summary_future.wait()
      return None, False, 'expected bot (%s) but had update from bot %s' % (
          run_result.bot_id, bot_id)

    # This happens when the HTTP request is retried: the DB write succeeded but
    # the handler still returned HTTP 500.
    if len(run_result.exit_codes) and exit_code is not None:
      if run_result.exit_codes[0] != exit_code:
        result_summary_future.wait()
        return None, False, 'got 2 different exit_codes; %d then %d' % (
            run_result.exit_codes[0], exit_code)

    if (duration is None) != (exit_code is None):
      result_summary_future.wait()
      return None, False, (
          'had unexpected duration; expected iff a command completes; index %d'
          % len(run_result.exit_codes))

    if exit_code is not None:
      # The command completed.
      run_result.durations.append(duration)
      run_result.exit_codes.append(exit_code)

    if outputs_ref:
      run_result.outputs_ref = task_request.FilesRef(**outputs_ref)

    task_completed = len(run_result.exit_codes) == 1
    if run_result.state in task_result.State.STATES_RUNNING:
      if hard_timeout or io_timeout:
        run_result.state = task_result.State.TIMED_OUT
        run_result.completed_ts = now
      elif task_completed:
        run_result.state = task_result.State.COMPLETED
        run_result.completed_ts = now

    run_result.signal_server_version(server_version)
    to_put = [run_result]
    if output:
      # This does one multi-GET. It also modifies run_result in place.
      to_put.extend(
          run_result.append_output(0, output, output_chunk_start or 0))

    run_result.cost_usd = max(cost_usd, run_result.cost_usd or 0.)
    run_result.modified_ts = now

    result_summary = result_summary_future.get_result()
    if (result_summary.try_number and
        result_summary.try_number > run_result.try_number):
      # A bot running a previous try somehow reappeared and reported completion
      # after the task was retried; the result must keep showing the last try's
      # result. Only cost_usd is updated manually.
      result_summary.costs_usd[run_result.try_number-1] = run_result.cost_usd
      result_summary.modified_ts = now
    else:
      result_summary.set_from_run_result(run_result, request)

    to_put.append(result_summary)
    ndb.put_multi(to_put)
    return run_result, task_completed, None

  try:
    run_result, task_completed, error = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    # It is important that the caller correctly surfaces this error.
    return False, False
  if run_result:
    _update_stats(run_result, bot_id, request, task_completed)
  if error:
    logging.error('Task %s %s', packed, error)
  return True, task_completed
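
# In both variants of bot_update_task(), a resent final update carrying the
# same exit_code (an HTTP retry after a committed write) passes the idempotency
# guard instead of being refused; only a conflicting exit code is rejected. A
# sketch of just that guard (illustrative helper, not part of the module):
def _is_refused_exit_code(stored_exit_codes, exit_code):
  # Refuse only when a different exit code was already recorded.
  return bool(stored_exit_codes) and exit_code is not None and (
      stored_exit_codes[0] != exit_code)
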