def post(self):
    """Record an event reported by a bot and reply with an empty response.

    The processed request supplies the event type ('event') and an optional
    'message'. An event outside self.ALLOWED_EVENTS is logged and rejected
    through self.abort_with_error() with a HTTP 400.
    """
    res = self._process()
    payload = res.request
    event = payload.get('event')
    if event not in self.ALLOWED_EVENTS:
        logging.error('Unexpected event type')
        self.abort_with_error(400, error='Unsupported event type')
    message = payload.get('message')

    # Store the event as a BotEvent entity so it can be listed on the bot's
    # page.
    bot_management.bot_event(
        event_type=event,
        bot_id=res.bot_id,
        external_ip=self.request.remote_addr,
        authenticated_as=auth.get_peer_identity().to_bytes(),
        dimensions=res.dimensions,
        state=res.state,
        version=res.version,
        quarantined=bool(res.quarantined_msg),
        maintenance_msg=res.maintenance_msg,
        task_id=None,
        task_name=None,
        message=message)

    if event == 'bot_error':
        # Bot errors are also sent to ereporter2 so they show up in the
        # server's hourly ereporter2 report. THIS IS NOISY, so it is reserved
        # for issues requiring action. The report carries no context, hence
        # the bot's URL is included again; the bot id in it also makes
        # messages get bucketed by bot.
        hostname = app_identity.get_default_version_hostname()
        report = '%s\n\nhttps://%s/restricted/bot/%s' % (
            message, hostname, res.bot_id)
        ereporter2.log_request(self.request, source='bot', message=report)

    self.send_response({})
def post(self):
    """Record a bot event and acknowledge it with an empty response.

    Only 'bot_error', 'bot_rebooting' and 'bot_shutdown' are accepted; any
    other event type is rejected with a HTTP 400 via self.abort_with_error().
    A 'bot_error' is additionally reported to ereporter2.
    """
    processed = self._process()
    request, bot_id, version, state, dimensions, quarantined_msg = processed

    supported_events = ('bot_error', 'bot_rebooting', 'bot_shutdown')
    event = request.get('event')
    if event not in supported_events:
        self.abort_with_error(400, error='Unsupported event type')
    message = request.get('message')

    bot_management.bot_event(
        event_type=event,
        bot_id=bot_id,
        external_ip=self.request.remote_addr,
        dimensions=dimensions,
        state=state,
        version=version,
        quarantined=bool(quarantined_msg),
        task_id=None,
        task_name=None,
        message=message)

    if event == 'bot_error':
        # The report embeds the bot's page URL followed by the bot's message.
        hostname = app_identity.get_default_version_hostname()
        report = 'Bot: https://%s/restricted/bot/%s\nBot error:\n%s' % (
            hostname, bot_id, message)
        ereporter2.log_request(self.request, source='bot', message=report)

    self.send_response({})
def post(self, task_id=None):
    """Handle a task error reported by a bot and kill the task.

    The task id comes from the request body; the task_id argument is
    overwritten. The error is stored as a 'task_error' bot event and reported
    to ereporter2 before the request keys are validated. Unexpected keys, or a
    message returned by task_scheduler.bot_kill_task(), end the request with a
    HTTP 400 via self.abort_with_error().
    """
    body = self.parse_body()
    bot_id = body.get('id')
    task_id = body.get('task_id', '')
    message = body.get('message', 'unknown')

    bot_management.bot_event(
        event_type='task_error',
        bot_id=bot_id,
        external_ip=self.request.remote_addr,
        dimensions=None,
        state=None,
        version=None,
        quarantined=None,
        task_id=task_id,
        task_name=None,
        message=message)

    # The report links both the bot's page and the failed task's page.
    report_format = (
        'Bot: https://%s/restricted/bot/%s\n'
        'Task failed: https://%s/user/task/%s\n'
        '%s')
    report = report_format % (
        app_identity.get_default_version_hostname(), bot_id,
        app_identity.get_default_version_hostname(), task_id,
        message)
    ereporter2.log_request(self.request, source='bot', message=report)

    key_error = log_unexpected_keys(
        self.EXPECTED_KEYS, body, self.request, 'bot', 'keys')
    if key_error:
        self.abort_with_error(400, error=key_error)

    run_result_key = task_pack.unpack_run_result_key(task_id)
    kill_error = task_scheduler.bot_kill_task(run_result_key, bot_id)
    if kill_error:
        logging.error(kill_error)
        self.abort_with_error(400, error=kill_error)

    self.send_response({})
def post(self, task_id=None):
    """Record a bot-reported task error, report it, then kill the task.

    'id', 'task_id' and 'message' are read from the parsed request body (the
    task_id argument is replaced by the body's value). The event is stored and
    sent to ereporter2 first; only afterwards are the request keys checked and
    the task killed. Either failure is answered with a HTTP 400 through
    self.abort_with_error().
    """
    request = self.parse_body()
    bot_id = request.get('id')
    task_id = request.get('task_id', '')
    message = request.get('message', 'unknown')

    bot_management.bot_event(
        event_type='task_error', bot_id=bot_id,
        external_ip=self.request.remote_addr,
        dimensions=None, state=None, version=None, quarantined=None,
        task_id=task_id, task_name=None, message=message)

    # Report text: bot page URL, task page URL, then the bot's message.
    format_args = (
        app_identity.get_default_version_hostname(), bot_id,
        app_identity.get_default_version_hostname(), task_id,
        message)
    line = ('Bot: https://%s/restricted/bot/%s\n'
            'Task failed: https://%s/user/task/%s\n'
            '%s') % format_args
    ereporter2.log_request(self.request, source='bot', message=line)

    error = log_unexpected_keys(
        self.EXPECTED_KEYS, request, self.request, 'bot', 'keys')
    if error:
        self.abort_with_error(400, error=error)

    error = task_scheduler.bot_kill_task(
        task_pack.unpack_run_result_key(task_id), bot_id)
    if error:
        logging.error(error)
        self.abort_with_error(400, error=error)

    self.send_response({})
def post(self):
    """Store an event sent by a bot and answer with an empty response.

    The event must be "bot_error", "bot_rebooting" or "bot_shutdown";
    otherwise self.abort_with_error() is called with a HTTP 400. A
    "bot_error" is also forwarded to ereporter2 with a link to the bot's page.
    """
    (request, bot_id, version, state, dimensions,
     quarantined_msg) = self._process()

    event = request.get("event")
    is_supported = event in ("bot_error", "bot_rebooting", "bot_shutdown")
    if not is_supported:
        self.abort_with_error(400, error="Unsupported event type")
    message = request.get("message")

    bot_management.bot_event(
        event_type=event,
        bot_id=bot_id,
        external_ip=self.request.remote_addr,
        dimensions=dimensions,
        state=state,
        version=version,
        quarantined=bool(quarantined_msg),
        task_id=None,
        task_name=None,
        message=message,
    )

    if event == "bot_error":
        report_args = (
            app_identity.get_default_version_hostname(),
            bot_id,
            message,
        )
        report = "Bot: https://%s/restricted/bot/%s\nBot error:\n%s" % report_args
        ereporter2.log_request(self.request, source="bot", message=report)

    self.send_response({})
def log_unexpected_subset_keys(expected_keys, minimum_keys, actual_keys,
                               request, source, name):
    """Reports to ereporter2 when the keys fail the subset check.

    The check itself is delegated to has_unexpected_subset_keys(), which
    accepts optional keys: unexpected keys or missing required keys yield a
    message. This is important to catch typos.

    Returns:
      Whatever has_unexpected_subset_keys() returned; a truthy value is the
      error message that was also logged.
    """
    problem = has_unexpected_subset_keys(
        expected_keys, minimum_keys, actual_keys, name)
    if not problem:
        # Nothing to report; hand back the falsy result untouched.
        return problem
    ereporter2.log_request(request, source=source, message=problem)
    return problem
def post(self, task_id=None):
    """Authenticate the bot, record its task error, then kill the task.

    'id', 'task_id' and 'message' come from the parsed request body (the
    task_id argument is overwritten). The event is stored and reported to
    ereporter2 before the request keys are validated. Unexpected keys, or a
    message returned by task_scheduler.bot_kill_task(), are answered with a
    HTTP 400 through self.abort_with_error().
    """
    body = self.parse_body()
    bot_id = body.get('id')
    task_id = body.get('task_id', '')
    message = body.get('message', 'unknown')

    # The machine type is only known when a BotInfo entity exists for the bot.
    bot_info = bot_management.get_info_key(bot_id).get()
    machine_type = bot_info.machine_type if bot_info else None

    # Make sure bot self-reported ID matches the authentication token. Raises
    # auth.AuthorizationError if not.
    bot_auth.validate_bot_id_and_fetch_config(bot_id, machine_type)

    bot_management.bot_event(
        event_type='task_error',
        bot_id=bot_id,
        external_ip=self.request.remote_addr,
        authenticated_as=auth.get_peer_identity().to_bytes(),
        dimensions=None,
        state=None,
        version=None,
        quarantined=None,
        maintenance_msg=None,
        task_id=task_id,
        task_name=None,
        message=message)

    # Report text: bot page URL, task page URL, then the bot's message.
    report = (
        'Bot: https://%s/restricted/bot/%s\n'
        'Task failed: https://%s/user/task/%s\n'
        '%s') % (
            app_identity.get_default_version_hostname(), bot_id,
            app_identity.get_default_version_hostname(), task_id,
            message)
    ereporter2.log_request(self.request, source='bot', message=report)

    key_error = log_unexpected_keys(
        self.EXPECTED_KEYS, body, self.request, 'bot', 'keys')
    if key_error:
        self.abort_with_error(400, error=key_error)

    run_result_key = task_pack.unpack_run_result_key(task_id)
    kill_error = task_scheduler.bot_kill_task(run_result_key, bot_id)
    if kill_error:
        logging.error(kill_error)
        self.abort_with_error(400, error=kill_error)

    self.send_response({})
def post(self):
    """Record an error sent through the old API and reply with an empty dict.

    Unexpected request keys are logged but do not abort the request. A
    'bot_error' event is stored only when the request carries a bot id; the
    ereporter2 report is sent in every case.
    """
    request = self.parse_body()
    log_unexpected_keys(
        self.EXPECTED_KEYS, request, self.request, 'bot', 'keys')
    message = request.get('message', 'unknown')
    bot_id = request.get('id')

    if bot_id:
        bot_management.bot_event(
            event_type='bot_error',
            bot_id=bot_id,
            external_ip=self.request.remote_addr,
            dimensions=None,
            state=None,
            version=None,
            quarantined=None,
            task_id=None,
            task_name=None,
            message=message)

    # Also log an ereporter2 event unconditionally.
    hostname = app_identity.get_default_version_hostname()
    report = 'Bot: https://%s/restricted/bot/%s\nOld API error:\n%s' % (
        hostname, bot_id, message)
    ereporter2.log_request(self.request, source='bot', message=report)
    self.send_response({})
def post(self):
    """Persist a bot event and reply with an empty response.

    Accepted event types are 'bot_error', 'bot_rebooting' and 'bot_shutdown';
    anything else goes to self.abort_with_error() with a HTTP 400. Bot errors
    are also reported to ereporter2.
    """
    request, bot_id, version, state, dimensions, quarantined_msg = (
        self._process())

    event = request.get('event')
    if event not in ('bot_error', 'bot_rebooting', 'bot_shutdown'):
        self.abort_with_error(400, error='Unsupported event type')
    message = request.get('message')

    bot_management.bot_event(
        event_type=event, bot_id=bot_id,
        external_ip=self.request.remote_addr,
        dimensions=dimensions, state=state, version=version,
        quarantined=bool(quarantined_msg),
        task_id=None, task_name=None, message=message)

    if event == 'bot_error':
        report_format = (
            'Bot: https://%s/restricted/bot/%s\n'
            'Bot error:\n'
            '%s')
        report = report_format % (
            app_identity.get_default_version_hostname(), bot_id, message)
        ereporter2.log_request(self.request, source='bot', message=report)

    self.send_response({})
def _process(self):
    """Fetches bot info and settings, does authorization and quarantine checks.

    Returns:
      _ProcessResult instance, see its fields for more info.

    Raises:
      auth.AuthorizationError if bot's credentials are invalid.
    """
    request = self.parse_body()
    version = request.get('version', None)
    dimensions = request.get('dimensions') or {}
    state = request.get('state') or {}

    # The bot id is only accepted when reported as a one-element list holding
    # a unicode string.
    bot_id = None
    reported_id = dimensions.get('id')
    if reported_id:
        if (isinstance(reported_id, list) and len(reported_id) == 1 and
                isinstance(reported_id[0], unicode)):
            bot_id = reported_id[0]

    lease_expiration_ts = None
    machine_type = None
    if bot_id:
        logging.debug('Fetching bot info and settings')
        entity_keys = [
            bot_management.get_info_key(bot_id),
            bot_management.get_settings_key(bot_id),
        ]
        bot_info, bot_settings = ndb.get_multi(entity_keys)
        if bot_info:
            lease_expiration_ts = bot_info.lease_expiration_ts
            machine_type = bot_info.machine_type

    # Make sure bot self-reported ID matches the authentication token. Raises
    # auth.AuthorizationError if not.
    logging.debug('Fetching bot group config')
    bot_group_cfg = bot_auth.validate_bot_id_and_fetch_config(
        bot_id, machine_type)

    # Server side dimensions from bot_group_cfg override the bot-provided
    # ones: when both define a dimension, the server side config wins. A
    # warning is still emitted when the bot supplies a disagreeing value. This
    # may happen on the first poll after the server side config for a bot has
    # changed, as the bot doesn't know about the new server-assigned
    # dimensions yet. ['default'] is not reported either: the bot sends it in
    # the handshake before it knows anything at all.
    for dim_key, cfg_values in bot_group_cfg.dimensions.iteritems():
        from_bot = sorted(dimensions.get(dim_key) or [])
        from_cfg = sorted(cfg_values)
        if from_bot and from_bot != ['default'] and from_bot != from_cfg:
            logging.warning(
                'Dimensions in bots.cfg don\'t match ones provided by the bot\n'
                'bot_id: "%s", key: "%s", from_bot: %s, from_cfg: %s',
                bot_id, dim_key, from_bot, from_cfg)
        dimensions[dim_key] = from_cfg

    # Fill in all result fields except 'quarantined_msg'.
    result = _ProcessResult(
        request=request,
        bot_id=bot_id,
        version=version,
        state=state,
        dimensions=dimensions,
        bot_group_cfg=bot_group_cfg,
        lease_expiration_ts=lease_expiration_ts,
        maintenance_msg=state.get('maintenance'))

    # The bot may decide to "self-quarantine" itself. Accept both via
    # dimensions or via state. See bot_management._BotCommon.quarantined for
    # more details.
    if dimensions.get('quarantined') or state.get('quarantined'):
        result.quarantined_msg = 'Bot self-quarantined'
        return result

    def is_valid_dimension(key, values):
        # A dimension is valid when its key validates and its value is a list
        # of values that each validate.
        return (
            config.validate_dimension_key(key) and
            isinstance(values, list) and
            all(config.validate_dimension_value(value) for value in values))

    def find_defect():
        # Returns the message of the first failed check, or None when every
        # check passes. Checks run in order and stop at the first failure.
        found = has_unexpected_keys(self.EXPECTED_KEYS, request, 'keys')
        if found:
            return found
        found = has_missing_keys(self.REQUIRED_STATE_KEYS, state, 'state')
        if found:
            return found
        if not bot_id:
            return 'Missing bot id'
        if not dimensions.get('pool'):
            return 'Missing \'pool\' dimension'
        if not all(is_valid_dimension(key, values)
                   for key, values in dimensions.iteritems()):
            return 'Invalid dimensions type:\n%s' % json.dumps(
                dimensions, sort_keys=True, indent=2, separators=(',', ': '))
        return None

    quarantined_msg = find_defect()
    if quarantined_msg:
        line = 'Quarantined Bot\nhttps://%s/restricted/bot/%s\n%s' % (
            app_identity.get_default_version_hostname(), bot_id,
            quarantined_msg)
        ereporter2.log_request(self.request, source='bot', message=line)
        result.quarantined_msg = quarantined_msg
        return result

    # Look for admin enforced quarantine.
    if bool(bot_settings and bot_settings.quarantined):
        result.quarantined_msg = 'Quarantined by admin'
        return result

    # TODO(maruel): Parallelise.
    task_queues.assert_bot_async(dimensions).get_result()
    return result
def post(self, task_id=None):
    """Apply a task update sent by a bot and reply with {'ok': True}.

    The task id is read from the request body; the task_id argument is
    overwritten. Invalid keys, a ValueError raised while updating, or a failed
    update abort the request through self.abort_with_error().
    """
    # Unlike handshake and poll, we do not accept invalid keys here. This code
    # path is much more strict.
    request = self.parse_body()
    key_error = log_unexpected_subset_keys(
        self.ACCEPTED_KEYS, self.REQUIRED_KEYS, request, self.request, 'bot',
        'keys')
    if key_error:
        self.abort_with_error(400, error=key_error)

    # Required keys first, then the optional ones.
    bot_id = request['id']
    cost_usd = request['cost_usd']
    task_id = request['task_id']

    duration = request.get('duration')
    exit_code = request.get('exit_code')
    hard_timeout = request.get('hard_timeout')
    io_timeout = request.get('io_timeout')
    output = request.get('output')
    output_chunk_start = request.get('output_chunk_start')
    outputs_ref = request.get('outputs_ref')

    run_result_key = task_pack.unpack_run_result_key(task_id)

    if output is not None:
        try:
            output = base64.b64decode(output)
        except UnicodeEncodeError as err:
            logging.error('Failed to decode output\n%s\n%r', err, output)
            output = output.encode('ascii', 'replace')
        except TypeError as err:
            # Save the output as-is instead. The error will be logged in
            # ereporter2 and returning a HTTP 500 would only force the bot to
            # stay in a retry loop.
            logging.error('Failed to decode output\n%s\n%r', err, output)

    try:
        success, completed = task_scheduler.bot_update_task(
            run_result_key, bot_id, output, output_chunk_start, exit_code,
            duration, hard_timeout, io_timeout, cost_usd, outputs_ref)
        if not success:
            logging.info('Failed to update, please retry')
            self.abort_with_error(500, error='Failed to update, please retry')

        if completed:
            action = 'task_completed'
        else:
            action = 'task_update'
        bot_management.bot_event(
            event_type=action,
            bot_id=bot_id,
            external_ip=self.request.remote_addr,
            dimensions=None,
            state=None,
            version=None,
            quarantined=None,
            task_id=task_id,
            task_name=None)
    except ValueError as err:
        ereporter2.log_request(
            request=self.request,
            source='server',
            category='task_failure',
            message='Failed to update task: %s' % err)
        self.abort_with_error(400, error=str(err))
    except webob.exc.HTTPException:
        # Let HTTP errors raised inside the try block through unchanged.
        raise
    except Exception as err:
        logging.exception('Internal error: %s', err)
        self.abort_with_error(500, error=str(err))

    # TODO(maruel): When a task is canceled, reply with 'DIE' so that the bot
    # reboots itself to abort the task abruptly. It is useful when a task hangs
    # and the timeout was set too long or the task was superseded by a newer
    # task with more recent executable (e.g. a new Try Server job on a newer
    # patchset on Rietveld).
    self.send_response({'ok': True})
def post(self, task_id=None):
    """Process a bot's task update and answer {'ok': True} on success.

    'id', 'cost_usd' and 'task_id' are read as required keys of the request
    body (the task_id argument is replaced); the remaining fields are
    optional. Failures are reported through self.abort_with_error().
    """
    # Unlike handshake and poll, we do not accept invalid keys here. This code
    # path is much more strict.
    request = self.parse_body()
    msg = log_unexpected_subset_keys(
        self.ACCEPTED_KEYS, self.REQUIRED_KEYS, request, self.request,
        'bot', 'keys')
    if msg:
        self.abort_with_error(400, error=msg)

    bot_id = request['id']
    cost_usd = request['cost_usd']
    task_id = request['task_id']
    duration = request.get('duration')
    exit_code = request.get('exit_code')
    hard_timeout = request.get('hard_timeout')
    io_timeout = request.get('io_timeout')
    output = request.get('output')
    output_chunk_start = request.get('output_chunk_start')
    outputs_ref = request.get('outputs_ref')
    run_result_key = task_pack.unpack_run_result_key(task_id)

    if output is not None:
        try:
            output = base64.b64decode(output)
        except UnicodeEncodeError as exc:
            logging.error('Failed to decode output\n%s\n%r', exc, output)
            output = output.encode('ascii', 'replace')
        except TypeError as exc:
            # Save the output as-is instead. The error will be logged in
            # ereporter2 and returning a HTTP 500 would only force the bot to
            # stay in a retry loop.
            logging.error('Failed to decode output\n%s\n%r', exc, output)

    try:
        update_args = (
            run_result_key, bot_id, output, output_chunk_start, exit_code,
            duration, hard_timeout, io_timeout, cost_usd, outputs_ref)
        success, completed = task_scheduler.bot_update_task(*update_args)
        if not success:
            logging.info('Failed to update, please retry')
            self.abort_with_error(500, error='Failed to update, please retry')

        action = 'task_update'
        if completed:
            action = 'task_completed'
        bot_management.bot_event(
            event_type=action, bot_id=bot_id,
            external_ip=self.request.remote_addr,
            dimensions=None, state=None, version=None, quarantined=None,
            task_id=task_id, task_name=None)
    except ValueError as exc:
        ereporter2.log_request(
            request=self.request, source='server', category='task_failure',
            message='Failed to update task: %s' % exc)
        self.abort_with_error(400, error=str(exc))
    except webob.exc.HTTPException:
        # HTTP errors raised inside the try block are propagated untouched.
        raise
    except Exception as exc:
        logging.exception('Internal error: %s', exc)
        self.abort_with_error(500, error=str(exc))

    # TODO(maruel): When a task is canceled, reply with 'DIE' so that the bot
    # reboots itself to abort the task abruptly. It is useful when a task hangs
    # and the timeout was set too long or the task was superseded by a newer
    # task with more recent executable (e.g. a new Try Server job on a newer
    # patchset on Rietveld).
    self.send_response({'ok': True})
def _process(self):
    """Parses the request, authorizes the bot and runs the quarantine checks.

    Does one DB synchronous GET (the bot settings) when no earlier check
    quarantines the bot.

    Returns:
      _ProcessResult instance, see its fields for more info.

    Raises:
      auth.AuthorizationError if bot's credentials are invalid.
    """
    request = self.parse_body()
    version = request.get('version', None)
    dimensions = request.get('dimensions') or {}
    state = request.get('state') or {}

    # The bot id is only accepted when reported as a one-element list holding
    # a unicode string.
    bot_id = None
    reported_id = dimensions.get('id')
    if reported_id:
        if (isinstance(reported_id, list) and len(reported_id) == 1 and
                isinstance(reported_id[0], unicode)):
            bot_id = reported_id[0]

    # Make sure bot self-reported ID matches the authentication token. Raises
    # auth.AuthorizationError if not.
    bot_group_cfg = bot_auth.validate_bot_id_and_fetch_config(bot_id)

    # Server side dimensions from bot_group_cfg override the bot-provided
    # ones: when both define a dimension, the server side config wins. An
    # error is still emitted when the bot supplies a disagreeing value, except
    # for ['default'], which the bot sends in the handshake before it knows
    # anything at all.
    for dim_key, cfg_values in bot_group_cfg.dimensions.iteritems():
        from_bot = sorted(dimensions.get(dim_key) or [])
        from_cfg = sorted(cfg_values)
        if from_bot and from_bot != ['default'] and from_bot != from_cfg:
            logging.error(
                'Dimensions in bots.cfg doesn\'t match ones provided by the bot\n'
                'bot_id: "%s", key: "%s", from_bot: %s, from_cfg: %s',
                bot_id, dim_key, from_bot, from_cfg)
        dimensions[dim_key] = from_cfg

    # Fill in all result fields except 'quarantined_msg'.
    result = _ProcessResult(
        request=request,
        bot_id=bot_id,
        version=version,
        state=state,
        dimensions=dimensions,
        bot_group_cfg=bot_group_cfg)

    # The bot may decide to "self-quarantine" itself. Accept both via
    # dimensions or via state. See bot_management._BotCommon.quarantined for
    # more details.
    if dimensions.get('quarantined') or state.get('quarantined'):
        result.quarantined_msg = 'Bot self-quarantined'
        return result

    def is_well_formed(key, values):
        # A unicode key matching DIMENSION_KEY_RE mapped to a list of unicode
        # values.
        return (
            isinstance(key, unicode) and
            re.match(task_request.DIMENSION_KEY_RE, key) and
            isinstance(values, list) and
            all(isinstance(value, unicode) for value in values))

    def find_defect():
        # Returns the message of the first failed check, or None when every
        # check passes. Checks run in order and stop at the first failure.
        found = has_unexpected_keys(self.EXPECTED_KEYS, request, 'keys')
        if found:
            return found
        found = has_missing_keys(self.REQUIRED_STATE_KEYS, state, 'state')
        if found:
            return found
        if not bot_id:
            return 'Missing bot id'
        if not dimensions.get('pool'):
            return 'Missing \'pool\' dimension'
        if not all(is_well_formed(key, values)
                   for key, values in dimensions.iteritems()):
            return 'Invalid dimensions type:\n%s' % json.dumps(
                dimensions, sort_keys=True, indent=2, separators=(',', ': '))
        dimensions_count = task_to_run.dimensions_powerset_count(dimensions)
        if dimensions_count > task_to_run.MAX_DIMENSIONS:
            return 'Dimensions product %d is too high' % dimensions_count
        if not isinstance(
                state.get('lease_expiration_ts'), (None.__class__, int)):
            return (
                'lease_expiration_ts (%r) must be int or None' %
                (state['lease_expiration_ts']))
        return None

    quarantined_msg = find_defect()
    if quarantined_msg:
        line = 'Quarantined Bot\nhttps://%s/restricted/bot/%s\n%s' % (
            app_identity.get_default_version_hostname(), bot_id,
            quarantined_msg)
        ereporter2.log_request(self.request, source='bot', message=line)
        result.quarantined_msg = quarantined_msg
        return result

    # Look for admin enforced quarantine.
    bot_settings = bot_management.get_settings_key(bot_id).get()
    if bool(bot_settings and bot_settings.quarantined):
        result.quarantined_msg = 'Quarantined by admin'
    return result
def _process(self):
    """Parses the request and decides whether the bot must be quarantined.

    Does one DB synchronous GET (the bot settings) when no earlier check
    quarantines the bot.

    Returns:
      tuple(request, bot_id, version, state, dimensions, quarantined_msg).
      quarantined_msg is None when the bot is not quarantined, otherwise a
      string stating the reason.
    """
    request = self.parse_body()
    version = request.get('version', None)
    # 'or {}' also covers a key that is present with a null value; a plain
    # .get(key, {}) would return None in that case and the .get() calls below
    # would raise AttributeError.
    dimensions = request.get('dimensions') or {}
    state = request.get('state') or {}

    # The bot id is only accepted when reported as a one-element list holding
    # a unicode string.
    bot_id = None
    if dimensions.get('id'):
        dimension_id = dimensions['id']
        if (isinstance(dimension_id, list) and len(dimension_id) == 1 and
                isinstance(dimension_id[0], unicode)):
            bot_id = dimensions['id'][0]

    # The bot may decide to "self-quarantine" itself. Accept both via
    # dimensions or via state. See bot_management._BotCommon.quarantined for
    # more details.
    if (bool(dimensions.get('quarantined')) or
            bool(state.get('quarantined'))):
        return (request, bot_id, version, state, dimensions,
                'Bot self-quarantined')

    quarantined_msg = None
    # Use a dummy 'for' to be able to break early from the block.
    for _ in [0]:
        quarantined_msg = has_unexpected_keys(
            self.EXPECTED_KEYS, request, 'keys')
        if quarantined_msg:
            break

        quarantined_msg = has_missing_keys(
            self.REQUIRED_STATE_KEYS, state, 'state')
        if quarantined_msg:
            break

        if not bot_id:
            quarantined_msg = 'Missing bot id'
            break

        # Every dimension must be a unicode key mapped to a list of unicode
        # values.
        if not all(
                isinstance(key, unicode) and
                isinstance(values, list) and
                all(isinstance(value, unicode) for value in values)
                for key, values in dimensions.iteritems()):
            quarantined_msg = (
                'Invalid dimensions type:\n%s' % json.dumps(
                    dimensions, sort_keys=True, indent=2,
                    separators=(',', ': ')))
            break

        dimensions_count = task_to_run.dimensions_powerset_count(dimensions)
        if dimensions_count > task_to_run.MAX_DIMENSIONS:
            quarantined_msg = (
                'Dimensions product %d is too high' % dimensions_count)
            break

    if quarantined_msg:
        line = 'Quarantined Bot\nhttps://%s/restricted/bot/%s\n%s' % (
            app_identity.get_default_version_hostname(), bot_id,
            quarantined_msg)
        ereporter2.log_request(self.request, source='bot', message=line)
        return request, bot_id, version, state, dimensions, quarantined_msg

    # Look for admin enforced quarantine.
    bot_settings = bot_management.get_settings_key(bot_id).get()
    if bool(bot_settings and bot_settings.quarantined):
        return (request, bot_id, version, state, dimensions,
                'Quarantined by admin')

    return request, bot_id, version, state, dimensions, None
def _process(self):
    """Parses the request and decides whether the bot must be quarantined.

    Does one DB synchronous GET (the bot settings) when no earlier check
    quarantines the bot.

    Returns:
      tuple(request, bot_id, version, state, dimensions, quarantined_msg).
      quarantined_msg is None when the bot is not quarantined, otherwise a
      string stating the reason.
    """
    request = self.parse_body()
    version = request.get("version", None)
    # 'or {}' also covers a key that is present with a null value; a plain
    # .get(key, {}) would return None in that case and the .get() calls below
    # would raise AttributeError.
    dimensions = request.get("dimensions") or {}
    state = request.get("state") or {}

    # The bot id is only accepted when reported as a one-element list holding
    # a unicode string.
    bot_id = None
    if dimensions.get("id"):
        dimension_id = dimensions["id"]
        if (
            isinstance(dimension_id, list)
            and len(dimension_id) == 1
            and isinstance(dimension_id[0], unicode)
        ):
            bot_id = dimensions["id"][0]

    # The bot may decide to "self-quarantine" itself. Accept both via
    # dimensions or via state. See bot_management._BotCommon.quarantined for
    # more details.
    if bool(dimensions.get("quarantined")) or bool(state.get("quarantined")):
        return (
            request, bot_id, version, state, dimensions,
            "Bot self-quarantined",
        )

    quarantined_msg = None
    # Use a dummy 'for' to be able to break early from the block.
    for _ in [0]:
        quarantined_msg = has_unexpected_keys(
            self.EXPECTED_KEYS, request, "keys")
        if quarantined_msg:
            break

        quarantined_msg = has_missing_keys(
            self.REQUIRED_STATE_KEYS, state, "state")
        if quarantined_msg:
            break

        if not bot_id:
            quarantined_msg = "Missing bot id"
            break

        # Every dimension must be a unicode key mapped to a list of unicode
        # values.
        if not all(
            isinstance(key, unicode)
            and isinstance(values, list)
            and all(isinstance(value, unicode) for value in values)
            for key, values in dimensions.iteritems()
        ):
            quarantined_msg = "Invalid dimensions type:\n%s" % json.dumps(
                dimensions, sort_keys=True, indent=2, separators=(",", ": ")
            )
            break

        dimensions_count = task_to_run.dimensions_powerset_count(dimensions)
        if dimensions_count > task_to_run.MAX_DIMENSIONS:
            quarantined_msg = (
                "Dimensions product %d is too high" % dimensions_count)
            break

    if quarantined_msg:
        line = "Quarantined Bot\nhttps://%s/restricted/bot/%s\n%s" % (
            app_identity.get_default_version_hostname(),
            bot_id,
            quarantined_msg,
        )
        ereporter2.log_request(self.request, source="bot", message=line)
        return request, bot_id, version, state, dimensions, quarantined_msg

    # Look for admin enforced quarantine.
    bot_settings = bot_management.get_settings_key(bot_id).get()
    if bool(bot_settings and bot_settings.quarantined):
        return (
            request, bot_id, version, state, dimensions,
            "Quarantined by admin",
        )

    return request, bot_id, version, state, dimensions, None
def post(self, task_id=None):
    """Apply a task update sent by a bot and reply with {"ok": True}.

    "id", "cost_usd" and "task_id" are read as required keys of the request
    body (the task_id argument is overwritten); the remaining fields are
    optional.

    Failure handling, all through self.abort_with_error():
      - invalid request keys: HTTP 400;
      - ValueError raised while updating: reported to ereporter2, HTTP 400;
      - any other exception raised while updating: logged, HTTP 500;
      - the update reported as unsuccessful: HTTP 500 asking for a retry.
    """
    # Unlike handshake and poll, we do not accept invalid keys here. This code
    # path is much more strict.
    request = self.parse_body()
    msg = log_unexpected_subset_keys(
        self.ACCEPTED_KEYS, self.REQUIRED_KEYS, request, self.request,
        "bot", "keys")
    if msg:
        self.abort_with_error(400, error=msg)

    bot_id = request["id"]
    cost_usd = request["cost_usd"]
    task_id = request["task_id"]

    duration = request.get("duration")
    exit_code = request.get("exit_code")
    hard_timeout = request.get("hard_timeout")
    io_timeout = request.get("io_timeout")
    output = request.get("output")
    output_chunk_start = request.get("output_chunk_start")
    outputs_ref = request.get("outputs_ref")

    run_result_key = task_pack.unpack_run_result_key(task_id)

    if output is not None:
        try:
            output = base64.b64decode(output)
        except UnicodeEncodeError as e:
            logging.error("Failed to decode output\n%s\n%r", e, output)
            output = output.encode("ascii", "replace")
        except TypeError as e:
            # Save the output as-is instead. The error will be logged in
            # ereporter2 and returning a HTTP 500 would only force the bot to
            # stay in a retry loop.
            logging.error("Failed to decode output\n%s\n%r", e, output)

    success = False
    try:
        success, completed = task_scheduler.bot_update_task(
            run_result_key,
            bot_id,
            output,
            output_chunk_start,
            exit_code,
            duration,
            hard_timeout,
            io_timeout,
            cost_usd,
            outputs_ref,
        )
        if success:
            action = "task_completed" if completed else "task_update"
            bot_management.bot_event(
                event_type=action,
                bot_id=bot_id,
                external_ip=self.request.remote_addr,
                dimensions=None,
                state=None,
                version=None,
                quarantined=None,
                task_id=task_id,
                task_name=None,
            )
    except ValueError as e:
        ereporter2.log_request(
            request=self.request,
            source="server",
            category="task_failure",
            message="Failed to update task: %s" % e,
        )
        self.abort_with_error(400, error=str(e))
    except Exception as e:
        # Keep a stack trace in the logs; the response only carries str(e).
        logging.exception("Internal error: %s", e)
        self.abort_with_error(500, error=str(e))

    if not success:
        # This abort is issued outside the try block on purpose.
        # abort_with_error() is expected to raise (NOTE(review): confirm
        # against the handler base class); raised inside the try it would be
        # caught by the generic 'except Exception' handler above, which would
        # replace this retry message with the exception's own text.
        logging.info("Failed to update, please retry")
        self.abort_with_error(500, error="Failed to update, please retry")

    # TODO(maruel): When a task is canceled, reply with 'DIE' so that the bot
    # reboots itself to abort the task abruptly. It is useful when a task hangs
    # and the timeout was set too long or the task was superseded by a newer
    # task with more recent executable (e.g. a new Try Server job on a newer
    # patchset on Rietveld).
    self.send_response({"ok": True})
def post(self, task_id=None):
    """Authenticate the bot, apply its task update and report the outcome.

    The task id is read from the request body; the task_id argument is
    overwritten. The response holds 'ok': True and 'must_stop', which is True
    when the state returned by the update is task_result.State.KILLED.
    Failures are reported through self.abort_with_error().
    """
    # Unlike handshake and poll, we do not accept invalid keys here. This code
    # path is much more strict.
    request = self.parse_body()
    key_error = log_unexpected_subset_keys(
        self.ACCEPTED_KEYS, self.REQUIRED_KEYS, request, self.request, 'bot',
        'keys')
    if key_error:
        self.abort_with_error(400, error=key_error)

    bot_id = request['id']
    task_id = request['task_id']

    # The machine type is only known when a BotInfo entity exists for the bot.
    machine_type = None
    bot_info = bot_management.get_info_key(bot_id).get()
    if bot_info:
        machine_type = bot_info.machine_type

    # Make sure bot self-reported ID matches the authentication token. Raises
    # auth.AuthorizationError if not.
    bot_auth.validate_bot_id_and_fetch_config(bot_id, machine_type)

    bot_overhead = request.get('bot_overhead')
    cipd_pins = request.get('cipd_pins')
    cipd_stats = request.get('cipd_stats')
    cost_usd = request.get('cost_usd', 0)
    duration = request.get('duration')
    exit_code = request.get('exit_code')
    hard_timeout = request.get('hard_timeout')
    io_timeout = request.get('io_timeout')
    isolated_stats = request.get('isolated_stats')
    output = request.get('output')
    output_chunk_start = request.get('output_chunk_start')
    outputs_ref = request.get('outputs_ref')

    # Stats without a bot overhead are rejected.
    if (isolated_stats or cipd_stats) and bot_overhead is None:
        ereporter2.log_request(
            request=self.request,
            source='server',
            category='task_failure',
            message='Failed to update task: %s' % task_id)
        detail = (
            'isolated_stats and cipd_stats require bot_overhead to be set'
            '\nbot_overhead: %s\nisolate_stats: %s') % (
                bot_overhead, isolated_stats)
        self.abort_with_error(400, error=detail)

    run_result_key = task_pack.unpack_run_result_key(task_id)

    def unpack_base64(d, k):
        # Decodes d[k] when it is present and truthy; None otherwise.
        encoded = d.get(k)
        return base64.b64decode(encoded) if encoded else None

    performance_stats = None
    if bot_overhead is not None:
        performance_stats = task_result.PerformanceStats(
            bot_overhead=bot_overhead)
        if isolated_stats:
            download = isolated_stats.get('download') or {}
            upload = isolated_stats.get('upload') or {}
            performance_stats.isolated_download = task_result.OperationStats(
                duration=download.get('duration'),
                initial_number_items=download.get('initial_number_items'),
                initial_size=download.get('initial_size'),
                items_cold=unpack_base64(download, 'items_cold'),
                items_hot=unpack_base64(download, 'items_hot'))
            performance_stats.isolated_upload = task_result.OperationStats(
                duration=upload.get('duration'),
                items_cold=unpack_base64(upload, 'items_cold'),
                items_hot=unpack_base64(upload, 'items_hot'))
        if cipd_stats:
            performance_stats.package_installation = (
                task_result.OperationStats(
                    duration=cipd_stats.get('duration')))

    if output is not None:
        try:
            output = base64.b64decode(output)
        except UnicodeEncodeError as err:
            logging.error('Failed to decode output\n%s\n%r', err, output)
            output = output.encode('ascii', 'replace')
        except TypeError as err:
            # Save the output as-is instead. The error will be logged in
            # ereporter2 and returning a HTTP 500 would only force the bot to
            # stay in a retry loop.
            logging.error('Failed to decode output\n%s\n%r', err, output)

    if outputs_ref:
        outputs_ref = task_request.FilesRef(**outputs_ref)

    if cipd_pins:
        client_package = task_request.CipdPackage(
            **cipd_pins['client_package'])
        packages = [
            task_request.CipdPackage(**args) for args in cipd_pins['packages']
        ]
        cipd_pins = task_result.CipdPins(
            client_package=client_package, packages=packages)

    try:
        state = task_scheduler.bot_update_task(
            run_result_key=run_result_key,
            bot_id=bot_id,
            output=output,
            output_chunk_start=output_chunk_start,
            exit_code=exit_code,
            duration=duration,
            hard_timeout=hard_timeout,
            io_timeout=io_timeout,
            cost_usd=cost_usd,
            outputs_ref=outputs_ref,
            cipd_pins=cipd_pins,
            performance_stats=performance_stats)
        if not state:
            logging.info('Failed to update, please retry')
            self.abort_with_error(500, error='Failed to update, please retry')

        # Map the resulting task state to the bot event type to record.
        finished_states = (
            task_result.State.COMPLETED, task_result.State.TIMED_OUT)
        if state in finished_states:
            action = 'task_completed'
        elif state == task_result.State.KILLED:
            action = 'task_killed'
        else:
            assert state in (
                task_result.State.BOT_DIED, task_result.State.RUNNING), state
            action = 'task_update'

        bot_management.bot_event(
            event_type=action,
            bot_id=bot_id,
            external_ip=self.request.remote_addr,
            authenticated_as=auth.get_peer_identity().to_bytes(),
            dimensions=None,
            state=None,
            version=None,
            quarantined=None,
            maintenance_msg=None,
            task_id=task_id,
            task_name=None)
    except ValueError as err:
        ereporter2.log_request(
            request=self.request,
            source='server',
            category='task_failure',
            message='Failed to update task: %s' % err)
        self.abort_with_error(400, error=str(err))
    except webob.exc.HTTPException:
        # Let HTTP errors raised inside the try block through unchanged.
        raise
    except Exception as err:
        logging.exception('Internal error: %s', err)
        self.abort_with_error(500, error=str(err))

    response = {
        'must_stop': state == task_result.State.KILLED,
        'ok': True,
    }
    self.send_response(response)
def post(self, task_id=None):
    """Apply a bot's task update, including optional isolated stats.

    The task id is read from the request body; the task_id argument is
    overwritten. 'bot_overhead' and 'isolated_stats' must be supplied
    together. Failures are reported through self.abort_with_error(); success
    is answered with {'ok': True}.
    """
    # Unlike handshake and poll, we do not accept invalid keys here. This code
    # path is much more strict.
    request = self.parse_body()
    key_error = log_unexpected_subset_keys(
        self.ACCEPTED_KEYS, self.REQUIRED_KEYS, request, self.request, 'bot',
        'keys')
    if key_error:
        self.abort_with_error(400, error=key_error)

    bot_id = request['id']
    cost_usd = request['cost_usd']
    task_id = request['task_id']

    bot_overhead = request.get('bot_overhead')
    duration = request.get('duration')
    exit_code = request.get('exit_code')
    hard_timeout = request.get('hard_timeout')
    io_timeout = request.get('io_timeout')
    isolated_stats = request.get('isolated_stats')
    output = request.get('output')
    output_chunk_start = request.get('output_chunk_start')
    outputs_ref = request.get('outputs_ref')

    # Either both bot_overhead and isolated_stats are provided, or neither.
    has_stats = bool(isolated_stats)
    has_overhead = bot_overhead is not None
    if has_stats != has_overhead:
        ereporter2.log_request(
            request=self.request,
            source='server',
            category='task_failure',
            message='Failed to update task: %s' % task_id)
        detail = (
            'Both bot_overhead and isolated_stats must be set '
            'simultaneously\nbot_overhead: %s\nisolated_stats: %s') % (
                bot_overhead, isolated_stats)
        self.abort_with_error(400, error=detail)

    run_result_key = task_pack.unpack_run_result_key(task_id)

    performance_stats = None
    if isolated_stats:
        download = isolated_stats['download']
        upload = isolated_stats['upload']
        download_op = task_result.IsolatedOperation(
            duration=download['duration'],
            initial_number_items=download['initial_number_items'],
            initial_size=download['initial_size'],
            items_cold=base64.b64decode(download['items_cold']),
            items_hot=base64.b64decode(download['items_hot']))
        upload_op = task_result.IsolatedOperation(
            duration=upload['duration'],
            items_cold=base64.b64decode(upload['items_cold']),
            items_hot=base64.b64decode(upload['items_hot']))
        performance_stats = task_result.PerformanceStats(
            bot_overhead=bot_overhead,
            isolated_download=download_op,
            isolated_upload=upload_op)

    if output is not None:
        try:
            output = base64.b64decode(output)
        except UnicodeEncodeError as err:
            logging.error('Failed to decode output\n%s\n%r', err, output)
            output = output.encode('ascii', 'replace')
        except TypeError as err:
            # Save the output as-is instead. The error will be logged in
            # ereporter2 and returning a HTTP 500 would only force the bot to
            # stay in a retry loop.
            logging.error('Failed to decode output\n%s\n%r', err, output)

    if outputs_ref:
        outputs_ref = task_request.FilesRef(**outputs_ref)

    try:
        state = task_scheduler.bot_update_task(
            run_result_key=run_result_key,
            bot_id=bot_id,
            output=output,
            output_chunk_start=output_chunk_start,
            exit_code=exit_code,
            duration=duration,
            hard_timeout=hard_timeout,
            io_timeout=io_timeout,
            cost_usd=cost_usd,
            outputs_ref=outputs_ref,
            performance_stats=performance_stats)
        if not state:
            logging.info('Failed to update, please retry')
            self.abort_with_error(500, error='Failed to update, please retry')

        # Map the resulting task state to the bot event type to record.
        finished_states = (
            task_result.State.COMPLETED, task_result.State.TIMED_OUT)
        if state in finished_states:
            action = 'task_completed'
        else:
            assert state in (
                task_result.State.BOT_DIED, task_result.State.RUNNING), state
            action = 'task_update'

        bot_management.bot_event(
            event_type=action,
            bot_id=bot_id,
            external_ip=self.request.remote_addr,
            dimensions=None,
            state=None,
            version=None,
            quarantined=None,
            task_id=task_id,
            task_name=None)
    except ValueError as err:
        ereporter2.log_request(
            request=self.request,
            source='server',
            category='task_failure',
            message='Failed to update task: %s' % err)
        self.abort_with_error(400, error=str(err))
    except webob.exc.HTTPException:
        # Let HTTP errors raised inside the try block through unchanged.
        raise
    except Exception as err:
        logging.exception('Internal error: %s', err)
        self.abort_with_error(500, error=str(err))

    # TODO(maruel): When a task is canceled, reply with 'DIE' so that the bot
    # reboots itself to abort the task abruptly. It is useful when a task hangs
    # and the timeout was set too long or the task was superseded by a newer
    # task with more recent executable (e.g. a new Try Server job on a newer
    # patchset on Rietveld).
    self.send_response({'ok': True})