def test_task_idempotent_variable(self):
  """Checks dedup picks the newest task once the reuse window is widened."""
  # Edge case: GlobalConfig.reusable_task_age_secs is modified between tasks.
  # This ensures TaskResultSummary.order(TRS.key) works.
  self.mock(random, 'getrandbits', lambda _: 0x88)
  settings = config.settings()
  settings.reusable_task_age_secs = 10
  settings.store()

  # The first task is idempotent.
  self._task_ran_successfully()

  # When the second task is scheduled, the first one is already too old to be
  # reused, so the second one really runs.
  ts_second = self.mock_now(self.now, 10)
  reused_id = self._task_ran_successfully()

  # Widen the window: now either of the two tasks could be reused. The most
  # recent one must be picked.
  settings = config.settings()
  settings.reusable_task_age_secs = 100
  settings.store()

  # The third task is deduped against the second one, which proves the
  # ordering is correct.
  ts_third = self.mock_now(self.now, 20)
  self._task_deduped(ts_third, reused_id, '1d69ba3ea8008810', ts_second)
def test_has_capacity_BotEvent(self):
  """Checks has_capacity() falls back on recent BotEvent entities."""
  # Disable the memcache code path to confirm the DB based behavior.
  self.mock(task_queues, 'probably_has_capacity', lambda *_: None)

  botid = 'id1'
  plain_dims = {u'pool': [u'default'], u'os': [u'Ubuntu-16.04']}
  or_dims = {
      u'pool': [u'default'],
      u'os': [u'Ubuntu-14.04|Ubuntu-16.04'],
  }

  def assert_capacity(expected):
    # Both the plain and the OR-ed dimensions must give the same answer.
    for dims in (plain_dims, or_dims):
      self.assertEqual(expected, bot_management.has_capacity(dims))

  _bot_event(
      event_type='request_sleep',
      dimensions={
          'id': [botid],
          'pool': ['default'],
          'os': ['Ubuntu', 'Ubuntu-16.04'],
      })
  self.assertEqual(True, bot_management.has_capacity(plain_dims))

  # Delete the BotInfo, so the bot will disappear.
  bot_management.get_info_key(botid).delete()

  # The capacity is still found due to a recent BotEvent with this dimension.
  assert_capacity(True)

  # One second before the death timeout the event still counts.
  self.mock_now(self.now, config.settings().bot_death_timeout_secs - 1)
  assert_capacity(True)

  # Exactly at the death timeout the event is too old.
  self.mock_now(self.now, config.settings().bot_death_timeout_secs)
  assert_capacity(False)
def apply_server_property_defaults(properties):
  """Fills ndb task properties with default values read from server settings.

  Args:
    properties: task properties object, mutated in place. `inputs_ref` is
        created if missing when isolate defaults are configured; `cipd_input`
        is only completed when it is already set.

  Returns:
    None.
  """
  # Read the settings once; nothing below modifies them.
  cfg = config.settings()
  if not cfg:
    return

  # Isolate defaults only apply when both the server and the namespace are
  # configured.
  if cfg.isolate.default_server and cfg.isolate.default_namespace:
    properties.inputs_ref = properties.inputs_ref or task_request.FilesRef()
    properties.inputs_ref.isolatedserver = (
        properties.inputs_ref.isolatedserver or cfg.isolate.default_server)
    properties.inputs_ref.namespace = (
        properties.inputs_ref.namespace or cfg.isolate.default_namespace)

  # CIPD defaults: values already present on the request always win.
  if cfg.HasField('cipd') and properties.cipd_input:
    properties.cipd_input.server = (
        properties.cipd_input.server or cfg.cipd.default_server)
    properties.cipd_input.client_package = (
        properties.cipd_input.client_package or task_request.CipdPackage())
    properties.cipd_input.client_package.package_name = (
        properties.cipd_input.client_package.package_name or
        cfg.cipd.default_client_package.package_name)
    properties.cipd_input.client_package.version = (
        properties.cipd_input.client_package.version or
        cfg.cipd.default_client_package.version)
def get(self):
  """Syncs the Machine Provider server URL, then manages lease entities.

  Does nothing besides logging when MP support is disabled in the settings.
  """
  # Cache policy returning False for every key: nothing is kept in the ndb
  # in-context cache for this request.
  ndb.get_context().set_cache_policy(lambda _: False)

  # Read the MP settings once so the 'enabled' flag and the server URL used
  # below come from the same snapshot.
  mp_cfg = config.settings().mp
  if not mp_cfg.enabled:
    logging.info('MP support is disabled')
    return

  new_server = mp_cfg.server
  if new_server:
    current_config = machine_provider.MachineProviderConfiguration().cached()
    # Only write when the configured URL actually changed.
    if new_server != current_config.instance_url:
      logging.info('Updating Machine Provider server to %s', new_server)
      current_config.modify(updated_by='', instance_url=new_server)

  lease_management.ensure_entities_exist()
  lease_management.drain_excess()
def test_config_conflict(self):
  """Posting the config form with a stale keyid must be rejected."""
  self.set_as_admin()
  # Load the form first; the response body is not inspected.
  self.app.get('/restricted/config')
  # TODO(maruel): Use beautifulsoup?
  # A keyid one less than the stored one simulates an out-of-date form.
  stale_keyid = str(config.settings().key.integer_id() - 1)
  form = {
      'bot_death_timeout_secs': 10 * 60,
      'google_analytics': 'foobar',
      'keyid': stale_keyid,
      'reusable_task_age_secs': 30,
      'xsrf_token': self.get_xsrf_token(),
  }
  self.assertEqual('', config.settings().google_analytics)
  response = self.app.post('/restricted/config', form)
  self.assertIn('Update conflict', response)
  # The stored value must be untouched.
  self.assertEqual('', config.settings().google_analytics)
def test_config_conflict(self):
  """A config update carrying an outdated keyid reports a conflict."""
  self.set_as_admin()
  # Fetch the page once before posting; its content is not used.
  self.app.get('/restricted/config')
  # TODO(maruel): Use beautifulsoup?
  current_id = config.settings().key.integer_id()
  payload = {
      'bot_death_timeout_secs': 10 * 60,
      'google_analytics': 'foobar',
      # Deliberately off by one so the handler sees a mismatch.
      'keyid': str(current_id - 1),
      'reusable_task_age_secs': 30,
      'xsrf_token': self.get_xsrf_token(),
  }
  self.assertEqual('', config.settings().google_analytics)
  result = self.app.post('/restricted/config', payload)
  self.assertIn('Update conflict', result)
  # Nothing was saved.
  self.assertEqual('', config.settings().google_analytics)
def get_bot_version(host):
  """Retrieves the current bot version (SHA256) loaded on this server.

  Memcache is consulted first; on a miss the version is computed and stored
  in memcache for 60 seconds.

  Returns:
    tuple(hash of the current bot version, dict of additional files). The
    second item is None when the version came from memcache.
  """
  cache_key = 'version-' + _get_signature(host)
  cached = memcache.get(cache_key, namespace='bot_code')
  if cached:
    return cached, None

  # Cache miss: the version has to be calculated.
  additionals = {'config/bot_config.py': get_bot_config().content}
  bot_dir = os.path.join(ROOT_DIR, 'swarming_bot')
  computed = bot_archive.get_swarming_bot_version(
      bot_dir, host, utils.get_app_version(), additionals,
      local_config.settings())
  memcache.set(cache_key, computed, namespace='bot_code', time=60)
  return computed, additionals
def render(name, params=None):
  """Shorthand to render a template.

  'google_analytics' from the server settings is always provided to the
  template; entries in `params` override it on key collision.
  """
  context = {'google_analytics': config.settings().google_analytics}
  if params:
    context.update(params)
  return template.render(name, context)
def test_list_ok(self):
  """Asserts that BotInfo is returned for the appropriate set of bots."""
  self.set_as_privileged_user()
  now = datetime.datetime(2010, 1, 2, 3, 4, 5, 6)
  now_str = unicode(now.strftime(self.DATETIME_FORMAT))
  self.mock_now(now)

  # Register a single bot.
  bot_management.bot_event(
      event_type='bot_connected',
      bot_id='id1',
      external_ip='8.8.4.4',
      dimensions={'foo': ['bar'], 'id': ['id1']},
      state={'ram': 65},
      version='123456789',
      quarantined=False,
      task_id=None,
      task_name=None)

  bot_item = {
      u'bot_id': u'id1',
      u'dimensions': [
          {u'key': u'foo', u'value': [u'bar']},
          {u'key': u'id', u'value': [u'id1']},
      ],
      u'external_ip': u'8.8.4.4',
      u'first_seen_ts': now_str,
      u'is_dead': False,
      u'last_seen_ts': now_str,
      u'quarantined': False,
      u'version': u'123456789',
  }
  expected = {
      u'items': [bot_item],
      u'death_timeout': unicode(config.settings().bot_death_timeout_secs),
      u'now': now_str,
  }

  body = message_to_dict(swarming_rpcs.BotsRequest())
  response = self.call_api('list', body=body)
  self.assertEqual(expected, response.json)
def get_content_security_policy(self):
  """Returns the parent CSP extended with the allowed 'child-src' origins."""
  # Pages at display_server_url_template are shown in iframes, so they have to
  # be allowed in the CSP.
  csp = super(UIHandler, self).get_content_security_policy()
  child_src = csp['child-src']

  tmpl = config.settings().display_server_url_template
  if tmpl:
    if tmpl.startswith('/'):
      allowed = "'self'"
    else:
      # The template is assumed to carry '%s' in its last path component.
      # Dropping that component gives a "parent" path for the CSP, which is
      # narrower than whitelisting the whole display server domain.
      allowed = tmpl[:tmpl.rfind('/') + 1]
    child_src.append(allowed)

  child_src.extend(config.settings().extra_child_src_csp_url)
  return csp
def _find_dupe_task(now, h):
  """Finds a previously run task that is also idempotent and completed.

  Fetches TaskResultSummary entities whose properties_hash equals `h`, ordered
  by key.

  "task_result.TaskResultSummary.created_ts > oldest" is intentionally not
  part of the query because it would require a composite index. Per the
  original note, TaskRequest.key is equivalent to decreasing
  TaskRequest.created_ts, so ordering by key works as well without one.

  Args:
    now: current time; used to compute the oldest acceptable created_ts.
    h: properties hash to look up.

  Returns:
    The matching TaskResultSummary, or None if there is no usable one.
  """
  # TODO(maruel): Make a reverse map on successful task completion so this
  # becomes a simple ndb.get().
  summary_cls = task_result.TaskResultSummary
  query = summary_cls.query(summary_cls.properties_hash == h).order(
      summary_cls.key)
  for index, candidate in enumerate(query.iter(batch_size=1)):
    usable = (
        candidate.state == task_result.State.COMPLETED and
        not candidate.failure)
    if not usable:
      # The query may return stale items. After the third unusable one in a
      # row the index is considered too inconsistent; give up.
      if index == 2:
        return None
      continue

    # Refuse tasks that are too old. This is due to the isolate server
    # dropping files.
    # TODO(maruel): The value should be calculated from the isolate server
    # setting and be unbounded when no isolated input was used.
    max_age = datetime.timedelta(
        seconds=config.settings().reusable_task_age_secs)
    if candidate.created_ts <= now - max_age:
      return None
    return candidate
  return None
def get_swarming_bot_zip(host):
  """Returns a zipped file of all the files a bot needs to run.

  The zip is served from the cache when possible; otherwise it is generated
  and cached.

  Returns:
    A string representing the zipped file's contents.
  """
  # The revision returned here is not used; it is re-read below when the zip
  # has to be generated.
  version, additionals, _ = get_bot_version(host)
  cached = get_cached_swarming_bot_zip(version)
  cached_content, cached_bot_config_rev = cached
  # TODO(crbug.com/1087981): Compare the bot config revisions.
  # Separate deployment to be safe.
  if cached_content and cached_bot_config_rev:
    logging.debug(
        'memcached bot code %s; %d bytes with bot_config.py rev: %s', version,
        len(cached_content), cached_bot_config_rev)
    return cached_content

  # Get the start bot script from the database, if present. It is only added
  # when get_bot_version() did not already supply the additional files.
  bot_config, bot_config_rev = get_bot_config()
  if not additionals:
    additionals = {
        'config/bot_config.py': bot_config.content,
    }
  bot_dir = os.path.join(ROOT_DIR, 'swarming_bot')
  content, version = bot_archive.get_swarming_bot_zip(
      bot_dir, host, utils.get_app_version(), additionals,
      local_config.settings())
  logging.info(
      'generated bot code %s; %d bytes with bot_config.py rev: %s', version,
      len(content), bot_config_rev)
  cache_swarming_bot_zip(version, content, bot_config_rev)
  return content
def list(self, request):
  """Provides list of known bots.

  Deleted bots will not be listed.

  Raises:
    endpoints.BadRequestException: a filter argument was rejected with
        ValueError.
  """
  logging.debug('%s', request)
  now = utils.utcnow()
  query = bot_management.BotInfo.query()
  try:
    query = bot_management.filter_dimensions(query, request.dimensions)
    # Tri-state request flags, in the positional order expected by
    # filter_availability().
    flag_names = (
        'quarantined', 'in_maintenance', 'is_dead', 'is_busy', 'is_mp')
    flags = [
        swarming_rpcs.to_bool(getattr(request, name)) for name in flag_names
    ]
    query = bot_management.filter_availability(query, *flags)
  except ValueError as e:
    raise endpoints.BadRequestException(str(e))

  bots, cursor = datastore_utils.fetch_page(
      query, request.limit, request.cursor)
  death_timeout = config.settings().bot_death_timeout_secs
  items = [message_conversion.bot_info_to_rpc(bot) for bot in bots]
  return swarming_rpcs.BotList(
      cursor=cursor, death_timeout=death_timeout, items=items, now=now)
def bootstrap_dev_server_acls():
  """Adds localhost to IP whitelist and Swarming groups.

  Only valid on a local dev server; does nothing on an auth replica.
  """
  assert utils.is_local_dev_server()
  if auth.is_replica():
    return

  loopback_bots = auth.bootstrap_loopback_ips()

  auth_cfg = config.settings().auth
  admins_group = auth_cfg.admins_group
  users_group = auth_cfg.users_group
  bot_bootstrap_group = auth_cfg.bot_bootstrap_group

  # Loopback identities are both Swarming users and allowed to bootstrap bots.
  loopback_groups = (
      (users_group, 'Swarming users'),
      (bot_bootstrap_group, 'Bot bootstrap'),
  )
  for group, description in loopback_groups:
    auth.bootstrap_group(group, loopback_bots, description)

  # Add a swarming admin. Per the original note this account is the one used
  # in server_smoke_test.py.
  swarming_admin = auth.Identity(auth.IDENTITY_USER, '*****@*****.**')
  auth.bootstrap_group(
      admins_group, [swarming_admin], 'Swarming administrators')

  # Add an instance admin (for easier manual testing when running dev server).
  instance_admin = auth.Identity(auth.IDENTITY_USER, '*****@*****.**')
  auth.bootstrap_group(
      auth.ADMIN_GROUP, [instance_admin], 'Users that can manage groups')
def setUp(self):
  """Creates two scheduler configs and enables batched ES notifications."""
  super(ExternalSchedulerApiTestBatchMode, self).setUp()

  def make_es_config(cfg_id):
    # The two configs differ only by their id.
    return pools_config.ExternalSchedulerConfig(
        address=u'http://localhost:1',
        id=cfg_id,
        dimensions=['key1:value1', 'key2:value2'],
        all_dimensions=None,
        any_dimensions=None,
        enabled=True,
        allow_es_fallback=True)

  self.cfg_foo = make_es_config(u'foo')
  self.cfg_hoe = make_es_config(u'hoe')

  # Use the test's own client and task enqueuing hooks.
  self.mock(external_scheduler, '_get_client', self._get_client)
  self._enqueue_orig = self.mock(utils, 'enqueue_task', self._enqueue)
  self._client = None

  # Setup the backend to handle task queues.
  environ = {
      'REMOTE_ADDR': self.source_ip,
      'SERVER_SOFTWARE': os.environ['SERVER_SOFTWARE'],
  }
  self.app = webtest.TestApp(
      handlers_backend.create_application(True), extra_environ=environ)

  # Every config.settings() call now returns this object, with batch mode on.
  self.cfg = config.settings()
  self.cfg.enable_batch_es_notifications = True
  self.mock(config, 'settings', lambda: self.cfg)
def setUp(self):
  """Creates one scheduler config and disables batched ES notifications."""
  super(ExternalSchedulerApiTest, self).setUp()
  es_kwargs = {
      'address': u'http://localhost:1',
      'id': u'foo',
      'dimensions': ['key1:value1', 'key2:value2'],
      'all_dimensions': None,
      'any_dimensions': None,
      'enabled': True,
      'allow_es_fallback': True,
  }
  self.es_cfg = pools_config.ExternalSchedulerConfig(**es_kwargs)

  # Make the values deterministic.
  self.mock_now(datetime.datetime(2014, 1, 2, 3, 4, 5, 6))
  self.mock(random, 'getrandbits', lambda _: 0x88)

  # Use the local fake client to the external scheduler.
  self.mock(external_scheduler, '_get_client', self._get_client)
  self._client = None

  # Setup the backend to handle task queues.
  environ = {
      'REMOTE_ADDR': self.source_ip,
      'SERVER_SOFTWARE': os.environ['SERVER_SOFTWARE'],
  }
  self.app = webtest.TestApp(
      handlers_backend.create_application(True), extra_environ=environ)
  self._enqueue_orig = self.mock(utils, 'enqueue_task', self._enqueue)

  # Every config.settings() call now returns this object, with batch mode off.
  settings = config.settings()
  settings.enable_batch_es_notifications = False
  self.mock(config, 'settings', lambda: settings)
def _check_dimension_acls(request):
  """Raises AuthorizationError if some requested dimensions are forbidden.

  Uses 'dimension_acls' field from the settings. See proto/config.proto.

  Args:
    request: task request; `authenticated` and `properties.dimensions` are
        read.

  Raises:
    auth.AuthorizationError: the caller may not use one of the dimensions.
  """
  acls = config.settings().dimension_acls
  if not (acls and acls.entry):
    # Not configured, this is fine.
    return

  ident = request.authenticated
  dims = request.properties.dimensions
  assert 'id' in dims or 'pool' in dims, dims  # see _validate_dimensions
  assert ident is not None  # see task_request.init_new_request

  # Forbid targeting individual bots for non-admins, but allow using 'id' if
  # 'pool' is used as well (so whoever can post tasks to 'pool' can target an
  # individual bot in that pool).
  bot_without_pool = 'id' in dims and 'pool' not in dims
  if bot_without_pool and not acl.is_admin():
    raise auth.AuthorizationError(
        'Only Swarming administrators can post tasks with "id" dimension '
        'without specifying a "pool" dimension.')

  # Check in sorted order so the first reported dimension is deterministic.
  for key, value in sorted(dims.iteritems()):
    if _can_use_dimension(acls, ident, key, value):
      continue
    raise auth.AuthorizationError(
        'User %s is not allowed to schedule tasks with dimension "%s:%s"' %
        (ident.to_bytes(), key, value))
def get(self):
  """Calls lease_management.compute_utilization() when MP is enabled."""
  # Cache policy returning False for every key: nothing is kept in the ndb
  # in-context cache for this request.
  ndb.get_context().set_cache_policy(lambda _: False)
  if config.settings().mp.enabled:
    lease_management.compute_utilization()
    return
  logging.info('MP support is disabled')
def list(self, request):
  """Provides list of known bots.

  Deleted bots will not be listed.

  Raises:
    endpoints.BadRequestException: a filter argument was rejected with
        ValueError.
  """
  logging.debug('%s', request)
  now = utils.utcnow()
  # Disable the in-process local cache. Up to a thousand entities can be
  # loaded here and this instance has no reason to need them again, so caching
  # them is a pure memory leak that leads to 'Exceeded soft memory limit'
  # AppEngine errors.
  no_cache = ndb.QueryOptions(use_cache=False)
  query = bot_management.BotInfo.query(default_options=no_cache)
  try:
    query = bot_management.filter_dimensions(query, request.dimensions)
    # Tri-state request flags, in the positional order expected by
    # filter_availability().
    flag_names = ('quarantined', 'in_maintenance', 'is_dead', 'is_busy')
    flags = [
        swarming_rpcs.to_bool(getattr(request, name)) for name in flag_names
    ]
    query = bot_management.filter_availability(query, *flags)
  except ValueError as e:
    raise endpoints.BadRequestException(str(e))

  bots, cursor = datastore_utils.fetch_page(
      query, request.limit, request.cursor)
  death_timeout = config.settings().bot_death_timeout_secs
  items = [message_conversion.bot_info_to_rpc(bot) for bot in bots]
  return swarming_rpcs.BotList(
      cursor=cursor, death_timeout=death_timeout, items=items, now=now)
def test_api_bots(self):
  """Exercises the bots listing API, including limit and cursor paging."""
  self.set_as_privileged_user()
  now = datetime.datetime(2010, 1, 2, 3, 4, 5, 6)
  now_str = unicode(now.strftime(utils.DATETIME_FORMAT))
  self.mock_now(now)

  def connect(bot_id):
    # Registers a bot; the bots only differ by their id.
    bot_management.bot_event(
        event_type='bot_connected',
        bot_id=bot_id,
        external_ip='8.8.4.4',
        dimensions={'foo': ['bar'], 'id': [bot_id]},
        state={'ram': 65},
        version='123456789',
        quarantined=False,
        task_id=None,
        task_name=None)

  def fetch(url):
    return self.app.get(url, status=200).json

  connect('id1')

  bot_item = {
      u'dimensions': {u'foo': [u'bar'], u'id': [u'id1']},
      u'external_ip': u'8.8.4.4',
      u'first_seen_ts': now_str,
      u'id': u'id1',
      u'is_dead': False,
      u'last_seen_ts': now_str,
      u'quarantined': False,
      u'state': {u'ram': 65},
      u'task_id': None,
      u'task_name': None,
      u'version': u'123456789',
  }
  actual = fetch('/swarming/api/v1/client/bots')
  expected = {
      u'items': [bot_item],
      u'cursor': None,
      u'death_timeout': config.settings().bot_death_timeout_secs,
      u'limit': 1000,
      u'now': now_str,
  }
  self.assertEqual(expected, actual)

  # Test with limit.
  actual = fetch('/swarming/api/v1/client/bots?limit=1')
  expected['limit'] = 1
  self.assertEqual(expected, actual)

  # With a second bot and limit=1, a cursor must be returned.
  connect('id2')
  actual = fetch('/swarming/api/v1/client/bots?limit=1')
  expected['cursor'] = actual['cursor']
  self.assertTrue(actual['cursor'])
  self.assertEqual(expected, actual)

  # Test with cursor: the second page holds the second bot and no cursor.
  actual = fetch(
      '/swarming/api/v1/client/bots?limit=1&cursor=%s' % actual['cursor'])
  expected['cursor'] = None
  bot_item['dimensions']['id'] = [u'id2']
  bot_item['id'] = u'id2'
  self.assertEqual(expected, actual)
def get(self):
  """Renders the restricted bots list page.

  Query parameters read: 'limit' (default 100), 'cursor' and 'sort_by'
  (default '__key__'). Aborts with HTTP 400 on an unknown 'sort_by'.
  """
  request = self.request
  limit = int(request.get('limit', 100))
  cursor = datastore_query.Cursor(urlsafe=request.get('cursor'))
  sort_by = request.get('sort_by', '__key__')
  if sort_by not in self.ACCEPTABLE_BOTS_SORTS:
    self.abort(400, 'Invalid sort_by query parameter')

  # A leading '-' requests a descending sort on the named property.
  descending = sort_by[0] == '-'
  if descending:
    order = datastore_query.PropertyOrder(
        sort_by[1:], datastore_query.PropertyOrder.DESCENDING)
  else:
    order = datastore_query.PropertyOrder(
        sort_by, datastore_query.PropertyOrder.ASCENDING)

  now = utils.utcnow()
  death_timeout = datetime.timedelta(
      seconds=config.settings().bot_death_timeout_secs)
  cutoff = now - death_timeout

  # Start all the queries before waiting on any of them. The '== True'
  # comparisons build query filters and must stay as is.
  bot_info = bot_management.BotInfo
  busy_future = bot_info.query(bot_info.is_busy == True).count_async()
  dead_future = bot_info.query(bot_info.last_seen_ts < cutoff).count_async()
  quarantined_future = bot_info.query(
      bot_info.quarantined == True).count_async()
  total_future = bot_info.query().count_async()
  fetch_future = bot_info.query().order(order).fetch_page_async(
      limit, start_cursor=cursor)

  # TODO(maruel): self.request.host_url should be the default AppEngine url
  # version and not the current one. It is only an issue when
  # version-dot-appid.appspot.com urls are used to access this page.
  version = bot_code.get_bot_version(request.host_url)

  bots, cursor, more = fetch_future.get_result()
  # Prefetch the tasks. The value is not used here; it'll be implicitly used
  # by ndb's local cache when refetched by the html template.
  task_keys = filter(None, (b.task for b in bots))
  ndb.get_multi(task_keys)

  num_bots_busy = busy_future.get_result()
  num_bots_dead = dead_future.get_result()
  num_bots_quarantined = quarantined_future.get_result()
  num_bots_total = total_future.get_result()

  # Only expose a cursor when there are more results to page through.
  next_cursor = cursor.urlsafe() if cursor and more else ''
  params = {
      'bots': bots,
      'current_version': version,
      'cursor': next_cursor,
      'is_admin': acl.is_admin(),
      'is_privileged_user': acl.is_privileged_user(),
      'limit': limit,
      'now': now,
      'num_bots_alive': num_bots_total - num_bots_dead,
      'num_bots_busy': num_bots_busy,
      'num_bots_dead': num_bots_dead,
      'num_bots_quarantined': num_bots_quarantined,
      'sort_by': sort_by,
      'sort_options': self.SORT_OPTIONS,
      'xsrf_token': self.generate_xsrf_token(),
  }
  page = template.render('swarming/restricted_botslist.html', params)
  self.response.write(page)
def get(self):
  """Renders the restricted bots list page.

  Reads the 'limit' (default 100), 'cursor' and 'sort_by' (default
  '__key__') query parameters. An unknown 'sort_by' aborts with HTTP 400.
  """
  limit = int(self.request.get('limit', 100))
  cursor = datastore_query.Cursor(urlsafe=self.request.get('cursor'))
  sort_by = self.request.get('sort_by', '__key__')
  if sort_by not in self.ACCEPTABLE_BOTS_SORTS:
    self.abort(400, 'Invalid sort_by query parameter')

  # '-name' sorts descending on 'name'; anything else sorts ascending.
  if sort_by[0] == '-':
    sort_property = sort_by[1:]
    direction = datastore_query.PropertyOrder.DESCENDING
  else:
    sort_property = sort_by
    direction = datastore_query.PropertyOrder.ASCENDING
  order = datastore_query.PropertyOrder(sort_property, direction)

  now = utils.utcnow()
  cutoff = now - datetime.timedelta(
      seconds=config.settings().bot_death_timeout_secs)

  # Fire every query asynchronously first; results are collected further
  # down. The '== True' comparisons build query filters, keep them verbatim.
  BotInfo = bot_management.BotInfo
  counters = [
      BotInfo.query(BotInfo.is_busy == True).count_async(),
      BotInfo.query(BotInfo.last_seen_ts < cutoff).count_async(),
      BotInfo.query(BotInfo.quarantined == True).count_async(),
      BotInfo.query().count_async(),
  ]
  page_future = BotInfo.query().order(order).fetch_page_async(
      limit, start_cursor=cursor)

  # TODO(maruel): self.request.host_url should be the default AppEngine url
  # version and not the current one. It is only an issue when
  # version-dot-appid.appspot.com urls are used to access this page.
  version = bot_code.get_bot_version(self.request.host_url)

  bots, cursor, more = page_future.get_result()
  # Prefetch the tasks. The result is dropped on purpose: the html template
  # refetches them and is then served implicitly by ndb's local cache.
  ndb.get_multi(filter(None, (b.task for b in bots)))

  # Same order as the futures were created in.
  busy, dead, quarantined, total = [f.get_result() for f in counters]

  params = {
      'bots': bots,
      'current_version': version,
      # A cursor is only handed out when more results exist.
      'cursor': cursor.urlsafe() if cursor and more else '',
      'is_admin': acl.is_admin(),
      'is_privileged_user': acl.is_privileged_user(),
      'limit': limit,
      'now': now,
      'num_bots_alive': total - dead,
      'num_bots_busy': busy,
      'num_bots_dead': dead,
      'num_bots_quarantined': quarantined,
      'sort_by': sort_by,
      'sort_options': self.SORT_OPTIONS,
      'xsrf_token': self.generate_xsrf_token(),
  }
  self.response.write(
      template.render('swarming/restricted_botslist.html', params))
def common(self, note):
  """Writes the restricted config page, showing `note` to the user."""
  context = {}
  # fresh=True is passed so the page is not rendered from a stale copy.
  context["cfg"] = config.settings(fresh=True)
  context["note"] = note
  context["path"] = self.request.path
  context["xsrf_token"] = self.generate_xsrf_token()
  page = template.render("swarming/restricted_config.html", context)
  self.response.write(page)
def test_task_idempotent(self):
  """A second identical task within the reuse window is deduped."""
  self.mock(random, 'getrandbits', lambda _: 0x88)
  # The first task is idempotent and actually runs.
  first_id = self._task_ran_successfully()

  # One second before the reuse window closes, the second task is deduped
  # against the first one.
  offset = config.settings().reusable_task_age_secs - 1
  later = self.mock_now(self.now, offset)
  self._task_deduped(later, first_id)
def common(self, note):
  """Renders the restricted config page with the given status note."""
  # fresh=True is passed so the page is not rendered from a stale copy.
  params = dict(
      cfg=config.settings(fresh=True),
      note=note,
      path=self.request.path,
      xsrf_token=self.generate_xsrf_token())
  html = template.render('swarming/restricted_config.html', params)
  self.response.write(html)
def get_content_security_policy(self):
  """Returns the parent CSP extended with the allowed 'frame-src' origins."""
  # Pages at display_server_url_template are shown in iframes, so they have to
  # be allowed in the CSP.
  csp = super(UIHandler, self).get_content_security_policy()
  frame_src = csp['frame-src']
  frame_src.append("'self'")

  tmpl = config.settings().display_server_url_template
  # A template starting with '/' is already covered by 'self'.
  if tmpl and not tmpl.startswith('/'):
    # The template is assumed to carry '%s' in its last path component.
    # Dropping that component gives a "parent" path for the CSP, which is
    # narrower than whitelisting the whole display server domain.
    frame_src.append(tmpl[:tmpl.rfind('/') + 1])

  # Note that frame-src was once child-src, which was deprecated and support
  # was dropped by some browsers. See
  # https://bugs.chromium.org/p/chromium/issues/detail?id=839909
  frame_src.extend(config.settings().extra_child_src_csp_url)
  return csp
def cron_update_bot_info():
  """Refreshes BotInfo.composite for dead bots.

  Scans BotInfo entities whose last_seen_ts is at or before the death cutoff
  and, for those whose composite does not yet mark them DEAD, re-saves them
  in a transaction. At most 5 transactions are kept in flight.

  Returns:
    Number of bots that were updated.
  """
  dt = datetime.timedelta(seconds=config.settings().bot_death_timeout_secs)
  cutoff = utils.utcnow() - dt

  @ndb.tasklet
  def run(bot_key):
    # Re-check inside the transaction: the bot may have pinged in between.
    bot = yield bot_key.get_async()
    if (bot and bot.last_seen_ts <= cutoff and
        (BotInfo.ALIVE in bot.composite or BotInfo.DEAD not in bot.composite)):
      # Updating it recomputes composite.
      # TODO(maruel): BotEvent.
      yield bot.put_async()
      logging.info('DEAD: %s', bot.id)
      raise ndb.Return(1)
    raise ndb.Return(0)

  # The assumption here is that a cron job can churn through all the entities
  # fast enough. The number of dead bot is expected to be <10k. In practice the
  # average runtime is around 8 seconds.
  dead = 0
  seen = 0
  failed = 0
  try:
    futures = []
    for b in BotInfo.query(BotInfo.last_seen_ts <= cutoff):
      seen += 1
      if BotInfo.ALIVE in b.composite or BotInfo.DEAD not in b.composite:
        k = b.key
        # Unregister the bot from task queues since it can't reap anything.
        task_queues.cleanup_after_bot(k.parent())
        # Retry more often than the default 1. We do not want to throw too much
        # in the logs and there should be plenty of time to do the retries.
        #
        # The key is bound as a default argument on purpose. A plain
        # 'lambda: run(k)' looks up 'k' when it is called, and the callback
        # may be called (or retried) after this loop has moved on to another
        # bot, in which case it would update the wrong entity.
        f = datastore_utils.transaction_async(
            lambda k=k: run(k), retries=5)
        futures.append(f)
        if len(futures) >= 5:
          # Throttle: wait for at least one transaction, then harvest all the
          # completed ones. Iterate backwards so pop() keeps indexes valid.
          ndb.Future.wait_any(futures)
          for i in xrange(len(futures) - 1, -1, -1):
            if futures[i].done():
              try:
                dead += futures.pop(i).get_result()
              except datastore_utils.CommitError:
                logging.warning('Failed to commit a Tx')
                failed += 1
    # Drain whatever is still in flight.
    for f in futures:
      try:
        dead += f.get_result()
      except datastore_utils.CommitError:
        logging.warning('Failed to commit a Tx')
        failed += 1
  finally:
    # Always log the tally, even if the scan was interrupted by an exception.
    logging.debug(
        'Seen %d bots, updated %d bots, failed %d tx', seen, dead, failed)
  return dead
def has_capacity(dimensions):
  """Returns True if there's a reasonable chance for this task request
  dimensions set to be serviced by a bot alive.

  The task queues are consulted first; the datastore is only queried when
  they give no answer.
  """
  assert not ndb.in_transaction()

  # Fast path: a non-None answer is final.
  quick_answer = task_queues.probably_has_capacity(dimensions)
  if quick_answer is not None:
    return quick_answer

  # A positive result is added to the 'quick cache' to improve performance.
  # It is kept for as long as bots are considered still alive without a ping.
  # Useful if there's a single bot in the fleet for these dimensions and it
  # takes a long time to reboot. This is the case with Android with slow
  # initialization and some baremetal bots (thanks SCSI firmware!).
  death_timeout = config.settings().bot_death_timeout_secs

  @ndb.tasklet
  def run_query(flat):
    # First look for a known bot. That's slower than the fast path and it's
    # eventually consistent.
    bots = BotInfo.query()
    for dim in flat:
      bots = bots.filter(BotInfo.dimensions_flat == dim)
    found = yield bots.count_async(limit=1)
    if found:
      logging.info('Found capacity via BotInfo: %s', flat)
      raise ndb.Return(True)

    # Search a bit harder: a recent BotEvent means a bot with these
    # dimensions existed recently.
    cutoff = utils.utcnow() - datetime.timedelta(seconds=death_timeout)
    events = BotEvent.query(BotEvent.ts > cutoff)
    for dim in flat:
      events = events.filter(BotEvent.dimensions_flat == dim)
    found = yield events.count_async(limit=1)
    if found:
      logging.info('Found capacity via BotEvent: %s', flat)
      raise ndb.Return(True)
    raise ndb.Return(False)

  # One lookup per expanded dimension set, all running concurrently.
  pending = [
      run_query(flat_dims)
      for flat_dims in task_queues.expand_dimensions_to_flats(dimensions)
  ]
  ndb.tasklets.Future.wait_all(pending)
  if not any(fut.get_result() for fut in pending):
    logging.warning('HAS NO CAPACITY: %s', dimensions)
    return False

  task_queues.set_has_capacity(dimensions, death_timeout)
  return True
def exponential_backoff(attempt_num):
  """Returns an exponential backoff value in seconds.

  Args:
    attempt_num: non-negative attempt count; values above 10 are treated
        as 10.
  """
  assert attempt_num >= 0
  # Randomly ask the bot to return quickly.
  if random.random() < _PROBABILITY_OF_QUICK_COMEBACK:
    return 1.0

  # Use the configured maximum if there is one, otherwise default to 60s.
  ceiling = config.settings().max_bot_sleep_time or 60.
  exponent = min(attempt_num, 10) + 1
  delay = math.pow(1.5, exponent)
  return min(ceiling, delay)
def test_list_ok(self):
  """Asserts that BotInfo is returned for the appropriate set of bots."""
  self.set_as_privileged_user()
  now = datetime.datetime(2010, 1, 2, 3, 4, 5, 6)
  now_str = unicode(now.strftime(self.DATETIME_FORMAT))
  self.mock_now(now)

  # Register the only bot the listing should return.
  bot_dimensions = {'foo': ['bar'], 'id': ['id1']}
  bot_management.bot_event(
      event_type='bot_connected',
      bot_id='id1',
      external_ip='8.8.4.4',
      dimensions=bot_dimensions,
      state={'ram': 65},
      version='123456789',
      quarantined=False,
      task_id=None,
      task_name=None)

  expected_dimensions = [
      {u'key': u'foo', u'value': [u'bar']},
      {u'key': u'id', u'value': [u'id1']},
  ]
  expected_bot = {
      u'bot_id': u'id1',
      u'dimensions': expected_dimensions,
      u'external_ip': u'8.8.4.4',
      u'first_seen_ts': now_str,
      u'is_dead': False,
      u'last_seen_ts': now_str,
      u'quarantined': False,
      u'version': u'123456789',
  }
  expected = {
      u'items': [expected_bot],
      u'death_timeout': unicode(config.settings().bot_death_timeout_secs),
      u'now': now_str,
  }

  request = swarming_rpcs.BotsRequest()
  response = self.call_api('list', body=message_to_dict(request))
  self.assertEqual(expected, response.json)
def test_task_idempotent_three(self):
  """A third task runs for real once the first one left the reuse window."""
  self.mock(random, 'getrandbits', lambda _: 0x88)
  # The first task is idempotent and actually runs.
  first_id = self._task_ran_successfully()

  # The second task is deduped against the first one.
  reuse_age = config.settings().reusable_task_age_secs
  dedupe_ts = self.mock_now(self.now, reuse_age - 1)
  self._task_deduped(dedupe_ts, first_id)

  # The third task is scheduled when the first task is too old; the second
  # one is not dedupable.
  self.mock_now(self.now, config.settings().reusable_task_age_secs)
  properties = dict(dimensions={u'OS': u'Windows-3.1.1'}, idempotent=True)
  data = _gen_request(name='yay', user='******', properties=properties)
  request = task_request.make_request(data, True)
  task_scheduler.schedule_request(request)

  # The task was enqueued for execution.
  to_run = task_to_run.TaskToRun.query().get()
  self.assertNotEqual(None, to_run.queue_number)
def _calc_composite(self):
  """Returns the value for BotInfo.composite, which permits quick searches.

  The result is a list of four flags, in order: maintenance, liveness,
  quarantine and busyness.
  """
  timeout = config.settings().bot_death_timeout_secs
  silent_secs = (utils.utcnow() - self.last_seen_ts).total_seconds()

  if self.maintenance_msg:
    maintenance = self.IN_MAINTENANCE
  else:
    maintenance = self.NOT_IN_MAINTENANCE

  # The bot is dead once it has been silent for the whole timeout.
  liveness = self.DEAD if silent_secs >= timeout else self.ALIVE
  health = self.QUARANTINED if self.quarantined else self.HEALTHY
  activity = self.BUSY if self.task_id else self.IDLE
  return [maintenance, liveness, health, activity]
def list(self, request):
  """Provides list of bots.

  Returns one page of `request.batch_size` bots ordered by key, starting at
  `request.cursor`.
  """
  now = utils.utcnow()
  start = datastore_query.Cursor(urlsafe=request.cursor)
  query = bot_management.BotInfo.query().order(bot_management.BotInfo.key)
  bots, next_cursor, more = query.fetch_page(
      request.batch_size, start_cursor=start)

  # Only hand out a cursor when more results exist.
  if next_cursor and more:
    cursor_token = next_cursor.urlsafe()
  else:
    cursor_token = None

  death_timeout = config.settings().bot_death_timeout_secs
  items = []
  for bot in bots:
    items.append(
        message_conversion.bot_info_from_dict(bot.to_dict_with_now(now)))
  return swarming_rpcs.BotList(
      cursor=cursor_token,
      death_timeout=death_timeout,
      items=items,
      now=now)
def filter_availability(q, quarantined, is_dead, now):
  """Filters a ndb.Query for BotInfo based on quarantined/is_dead.

  Args:
    q: query to refine.
    quarantined: None to not filter, otherwise the BotInfo.quarantined value
        to match.
    is_dead: None to not filter, truthy to keep only dead bots, falsy to keep
        only alive bots.
    now: reference time used to compute the death cutoff.

  Returns:
    The refined query.
  """
  if quarantined is not None:
    q = q.filter(BotInfo.quarantined == quarantined)

  if is_dead is None:
    # No liveness filter requested; no need to read the settings.
    return q

  dt = datetime.timedelta(seconds=config.settings().bot_death_timeout_secs)
  timeout = now - dt
  if is_dead:
    # Inclusive bound: a bot last seen exactly at the cutoff is dead, the same
    # way BotInfo._calc_composite() classifies it (elapsed >= timeout).
    # Without it such a bot matched neither the dead nor the alive filter.
    q = q.filter(BotInfo.last_seen_ts <= timeout)
  else:
    q = q.filter(BotInfo.last_seen_ts > timeout)
  return q
def get(self, page):
  """Renders the public index template for `page`.

  An empty `page` maps to 'swarming'. Aborts with HTTP 404 when the template
  doesn't exist.
  """
  page = page or 'swarming'
  params = {'client_id': config.settings().ui_client_id}
  try:
    html = template.render('swarming/public_%s_index.html' % page, params)
    self.response.write(html)
  except template.TemplateNotFound:
    self.abort(404, 'Page not found.')
def post(self):
  """Saves the settings form, unless it was built from an outdated config."""
  # Convert MultiDict into a dict, leaving out the fields that are not
  # settings.
  not_settings = ("keyid", "xsrf_token")
  params = {}
  for name in self.request.params:
    if name not in not_settings:
      params[name] = self.request.params.getone(name)
  # These two settings are integers.
  for name in ("bot_death_timeout_secs", "reusable_task_age_secs"):
    params[name] = int(params[name])

  cfg = config.settings(fresh=True)
  keyid = int(self.request.get("keyid", "0"))
  # The form carries the id of the config it was rendered from; a mismatch
  # means the stored config is not the one the user edited.
  if cfg.key.integer_id() == keyid:
    cfg.populate(**params)
    cfg.store()
    self.common("Settings updated")
    return
  self.common("Update conflict %s != %s" % (cfg.key.integer_id(), keyid))
def get(self):
    """Lists bots for old clients, optionally filtered.

    Query parameters:
    - limit: maximum number of bots to fetch, defaults to 1000.
    - filter: optional, must be one of self.ACCEPTABLE_FILTERS.
    - cursor: optional cursor returned by a previous call.

    Replies with HTTP 400 when 'limit' is not an integer or when 'filter' is
    not an accepted value.
    """
    logging.error('Unexpected old client')
    now = utils.utcnow()
    try:
        limit = int(self.request.get('limit', 1000))
    except ValueError:
        # A malformed value is a client error, not a server failure.
        self.abort_with_error(400, error='Invalid limit query parameter')
    filter_by = self.request.get('filter')
    if filter_by and filter_by not in self.ACCEPTABLE_FILTERS:
        self.abort_with_error(400, error='Invalid filter query parameter')

    # Looked up once so the filter and the reported value cannot disagree.
    death_timeout = config.settings().bot_death_timeout_secs

    q = bot_management.BotInfo.query()
    if not filter_by:
        q = q.order(bot_management.BotInfo.key)
        recheck = lambda _: True
    elif filter_by == 'quarantined':
        q = q.order(bot_management.BotInfo.key)
        # ndb builds the filter from the overloaded '==' operator, so the
        # explicit comparison to True is required here.
        q = q.filter(bot_management.BotInfo.quarantined == True)
        recheck = lambda b: b.quarantined
    elif filter_by == 'is_dead':
        # The first sort key must be the same as used in the filter, otherwise
        # datastore raises BadRequestError.
        deadline = now - datetime.timedelta(seconds=death_timeout)
        q = q.order(bot_management.BotInfo.last_seen_ts)
        # Inclusive boundary: a bot is dead when (now - last_seen_ts) >=
        # timeout, which is last_seen_ts <= deadline.
        q = q.filter(bot_management.BotInfo.last_seen_ts <= deadline)
        recheck = lambda b: b.last_seen_ts <= deadline
    else:
        raise AssertionError('Impossible')

    cursor = datastore_query.Cursor(urlsafe=self.request.get('cursor'))
    bots, cursor, more = q.fetch_page(limit, start_cursor=cursor)
    data = {
        'cursor': cursor.urlsafe() if cursor and more else None,
        'death_timeout': death_timeout,
        # Each fetched entity is re-checked against the filter before being
        # included.
        'items': [b.to_dict_with_now(now) for b in bots if recheck(b)],
        'limit': limit,
        'now': now,
    }
    self.send_response(utils.to_json_encodable(data))
def list(self, request):
    """Returns one page of known bots, ordered by entity key.

    Deleted bots will not be listed.
    """
    logging.info('%s', request)
    now = utils.utcnow()

    bot_cls = bot_management.BotInfo
    query = bot_cls.query().order(bot_cls.key)
    bots, cursor = datastore_utils.fetch_page(
        query, request.limit, request.cursor)

    death_timeout = config.settings().bot_death_timeout_secs
    items = []
    for bot in bots:
        items.append(message_conversion.bot_info_to_rpc(bot, now))

    return swarming_rpcs.BotList(
        cursor=cursor,
        death_timeout=death_timeout,
        items=items,
        now=now)
def test_task_idempotent_old(self):
    """An idempotent task that is too old must not be reused for dedup."""
    self.mock(random, 'getrandbits', lambda _: 0x88)

    # First task is idempotent.
    self._task_ran_successfully()

    # Second task is scheduled, first task is too old to be reused.
    max_age_secs = config.settings().reusable_task_age_secs
    self.mock_now(self.now, max_age_secs)

    properties = dict(
        dimensions={u'OS': u'Windows-3.1.1'}, idempotent=True)
    data = _gen_request_data(
        name='yay', user='******', properties=properties)
    request = task_request.make_request(data)
    task_scheduler.schedule_request(request)

    # The task was enqueued for execution.
    queued = task_to_run.TaskToRun.query().get()
    self.assertNotEqual(None, queued.queue_number)
def post(self):
    """Stores the posted settings unless they raced with another update.

    The posted 'keyid' must equal the integer id of the freshly loaded
    settings entity; otherwise a conflict message is shown and nothing is
    saved.
    """
    # Convert MultiDict into a dict.
    ignored = ('keyid', 'xsrf_token')
    wanted = [k for k in self.request.params if k not in ignored]
    params = dict((k, self.request.params.getone(k)) for k in wanted)

    # Form values arrive as text; both of these settings are integers.
    for field in ('bot_death_timeout_secs', 'reusable_task_age_secs'):
        params[field] = int(params[field])

    cfg = config.settings(fresh=True)
    keyid = int(self.request.get('keyid', '0'))
    if cfg.key.integer_id() != keyid:
        conflict = 'Update conflict %s != %s' % (cfg.key.integer_id(), keyid)
        self.common(conflict)
        return

    cfg.populate(**params)
    cfg.store()
    self.common('Settings updated')
def list(self, request):
    """Returns one page of known bots, optionally filtered by dimensions.

    Every entry of request.dimensions must look like 'key:value', with both
    sides non-empty and free of leading/trailing whitespace; otherwise
    BadRequestException is raised. Deleted bots will not be listed.
    """
    logging.info('%s', request)
    now = utils.utcnow()
    query = bot_management.BotInfo.query().order(bot_management.BotInfo.key)

    for dimension in request.dimensions:
        if ':' not in dimension:
            raise endpoints.BadRequestException('Invalid dimensions')
        pieces = dimension.split(':', 1)
        well_formed = len(pieces) == 2 and all(
            piece and piece.strip() == piece for piece in pieces)
        if not well_formed:
            raise endpoints.BadRequestException('Invalid dimensions')
        query = query.filter(
            bot_management.BotInfo.dimensions_flat == dimension)

    bots, cursor = datastore_utils.fetch_page(
        query, request.limit, request.cursor)
    death_timeout = config.settings().bot_death_timeout_secs
    items = [message_conversion.bot_info_to_rpc(bot, now) for bot in bots]
    return swarming_rpcs.BotList(
        cursor=cursor,
        death_timeout=death_timeout,
        items=items,
        now=now)
def test_api_bots(self):
    """Exercises the legacy bots listing: limit, cursor and both filters."""
    self.set_as_privileged_user()
    self.mock_now(datetime.datetime(2010, 1, 2, 3, 4, 5, 6))

    def now_str():
        return unicode(utils.utcnow().strftime(utils.DATETIME_FORMAT))

    def connect_bot(bot_id, quarantined):
        # Registers a 'bot_connected' event for the given bot.
        bot_management.bot_event(
            event_type="bot_connected",
            bot_id=bot_id,
            external_ip="8.8.4.4",
            dimensions={"foo": ["bar"], "id": [bot_id]},
            state={"ram": 65},
            version="123456789",
            quarantined=quarantined,
            task_id=None,
            task_name=None,
        )

    def expected_bot(bot_id, quarantined):
        # JSON form the API is expected to return for a bot seen just now.
        return {
            u"dimensions": {u"foo": [u"bar"], u"id": [bot_id]},
            u"external_ip": u"8.8.4.4",
            u"first_seen_ts": now_str(),
            u"id": bot_id,
            u"is_dead": False,
            u"last_seen_ts": now_str(),
            u"quarantined": quarantined,
            u"state": {u"ram": 65},
            u"task_id": None,
            u"task_name": None,
            u"version": u"123456789",
        }

    def fetch(url):
        return self.app.get(url, status=200).json

    connect_bot("id1", False)
    bot1_dict = expected_bot(u"id1", False)

    actual = fetch("/swarming/api/v1/client/bots")
    expected = {
        u"items": [bot1_dict],
        u"cursor": None,
        u"death_timeout": config.settings().bot_death_timeout_secs,
        u"limit": 1000,
        u"now": now_str(),
    }
    self.assertEqual(expected, actual)

    # Test with limit.
    actual = fetch("/swarming/api/v1/client/bots?limit=1")
    expected["limit"] = 1
    self.assertEqual(expected, actual)

    # Advance time to make bot1 dead to test filtering for dead bots.
    self.mock_now(datetime.datetime(2011, 1, 2, 3, 4, 5, 6))
    bot1_dict["is_dead"] = True
    expected["now"] = now_str()

    # Use quarantined bot to check filtering by 'quarantined' flag.
    connect_bot("id2", True)
    bot2_dict = expected_bot(u"id2", True)

    # Test limit + cursor: start the query.
    actual = fetch("/swarming/api/v1/client/bots?limit=1")
    expected["cursor"] = actual["cursor"]
    expected["items"] = [bot1_dict]
    self.assertTrue(actual["cursor"])
    self.assertEqual(expected, actual)

    # Test limit + cursor: continue the query.
    actual = fetch(
        "/swarming/api/v1/client/bots?limit=1&cursor=%s" % actual["cursor"])
    expected["cursor"] = None
    expected["items"] = [bot2_dict]
    self.assertEqual(expected, actual)

    # Filtering by 'quarantined'.
    actual = fetch("/swarming/api/v1/client/bots?filter=quarantined")
    expected["limit"] = 1000
    expected["cursor"] = None
    expected["items"] = [bot2_dict]
    self.assertEqual(expected, actual)

    # Filtering by 'is_dead'.
    actual = fetch("/swarming/api/v1/client/bots?filter=is_dead")
    expected["limit"] = 1000
    expected["cursor"] = None
    expected["items"] = [bot1_dict]
    self.assertEqual(expected, actual)
def schedule_request(request):
    """Creates and stores all the entities to schedule a new task request.

    The number of entities created is 3: TaskRequest, TaskResultSummary and
    TaskToRun.

    The TaskRequest is saved first as a DB transaction, then TaskResultSummary
    and TaskToRun are saved as a single DB RPC. The Search index is also
    updated in-between.

    When the request is idempotent and a matching result summary recent enough
    to reuse is found, the new task is deduped: its TaskToRun is taken out of
    the scheduling queue and the result summary is filled from the previous
    one.

    Arguments:
    - request: is in the TaskRequest entity saved in the DB.

    Returns:
      TaskResultSummary. TaskToRun is not returned.

    Raises:
      Whatever the datastore transactions raise; a failure to store the
      entities aborts the call. A search.Error from the index update is logged
      and swallowed.
    """
    dupe_future = None
    if request.properties.idempotent:
        # Find a previously run task that is also idempotent and completed.
        # Start a query to fetch items that can be used to dedupe the task.
        # See the comment for this property for more details.
        #
        # Do not use "cls.created_ts > oldest" here because this would require
        # a composite index. It's unnecessary because TaskRequest.key is mostly
        # equivalent to decreasing TaskRequest.created_ts, ordering by key
        # works as well and doesn't require a composite index.
        cls = task_result.TaskResultSummary
        h = request.properties.properties_hash
        # Started asynchronously; the result is only read further below, so
        # the query overlaps with the entity creation and the index update.
        dupe_future = cls.query(cls.properties_hash == h).order(cls.key).get_async()

    # At this point, the request is now in the DB but not yet in a mode where
    # it can be triggered or visible. Index it right away so it is searchable.
    # If any of remaining calls in this function fail, the TaskRequest and
    # Search Document will simply point to an incomplete task, which will be
    # ignored.
    #
    # Creates the entities TaskToRun and TaskResultSummary but do not save them
    # yet. TaskRunResult will be created once a bot starts it.
    task = task_to_run.new_task_to_run(request)
    result_summary = task_result.new_result_summary(request)

    # Do not specify a doc_id, as they are guaranteed to be monotonically
    # increasing and searches are done in reverse order, which fits exactly the
    # created_ts ordering. This is useful because DateField is precise to the
    # date (!) and NumberField is signed 32 bits so the best it could do with
    # EPOCH is second resolution up to year 2038.
    index = search.Index(name="requests")
    packed = task_pack.pack_result_summary_key(result_summary.key)
    doc = search.Document(
        fields=[search.TextField(name="name", value=request.name), search.AtomField(name="id", value=packed)]
    )
    # Even if it fails here, we're still fine, as the task is not "alive" yet.
    search_future = index.put_async([doc])

    now = utils.utcnow()

    if dupe_future:
        # Reuse the results!
        dupe_summary = dupe_future.get_result()
        # Refuse tasks older than X days. This is due to the isolate server
        # dropping files.
        # https://code.google.com/p/swarming/issues/detail?id=197
        oldest = now - datetime.timedelta(seconds=config.settings().reusable_task_age_secs)
        if dupe_summary and dupe_summary.created_ts > oldest:
            # If there's a bug, commenting out this block is sufficient to
            # disable the functionality.
            # Setting task.queue_number to None removes it from the scheduling.
            task.queue_number = None
            # The listed properties are specific to the new request.
            # NOTE(review): presumably the tuple names the fields _copy_entity
            # skips, so they keep the new request's values - confirm against
            # _copy_entity.
            _copy_entity(dupe_summary, result_summary, ("created_ts", "name", "user", "tags"))
            result_summary.properties_hash = None
            result_summary.try_number = 0
            # Must be read before costs_usd is reset below.
            # NOTE(review): this assumes cost_usd is derived from costs_usd -
            # confirm.
            result_summary.cost_saved_usd = result_summary.cost_usd
            # Only zap after.
            result_summary.costs_usd = []
            result_summary.deduped_from = task_pack.pack_run_result_key(dupe_summary.run_result_key)

    # Get parent task details if applicable.
    parent_task_keys = None
    if request.parent_task_id:
        parent_run_key = task_pack.unpack_run_result_key(request.parent_task_id)
        # Both the parent's run result and its result summary get updated.
        parent_task_keys = [parent_run_key, task_pack.run_result_key_to_result_summary_key(parent_run_key)]

    result_summary.modified_ts = now

    # Storing these entities makes this task live. It is important at this
    # point that the HTTP handler returns as fast as possible, otherwise the
    # task will be run but the client will not know about it.
    def run():
        ndb.put_multi([result_summary, task])

    def run_parent():
        # This one is slower.
        # Registers the new task as a child on each parent entity.
        items = ndb.get_multi(parent_task_keys)
        k = result_summary.task_id
        for item in items:
            item.children_task_ids.append(k)
            item.modified_ts = now
        ndb.put_multi(items)

    # Raising will abort to the caller.
    futures = [datastore_utils.transaction_async(run)]
    if parent_task_keys:
        futures.append(datastore_utils.transaction_async(run_parent))

    try:
        search_future.get_result()
    except search.Error:
        # Do not abort the task, for now search is best effort.
        logging.exception("Put failed")

    for future in futures:
        # Check for failures, it would raise in this case, aborting the call.
        future.get_result()

    stats.add_task_entry(
        "task_enqueued", result_summary.key, dimensions=request.properties.dimensions, user=request.user
    )
    return result_summary
def is_enabled_callback():
    """Returns the enable_ts_monitoring value from the global settings."""
    settings = config.settings()
    return settings.enable_ts_monitoring
def test_api_bots(self):
    """Checks the legacy bots API with limit, cursor paging and filters."""
    bots_url = '/swarming/api/v1/client/bots'

    def now_str():
        return unicode(utils.utcnow().strftime(utils.DATETIME_FORMAT))

    def register(bot_id, bot_id_unicode, quarantined):
        # Emits a 'bot_connected' event, then returns the JSON dict the API
        # should report for that bot at the current mocked time.
        bot_management.bot_event(
            event_type='bot_connected', bot_id=bot_id, external_ip='8.8.4.4',
            dimensions={'foo': ['bar'], 'id': [bot_id]}, state={'ram': 65},
            version='123456789', quarantined=quarantined, task_id=None,
            task_name=None)
        return {
            u'dimensions': {u'foo': [u'bar'], u'id': [bot_id_unicode]},
            u'external_ip': u'8.8.4.4',
            u'first_seen_ts': now_str(),
            u'id': bot_id_unicode,
            u'is_dead': False,
            u'last_seen_ts': now_str(),
            u'quarantined': quarantined,
            u'state': {u'ram': 65},
            u'task_id': None,
            u'task_name': None,
            u'version': u'123456789',
        }

    def query(url):
        return self.app.get(url, status=200).json

    self.set_as_privileged_user()
    self.mock_now(datetime.datetime(2010, 1, 2, 3, 4, 5, 6))
    bot1_dict = register('id1', u'id1', False)

    actual = query('/swarming/api/v1/client/bots')
    expected = {
        u'items': [bot1_dict],
        u'cursor': None,
        u'death_timeout': config.settings().bot_death_timeout_secs,
        u'limit': 1000,
        u'now': now_str(),
    }
    self.assertEqual(expected, actual)

    # Test with limit.
    actual = query('/swarming/api/v1/client/bots?limit=1')
    expected['limit'] = 1
    self.assertEqual(expected, actual)

    # Advance time to make bot1 dead to test filtering for dead bots.
    self.mock_now(datetime.datetime(2011, 1, 2, 3, 4, 5, 6))
    bot1_dict['is_dead'] = True
    expected['now'] = now_str()

    # Use quarantined bot to check filtering by 'quarantined' flag.
    bot2_dict = register('id2', u'id2', True)

    # Test limit + cursor: start the query.
    actual = query('/swarming/api/v1/client/bots?limit=1')
    expected['cursor'] = actual['cursor']
    expected['items'] = [bot1_dict]
    self.assertTrue(actual['cursor'])
    self.assertEqual(expected, actual)

    # Test limit + cursor: continue the query.
    actual = query(
        '/swarming/api/v1/client/bots?limit=1&cursor=%s' % actual['cursor'])
    expected['cursor'] = None
    expected['items'] = [bot2_dict]
    self.assertEqual(expected, actual)

    # Filtering by 'quarantined', then by 'is_dead'; each filter is expected
    # to return exactly one of the two bots.
    filtered_cases = (
        ('/swarming/api/v1/client/bots?filter=quarantined', bot2_dict),
        ('/swarming/api/v1/client/bots?filter=is_dead', bot1_dict),
    )
    for url, only_bot in filtered_cases:
        actual = query(url)
        expected['limit'] = 1000
        expected['cursor'] = None
        expected['items'] = [only_bot]
        self.assertEqual(expected, actual)
def is_dead(self, now):
    """Tells whether the bot counts as dead at time |now|.

    The bot is dead once the time elapsed between last_seen_ts and |now|
    reaches the configured bot_death_timeout_secs.
    """
    death_timeout = config.settings().bot_death_timeout_secs
    elapsed = now - self.last_seen_ts
    return elapsed.total_seconds() >= death_timeout