def test_cron_tidy_stale(self):
  """Checks that cron_tidy_stale() removes BotTaskDimensions and
  TaskDimensions only once the task expiration (plus _ADVANCE) has passed.

  NOTE(review): a second method with this exact name appears later in this
  file using _EXTEND_VALIDITY/_KEEP_DEAD semantics; if both are in the same
  test class, this definition is shadowed and never runs — confirm intent.
  """
  now = datetime.datetime(2010, 1, 2, 3, 4, 5)
  self.mock_now(now)
  # No prior task, so asserting the bot matches zero queues.
  self.assertEqual(0, _assert_bot())
  request = self._assert_task()
  # Entities stay valid for the task expiration window plus _ADVANCE.
  exp = (request.expiration_ts - request.created_ts) + task_queues._ADVANCE
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  # NOTE(review): this passes the bot id string; other tests pass
  # bot_management.get_root_key(u'bot1') — verify which form get_queues()
  # expects in this revision.
  self.assertEqual([2980491642], task_queues.get_queues(u'bot1'))

  # No-op.
  task_queues.cron_tidy_stale()
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual([2980491642], task_queues.get_queues(u'bot1'))

  # One second before expiration.
  self.mock_now(now, exp.total_seconds())
  task_queues.cron_tidy_stale()
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual([2980491642], task_queues.get_queues(u'bot1'))

  # TaskDimension expired.
  self.mock_now(now, exp.total_seconds() + 1)
  task_queues.cron_tidy_stale()
  self.assert_count(0, task_queues.BotTaskDimensions)
  self.assert_count(0, task_queues.TaskDimensions)
  self.assertEqual([], task_queues.get_queues(u'bot1'))
def test_assert_task_async_call_rebuld_task_cache_async(self):
  """An expired BotTaskDimensions is regenerated when an identical task is
  asserted again.

  NOTE(review): 'rebuld' typo kept in the method name; renaming would change
  the test id.
  """
  self.assertEqual(0, _assert_bot())
  bot_dims = {
      u'id': [u'bot1'],
  }
  self.mock_now(datetime.datetime(2020, 1, 2, 3, 4, 5))
  first = _gen_request(properties=_gen_properties(dimensions=bot_dims))
  task_queues.assert_task_async(first).get_result()
  for model in (task_queues.BotDimensions, task_queues.BotTaskDimensions,
                task_queues.TaskDimensions):
    self.assert_count(1, model)
  bot_root_key = bot_management.get_root_key('bot1')
  self.assertEqual(1, len(task_queues.get_queues(bot_root_key)))

  # Move the clock one second past valid_until_ts so the BotTaskDimensions
  # entity is considered expired.
  memcache.flush_all()
  stale = task_queues.BotTaskDimensions.query(
      ancestor=bot_root_key).fetch()[0]
  self.mock_now(stale.valid_until_ts + datetime.timedelta(seconds=1))
  self.assertEqual(0, len(task_queues.get_queues(bot_root_key)))

  # Asserting another task with the same dimensions repopulates the cache.
  memcache.flush_all()
  second = _gen_request(properties=_gen_properties(dimensions=bot_dims))
  task_queues.assert_task_async(second).get_result()
  for model in (task_queues.BotDimensions, task_queues.BotTaskDimensions,
                task_queues.TaskDimensions):
    self.assert_count(1, model)
  self.assertEqual(1, len(task_queues.get_queues(bot_root_key)))
def test_assert_bot_dimensions_changed(self):
  """Stale BotTaskDimensions must be deleted when the bot's dimensions
  change, without affecting the original expiration schedule."""
  start = datetime.datetime(2010, 1, 2, 3, 4, 5)
  self.mock_now(start)
  request = self._assert_task()
  expiry = (request.expiration_ts - request.created_ts) + task_queues._ADVANCE
  self.assertEqual(1, _assert_bot())

  # One hour later, the bot changes dimensions.
  self.mock_now(start, task_queues._ADVANCE.total_seconds())
  self.assertEqual(1, _assert_bot({u'gpu': u'Matrox'}))
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual([2980491642], task_queues.get_queues(u'bot1'))

  # One second before expiration.
  self.mock_now(start, expiry.total_seconds())
  self.assertEqual(None, _assert_bot({u'gpu': u'Matrox'}))
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual([2980491642], task_queues.get_queues(u'bot1'))

  # Past expiration: the mid-life dimension change did not push back the
  # BotTaskDimensions expiration.
  self.mock_now(start, expiry.total_seconds() + 1)
  self.assertEqual(0, _assert_bot())
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual([], task_queues.get_queues(u'bot1'))
def test_or_dimensions_new_tasks(self):
  """Bots are already registered, then new tasks with OR-ed dimensions
  ('|' values) show up and get matched to the subset of capable bots."""
  self.mock_now(datetime.datetime(2020, 1, 2, 3, 4, 5))
  self.assertEqual(
      0,
      _assert_bot(bot_id=u'bot1', dimensions={
          u'os': [u'v1', u'v2'],
          u'gpu': [u'nv'],
      }))
  self.assertEqual(
      0,
      _assert_bot(bot_id=u'bot2', dimensions={
          u'os': [u'v2'],
          u'gpu': [u'amd'],
      }))
  payloads = self._mock_enqueue_task_async_for_rebuild_task_cache()

  def assert_and_rebuild(request):
    # Asserts the task, consumes the single enqueued payload and rebuilds
    # the task cache from it.
    task_queues.assert_task_async(request).get_result()
    self.assertEqual(1, len(payloads))
    self.assertEqual(
        True, task_queues.rebuild_task_cache_async(payloads[-1]).get_result())
    payloads.pop()

  wide = _gen_request(properties=_gen_properties(
      dimensions={
          u'pool': [u'default'],
          u'os': [u'v1|v2'],
          u'gpu': [u'nv|amd'],
      }))
  assert_and_rebuild(wide)
  # Both bots should be able to handle |wide|.
  self.assert_count(2, task_queues.BotDimensions)
  self.assert_count(2, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual(4, len(task_queues.TaskDimensions.query().get().sets))
  bot1_root_key = bot_management.get_root_key(u'bot1')
  bot2_root_key = bot_management.get_root_key(u'bot2')
  self.assertEqual(1, len(task_queues.get_queues(bot1_root_key)))
  self.assertEqual(1, len(task_queues.get_queues(bot2_root_key)))

  narrow = _gen_request(properties=_gen_properties(
      dimensions={
          u'pool': [u'default'],
          u'os': [u'v1'],
          u'gpu': [u'nv|amd'],
      }))
  assert_and_rebuild(narrow)
  # Only bot1 can handle |narrow|.
  self.assert_count(3, task_queues.BotTaskDimensions)
  self.assert_count(2, task_queues.TaskDimensions)
  self.assertEqual(2, len(task_queues.get_queues(bot1_root_key)))
  self.assertEqual(1, len(task_queues.get_queues(bot2_root_key)))
def test_assert_task_then_bot(self):
  """Asserting a task first, then the bot, yields exactly one entity of
  each kind and one matching queue for the bot."""
  self._assert_task()
  self.assertEqual(1, _assert_bot())
  for model in (task_queues.BotDimensions, task_queues.BotTaskDimensions,
                task_queues.TaskDimensions):
    self.assert_count(1, model)
  self.assertEqual([2980491642], task_queues.get_queues(u'bot1'))
def test_probably_has_capacity_get_queues(self):
  """probably_has_capacity() only reports capacity after a matching request
  was enqueued AND get_queues() primed the memcache entry."""
  dims = {u'pool': [u'default'], u'os': [u'Ubuntu-16.04']}
  # Capacity registers there only once there's a request enqueued and
  # get_queues() is called.
  _assert_bot()
  req = _gen_request(properties=_gen_properties(dimensions=dims))
  task_queues.assert_task_async(req).get_result()
  self.assertEqual(1, self.execute_tasks())
  self.assertEqual(None, task_queues.probably_has_capacity(dims))

  # The capacity flag is set as a side effect of get_queues().
  bot_root_key = bot_management.get_root_key(u'bot1')
  task_queues.get_queues(bot_root_key)
  self.assertEqual(True, task_queues.probably_has_capacity(dims))
  self.assertEqual([1843498234],
                   memcache.get('bot1', namespace='task_queues'))
def test_assert_task_async_then_bot(self):
  """Same as test_assert_task_then_bot but resolves the queue list via the
  bot's root key."""
  self._assert_task()
  self.assertEqual(1, _assert_bot())
  for model in (task_queues.BotDimensions, task_queues.BotTaskDimensions,
                task_queues.TaskDimensions):
    self.assert_count(1, model)
  root_key = bot_management.get_root_key(u'bot1')
  self.assertEqual([2980491642], task_queues.get_queues(root_key))
def test_cron_tidy_stale(self):
  """cron_tidy_stale() evicts BotTaskDimensions at expiration but keeps
  TaskDimensions around until _KEEP_DEAD has elapsed."""
  start = datetime.datetime(2010, 1, 2, 3, 4, 5)
  self.mock_now(start)
  self.assertEqual(0, _assert_bot())
  request = self._assert_task()
  expiry = (request.expiration_ts - request.created_ts +
            task_queues._EXTEND_VALIDITY)
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  bot_root_key = bot_management.get_root_key(u'bot1')
  self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

  def tidy_and_check(bot_task_count, task_count, queues):
    # Runs the cron and asserts the resulting entity counts and queues.
    task_queues.cron_tidy_stale()
    self.assert_count(bot_task_count, task_queues.BotTaskDimensions)
    self.assert_count(task_count, task_queues.TaskDimensions)
    self.assertEqual(queues, task_queues.get_queues(bot_root_key))

  # No-op.
  tidy_and_check(1, 1, [2980491642])

  # TaskDimension expired but is still kept; get_queues() doesn't return it
  # anymore even if still in the DB. BotTaskDimensions was evicted.
  self.mock_now(start, expiry.total_seconds() + 1)
  tidy_and_check(0, 1, [])

  # Just before _KEEP_DEAD.
  self.mock_now(start, (expiry + task_queues._KEEP_DEAD).total_seconds())
  tidy_and_check(0, 1, [])

  # TaskDimension expired, after KEEP_DEAD.
  self.mock_now(start, (expiry + task_queues._KEEP_DEAD).total_seconds() + 1)
  tidy_and_check(0, 0, [])
def test_or_dimensions_same_hash(self):
  """Two requests whose OR-ed dimensions list the same values in a
  different order must map to the same dimension_hash."""
  self.mock_now(datetime.datetime(2020, 1, 2, 3, 4, 5))
  for bot_id, os_version in ((u'bot1', u'v1'), (u'bot2', u'v2'),
                             (u'bot3', u'v3')):
    self.assertEqual(
        0, _assert_bot(bot_id=bot_id, dimensions={u'os': [os_version]}))
  payloads = self._mock_enqueue_task_async_for_rebuild_task_cache()

  # Both requests should have the same dimension_hash.
  ordered = _gen_request(properties=_gen_properties(dimensions={
      u'pool': [u'default'],
      u'os': [u'v1|v2|v3'],
  }))
  reversed_order = _gen_request(properties=_gen_properties(dimensions={
      u'pool': [u'default'],
      u'os': [u'v3|v2|v1'],
  }))
  task_queues.assert_task_async(ordered).get_result()
  task_queues.assert_task_async(reversed_order).get_result()
  self.assertEqual(2, len(payloads))
  while payloads:
    future = task_queues.rebuild_task_cache_async(payloads[-1])
    self.assertEqual(True, future.get_result())
    payloads.pop()

  self.assert_count(3, task_queues.BotDimensions)
  self.assert_count(3, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual(3, len(task_queues.TaskDimensions.query().get().sets))
  # Every bot sees exactly the single shared queue.
  for bot_id in (u'bot1', u'bot2', u'bot3'):
    root_key = bot_management.get_root_key(bot_id)
    self.assertEqual(1, len(task_queues.get_queues(root_key)))
def post(self, task_id=None):
  """Handles a bot's task update report.

  Parses and validates the update payload, records performance stats and
  output, forwards the state change to the task scheduler and tells the bot
  whether it must stop the task.

  Aborts with HTTP 400 on malformed payloads and HTTP 500 on transient
  internal failures (so the bot retries).
  """
  # Unlike handshake and poll, we do not accept invalid keys here. This code
  # path is much more strict.
  request = self.parse_body()
  msg = log_unexpected_subset_keys(self.ACCEPTED_KEYS, self.REQUIRED_KEYS,
                                   request, self.request, 'bot', 'keys')
  if msg:
    self.abort_with_error(400, error=msg)
  bot_id = request['id']
  # The URL-provided task_id parameter is deliberately overwritten by the
  # body's value.
  task_id = request['task_id']

  # Make sure bot self-reported ID matches the authentication token. Raises
  # auth.AuthorizationError if not.
  bot_auth.validate_bot_id_and_fetch_config(bot_id)

  # All of these are optional in the payload; missing ones default to None
  # (cost_usd defaults to 0).
  bot_overhead = request.get('bot_overhead')
  cipd_pins = request.get('cipd_pins')
  cipd_stats = request.get('cipd_stats')
  cost_usd = request.get('cost_usd', 0)
  duration = request.get('duration')
  exit_code = request.get('exit_code')
  hard_timeout = request.get('hard_timeout')
  io_timeout = request.get('io_timeout')
  isolated_stats = request.get('isolated_stats')
  output = request.get('output')
  output_chunk_start = request.get('output_chunk_start')
  outputs_ref = request.get('outputs_ref')

  # Stats entities cannot be built without the overall overhead value.
  if (isolated_stats or cipd_stats) and bot_overhead is None:
    ereporter2.log_request(request=self.request,
                           source='server',
                           category='task_failure',
                           message='Failed to update task: %s' % task_id)
    self.abort_with_error(
        400,
        error=
        'isolated_stats and cipd_stats require bot_overhead to be set'
        '\nbot_overhead: %s\nisolate_stats: %s' %
        (bot_overhead, isolated_stats))

  run_result_key = task_pack.unpack_run_result_key(task_id)
  performance_stats = None
  if bot_overhead is not None:
    # Reconstruct the PerformanceStats entity from the reported raw dicts.
    performance_stats = task_result.PerformanceStats(
        bot_overhead=bot_overhead)
    if isolated_stats:
      download = isolated_stats.get('download') or {}
      upload = isolated_stats.get('upload') or {}

      def unpack_base64(d, k):
        # Returns the base64-decoded value for key k, or None if absent.
        x = d.get(k)
        if x:
          return base64.b64decode(x)

      performance_stats.isolated_download = task_result.OperationStats(
          duration=download.get('duration'),
          initial_number_items=download.get('initial_number_items'),
          initial_size=download.get('initial_size'),
          items_cold=unpack_base64(download, 'items_cold'),
          items_hot=unpack_base64(download, 'items_hot'))
      performance_stats.isolated_upload = task_result.OperationStats(
          duration=upload.get('duration'),
          items_cold=unpack_base64(upload, 'items_cold'),
          items_hot=unpack_base64(upload, 'items_hot'))
    if cipd_stats:
      performance_stats.package_installation = task_result.OperationStats(
          duration=cipd_stats.get('duration'))

  if output is not None:
    try:
      output = base64.b64decode(output)
    except UnicodeEncodeError as e:
      # Defensive: coerce non-ascii input to something decodable.
      logging.error('Failed to decode output\n%s\n%r', e, output)
      output = output.encode('ascii', 'replace')
    except TypeError as e:
      # Save the output as-is instead. The error will be logged in ereporter2
      # and returning a HTTP 500 would only force the bot to stay in a retry
      # loop.
      logging.error('Failed to decode output\n%s\n%r', e, output)
  if outputs_ref:
    outputs_ref = task_request.FilesRef(**outputs_ref)

  if cipd_pins:
    cipd_pins = task_result.CipdPins(
        client_package=task_request.CipdPackage(
            **cipd_pins['client_package']),
        packages=[
            task_request.CipdPackage(**args)
            for args in cipd_pins['packages']
        ])

  # Tell the task queues management engine that the bot is still alive, and
  # it shall refresh the task queues.
  bot_root_key = bot_management.get_root_key(bot_id)
  task_queues.get_queues(bot_root_key)

  try:
    state = task_scheduler.bot_update_task(
        run_result_key=run_result_key,
        bot_id=bot_id,
        output=output,
        output_chunk_start=output_chunk_start,
        exit_code=exit_code,
        duration=duration,
        hard_timeout=hard_timeout,
        io_timeout=io_timeout,
        cost_usd=cost_usd,
        outputs_ref=outputs_ref,
        cipd_pins=cipd_pins,
        performance_stats=performance_stats)
    if not state:
      # A falsy state means the update could not be committed; ask the bot
      # to retry.
      logging.info('Failed to update, please retry')
      self.abort_with_error(500, error='Failed to update, please retry')

    # Map the resulting task state to a bot event type.
    if state in (task_result.State.COMPLETED, task_result.State.TIMED_OUT):
      action = 'task_completed'
    elif state == task_result.State.KILLED:
      action = 'task_killed'
    else:
      assert state in (task_result.State.BOT_DIED,
                       task_result.State.RUNNING), state
      action = 'task_update'
    bot_management.bot_event(
        event_type=action,
        bot_id=bot_id,
        external_ip=self.request.remote_addr,
        authenticated_as=auth.get_peer_identity().to_bytes(),
        dimensions=None,
        state=None,
        version=None,
        quarantined=None,
        maintenance_msg=None,
        task_id=task_id,
        task_name=None)
  except ValueError as e:
    # Bad input from the bot: report and reject permanently (400).
    ereporter2.log_request(request=self.request,
                           source='server',
                           category='task_failure',
                           message='Failed to update task: %s' % e)
    self.abort_with_error(400, error=str(e))
  except webob.exc.HTTPException:
    # abort_with_error() raises these; let them propagate untouched.
    raise
  except Exception as e:
    logging.exception('Internal error: %s', e)
    self.abort_with_error(500, error=str(e))

  # - BOT_DIED will occur when the following conditions are true:
  #   - The bot polled correctly, but then stopped updating for at least
  #     task_result.BOT_PING_TOLERANCE. (It can occur if the host went to
  #     sleep, or the OS was overwhelmed).
  #   - /internal/cron/abort_bot_died runs, detects the bot is MIA, kills the
  #     task.
  #   - Bot wakes up, starts sending updates again.
  # - KILLED is when the client uses the kill API to forcibly stop a running
  #   task.
  must_stop = state in (task_result.State.BOT_DIED,
                        task_result.State.KILLED)
  if must_stop:
    logging.info('asking bot to kill the task')
  self.send_response({'must_stop': must_stop, 'ok': True})
def test_rebuild_task_cache_async(self):
  """End-to-end check of rebuild_task_cache_async(), including the
  _EXTEND_VALIDITY expiration window and the _KEEP_DEAD retention period.
  """
  # Assert that expiration works.
  now = datetime.datetime(2010, 1, 2, 3, 4, 5)
  self.mock_now(now)
  # We want _yield_BotTaskDimensions_keys() to return multiple
  # BotTaskDimensions ndb.Key to confirm that the inner loops work. This
  # requires a few bots.
  _assert_bot(bot_id=u'bot1')
  _assert_bot(bot_id=u'bot2')
  _assert_bot(bot_id=u'bot3')
  bot_root_key = bot_management.get_root_key(u'bot1')
  self.assertEqual(0, task_queues.BotTaskDimensions.query().count())
  self.assertEqual(0, task_queues.TaskDimensions.query().count())

  # Intentionally force the code to throttle the number of concurrent RPCs,
  # otherwise the inner loops wouldn't be reached with less than 50 bots, and
  # testing with 50 bots, would make the unit test slow.
  self.mock(task_queues, '_CAP_FUTURES_LIMIT', 1)
  payloads = self._mock_enqueue_task_async_for_rebuild_task_cache()

  # The equivalent of self._assert_task(tasks=1) except that we snapshot the
  # payload.
  # Trigger multiple task queues to go deeper in the code.
  request_1 = _gen_request(properties=_gen_properties(dimensions={
      u'cpu': [u'x86-64'],
      u'pool': [u'default'],
  }))
  task_queues.assert_task_async(request_1).get_result()
  self.assertEqual(1, len(payloads))
  f = task_queues.rebuild_task_cache_async(payloads[-1])
  self.assertEqual(True, f.get_result())
  # One BotTaskDimensions per bot, one TaskDimensions for the request.
  self.assertEqual(3, task_queues.BotTaskDimensions.query().count())
  self.assertEqual(1, task_queues.TaskDimensions.query().count())
  self.assertEqual(60, request_1.expiration_secs)
  # valid_until_ts = creation + expiration window + _EXTEND_VALIDITY.
  expected = now + task_queues._EXTEND_VALIDITY + datetime.timedelta(
      seconds=request_1.expiration_secs)
  self.assertEqual(
      expected, task_queues.TaskDimensions.query().get().valid_until_ts)

  # A second request with different dimensions creates a second queue.
  request_2 = _gen_request(properties=_gen_properties(
      dimensions={
          u'os': [u'Ubuntu-16.04'],
          u'pool': [u'default'],
      }))
  task_queues.assert_task_async(request_2).get_result()
  self.assertEqual(2, len(payloads))
  f = task_queues.rebuild_task_cache_async(payloads[-1])
  self.assertEqual(True, f.get_result())
  self.assertEqual(6, task_queues.BotTaskDimensions.query().count())
  self.assertEqual(2, task_queues.TaskDimensions.query().count())
  self.assertEqual([227177418, 1843498234],
                   task_queues.get_queues(bot_root_key))
  # The queue list survives a memcache flush (rebuilt from datastore).
  memcache.flush_all()
  self.assertEqual([227177418, 1843498234],
                   task_queues.get_queues(bot_root_key))

  # Now expire the two TaskDimensions, one at a time, and rebuild the task
  # queue.
  offset = (task_queues._EXTEND_VALIDITY + datetime.timedelta(
      seconds=request_1.expiration_secs)).total_seconds() + 1
  self.mock_now(now, offset)
  f = task_queues.rebuild_task_cache_async(payloads[0])
  self.assertEqual(True, f.get_result())
  self.assertEqual(6, task_queues.BotTaskDimensions.query().count())
  self.assertEqual(2, task_queues.TaskDimensions.query().count())
  # Stale value still served from memcache until flushed.
  self.assertEqual([227177418, 1843498234],
                   task_queues.get_queues(bot_root_key))

  # Observe the effect of memcache. See comment in get_queues().
  memcache.flush_all()
  self.assertEqual([], task_queues.get_queues(bot_root_key))

  # Re-running still do not delete TaskDimensions because they are kept until
  # _KEEP_DEAD.
  f = task_queues.rebuild_task_cache_async(payloads[1])
  self.assertEqual(True, f.get_result())
  self.assertEqual(6, task_queues.BotTaskDimensions.query().count())
  self.assertEqual(2, task_queues.TaskDimensions.query().count())
  self.assertEqual([], task_queues.get_queues(bot_root_key))

  # Get past _KEEP_DEAD.
  offset = (task_queues._EXTEND_VALIDITY + task_queues._KEEP_DEAD +
            datetime.timedelta(seconds=request_1.expiration_secs)
            ).total_seconds() + 1
  self.mock_now(now, offset)
  self.assertEqual([], task_queues.get_queues(bot_root_key))
  f = task_queues.rebuild_task_cache_async(payloads[0])
  self.assertEqual(True, f.get_result())
  self.assertEqual(6, task_queues.BotTaskDimensions.query().count())
  # The dead TaskDimensions is finally dropped.
  self.assertEqual(1, task_queues.TaskDimensions.query().count())
  self.assertEqual([], task_queues.get_queues(bot_root_key))
def _yield_potential_tasks(bot_id):
  """Queries all the known task queues in parallel and yields the task in
  order of priority.

  The ordering is opportunistic, not strict. There's a risk of not returning
  exactly in the priority order depending on index staleness and query
  execution latency. The number of queries is unbounded.

  Yields:
    TaskToRun entities, trying to yield the highest priority one first.  To
    have finite execution time, starts yielding results once one of these
    conditions are met:
    - 1 second elapsed; in this case, continue iterating in the background
    - First page of every query returned
    - All queries exhausted
  """
  potential_dimensions_hashes = task_queues.get_queues(bot_id)
  # Note that the default ndb.EVENTUAL_CONSISTENCY is used so stale items may be
  # returned. It's handled specifically by consumers of this function.
  start = time.time()
  # One query and one page-yielding generator per queue.
  queries = [_get_task_to_run_query(d) for d in potential_dimensions_hashes]
  yielders = [_yield_pages_async(q, 10) for q in queries]
  # We do care about the first page of each query so we cannot merge all the
  # results of every query insensibly.
  futures = []

  try:
    # Kick off the first page of every query; a yielder may be immediately
    # exhausted, hence the None default.
    for y in yielders:
      futures.append(next(y, None))

    # Pump the ndb event loop for up to 1 second or until every first page
    # resolved, whichever comes first.
    while (time.time() - start) < 1 and not all(
        f.done() for f in futures if f):
      r = ndb.eventloop.run0()
      if r is None:
        break
      time.sleep(r)
    logging.debug(
        '_yield_potential_tasks(%s): waited %.3fs for %d items from %d '
        'Futures',
        bot_id, time.time() - start,
        sum(len(f.get_result()) for f in futures if f.done()), len(futures))

    # items is a list of TaskToRun. The entities are needed because property
    # queue_number is used to sort according to each task's priority.
    items = []
    for i, f in enumerate(futures):
      if f and f.done():
        # The ndb.Future returns a list of up to 10 TaskToRun entities.
        r = f.get_result()
        if r:
          # The ndb.Query ask for a valid queue_number but under load, it
          # happens the value is not valid anymore.
          # NOTE(review): the genexp variable `i` shadows the enumerate index
          # `i`; harmless (genexps have their own scope) but confusing.
          items.extend(i for i in r if i.queue_number)
        # Prime the next page, in case.
        futures[i] = next(yielders[i], None)

    # That's going to be our search space for now.
    items.sort(key=_queue_number_fifo_priority)

    # It is possible that there is no items yet, in case all futures are
    # taking more than 1 second.
    # It is possible that all futures are done if every queue has less than
    # 10 task pending.
    while any(futures) or items:
      if items:
        yield items[0]
        items = items[1:]
      else:
        # Let activity happen.
        ndb.eventloop.run1()
      # Collect any page that resolved since the last pass and re-sort only
      # when something actually changed.
      changed = False
      for i, f in enumerate(futures):
        if f and f.done():
          # See loop above for explanation.
          items.extend(i for i in f.get_result() if i.queue_number)
          futures[i] = next(yielders[i], None)
          changed = True
      if changed:
        items.sort(key=_queue_number_fifo_priority)
  except apiproxy_errors.DeadlineExceededError as e:
    # This is normally due to: "The API call datastore_v3.RunQuery() took too
    # long to respond and was cancelled."
    # At that point, the Cloud DB index is not able to sustain the load. So the
    # best thing to do is to back off a bit and not return any task to the bot
    # for this poll.
    logging.error(
        'Failed to yield a task due to an RPC timeout. Returning no '
        'task to the bot: %s', e)