def terminate(self, request):
    """Asks a bot to gracefully terminate itself.

    The bot is not removed from the DB by this call; use 'delete' afterward
    to remove it.

    The response carries a pseudo-taskid that can be waited for in order to
    wait for the bot to turn down.

    This is particularly useful when a privileged user needs to safely debug
    a machine specific issue: trigger a terminate on one of the bots
    exhibiting the issue, wait for the pseudo-task to run, then access the
    machine with the guarantee that the bot is not running anymore.
    """
    # TODO(maruel): Disallow a terminate task when there's one currently
    # pending or if the bot is considered 'dead', e.g. no contact since 10
    # minutes.
    logging.debug('%s', request)
    bot_id = unicode(request.bot_id)
    # Raises 404 if there is no such bot.
    get_or_raise(bot_management.get_info_key(bot_id))
    try:
        # A special priority 0 task is what tells the bot to shutdown.
        termination_request = task_request.create_termination_task(
            bot_id, wait_for_capacity=True)
    except (datastore_errors.BadValueError, TypeError, ValueError) as e:
        raise endpoints.BadRequestException(e.message)
    summary = task_scheduler.schedule_request(
        termination_request, secret_bytes=None)
    packed_task_id = task_pack.pack_result_summary_key(summary.key)
    return swarming_rpcs.TerminateResponse(task_id=packed_task_id)
def handle_early_release(machine_lease):
    """Schedules a termination task once a lease nears its expiration.

    Nothing happens while the lease expiration is still further away than
    the lease's early release window.

    Args:
      machine_lease: MachineLease instance.
    """
    # Not inside the early release window yet: leave the lease alone.
    if machine_lease.lease_expiration_ts > utils.utcnow() + datetime.timedelta(
            seconds=machine_lease.early_release_secs):
        return

    logging.info('MachineLease ready to be released: %s', machine_lease.key)
    termination_request = task_request.create_termination_task(
        machine_lease.hostname, True)
    summary = task_scheduler.schedule_request(
        termination_request,
        check_acls=False,
    )
    # Remember which task was scheduled for this lease's host.
    associate_termination_task(
        machine_lease.key, machine_lease.hostname, summary.task_id)
def handle_early_release(machine_lease):
    """Schedules a termination task for a drained or nearly expired lease.

    The lease must not already have a termination task associated with it.

    Args:
      machine_lease: MachineLease instance.
    """
    assert not machine_lease.termination_task, machine_lease.termination_task

    release_window = datetime.timedelta(
        seconds=machine_lease.early_release_secs)
    early_expiration_ts = machine_lease.lease_expiration_ts - release_window
    # Only act when the lease is drained or its early expiration has passed.
    if not machine_lease.drained and early_expiration_ts > utils.utcnow():
        return

    logging.info(
        'MachineLease ready to be released:\nKey: %s\nHostname: %s',
        machine_lease.key,
        machine_lease.hostname,
    )
    termination_request = task_request.create_termination_task(
        machine_lease.hostname)
    summary = task_scheduler.schedule_request(termination_request, None)
    associate_termination_task(
        machine_lease.key, machine_lease.hostname, summary.task_id)
def test_create_termination_task(self):
    """The first slice of a termination task is flagged as is_terminate."""
    termination_request = task_request.create_termination_task(
        u'some-bot', wait_for_capacity=True)
    first_slice = termination_request.task_slice(0)
    self.assertTrue(first_slice.properties.is_terminate)
def check_for_connection(machine_lease):
    """Checks for a bot_connected event, releasing leases of lost bots.

    When a bot_connected event is found, the connection time is associated
    with the lease and reported as a metric. Otherwise, if the bot's info is
    missing or the bot is dead, a termination task is scheduled and the
    lease is released.

    Args:
      machine_lease: MachineLease instance.
    """
    assert machine_lease.instruction_ts

    # Technically this query is wrong: it walks events in reverse
    # chronological order, so the connection found is the most recent one
    # while the earliest is what is wanted. This function is only called for
    # new bots though, and is no longer called once the connection time is
    # recorded, so the recorded time should end up being the first connection
    # anyways. Iterating in the correct order would require building a new,
    # large index.
    events = bot_management.get_events_query(machine_lease.bot_id, True)
    for event in events:
        # Events older than the connection instruction are not interesting
        # (e.g. a bot_connected event from a previous use of the hostname),
        # so stop as soon as one is reached.
        if event.ts < machine_lease.instruction_ts:
            break
        if event.event_type != 'bot_connected':
            continue
        logging.info(
            'Bot connected:\nKey: %s\nHostname: %s\nTime: %s',
            machine_lease.key,
            machine_lease.hostname,
            event.ts,
        )
        associate_connection_ts(machine_lease.key, event.ts)
        time_to_connect = event.ts - machine_lease.instruction_ts
        ts_mon_metrics.on_machine_connected_time(
            time_to_connect.total_seconds(),
            fields={'machine_type': machine_lease.machine_type.id()},
        )
        return

    # The bot hasn't connected yet. If it's dead or missing, release the
    # lease. The connection instruction was already sent, so the bot could
    # still connect after the lease is released but before Machine Provider
    # actually deletes the bot. A termination task is therefore scheduled as
    # well, so that a bot connecting late just shuts itself down.
    bot_info = bot_management.get_info_key(machine_lease.hostname).get()
    bot_missing = not bot_info
    if not bot_missing and not bot_info.is_dead:
        return

    if bot_missing:
        logging.error(
            'BotInfo missing:\nKey: %s\nHostname: %s',
            machine_lease.key,
            machine_lease.hostname,
        )
    else:
        logging.warning(
            'Bot failed to connect in time:\nKey: %s\nHostname: %s',
            machine_lease.key,
            machine_lease.hostname,
        )

    termination_request = task_request.create_termination_task(
        machine_lease.hostname)
    task_scheduler.schedule_request(termination_request, None)

    if not release(machine_lease):
        return
    if bot_missing:
        clear_lease_request(machine_lease.key, machine_lease.client_request_id)
    else:
        cleanup_bot(machine_lease)
def test_create_termination_task(self):
    """The properties of a termination task are flagged as is_terminate."""
    termination_request = task_request.create_termination_task(
        u'some-bot', True)
    properties = termination_request.properties
    self.assertTrue(properties.is_terminate)