def testAddRoleToPrimarySuccess(self):
    """Test that a role can be added to a primary server with actions run.

    Records the calls expected while the drone role is added to the primary
    scheduler with action=True: role validation, the server check, creation
    of the ServerRole, a query for the primary scheduler, and finally one
    infra.execute_command call (e.g., the scheduler restart needed for the
    new drone to be added).
    """
    role_names = server_models.ServerRole.ROLE
    primary_status = server_models.Server.STATUS.PRIMARY
    scheduler = self.PRIMARY_SCHEDULER

    # Record phase: the order of expectations is kept as originally written.
    server_models.validate(role=role_names.DRONE)
    server_manager_utils.check_server(
        mox.IgnoreArg(), mox.IgnoreArg()).AndReturn(True)
    server_manager_utils.use_server_db().MultipleTimes().AndReturn(True)

    self.mox.StubOutWithMock(scheduler, 'get_role_names')
    scheduler.get_role_names().AndReturn([role_names.SCHEDULER])

    server_models.ServerRole.objects.create(
        server=scheduler,
        role=role_names.DRONE).AndReturn(self.DRONE_ROLE)
    server_models.Server.objects.filter(
        roles__role=role_names.SCHEDULER,
        status=primary_status).AndReturn([scheduler])

    # The action itself: any command executed on any host is accepted.
    infra.execute_command(mox.IgnoreArg(), mox.IgnoreArg())

    # Replay phase.
    self.mox.ReplayAll()
    server_manager._add_role(scheduler, role_names.DRONE, action=True)
def testDeleteRoleFromPrimarySuccess(self):
    """Test that a role can be deleted from a primary server with actions run.

    Records the database lookups expected while the drone role is removed
    from the primary drone with action=True, the missing-role warning, and
    one infra.execute_command call (e.g., the scheduler restart needed to
    drop the existing drone).
    """
    role_names = server_models.ServerRole.ROLE
    primary_status = server_models.Server.STATUS.PRIMARY
    drone = self.PRIMARY_DRONE

    # Record phase: the order of expectations is kept as originally written.
    server_manager_utils.use_server_db().MultipleTimes().AndReturn(True)
    server_models.validate(role=role_names.DRONE)

    self.mox.StubOutWithMock(drone, 'get_role_names')
    drone.get_role_names().MultipleTimes().AndReturn([role_names.DRONE])

    self.mox.StubOutWithMock(drone.roles, 'get')
    drone.roles.get(role=role_names.DRONE).AndReturn(self.DRONE_ROLE)

    server_models.Server.objects.filter(
        roles__role=role_names.SCHEDULER,
        status=primary_status).AndReturn([self.PRIMARY_SCHEDULER])
    server_manager.server_manager_utils.warn_missing_role(
        role_names.DRONE, drone)

    # The action itself: any command executed on any host is accepted.
    infra.execute_command(mox.IgnoreArg(), mox.IgnoreArg())

    # Replay phase.
    self.mox.ReplayAll()
    server_manager._delete_role(drone, role_names.DRONE, action=True)
def testChangeStatusSuccess_PrimaryToRepairFailed(self):
    """Test that a primary server can be moved to status repair_required.

    Records the status validation, the unique-instance role query, the
    missing-role warning, the lookup of the primary scheduler and one
    infra.execute_command call, then changes the primary drone's status
    with action=True.
    """
    role_names = server_models.ServerRole.ROLE
    statuses = server_models.Server.STATUS
    drone = self.PRIMARY_DRONE

    # Record phase: the order of expectations is kept as originally written.
    server_models.validate(status=statuses.REPAIR_REQUIRED)

    self.mox.StubOutWithMock(drone.roles, 'filter')
    self.mox.StubOutWithMock(drone, 'get_role_names')
    drone.get_role_names().MultipleTimes().AndReturn([role_names.DRONE])
    drone.roles.filter(
        role__in=server_models.ServerRole.ROLES_REQUIRE_UNIQUE_INSTANCE
    ).AndReturn(None)

    server_manager_utils.use_server_db().MultipleTimes().AndReturn(True)
    server_manager_utils.warn_missing_role(role_names.DRONE, drone)
    server_models.Server.objects.filter(
        roles__role=role_names.SCHEDULER,
        status=statuses.PRIMARY).AndReturn([self.PRIMARY_SCHEDULER])

    # The action itself: any command executed on any host is accepted.
    infra.execute_command(mox.IgnoreArg(), mox.IgnoreArg())

    # Replay phase.
    self.mox.ReplayAll()
    server_manager._change_status(
        server=drone,
        status=statuses.REPAIR_REQUIRED,
        action=True)
def get_drones():
    """Return the list of drones from the server database or global config.

    @return: The result of server_manager_utils.get_drones() when the server
             database is in use. Otherwise the `drones` option of the
             scheduler config section (default 'localhost'), split on commas
             into a list of whitespace-stripped hostnames.
    """
    if not server_manager_utils.use_server_db():
        configured = global_config.global_config.get_config_value(
            scheduler_config.CONFIG_SECTION, 'drones', default='localhost')
        return [name.strip() for name in configured.split(',')]

    return server_manager_utils.get_drones()
def get_shards():
    """Get a list of shards from server database or global config.

    @return: The result of server_manager_utils.get_shards() when the server
             database is in use. Otherwise the `shards` option of the
             `SERVER` config section, split on commas into a list of
             whitespace-stripped hostnames. Blank entries are dropped, so an
             unset or empty option yields an empty list.
    """
    if server_manager_utils.use_server_db():
        return server_manager_utils.get_shards()

    config = global_config.global_config
    shards = config.get_config_value('SERVER', 'shards', default='')
    # ''.split(',') evaluates to [''], so without filtering the default value
    # would be reported as a single shard with an empty hostname.
    hostnames = (hostname.strip() for hostname in shards.split(','))
    return [hostname for hostname in hostnames if hostname]
def refresh_drone_configs(self):
    """Reread the configuration of every drone and apply it.

    The global config file is parsed again, then `enabled`, `max_processes`
    and `allowed_users` are refreshed on each drone in self._drones, either
    from the attributes of the matching server database entry or from the
    `<hostname>_*` options of the scheduler config section. Afterwards the
    drone queue is reordered and the notification record is reset.
    """
    # server_manager_utils is deliberately imported here instead of at the
    # top of this module. test_that imports drone_manager (while importing
    # autoserv_utils) before it sets up django; it only sets django up in
    # setup_local_afe, since django is not needed when test_that runs the
    # test on lab duts through the :lab: option. A module-level import of
    # server_manager_utils would therefore make test_that fail because
    # django is not set up yet.
    from autotest_lib.site_utils import server_manager_utils

    config = global_config.global_config
    section = scheduler_config.CONFIG_SECTION
    config.parse_config_file()

    for hostname, drone in self._drones.iteritems():
        if server_manager_utils.use_server_db():
            # Settings come from the attributes of the server's db record.
            server = server_manager_utils.get_servers(hostname=hostname)[0]
            attributes = {attr.attribute: attr.value
                          for attr in server.attributes.all()}
            drone.enabled = (int(attributes.get('disabled', 0)) == 0)
            drone.max_processes = int(attributes.get(
                'max_processes',
                scheduler_config.config.max_processes_per_drone))
            allowed_users = attributes.get('users', None)
        else:
            # Settings come from per-hostname options in the global config.
            disabled = config.get_config_value(
                section, '%s_disabled' % hostname, default='')
            drone.enabled = not disabled
            drone.max_processes = config.get_config_value(
                section, '%s_max_processes' % hostname, type=int,
                default=scheduler_config.config.max_processes_per_drone)
            allowed_users = config.get_config_value(
                section, '%s_users' % hostname, default=None)

        # A missing or empty user list means no restriction is recorded.
        drone.allowed_users = (
            set(allowed_users.split()) if allowed_users else None)

        logging.info('Drone %s.max_processes: %s', hostname,
                     drone.max_processes)
        logging.info('Drone %s.enabled: %s', hostname, drone.enabled)
        logging.info('Drone %s.allowed_users: %s', hostname,
                     drone.allowed_users)
        logging.info('Drone %s.support_ssp: %s', hostname,
                     drone.support_ssp)

    # max_processes may have changed.
    self._reorder_drone_queue()
    # Clear notification record about reaching max_processes limit.
    self._notify_record = {}
def testDeleteRoleFromBackupSuccess(self):
    """Test that a role can be deleted from a backup server without actions.

    Only the role validation and the database lookups for the drone role
    are recorded; no infra.execute_command call is expected even though
    action=True, because the server is a backup.
    """
    drone_role_name = server_models.ServerRole.ROLE.DRONE
    backup = self.BACKUP_DRONE

    # Record phase: the order of expectations is kept as originally written.
    server_models.validate(role=drone_role_name)
    server_manager_utils.use_server_db().MultipleTimes().AndReturn(True)

    self.mox.StubOutWithMock(backup, 'get_role_names')
    backup.get_role_names().MultipleTimes().AndReturn([drone_role_name])

    self.mox.StubOutWithMock(backup.roles, 'get')
    backup.roles.get(role=drone_role_name).AndReturn(self.DRONE_ROLE)

    # Replay phase.
    self.mox.ReplayAll()
    server_manager._delete_role(server=backup,
                                role=drone_role_name,
                                action=True)
def testAddRoleToBackupSuccess(self):
    """Test that a role can be added to a backup server without actions.

    Records the role validation, the server check and the creation of the
    devserver ServerRole; no infra.execute_command call is expected even
    though action=True, because the server is a backup.
    """
    role_names = server_models.ServerRole.ROLE
    backup = self.BACKUP_DRONE

    # Record phase: the order of expectations is kept as originally written.
    server_models.validate(role=role_names.DEVSERVER)
    server_manager_utils.check_server(
        mox.IgnoreArg(), mox.IgnoreArg()).AndReturn(True)
    server_manager_utils.use_server_db().MultipleTimes().AndReturn(True)

    self.mox.StubOutWithMock(backup, 'get_role_names')
    backup.get_role_names().AndReturn([role_names.DRONE])

    server_models.ServerRole.objects.create(
        server=mox.IgnoreArg(),
        role=role_names.DEVSERVER).AndReturn(self.DRONE_ROLE)

    # Replay phase.
    self.mox.ReplayAll()
    server_manager._add_role(server=backup,
                             role=role_names.DEVSERVER,
                             action=True)
def try_execute(server, roles, enable, post_change,
                prev_status=server_models.Server.STATUS.REPAIR_REQUIRED,
                do_action=False):
    """Execute, or warn about, the actions tied to a server's role changes.

    @param server: Server that has the role changes.
    @param roles: A list of roles changed.
    @param enable: True if the roles are enabled, i.e., added to the server;
                   False if the roles are removed from the server.
    @param post_change: True if the actions to consider are the ones applied
                        after the role changes, False for the ones applied
                        before the role changes.
    @param prev_status: The status of the server before a status change, if
                        any. It helps to decide whether actions should run,
                        since they also apply when the server's status
                        changed from primary to another status. Default to
                        repair_required.
    @param do_action: True to execute the actions; otherwise a warning is
                      written to stderr for each skipped action.
    """
    if not server_manager_utils.use_server_db():
        return

    # Actions only apply to a server that is, or just was, in primary
    # status. Note that no action is needed before a server is changed to
    # primary status. If that assumption is no longer valid, this method
    # needs to be updated accordingly.
    primary = server_models.Server.STATUS.PRIMARY
    involves_primary = server.status == primary or prev_status == primary
    if not involves_primary:
        return

    # Pick the role -> actions table for this phase. Nothing is defined for
    # the phase before roles are added.
    if enable:
        actions_by_role = ACTIONS_AFTER_ROLE_APPLIED if post_change else {}
    elif post_change:
        actions_by_role = ACTIONS_AFTER_ROLE_REMOVED
    else:
        actions_by_role = ACTIONS_BEFORE_ROLE_REMOVED

    collected = [action
                 for role in roles
                 for action in actions_by_role.get(role, [])]

    # Each distinct action is handled once, however many roles request it.
    for action in set(collected):
        if do_action:
            apply(action)
            continue
        message = ('WARNING! Action %s is skipped. Please manually '
                   'execute the action to make your change effective.' %
                   str(action))
        sys.stderr.write(message + '\n')
def testChangeStatusSuccess_BackupToPrimary(self):
    """Test that a backup server can be promoted to status primary.

    Records the status validation, the unique-instance role query, the
    lookup of the primary scheduler and one infra.execute_command call,
    then changes the backup drone's status with action=True.
    """
    role_names = server_models.ServerRole.ROLE
    primary_status = server_models.Server.STATUS.PRIMARY
    backup = self.BACKUP_DRONE

    # Record phase: the order of expectations is kept as originally written.
    server_models.validate(status=primary_status)
    server_manager_utils.use_server_db().MultipleTimes().AndReturn(True)

    self.mox.StubOutWithMock(backup, 'get_role_names')
    backup.get_role_names().MultipleTimes().AndReturn([role_names.DRONE])

    self.mox.StubOutWithMock(backup.roles, 'filter')
    backup.roles.filter(
        role__in=server_models.ServerRole.ROLES_REQUIRE_UNIQUE_INSTANCE
    ).AndReturn(None)

    server_models.Server.objects.filter(
        roles__role=role_names.SCHEDULER,
        status=primary_status).AndReturn([self.PRIMARY_SCHEDULER])

    # The action itself: any command executed on any host is accepted.
    infra.execute_command(mox.IgnoreArg(), mox.IgnoreArg())

    # Replay phase.
    self.mox.ReplayAll()
    server_manager._change_status(
        server=backup,
        status=primary_status,
        action=True)
def main():
    """Run the host scheduler until shutdown or until its lifetime expires.

    Exits with status 0 right away when _monitor_db_host_acquisition is
    set. Otherwise the arguments are parsed, production settings and (when
    the server database is in use) the `host_scheduler` role of localhost
    are checked, and HostScheduler.tick() is called in a loop that is paced
    by the `host_scheduler_minimum_tick_sec` config value. Queued emails are
    always sent and the database connection closed on the way out.
    """
    if _monitor_db_host_acquisition:
        logging.info('Please set inline_host_acquisition=False in the shadow '
                     'config before starting the host scheduler.')
        sys.exit(0)

    try:
        options = parse_arguments(sys.argv[1:])
        scheduler_lib.check_production_settings(options)

        # If server database is enabled, check if the server has role
        # `host_scheduler`. If the server does not have host_scheduler role,
        # exception will be raised and host scheduler will not continue to
        # run.
        if server_manager_utils.use_server_db():
            server_manager_utils.confirm_server_has_role(
                hostname='localhost', role='host_scheduler')

        initialize(options.testing)

        with ts_mon_config.SetupTsMonGlobalState(
                'autotest_host_scheduler',
                indirect=True,
                debug_file=options.metrics_file,
        ):
            metrics.Counter('%s/start' % _METRICS_PREFIX).increment()
            process_start_time = time.time()
            host_scheduler = HostScheduler()
            minimum_tick_sec = global_config.global_config.get_config_value(
                'SCHEDULER', 'host_scheduler_minimum_tick_sec', type=float)

            while not (_shutdown or
                       _lifetime_expired(options.lifetime_hours,
                                         process_start_time)):
                tick_started = time.time()
                host_scheduler.tick()
                tick_duration_sec = time.time() - tick_started
                # Sleep for the rest of the minimum tick period, or just
                # briefly when the tick already took that long.
                time.sleep(minimum_tick_sec - tick_duration_sec
                           if minimum_tick_sec > tick_duration_sec
                           else 0.0001)

            logging.info('Shutdown request recieved. Bye! Bye!')
    except server_manager_utils.ServerActionError:
        # This error is expected when the server is not in primary status
        # for host-scheduler role. Thus do not send email for it.
        raise
    except Exception:
        metrics.Counter('%s/uncaught_exception' % _METRICS_PREFIX).increment()
        raise
    finally:
        email_manager.manager.send_queued_emails()
        if _db_manager:
            _db_manager.disconnect()
def testDeleteRoleFromPrimarySuccess_NoAction(self):
    """Test that deleting a role with action=False runs no action.

    Records the role validation, the database lookups for the drone role
    and the missing-role warning; no infra.execute_command call is expected
    because the role is deleted from the primary drone with action=False.
    """
    drone_role_name = server_models.ServerRole.ROLE.DRONE
    drone = self.PRIMARY_DRONE

    # Record phase: the order of expectations is kept as originally written.
    server_manager_utils.use_server_db().MultipleTimes().AndReturn(True)
    server_models.validate(role=drone_role_name)

    self.mox.StubOutWithMock(drone, 'get_role_names')
    drone.get_role_names().MultipleTimes().AndReturn([drone_role_name])

    self.mox.StubOutWithMock(drone.roles, 'get')
    drone.roles.get(role=drone_role_name).AndReturn(self.DRONE_ROLE)

    server_manager.server_manager_utils.warn_missing_role(
        drone_role_name, drone)

    # Replay phase.
    self.mox.ReplayAll()
    server_manager._delete_role(drone, drone_role_name, action=False)
def main():
    """Run the host scheduler tick loop until shutdown is requested.

    Exits with status 0 right away when _monitor_db_host_acquisition is
    set. Otherwise the arguments are parsed, production settings and (when
    the server database is in use) the `host_scheduler` role of localhost
    are checked, the metadata reporter is started and HostScheduler.tick()
    is called in a loop paced by the `minimum_tick_sec` config value. On the
    way out queued emails are sent, the database connection is closed and
    the metadata reporter is aborted.
    """
    if _monitor_db_host_acquisition:
        logging.info('Please set inline_host_acquisition=False in the shadow '
                     'config before starting the host scheduler.')
        # The upstart job for the host scheduler understands exit(0) to mean
        # 'don't respawn'. This is desirable when the job scheduler is
        # acquiring hosts inline.
        sys.exit(0)

    try:
        options = parse_arguments(sys.argv[1:])
        scheduler_lib.check_production_settings(options)

        # If server database is enabled, check if the server has role
        # `host_scheduler`. If the server does not have host_scheduler role,
        # exception will be raised and host scheduler will not continue to
        # run.
        if server_manager_utils.use_server_db():
            server_manager_utils.confirm_server_has_role(
                hostname='localhost', role='host_scheduler')

        initialize(options.testing)

        # Start the thread to report metadata.
        metadata_reporter.start()

        ts_mon_config.SetupTsMonGlobalState('autotest_host_scheduler')

        host_scheduler = HostScheduler()
        minimum_tick_sec = global_config.global_config.get_config_value(
            'SCHEDULER', 'minimum_tick_sec', type=float)

        while not _shutdown:
            tick_started = time.time()
            host_scheduler.tick()
            tick_duration_sec = time.time() - tick_started
            # Sleep for the rest of the minimum tick period, or just briefly
            # when the tick already took that long.
            time.sleep(minimum_tick_sec - tick_duration_sec
                       if minimum_tick_sec > tick_duration_sec
                       else 0.0001)
    except server_manager_utils.ServerActionError:
        # This error is expected when the server is not in primary status
        # for host-scheduler role. Thus do not send email for it.
        raise
    except Exception:
        email_manager.manager.log_stacktrace(
            'Uncaught exception; terminating host_scheduler.')
        raise
    finally:
        email_manager.manager.send_queued_emails()
        if _db_manager:
            _db_manager.disconnect()
        metadata_reporter.abort()
def _email_alert():
    """Send an email alert about metadata upload failing for too long.

    The alert is skipped when the server database is not in use, or when
    server_manager_utils.confirm_server_has_role raises ServerActionError
    for localhost and role `scheduler`. Otherwise a notification whose
    subject reports _MAX_UPLOAD_FAIL_DURATION seconds of failure is queued
    with an empty body, and the queued emails are sent immediately.
    """
    if not server_manager_utils.use_server_db():
        logging.debug('Server database not enabled, email alert is skipped.')
        return

    try:
        server_manager_utils.confirm_server_has_role(hostname='localhost',
                                                     role='scheduler')
    except server_manager_utils.ServerActionError:
        # Only email alert if the server is a scheduler, not shard.
        return

    subject = ('Metadata upload has been failing for %d seconds' %
               _MAX_UPLOAD_FAIL_DURATION)
    email_manager.manager.enqueue_notify_email(subject, '')
    email_manager.manager.send_queued_emails()
def get_servers(hostname=None, role=None, status=None):
    """Get the details of servers matching the given hostname, role, status.

    @param hostname: FQDN of the server.
    @param role: Name of the server role, e.g., drone, scheduler. Default to
                 None to match any role.
    @param status: Status of the server, e.g., primary, backup,
                   repair_required. Default to None to match any server
                   status.

    @raises error.RPCException: If server database is not used.
    @return: A list holding the get_details() result of each matching
             server.
    """
    if not server_manager_utils.use_server_db():
        raise error.RPCException('Server database is not enabled. Please try '
                                 'retrieve servers from global config.')

    matching_servers = server_manager_utils.get_servers(
        hostname=hostname, role=role, status=status)
    details = []
    for matched in matching_servers:
        details.append(matched.get_details())
    return details
def delete(hostname, server=None):
    """Delete the given server from the server database.

    When the server database is in use and the server is in primary status,
    each of its roles is removed through _delete_role before the server
    record itself is deleted.

    @param hostname: hostname of the server to be deleted.
    @param server: Server object from database query, this argument should
                   be injected by the verify_server_exists decorator.

    @raise ServerActionError: If delete server action failed, e.g., server
                              is not found in database. (Not raised by this
                              body directly; presumably by the decorator or
                              by _delete_role.)
    """
    print('Deleting server %s from server database.' % hostname)

    if server_manager_utils.use_server_db():
        if server.status == server_models.Server.STATUS.PRIMARY:
            print('Server %s is in status primary, need to disable its '
                  'current roles first.' % hostname)
            for server_role in server.roles.all():
                _delete_role(server, server_role.role)

    server.delete()
    print('Server %s is deleted from server database.' % hostname)
def main(): """Entry point for suite_scheduler.py""" signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGHUP, signal_handler) signal.signal(signal.SIGTERM, signal_handler) parser, options, args = parse_options() if args or options.events and not options.build: parser.print_help() return 1 if options.config_file and not os.path.exists(options.config_file): logging.error('Specified config file %s does not exist.', options.config_file) return 1 config = forgiving_config_parser.ForgivingConfigParser() config.read(options.config_file) if options.list: print 'Supported events:' for event_class in driver.Driver.EVENT_CLASSES: print ' ', event_class.KEYWORD return 0 # If we're just sanity checking, we can stop after we've parsed the # config file. if options.sanity: # config_file_getter generates a high amount of noise at DEBUG level logging.getLogger().setLevel(logging.WARNING) d = driver.Driver(None, None, True) d.SetUpEventsAndTasks(config, None) tasks_per_event = d.TasksFromConfig(config) # flatten [[a]] -> [a] tasks = [x for y in tasks_per_event.values() for x in y] control_files_exist = sanity.CheckControlFileExistence(tasks) return control_files_exist logging_manager.configure_logging(SchedulerLoggingConfig(), log_dir=options.log_dir) if not options.log_dir: logging.info('Not logging to a file, as --log_dir was not passed.') # If server database is enabled, check if the server has role # `suite_scheduler`. If the server does not have suite_scheduler role, # exception will be raised and suite scheduler will not continue to run. 
if not server_manager_utils: raise ImportError( 'Could not import autotest_lib.site_utils.server_manager_utils') if server_manager_utils.use_server_db(): server_manager_utils.confirm_server_has_role(hostname='localhost', role='suite_scheduler') afe_server = global_config.global_config.get_config_value( CONFIG_SECTION_SERVER, "suite_scheduler_afe", default=None) afe = frontend_wrappers.RetryingAFE(server=afe_server, timeout_min=10, delay_sec=5, debug=False) logging.info('Connecting to: %s', afe.server) enumerator = board_enumerator.BoardEnumerator(afe) scheduler = deduping_scheduler.DedupingScheduler(afe, options.file_bug) mv = manifest_versions.ManifestVersions(options.tmp_repo_dir) d = driver.Driver(scheduler, enumerator) d.SetUpEventsAndTasks(config, mv) # Set up metrics upload for Monarch. ts_mon_config.SetupTsMonGlobalState('autotest_suite_scheduler') try: if options.events: # Act as though listed events have just happened. keywords = re.split('\s*,\s*', options.events) if not options.tmp_repo_dir: logging.warn('To run a list of events, you may need to use ' '--repo_dir to specify a folder that already has ' 'manifest repo set up. This is needed for suites ' 'requiring firmware update.') logging.info('Forcing events: %r', keywords) d.ForceEventsOnceForBuild(keywords, options.build, options.os_type) else: if not options.tmp_repo_dir: mv.Initialize() d.RunForever(config, mv) except Exception as e: logging.error('Fatal exception in suite_scheduler: %r\n%s', e, traceback.format_exc()) return 1