def setUp(self):
    super().setUp()
    Configuration['results_directory'] = abspath(join('some', 'temp', 'directory'))
    self.patch('app.master.build.BuildArtifact.__new__')  # patch __new__ to mock instances but keep static methods
    self.mock_util = self.patch('app.master.build.app.util')  # stub out util - it often interacts with the fs
    self.mock_open = self.patch('app.master.build.open', autospec=False, create=True)
    self.mock_listdir = self.patch('os.listdir')
    self.scheduler_pool = BuildSchedulerPool()
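
# A minimal, self-contained sketch of the open-patching pattern used in the setUp above.
# The tests patch 'app.master.build.open' with create=True because that module does not
# define its own `open` attribute, and autospec=False so the mock can later be rewired
# with unittest.mock.mock_open. This sketch patches builtins.open directly instead; the
# read_exit_code helper and the '777' data are illustrative, not part of ClusterRunner.
from unittest.mock import mock_open, patch


def read_exit_code(path):
    # Code under test: reads an integer exit code from a results file.
    with open(path, 'r') as f:
        return int(f.read())


with patch('builtins.open', mock_open(read_data='777')):
    assert read_exit_code('some/artifact/exit_code_file') == 777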
def __init__(self):
    self._logger = get_logger(__name__)
    self._master_results_path = Configuration['results_directory']
    self._all_slaves_by_url = {}
    self._scheduler_pool = BuildSchedulerPool()
    self._build_request_handler = BuildRequestHandler(self._scheduler_pool)
    self._build_request_handler.start()
    self._slave_allocator = SlaveAllocator(self._scheduler_pool)
    self._slave_allocator.start()

    # Initialize the database connection before we initialize a BuildStore.
    Connection.create(Configuration['database_url'])
    UnhandledExceptionHandler.singleton().add_teardown_callback(BuildStore.clean_up)

    # The best practice for determining the number of threads to use is the number of threads per core
    # multiplied by the number of physical cores. For example, with 2 sockets, 10 cores per socket, and
    # 2 threads per core, the max would be 40.
    #
    # Currently we use threads for incrementing/decrementing slave executor counts (lock acquisition) and
    # tearing down slaves (network IO). 32 threads should be plenty for these tasks. Under heavy load the
    # bottleneck will be the number of executors, not the time it takes to lock/unlock the executor counts
    # or the number of teardown requests. Tweak the number to find the sweet spot if you feel this is the case.
    self._thread_pool_executor = ThreadPoolExecutor(max_workers=32)

    # Asynchronously delete (but immediately rename) all old builds when master starts.
    # Remove this if/when build numbers are unique across master starts/stops.
    # TODO: We can remove this code since we persist builds across master restarts.
    # if os.path.exists(self._master_results_path):
    #     fs.async_delete(self._master_results_path)
    # fs.create_dir(self._master_results_path)

    SlavesCollector.register_slaves_metrics_collector(
        lambda: self.all_slaves_by_id().values())
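
# A minimal sketch of the fire-and-forget ThreadPoolExecutor pattern the master uses for
# lock-bound bookkeeping (executor count updates) and slave-teardown network IO. The
# adjust_executor_count helper and the counts here are illustrative stand-ins, not the
# ClusterRunner API; the point is that a small shared pool (max_workers=32) is enough
# because each task is brief relative to subjob execution.
from concurrent.futures import ThreadPoolExecutor
from threading import Lock

_counts_lock = Lock()
_executor_count = 0


def adjust_executor_count(delta):
    # Lock acquisition is cheap, so many of these can share a small pool.
    global _executor_count
    with _counts_lock:
        _executor_count += delta


pool = ThreadPoolExecutor(max_workers=32)
futures = [pool.submit(adjust_executor_count, +1) for _ in range(100)]
for future in futures:
    future.result()  # surfaces any exception raised inside the worker
assert _executor_count == 100
pool.shutdown()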
class ClusterMaster(ClusterService):
    """
    The ClusterRunner Master service: This is the main application class that the web framework/REST API sits on
    top of.
    """

    API_VERSION = 'v1'

    def __init__(self):
        self._logger = get_logger(__name__)
        self._master_results_path = Configuration['results_directory']
        self._slave_registry = SlaveRegistry.singleton()
        self._scheduler_pool = BuildSchedulerPool()
        self._build_request_handler = BuildRequestHandler(self._scheduler_pool)
        self._build_request_handler.start()
        self._slave_allocator = SlaveAllocator(self._scheduler_pool)
        self._slave_allocator.start()

        # The best practice for determining the number of threads to use is the number of threads per core
        # multiplied by the number of physical cores. For example, with 2 sockets, 10 cores per socket, and
        # 2 threads per core, the max would be 40.
        #
        # Currently we use threads for incrementing/decrementing slave executor counts (lock acquisition) and
        # tearing down slaves (network IO). 32 threads should be plenty for these tasks. Under heavy load the
        # bottleneck will be the number of executors, not the time it takes to lock/unlock the executor counts
        # or the number of teardown requests. Tweak the number to find the sweet spot if you feel this is the case.
        self._thread_pool_executor = ThreadPoolExecutor(max_workers=32)

        # Asynchronously delete (but immediately rename) all old builds when master starts.
        # Remove this if/when build numbers are unique across master starts/stops.
        if os.path.exists(self._master_results_path):
            fs.async_delete(self._master_results_path)
        fs.create_dir(self._master_results_path)

        # Configure heartbeat tracking
        self._unresponsive_slaves_cleanup_interval = Configuration['unresponsive_slaves_cleanup_interval']
        self._hb_scheduler = sched.scheduler()

        SlavesCollector.register_slaves_metrics_collector(
            lambda: self._slave_registry.get_all_slaves_by_id().values())

    def start_heartbeat_tracker_thread(self):
        self._logger.info('Heartbeat tracker will run every {} seconds'.format(
            self._unresponsive_slaves_cleanup_interval))
        Thread(target=self._start_heartbeat_tracker, name='HeartbeatTrackerThread', daemon=True).start()

    def _start_heartbeat_tracker(self):
        self._hb_scheduler.enter(0, 0, self._disconnect_non_heartbeating_slaves)
        self._hb_scheduler.run()

    def _disconnect_non_heartbeating_slaves(self):
        slaves_to_disconnect = [
            slave for slave in self._slave_registry.get_all_slaves_by_url().values()
            if slave.is_alive() and not self._is_slave_responsive(slave)
        ]

        for slave in slaves_to_disconnect:
            self._disconnect_slave(slave)
            self._logger.error('Slave {} marked offline as it is not sending heartbeats.'.format(slave.id))

        self._hb_scheduler.enter(self._unresponsive_slaves_cleanup_interval, 0,
                                 self._disconnect_non_heartbeating_slaves)

    def _is_slave_responsive(self, slave: ClusterSlave) -> bool:
        time_since_last_heartbeat = (datetime.now() - slave.get_last_heartbeat_time()).seconds
        return time_since_last_heartbeat < self._unresponsive_slaves_cleanup_interval

    def _get_status(self):
        """
        Just returns a dumb message and prints it to the console.
        :rtype: str
        """
        return 'Master service is up.'

    def api_representation(self):
        """
        Gets a dict representing this resource which can be returned in an API response.
        :rtype: dict [str, mixed]
        """
        slaves_representation = [
            slave.api_representation() for slave in self._slave_registry.get_all_slaves_by_id().values()
        ]
        return {
            'status': self._get_status(),
            'slaves': slaves_representation,
        }

    def get_builds(self, offset: int = None, limit: int = None) -> List['Build']:
        """
        Returns a list of all builds.
        :param offset: The starting index of the requested build
        :param limit: The number of builds requested
        """
        num_builds = BuildStore.size()
        start, end = get_paginated_indices(offset, limit, num_builds)
        return BuildStore.get_range(start, end)

    def active_builds(self):
        """
        Returns a list of incomplete builds.
        :rtype: list[Build]
        """
        return [build for build in self.get_builds() if not build.is_finished]

    def connect_slave(self, slave_url, num_executors, slave_session_id=None):
        """
        Connect a slave to this master.
        :type slave_url: str
        :type num_executors: int
        :type slave_session_id: str | None
        :return: The response with the slave id of the slave.
        :rtype: dict[str, str]
        """
        # todo: Validate arg types for this and other methods called via API.
        # If a slave had previously been connected, and is now being reconnected, the cleanest way to resolve this
        # bookkeeping is for the master to forget about the previous slave instance and start with a fresh instance.
        try:
            old_slave = self._slave_registry.get_slave(slave_url=slave_url)
        except ItemNotFoundError:
            pass
        else:
            self._logger.warning('Slave requested to connect to master, even though previously connected as {}. ' +
                                 'Removing existing slave instance from the master\'s bookkeeping.', old_slave)

            # If a slave has requested to reconnect, we have to assume that whatever build the dead slave was
            # working on no longer has valid results.
            if old_slave.current_build_id is not None:
                self._logger.info('{} has build [{}] running on it. Attempting to cancel build.',
                                  old_slave, old_slave.current_build_id)
                try:
                    build = self.get_build(old_slave.current_build_id)
                    build.cancel()
                    self._logger.info('Cancelled build {} due to dead slave {}',
                                      old_slave.current_build_id, old_slave)
                except ItemNotFoundError:
                    self._logger.info('Failed to find build {} that was running on {}',
                                      old_slave.current_build_id, old_slave)

        slave = Slave(slave_url, num_executors, slave_session_id)
        self._slave_registry.add_slave(slave)
        self._slave_allocator.add_idle_slave(slave)
        self._logger.info('Slave on {} connected to master with {} executors. (id: {})',
                          slave_url, num_executors, slave.id)
        return {'slave_id': str(slave.id)}

    def handle_slave_state_update(self, slave, new_slave_state):
        """
        Execute logic to transition the specified slave to the given state.
        :type slave: Slave
        :type new_slave_state: SlaveState
        """
        slave_transition_functions = {
            SlaveState.DISCONNECTED: self._disconnect_slave,
            SlaveState.SHUTDOWN: self._graceful_shutdown_slave,
            SlaveState.IDLE: self._slave_allocator.add_idle_slave,
            SlaveState.SETUP_COMPLETED: self._handle_setup_success_on_slave,
            SlaveState.SETUP_FAILED: self._handle_setup_failure_on_slave,
        }

        if new_slave_state not in slave_transition_functions:
            raise BadRequestError('Invalid slave state "{}". Valid states are: {}.'.format(
                new_slave_state, ', '.join(slave_transition_functions.keys())))

        do_transition = slave_transition_functions.get(new_slave_state)
        do_transition(slave)

    def update_slave_last_heartbeat_time(self, slave):
        slave.update_last_heartbeat_time()

    def set_shutdown_mode_on_slaves(self, slave_ids):
        """
        :type slave_ids: list[int]
        """
        # Find all the slaves first so if an invalid slave_id is specified, we 404 before shutting any of them down.
        slaves = [self._slave_registry.get_slave(slave_id=slave_id) for slave_id in slave_ids]
        for slave in slaves:
            self.handle_slave_state_update(slave, SlaveState.SHUTDOWN)

    def _graceful_shutdown_slave(self, slave):
        """
        Puts slave in shutdown mode so it cannot receive new builds. The slave will be killed when finished with any
        running builds.
        :type slave: Slave
        """
        slave.set_shutdown_mode()
        self._logger.info('Slave on {} was put in shutdown mode. (id: {})', slave.url, slave.id)

    def _disconnect_slave(self, slave):
        """
        Mark a slave dead.
        :type slave: Slave
        """
        # Mark slave dead. We do not remove it from the list of all slaves. We also do not remove it from
        # idle_slaves; that will happen during slave allocation.
        slave.mark_dead()
        # todo: Fail/resend any currently executing subjobs still executing on this slave.
        self._logger.info('Slave on {} was disconnected. (id: {})', slave.url, slave.id)

    def _handle_setup_success_on_slave(self, slave: Slave):
        """
        Respond to successful build setup on a slave. This starts subjob executions on the slave. This should be
        called once after the specified slave has already run build_setup commands for the specified build.
        """
        build = self.get_build(slave.current_build_id)
        scheduler = self._scheduler_pool.get(build)
        self._thread_pool_executor.submit(scheduler.begin_subjob_executions_on_slave, slave=slave)

    def _handle_setup_failure_on_slave(self, slave):
        """
        Respond to failed build setup on a slave. This should put the slave back into a usable state.
        :type slave: Slave
        """
        build = self.get_build(slave.current_build_id)
        build.setup_failures += 1
        if build.setup_failures >= MAX_SETUP_FAILURES:
            build.cancel()
            build.mark_failed('Setup failed on this build more than {} times. Failing the build.'.format(
                MAX_SETUP_FAILURES))
        slave.teardown()

    def handle_request_for_new_build(self, build_params):
        """
        Creates a new Build object and adds it to the request queue to be processed.
        :param build_params:
        :type build_params: dict[str, str]
        :rtype tuple [bool, dict [str, str]]
        """
        build_request = BuildRequest(build_params)
        success = False
        if build_request.is_valid():
            build = Build(build_request)
            BuildStore.add(build)
            build.generate_project_type()  # WIP(joey): This should be internal to the Build object.
            self._build_request_handler.handle_build_request(build)
            response = {'build_id': build.build_id()}
            success = True
        elif not build_request.is_valid_type():
            response = {'error': 'Invalid build request type.'}
        else:
            required_params = build_request.required_parameters()
            response = {'error': 'Missing required parameter. Required parameters: {}'.format(required_params)}

        return success, response  # todo: refactor to use exception instead of boolean

    def handle_request_to_update_build(self, build_id, update_params):
        """
        Updates the state of a build with the values passed in. Used for cancelling running builds.

        :type build_id: int
        :param update_params: The fields that should be updated and their new values
        :type update_params: dict [str, str]
        :return: The success/failure and the response we want to send to the requestor
        :rtype: tuple [bool, dict [str, str]]
        """
        build = BuildStore.get(int(build_id))
        if build is None:
            raise ItemNotFoundError('Invalid build id.')

        success, response = build.validate_update_params(update_params)
        if not success:
            return success, response
        return build.update_state(update_params), {}

    def handle_result_reported_from_slave(self, slave_url, build_id, subjob_id, payload=None):
        """
        Process the result and dispatch the next subjob.
        :type slave_url: str
        :type build_id: int
        :type subjob_id: int
        :type payload: dict
        :rtype: str
        """
        self._logger.info('Results received from {} for subjob. (Build {}, Subjob {})',
                          slave_url, build_id, subjob_id)
        build = BuildStore.get(int(build_id))
        slave = self._slave_registry.get_slave(slave_url=slave_url)

        try:
            build.complete_subjob(subjob_id, payload)
        finally:
            scheduler = self._scheduler_pool.get(build)
            self._thread_pool_executor.submit(scheduler.execute_next_subjob_or_free_executor, slave=slave)

    def get_build(self, build_id):
        """
        Returns a build by id.
        :param build_id: The id for the build whose status we are getting
        :type build_id: int
        :rtype: Build
        """
        build = BuildStore.get(build_id)
        if build is None:
            raise ItemNotFoundError('Invalid build id: {}.'.format(build_id))

        return build

    def get_path_for_build_results_archive(self, build_id: int, is_tar_request: bool = False) -> str:
        """
        Given a build id, get the absolute file path for the archive file containing the build results.

        :param build_id: The build id for which to retrieve the artifacts archive file
        :param is_tar_request: If true, download the tar.gz archive instead of a zip.
        :return: The path to the archived results file
        """
        build = BuildStore.get(build_id)
        if build is None:
            raise ItemNotFoundError('Invalid build id.')

        archive_file = build.artifacts_tar_file if is_tar_request else build.artifacts_zip_file
        if archive_file is None:
            raise ItemNotReadyError('Build artifact file is not yet ready. Try again later.')

        return archive_file
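
# A minimal, self-contained sketch of the self-rescheduling pattern behind
# _start_heartbeat_tracker above: a sched.scheduler runs the check immediately,
# and the check re-enters itself at a fixed interval; run() blocks while events
# remain, so it lives on a daemon thread. CLEANUP_INTERVAL and check_heartbeats
# are illustrative stand-ins for the configured interval and the real check.
import sched
from threading import Thread

CLEANUP_INTERVAL = 1  # seconds; the real value comes from Configuration

heartbeat_scheduler = sched.scheduler()


def check_heartbeats():
    print('checking for unresponsive slaves...')
    # Re-enter this check so it keeps firing every CLEANUP_INTERVAL seconds.
    heartbeat_scheduler.enter(CLEANUP_INTERVAL, 0, check_heartbeats)


heartbeat_scheduler.enter(0, 0, check_heartbeats)
Thread(target=heartbeat_scheduler.run, name='HeartbeatTrackerThread', daemon=True).start()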
class ClusterMaster(ClusterService):
    """
    The ClusterRunner Master service: This is the main application class that the web framework/REST API sits on
    top of.
    """

    API_VERSION = 'v1'

    def __init__(self):
        self._logger = get_logger(__name__)
        self._master_results_path = Configuration['results_directory']
        self._all_slaves_by_url = {}
        self._all_builds_by_id = OrderedDict()
        self._scheduler_pool = BuildSchedulerPool()
        self._build_request_handler = BuildRequestHandler(self._scheduler_pool)
        self._build_request_handler.start()
        self._slave_allocator = SlaveAllocator(self._build_request_handler)
        self._slave_allocator.start()

        # Asynchronously delete (but immediately rename) all old builds when master starts.
        # Remove this if/when build numbers are unique across master starts/stops.
        if os.path.exists(self._master_results_path):
            fs.async_delete(self._master_results_path)
        fs.create_dir(self._master_results_path)

    def _get_status(self):
        """
        Just returns a dumb message and prints it to the console.
        :rtype: str
        """
        return 'Master service is up.'

    def api_representation(self):
        """
        Gets a dict representing this resource which can be returned in an API response.
        :rtype: dict [str, mixed]
        """
        slaves_representation = [slave.api_representation() for slave in self.all_slaves_by_id().values()]
        return {
            'status': self._get_status(),
            'slaves': slaves_representation,
        }

    def builds(self):
        """
        Returns a list of all builds.
        :rtype: list[Build]
        """
        return self._all_builds_by_id.values()

    def active_builds(self):
        """
        Returns a list of incomplete builds.
        :rtype: list[Build]
        """
        return [build for build in self.builds() if not build.is_finished]

    def all_slaves_by_id(self):
        """
        Retrieve all connected slaves.
        :rtype: dict [int, Slave]
        """
        slaves_by_slave_id = {}
        for slave in self._all_slaves_by_url.values():
            slaves_by_slave_id[slave.id] = slave
        return slaves_by_slave_id

    def get_slave(self, slave_id=None, slave_url=None):
        """
        Get the instance of given slave by either the slave's id or url. Only one of slave_id or slave_url should
        be specified.

        :param slave_id: The id of the slave to return
        :type slave_id: int
        :param slave_url: The url of the slave to return
        :type slave_url: str
        :return: The instance of the slave
        :rtype: Slave
        """
        if (slave_id is None) == (slave_url is None):
            raise ValueError('Only one of slave_id or slave_url should be specified to get_slave().')

        if slave_id is not None:
            for slave in self._all_slaves_by_url.values():
                if slave.id == slave_id:
                    return slave
        else:
            if slave_url in self._all_slaves_by_url:
                return self._all_slaves_by_url[slave_url]

        raise ItemNotFoundError('Requested slave ({}) does not exist.'.format(slave_id))

    def connect_slave(self, slave_url, num_executors):
        """
        Connect a slave to this master.

        :type slave_url: str
        :type num_executors: int
        :return: The response with the slave id of the slave.
        :rtype: dict[str, str]
        """
        # If a slave had previously been connected, and is now being reconnected, the cleanest way to resolve this
        # bookkeeping is for the master to forget about the previous slave instance and start with a fresh instance.
        if slave_url in self._all_slaves_by_url:
            self._logger.warning('Slave on {} requested to connect to master, even though previously connected. ' +
                                 'Removing existing slave instance from the master\'s bookkeeping.', slave_url)
            old_slave = self._all_slaves_by_url.get(slave_url)

            # If a slave has requested to reconnect, we have to assume that whatever build the dead slave was
            # working on no longer has valid results.
            if old_slave.current_build_id is not None:
                self._logger.info('{} has build [{}] running on it. Attempting to cancel build.',
                                  slave_url, old_slave.current_build_id)
                try:
                    build = self.get_build(old_slave.current_build_id)
                    build.cancel()
                    self._logger.info('Cancelled build {} due to dead slave {}',
                                      old_slave.current_build_id, slave_url)
                except ItemNotFoundError:
                    self._logger.info('Failed to find build {} that was running on {}',
                                      old_slave.current_build_id, slave_url)

        slave = Slave(slave_url, num_executors)
        self._all_slaves_by_url[slave_url] = slave
        self._slave_allocator.add_idle_slave(slave)
        self._logger.info('Slave on {} connected to master with {} executors. (id: {})',
                          slave_url, num_executors, slave.id)
        return {'slave_id': str(slave.id)}

    def handle_slave_state_update(self, slave, new_slave_state):
        """
        Execute logic to transition the specified slave to the given state.

        :type slave: Slave
        :type new_slave_state: SlaveState
        """
        slave_transition_functions = {
            SlaveState.DISCONNECTED: self._disconnect_slave,
            SlaveState.SHUTDOWN: self._graceful_shutdown_slave,
            SlaveState.IDLE: self._slave_allocator.add_idle_slave,
            SlaveState.SETUP_COMPLETED: self._handle_setup_success_on_slave,
            SlaveState.SETUP_FAILED: self._handle_setup_failure_on_slave,
        }

        if new_slave_state not in slave_transition_functions:
            raise BadRequestError('Invalid slave state "{}". Valid states are: {}.'.format(
                new_slave_state, ', '.join(slave_transition_functions.keys())))

        do_transition = slave_transition_functions.get(new_slave_state)
        do_transition(slave)

    def set_shutdown_mode_on_slaves(self, slave_ids):
        """
        :type slave_ids: list[int]
        """
        # Find all the slaves first so if an invalid slave_id is specified, we 404 before shutting any of them down.
        slaves = [self.get_slave(slave_id) for slave_id in slave_ids]
        for slave in slaves:
            self.handle_slave_state_update(slave, SlaveState.SHUTDOWN)

    def _graceful_shutdown_slave(self, slave):
        """
        Puts slave in shutdown mode so it cannot receive new builds. The slave will be killed when finished with any
        running builds.
        :type slave: Slave
        """
        slave.set_shutdown_mode()
        self._logger.info('Slave on {} was put in shutdown mode. (id: {})', slave.url, slave.id)

    def _disconnect_slave(self, slave):
        """
        Mark a slave dead.

        :type slave: Slave
        """
        # Mark slave dead. We do not remove it from the list of all slaves. We also do not remove it from
        # idle_slaves; that will happen during slave allocation.
        slave.mark_dead()
        # todo: Fail/resend any currently executing subjobs still executing on this slave.
        self._logger.info('Slave on {} was disconnected. (id: {})', slave.url, slave.id)

    def _handle_setup_success_on_slave(self, slave):
        """
        Respond to successful build setup on a slave. This starts subjob executions on the slave. This should be
        called once after the specified slave has already run build_setup commands for the specified build.

        :type slave: Slave
        """
        build = self.get_build(slave.current_build_id)
        scheduler = self._scheduler_pool.get(build)
        scheduler.begin_subjob_executions_on_slave(slave)

    def _handle_setup_failure_on_slave(self, slave):
        """
        Respond to failed build setup on a slave. This should put the slave back into a usable state.

        :type slave: Slave
        """
        raise BadRequestError('Setup failure handling on the master is not yet implemented.')

    def handle_request_for_new_build(self, build_params):
        """
        Creates a new Build object and adds it to the request queue to be processed.

        :param build_params:
        :type build_params: dict[str, str]
        :rtype tuple [bool, dict [str, str]]
        """
        build_request = BuildRequest(build_params)
        success = False
        if build_request.is_valid():
            build = Build(build_request)
            self._all_builds_by_id[build.build_id()] = build
            build.generate_project_type()  # WIP(joey): This should be internal to the Build object.
            self._build_request_handler.handle_build_request(build)
            response = {'build_id': build.build_id()}
            success = True
        elif not build_request.is_valid_type():
            response = {'error': 'Invalid build request type.'}
        else:
            required_params = build_request.required_parameters()
            response = {'error': 'Missing required parameter. Required parameters: {}'.format(required_params)}

        return success, response  # todo: refactor to use exception instead of boolean

    def handle_request_to_update_build(self, build_id, update_params):
        """
        Updates the state of a build with the values passed in. Used for cancelling running builds.

        :type build_id: int
        :param update_params: The fields that should be updated and their new values
        :type update_params: dict [str, str]
        :return: The success/failure and the response we want to send to the requestor
        :rtype: tuple [bool, dict [str, str]]
        """
        build = self._all_builds_by_id.get(int(build_id))
        if build is None:
            raise ItemNotFoundError('Invalid build id.')

        success, response = build.validate_update_params(update_params)
        if not success:
            return success, response
        return build.update_state(update_params), {}

    def handle_result_reported_from_slave(self, slave_url, build_id, subjob_id, payload=None):
        """
        Process the result and dispatch the next subjob.

        :type slave_url: str
        :type build_id: int
        :type subjob_id: int
        :type payload: dict
        :rtype: str
        """
        self._logger.info('Results received from {} for subjob. (Build {}, Subjob {})',
                          slave_url, build_id, subjob_id)
        build = self._all_builds_by_id[int(build_id)]
        slave = self._all_slaves_by_url[slave_url]

        # If the build has been canceled, don't work on the next subjob.
        if not build.is_finished:  # WIP(joey): This check should be internal to the Build object.
            try:
                build.complete_subjob(subjob_id, payload)
            finally:
                scheduler = self._scheduler_pool.get(build)
                scheduler.execute_next_subjob_or_free_executor(slave)

    def get_build(self, build_id):
        """
        Returns a build by id.

        :param build_id: The id for the build whose status we are getting
        :type build_id: int
        :rtype: Build
        """
        build = self._all_builds_by_id.get(build_id)
        if build is None:
            raise ItemNotFoundError('Invalid build id: {}.'.format(build_id))

        return build

    def get_path_for_build_results_archive(self, build_id):
        """
        Given a build id, get the absolute file path for the archive file containing the build results.

        :param build_id: The build id for which to retrieve the artifacts archive file
        :type build_id: int
        :return: The path to the archived results file
        :rtype: str
        """
        build = self._all_builds_by_id.get(build_id)
        if build is None:
            raise ItemNotFoundError('Invalid build id.')

        archive_file = build.artifacts_archive_file
        if archive_file is None:
            raise ItemNotReadyError('Build artifact file is not yet ready. Try again later.')

        return archive_file
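
# A minimal sketch of the dict-dispatch pattern handle_slave_state_update uses for slave
# state transitions: map each state to a handler, reject unknown states before dispatching.
# The two-member SlaveState enum and the lambda handlers are illustrative stand-ins for the
# real SlaveState enum and the master's transition methods.
from enum import Enum


class SlaveState(str, Enum):
    DISCONNECTED = 'DISCONNECTED'
    IDLE = 'IDLE'


def handle_state_update(slave, new_state):
    transition_functions = {
        SlaveState.DISCONNECTED: lambda s: print('disconnecting', s),
        SlaveState.IDLE: lambda s: print('marking idle', s),
    }
    if new_state not in transition_functions:
        # Validate before dispatch so a bad state never partially transitions a slave.
        raise ValueError('Invalid slave state "{}". Valid states are: {}.'.format(
            new_state, ', '.join(transition_functions)))
    transition_functions[new_state](slave)


handle_state_update('slave-1', SlaveState.IDLE)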
class TestBuild(BaseUnitTestCase):
    _FAKE_SLAVE_URL = 'my.favorite.slave.com:40001'
    _FAKE_MAX_EXECUTORS = sys.maxsize
    _FAKE_MAX_EXECUTORS_PER_SLAVE = sys.maxsize
    _FAKE_PAYLOAD = {'filename': 'pizza_order.txt', 'body': 'Four large pepperoni, one small cheese.'}

    def setUp(self):
        super().setUp()
        Configuration['results_directory'] = abspath(join('some', 'temp', 'directory'))
        self.patch('app.master.build.BuildArtifact.__new__')  # patch __new__ to mock instances but keep static methods
        self.mock_util = self.patch('app.master.build.app.util')  # stub out util - it often interacts with the fs
        self.mock_open = self.patch('app.master.build.open', autospec=False, create=True)
        self.scheduler_pool = BuildSchedulerPool()

    def test_allocate_slave_calls_slave_setup(self):
        mock_slave = self._create_mock_slave()
        build = self._create_test_build(BuildStatus.PREPARED)
        scheduler = self.scheduler_pool.get(build)

        scheduler.allocate_slave(mock_slave)

        mock_slave.setup.assert_called_once_with(build, executor_start_index=0)

    def test_build_doesnt_use_more_than_max_executors(self):
        mock_slaves = [self._create_mock_slave(num_executors=5) for _ in range(3)]  # 15 total available executors
        expected_num_executors_used = 12  # We expect the build to use 12 out of 15 available executors.
        job_config = self._create_job_config(max_executors=expected_num_executors_used)
        build = self._create_test_build(BuildStatus.PREPARED, job_config=job_config)
        scheduler = self.scheduler_pool.get(build)
        scheduler.execute_next_subjob_or_free_executor = Mock()

        for mock_slave in mock_slaves:
            scheduler.allocate_slave(mock_slave)
            scheduler.begin_subjob_executions_on_slave(mock_slave)

        self.assertEqual(scheduler.execute_next_subjob_or_free_executor.call_count, expected_num_executors_used,
                         'Build should start executing as many subjobs as its max_executors setting.')

    def test_build_doesnt_use_more_than_max_executors_per_slave(self):
        mock_slaves = [self._create_mock_slave(num_executors=5) for _ in range(3)]
        max_executors_per_slave = 2
        job_config = self._create_job_config(max_executors_per_slave=max_executors_per_slave)
        build = self._create_test_build(build_status=BuildStatus.PREPARED, job_config=job_config)
        scheduler = self.scheduler_pool.get(build)
        scheduler.execute_next_subjob_or_free_executor = Mock()

        for mock_slave in mock_slaves:
            scheduler.allocate_slave(mock_slave)
            scheduler.begin_subjob_executions_on_slave(mock_slave)

        # Even though each slave has 5 executors, we should only start subjobs on 2 of those executors per slave.
        expected_subjob_execution_calls = [
            call(mock_slaves[0]),
            call(mock_slaves[0]),
            call(mock_slaves[1]),
            call(mock_slaves[1]),
            call(mock_slaves[2]),
            call(mock_slaves[2]),
        ]
        self.assertEqual(
            scheduler.execute_next_subjob_or_free_executor.mock_calls,
            expected_subjob_execution_calls,
            'Build should start executing as many subjobs per slave as its max_executors_per_slave setting.')

    def test_build_status_returns_queued_after_build_creation(self):
        build = self._create_test_build()
        self.assertEqual(build._status(), BuildStatus.QUEUED,
                         'Build status should be QUEUED immediately after build has been created.')

    def test_build_status_returns_queued_after_build_preparation(self):
        build = self._create_test_build(BuildStatus.PREPARED)
        self.assertEqual(build._status(), BuildStatus.QUEUED,
                         'Build status should be QUEUED after build has been prepared.')

    def test_build_status_returns_building_after_setup_has_started(self):
        mock_slave = self._create_mock_slave()
        build = self._create_test_build(BuildStatus.PREPARED)
        scheduler = self.scheduler_pool.get(build)

        scheduler.allocate_slave(mock_slave)

        self.assertEqual(build._status(), BuildStatus.BUILDING,
                         'Build status should be BUILDING after setup has started on slaves.')

    def test_build_status_returns_building_after_setup_is_complete_and_subjobs_are_executing(self):
        build = self._create_test_build(BuildStatus.BUILDING)
        self.assertEqual(build._status(), BuildStatus.BUILDING,
                         'Build status should be BUILDING after subjobs have started executing on slaves.')

    def test_build_status_returns_finished_after_all_subjobs_complete_and_slaves_finished(self):
        build = self._create_test_build(BuildStatus.BUILDING)
        build._create_build_artifact = MagicMock()

        self._finish_test_build(build)

        # Verify build artifacts was called after subjobs completed.
        build._create_build_artifact.assert_called_once_with()
        self.assertTrue(build._subjobs_are_finished)
        self.assertEqual(build._status(), BuildStatus.FINISHED)

    def test_complete_subjob_parses_payload_and_stores_value_in_atom_objects(self):
        fake_atom_exit_code = 777
        mock_open(mock=self.mock_open, read_data=str(fake_atom_exit_code))
        build = self._create_test_build(BuildStatus.BUILDING, num_subjobs=1, num_atoms_per_subjob=1)
        subjob = build.all_subjobs()[0]

        build.complete_subjob(subjob.subjob_id(), payload=self._FAKE_PAYLOAD)

        expected_payload_sys_path = join(Configuration['results_directory'], '1', 'artifact_0_0')
        self.mock_open.assert_called_once_with(
            join(expected_payload_sys_path, BuildArtifact.EXIT_CODE_FILE),
            'r',
        )
        self.assertEqual(subjob.atoms[0].exit_code, fake_atom_exit_code)

    def test_complete_subjob_marks_atoms_of_subjob_as_completed(self):
        build = self._create_test_build(BuildStatus.BUILDING)
        subjob = build.all_subjobs()[0]

        build.complete_subjob(subjob.subjob_id(), payload=self._FAKE_PAYLOAD)

        for atom in subjob.atoms:
            self.assertEqual(AtomState.COMPLETED, atom.state)

    def test_complete_subjob_writes_and_extracts_payload_to_correct_directory(self):
        build = self._create_test_build(BuildStatus.BUILDING)
        subjob = build.all_subjobs()[0]

        payload = {'filename': 'turtles.txt', 'body': 'Heroes in a half shell.'}
        build.complete_subjob(subjob.subjob_id(), payload=payload)

        expected_payload_sys_path = join(Configuration['results_directory'], '1', 'turtles.txt')
        self.mock_util.fs.write_file.assert_called_once_with('Heroes in a half shell.', expected_payload_sys_path)
        self.mock_util.fs.extract_tar.assert_called_once_with(expected_payload_sys_path, delete=True)

    def test_exception_is_raised_if_problem_occurs_writing_subjob(self):
        build = self._create_test_build(BuildStatus.BUILDING)
        subjob = build.all_subjobs()[0]

        self.mock_util.fs.write_file.side_effect = FileExistsError

        with self.assertRaises(Exception):
            build.complete_subjob(subjob.subjob_id(), payload=self._FAKE_PAYLOAD)

    @genty_dataset(
        max_executors_reached=(1, False),
        max_executors_not_reached=(30, True),
    )
    def test_need_more_slaves_returns_false_if_and_only_if_max_executors_is_reached(
            self, max_executors_for_build, build_should_need_more_slaves):
        job_config = self._create_job_config(max_executors=max_executors_for_build)
        build = self._create_test_build(BuildStatus.PREPARED, num_subjobs=100, job_config=job_config)
        scheduler = self.scheduler_pool.get(build)

        mock_slave = self._create_mock_slave(num_executors=5)
        scheduler.allocate_slave(slave=mock_slave)

        self.assertEqual(scheduler.needs_more_slaves(), build_should_need_more_slaves,
                         'If and only if the maximum number of executors is allocated we should not need more slaves.')

    def test_build_cannot_be_prepared_more_than_once(self):
        build = self._create_test_build(BuildStatus.QUEUED)
        job_config = self._create_job_config()
        subjobs = self._create_subjobs(count=3, job_config=job_config)
        subjob_calculator = self._create_mock_subjob_calc(subjobs)

        build.prepare(subjob_calculator)

        with self.assertRaisesRegex(RuntimeError, r'prepare\(\) was called more than once'):
            build.prepare(subjob_calculator)

    def test_teardown_called_on_slave_when_no_subjobs_remain(self):
        mock_slave = self._create_mock_slave(num_executors=1)
        self._create_test_build(BuildStatus.FINISHED, num_subjobs=1, slaves=[mock_slave])
        mock_slave.teardown.assert_called_with()

    def test_teardown_called_on_all_slaves_when_no_subjobs_remain(self):
        mock_slaves = [
            self._create_mock_slave(num_executors=5),
            self._create_mock_slave(num_executors=4),
            self._create_mock_slave(num_executors=3),
        ]

        self._create_test_build(BuildStatus.FINISHED, num_subjobs=20, slaves=mock_slaves)

        for mock_slave in mock_slaves:
            mock_slave.teardown.assert_called_with()

    def test_teardown_called_on_slave_when_slave_in_shutdown_mode(self):
        mock_slave = self._create_mock_slave(num_executors=5)
        mock_slave.start_subjob.side_effect = SlaveMarkedForShutdownError

        self._create_test_build(BuildStatus.BUILDING, num_subjobs=30, slaves=[mock_slave])

        mock_slave.teardown.assert_called_with()

    def test_cancel_prevents_further_subjob_starts_and_sets_canceled(self):
        mock_slave = self._create_mock_slave(num_executors=5)
        build = self._create_test_build(BuildStatus.BUILDING, num_subjobs=30, slaves=[mock_slave])
        self.assertEqual(mock_slave.start_subjob.call_count, 5,
                         'Slave should only have had as many subjobs started as its num_executors.')

        build.cancel()
        self._finish_test_build(build, assert_postbuild_tasks_complete=False)

        self.assertEqual(build._status(), BuildStatus.CANCELED, 'Canceled build should have canceled state.')
        self.assertEqual(mock_slave.start_subjob.call_count, 5,
                         'A canceled build should not have any more subjobs started after it has been canceled.')

    def test_cancel_is_a_noop_if_build_is_already_finished(self):
        mock_slave = self._create_mock_slave()
        build = self._create_test_build(BuildStatus.FINISHED, slaves=[mock_slave])
        num_slave_calls_before_cancel = len(mock_slave.method_calls)

        build.cancel()

        self.assertEqual(build._status(), BuildStatus.FINISHED,
                         'Canceling a finished build should not change its state.')
        self.assertEqual(len(mock_slave.method_calls), num_slave_calls_before_cancel,
                         'Canceling a finished build should not cause any further calls to slave.')

    def test_validate_update_params_for_cancelling_build(self):
        build = self._create_test_build()

        success, response = build.validate_update_params({'status': 'canceled'})

        self.assertTrue(success, "Correct status update should report success")
        self.assertEqual({}, response, "Error response should be empty")

    def test_validate_update_params_rejects_bad_params(self):
        build = self._create_test_build()

        success, response = build.validate_update_params({'status': 'foo'})

        self.assertFalse(success, "Bad status update reported success")
        self.assertEqual({'error': "Value (foo) is not in list of allowed values (['canceled']) for status"},
                         response, "Error response not expected")

    def test_validate_update_params_rejects_bad_keys(self):
        build = self._create_test_build()

        success, response = build.validate_update_params({'badkey': 'canceled'})

        self.assertFalse(success, "Bad status update reported success")
        self.assertEqual({'error': "Key (badkey) is not in list of allowed keys (status)"},
                         response, "Error response not expected")

    def test_update_state_to_canceled_will_cancel_build(self):
        build = self._create_test_build(BuildStatus.BUILDING)
        build.cancel = Mock()

        success = build.update_state({'status': 'canceled'})

        build.cancel.assert_called_once_with()
        self.assertTrue(success, "Update did not report success")

    def test_execute_next_subjob_with_no_more_subjobs_should_not_teardown_same_slave_twice(self):
        mock_slave = self._create_mock_slave()
        build = self._create_test_build(BuildStatus.BUILDING, slaves=[mock_slave])
        scheduler = self.scheduler_pool.get(build)
        self._finish_test_build(build, assert_postbuild_tasks_complete=False)

        scheduler.execute_next_subjob_or_free_executor(mock_slave)
        scheduler.execute_next_subjob_or_free_executor(mock_slave)

        self.assertEqual(mock_slave.teardown.call_count, 1, "Teardown should only be called once")

    def test_slave_is_fully_allocated_when_max_executors_per_slave_is_not_set(self):
        mock_slave = self._create_mock_slave(num_executors=10)
        job_config = self._create_job_config(max_executors_per_slave=float('inf'))

        self._create_test_build(BuildStatus.BUILDING, job_config=job_config, slaves=[mock_slave])

        self.assertEqual(mock_slave.claim_executor.call_count, 10,
                         'Claim executor should be called once for each of the slave executors.')

    def test_slave_is_only_allocated_up_to_max_executors_per_slave_setting(self):
        mock_slave = self._create_mock_slave(num_executors=10)
        job_config = self._create_job_config(max_executors_per_slave=5)

        self._create_test_build(BuildStatus.BUILDING, job_config=job_config, slaves=[mock_slave])

        self.assertEqual(mock_slave.claim_executor.call_count, 5,
                         'Claim executor should be called max_executors_per_slave times.')

    def test_generate_project_type_raises_error_if_failed_to_generate_project(self):
        build = self._create_test_build()
        self.patch('app.master.build.util.create_project_type').return_value = None

        with self.assertRaises(BuildProjectError):
            build.generate_project_type()

    def test_creating_build_sets_queued_timestamp(self):
        build = self._create_test_build()
        self.assertIsNotNone(build.get_state_timestamp(BuildStatus.QUEUED),
                             '"queued" timestamp should be set immediately after build creation.')

    def test_preparing_build_sets_prepared_timestamps(self):
        job_config = self._create_job_config()
        subjobs = self._create_subjobs(job_config=job_config)
        subjob_calculator = self._create_mock_subjob_calc(subjobs)
        build = self._create_test_build(BuildStatus.QUEUED)

        self.assertIsNone(build.get_state_timestamp(BuildStatus.PREPARED),
                          '"prepared" timestamp should not be set before build preparation.')

        build.prepare(subjob_calculator)

        self.assertIsNotNone(build.get_state_timestamp(BuildStatus.PREPARED),
                             '"prepared" timestamp should be set after build preparation.')

    def test_allocating_slave_to_build_sets_building_timestamp_only_on_first_slave_allocation(self):
        mock_slave1 = self._create_mock_slave()
        mock_slave2 = self._create_mock_slave()
        build = self._create_test_build(BuildStatus.PREPARED)
        scheduler = self.scheduler_pool.get(build)

        self.assertIsNone(build.get_state_timestamp(BuildStatus.BUILDING),
                          '"building" timestamp should not be set until slave allocated.')

        scheduler.allocate_slave(slave=mock_slave1)
        building_timestamp1 = build.get_state_timestamp(BuildStatus.BUILDING)
        scheduler.allocate_slave(slave=mock_slave2)
        building_timestamp2 = build.get_state_timestamp(BuildStatus.BUILDING)

        self.assertIsNotNone(building_timestamp1, '"building" timestamp should be set after first slave allocated.')
        self.assertEqual(building_timestamp1, building_timestamp2,
                         '"building" timestamp should not change upon further slave allocation.')

    def test_finishing_build_sets_finished_timestamp(self):
        build = self._create_test_build(BuildStatus.BUILDING)

        self.assertIsNone(build.get_state_timestamp(BuildStatus.FINISHED),
                          '"finished" timestamp should not be set until build finishes.')

        self._finish_test_build(build)

        self.assertIsNotNone(build.get_state_timestamp(BuildStatus.FINISHED),
                             '"finished" timestamp should be set when build finishes.')

    def test_marking_build_failed_sets_error_timestamp(self):
        build = self._create_test_build(BuildStatus.BUILDING)

        self.assertIsNone(build.get_state_timestamp(BuildStatus.ERROR),
                          '"error" timestamp should not be set unless build fails.')

        build.mark_failed('Test build was intentionally marked failed.')

        self.assertIsNotNone(build.get_state_timestamp(BuildStatus.ERROR),
                             '"error" timestamp should be set when build fails.')

    def test_canceling_build_sets_canceled_timestamp(self):
        build = self._create_test_build(BuildStatus.BUILDING)

        self.assertIsNone(build.get_state_timestamp(BuildStatus.CANCELED),
                          '"canceled" timestamp should not be set unless build is canceled.')

        build.cancel()

        self.assertIsNotNone(build.get_state_timestamp(BuildStatus.CANCELED),
                             '"canceled" timestamp should be set when build is canceled.')

    def test_get_failed_atoms_returns_none_if_not_finished(self):
        build = self._create_test_build(BuildStatus.BUILDING)
        self.assertIsNone(build._get_failed_atoms())

    def test_get_failed_atoms_returns_empty_list_if_finished_and_all_passed(self):
        build = self._create_test_build(BuildStatus.FINISHED)
        build._build_artifact = MagicMock(spec_set=BuildArtifact)
        build._build_artifact.get_failed_subjob_and_atom_ids.return_value = []

        self.assertEquals([], build._get_failed_atoms())

    def test_get_failed_atoms_returns_failed_atoms_only(self):
        build = self._create_test_build(BuildStatus.FINISHED, num_subjobs=5, num_atoms_per_subjob=10)
        build._build_artifact = MagicMock(spec_set=BuildArtifact)
        # Failed items: (SubjobId: 1, AtomId: 1) and (SubjobId: 3, AtomId: 3)
        build._build_artifact.get_failed_subjob_and_atom_ids.return_value = [(1, 1), (3, 3)]

        failed_atoms = build._get_failed_atoms()

        self.assertEquals(failed_atoms, [
            build._all_subjobs_by_id[1]._atoms[1],
            build._all_subjobs_by_id[3]._atoms[3],
        ])

    def _create_test_build(
            self,
            build_status=None,
            job_config=None,
            num_subjobs=3,
            num_atoms_per_subjob=3,
            slaves=None,
    ):
        """
        Create a Build instance for testing purposes. The instance will be created and brought to the specified
        state similarly to how it would reach that state in actual app execution. Build instances have a huge
        amount of internal state with complicated interactions, so this helper method helps us write tests that
        are much more consistent and closer to reality. It also helps us avoid modifying a build's private members
        directly.

        :type build_status: BuildStatus
        :rtype: Build
        """
        build = Build(BuildRequest(build_parameters={}))
        if build_status is None:
            return build

        # QUEUED: Instantiate a mock project_type instance for the build.
        mock_project_type = self._create_mock_project_type()
        self.patch('app.master.build.util.create_project_type').return_value = mock_project_type
        build.generate_project_type()
        if build_status is BuildStatus.QUEUED:
            return build

        # PREPARED: Create a fake job config and subjobs and hand them off to the build.
        job_config = job_config or self._create_job_config()
        mock_project_type.job_config.return_value = job_config
        subjobs = self._create_subjobs(count=num_subjobs, num_atoms_each=num_atoms_per_subjob, job_config=job_config)
        subjob_calculator = self._create_mock_subjob_calc(subjobs)
        build.prepare(subjob_calculator)
        if build_status is BuildStatus.PREPARED:
            return build

        # BUILDING: Allocate a slave and begin subjob executions on that slave.
        slaves = slaves or [self._create_mock_slave()]
        scheduler = self.scheduler_pool.get(build)
        for slave in slaves:
            scheduler.allocate_slave(slave=slave)
            scheduler.begin_subjob_executions_on_slave(slave=slave)
        if build_status is BuildStatus.BUILDING:
            return build

        # ERROR: Mark the in-progress build as failed.
        if build_status is BuildStatus.ERROR:
            build.mark_failed(failure_reason='Test build was intentionally marked failed.')
            return build

        # CANCELED: Cancel the in-progress build.
        if build_status is BuildStatus.CANCELED:
            build.cancel()
            return build

        # FINISHED: Complete all subjobs and allow all postbuild tasks to execute.
        self._finish_test_build(build)
        if build_status is BuildStatus.FINISHED:
            return build

        raise ValueError('Unsupported value for build_status: "{}".'.format(build_status))

    def _create_subjobs(self, count=3, num_atoms_each=1, build_id=0, job_config=None):
        return [
            Subjob(
                build_id=build_id,
                subjob_id=i,
                project_type=None,
                job_config=job_config,
                atoms=[Atom('NAME=Leonardo') for _ in range(num_atoms_each)],
            )
            for i in range(count)
        ]

    def _create_job_config(
            self,
            max_executors=_FAKE_MAX_EXECUTORS,
            max_executors_per_slave=_FAKE_MAX_EXECUTORS_PER_SLAVE,
    ):
        atomizer = Atomizer([{'FAKE': 'fake atomizer command'}])
        return JobConfig('', '', '', '', atomizer, max_executors, max_executors_per_slave)

    def _create_mock_project_type(self):
        return MagicMock(spec_set=ProjectType())

    def _create_mock_slave(self, num_executors=5):
        """
        :type num_executors: int
        :rtype: Slave | MagicMock
        """
        slave_spec = Slave('', 0)  # constructor values don't matter since this is just a spec object
        mock_slave = MagicMock(spec_set=slave_spec, url=self._FAKE_SLAVE_URL, num_executors=num_executors)

        counter = Counter()
        mock_slave.claim_executor.side_effect = counter.increment
        mock_slave.free_executor.side_effect = counter.decrement
        return mock_slave

    def _create_mock_subjob_calc(self, subjobs):
        """
        :type subjobs: list[Subjob]
        :rtype: SubjobCalculator
        """
        mock_subjob_calculator = MagicMock(spec_set=SubjobCalculator)
        mock_subjob_calculator.compute_subjobs_for_build.return_value = subjobs
        return mock_subjob_calculator

    def _finish_test_build(self, build, assert_postbuild_tasks_complete=True):
        """
        Complete all the subjobs for a build, triggering the build's postbuild tasks and transitioning it to the
        "finished" state. Since postbuild tasks are asynchronous, this injects an event so we can detect when the
        asynchronous method is finished.

        :type build: Build
        :type assert_postbuild_tasks_complete: bool
        """
        build_scheduler = self.scheduler_pool.get(build)

        # Inject an event into the build's postbuild task so that we can detect when it completes.
        postbuild_tasks_complete_event = Event()
        self._on_async_postbuild_tasks_completed(build, postbuild_tasks_complete_event.set)

        # Complete all subjobs for this build.
        build_has_running_subjobs = True
        while build_has_running_subjobs:
            build_has_running_subjobs = False

            # Copy the allocated slaves list since slaves may get deallocated during the loop.
            slaves_allocated = build_scheduler._slaves_allocated.copy()
            for mock_slave in slaves_allocated:
                self.assertIsInstance(mock_slave, Mock,
                                      '_finish_test_build() can only be used on builds with mock slaves.')

                for subjob in self._get_in_progress_subjobs_for_mock_slave(mock_slave):
                    build_has_running_subjobs = True
                    build.complete_subjob(subjob.subjob_id())
                    build_scheduler.execute_next_subjob_or_free_executor(mock_slave)

        # Wait for the async postbuild thread to complete executing postbuild tasks.
        if assert_postbuild_tasks_complete:
            self.assertTrue(postbuild_tasks_complete_event.wait(timeout=5),
                            'Postbuild tasks should be run and complete quickly when build finishes.')

    def _get_in_progress_subjobs_for_mock_slave(self, mock_slave):
        return [
            start_subjob_args[0]
            for start_subjob_args, _ in mock_slave.start_subjob.call_args_list
            if start_subjob_args[0].atoms[0].state is AtomState.IN_PROGRESS
        ]

    def _on_async_postbuild_tasks_completed(self, build, callback):
        # Patch a build so it executes the specified callback after its PostBuild thread finishes.
        original_async_postbuild_method = build._perform_async_postbuild_tasks

        def async_postbuild_tasks_with_callback():
            original_async_postbuild_method()
            callback()

        build._perform_async_postbuild_tasks = async_postbuild_tasks_with_callback
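
# A minimal sketch of the callback-injection pattern _finish_test_build and
# _on_async_postbuild_tasks_completed use above: wrap an object's async method so a
# threading.Event fires when the real work completes, letting the test block on
# wait() with a timeout instead of sleeping. The Worker class is an illustrative
# stand-in for a Build with an async postbuild method.
from threading import Event, Thread


class Worker:
    def perform_async_tasks(self):
        pass  # the real postbuild work would happen here


worker = Worker()
done_event = Event()
original_method = worker.perform_async_tasks


def method_with_callback():
    original_method()
    done_event.set()  # signal the test that the async work completed


worker.perform_async_tasks = method_with_callback
Thread(target=worker.perform_async_tasks).start()
assert done_event.wait(timeout=5), 'Async tasks should complete quickly.'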
class TestBuild(BaseUnitTestCase): _FAKE_SLAVE_URL = 'my.favorite.slave.com:40001' _FAKE_MAX_EXECUTORS = sys.maxsize _FAKE_MAX_EXECUTORS_PER_SLAVE = sys.maxsize _FAKE_PAYLOAD = {'filename': 'pizza_order.txt', 'body': 'Four large pepperoni, one small cheese.'} def setUp(self): super().setUp() Configuration['results_directory'] = abspath(join('some', 'temp', 'directory')) self.patch('app.master.build.BuildArtifact.__new__') # patch __new__ to mock instances but keep static methods self.mock_util = self.patch('app.master.build.app.util') # stub out util - it often interacts with the fs self.mock_open = self.patch('app.master.build.open', autospec=False, create=True) self.scheduler_pool = BuildSchedulerPool() def test_allocate_slave_calls_slave_setup(self): mock_slave = self._create_mock_slave() build = self._create_test_build(BuildStatus.PREPARED) scheduler = self.scheduler_pool.get(build) scheduler.allocate_slave(mock_slave) mock_slave.setup.assert_called_once_with(build, executor_start_index=0) def test_build_doesnt_use_more_than_max_executors(self): mock_slaves = [self._create_mock_slave(num_executors=5) for _ in range(3)] # 15 total available executors expected_num_executors_used = 12 # We expect the build to use 12 out of 15 available executors. job_config = self._create_job_config(max_executors=expected_num_executors_used) build = self._create_test_build(BuildStatus.PREPARED, job_config=job_config) scheduler = self.scheduler_pool.get(build) scheduler.execute_next_subjob_or_free_executor = Mock() for mock_slave in mock_slaves: scheduler.allocate_slave(mock_slave) scheduler.begin_subjob_executions_on_slave(mock_slave) self.assertEqual(scheduler.execute_next_subjob_or_free_executor.call_count, expected_num_executors_used, 'Build should start executing as many subjobs as its max_executors setting.') def test_build_doesnt_use_more_than_max_executors_per_slave(self): mock_slaves = [self._create_mock_slave(num_executors=5) for _ in range(3)] max_executors_per_slave = 2 job_config = self._create_job_config(max_executors_per_slave=max_executors_per_slave) build = self._create_test_build(build_status=BuildStatus.PREPARED, job_config=job_config) scheduler = self.scheduler_pool.get(build) scheduler.execute_next_subjob_or_free_executor = Mock() for mock_slave in mock_slaves: scheduler.allocate_slave(mock_slave) scheduler.begin_subjob_executions_on_slave(mock_slave) # Even though each slave has 5 executors, we should only start subjobs on 2 of those executors per slave. expected_subjob_execution_calls = [ call(mock_slaves[0]), call(mock_slaves[0]), call(mock_slaves[1]), call(mock_slaves[1]), call(mock_slaves[2]), call(mock_slaves[2]), ] self.assertEqual( scheduler.execute_next_subjob_or_free_executor.mock_calls, expected_subjob_execution_calls, 'Build should start executing as many subjobs per slave as its max_executors_per_slave setting.') def test_build_status_returns_queued_after_build_creation(self): build = self._create_test_build() self.assertEqual(build._status(), BuildStatus.QUEUED, 'Build status should be QUEUED immediately after build has been created.') @skip('PREPARING not yet supported in _create_test_build()') # WIP(joey): Support PREPARING state. 
def test_build_status_returns_preparing_after_build_begins_prep(self): build = self._create_test_build(BuildState.PREPARING) self.assertEqual(build._status(), BuildState.PREPARING, 'Build status should be PREPARING after build has begun preparation.') def test_build_status_returns_prepared_after_build_preparation(self): build = self._create_test_build(BuildStatus.PREPARED) self.assertEqual(build._status(), BuildStatus.PREPARED, 'Build status should be PREPARED after build has been prepared.') def test_build_status_returns_building_after_setup_has_started(self): mock_slave = self._create_mock_slave() build = self._create_test_build(BuildStatus.PREPARED) scheduler = self.scheduler_pool.get(build) scheduler.allocate_slave(mock_slave) self.assertEqual(build._status(), BuildStatus.BUILDING, 'Build status should be BUILDING after setup has started on slaves.') def test_build_status_returns_building_after_setup_is_complete_and_subjobs_are_executing(self): build = self._create_test_build(BuildStatus.BUILDING) self.assertEqual(build._status(), BuildStatus.BUILDING, 'Build status should be BUILDING after subjobs have started executing on slaves.') def test_build_status_returns_finished_after_all_subjobs_complete_and_slaves_finished(self): build = self._create_test_build(BuildStatus.BUILDING) build._create_build_artifact = MagicMock() self._finish_test_build(build) # Verify build artifacts was called after subjobs completed build._create_build_artifact.assert_called_once_with() self.assertTrue(build._all_subjobs_are_finished()) self.assertEqual(build._status(), BuildStatus.FINISHED) def test_complete_subjob_parses_payload_and_stores_value_in_atom_objects(self): fake_atom_exit_code = 777 mock_open(mock=self.mock_open, read_data=str(fake_atom_exit_code)) build = self._create_test_build(BuildStatus.BUILDING, num_subjobs=1, num_atoms_per_subjob=1) subjob = build.all_subjobs()[0] build.complete_subjob(subjob.subjob_id(), payload=self._FAKE_PAYLOAD) expected_payload_sys_path = join(Configuration['results_directory'], '1', 'artifact_0_0') self.mock_open.assert_called_once_with( join(expected_payload_sys_path, BuildArtifact.EXIT_CODE_FILE), 'r', ) self.assertEqual(subjob.atoms[0].exit_code, fake_atom_exit_code) def test_complete_subjob_marks_atoms_of_subjob_as_completed(self): build = self._create_test_build(BuildStatus.BUILDING) subjob = build.all_subjobs()[0] build.complete_subjob(subjob.subjob_id(), payload=self._FAKE_PAYLOAD) for atom in subjob.atoms: self.assertEqual(AtomState.COMPLETED, atom.state) def test_complete_subjob_writes_and_extracts_payload_to_correct_directory(self): build = self._create_test_build(BuildStatus.BUILDING) subjob = build.all_subjobs()[0] payload = {'filename': 'turtles.txt', 'body': 'Heroes in a half shell.'} build.complete_subjob(subjob.subjob_id(), payload=payload) expected_payload_sys_path = join(Configuration['results_directory'], '1', 'turtles.txt') self.mock_util.fs.write_file.assert_called_once_with('Heroes in a half shell.', expected_payload_sys_path) self.mock_util.fs.extract_tar.assert_called_once_with(expected_payload_sys_path, delete=True) def test_exception_is_raised_if_problem_occurs_writing_subjob(self): build = self._create_test_build(BuildStatus.BUILDING) subjob = build.all_subjobs()[0] self.mock_util.fs.write_file.side_effect = FileExistsError with self.assertRaises(Exception): build.complete_subjob(subjob.subjob_id(), payload=self._FAKE_PAYLOAD) @genty_dataset( max_executors_reached=(1, False, 100), max_executors_not_reached=(30, True, 100), 
fewer_subjobs_than_max_executors=(30, False, 1), ) def test_need_more_slaves( self, max_executors_for_build, build_should_need_more_slaves, num_subjobs ): job_config = self._create_job_config(max_executors=max_executors_for_build) build = self._create_test_build(BuildStatus.PREPARED, num_subjobs=num_subjobs, job_config=job_config) scheduler = self.scheduler_pool.get(build) mock_slave = self._create_mock_slave(num_executors=5) scheduler.allocate_slave(slave=mock_slave) self.assertEqual(scheduler.needs_more_slaves(), build_should_need_more_slaves, 'If and only if the maximum number of executors is allocated we should not need more slaves.') def test_build_cannot_be_prepared_more_than_once(self): build = self._create_test_build(BuildStatus.QUEUED) job_config = self._create_job_config() subjobs = self._create_subjobs(count=3, job_config=job_config) subjob_calculator = self._create_mock_subjob_calc(subjobs) build.prepare(subjob_calculator) with self.assertRaisesRegex(RuntimeError, r'prepare\(\) was called more than once'): build.prepare(subjob_calculator) def test_teardown_called_on_slave_when_no_subjobs_remain(self): mock_slave = self._create_mock_slave(num_executors=1) self._create_test_build(BuildStatus.FINISHED, num_subjobs=1, slaves=[mock_slave]) mock_slave.teardown.assert_called_with() def test_teardown_called_on_all_slaves_when_no_subjobs_remain(self): mock_slaves = [ self._create_mock_slave(num_executors=5), self._create_mock_slave(num_executors=4), self._create_mock_slave(num_executors=3), ] self._create_test_build(BuildStatus.FINISHED, num_subjobs=20, slaves=mock_slaves) for mock_slave in mock_slaves: mock_slave.teardown.assert_called_with() def test_teardown_called_on_slave_when_slave_in_shutdown_mode(self): mock_slave = self._create_mock_slave(num_executors=5) mock_slave.start_subjob.side_effect = SlaveMarkedForShutdownError self._create_test_build(BuildStatus.BUILDING, num_subjobs=30, slaves=[mock_slave]) mock_slave.teardown.assert_called_with() def test_cancel_prevents_further_subjob_starts_and_sets_canceled(self): # dev: this is flaky now mock_slave = self._create_mock_slave(num_executors=5) build = self._create_test_build(BuildStatus.BUILDING, num_subjobs=30, slaves=[mock_slave]) self.assertEqual(mock_slave.start_subjob.call_count, 5, 'Slave should only have had as many subjobs started ' 'as its num_executors.') build.cancel() self._finish_test_build(build, assert_postbuild_tasks_complete=False) self.assertEqual(build._status(), BuildStatus.CANCELED, 'Canceled build should have canceled state.') self.assertEqual(mock_slave.start_subjob.call_count, 5, 'A canceled build should not have any more subjobs ' 'started after it has been canceled.') def test_cancel_is_a_noop_if_build_is_already_finished(self): mock_slave = self._create_mock_slave() build = self._create_test_build(BuildStatus.FINISHED, slaves=[mock_slave]) num_slave_calls_before_cancel = len(mock_slave.method_calls) build.cancel() self.assertEqual(build._status(), BuildStatus.FINISHED, 'Canceling a finished build should not change its state.') self.assertEqual(len(mock_slave.method_calls), num_slave_calls_before_cancel, 'Canceling a finished build should not cause any further calls to slave.') def test_validate_update_params_for_cancelling_build(self): build = self._create_test_build() success, response = build.validate_update_params({'status': 'canceled'}) self.assertTrue(success, "Correct status update should report success") self.assertEqual({}, response, "Error response should be empty") def 

    def test_validate_update_params_rejects_bad_params(self):
        build = self._create_test_build()

        success, response = build.validate_update_params({'status': 'foo'})

        self.assertFalse(success, "Bad status update reported success")
        self.assertEqual({'error': "Value (foo) is not in list of allowed values (['canceled']) for status"},
                         response, "Error response did not match expected value")

    def test_validate_update_params_rejects_bad_keys(self):
        build = self._create_test_build()

        success, response = build.validate_update_params({'badkey': 'canceled'})

        self.assertFalse(success, "Bad status update reported success")
        self.assertEqual({'error': "Key (badkey) is not in list of allowed keys (status)"},
                         response, "Error response did not match expected value")

    def test_update_state_to_canceled_will_cancel_build(self):
        build = self._create_test_build(BuildStatus.BUILDING)
        build.cancel = Mock()

        success = build.update_state({'status': 'canceled'})

        build.cancel.assert_called_once_with()
        self.assertTrue(success, "Update did not report success")

    def test_execute_next_subjob_with_no_more_subjobs_should_not_teardown_same_slave_twice(self):
        mock_slave = self._create_mock_slave()
        build = self._create_test_build(BuildStatus.BUILDING, slaves=[mock_slave])
        scheduler = self.scheduler_pool.get(build)
        self._finish_test_build(build, assert_postbuild_tasks_complete=False)

        scheduler.execute_next_subjob_or_free_executor(mock_slave)
        scheduler.execute_next_subjob_or_free_executor(mock_slave)

        self.assertEqual(mock_slave.teardown.call_count, 1, "Teardown should only be called once")

    def test_slave_is_fully_allocated_when_max_executors_per_slave_is_not_set(self):
        mock_slave = self._create_mock_slave(num_executors=10)
        job_config = self._create_job_config(max_executors_per_slave=float('inf'))

        self._create_test_build(BuildStatus.BUILDING, job_config=job_config, slaves=[mock_slave])

        self.assertEqual(mock_slave.claim_executor.call_count, 10,
                         'Claim executor should be called once for each of the slave executors.')

    def test_slave_is_only_allocated_up_to_max_executors_per_slave_setting(self):
        mock_slave = self._create_mock_slave(num_executors=10)
        job_config = self._create_job_config(max_executors_per_slave=5)

        self._create_test_build(BuildStatus.BUILDING, job_config=job_config, slaves=[mock_slave])

        self.assertEqual(mock_slave.claim_executor.call_count, 5,
                         'Claim executor should be called max_executors_per_slave times.')

    def test_generate_project_type_raises_error_if_failed_to_generate_project(self):
        build = self._create_test_build()
        self.patch('app.master.build.util.create_project_type').return_value = None

        with self.assertRaises(BuildProjectError):
            build.generate_project_type()

    def test_creating_build_sets_queued_timestamp(self):
        build = self._create_test_build()
        self.assertIsNotNone(self._get_build_state_timestamp(build, BuildState.QUEUED),
                             '"queued" timestamp should be set immediately after build creation.')

    def test_preparing_build_sets_prepared_timestamps(self):
        job_config = self._create_job_config()
        subjobs = self._create_subjobs(job_config=job_config)
        subjob_calculator = self._create_mock_subjob_calc(subjobs)
        build = self._create_test_build(BuildStatus.QUEUED)

        self.assertIsNone(self._get_build_state_timestamp(build, BuildState.PREPARED),
                          '"prepared" timestamp should not be set before build preparation.')

        build.prepare(subjob_calculator)

        self.assertIsNotNone(self._get_build_state_timestamp(build, BuildState.PREPARED),
                             '"prepared" timestamp should be set once the build has been prepared.')
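
    # The state-timestamp tests in this class read timestamps through the build's
    # public API via _get_build_state_timestamp() (defined at the bottom of this
    # class). The relevant slice of api_representation() is assumed to look
    # roughly like the following, keyed by lowercased state name:
    #
    #     {'state_timestamps': {'queued': 1500000000.0, 'prepared': None, ...}}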

    def test_preparing_build_creates_empty_results_directory(self):
        subjob_calculator = self._create_mock_subjob_calc([])
        build = self._create_test_build(BuildStatus.QUEUED)

        build.prepare(subjob_calculator)

        self.mock_util.fs.create_dir.assert_called_once_with(build._build_results_dir())

    def test_allocating_slave_to_build_sets_building_timestamp_only_on_first_slave_allocation(self):
        mock_slave1 = self._create_mock_slave()
        mock_slave2 = self._create_mock_slave()
        build = self._create_test_build(BuildStatus.PREPARED)
        scheduler = self.scheduler_pool.get(build)

        self.assertIsNone(self._get_build_state_timestamp(build, BuildState.BUILDING),
                          '"building" timestamp should not be set until a slave is allocated.')

        scheduler.allocate_slave(slave=mock_slave1)
        building_timestamp1 = self._get_build_state_timestamp(build, BuildState.BUILDING)
        scheduler.allocate_slave(slave=mock_slave2)
        building_timestamp2 = self._get_build_state_timestamp(build, BuildState.BUILDING)

        self.assertIsNotNone(building_timestamp1,
                             '"building" timestamp should be set after first slave allocated.')
        self.assertEqual(building_timestamp1, building_timestamp2,
                         '"building" timestamp should not change upon further slave allocation.')

    def test_finishing_build_sets_finished_timestamp(self):
        build = self._create_test_build(BuildStatus.BUILDING)

        self.assertIsNone(self._get_build_state_timestamp(build, BuildState.FINISHED),
                          '"finished" timestamp should not be set until build finishes.')

        self._finish_test_build(build)

        self.assertIsNotNone(self._get_build_state_timestamp(build, BuildState.FINISHED),
                             '"finished" timestamp should be set when build finishes.')

    def test_marking_build_failed_sets_error_timestamp(self):
        build = self._create_test_build(BuildStatus.BUILDING)

        self.assertIsNone(self._get_build_state_timestamp(build, BuildState.ERROR),
                          '"error" timestamp should not be set unless build fails.')

        build.mark_failed('Test build was intentionally marked failed.')

        self.assertIsNotNone(self._get_build_state_timestamp(build, BuildState.ERROR),
                             '"error" timestamp should be set when build fails.')

    def test_canceling_build_sets_canceled_timestamp(self):
        build = self._create_test_build(BuildStatus.BUILDING)

        self.assertIsNone(self._get_build_state_timestamp(build, BuildState.CANCELED),
                          '"canceled" timestamp should not be set unless build is canceled.')

        build.cancel()

        self.assertIsNotNone(self._get_build_state_timestamp(build, BuildState.CANCELED),
                             '"canceled" timestamp should be set when build is canceled.')

    def test_get_failed_atoms_returns_none_if_not_finished(self):
        build = self._create_test_build(BuildStatus.BUILDING)
        self.assertIsNone(build._get_failed_atoms())

    def test_get_failed_atoms_returns_empty_list_if_finished_and_all_passed(self):
        build = self._create_test_build(BuildStatus.FINISHED)
        build._build_artifact = MagicMock(spec_set=BuildArtifact)
        build._build_artifact.get_failed_subjob_and_atom_ids.return_value = []

        self.assertEqual([], build._get_failed_atoms())

    def test_get_failed_atoms_returns_failed_atoms_only(self):
        build = self._create_test_build(BuildStatus.FINISHED, num_subjobs=5, num_atoms_per_subjob=10)
        build._build_artifact = MagicMock(spec_set=BuildArtifact)
        # Failed items: (subjob_id: 1, atom_id: 1) and (subjob_id: 3, atom_id: 3)
        build._build_artifact.get_failed_subjob_and_atom_ids.return_value = [(1, 1), (3, 3)]

        failed_atoms = build._get_failed_atoms()

        self.assertEqual(failed_atoms, [
            build._all_subjobs_by_id[1]._atoms[1],
            build._all_subjobs_by_id[3]._atoms[3],
        ])
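
    # The helpers below do the heavy lifting for the tests above. A typical usage
    # pattern (illustrative, mirroring the tests in this class) is to construct a
    # build in the state under test and then assert against it or its scheduler:
    #
    #     mock_slave = self._create_mock_slave(num_executors=2)
    #     build = self._create_test_build(BuildStatus.BUILDING, num_subjobs=10, slaves=[mock_slave])
    #     scheduler = self.scheduler_pool.get(build)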

    def _create_test_build(
            self,
            build_status=None,
            job_config=None,
            num_subjobs=3,
            num_atoms_per_subjob=3,
            slaves=None,
    ):
        """
        Create a Build instance for testing purposes. The instance will be created and brought to the
        specified state similarly to how it would reach that state in actual app execution. Build
        instances have a huge amount of internal state with complicated interactions, so this helper
        method helps us write tests that are much more consistent and closer to reality. It also helps
        us avoid modifying a build's private members directly.

        :type build_status: BuildStatus
        :rtype: Build
        """
        build = Build(BuildRequest(build_parameters={}))
        if build_status is None:
            return build

        # QUEUED: Instantiate a mock project_type instance for the build.
        mock_project_type = self._create_mock_project_type()
        self.patch('app.master.build.util.create_project_type').return_value = mock_project_type
        build.generate_project_type()
        if build_status is BuildStatus.QUEUED:
            return build

        # PREPARED: Create a fake job config and subjobs and hand them off to the build.
        job_config = job_config or self._create_job_config()
        mock_project_type.job_config.return_value = job_config
        subjobs = self._create_subjobs(count=num_subjobs, num_atoms_each=num_atoms_per_subjob, job_config=job_config)
        subjob_calculator = self._create_mock_subjob_calc(subjobs)
        build.prepare(subjob_calculator)
        if build_status is BuildStatus.PREPARED:
            return build

        # BUILDING: Allocate a slave and begin subjob executions on that slave.
        slaves = slaves or [self._create_mock_slave()]
        scheduler = self.scheduler_pool.get(build)
        for slave in slaves:
            scheduler.allocate_slave(slave=slave)
            scheduler.begin_subjob_executions_on_slave(slave=slave)
        if build_status is BuildStatus.BUILDING:
            return build

        # ERROR: Mark the in-progress build as failed.
        if build_status is BuildStatus.ERROR:
            build.mark_failed(failure_reason='Test build was intentionally marked failed.')
            return build

        # CANCELED: Cancel the in-progress build.
        if build_status is BuildStatus.CANCELED:
            build.cancel()
            return build

        # FINISHED: Complete all subjobs and allow all postbuild tasks to execute.
        self._finish_test_build(build)
        if build_status is BuildStatus.FINISHED:
            return build

        raise ValueError('Unsupported value for build_status: "{}".'.format(build_status))
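
    # Because _create_test_build() advances through each earlier stage on the way
    # to a later one, a build created in a late state also carries the side effects
    # of every stage it passed through. For example (illustrative):
    #
    #     build = self._create_test_build(BuildStatus.FINISHED, num_subjobs=2)
    #     # The build was queued, prepared, and built on the way to FINISHED, so:
    #     assert self._get_build_state_timestamp(build, BuildState.QUEUED) is not None
    #     assert self._get_build_state_timestamp(build, BuildState.PREPARED) is not None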

    def _create_subjobs(self, count=3, num_atoms_each=1, build_id=0, job_config=None):
        return [
            Subjob(
                build_id=build_id,
                subjob_id=i,
                project_type=None,
                job_config=job_config,
                atoms=[Atom('NAME=Leonardo') for _ in range(num_atoms_each)],
            )
            for i in range(count)
        ]

    def _create_job_config(
            self,
            max_executors=_FAKE_MAX_EXECUTORS,
            max_executors_per_slave=_FAKE_MAX_EXECUTORS_PER_SLAVE,
    ):
        atomizer = Atomizer([{'FAKE': 'fake atomizer command'}])
        return JobConfig('', '', '', '', atomizer, max_executors, max_executors_per_slave)

    def _create_mock_project_type(self):
        return MagicMock(spec_set=ProjectType())

    def _create_mock_slave(self, num_executors=5):
        """
        :type num_executors: int
        :rtype: Slave | MagicMock
        """
        slave_spec = Slave('', 0)  # constructor values don't matter since this is just a spec object
        mock_slave = MagicMock(spec_set=slave_spec, url=self._FAKE_SLAVE_URL, num_executors=num_executors)

        # Mimic executor accounting on the real Slave: claim_executor() and free_executor()
        # adjust a shared counter and return the updated count.
        counter = Counter()
        mock_slave.claim_executor.side_effect = counter.increment
        mock_slave.free_executor.side_effect = counter.decrement

        return mock_slave

    def _create_mock_subjob_calc(self, subjobs):
        """
        :type subjobs: list[Subjob]
        :rtype: SubjobCalculator
        """
        mock_subjob_calculator = MagicMock(spec_set=SubjobCalculator)
        mock_subjob_calculator.compute_subjobs_for_build.return_value = subjobs
        return mock_subjob_calculator

    def _finish_test_build(self, build, assert_postbuild_tasks_complete=True):
        """
        Complete all the subjobs for a build, triggering the build's postbuild tasks and transitioning
        it to the "finished" state. Since postbuild tasks are asynchronous, this injects an event so we
        can detect when the asynchronous method is finished.

        :type build: Build
        :type assert_postbuild_tasks_complete: bool
        """
        build_scheduler = self.scheduler_pool.get(build)

        # Inject an event into the build's postbuild task so that we can detect when it completes.
        postbuild_tasks_complete_event = Event()
        self._on_async_postbuild_tasks_completed(build, postbuild_tasks_complete_event.set)

        # Complete all subjobs for this build.
        build_has_running_subjobs = True
        while build_has_running_subjobs:
            build_has_running_subjobs = False
            # Copy the allocated slaves list since slaves may get deallocated during the loop.
            slaves_allocated = build_scheduler._slaves_allocated.copy()
            for mock_slave in slaves_allocated:
                self.assertIsInstance(mock_slave, Mock,
                                      '_finish_test_build() can only be used on builds with mock slaves.')
                for subjob in self._get_in_progress_subjobs_for_mock_slave(mock_slave):
                    build_has_running_subjobs = True
                    build.complete_subjob(subjob.subjob_id())
                    build_scheduler.execute_next_subjob_or_free_executor(mock_slave)

        # Wait for the async postbuild thread to complete executing postbuild tasks.
        if assert_postbuild_tasks_complete:
            self.assertTrue(postbuild_tasks_complete_event.wait(timeout=5),
                            'Postbuild tasks should be run and complete quickly when build finishes.')

    def _get_in_progress_subjobs_for_mock_slave(self, mock_slave):
        return [
            start_subjob_args[0]
            for start_subjob_args, _ in mock_slave.start_subjob.call_args_list
            if start_subjob_args[0].atoms[0].state is AtomState.IN_PROGRESS
        ]
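
    # _on_async_postbuild_tasks_completed() below uses a generic wrap-and-signal
    # technique for synchronizing on fire-and-forget work: replace the async entry
    # point with a wrapper that runs the original and then fires a callback. A
    # minimal standalone sketch of the same idea (independent of this codebase):
    #
    #     done = Event()
    #     original = some_object.async_entry_point
    #     def wrapped():
    #         original()
    #         done.set()
    #     some_object.async_entry_point = wrapped
    #     # ...trigger the async work, then: assert done.wait(timeout=5)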

    def _on_async_postbuild_tasks_completed(self, build, callback):
        # Patch the build so it executes the specified callback after its postbuild thread finishes.
        original_async_postbuild_method = build._perform_async_postbuild_tasks

        def async_postbuild_tasks_with_callback():
            original_async_postbuild_method()
            callback()

        build._perform_async_postbuild_tasks = async_postbuild_tasks_with_callback

    def _get_build_state_timestamp(self, build, build_state):
        """
        Get the recorded timestamp for a given build state. This may be None if the build has not yet
        reached the specified state.

        :type build: Build
        :type build_state: BuildState
        :rtype: float | None
        """
        return build.api_representation()['state_timestamps'].get(build_state.lower())
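
    # Note that state_timestamps keys are the lowercased state names (hence the
    # build_state.lower() above), so an equivalent direct lookup (illustrative)
    # would be:
    #
    #     queued_at = build.api_representation()['state_timestamps'].get('queued')  # float or None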