def test_executing_build_teardown_multiple_times_will_raise_exception(self):
    self.mock_network.post().status_code = http.client.OK
    slave = self._create_cluster_slave()
    project_type_mock = self.patch('app.slave.cluster_slave.util.create_project_type').return_value
    # This test uses setup_complete_event to detect when the async fetch_project() has executed.
    setup_complete_event = Event()
    project_type_mock.fetch_project.side_effect = self.no_args_side_effect(setup_complete_event.set)
    # This test uses teardown_event to cause a thread to block on the teardown_build() call.
    teardown_event = Event()
    project_type_mock.teardown_build.side_effect = self.no_args_side_effect(teardown_event.wait)

    slave.connect_to_master(self._FAKE_MASTER_URL)
    slave.setup_build(build_id=123, project_type_params={'type': 'Fake'}, build_executor_start_index=0)
    self.assertTrue(setup_complete_event.wait(timeout=5), 'Build setup should complete very quickly.')

    # Start the first thread that does build teardown. This thread will block on teardown_build().
    first_thread = SafeThread(target=slave._do_build_teardown_and_reset)
    first_thread.start()
    # Call build teardown() again and it should raise an exception.
    with self.assertRaises(BuildTeardownError):
        slave._do_build_teardown_and_reset()

    # Cleanup: Unblock the first thread and let it finish. We use the unhandled exception handler just in case any
    # exceptions occurred on the thread (so that they'd be passed back to the main thread and fail the test).
    teardown_event.set()
    with UnhandledExceptionHandler.singleton():
        first_thread.join()
def __init__(self):
    self._logger = get_logger(__name__)

    self._all_slaves_by_url = {}
    self._all_builds_by_id = OrderedDict()  # This is an OrderedDict so we can more easily implement get_queue()
    self._builds_waiting_for_slaves = Queue()

    self._request_queue = Queue()
    self._request_handler = SerialRequestHandler()

    self._request_queue_worker_thread = SafeThread(
        target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
    self._request_queue_worker_thread.start()

    self._slave_allocation_worker_thread = SafeThread(
        target=self._slave_allocation_loop, name='SlaveAllocationLoop', daemon=True)
    self._slave_allocation_worker_thread.start()

    self._master_results_path = Configuration['results_directory']

    # It's important that idle slaves are only in the queue once so we use OrderedSet
    self._idle_slaves = OrderedSetQueue()

    # Delete all old builds when master starts. Remove this if/when build numbers are unique across master
    # starts/stops
    if os.path.exists(self._master_results_path):
        shutil.rmtree(self._master_results_path)
    fs.create_dir(self._master_results_path)
def __init__(self):
    self._logger = get_logger(__name__)

    self._all_slaves_by_url = {}
    self._all_builds_by_id = OrderedDict()  # This is an OrderedDict so we can more easily implement get_queue()
    self._builds_waiting_for_slaves = Queue()

    self._request_queue = Queue()
    self._request_handler = SerialRequestHandler()

    self._request_queue_worker_thread = SafeThread(
        target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
    self._request_queue_worker_thread.start()

    self._slave_allocation_worker_thread = SafeThread(
        target=self._slave_allocation_loop, name='SlaveAllocationLoop', daemon=True)
    self._slave_allocation_worker_thread.start()

    self._master_results_path = Configuration['results_directory']

    # It's important that idle slaves are only in the queue once so we use OrderedSet
    self._idle_slaves = OrderedSetQueue()

    # Asynchronously delete (but immediately rename) all old builds when master starts.
    # Remove this if/when build numbers are unique across master starts/stops
    if os.path.exists(self._master_results_path):
        fs.async_delete(self._master_results_path)
    fs.create_dir(self._master_results_path)
class SlaveAllocator(object):
    """
    The SlaveAllocator class is responsible for allocating slaves to prepared builds.
    """

    def __init__(self, build_request_handler):
        """
        :type build_request_handler: BuildRequestHandler
        """
        self._logger = get_logger(__name__)
        self._build_request_handler = build_request_handler
        self._idle_slaves = OrderedSetQueue()
        self._allocation_thread = SafeThread(
            target=self._slave_allocation_loop, name='SlaveAllocationLoop', daemon=True)

    def start(self):
        """
        Start the infinite loop that will pull off prepared builds from a synchronized queue and allocate them
        slaves.
        """
        if self._allocation_thread.is_alive():
            raise RuntimeError('Error: slave allocation loop was asked to start when it is already running.')
        self._allocation_thread.start()

    def _slave_allocation_loop(self):
        """
        Builds wait in line for more slaves. This method executes in the background on another thread and
        watches for idle slaves, then gives them out to the waiting builds.
        """
        while True:
            # This is a blocking call that will block until there is a prepared build.
            build_waiting_for_slave = self._build_request_handler.next_prepared_build()

            while build_waiting_for_slave.needs_more_slaves():
                claimed_slave = self._idle_slaves.get()
                # Remove dead slaves from the idle queue
                if not claimed_slave.is_alive(use_cached=False):
                    continue
                # The build may have completed while we were waiting for an idle slave, so check one more time.
                if build_waiting_for_slave.needs_more_slaves():
                    # Potential race condition here! If the build completes after the if statement is checked,
                    # a slave will be allocated needlessly (and run slave.setup(), which can be significant work).
                    self._logger.info('Allocating slave {} to build {}.',
                                      claimed_slave.url, build_waiting_for_slave.build_id())
                    build_waiting_for_slave.allocate_slave(claimed_slave)
                else:
                    self.add_idle_slave(claimed_slave)

            self._logger.info('Done allocating slaves for build {}.', build_waiting_for_slave.build_id())

    def add_idle_slave(self, slave):
        """
        Add a slave to the idle queue.
        :type slave: Slave
        """
        slave.mark_as_idle()
        self._idle_slaves.put(slave)
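# --- Illustrative wiring sketch (not part of the original source) ---
# The SlaveAllocator above and the BuildRequestHandler shown later in this section run as daemon work
# loops on the master. This sketch shows how they could be wired together, based only on the
# constructors and methods shown in this section; the surrounding application code is assumed.
def start_master_work_loops():
    build_request_handler = BuildRequestHandler()             # the no-argument variant shown later in this section
    slave_allocator = SlaveAllocator(build_request_handler)

    build_request_handler.start()    # starts the RequestHandlerLoop daemon thread
    slave_allocator.start()          # starts the SlaveAllocationLoop daemon thread

    # A newly connected slave would then be handed to the allocator:
    #     slave_allocator.add_idle_slave(slave)
    return build_request_handler, slave_allocator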
def post(self):
    self._write_status()
    kill_thread = SafeThread(
        name='kill-thread',
        target=self._cluster_slave.kill,
    )
    kill_thread.start()
def __init__(self):
    self._logger = get_logger(__name__)
    self._builds_waiting_for_slaves = Queue()
    self._request_queue = Queue()
    self._request_queue_worker_thread = SafeThread(
        target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
    self._project_preparation_locks = {}
def __init__(self, scheduler_pool):
    """
    :type scheduler_pool: app.master.build_scheduler_pool.BuildSchedulerPool
    """
    self._logger = get_logger(__name__)
    self._scheduler_pool = scheduler_pool
    self._idle_slaves = OrderedSetQueue()
    self._allocation_thread = SafeThread(
        target=self._slave_allocation_loop, name='SlaveAllocationLoop', daemon=True)
def __init__(self, build_request_handler):
    """
    :type build_request_handler: BuildRequestHandler
    """
    self._logger = get_logger(__name__)
    self._build_request_handler = build_request_handler
    self._idle_slaves = OrderedSetQueue()
    self._allocation_thread = SafeThread(
        target=self._slave_allocation_loop, name='SlaveAllocationLoop', daemon=True)
def __init__(self, scheduler_pool):
    """
    :type scheduler_pool: app.master.build_scheduler_pool.BuildSchedulerPool
    """
    self._logger = get_logger(__name__)
    self._scheduler_pool = scheduler_pool
    self._request_queue = Queue()
    self._request_queue_worker_thread = SafeThread(
        target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
    self._project_preparation_locks = {}
def test_exception_on_safe_thread_calls_teardown_callbacks(self):
    my_awesome_teardown_callback = MagicMock()
    unhandled_exception_handler = UnhandledExceptionHandler.singleton()
    unhandled_exception_handler.add_teardown_callback(my_awesome_teardown_callback, 'fake arg', fake_kwarg='boop')

    def my_terrible_method():
        raise Exception('Sic semper tyrannis!')

    thread = SafeThread(target=my_terrible_method)
    thread.start()
    thread.join()

    my_awesome_teardown_callback.assert_called_once_with('fake arg', fake_kwarg='boop')
def __init__(self, scheduler_pool):
    """
    :type scheduler_pool: BuildSchedulerPool
    """
    self._logger = get_logger(__name__)
    self._scheduler_pool = scheduler_pool
    self._builds_waiting_for_slaves = Queue()
    self._request_queue = Queue()
    self._request_queue_worker_thread = SafeThread(
        target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
    self._project_preparation_locks = {}
    self._subjob_calculator = SubjobCalculator()
def test_normal_execution_on_safe_thread_does_not_call_teardown_callbacks(self):
    my_lonely_teardown_callback = MagicMock()
    unhandled_exception_handler = UnhandledExceptionHandler.singleton()
    unhandled_exception_handler.add_teardown_callback(my_lonely_teardown_callback)

    def my_fantastic_method():
        print('Veritas vos liberabit!')

    thread = SafeThread(target=my_fantastic_method)
    thread.start()
    thread.join()

    self.assertFalse(my_lonely_teardown_callback.called,
                     'The teardown callback should not be called unless an exception is raised.')
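# --- Illustrative sketch (not part of the original source) ---
# The two tests above pin down the SafeThread contract: a target that raises triggers the
# UnhandledExceptionHandler teardown callbacks, while a clean run does not. A minimal thread class
# with that behavior might look like the following; the real SafeThread may differ.
import threading

from app.util.unhandled_exception_handler import UnhandledExceptionHandler  # import path assumed


class SafeThreadSketch(threading.Thread):
    def run(self, *args, **kwargs):
        # Running the thread body inside the handler context means an exception escaping the body
        # is seen by the handler's __exit__, which runs the registered teardown callbacks.
        with UnhandledExceptionHandler.singleton():
            super().run(*args, **kwargs)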
def start_working_on_subjob(self, build_id, subjob_id, subjob_artifact_dir, atomic_commands):
    """
    Begin working on a subjob with the given build id and subjob id. This just starts the subjob execution
    asynchronously on a separate thread.

    :type build_id: int
    :type subjob_id: int
    :type subjob_artifact_dir: str
    :type atomic_commands: list[str]
    :return: The text to return in the API response.
    :rtype: dict[str, int]
    """
    if build_id != self._current_build_id:
        raise BadRequestError('Attempted to start subjob {} for build {}, '
                              'but current build id is {}.'.format(subjob_id, build_id, self._current_build_id))

    # get idle executor from queue to claim it as in-use (or block until one is available)
    executor = self._idle_executors.get()

    # Start a thread to execute the job (after waiting for setup to complete)
    SafeThread(
        target=self._execute_subjob,
        args=(build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands),
        name='Bld{}-Sub{}'.format(build_id, subjob_id),
    ).start()

    self._logger.info('Slave ({}:{}) has received subjob. (Build {}, Subjob {})', self.host, self.port,
                      build_id, subjob_id)
    return {'executor_id': executor.id}
def setup_build(self, build_id, project_type_params, build_executor_start_index):
    """
    Usually called once per build to do build-specific setup. Will block any subjobs from executing until setup
    completes. The actual setup is performed on another thread and will unblock subjobs (via an Event) once it
    finishes.

    :param build_id: The id of the build to run setup on
    :type build_id: int
    :param project_type_params: The parameters that define the project_type this build will execute in
    :type project_type_params: dict
    :param build_executor_start_index: How many executors have already been allocated on other slaves for
        this build
    :type build_executor_start_index: int
    """
    self._logger.info('Executing setup for build {} (type: {}).', build_id, project_type_params.get('type'))
    self._current_build_id = build_id
    self._build_teardown_coin = SingleUseCoin()  # protects against build_teardown being executed multiple times

    # create a project_type instance for build-level operations
    self._project_type = util.create_project_type(project_type_params)

    # verify all executors are idle
    if not self._idle_executors.full():
        raise RuntimeError('Slave tried to setup build but not all executors are idle. ({}/{} executors idle.)'
                           .format(self._idle_executors.qsize(), self._num_executors))

    # Collect all the executors to pass to project_type.fetch_project(). This will create a new project_type for
    # each executor (for subjob-level operations).
    executors = list(self._idle_executors.queue)
    SafeThread(
        target=self._async_setup_build,
        name='Bld{}-Setup'.format(build_id),
        args=(executors, project_type_params, build_executor_start_index)
    ).start()
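# --- Illustrative sketch (not part of the original source) ---
# setup_build() above primes a SingleUseCoin to guard against build teardown running twice (the
# teardown test earlier in this section expects a BuildTeardownError on the second attempt). One
# plausible shape for such a coin is below; the method name spend() and the exact semantics are
# assumptions, not the project's confirmed API.
from threading import Lock


class SingleUseCoinSketch(object):
    def __init__(self):
        self._lock = Lock()
        self._spent = False

    def spend(self):
        """Return True exactly once; every later call returns False."""
        with self._lock:
            if self._spent:
                return False
            self._spent = True
            return True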
def start_subjob(self, subjob):
    """
    :type subjob: Subjob
    """
    if not self.is_alive():
        raise RuntimeError('Tried to start a subjob on a dead slave! ({}, id: {})'.format(self.url, self.id))

    SafeThread(target=self._async_start_subjob, args=(subjob,)).start()
def test_calling_kill_subprocesses_will_break_out_of_command_execution_wait_loop(self):
    self._mock_stdout_and_stderr(b'fake_output', b'fake_error')
    self.mock_popen.pid = 55555
    self._simulate_hanging_popen_process()

    project_type = ProjectType()
    command_thread = SafeThread(target=project_type.execute_command_in_project, args=('echo The power is yours!',))

    # This calls execute_command_in_project() on one thread, and calls kill_subprocesses() on another. The
    # kill_subprocesses() call should cause the first thread to exit.
    command_thread.start()
    project_type.kill_subprocesses()

    # This *should* join immediately, but we specify a timeout just in case something goes wrong so that the test
    # doesn't hang. A successful join implies success. We also use the UnhandledExceptionHandler so that exceptions
    # propagate from the child thread to the test thread and fail the test.
    with UnhandledExceptionHandler.singleton():
        command_thread.join(timeout=10)
        if command_thread.is_alive():
            self.mock_killpg()  # Calling killpg() causes the command thread to end.
            self.fail('project_type.kill_subprocesses should cause the command execution wait loop to exit.')

    self.mock_killpg.assert_called_once_with(55555, ANY)  # Note: os.killpg does not accept keyword args.
def start_subjob(self, subjob):
    """
    :type subjob: Subjob
    """
    if not self.is_alive():
        raise DeadSlaveError('Tried to start a subjob on a dead slave! ({}, id: {})'.format(self.url, self.id))
    if self._is_in_shutdown_mode:
        raise SlaveMarkedForShutdownError('Tried to start a subjob on a slave in shutdown mode. ({}, id: {})'
                                          .format(self.url, self.id))

    SafeThread(target=self._async_start_subjob, args=(subjob,)).start()
def test_calling_kill_subprocesses_will_break_out_of_command_execution_wait_loop(self):
    def fake_communicate(timeout=None):
        # The fake implementation is that communicate() times out forever until os.killpg is called.
        if mock_killpg.call_count == 0 and timeout is not None:
            raise TimeoutExpired(None, timeout)
        elif mock_killpg.call_count > 0:
            return b'fake output', b'fake error'
        self.fail('Popen.communicate() should not be called without a timeout before os.killpg has been called.')

    mock_killpg = self.patch('os.killpg')
    self.mock_popen.communicate.side_effect = fake_communicate
    self.mock_popen.returncode = 1
    self.mock_popen.pid = 55555

    project_type = ProjectType()
    command_thread = SafeThread(target=project_type.execute_command_in_project, args=('echo The power is yours!',))

    # This calls execute_command_in_project() on one thread, and calls kill_subprocesses() on another. The
    # kill_subprocesses() call should cause the first thread to exit.
    command_thread.start()
    project_type.kill_subprocesses()

    # This *should* join immediately, but we specify a timeout just in case something goes wrong so that the test
    # doesn't hang. A successful join implies success. We also use the UnhandledExceptionHandler so that exceptions
    # propagate from the child thread to the test thread and fail the test.
    with UnhandledExceptionHandler.singleton():
        command_thread.join(timeout=10)
        if command_thread.is_alive():
            mock_killpg()  # Calling killpg() causes the command thread to end.
            self.fail('project_type.kill_subprocesses should cause the command execution wait loop to exit.')

    mock_killpg.assert_called_once_with(pgid=55555, sig=ANY)
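# --- Illustrative sketch (not part of the original source) ---
# The test above fakes Popen.communicate() timing out until os.killpg is called. The production
# wait loop it exercises presumably looks something like the sketch below: poll communicate() with
# a short timeout so a kill request can interrupt the wait by signalling the process group. This
# is a rough reconstruction from the test, not the project's actual implementation, and it assumes
# the command was started in its own process group (e.g. via start_new_session=True).
import os
import signal
import subprocess


def wait_for_command_or_kill(popen, kill_requested_event, poll_timeout=1):
    while True:
        try:
            return popen.communicate(timeout=poll_timeout)
        except subprocess.TimeoutExpired:
            if kill_requested_event.is_set():
                # Signal the whole process group so child processes die too, then loop again so the
                # final communicate() call collects output and the process exit status.
                os.killpg(popen.pid, signal.SIGTERM)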
def test_executing_build_teardown_multiple_times_will_raise_exception(self):
    self.mock_network.post().status_code = http.client.OK
    slave = self._create_cluster_slave()
    project_type_mock = self.patch('app.slave.cluster_slave.util.create_project_type').return_value
    # This test uses setup_complete_event to detect when the async fetch_project() has executed.
    setup_complete_event = Event()
    project_type_mock.fetch_project.side_effect = self.no_args_side_effect(setup_complete_event.set)
    # This test uses teardown_event to cause a thread to block on the teardown_build() call.
    teardown_event = Event()
    project_type_mock.teardown_build.side_effect = self.no_args_side_effect(teardown_event.wait)

    slave.connect_to_master(self._FAKE_MASTER_URL)
    slave.setup_build(build_id=123, project_type_params={'type': 'Fake'})
    self.assertTrue(setup_complete_event.wait(timeout=5), 'Build setup should complete very quickly.')

    # Start the first thread that does build teardown. This thread will block on teardown_build().
    first_thread = SafeThread(target=slave._do_build_teardown_and_reset)
    first_thread.start()
    # Call build teardown() again and it should raise an exception.
    with self.assertRaises(BuildTeardownError):
        slave._do_build_teardown_and_reset()

    # Cleanup: Unblock the first thread and let it finish. We use the unhandled exception handler just in case any
    # exceptions occurred on the thread (so that they'd be passed back to the main thread and fail the test).
    teardown_event.set()
    with UnhandledExceptionHandler.singleton():
        first_thread.join()
def run(self, *args, **kwargs):
    app_thread = SafeThread(
        name=self._THREAD_NAME,
        target=self.async_run,
        args=args,
        kwargs=kwargs,
    )
    app_thread.start()
    app_thread.join()
def mark_subjob_complete(self, subjob_id):
    """
    :type subjob_id: int
    """
    subjob = self._all_subjobs_by_id[int(subjob_id)]
    with self._build_completion_lock:
        self._finished_subjobs.put(subjob, block=False)
        subjobs_are_finished = self._subjobs_are_finished

    # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
    if subjobs_are_finished:
        self._logger.info("All results received for build {}!", self._build_id)
        SafeThread(target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id)).start()
def start_subjob(self, subjob):
    """
    :type subjob: Subjob
    """
    if not self.is_alive():
        raise DeadSlaveError('Tried to start a subjob on a dead slave! ({}, id: {})'.format(self.url, self.id))
    if self._is_in_shutdown_mode:
        raise SlaveMarkedForShutdownError('Tried to start a subjob on a slave in shutdown mode. ({}, id: {})'
                                          .format(self.url, self.id))

    # todo: This should not be a SafeThread. https://github.com/box/ClusterRunner/issues/337
    SafeThread(target=self._async_start_subjob, args=(subjob,)).start()
def teardown_build(self, build_id=None):
    """
    Called at the end of each build on each slave before it reports back to the master that it is idle again.

    :param build_id: The build id to teardown -- this parameter is used solely for correctness checking of the
        master, to make sure that the master is not erroneously sending teardown commands for other builds.
    :type build_id: int | None
    """
    if self._current_build_id is None:
        raise BadRequestError('Tried to teardown a build but no build is active on this slave.')

    if build_id is not None and build_id != self._current_build_id:
        raise BadRequestError('Tried to teardown build {}, '
                              'but slave is running build {}!'.format(build_id, self._current_build_id))
    SafeThread(
        target=self._async_teardown_build,
        name='Bld{}-Teardwn'.format(build_id)
    ).start()
def _build_preparation_loop(self):
    """
    Grabs a build off the request_queue (populated by self.handle_build_request()), prepares it, and puts that
    build onto the self.builds_waiting_for_slaves queue.
    """
    while True:
        build = self._request_queue.get()
        project_id = build.project_type.project_id()

        if project_id not in self._project_preparation_locks:
            self._logger.info('Creating project lock [{}] for build {}', project_id, str(build.build_id()))
            self._project_preparation_locks[project_id] = Lock()

        project_lock = self._project_preparation_locks[project_id]
        SafeThread(target=self._prepare_build_async,
                   name='Bld{}-PreparationThread'.format(build.build_id()),
                   args=(build, project_lock)).start()
def test_exception_on_safe_thread_calls_teardown_callbacks(self):
    my_awesome_teardown_callback = MagicMock()
    unhandled_exception_handler = UnhandledExceptionHandler.singleton()
    unhandled_exception_handler.add_teardown_callback(my_awesome_teardown_callback, 'fake arg', fake_kwarg='boop')

    def my_terrible_method():
        raise Exception('Sic semper tyrannis!')

    thread = SafeThread(target=my_terrible_method)
    thread.start()
    thread.join()

    my_awesome_teardown_callback.assert_called_once_with('fake arg', fake_kwarg='boop')
def test_normal_execution_on_safe_thread_does_not_call_teardown_callbacks(self):
    my_lonely_teardown_callback = MagicMock()
    unhandled_exception_handler = UnhandledExceptionHandler.singleton()
    unhandled_exception_handler.add_teardown_callback(my_lonely_teardown_callback)

    def my_fantastic_method():
        print('Veritas vos liberabit!')

    thread = SafeThread(target=my_fantastic_method)
    thread.start()
    thread.join()

    self.assertFalse(my_lonely_teardown_callback.called,
                     'The teardown callback should not be called unless an exception is raised.')
def test_calling_kill_subprocesses_will_break_out_of_command_execution_wait_loop(self):
    self._mock_out_popen_communicate()

    project_type = ProjectType()
    command_thread = SafeThread(target=project_type.execute_command_in_project, args=('echo The power is yours!',))

    # This calls execute_command_in_project() on one thread, and calls kill_subprocesses() on another. The
    # kill_subprocesses() call should cause the first thread to exit.
    command_thread.start()
    project_type.kill_subprocesses()

    # This *should* join immediately, but we specify a timeout just in case something goes wrong so that the test
    # doesn't hang. A successful join implies success. We also use the UnhandledExceptionHandler so that exceptions
    # propagate from the child thread to the test thread and fail the test.
    with UnhandledExceptionHandler.singleton():
        command_thread.join(timeout=10)
        if command_thread.is_alive():
            self.mock_killpg()  # Calling killpg() causes the command thread to end.
            self.fail('project_type.kill_subprocesses should cause the command execution wait loop to exit.')

    self.mock_killpg.assert_called_once_with(55555, ANY)  # Note: os.killpg does not accept keyword args.
class BuildRequestHandler(object):
    """
    The BuildRequestHandler class is responsible for preparing a non-prepared build.

    Implementation notes:

    This class manages two critical Queues in ClusterRunner: request_queue and builds_waiting_for_slaves.

    The request_queue is the queue of non-prepared Build instances that the BuildRequestHandler has yet to
    prepare. This queue is populated by the ClusterMaster instance.

    The builds_waiting_for_slaves queue is the queue of prepared Build instances that the BuildRequestHandler
    has completed build preparation for, and is waiting for the SlaveAllocator (a separate entity) to pull
    Builds from.

    All of the input of builds come through self.handle_build_request() calls, and all of the output of builds
    go through self.next_prepared_build_scheduler() calls.
    """
    def __init__(self, scheduler_pool):
        """
        :type scheduler_pool: BuildSchedulerPool
        """
        self._logger = get_logger(__name__)
        self._scheduler_pool = scheduler_pool
        self._builds_waiting_for_slaves = Queue()
        self._request_queue = Queue()
        self._request_queue_worker_thread = SafeThread(
            target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
        self._project_preparation_locks = {}
        self._subjob_calculator = SubjobCalculator()

    def start(self):
        """
        Start the infinite loop that will accept unprepared builds and put them through build preparation.
        """
        if self._request_queue_worker_thread.is_alive():
            raise RuntimeError('Error: build request handler loop was asked to start when it is already running.')
        self._request_queue_worker_thread.start()

    def handle_build_request(self, build):
        """
        :param build: the requested build
        :type build: Build
        """
        self._request_queue.put(build)
        analytics.record_event(analytics.BUILD_REQUEST_QUEUED, build_id=build.build_id(),
                               log_msg='Queued request for build {build_id}.')

    def next_prepared_build_scheduler(self):
        """
        Get the scheduler for the next build that has successfully completed build preparation.

        This is a blocking call--if there are no more builds that have completed build preparation and this
        method gets invoked, the execution will hang until the next build has completed build preparation.

        :rtype: BuildScheduler
        """
        build = self._builds_waiting_for_slaves.get()
        build_scheduler = self._scheduler_pool.get(build)
        return build_scheduler

    def _build_preparation_loop(self):
        """
        Grabs a build off the request_queue (populated by self.handle_build_request()), prepares it, and puts
        that build onto the self.builds_waiting_for_slaves queue.
        """
        while True:
            build = self._request_queue.get()
            project_id = build.project_type.project_id()

            if project_id not in self._project_preparation_locks:
                self._logger.info('Creating project lock [{}] for build {}', project_id, str(build.build_id()))
                self._project_preparation_locks[project_id] = Lock()

            project_lock = self._project_preparation_locks[project_id]
            SafeThread(
                target=self._prepare_build_async,
                name='Bld{}-PreparationThread'.format(build.build_id()),
                args=(build, project_lock)
            ).start()

    def _prepare_build_async(self, build, project_lock):
        """
        :type build: Build
        :type project_lock: Lock
        """
        self._logger.info('Build {} is waiting for the project lock', build.build_id())

        with project_lock:
            self._logger.info('Build {} has acquired project lock', build.build_id())
            analytics.record_event(analytics.BUILD_PREPARE_START, build_id=build.build_id(),
                                   log_msg='Build preparation loop is handling request for build {build_id}.')
            try:
                build.prepare(self._subjob_calculator)
                if not build.has_error:
                    analytics.record_event(analytics.BUILD_PREPARE_FINISH, build_id=build.build_id(),
                                           is_success=True,
                                           log_msg='Build {build_id} successfully prepared.')
                    # If the atomizer found no work to do, perform build cleanup and skip the slave allocation.
                    if len(build.all_subjobs()) == 0:
                        self._logger.info('Build {} has no work to perform and is exiting.', build.build_id())
                        build.finish()
                    # If there is work to be done, this build must queue to be allocated slaves.
                    else:
                        self._logger.info('Build {} is waiting for slaves.', build.build_id())
                        self._builds_waiting_for_slaves.put(build)

            except Exception as ex:  # pylint: disable=broad-except
                build.mark_failed(str(ex))  # WIP(joey): Build should do this internally.
                self._logger.exception('Could not handle build request for build {}.'.format(build.build_id()))
                analytics.record_event(analytics.BUILD_PREPARE_FINISH, build_id=build.build_id(), is_success=False)
class BuildRequestHandler(object):
    """
    The BuildRequestHandler class is responsible for preparing a non-prepared build.

    Implementation notes:

    This class manages two critical Queues in ClusterRunner: request_queue and builds_waiting_for_slaves.

    The request_queue is the queue of non-prepared Build instances that the BuildRequestHandler has yet to
    prepare. This queue is populated by the ClusterMaster instance.

    The builds_waiting_for_slaves queue is the queue of prepared Build instances that the BuildRequestHandler
    has completed build preparation for, and is waiting for the SlaveAllocator (a separate entity) to pull
    Builds from.

    All of the input of builds come through self.handle_build_request() calls, and all of the output of builds
    go through self.next_prepared_build() calls.
    """
    def __init__(self):
        self._logger = get_logger(__name__)
        self._builds_waiting_for_slaves = Queue()
        self._request_queue = Queue()
        self._request_queue_worker_thread = SafeThread(
            target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
        self._project_preparation_locks = {}

    def start(self):
        """
        Start the infinite loop that will accept unprepared builds and put them through build preparation.
        """
        if self._request_queue_worker_thread.is_alive():
            raise RuntimeError('Error: build request handler loop was asked to start when it is already running.')
        self._request_queue_worker_thread.start()

    def handle_build_request(self, build):
        """
        :param build: the requested build
        :type build: Build
        """
        self._request_queue.put(build)
        analytics.record_event(analytics.BUILD_REQUEST_QUEUED, build_id=build.build_id(),
                               log_msg='Queued request for build {build_id}.')

    def next_prepared_build(self):
        """
        Get the next build that has successfully completed build preparation.

        This is a blocking call--if there are no more builds that have completed build preparation and this
        method gets invoked, the execution will hang until the next build has completed build preparation.

        :rtype: Build
        """
        return self._builds_waiting_for_slaves.get()

    def _build_preparation_loop(self):
        """
        Grabs a build off the request_queue (populated by self.handle_build_request()), prepares it, and puts
        that build onto the self.builds_waiting_for_slaves queue.
        """
        while True:
            build = self._request_queue.get()
            project_id = build.project_type.project_id()

            if project_id not in self._project_preparation_locks:
                self._logger.info('Creating project lock [{}] for build {}', project_id, str(build.build_id()))
                self._project_preparation_locks[project_id] = Lock()

            project_lock = self._project_preparation_locks[project_id]
            SafeThread(target=self._prepare_build_async,
                       name='Bld{}-PreparationThread'.format(build.build_id()),
                       args=(build, project_lock)).start()

    def _prepare_build_async(self, build, project_lock):
        """
        :type build: Build
        :type project_lock: Lock
        """
        self._logger.info('Build {} is waiting for the project lock', build.build_id())

        with project_lock:
            self._logger.info('Build {} has acquired project lock', build.build_id())
            analytics.record_event(analytics.BUILD_PREPARE_START, build_id=build.build_id(),
                                   log_msg='Build preparation loop is handling request for build {build_id}.')
            try:
                self._prepare_build(build)
                if not build.has_error:
                    analytics.record_event(analytics.BUILD_PREPARE_FINISH, build_id=build.build_id(),
                                           log_msg='Build {build_id} successfully prepared and waiting for slaves.')
                    self._builds_waiting_for_slaves.put(build)

            except Exception as ex:  # pylint: disable=broad-except
                build.mark_failed(str(ex))
                self._logger.exception('Could not handle build request for build {}.'.format(build.build_id()))

    def _prepare_build(self, build):
        """
        Prepare a Build to be distributed across slaves.

        :param build: the Build instance to be prepared to be distributed across slaves
        :type build: Build
        """
        build_id = build.build_id()
        build_request = build.build_request

        if not isinstance(build_request, BuildRequest):
            raise RuntimeError('Build {} has no associated request object.'.format(build_id))

        project_type = build.project_type
        if not isinstance(project_type, ProjectType):
            raise RuntimeError('Build {} has no project set.'.format(build_id))

        self._logger.info('Fetching project for build {}.', build_id)
        project_type.fetch_project()
        self._logger.info('Successfully fetched project for build {}.', build_id)

        job_config = project_type.job_config()
        if job_config is None:
            build.mark_failed('Build failed while trying to parse cluster_runner.yaml.')
            return

        subjobs = self._compute_subjobs_for_build(build_id, job_config, project_type)
        build.prepare(subjobs, job_config)

    def _compute_subjobs_for_build(self, build_id, job_config, project_type):
        """
        :type build_id: int
        :type job_config: JobConfig
        :param project_type: the docker, directory, or git repo project_type that this build is running in
        :type project_type: project_type.project_type.ProjectType
        :rtype: list[Subjob]
        """
        atoms_list = job_config.atomizer.atomize_in_project(project_type)

        # Group the atoms together using some grouping strategy
        timing_file_path = project_type.timing_file_path(job_config.name)
        grouped_atoms = self._grouped_atoms(
            atoms_list,
            job_config.max_executors,
            timing_file_path,
            project_type.project_directory
        )

        # Generate subjobs for each group of atoms
        subjobs = []
        for subjob_id in range(len(grouped_atoms)):
            atoms = grouped_atoms[subjob_id]
            subjobs.append(Subjob(build_id, subjob_id, project_type, job_config, atoms))
        return subjobs

    def _grouped_atoms(self, atoms, max_executors, timing_file_path, project_directory):
        """
        Return atoms that are grouped for optimal CI performance. If a timing file exists, then use the
        TimeBasedAtomGrouper. If not, use the default AtomGrouper (groups each atom into its own subjob).

        :param atoms: all of the atoms to be run this time
        :type atoms: list[app.master.atom.Atom]
        :param max_executors: the maximum number of executors for this build
        :type max_executors: int
        :param timing_file_path: path to where the timing data file would be stored (if it exists) for this job
        :type timing_file_path: str
        :type project_directory: str
        :return: the grouped atoms (in the form of list of lists of strings)
        :rtype: list[list[app.master.atom.Atom]]
        """
        atom_time_map = None

        if os.path.isfile(timing_file_path):
            with open(timing_file_path, 'r') as json_file:
                try:
                    atom_time_map = json.load(json_file)
                except ValueError:
                    self._logger.warning('Failed to load timing data from file that exists {}', timing_file_path)

        if atom_time_map is not None and len(atom_time_map) > 0:
            atom_grouper = TimeBasedAtomGrouper(atoms, max_executors, atom_time_map, project_directory)
        else:
            atom_grouper = AtomGrouper(atoms, max_executors)

        return atom_grouper.groupings()
def start_heartbeat_thread(self):
    self._logger.info('Heartbeat will run every {} seconds'.format(self._heartbeat_interval))
    SafeThread(target=self._start_heartbeat, name='HeartbeatThread', daemon=True).start()
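# --- Illustrative sketch (not part of the original source) ---
# start_heartbeat_thread() above launches a daemon 'HeartbeatThread'. The _start_heartbeat target is
# not shown in this section; a typical periodic loop of that shape is sketched here, with the
# reporting callable left abstract because the real heartbeat payload and endpoint are not known.
import time
import threading


class HeartbeatSketch(object):
    def __init__(self, send_heartbeat, interval_seconds):
        self._send_heartbeat = send_heartbeat       # callable that reports liveness (assumed interface)
        self._interval = interval_seconds

    def start(self):
        threading.Thread(target=self._run, name='HeartbeatThread', daemon=True).start()

    def _run(self):
        while True:
            self._send_heartbeat()
            time.sleep(self._interval)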
class SlaveAllocator(object):
    """
    The SlaveAllocator class is responsible for allocating slaves to prepared builds.
    """

    def __init__(self, build_request_handler):
        """
        :type build_request_handler: BuildRequestHandler
        """
        self._logger = get_logger(__name__)
        self._build_request_handler = build_request_handler
        self._idle_slaves = OrderedSetQueue()
        self._allocation_thread = SafeThread(
            target=self._slave_allocation_loop, name='SlaveAllocationLoop', daemon=True)

    def start(self):
        """
        Start the infinite loop that will pull off prepared builds from a synchronized queue and allocate them
        slaves.
        """
        if self._allocation_thread.is_alive():
            raise RuntimeError('Error: slave allocation loop was asked to start when it is already running.')
        self._allocation_thread.start()

    def _slave_allocation_loop(self):
        """
        Builds wait in line for more slaves. This method executes in the background on another thread and
        watches for idle slaves, then gives them out to the waiting builds.
        """
        while True:
            # This is a blocking call that will block until there is a prepared build.
            build_waiting_for_slave = self._build_request_handler.next_prepared_build()

            while build_waiting_for_slave.needs_more_slaves():
                claimed_slave = self._idle_slaves.get()

                # Remove dead and shutdown slaves from the idle queue
                if claimed_slave.is_shutdown() or not claimed_slave.is_alive(use_cached=False):
                    continue

                # The build may have completed while we were waiting for an idle slave, so check one more time.
                if build_waiting_for_slave.needs_more_slaves():
                    # Potential race condition here! If the build completes after the if statement is checked,
                    # a slave will be allocated needlessly (and run slave.setup(), which can be significant work).
                    self._logger.info('Allocating slave {} to build {}.',
                                      claimed_slave.url, build_waiting_for_slave.build_id())
                    build_waiting_for_slave.allocate_slave(claimed_slave)
                else:
                    self.add_idle_slave(claimed_slave)

            self._logger.info('Done allocating slaves for build {}.', build_waiting_for_slave.build_id())

    def add_idle_slave(self, slave):
        """
        Add a slave to the idle queue.
        :type slave: Slave
        """
        try:
            slave.mark_as_idle()
            self._idle_slaves.put(slave)
        except SlaveMarkedForShutdownError:
            pass
class BuildRequestHandler(object):
    """
    The BuildRequestHandler class is responsible for preparing a non-prepared build.

    Implementation notes:

    This class manages two critical Queues in ClusterRunner: request_queue and builds_waiting_for_slaves.

    The request_queue is the queue of non-prepared Build instances that the BuildRequestHandler has yet to
    prepare. This queue is populated by the ClusterMaster instance.

    The builds_waiting_for_slaves queue is the queue of prepared Build instances that the BuildRequestHandler
    has completed build preparation for, and is waiting for the SlaveAllocator (a separate entity) to pull
    Builds from.

    All of the input of builds come through self.handle_build_request() calls, and all of the output of builds
    go through self.next_prepared_build() calls.
    """
    def __init__(self):
        self._logger = get_logger(__name__)
        self._builds_waiting_for_slaves = Queue()
        self._request_queue = Queue()
        self._request_queue_worker_thread = SafeThread(
            target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
        self._project_preparation_locks = {}

    def start(self):
        """
        Start the infinite loop that will accept unprepared builds and put them through build preparation.
        """
        if self._request_queue_worker_thread.is_alive():
            raise RuntimeError('Error: build request handler loop was asked to start when it is already running.')
        self._request_queue_worker_thread.start()

    def handle_build_request(self, build):
        """
        :param build: the requested build
        :type build: Build
        """
        self._request_queue.put(build)
        analytics.record_event(analytics.BUILD_REQUEST_QUEUED, build_id=build.build_id(),
                               log_msg='Queued request for build {build_id}.')

    def next_prepared_build(self):
        """
        Get the next build that has successfully completed build preparation.

        This is a blocking call--if there are no more builds that have completed build preparation and this
        method gets invoked, the execution will hang until the next build has completed build preparation.

        :rtype: Build
        """
        return self._builds_waiting_for_slaves.get()

    def _build_preparation_loop(self):
        """
        Grabs a build off the request_queue (populated by self.handle_build_request()), prepares it, and puts
        that build onto the self.builds_waiting_for_slaves queue.
        """
        while True:
            build = self._request_queue.get()
            project_id = build.project_type.project_id()

            if project_id not in self._project_preparation_locks:
                self._logger.info('Creating project lock [{}] for build {}', project_id, str(build.build_id()))
                self._project_preparation_locks[project_id] = Lock()

            project_lock = self._project_preparation_locks[project_id]
            SafeThread(
                target=self._prepare_build_async,
                name='Bld{}-PreparationThread'.format(build.build_id()),
                args=(build, project_lock)
            ).start()

    def _prepare_build_async(self, build, project_lock):
        """
        :type build: Build
        :type project_lock: Lock
        """
        self._logger.info('Build {} is waiting for the project lock', build.build_id())

        with project_lock:
            self._logger.info('Build {} has acquired project lock', build.build_id())
            analytics.record_event(analytics.BUILD_PREPARE_START, build_id=build.build_id(),
                                   log_msg='Build preparation loop is handling request for build {build_id}.')
            try:
                self._prepare_build(build)
                if not build.has_error:
                    analytics.record_event(analytics.BUILD_PREPARE_FINISH, build_id=build.build_id(),
                                           log_msg='Build {build_id} successfully prepared and waiting for slaves.')
                    self._builds_waiting_for_slaves.put(build)

            except Exception as ex:  # pylint: disable=broad-except
                build.mark_failed(str(ex))
                self._logger.exception('Could not handle build request for build {}.'.format(build.build_id()))

    def _prepare_build(self, build):
        """
        Prepare a Build to be distributed across slaves.

        :param build: the Build instance to be prepared to be distributed across slaves
        :type build: Build
        """
        build_id = build.build_id()
        build_request = build.build_request

        if not isinstance(build_request, BuildRequest):
            raise RuntimeError('Build {} has no associated request object.'.format(build_id))

        project_type = build.project_type
        if not isinstance(project_type, ProjectType):
            raise RuntimeError('Build {} has no project set.'.format(build_id))

        self._logger.info('Fetching project for build {}.', build_id)
        project_type.fetch_project()
        self._logger.info('Successfully fetched project for build {}.', build_id)

        job_config = project_type.job_config()
        if job_config is None:
            build.mark_failed('Build failed while trying to parse cluster_runner.yaml.')
            return

        subjobs = self._compute_subjobs_for_build(build_id, job_config, project_type)
        build.prepare(subjobs, job_config)

    def _compute_subjobs_for_build(self, build_id, job_config, project_type):
        """
        :type build_id: int
        :type job_config: JobConfig
        :param project_type: the directory, or git repo project_type that this build is running in
        :type project_type: project_type.project_type.ProjectType
        :rtype: list[Subjob]
        """
        atoms_list = job_config.atomizer.atomize_in_project(project_type)

        # Group the atoms together using some grouping strategy
        timing_file_path = project_type.timing_file_path(job_config.name)
        grouped_atoms = self._grouped_atoms(
            atoms_list,
            job_config.max_executors,
            timing_file_path,
            project_type.project_directory
        )

        # Generate subjobs for each group of atoms
        subjobs = []
        for subjob_id in range(len(grouped_atoms)):
            atoms = grouped_atoms[subjob_id]
            subjobs.append(Subjob(build_id, subjob_id, project_type, job_config, atoms))
        return subjobs

    def _grouped_atoms(self, atoms, max_executors, timing_file_path, project_directory):
        """
        Return atoms that are grouped for optimal CI performance. If a timing file exists, then use the
        TimeBasedAtomGrouper. If not, use the default AtomGrouper (groups each atom into its own subjob).

        :param atoms: all of the atoms to be run this time
        :type atoms: list[app.master.atom.Atom]
        :param max_executors: the maximum number of executors for this build
        :type max_executors: int
        :param timing_file_path: path to where the timing data file would be stored (if it exists) for this job
        :type timing_file_path: str
        :type project_directory: str
        :return: the grouped atoms (in the form of list of lists of strings)
        :rtype: list[list[app.master.atom.Atom]]
        """
        atom_time_map = None

        if os.path.isfile(timing_file_path):
            with open(timing_file_path, 'r') as json_file:
                try:
                    atom_time_map = json.load(json_file)
                except ValueError:
                    self._logger.warning('Failed to load timing data from file that exists {}', timing_file_path)

        if atom_time_map is not None and len(atom_time_map) > 0:
            atom_grouper = TimeBasedAtomGrouper(atoms, max_executors, atom_time_map, project_directory)
        else:
            atom_grouper = AtomGrouper(atoms, max_executors)

        return atom_grouper.groupings()
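# --- Illustrative sketch (not part of the original source) ---
# _grouped_atoms() above falls back to the default AtomGrouper, described as grouping "each atom
# into its own subjob" when no timing data exists. A minimal grouper consistent with that
# description is sketched below; the project's real AtomGrouper may handle max_executors differently.
class AtomGrouperSketch(object):
    def __init__(self, atoms, max_executors):
        self._atoms = atoms
        self._max_executors = max_executors  # kept for signature parity; unused in this sketch

    def groupings(self):
        # One single-atom group per atom, preserving input order.
        return [[atom] for atom in self._atoms]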
class BuildRequestHandler(object):
    """
    The BuildRequestHandler class is responsible for preparing a non-prepared build.

    Implementation notes:

    This class manages two critical Queues in ClusterRunner: request_queue and builds_waiting_for_slaves.

    The request_queue is the queue of non-prepared Build instances that the BuildRequestHandler has yet to
    prepare. This queue is populated by the ClusterMaster instance.

    The builds_waiting_for_slaves queue is the queue of prepared Build instances that the BuildRequestHandler
    has completed build preparation for, and is waiting for the SlaveAllocator (a separate entity) to pull
    Builds from.

    All of the input of builds come through self.handle_build_request() calls, and all of the output of builds
    go through self._scheduler_pool.next_prepared_build_scheduler() calls.
    """
    def __init__(self, scheduler_pool):
        """
        :type scheduler_pool: app.master.build_scheduler_pool.BuildSchedulerPool
        """
        self._logger = get_logger(__name__)
        self._scheduler_pool = scheduler_pool
        self._request_queue = Queue()
        self._request_queue_worker_thread = SafeThread(
            target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
        self._project_preparation_locks = {}

    def start(self):
        """
        Start the infinite loop that will accept unprepared builds and put them through build preparation.
        """
        if self._request_queue_worker_thread.is_alive():
            raise RuntimeError('Error: build request handler loop was asked to start when it is already running.')
        self._request_queue_worker_thread.start()

    def handle_build_request(self, build):
        """
        :param build: the requested build
        :type build: Build
        """
        self._request_queue.put(build)
        analytics.record_event(analytics.BUILD_REQUEST_QUEUED, build_id=build.build_id(),
                               log_msg='Queued request for build {build_id}.')

    def _build_preparation_loop(self):
        """
        Grabs a build off the request_queue (populated by self.handle_build_request()), prepares it, and puts
        that build onto the self.builds_waiting_for_slaves queue.
        """
        while True:
            build = self._request_queue.get()
            project_id = build.project_type.project_id()

            if project_id not in self._project_preparation_locks:
                self._logger.info('Creating project lock [{}] for build {}', project_id, str(build.build_id()))
                self._project_preparation_locks[project_id] = Lock()

            project_lock = self._project_preparation_locks[project_id]
            SafeThread(target=self._prepare_build_async,
                       name='Bld{}-PreparationThread'.format(build.build_id()),
                       args=(build, project_lock)).start()

    def _prepare_build_async(self, build, project_lock):
        """
        :type build: app.master.build.Build
        :type project_lock: Lock
        """
        self._logger.info('Build {} is waiting for the project lock', build.build_id())

        with project_lock:
            self._logger.info('Build {} has acquired project lock', build.build_id())
            analytics.record_event(analytics.BUILD_PREPARE_START, build_id=build.build_id(),
                                   log_msg='Build preparation loop is handling request for build {build_id}.')
            try:
                build.prepare()
                if not build.is_stopped:
                    analytics.record_event(analytics.BUILD_PREPARE_FINISH, build_id=build.build_id(),
                                           is_success=True,
                                           log_msg='Build {build_id} successfully prepared.')
                    # If the atomizer found no work to do, perform build cleanup and skip the slave allocation.
                    if len(build.get_subjobs()) == 0:
                        self._logger.info('Build {} has no work to perform and is exiting.', build.build_id())
                        build.finish()
                    # If there is work to be done, this build must queue to be allocated slaves.
                    else:
                        self._logger.info('Build {} is waiting for slaves.', build.build_id())
                        self._scheduler_pool.add_build_waiting_for_slaves(build)

            except Exception as ex:  # pylint: disable=broad-except
                if not build.is_canceled:
                    build.mark_failed(str(ex))  # WIP(joey): Build should do this internally.
                    self._logger.exception('Could not handle build request for build {}.'.format(build.build_id()))
                    analytics.record_event(analytics.BUILD_PREPARE_FINISH, build_id=build.build_id(),
                                           is_success=False)
class ClusterMaster(object):
    """
    The ClusterRunner Master service: This is the main application class that the web framework/REST API sits on
    top of.
    """

    API_VERSION = 'v1'

    def __init__(self):
        self._logger = get_logger(__name__)

        self._all_slaves_by_url = {}
        self._all_builds_by_id = OrderedDict()  # This is an OrderedDict so we can more easily implement get_queue()
        self._builds_waiting_for_slaves = Queue()

        self._request_queue = Queue()
        self._request_handler = SerialRequestHandler()

        self._request_queue_worker_thread = SafeThread(
            target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
        self._request_queue_worker_thread.start()

        self._slave_allocation_worker_thread = SafeThread(
            target=self._slave_allocation_loop, name='SlaveAllocationLoop', daemon=True)
        self._slave_allocation_worker_thread.start()

        self._master_results_path = Configuration['results_directory']

        # It's important that idle slaves are only in the queue once so we use OrderedSet
        self._idle_slaves = OrderedSetQueue()

        # Asynchronously delete (but immediately rename) all old builds when master starts.
        # Remove this if/when build numbers are unique across master starts/stops
        if os.path.exists(self._master_results_path):
            fs.async_delete(self._master_results_path)
        fs.create_dir(self._master_results_path)

    def _get_status(self):
        """
        Just returns a dumb message and prints it to the console.

        :rtype: str
        """
        return 'Master service is up.'

    def api_representation(self):
        """
        Gets a dict representing this resource which can be returned in an API response.

        :rtype: dict [str, mixed]
        """
        slaves_representation = [slave.api_representation() for slave in self.all_slaves_by_id().values()]
        return {
            'status': self._get_status(),
            'slaves': slaves_representation,
        }

    def builds(self):
        """
        Returns a list of all builds
        :rtype: list[Build]
        """
        return self._all_builds_by_id.values()

    def active_builds(self):
        """
        Returns a list of incomplete builds
        :rtype: list[Build]
        """
        return [build for build in self.builds() if not build.is_finished]

    def _mark_build_finished_if_slaves_are_done(self, build_id):
        """
        Run when a slave is finished with a build. If this is the last slave for that build, mark the build
        finished. Even when the subjobs are complete, a slave is not finished with the build until teardown is
        complete.
        :type build_id: int
        """
        for slave in self._all_slaves_by_url.values():
            if slave.current_build_id == build_id:
                return
        self.get_build(build_id).finish()

    def all_slaves_by_id(self):
        """
        Retrieve all connected slaves
        :rtype: dict [int, Slave]
        """
        slaves_by_slave_id = {}
        for slave in self._all_slaves_by_url.values():
            slaves_by_slave_id[slave.id] = slave
        return slaves_by_slave_id

    def get_slave(self, slave_id=None, slave_url=None):
        """
        Get the instance of given slave by either the slave's id or url. Only one of slave_id or slave_url should
        be specified.

        :param slave_id: The id of the slave to return
        :type slave_id: int
        :param slave_url: The url of the slave to return
        :type slave_url: str
        :return: The instance of the slave
        :rtype: Slave
        """
        if (slave_id is None) == (slave_url is None):
            raise ValueError('Only one of slave_id or slave_url should be specified to get_slave().')

        if slave_id is not None:
            for slave in self._all_slaves_by_url.values():
                if slave.id == slave_id:
                    return slave
        else:
            if slave_url in self._all_slaves_by_url:
                return self._all_slaves_by_url[slave_url]

        raise ItemNotFoundError('Requested slave ({}) does not exist.'.format(slave_id))

    def connect_new_slave(self, slave_url, num_executors):
        """
        Add a new slave to this master.

        :type slave_url: str
        :type num_executors: int
        :return: The slave id of the new slave
        :rtype: int
        """
        slave = Slave(slave_url, num_executors)
        self._all_slaves_by_url[slave_url] = slave
        self._add_idle_slave(slave)

        self._logger.info('Slave on {} connected to master with {} executors. (id: {})',
                          slave_url, num_executors, slave.id)
        return {'slave_id': str(slave.id)}

    def handle_slave_state_update(self, slave, new_slave_state):
        """
        Execute logic to transition the specified slave to the given state.

        :type slave: Slave
        :type new_slave_state: SlaveState
        """
        slave_transition_functions = {
            SlaveState.DISCONNECTED: self._disconnect_slave,
            SlaveState.IDLE: self._add_idle_slave,
            SlaveState.SETUP_COMPLETED: self._handle_setup_success_on_slave,
            SlaveState.SETUP_FAILED: self._handle_setup_failure_on_slave,
        }

        if new_slave_state not in slave_transition_functions:
            raise BadRequestError('Invalid slave state "{}". Valid states are: {}.'
                                  .format(new_slave_state, ', '.join(slave_transition_functions.keys())))

        do_transition = slave_transition_functions.get(new_slave_state)
        do_transition(slave)

    def _disconnect_slave(self, slave):
        """
        Mark a slave dead.

        :type slave: Slave
        """
        # Mark slave dead. We do not remove it from the list of all slaves. We also do not remove it from
        # idle_slaves; that will happen during slave allocation.
        slave.set_is_alive(False)
        # todo: Fail any currently executing subjobs still executing on this slave.

        self._logger.info('Slave on {} was disconnected. (id: {})', slave.url, slave.id)

    def _add_idle_slave(self, slave):
        """
        Add a slave to the idle queue.

        :type slave: Slave
        """
        build_id = slave.current_build_id
        slave.mark_as_idle()

        if build_id is not None:
            self._mark_build_finished_if_slaves_are_done(build_id)

        self._idle_slaves.put(slave)

    def _handle_setup_success_on_slave(self, slave):
        """
        Respond to successful build setup on a slave. This starts subjob executions on the slave. This should be
        called once after the specified slave has already run build_setup commands for the specified build.

        :type slave: Slave
        """
        build = self.get_build(slave.current_build_id)
        build.begin_subjob_executions_on_slave(slave)

    def _handle_setup_failure_on_slave(self, slave):
        """
        Respond to failed build setup on a slave. This should put the slave back into a usable state.

        :type slave: Slave
        """
        raise BadRequestError('Setup failure handling on the master is not yet implemented.')

    def handle_request_for_new_build(self, build_params):
        """
        Creates a new Build object and adds it to the request queue to be processed.

        :param build_params:
        :type build_params: dict[str, str]
        :rtype tuple [bool, dict [str, str]]
        """
        build_request = BuildRequest(build_params)

        success = False
        if build_request.is_valid():
            build = Build(build_request)
            self._all_builds_by_id[build.build_id()] = build
            self._request_queue.put(build)
            analytics.record_event(analytics.BUILD_REQUEST_QUEUED, build_id=build.build_id())
            response = {'build_id': build.build_id()}
            success = True

        elif not build_request.is_valid_type():
            response = {'error': 'Invalid build request type.'}

        else:
            required_params = build_request.required_parameters()
            response = {'error': 'Missing required parameter. Required parameters: {}'.format(required_params)}

        return success, response

    def handle_request_to_update_build(self, build_id, update_params):
        """
        Updates the state of a build with the values passed in. Used for cancelling running builds.

        :type build_id: int
        :param update_params: The fields that should be updated and their new values
        :type update_params: dict [str, str]
        :return: The success/failure and the response we want to send to the requestor
        :rtype: tuple [bool, dict [str, str]]
        """
        build = self._all_builds_by_id.get(int(build_id))
        if build is None:
            raise ItemNotFoundError('Invalid build id.')

        success, response = build.validate_update_params(update_params)
        if not success:
            return success, response
        return build.update_state(update_params), {}

    def handle_result_reported_from_slave(self, slave_url, build_id, subjob_id, payload=None):
        """
        Process the result and dispatch the next subjob

        :type slave_url: str
        :type build_id: int
        :type subjob_id: int
        :type payload: dict
        :rtype: str
        """
        self._logger.info('Results received from {} for subjob. (Build {}, Subjob {})', slave_url, build_id,
                          subjob_id)
        build = self._all_builds_by_id[int(build_id)]
        slave = self._all_slaves_by_url[slave_url]

        # If the build has been canceled, don't work on the next subjob.
        if not build.is_finished:
            build.handle_subjob_payload(subjob_id, payload)
            build.mark_subjob_complete(subjob_id)
            build.execute_next_subjob_on_slave(slave)

    def get_build(self, build_id):
        """
        Returns a build by id

        :param build_id: The id for the build whose status we are getting
        :type build_id: int
        :rtype: Build
        """
        build = self._all_builds_by_id.get(build_id)
        if build is None:
            raise ItemNotFoundError('Invalid build id.')

        return build

    def get_path_for_build_results_archive(self, build_id):
        """
        Given a build id, get the absolute file path for the archive file containing the build results.

        :param build_id: The build id for which to retrieve the artifacts archive file
        :type build_id: int
        :return: The path to the archived results file
        :rtype: str
        """
        build = self._all_builds_by_id.get(build_id)
        if build is None:
            raise ItemNotFoundError('Invalid build id.')

        archive_file = build.artifacts_archive_file
        if archive_file is None:
            raise ItemNotReadyError('Build artifact file is not yet ready. Try again later.')

        return archive_file

    def _build_preparation_loop(self):
        """
        Grabs a build off the request_queue, prepares it, and puts that build onto the builds_waiting_for_slaves
        queue.
        """
        while True:
            build = self._request_queue.get()
            try:
                self._request_handler.handle_request(build)
                if not build.has_error:
                    self._logger.info('Build {} was successfully prepared and is now waiting for slaves.',
                                      build.build_id())
                    self._builds_waiting_for_slaves.put(build)
            except Exception as ex:  # pylint: disable=broad-except
                build.mark_failed(str(ex))
                self._logger.exception('Could not handle build request for build {}'.format(build.build_id()))

    def _slave_allocation_loop(self):
        """
        Builds wait in line for more slaves. This method executes in the background on another thread and watches
        for idle slaves, then gives them out to the waiting builds.
        """
        while True:
            build_waiting_for_slave = self._builds_waiting_for_slaves.get()

            while build_waiting_for_slave.needs_more_slaves():
                claimed_slave = self._idle_slaves.get()
                # Remove dead slaves from the idle queue
                if not claimed_slave.is_alive(use_cached=False):
                    continue
                # The build may have completed while we were waiting for an idle slave, so check one more time.
                if build_waiting_for_slave.needs_more_slaves():
                    # Potential race condition here! If the build completes after the if statement is checked,
                    # a slave will be allocated needlessly (and run slave.setup(), which can be significant work).
                    self._logger.info('Allocating slave {} to build {}.',
                                      claimed_slave.url, build_waiting_for_slave.build_id())
                    build_waiting_for_slave.allocate_slave(claimed_slave)
                else:
                    self._add_idle_slave(claimed_slave)

            self._logger.info('Done allocating slaves for build {}.', build_waiting_for_slave.build_id())
class ClusterMaster(object):
    """
    The ClusterRunner Master service: This is the main application class that the web framework/REST API sits on
    top of.
    """
    API_VERSION = 'v1'

    def __init__(self):
        self._logger = get_logger(__name__)
        self._all_slaves_by_url = {}
        self._all_builds_by_id = OrderedDict()  # This is an OrderedDict so we can more easily implement get_queue()
        self._builds_waiting_for_slaves = Queue()

        self._request_queue = Queue()
        self._request_handler = SerialRequestHandler()

        self._request_queue_worker_thread = SafeThread(
            target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
        self._request_queue_worker_thread.start()

        self._slave_allocation_worker_thread = SafeThread(
            target=self._slave_allocation_loop, name='SlaveAllocationLoop', daemon=True)
        self._slave_allocation_worker_thread.start()

        self._master_results_path = Configuration['results_directory']

        # It's important that idle slaves are only in the queue once so we use OrderedSet
        self._idle_slaves = OrderedSetQueue()

        # Delete all old builds when master starts. Remove this if/when build numbers are unique across master
        # starts/stops
        if os.path.exists(self._master_results_path):
            shutil.rmtree(self._master_results_path)
        fs.create_dir(self._master_results_path)

    def _get_status(self):
        """
        Just returns a dumb message and prints it to the console.
        :rtype: str
        """
        return 'Master service is up.'

    def api_representation(self):
        """
        Gets a dict representing this resource which can be returned in an API response.
        :rtype: dict [str, mixed]
        """
        slaves_representation = [slave.api_representation() for slave in self.all_slaves_by_id().values()]
        return {
            'status': self._get_status(),
            'slaves': slaves_representation,
        }

    def builds(self):
        """
        Returns a list of all builds
        :rtype: list[Build]
        """
        return self._all_builds_by_id.values()

    def active_builds(self):
        """
        Returns a list of incomplete builds
        :rtype: list[Build]
        """
        return [build for build in self.builds() if not build.is_finished]

    def add_idle_slave(self, slave):
        """
        Add a slave to the idle queue
        :type slave: Slave
        """
        build_id = slave.current_build_id
        slave.mark_as_idle()

        if build_id is not None:
            self._mark_build_finished_if_slaves_are_done(build_id)

        self._idle_slaves.put(slave)

    def _mark_build_finished_if_slaves_are_done(self, build_id):
        """
        Run when a slave is finished with a build. If this is the last slave for that build, mark the build finished.
        Even when the subjobs are complete, a slave is not finished with the build until teardown is complete.
        :type build_id: int
        """
        for slave in self._all_slaves_by_url.values():
            if slave.current_build_id == build_id:
                return
        self.get_build(build_id).finish()

    def all_slaves_by_id(self):
        """
        Retrieve all connected slaves
        :rtype: dict [int, Slave]
        """
        slaves_by_slave_id = {}
        for slave in self._all_slaves_by_url.values():
            slaves_by_slave_id[slave.id] = slave
        return slaves_by_slave_id

    def slave(self, slave_id):
        """
        A connected slave.
        :type slave_id: int
        :rtype: Slave
        """
        for slave in self._all_slaves_by_url.values():
            if slave.id == slave_id:
                return slave
        raise ItemNotFoundError('Requested slave ({}) does not exist.'.format(slave_id))

    def connect_new_slave(self, slave_url, num_executors):
        """
        Add a new slave to this master
        :type slave_url: str
        :type num_executors: int
        :return: The slave id of the new slave
        :rtype: int
        """
        slave = Slave(slave_url, num_executors)
        self._all_slaves_by_url[slave_url] = slave
        self.add_idle_slave(slave)

        self._logger.info('Slave on {} connected to master with {} executors. (id: {})',
                          slave_url, num_executors, slave.id)
        return {'slave_id': str(slave.id)}

    def disconnect_slave(self, slave_id):
        """
        Mark a slave dead
        :type slave_id: int
        """
        # Mark slave dead. We do not remove it from the list of all slaves. We also do not remove it from
        # idle_slaves; that will happen during slave allocation.
        slave = self.slave(slave_id)
        slave.is_alive = False
        self._logger.info('Slave on {} was disconnected. (id: {})', slave.url, slave.id)

    def handle_request_for_new_build(self, build_params):
        """
        Creates a new Build object and adds it to the request queue to be processed.
        :param build_params:
        :type build_params: dict[str, str]
        :rtype: tuple [bool, dict [str, str]]
        """
        build_request = BuildRequest(build_params)
        success = False

        if build_request.is_valid():
            build = Build(build_request)
            self._all_builds_by_id[build.build_id()] = build
            self._request_queue.put(build)
            analytics.record_event(analytics.BUILD_REQUEST_QUEUED, build_id=build.build_id())
            response = {'build_id': build.build_id()}
            success = True
        elif not build_request.is_valid_type():
            response = {'error': 'Invalid build request type.'}
        else:
            required_params = build_request.required_parameters()
            response = {'error': 'Missing required parameter. Required parameters: {}'.format(required_params)}

        return success, response

    def handle_result_reported_from_slave(self, slave_url, build_id, subjob_id, payload=None):
        """
        Process the result and dispatch the next subjob
        :type slave_url: str
        :type build_id: int
        :type subjob_id: int
        :type payload: dict
        :rtype: str
        """
        self._logger.info('Results received from {} for subjob. (Build {}, Subjob {})',
                          slave_url, build_id, subjob_id)
        build = self._all_builds_by_id[int(build_id)]
        slave = self._all_slaves_by_url[slave_url]

        build.handle_subjob_payload(subjob_id, payload)
        build.mark_subjob_complete(subjob_id)
        build.execute_next_subjob_on_slave(slave)

    def get_build(self, build_id):
        """
        Returns a build by id
        :param build_id: The id for the build whose status we are getting
        :type build_id: int
        :rtype: Build
        """
        build = self._all_builds_by_id.get(build_id)
        if build is None:
            raise ItemNotFoundError('Invalid build id.')
        return build

    def get_path_for_build_results_archive(self, build_id):
        """
        Given a build id, get the absolute file path for the archive file containing the build results.
        :param build_id: The build id for which to retrieve the artifacts archive file
        :type build_id: int
        :return: The path to the archived results file
        :rtype: str
        """
        build = self._all_builds_by_id.get(build_id)
        if build is None:
            raise ItemNotFoundError('Invalid build id.')

        archive_file = build.artifacts_archive_file
        if archive_file is None:
            raise ItemNotReadyError('Build artifact file is not yet ready. Try again later.')

        return archive_file

    def _build_preparation_loop(self):
        """
        Grabs a build off the request_queue, prepares it, and puts that build onto the builds_waiting_for_slaves
        queue.
        """
        while True:
            build = self._request_queue.get()
            try:
                self._request_handler.handle_request(build)
                if not build.has_error:
                    self._logger.info('Build {} was successfully prepared and is now waiting for slaves.',
                                      build.build_id())
                    self._builds_waiting_for_slaves.put(build)
            except Exception as ex:  # pylint: disable=broad-except
                build.mark_failed(str(ex))
                self._logger.exception('Could not handle build request for build {}'.format(build.build_id()))

    def _slave_allocation_loop(self):
        """
        Builds wait in line for more slaves. This method executes in the background on another thread and watches
        for idle slaves, then gives them out to the waiting builds.
        """
        while True:
            build_waiting_for_slave = self._builds_waiting_for_slaves.get()

            while build_waiting_for_slave.needs_more_slaves():
                claimed_slave = self._idle_slaves.get()
                # Remove dead slaves from the idle queue
                if not claimed_slave.is_alive:
                    continue
                # The build may have completed while we were waiting for an idle slave, so check one more time.
                if build_waiting_for_slave.needs_more_slaves():
                    # Potential race condition here! If the build completes after the if statement is checked,
                    # a slave will be allocated needlessly (and run slave.setup(), which can be significant work).
                    self._logger.info('Allocating slave {} to build {}.',
                                      claimed_slave.url, build_waiting_for_slave.build_id())
                    build_waiting_for_slave.allocate_slave(claimed_slave)
                else:
                    self.add_idle_slave(claimed_slave)

            self._logger.info('Done allocating slaves for build {}.', build_waiting_for_slave.build_id())
class BuildRequestHandler(object):
    """
    The BuildRequestHandler class is responsible for preparing a non-prepared build.

    Implementation notes:

    This class manages two critical Queues in ClusterRunner: request_queue and builds_waiting_for_slaves.

    The request_queue is the queue of non-prepared Build instances that the BuildRequestHandler has yet to prepare.
    This queue is populated by the ClusterMaster instance.

    The builds_waiting_for_slaves queue is the queue of prepared Build instances that the BuildRequestHandler has
    completed build preparation for, and is waiting for the SlaveAllocator (a separate entity) to pull Builds from.

    All build input comes through self.handle_build_request() calls, and all build output goes through
    self.next_prepared_build_scheduler() calls.
    """
    def __init__(self, scheduler_pool):
        """
        :type scheduler_pool: BuildSchedulerPool
        """
        self._logger = get_logger(__name__)
        self._scheduler_pool = scheduler_pool
        self._builds_waiting_for_slaves = Queue()
        self._request_queue = Queue()
        self._request_queue_worker_thread = SafeThread(
            target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
        self._project_preparation_locks = {}
        self._subjob_calculator = SubjobCalculator()

    def start(self):
        """
        Start the infinite loop that will accept unprepared builds and put them through build preparation.
        """
        if self._request_queue_worker_thread.is_alive():
            raise RuntimeError('Error: build request handler loop was asked to start when it\'s already running.')
        self._request_queue_worker_thread.start()

    def handle_build_request(self, build):
        """
        :param build: the requested build
        :type build: Build
        """
        self._request_queue.put(build)
        analytics.record_event(analytics.BUILD_REQUEST_QUEUED, build_id=build.build_id(),
                               log_msg='Queued request for build {build_id}.')

    def next_prepared_build_scheduler(self):
        """
        Get the scheduler for the next build that has successfully completed build preparation.

        This is a blocking call--if there are no more builds that have completed build preparation and this method
        gets invoked, the execution will hang until the next build has completed build preparation.

        :rtype: BuildScheduler
        """
        build = self._builds_waiting_for_slaves.get()
        build_scheduler = self._scheduler_pool.get(build)
        return build_scheduler

    def _build_preparation_loop(self):
        """
        Grabs a build off the request_queue (populated by self.handle_build_request()), prepares it, and puts that
        build onto the self.builds_waiting_for_slaves queue.
        """
        while True:
            build = self._request_queue.get()
            project_id = build.project_type.project_id()

            if project_id not in self._project_preparation_locks:
                self._logger.info('Creating project lock [{}] for build {}', project_id, str(build.build_id()))
                self._project_preparation_locks[project_id] = Lock()

            project_lock = self._project_preparation_locks[project_id]
            SafeThread(
                target=self._prepare_build_async,
                name='Bld{}-PreparationThread'.format(build.build_id()),
                args=(build, project_lock),
            ).start()

    def _prepare_build_async(self, build, project_lock):
        """
        :type build: Build
        :type project_lock: Lock
        """
        self._logger.info('Build {} is waiting for the project lock', build.build_id())

        with project_lock:
            self._logger.info('Build {} has acquired project lock', build.build_id())
            analytics.record_event(analytics.BUILD_PREPARE_START, build_id=build.build_id(),
                                   log_msg='Build preparation loop is handling request for build {build_id}.')
            try:
                build.prepare(self._subjob_calculator)
                if not build.has_error:
                    analytics.record_event(analytics.BUILD_PREPARE_FINISH, build_id=build.build_id(),
                                           is_success=True,
                                           log_msg='Build {build_id} successfully prepared and waiting for slaves.')
                    self._builds_waiting_for_slaves.put(build)
            except Exception as ex:  # pylint: disable=broad-except
                build.mark_failed(str(ex))
                self._logger.exception('Could not handle build request for build {}.'.format(build.build_id()))
                analytics.record_event(analytics.BUILD_PREPARE_FINISH, build_id=build.build_id(), is_success=False)
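# A minimal, self-contained sketch of the per-project locking pattern used by _build_preparation_loop and
# _prepare_build_async above: builds for the same project are prepared one at a time, while builds for
# different projects can be prepared concurrently. Build objects, SafeThread, and the real preparation work
# are stubbed out; all names here are illustrative. Unlike the original, where locks are only created on the
# single preparation-loop thread, this sketch guards lock creation because multiple threads call _lock_for().
import threading
import time

_project_locks = {}
_locks_guard = threading.Lock()


def _lock_for(project_id):
    # Lazily create one lock per project, mirroring the _project_preparation_locks dict above.
    with _locks_guard:
        if project_id not in _project_locks:
            _project_locks[project_id] = threading.Lock()
        return _project_locks[project_id]


def prepare_build(build_id, project_id):
    with _lock_for(project_id):
        print('preparing build {} for project {}'.format(build_id, project_id))
        time.sleep(0.1)  # stand-in for the real (potentially slow) build preparation


if __name__ == '__main__':
    threads = [
        threading.Thread(target=prepare_build, args=(1, 'project-a')),
        threading.Thread(target=prepare_build, args=(2, 'project-a')),  # waits for build 1 to finish
        threading.Thread(target=prepare_build, args=(3, 'project-b')),  # runs concurrently with project-a
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()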