def __init__(self, build_request):
    """
    Initialise a build for the given request with a fresh build id and empty bookkeeping.

    :type build_request: BuildRequest
    """
    self._logger = get_logger(__name__)
    self._build_id = self._build_id_counter.increment()
    self._build_request = build_request

    # Nothing has been produced or has gone wrong yet.
    self._artifacts_archive_file = self._build_artifact = self._error_message = None

    # Guards against separate threads calling prepare() more than once.
    self._preparation_coin = SingleUseCoin()
    self._project_type = None
    # Guards against more than one thread detecting the build's finish.
    self._build_completion_lock = Lock()

    self._all_subjobs_by_id = {}
    # WIP(joey): Move subjob queues to BuildScheduler class.
    self._unstarted_subjobs = self._finished_subjobs = self._failed_atoms = None
    # WIP(joey): Remove and use build state.
    self._postbuild_tasks_are_finished = False
    self._timing_file_path = None

    # Callbacks the state machine invokes when the build enters these states.
    on_enter_state = {
        BuildState.ERROR: self._on_enter_error_state,
        BuildState.CANCELED: self._on_enter_canceled_state,
    }
    self._state_machine = BuildFsm(build_id=self._build_id, enter_state_callbacks=on_enter_state)
def __init__(self, build_request):
    """
    Create a build for ``build_request``: assign it the next build id and reset all tracking fields.

    :type build_request: BuildRequest
    """
    self._logger = get_logger(__name__)
    self._build_id = self._build_id_counter.increment()
    self._build_request = build_request

    # Results and error details are filled in later in the build's life.
    self._artifacts_archive_file = None
    self._build_artifact = None
    self._error_message = None

    # The coin can be spent only once: protects against separate threads calling prepare() more than once.
    self._preparation_coin = SingleUseCoin()
    self._project_type = None
    # Protects against more than one thread detecting the build's finish.
    self._build_completion_lock = Lock()

    # Subjob bookkeeping. WIP(joey): Move subjob queues to BuildScheduler class.
    self._all_subjobs_by_id = {}
    self._unstarted_subjobs = None
    self._finished_subjobs = None
    self._failed_atoms = None

    # WIP(joey): Remove and use build state.
    self._postbuild_tasks_are_finished = False
    self._timing_file_path = None

    enter_callbacks = {}
    enter_callbacks[BuildState.ERROR] = self._on_enter_error_state
    enter_callbacks[BuildState.CANCELED] = self._on_enter_canceled_state
    self._state_machine = BuildFsm(
        build_id=self._build_id,
        enter_state_callbacks=enter_callbacks,
    )
def __init__(self, build_request):
    """
    Create a build for the given request and wire its state machine callbacks.

    :type build_request: BuildRequest
    """
    self._logger = get_logger(__name__)
    self._build_id = self._build_id_counter.increment()
    self._build_request = build_request

    # Artifact archives; the tar file is DEPRECATED - use the zip file instead.
    self._artifacts_tar_file = self._artifacts_zip_file = None
    self._build_artifact = None
    self._error_message = None

    # Protects against separate threads calling prepare() more than once.
    self._preparation_coin = SingleUseCoin()
    self._project_type = None
    # Protects against more than one thread detecting the build's finish.
    self._build_completion_lock = Lock()

    self._all_subjobs_by_id = OrderedDict()
    # WIP(joey): Move subjob queues to BuildScheduler class.
    self._unstarted_subjobs = self._finished_subjobs = self._failed_atoms = None
    # WIP(joey): Remove and use build state.
    self._postbuild_tasks_are_finished = False
    self._timing_file_path = None

    # Every state reports to the same leave-state callback.
    leave_state_callbacks = dict.fromkeys(BuildState, self._on_leave_state)
    enter_state_callbacks = {
        BuildState.ERROR: self._on_enter_error_state,
        BuildState.CANCELED: self._on_enter_canceled_state,
        BuildState.PREPARING: self._on_enter_preparing_state,
    }
    self._state_machine = BuildFsm(
        build_id=self._build_id,
        enter_state_callbacks=enter_state_callbacks,
        leave_state_callbacks=leave_state_callbacks,
    )

    # Number of times build_setup has failed on this build. If setup_failures increases beyond
    # MAX_SETUP_FAILURES, the build is cancelled.
    self.setup_failures = 0
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs

    :type _build_id: int
    :type _build_request: BuildRequest
    :type _build_artifact: None | BuildArtifact
    :type _error_message: None | str
    :type _project_type: None | ProjectType
    :type _timing_file_path: None | str
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self._build_request = build_request
        self._artifacts_archive_file = None
        self._build_artifact = None
        self._error_message = None
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once
        self._project_type = None
        self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish

        self._all_subjobs_by_id = {}
        self._unstarted_subjobs = None  # WIP(joey): Move subjob queues to BuildScheduler class.
        self._finished_subjobs = None
        self._failed_atoms = None
        self._postbuild_tasks_are_finished = False  # WIP(joey): Remove and use build state.
        self._timing_file_path = None

        self._state_machine = BuildFsm(
            build_id=self._build_id,
            enter_state_callbacks={
                BuildState.ERROR: self._on_enter_error_state,
                BuildState.CANCELED: self._on_enter_canceled_state,
            }
        )

    def api_representation(self):
        """
        Return the dictionary that describes this build in API responses.

        :rtype: dict
        """
        # Look the failed atoms up once so both the None check and the conversion see the same value.
        failed_atoms = self._get_failed_atoms()
        failed_atoms_api_representation = None
        if failed_atoms is not None:
            failed_atoms_api_representation = [failed_atom.api_representation() for failed_atom in failed_atoms]

        build_state = self._status()
        # todo: PREPARING/PREPARED are new states -- make sure clients can handle them before exposing.
        if build_state in (BuildState.PREPARING, BuildState.PREPARED):
            build_state = BuildState.QUEUED

        return {
            'id': self._build_id,
            'status': build_state,
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': failed_atoms_api_representation,
            'result': self._result(),
            'request_params': self.build_request.build_parameters(),
            # Convert self._state_timestamps to OrderedDict to make raw API response more readable. Sort the entries
            # by numerically increasing dict value, with None values sorting highest.
            'state_timestamps': OrderedDict(sorted(
                [(state.lower(), timestamp)
                 for state, timestamp in self._state_machine.transition_timestamps.items()],
                key=lambda item: item[1] or float('inf'))),
        }

    def generate_project_type(self):
        """
        Instantiate the project type for this build, populating the self._project_type instance variable.

        As a side effect, this method also updates the build request's build_parameters dictionary
        with the unique workspace directory path for this build.

        :raises BuildProjectError when failed to instantiate project type
        """
        # Generate a unique project build directory name that will be symlinked to the actual project directory
        # later on when the project gets fetched.
        build_specific_project_directory = self._generate_unique_symlink_path_for_build_repo()

        # Because build_specific_project_directory is entirely internal and generated by ClusterRunner (it is a
        # build-unique generated symlink), we must manually add it to the project_type_params
        project_type_params = self.build_request.build_parameters()
        project_type_params.update({'build_project_directory': build_specific_project_directory})
        self._project_type = util.create_project_type(project_type_params)
        if self._project_type is None:
            raise BuildProjectError('Build failed due to an invalid project type.')

    def prepare(self, subjob_calculator):
        """
        Fetch the project, compute this build's subjobs and queue them as unstarted.

        :param subjob_calculator: Used after project fetch to atomize and group subjobs for this build
        :type subjob_calculator: SubjobCalculator
        :raises RuntimeError: if the build has no request or project type, if prepare() was already called,
            or if the job config could not be obtained
        """
        if not isinstance(self.build_request, BuildRequest):
            raise RuntimeError('Build {} has no associated request object.'.format(self._build_id))

        if not isinstance(self.project_type, ProjectType):
            raise RuntimeError('Build {} has no project set.'.format(self._build_id))

        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id))

        self._state_machine.trigger(BuildEvent.START_PREPARE)
        # WIP(joey): Move the following code into a PREPARING state callback
        # (so that it won't execute if the build has already been canceled.)

        self._logger.info('Fetching project for build {}.', self._build_id)
        self.project_type.fetch_project()
        self._logger.info('Successfully fetched project for build {}.', self._build_id)

        job_config = self.project_type.job_config()
        if job_config is None:
            raise RuntimeError('Build failed while trying to parse clusterrunner.yaml.')

        subjobs = subjob_calculator.compute_subjobs_for_build(self._build_id, job_config, self.project_type)

        self._unstarted_subjobs = Queue(maxsize=len(subjobs))  # WIP(joey): Move this into BuildScheduler?
        self._finished_subjobs = Queue(maxsize=len(subjobs))  # WIP(joey): Remove this and just record finished count.

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._timing_file_path = self._project_type.timing_file_path(job_config.name)
        app.util.fs.create_dir(self._build_results_dir())
        self._state_machine.trigger(BuildEvent.FINISH_PREPARE)

    def build_id(self):
        """
        :rtype: int
        """
        return self._build_id

    @property
    def build_request(self):
        """
        :rtype: BuildRequest
        """
        return self._build_request

    def all_subjobs(self):
        """
        Returns a list of subjobs for this build
        :rtype: list[Subjob]
        """
        return list(self._all_subjobs_by_id.values())

    def subjob(self, subjob_id):
        """
        Returns a single subjob
        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: if no subjob with this id belongs to the build
        """
        subjob = self._all_subjobs_by_id.get(subjob_id)
        if subjob is None:
            raise ItemNotFoundError('Invalid subjob id.')
        return subjob

    def complete_subjob(self, subjob_id, payload=None):
        """
        Handle the subjob payload and mark the given subjob id for this build as complete.

        Any exception marks the build as failed and is then re-raised to the caller.

        :type subjob_id: int
        :type payload: dict
        """
        try:
            self._handle_subjob_payload(subjob_id, payload)
            self._mark_subjob_complete(subjob_id)

        except Exception:
            self._logger.exception('Error while completing subjob; marking build as failed.')
            self.mark_failed('Error occurred while completing subjob {}.'.format(subjob_id))
            raise

    def _parse_payload_for_atom_exit_code(self, subjob_id):
        """
        Read each atom's exit code file from the results directory and store the value on the atom.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        for atom_id, atom in enumerate(subjob.atoms):
            artifact_dir = BuildArtifact.atom_artifact_directory(
                self.build_id(),
                subjob.subjob_id(),
                atom_id,
                result_root=Configuration['results_directory']
            )
            atom_exit_code_file_sys_path = os.path.join(artifact_dir, BuildArtifact.EXIT_CODE_FILE)
            with open(atom_exit_code_file_sys_path, 'r') as atom_exit_code_file:
                atom.exit_code = int(atom_exit_code_file.read())

    def _handle_subjob_payload(self, subjob_id, payload):
        """
        Write the payload archive into the build results directory, extract it and record atom exit codes.

        A missing or empty payload is logged and otherwise ignored.

        :type subjob_id: int
        :param payload: expected to provide 'filename' and 'body' entries
        :type payload: dict | None
        """
        if not payload:
            self._logger.warning('No payload for subjob {} of build {}.', subjob_id, self._build_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        result_file_path = os.path.join(self._build_results_dir(), payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
            self._parse_payload_for_atom_exit_code(subjob_id)
        except Exception:
            # Log which subjob failed, then let the caller decide how to fail the build.
            self._logger.warning('Writing payload for subjob {} of build {} FAILED.', subjob_id, self._build_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs
        :rtype: dict [str, float]
        """
        timings = {}
        for subjob in self._all_subjobs_by_id.values():
            timings.update(subjob.read_timings())

        return timings

    def _mark_subjob_complete(self, subjob_id):
        """
        Record the subjob as finished and start the postbuild tasks once every subjob has finished.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        subjob.mark_completed()
        with self._build_completion_lock:
            self._finished_subjobs.put(subjob, block=False)
            should_trigger_postbuild_tasks = self._all_subjobs_are_finished() and not self._is_stopped()

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if should_trigger_postbuild_tasks:
            self._logger.info("All results received for build {}!", self._build_id)
            SafeThread(target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id)).start()

    def mark_started(self):
        """
        Mark the build as started.
        """
        self._state_machine.trigger(BuildEvent.START_BUILDING)

    def finish(self):
        """
        Perform postbuild task and mark this build as finished.
        """
        # This method also transitions the FSM to finished after the postbuild tasks are complete.
        self._perform_async_postbuild_tasks()

    def mark_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. The failure reason should be something we can present to the
        end user of ClusterRunner, so try not to include detailed references to internal implementation.

        :type failure_reason: str
        """
        self._state_machine.trigger(BuildEvent.FAIL, error_msg=failure_reason)

    def mark_setup_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. Because setup failures don't have any logs, we put
        the build_id in the setup_failed file for easier querying of worker logs.

        :type failure_reason: str
        """
        self._state_machine.trigger(BuildEvent.FAIL,
                                    error_msg='{} Build Id: {}.'.format(failure_reason, self._build_id))
        setup_failure_file = os.path.join(self._build_results_dir(), BuildArtifact.SETUP_FAILED_FILE)
        app.util.fs.write_file(str(self._build_id), setup_failure_file)
        self._create_build_artifact()

    def _on_enter_error_state(self, event):
        """
        Store an error message for the build and log the failure. This method is triggered by
        a state machine transition to the ERROR state.

        :param event: The Fysom event object
        """
        # WIP(joey): Should this be a reenter_state callback also? Should it check for previous error message?
        default_error_msg = 'An unspecified error occurred.'
        self._error_message = getattr(event, 'error_msg', default_error_msg)
        self._logger.warning('Build {} failed: {}', self.build_id(), self._error_message)

    def cancel(self):
        """
        Cancel a running build.
        """
        self._logger.notice('Request received to cancel build {}.', self._build_id)
        self._state_machine.trigger(BuildEvent.CANCEL)

    def _on_enter_canceled_state(self, event):
        """
        Discard every subjob that has not been started yet. This method is triggered by
        a state machine transition to the CANCELED state.

        :param event: The Fysom event object
        """
        # Deplete the unstarted subjob queue.
        # WIP(joey): Just remove this completely and adjust behavior of other methods based on self._is_canceled().
        # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs
        while self._unstarted_subjobs is not None and not self._unstarted_subjobs.empty():
            try:
                # A subjob may be asynchronously pulled from this queue, so we need to avoid blocking when empty.
                self._unstarted_subjobs.get(block=False)
            except Empty:
                break

    def validate_update_params(self, update_params):
        """
        Determine if a dict of update params are valid, and generate an error if not

        If several params are invalid, the error message describes the last invalid one encountered.

        :param update_params: Params passed into a PUT for this build
        :type update_params: dict [str, str]
        :return: Whether the params are valid and a response containing an error message if not
        :rtype: tuple [bool, dict [str, str]]
        """
        keys_and_values_allowed = {'status': ['canceled']}
        message = None
        for key, value in update_params.items():
            if key not in keys_and_values_allowed:
                message = 'Key ({}) is not in list of allowed keys ({})'.\
                    format(key, ",".join(keys_and_values_allowed.keys()))
            elif value not in keys_and_values_allowed[key]:
                message = 'Value ({}) is not in list of allowed values ({}) for {}'.\
                    format(value, keys_and_values_allowed[key], key)

        if message is not None:
            return False, {'error': message}
        return True, {}

    def update_state(self, update_params):
        """
        Make updates to the state of this build given a set of update params

        :param update_params: The keys and values to update on this build
        :type update_params: dict [str, str]
        :return: Whether any update was applied
        :rtype: bool
        """
        success = False
        for key, value in update_params.items():
            if key == 'status' and value == 'canceled':
                self.cancel()
                success = True
        return success

    @property
    def project_type(self):
        """
        :rtype: ProjectType
        """
        return self._project_type

    @property
    def artifacts_archive_file(self):
        """
        The value returned by compressing the build results directory, or None before the artifact is created.
        """
        return self._artifacts_archive_file

    # WIP(joey): Change some of these private @properties to methods.
    @property
    def _num_subjobs_total(self):
        """
        :rtype: int
        """
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        """
        Number of finished subjobs; 0 when the finished queue has not been created yet.

        :rtype: int
        """
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        """
        Total atomic command count across all subjobs, or None unless the build is BUILDING or FINISHED.

        :rtype: int | None
        """
        # todo: blacklist states instead of whitelist, or just check _all_subjobs_by_id directly
        if self._status() not in [BuildState.BUILDING, BuildState.FINISHED]:
            return None
        return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())

    def _all_subjobs_are_finished(self):
        """
        Truthy once the finished-subjobs queue exists and is full.
        """
        return self._finished_subjobs and self._finished_subjobs.full()

    @property
    def is_finished(self):
        """
        :rtype: bool
        """
        # WIP(joey): Calling logic should check _is_canceled if it needs to instead of including the check here.
        return self._is_canceled() or self._postbuild_tasks_are_finished

    @property
    def _detail_message(self):
        """
        Human-readable subjob progress, or None when the build has no subjobs.

        :rtype: str | None
        """
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished,
                self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total
            )
        return None

    def _status(self):  # WIP(joey): Rename to _state.
        """
        :rtype: BuildState
        """
        return self._state_machine.state

    @property
    def has_error(self):
        """
        :rtype: bool
        """
        return self._status() is BuildState.ERROR

    def _is_canceled(self):
        """
        :rtype: bool
        """
        return self._status() is BuildState.CANCELED

    def _is_stopped(self):
        """
        :rtype: bool
        """
        return self._status() in (BuildState.ERROR, BuildState.CANCELED)

    def _get_failed_atoms(self):
        """
        The atoms that failed. Returns None if the build hasn't completed yet. Returns an empty list if the build
        has completed and no atoms have failed, or if the build was canceled.

        :rtype: list[Atom] | None
        """
        if self._failed_atoms is None and self.is_finished:
            if self._is_canceled():
                return []

            # Build the complete list before caching it so an exception cannot leave a partial result behind.
            self._failed_atoms = [
                self.subjob(subjob_id).atoms[atom_id]
                for subjob_id, atom_id in self._build_artifact.get_failed_subjob_and_atom_ids()
            ]

        return self._failed_atoms

    def _result(self):
        """
        Can return three states:
            None: the build is not finished yet
            FAILURE: the build was canceled, or finished with at least one failed atom
            NO_FAILURES: the build finished and no atom failed

        :rtype: BuildResult | None
        """
        if self._is_canceled():
            return BuildResult.FAILURE

        if self.is_finished:
            if len(self._build_artifact.get_failed_subjob_and_atom_ids()) == 0:
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, certain tasks can be performed asynchronously.

        Creates the build artifact, removes the temporary result files and then reports completion
        of the postbuild tasks to the state machine.
        """
        self._create_build_artifact()
        self._delete_temporary_build_artifact_files()
        self._postbuild_tasks_are_finished = True
        self._state_machine.trigger(BuildEvent.POSTBUILD_TASKS_COMPLETE)

    def _create_build_artifact(self):
        """
        Generate the failures file and timing data, then compress the build results directory.
        """
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(self._timing_file_path, self._read_subjob_timings_from_results())
        self._artifacts_archive_file = app.util.fs.compress_directory(self._build_results_dir(),
                                                                      BuildArtifact.ARTIFACT_FILE_NAME)

    def _delete_temporary_build_artifact_files(self):
        """
        Delete the temporary build result files that are no longer needed, due to the creation of the
        build artifact tarball.

        ONLY call this method after _create_build_artifact() has completed. Otherwise we have lost the build results.
        """
        build_result_dir = self._build_results_dir()
        start_time = time.time()
        for path in os.listdir(build_result_dir):
            # The build result tar-ball is also stored in this same directory, so we must not delete it.
            if path == BuildArtifact.ARTIFACT_FILE_NAME:
                continue
            full_path = os.path.join(build_result_dir, path)
            # Do NOT use app.util.fs.async_delete() here. That call will generate a temp directory for every
            # atom, which can be in the thousands per build, and can lead to running up against the ulimit -Hn.
            # os.path.isdir must be *called*: the bare function object is always truthy, which sent regular
            # files to rmtree where the failure was silently ignored and the file was left behind.
            if os.path.isdir(full_path):
                shutil.rmtree(full_path, ignore_errors=True)
            else:
                os.remove(full_path)
        elapsed_seconds = time.time() - start_time
        self._logger.info('Completed deleting artifact files for {}, took {:.1f} seconds.', self._build_id,
                          elapsed_seconds)

    def _build_results_dir(self):
        """
        Directory holding this build's results, under the configured results directory.
        """
        return BuildArtifact.build_artifact_directory(self.build_id(), result_root=Configuration['results_directory'])

    def _generate_unique_symlink_path_for_build_repo(self):
        """
        Generate a unique symlink path for a build-specific repo. This method does NOT generate the symlink itself.
        :rtype: str
        """
        return os.path.join(Configuration['build_symlink_directory'], str(uuid.uuid4()))
class Build(object): """ A build is a single execution of any configured job. This class: - exposes the overall status of the build - keeps track of the build's subjobs and their completion state - manages slaves that have been assigned to accept this build's subjobs :type _build_id: int :type _build_request: BuildRequest :type _build_artifact: None | BuildArtifact :type _error_message: None | str :type _project_type: None | ProjectType :type _timing_file_path: None | str """ _build_id_counter = Counter() # class-level counter for assigning build ids def __init__(self, build_request): """ :type build_request: BuildRequest """ self._logger = get_logger(__name__) self._build_id = self._build_id_counter.increment() self._build_request = build_request self._artifacts_tar_file = None # DEPRECATED - Use zip file instead self._artifacts_zip_file = None self._build_artifact = None self._error_message = None self._preparation_coin = SingleUseCoin() # protects against separate threads calling prepare() more than once self._project_type = None self._build_completion_lock = Lock() # protects against more than one thread detecting the build's finish self._all_subjobs_by_id = OrderedDict() self._unstarted_subjobs = None # WIP(joey): Move subjob queues to BuildScheduler class. self._finished_subjobs = None self._failed_atoms = None self._postbuild_tasks_are_finished = False # WIP(joey): Remove and use build state. self._timing_file_path = None leave_state_callbacks = {build_state: self._on_leave_state for build_state in BuildState} self._state_machine = BuildFsm( build_id=self._build_id, enter_state_callbacks={ BuildState.ERROR: self._on_enter_error_state, BuildState.CANCELED: self._on_enter_canceled_state, BuildState.PREPARING: self._on_enter_preparing_state, }, leave_state_callbacks=leave_state_callbacks ) # Number of times build_setup has failed on this build. 
If # setup_failures increases beyond MAX_SETUP_FAILURES, the build is # cancelled self.setup_failures = 0 def api_representation(self): failed_atoms_api_representation = None if self._get_failed_atoms() is not None: failed_atoms_api_representation = [failed_atom.api_representation() for failed_atom in self._get_failed_atoms()] build_state = self._status() # todo: PREPARING/PREPARED are new states -- make sure clients can handle them before exposing. if build_state in (BuildState.PREPARING, BuildState.PREPARED): build_state = BuildState.QUEUED return { 'id': self._build_id, 'status': build_state, 'details': self._detail_message, 'error_message': self._error_message, 'num_atoms': self._num_atoms, 'num_subjobs': len(self._all_subjobs_by_id), 'failed_atoms': failed_atoms_api_representation, 'result': self._result(), 'request_params': self.build_request.build_parameters(), # Convert self._state_timestamps to OrderedDict to make raw API response more readable. Sort the entries # by numerically increasing dict value, with None values sorting highest. 'state_timestamps': OrderedDict(sorted( [(state.lower(), timestamp) for state, timestamp in self._state_machine.transition_timestamps.items()], key=lambda item: item[1] or float('inf'))), } def generate_project_type(self): """ Instantiate the project type for this build, populating the self._project_type instance variable. As a side effect, this method also updates the build request's build_parameters dictionary with the unique workspace directory path for this build. :raises BuildProjectError when failed to instantiate project type """ # Generate a unique project build directory name that will be symlinked to the actual project directory # later on when the project gets fetched. 
build_specific_project_directory = self._generate_unique_symlink_path_for_build_repo() # Because build_specific_project_directory is entirely internal and generated by ClusterRunner (it is a # build-unique generated symlink), we must manually add it to the project_type_params project_type_params = self.build_request.build_parameters() project_type_params.update({'build_project_directory': build_specific_project_directory}) self._project_type = util.create_project_type(project_type_params) if self._project_type is None: raise BuildProjectError('Build failed due to an invalid project type.') def prepare(self): if not isinstance(self.build_request, BuildRequest): raise RuntimeError('Build {} has no associated request object.'.format(self._build_id)) if not isinstance(self.project_type, ProjectType): raise RuntimeError('Build {} has no project set.'.format(self._build_id)) if not self._preparation_coin.spend(): raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id)) self._state_machine.trigger(BuildEvent.START_PREPARE) def build_id(self): """ :rtype: int """ return self._build_id @property def build_request(self): """ :rtype: BuildRequest """ return self._build_request def get_subjobs(self, offset: int=None, limit: int=None) -> List['Subjob']: """ Returns a list of subjobs for this build :param offset: The starting index of the requested build :param limit: The number of builds requested """ num_subjobs = len(self._all_subjobs_by_id) start, end = get_paginated_indices(offset, limit, num_subjobs) requested_subjobs = islice(self._all_subjobs_by_id, start, end) return [self._all_subjobs_by_id[key] for key in requested_subjobs] def subjob(self, subjob_id: int) -> Subjob: """Return the subjob for this build with the specified id.""" subjob = self._all_subjobs_by_id.get(subjob_id) if subjob is None: raise ItemNotFoundError('Invalid subjob id.') return subjob def complete_subjob(self, subjob_id, payload=None): """ Handle the subjob 
payload and mark the given subjob id for this build as complete. :type subjob_id: int :type payload: dict """ try: self._handle_subjob_payload(subjob_id, payload) self._mark_subjob_complete(subjob_id) except Exception: self._logger.exception('Error while completing subjob; marking build as failed.') self.mark_failed('Error occurred while completing subjob {}.'.format(subjob_id)) raise def _parse_payload_for_atom_exit_code(self, subjob_id): subjob = self.subjob(subjob_id) for atom_id in range(len(subjob.atoms)): artifact_dir = BuildArtifact.atom_artifact_directory( self.build_id(), subjob.subjob_id(), atom_id, result_root=Configuration['results_directory'] ) atom_exit_code_file_sys_path = os.path.join(artifact_dir, BuildArtifact.EXIT_CODE_FILE) with open(atom_exit_code_file_sys_path, 'r') as atom_exit_code_file: subjob.atoms[atom_id].exit_code = int(atom_exit_code_file.read()) def _handle_subjob_payload(self, subjob_id, payload): if not payload: self._logger.warning('No payload for subjob {} of build {}.', subjob_id, self._build_id) return # Assertion: all payloads received from subjobs are uniquely named. 
result_file_path = os.path.join(self._build_results_dir(), payload['filename']) try: app.util.fs.write_file(payload['body'], result_file_path) app.util.fs.extract_tar(result_file_path, delete=True) self._parse_payload_for_atom_exit_code(subjob_id) except: internal_errors.labels(ErrorType.SubjobWriteFailure).inc() # pylint: disable=no-member self._logger.warning('Writing payload for subjob {} of build {} FAILED.', subjob_id, self._build_id) raise def _read_subjob_timings_from_results(self): """ Collect timing data from all subjobs :rtype: dict [str, float] """ timings = {} for _, subjob in self._all_subjobs_by_id.items(): timings.update(subjob.read_timings()) return timings def _mark_subjob_complete(self, subjob_id): """ :type subjob_id: int """ subjob = self.subjob(subjob_id) subjob.mark_completed() with self._build_completion_lock: self._finished_subjobs.put(subjob, block=False) should_trigger_postbuild_tasks = self._all_subjobs_are_finished() and not self.is_stopped # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition if should_trigger_postbuild_tasks: self._logger.info("All results received for build {}!", self._build_id) self.finish() def mark_started(self): """ Mark the build as started. """ self._state_machine.trigger(BuildEvent.START_BUILDING) def finish(self): """ Perform postbuild task and mark this build as finished. """ Thread( target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id), ).start() def mark_failed(self, failure_reason): """ Mark a build as failed and set a failure reason. The failure reason should be something we can present to the end user of ClusterRunner, so try not to include detailed references to internal implementation. :type failure_reason: str """ self._state_machine.trigger(BuildEvent.FAIL, error_msg=failure_reason) def _on_enter_error_state(self, event): """ Store an error message for the build and log the failure. 
This method is triggered by a state machine transition to the ERROR state. :param event: The Fysom event object """ # WIP(joey): Should this be a reenter_state callback also? Should it check for previous error message? default_error_msg = 'An unspecified error occurred.' self._error_message = getattr(event, 'error_msg', default_error_msg) self._logger.warning('Build {} failed: {}', self.build_id(), self._error_message) def _on_enter_preparing_state(self, event): """ Prepare the build by atomization and subjobs creation. This method is triggered by a state machine transition to the PREPARING state. :param event: The Fysom event object :type event: BuildEvent """ self._logger.info('Fetching project for build {}.', self._build_id) self.project_type.fetch_project() self._logger.info('Successfully fetched project for build {}.', self._build_id) job_config = self.project_type.job_config() if job_config is None: raise RuntimeError('Build failed while trying to parse clusterrunner.yaml.') subjobs = compute_subjobs_for_build(self._build_id, job_config, self.project_type) self._unstarted_subjobs = Queue(maxsize=len(subjobs)) # WIP(joey): Move this into BuildScheduler? self._finished_subjobs = Queue(maxsize=len(subjobs)) # WIP(joey): Remove this and just record finished count. 
for subjob in subjobs: self._all_subjobs_by_id[subjob.subjob_id()] = subjob self._unstarted_subjobs.put(subjob) self._timing_file_path = self._project_type.timing_file_path(job_config.name) app.util.fs.create_dir(self._build_results_dir()) self._state_machine.trigger(BuildEvent.FINISH_PREPARE) def _on_leave_state(self, event): start_time = self._state_machine.transition_timestamps.get(event.src) if start_time is not None: elapsed = time.time() - start_time build_state_duration_seconds.labels(event.src.value).observe(elapsed) # pylint: disable=no-member else: self._logger.warn('Build {} transitioned from state {} to state {} but never marked started timestamp for {}', self._build_id, event.src, event.dst, event.src) def cancel(self): """ Cancel a running build. """ self._state_machine.trigger(BuildEvent.CANCEL) def _on_enter_canceled_state(self, event): """ :param event: The Fysom event object :type event: BuildEvent """ self._logger.notice('Canceling build {}.', self._build_id) # Set the kill_event to kill the subprocesses for the build self.project_type.kill_subprocesses() # Deplete the unstarted subjob queue. # WIP(joey): Just remove this completely and adjust behavior of other methods based on self._is_canceled(). # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs while self._unstarted_subjobs is not None and not self._unstarted_subjobs.empty(): try: # A subjob may be asynchronously pulled from this queue, so we need to avoid blocking when empty. 
self._unstarted_subjobs.get(block=False) except Empty: break def validate_update_params(self, update_params): """ Determine if a dict of update params are valid, and generate an error if not :param update_params: Params passed into a PUT for this build :type update_params: dict [str, str] :return: Whether the params are valid and a response containing an error message if not :rtype: tuple [bool, dict [str, str]] """ keys_and_values_allowed = {'status': ['canceled']} message = None for key, value in update_params.items(): if key not in keys_and_values_allowed.keys(): message = 'Key ({}) is not in list of allowed keys ({})'.\ format(key, ",".join(keys_and_values_allowed.keys())) elif value not in keys_and_values_allowed[key]: message = 'Value ({}) is not in list of allowed values ({}) for {}'.\ format(value, keys_and_values_allowed[key], key) if message is not None: return False, {'error': message} return True, {} def update_state(self, update_params): """ Make updates to the state of this build given a set of update params :param update_params: The keys and values to update on this build :type update_params: dict [str, str] """ success = False for key, value in update_params.items(): if key == 'status': if value == 'canceled': self.cancel() success = True return success @property def project_type(self): """ :rtype: ProjectType """ return self._project_type @property def artifacts_zip_file(self): """Return the local path to the artifacts zip archive.""" return self._artifacts_zip_file @property def artifacts_tar_file(self): """ DEPRECATED: We are transitioning to zip files from tar.gz files for artifacts. Return the local path to the artifacts tar.gz archive. """ self._logger.warning('The tar format for build artifact files is deprecated. File: {}', self._artifacts_tar_file) return self._artifacts_tar_file # WIP(joey): Change some of these private @properties to methods. 
@property
def _num_subjobs_total(self):
    """Return the number of subjobs registered on this build."""
    return len(self._all_subjobs_by_id)


@property
def _num_subjobs_finished(self):
    """Return the size of the finished-subjob queue, or 0 when that queue has not been created."""
    return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()


@property
def _num_atoms(self):
    """Return the total number of atomic commands, or None unless the build is BUILDING or FINISHED."""
    # todo: blacklist states instead of whitelist, or just check _all_subjobs_by_id directly
    if self._status() not in [BuildState.BUILDING, BuildState.FINISHED]:
        return None
    return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())


def _all_subjobs_are_finished(self):
    """Return a truthy value when the finished-subjob queue exists and is full."""
    return self._finished_subjobs and self._finished_subjobs.full()


@property
def is_finished(self):
    """Whether the build was canceled or its postbuild tasks have completed."""
    # WIP(joey): Calling logic should check _is_canceled if it needs to instead of including the check here.
    return self.is_canceled or self._postbuild_tasks_are_finished


@property
def _detail_message(self):
    """Return a human-readable progress summary, or None when the build has no subjobs."""
    if self._num_subjobs_total > 0:
        return '{} of {} subjobs are complete ({:.1f}%).'.format(
            self._num_subjobs_finished,
            self._num_subjobs_total,
            100 * self._num_subjobs_finished / self._num_subjobs_total
        )
    return None


def _status(self):  # WIP(joey): Rename to _state.
    """
    :rtype: BuildState
    """
    return self._state_machine.state


@property
def has_error(self):
    """Whether the build is in the ERROR state."""
    return self._status() is BuildState.ERROR


@property
def is_canceled(self):
    """Whether the build is in the CANCELED state."""
    return self._status() is BuildState.CANCELED


@property
def is_stopped(self):
    """Whether the build is in the ERROR or CANCELED state."""
    return self._status() in (BuildState.ERROR, BuildState.CANCELED)


def _get_failed_atoms(self):
    """
    The atoms that failed. Returns None if the build hasn't completed yet. Returns empty list if
    build has completed and no atoms have failed (or if the build was canceled).

    The result is computed once from the build artifact and cached on the instance.

    :rtype: list[Atom] | None
    """
    if self._failed_atoms is None and self.is_finished:
        if self.is_canceled:
            return []

        # Build the whole list before caching it so a failed lookup cannot leave a partial result behind.
        self._failed_atoms = [
            self.subjob(subjob_id).atoms[atom_id]
            for subjob_id, atom_id in self._build_artifact.get_failed_subjob_and_atom_ids()
        ]

    return self._failed_atoms


def _result(self):
    """
    Can return three states:
        None: the build is not finished yet
        FAILURE: the build was canceled, or finished with at least one failed atom
        NO_FAILURES: the build finished and no atoms failed
    :rtype: BuildResult | None
    """
    if self.is_canceled:
        return BuildResult.FAILURE

    if self.is_finished:
        if len(self._build_artifact.get_failed_subjob_and_atom_ids()) == 0:
            return BuildResult.NO_FAILURES
        return BuildResult.FAILURE
    return None


def _perform_async_postbuild_tasks(self):
    """
    Once a build is complete, execute certain tasks like archiving the artifacts and writing timing data. This
    method also transitions the FSM to finished after the postbuild tasks are complete.

    Any exception raised along the way is counted, logged, and turned into a failed build rather than propagated.
    """
    try:
        timing_data = self._read_subjob_timings_from_results()
        self._create_build_artifact(timing_data)
        serialized_build_time_seconds.observe(sum(timing_data.values()))
        self._delete_temporary_build_artifact_files()
        self._postbuild_tasks_are_finished = True
        self._state_machine.trigger(BuildEvent.POSTBUILD_TASKS_COMPLETE)

        self._logger.notice('Completed build (id: {}), saving to database.'.format(self._build_id))
        self.save()
    except Exception as ex:  # pylint: disable=broad-except
        internal_errors.labels(ErrorType.PostBuildFailure).inc()  # pylint: disable=no-member
        self._logger.exception('Postbuild tasks failed for build {}.'.format(self._build_id))
        self.mark_failed('Postbuild tasks failed due to an internal error: "{}"'.format(ex))


def _create_build_artifact(self, timing_data: Dict[str, float]):  # pylint: disable=unsubscriptable-object
    """
    Generate the failures file and timing data, then archive the build results as both tar and zip files.

    A failure while creating the zip archive is counted and logged but otherwise ignored.

    :param timing_data: The timing data to write to this build's timing file
    """
    self._build_artifact = BuildArtifact(self._build_results_dir())
    self._build_artifact.generate_failures_file()
    self._build_artifact.write_timing_data(self._timing_file_path, timing_data)
    self._artifacts_tar_file = app.util.fs.tar_directory(self._build_results_dir(),
                                                         BuildArtifact.ARTIFACT_TARFILE_NAME)
    temp_dir = None
    temp_tar_path = None
    try:
        # Temporarily move aside tar file so we can create a zip file, then move it back.
        # This juggling can be removed once we're no longer creating tar artifacts.
        # A private directory from mkdtemp() is used as the parking spot because tempfile.mktemp() only
        # returns a name, which another process could claim before the move happens.
        temp_dir = tempfile.mkdtemp()
        temp_tar_path = shutil.move(self._artifacts_tar_file, temp_dir)
        self._artifacts_zip_file = app.util.fs.zip_directory(self._build_results_dir(),
                                                             BuildArtifact.ARTIFACT_ZIPFILE_NAME)
    except Exception:  # pylint: disable=broad-except
        internal_errors.labels(ErrorType.ZipFileCreationFailure).inc()  # pylint: disable=no-member
        # Due to issue #339 we are ignoring exceptions in the zip file creation for now.
        self._logger.exception('Zipping of artifacts failed. This error will be ignored.')
    finally:
        if temp_tar_path:
            shutil.move(temp_tar_path, self._artifacts_tar_file)
        # Only reached when the tar file was restored (or never moved), so the directory is disposable.
        if temp_dir:
            shutil.rmtree(temp_dir, ignore_errors=True)


def _delete_temporary_build_artifact_files(self):
    """
    Delete the temporary build result files that are no longer needed, due to the creation of the build
    artifact tarball.

    ONLY call this method after _create_build_artifact() has completed. Otherwise we have lost the build results.
    """
    build_result_dir = self._build_results_dir()
    archive_names = (BuildArtifact.ARTIFACT_TARFILE_NAME, BuildArtifact.ARTIFACT_ZIPFILE_NAME)
    start_time = time.time()
    for path in os.listdir(build_result_dir):
        # The build result archive is also stored in this same directory, so we must not delete it.
        if path in archive_names:
            continue
        full_path = os.path.join(build_result_dir, path)
        # Do NOT use app.util.fs.async_delete() here. That call will generate a temp directory for every
        # atom, which can be in the thousands per build, and can lead to running up against the ulimit -Hn.
        if os.path.isdir(full_path):
            shutil.rmtree(full_path, ignore_errors=True)
        else:
            # Keep cleanup best-effort, like the directory branch: a leftover file must not fail the build.
            try:
                os.remove(full_path)
            except OSError:
                self._logger.warning('Could not delete temporary build result file {}.', full_path)

    elapsed = time.time() - start_time
    self._logger.info('Completed deleting artifact files for {}, took {:.1f} seconds.', self._build_id, elapsed)


def _build_results_dir(self):
    """Return the artifact directory for this build under the configured results directory."""
    return BuildArtifact.build_artifact_directory(self.build_id(), result_root=Configuration['results_directory'])


def _generate_unique_symlink_path_for_build_repo(self):
    """
    Generate a unique symlink path for a build-specific repo. This method does NOT generate the symlink itself.
    :rtype: str
    """
    return os.path.join(Configuration['build_symlink_directory'], str(uuid.uuid4()))


# pylint: disable=protected-access
def save(self):
    """
    Serialize the Build object and update all of the parts to the database.

    The failed-artifact, failed-atom, subjob and atom rows for this build are deleted and re-created from the
    in-memory state; the build row itself is updated in place.

    :raises ItemNotFoundError: when no row exists for this build id
    """
    with Connection.get() as session:
        build_schema = session.query(BuildSchema).filter(BuildSchema.build_id == self._build_id).first()

        # If this wasn't found, it's safe to assume that the build doesn't exist within the database
        if build_schema is None:
            raise ItemNotFoundError('Unable to find build (id: {}) in database.'.format(self._build_id))

        build_schema.artifacts_tar_file = self._artifacts_tar_file
        build_schema.artifacts_zip_file = self._artifacts_zip_file
        build_schema.error_message = self._error_message
        build_schema.postbuild_tasks_are_finished = self._postbuild_tasks_are_finished
        build_schema.setup_failures = self.setup_failures
        build_schema.timing_file_path = self._timing_file_path

        build_artifact_dir = None
        if self._build_artifact is not None:
            build_artifact_dir = self._build_artifact.build_artifact_dir
        build_schema.build_artifact_dir = build_artifact_dir

        if self._build_artifact is not None:
            # Clear all old directories
            session.query(FailedArtifactDirectoriesSchema) \
                .filter(FailedArtifactDirectoriesSchema.build_id == self._build_id) \
                .delete()

            # Commit changes so we don't delete the newly added rows later
            session.commit()

            # Add all the updated versions of the directories
            for directory in self._build_artifact._get_failed_artifact_directories():
                session.add(FailedArtifactDirectoriesSchema(
                    build_id=self._build_id,
                    failed_artifact_directory=directory
                ))

            # Clear all old failed subjob/atom pairs
            session.query(FailedSubjobAtomPairsSchema) \
                .filter(FailedSubjobAtomPairsSchema.build_id == self._build_id) \
                .delete()

            # Commit changes so we don't delete the newly added rows later
            session.commit()

            # Add all the updated versions of the data
            for subjob_id, atom_id in self._build_artifact.get_failed_subjob_and_atom_ids():
                session.add(FailedSubjobAtomPairsSchema(
                    build_id=self._build_id,
                    subjob_id=subjob_id,
                    atom_id=atom_id
                ))

        build_schema.build_parameters = json.dumps(self._build_request.build_parameters())

        fsm_timestamps = {state.lower(): timestamp
                          for state, timestamp in self._state_machine.transition_timestamps.items()}
        build_schema.state = self._status()
        build_schema.queued_ts = fsm_timestamps['queued']
        build_schema.finished_ts = fsm_timestamps['finished']
        build_schema.prepared_ts = fsm_timestamps['prepared']
        build_schema.preparing_ts = fsm_timestamps['preparing']
        build_schema.error_ts = fsm_timestamps['error']
        build_schema.canceled_ts = fsm_timestamps['canceled']
        build_schema.building_ts = fsm_timestamps['building']

        # Subjobs
        # Clear all old Subjobs and Atoms
        session.query(SubjobsSchema) \
            .filter(SubjobsSchema.build_id == self._build_id) \
            .delete()
        session.query(AtomsSchema) \
            .filter(AtomsSchema.build_id == self._build_id) \
            .delete()

        # Commit changes so we don't delete the newly added rows later
        session.commit()

        # Add all the updated versions of Subjobs and Atoms
        for subjob_id, subjob in self._all_subjobs_by_id.items():
            session.add(SubjobsSchema(
                subjob_id=subjob_id,
                build_id=self._build_id,
                completed=subjob.completed
            ))

            # Atoms
            for atom in subjob._atoms:
                session.add(AtomsSchema(
                    atom_id=atom.id,
                    build_id=self._build_id,
                    subjob_id=subjob_id,
                    command_string=atom.command_string,
                    expected_time=atom.expected_time,
                    actual_time=atom.actual_time,
                    exit_code=atom.exit_code,
                    state=atom.state
                ))


@classmethod
def load_from_db(cls, build_id):
    """
    Given a build_id, fetch all the stored information from the database to reconstruct
    a Build object to represent that build.

    :param build_id: The id of the build to recreate.
    :return: The reconstructed build, or None when no build with that id is stored
    """
    with Connection.get() as session:
        build_schema = session.query(BuildSchema).filter(BuildSchema.build_id == build_id).first()

        # If a query returns None, then we know the build wasn't found in the database
        if not build_schema:
            return None

        failed_artifact_directories_schema = session.query(FailedArtifactDirectoriesSchema) \
            .filter(FailedArtifactDirectoriesSchema.build_id == build_id) \
            .all()
        failed_subjob_atom_pairs_schema = session.query(FailedSubjobAtomPairsSchema) \
            .filter(FailedSubjobAtomPairsSchema.build_id == build_id) \
            .all()
        atoms_schema = session.query(AtomsSchema).filter(AtomsSchema.build_id == build_id).all()
        subjobs_schema = session.query(SubjobsSchema).filter(SubjobsSchema.build_id == build_id).all()

        build_parameters = json.loads(build_schema.build_parameters)

        # Generate a BuildRequest object with our query response
        build_request = BuildRequest(build_parameters)

        # Create initial Build object, we will be altering the state of this as we get more data
        build = cls(build_request)
        build._build_id = build_id

        # Manually generate ProjectType object for build and create a `job_config` since this is usually done
        # in `prepare()`
        build.generate_project_type()
        job_config = build.project_type.job_config()

        # Manually update build data
        build._artifacts_tar_file = build_schema.artifacts_tar_file
        build._artifacts_zip_file = build_schema.artifacts_zip_file
        build._error_message = build_schema.error_message
        build._postbuild_tasks_are_finished = bool(int(build_schema.postbuild_tasks_are_finished))
        build.setup_failures = build_schema.setup_failures
        build._timing_file_path = build_schema.timing_file_path

        # Manually set the state machine timestamps
        build._state_machine._transition_timestamps = {
            BuildState.QUEUED: build_schema.queued_ts,
            BuildState.FINISHED: build_schema.finished_ts,
            BuildState.PREPARED: build_schema.prepared_ts,
            BuildState.PREPARING: build_schema.preparing_ts,
            BuildState.ERROR: build_schema.error_ts,
            BuildState.CANCELED: build_schema.canceled_ts,
            BuildState.BUILDING: build_schema.building_ts
        }
        build._state_machine._fsm.current = BuildState[build_schema.state]

        build_artifact = BuildArtifact(build_schema.build_artifact_dir)
        build_artifact._failed_artifact_directories = [
            row.failed_artifact_directory for row in failed_artifact_directories_schema
        ]
        build_artifact._q_failed_subjob_atom_pairs = [
            (row.subjob_id, row.atom_id) for row in failed_subjob_atom_pairs_schema
        ]
        build._build_artifact = build_artifact

        atoms_by_subjob_id = {}
        for atom in atoms_schema:
            atoms_by_subjob_id.setdefault(atom.subjob_id, []).append(Atom(
                atom.command_string,
                atom.expected_time,
                atom.actual_time,
                atom.exit_code,
                atom.state,
                atom.atom_id,
                atom.subjob_id
            ))

        subjobs = OrderedDict()
        for subjob in subjobs_schema:
            # A subjob row without any atom rows is restored with an empty atom list.
            atoms = atoms_by_subjob_id.get(subjob.subjob_id, [])
            # Add atoms after subjob is created so we don't alter their state on initialization
            subjob_to_add = Subjob(build_id, subjob.subjob_id, build.project_type, job_config, [])
            subjob_to_add._atoms = atoms
            subjob_to_add.completed = subjob.completed
            subjobs[subjob.subjob_id] = subjob_to_add
        build._all_subjobs_by_id = subjobs

        # Place subjobs into correct queues within the build
        build._unstarted_subjobs = Queue(maxsize=len(subjobs))
        build._finished_subjobs = Queue(maxsize=len(subjobs))
        for subjob in subjobs.values():
            if subjob.completed:
                build._finished_subjobs.put(subjob)
            else:
                build._unstarted_subjobs.put(subjob)

        return build
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs

    :type _build_id: int
    :type _build_request: BuildRequest
    :type _build_artifact: None | BuildArtifact
    :type _error_message: None | str
    :type _project_type: None | ProjectType
    :type _timing_file_path: None | str
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self._build_request = build_request
        self._artifacts_archive_file = None
        self._build_artifact = None
        self._error_message = None
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once

        self._project_type = None
        self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish

        self._all_subjobs_by_id = {}
        self._unstarted_subjobs = None  # WIP(joey): Move subjob queues to BuildScheduler class.
        self._finished_subjobs = None
        self._failed_atoms = None
        self._postbuild_tasks_are_finished = False  # WIP(joey): Remove and use build state.
        self._timing_file_path = None

        self._state_machine = BuildFsm(
            build_id=self._build_id,
            enter_state_callbacks={
                BuildState.ERROR: self._on_enter_error_state,
                BuildState.CANCELED: self._on_enter_canceled_state,
            }
        )

    def api_representation(self):
        """
        Return the dict describing this build in API responses.

        :rtype: dict
        """
        failed_atoms = self._get_failed_atoms()
        failed_atoms_api_representation = None
        if failed_atoms is not None:
            failed_atoms_api_representation = [failed_atom.api_representation() for failed_atom in failed_atoms]

        build_state = self._status()
        # todo: PREPARING/PREPARED are new states -- make sure clients can handle them before exposing.
        if build_state in (BuildState.PREPARING, BuildState.PREPARED):
            build_state = BuildState.QUEUED

        # Sort the timestamps by numerically increasing value, with None values sorting highest.
        state_timestamps = sorted(
            [(state.lower(), timestamp)
             for state, timestamp in self._state_machine.transition_timestamps.items()],
            key=lambda item: item[1] or float('inf'))

        return {
            'id': self._build_id,
            'status': build_state,
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': failed_atoms_api_representation,
            'result': self._result(),
            'request_params': self.build_request.build_parameters(),
            # Convert self._state_timestamps to OrderedDict to make raw API response more readable.
            'state_timestamps': OrderedDict(state_timestamps),
        }

    def generate_project_type(self):
        """
        Instantiate the project type for this build, populating the self._project_type instance variable.

        As a side effect, this method also updates the build request's build_parameters dictionary
        with the unique workspace directory path for this build.

        :raises BuildProjectError when failed to instantiate project type
        """
        # Generate a unique project build directory name that will be symlinked to the actual project directory
        # later on when the project gets fetched.
        build_specific_project_directory = self._generate_unique_symlink_path_for_build_repo()

        # Because build_specific_project_directory is entirely internal and generated by ClusterRunner (it is a
        # build-unique generated symlink), we must manually add it to the project_type_params
        project_type_params = self.build_request.build_parameters()
        project_type_params.update({'build_project_directory': build_specific_project_directory})
        self._project_type = util.create_project_type(project_type_params)
        if self._project_type is None:
            raise BuildProjectError('Build failed due to an invalid project type.')

    def prepare(self, subjob_calculator):
        """
        Fetch the project, compute this build's subjobs and queue them as unstarted.

        :param subjob_calculator: Used after project fetch to atomize and group subjobs for this build
        :type subjob_calculator: SubjobCalculator
        :raises RuntimeError: when the request or project type is missing, when called more than once, or when
            the job config could not be parsed
        """
        if not isinstance(self.build_request, BuildRequest):
            raise RuntimeError('Build {} has no associated request object.'.format(self._build_id))

        if not isinstance(self.project_type, ProjectType):
            raise RuntimeError('Build {} has no project set.'.format(self._build_id))

        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id))

        self._state_machine.trigger(BuildEvent.START_PREPARE)
        # WIP(joey): Move the following code into a PREPARING state callback
        # (so that it won't execute if the build has already been canceled.)

        self._logger.info('Fetching project for build {}.', self._build_id)
        self.project_type.fetch_project()
        self._logger.info('Successfully fetched project for build {}.', self._build_id)

        job_config = self.project_type.job_config()
        if job_config is None:
            raise RuntimeError('Build failed while trying to parse clusterrunner.yaml.')

        subjobs = subjob_calculator.compute_subjobs_for_build(self._build_id, job_config, self.project_type)

        self._unstarted_subjobs = Queue(maxsize=len(subjobs))  # WIP(joey): Move this into BuildScheduler?
        self._finished_subjobs = Queue(maxsize=len(subjobs))  # WIP(joey): Remove this and just record finished count.

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._timing_file_path = self._project_type.timing_file_path(job_config.name)
        app.util.fs.create_dir(self._build_results_dir())
        self._state_machine.trigger(BuildEvent.FINISH_PREPARE)

    def build_id(self):
        """
        :rtype: int
        """
        return self._build_id

    @property
    def build_request(self):
        """
        :rtype: BuildRequest
        """
        return self._build_request

    def all_subjobs(self):
        """
        Returns a list of subjobs for this build
        :rtype: list[Subjob]
        """
        return list(self._all_subjobs_by_id.values())

    def subjob(self, subjob_id):
        """
        Returns a single subjob
        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: when no subjob with the given id belongs to this build
        """
        subjob = self._all_subjobs_by_id.get(subjob_id)
        if subjob is None:
            raise ItemNotFoundError('Invalid subjob id.')
        return subjob

    def complete_subjob(self, subjob_id, payload=None):
        """
        Handle the subjob payload and mark the given subjob id for this build as complete.

        Any error marks the whole build as failed before being re-raised.

        :type subjob_id: int
        :type payload: dict
        """
        try:
            self._handle_subjob_payload(subjob_id, payload)
            self._mark_subjob_complete(subjob_id)

        except Exception:
            self._logger.exception('Error while completing subjob; marking build as failed.')
            self.mark_failed('Error occurred while completing subjob {}.'.format(subjob_id))
            raise

    def _parse_payload_for_atom_exit_code(self, subjob_id):
        """
        Read the exit code file from each atom's artifact directory and store the value on the atom.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        for atom_id, atom in enumerate(subjob.atoms):
            artifact_dir = BuildArtifact.atom_artifact_directory(
                self.build_id(),
                subjob.subjob_id(),
                atom_id,
                result_root=Configuration['results_directory']
            )
            atom_exit_code_file_sys_path = os.path.join(artifact_dir, BuildArtifact.EXIT_CODE_FILE)
            with open(atom_exit_code_file_sys_path, 'r') as atom_exit_code_file:
                atom.exit_code = int(atom_exit_code_file.read())

    def _handle_subjob_payload(self, subjob_id, payload):
        """
        Write the payload archive into the build results directory, extract it, and record atom exit codes.

        A missing or empty payload is logged and otherwise ignored.

        :type subjob_id: int
        :param payload: dict read for its 'filename' and 'body' entries
        :type payload: dict | None
        """
        if not payload:
            self._logger.warning('No payload for subjob {} of build {}.', subjob_id, self._build_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        result_file_path = os.path.join(self._build_results_dir(), payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
            self._parse_payload_for_atom_exit_code(subjob_id)
        except Exception:
            self._logger.warning('Writing payload for subjob {} of build {} FAILED.', subjob_id, self._build_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs
        :rtype: dict [str, float]
        """
        timings = {}
        for subjob in self._all_subjobs_by_id.values():
            timings.update(subjob.read_timings())

        return timings

    def _mark_subjob_complete(self, subjob_id):
        """
        Record the subjob as finished and start the postbuild tasks once every subjob has finished.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        subjob.mark_completed()
        with self._build_completion_lock:
            self._finished_subjobs.put(subjob, block=False)
            should_trigger_postbuild_tasks = self._all_subjobs_are_finished() and not self._is_stopped()

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if should_trigger_postbuild_tasks:
            self._logger.info("All results received for build {}!", self._build_id)
            SafeThread(target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id)).start()

    def mark_started(self):
        """
        Mark the build as started.
        """
        self._state_machine.trigger(BuildEvent.START_BUILDING)

    def finish(self):
        """
        Perform postbuild task and mark this build as finished.
        """
        # This method also transitions the FSM to finished after the postbuild tasks are complete.
        self._perform_async_postbuild_tasks()

    def mark_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. The failure reason should be something we can present to
        the end user of ClusterRunner, so try not to include detailed references to internal implementation.

        :type failure_reason: str
        """
        self._state_machine.trigger(BuildEvent.FAIL, error_msg=failure_reason)

    def _on_enter_error_state(self, event):
        """
        Store an error message for the build and log the failure. This method is triggered by
        a state machine transition to the ERROR state.

        :param event: The Fysom event object
        """
        # WIP(joey): Should this be a reenter_state callback also? Should it check for previous error message?
        default_error_msg = 'An unspecified error occurred.'
        self._error_message = getattr(event, 'error_msg', default_error_msg)
        self._logger.warning('Build {} failed: {}', self.build_id(), self._error_message)

    def cancel(self):
        """
        Cancel a running build.
        """
        self._logger.notice('Request received to cancel build {}.', self._build_id)
        self._state_machine.trigger(BuildEvent.CANCEL)

    def _on_enter_canceled_state(self, event):
        """
        Drain the queue of subjobs that have not started yet.

        :param event: The Fysom event object (unused)
        """
        # Deplete the unstarted subjob queue.
        # WIP(joey): Just remove this completely and adjust behavior of other methods based on self._is_canceled().
        # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs
        while self._unstarted_subjobs is not None and not self._unstarted_subjobs.empty():
            try:
                # A subjob may be asynchronously pulled from this queue, so we need to avoid blocking when empty.
                self._unstarted_subjobs.get(block=False)
            except Empty:
                break

    def validate_update_params(self, update_params):
        """
        Determine if a dict of update params are valid, and generate an error if not

        Every key/value pair is checked; when several pairs are invalid, the message for the last invalid pair
        (in iteration order) is the one returned.

        :param update_params: Params passed into a PUT for this build
        :type update_params: dict [str, str]
        :return: Whether the params are valid and a response containing an error message if not
        :rtype: tuple [bool, dict [str, str]]
        """
        keys_and_values_allowed = {'status': ['canceled']}
        message = None
        for key, value in update_params.items():
            if key not in keys_and_values_allowed:
                message = 'Key ({}) is not in list of allowed keys ({})'.format(
                    key, ",".join(keys_and_values_allowed))
            elif value not in keys_and_values_allowed[key]:
                message = 'Value ({}) is not in list of allowed values ({}) for {}'.format(
                    value, keys_and_values_allowed[key], key)

        if message is not None:
            return False, {'error': message}
        return True, {}

    def update_state(self, update_params):
        """
        Make updates to the state of this build given a set of update params

        Only the pair status=canceled is acted upon (by canceling the build); every other pair is ignored.

        :param update_params: The keys and values to update on this build
        :type update_params: dict [str, str]
        :return: Whether at least one update was applied
        :rtype: bool
        """
        success = False
        for key, value in update_params.items():
            if key == 'status' and value == 'canceled':
                self.cancel()
                success = True
        return success

    @property
    def project_type(self):
        """
        :rtype: ProjectType
        """
        return self._project_type

    @property
    def artifacts_archive_file(self):
        """Return the path of the artifacts archive, or None before it has been created."""
        return self._artifacts_archive_file

    # WIP(joey): Change some of these private @properties to methods.
    @property
    def _num_subjobs_total(self):
        """Return the number of subjobs registered on this build."""
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        """Return the size of the finished-subjob queue, or 0 when that queue has not been created."""
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        """Return the total number of atomic commands, or None unless the build is BUILDING or FINISHED."""
        # todo: blacklist states instead of whitelist, or just check _all_subjobs_by_id directly
        if self._status() not in [BuildState.BUILDING, BuildState.FINISHED]:
            return None
        return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())

    def _all_subjobs_are_finished(self):
        """Return a truthy value when the finished-subjob queue exists and is full."""
        return self._finished_subjobs and self._finished_subjobs.full()

    @property
    def is_finished(self):
        """Whether the build was canceled or its postbuild tasks have completed."""
        # WIP(joey): Calling logic should check _is_canceled if it needs to instead of including the check here.
        return self._is_canceled() or self._postbuild_tasks_are_finished

    @property
    def _detail_message(self):
        """Return a human-readable progress summary, or None when the build has no subjobs."""
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished,
                self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total
            )
        return None

    def _status(self):  # WIP(joey): Rename to _state.
        """
        :rtype: BuildState
        """
        return self._state_machine.state

    @property
    def has_error(self):
        """Whether the build is in the ERROR state."""
        return self._status() is BuildState.ERROR

    def _is_canceled(self):
        """Return whether the build is in the CANCELED state."""
        return self._status() is BuildState.CANCELED

    def _is_stopped(self):
        """Return whether the build is in the ERROR or CANCELED state."""
        return self._status() in (BuildState.ERROR, BuildState.CANCELED)

    def _get_failed_atoms(self):
        """
        The atoms that failed. Returns None if the build hasn't completed yet. Returns empty list if
        build has completed and no atoms have failed (or if the build was canceled).

        The result is computed once from the build artifact and cached on the instance.

        :rtype: list[Atom] | None
        """
        if self._failed_atoms is None and self.is_finished:
            if self._is_canceled():
                return []

            # Build the whole list before caching it so a failed lookup cannot leave a partial result behind.
            self._failed_atoms = [
                self.subjob(subjob_id).atoms[atom_id]
                for subjob_id, atom_id in self._build_artifact.get_failed_subjob_and_atom_ids()
            ]

        return self._failed_atoms

    def _result(self):
        """
        Can return three states:
            None: the build is not finished yet
            FAILURE: the build was canceled, or finished with at least one failed atom
            NO_FAILURES: the build finished and no atoms failed
        :rtype: BuildResult | None
        """
        if self._is_canceled():
            return BuildResult.FAILURE

        if self.is_finished:
            if len(self._build_artifact.get_failed_subjob_and_atom_ids()) == 0:
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, certain tasks can be performed asynchronously.

        Creates the build artifact, deletes the temporary result files, flags the postbuild tasks as finished,
        and triggers the POSTBUILD_TASKS_COMPLETE event on the state machine.
        """
        self._create_build_artifact()
        self._delete_temporary_build_artifact_files()
        self._postbuild_tasks_are_finished = True
        self._state_machine.trigger(BuildEvent.POSTBUILD_TASKS_COMPLETE)

    def _create_build_artifact(self):
        """Generate the failures file and timing data, then compress the build results into the archive file."""
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(self._timing_file_path, self._read_subjob_timings_from_results())
        self._artifacts_archive_file = app.util.fs.compress_directory(self._build_results_dir(),
                                                                      BuildArtifact.ARTIFACT_FILE_NAME)

    def _delete_temporary_build_artifact_files(self):
        """
        Delete the temporary build result files that are no longer needed, due to the creation of the build
        artifact tarball.

        ONLY call this method after _create_build_artifact() has completed. Otherwise we have lost the build results.
        """
        build_result_dir = self._build_results_dir()
        start_time = time.time()
        for path in os.listdir(build_result_dir):
            # The build result tar-ball is also stored in this same directory, so we must not delete it.
            if path == BuildArtifact.ARTIFACT_FILE_NAME:
                continue
            full_path = os.path.join(build_result_dir, path)
            # Do NOT use app.util.fs.async_delete() here. That call will generate a temp directory for every
            # atom, which can be in the thousands per build, and can lead to running up against the ulimit -Hn.
            if os.path.isdir(full_path):
                shutil.rmtree(full_path, ignore_errors=True)
            else:
                # Keep cleanup best-effort, like the directory branch: a leftover file must not fail the build.
                try:
                    os.remove(full_path)
                except OSError:
                    self._logger.warning('Could not delete temporary build result file {}.', full_path)

        elapsed = time.time() - start_time
        self._logger.info('Completed deleting artifact files for {}, took {:.1f} seconds.', self._build_id, elapsed)

    def _build_results_dir(self):
        """Return the artifact directory for this build under the configured results directory."""
        return BuildArtifact.build_artifact_directory(self.build_id(), result_root=Configuration['results_directory'])

    def _generate_unique_symlink_path_for_build_repo(self):
        """
        Generate a unique symlink path for a build-specific repo. This method does NOT generate the symlink itself.
        :rtype: str
        """
        return os.path.join(Configuration['build_symlink_directory'], str(uuid.uuid4()))