def function_with_retries(*args, **kwargs):
    """
    Call the wrapped function, retrying when it raises one of the configured exception types.

    `function`, `exceptions`, `num_attempts` and `retry_delay` are free variables provided by an enclosing scope
    that is not part of this block.

    :return: the value returned by the first call to the wrapped function that does not raise
    """
    last_attempt_index = num_attempts - 1
    for attempt_index in range(num_attempts):
        try:
            return function(*args, **kwargs)
        except exceptions as ex:
            if attempt_index == last_attempt_index:
                raise  # final attempt failed
            retry_logger = log.get_logger(__name__)
            retry_logger.warning(
                'Call to {} raised {}("{}"). Retrying in {} seconds.',
                function.__qualname__,
                type(ex).__name__,
                ex,
                retry_delay,
            )
            # Wait before the next attempt.
            time.sleep(retry_delay)
def __init__(self, build_request):
    """
    Initialize the build's bookkeeping attributes and record the QUEUED timestamp.

    :type build_request: BuildRequest
    """
    self._logger = get_logger(__name__)
    self.build_request = build_request
    self._build_id = self._build_id_counter.increment()

    # Results and error reporting.
    self._artifacts_archive_file = None
    self._build_artifact = None  # type: BuildArtifact
    self._error_message = None
    self._timing_file_path = None

    # Preparation / lifecycle flags.
    self.is_prepared = False
    self._setup_is_started = False
    self._is_canceled = False
    self._postbuild_tasks_are_finished = False
    self._project_type = None

    # Guards against separate threads calling prepare() more than once.
    self._preparation_coin = SingleUseCoin()
    # Guards against more than one thread detecting the build's finish.
    self._build_completion_lock = Lock()

    # Subjob tracking. WIP: Move subjob queues to BuildScheduler class.
    self._all_subjobs_by_id = {}
    self._unstarted_subjobs = None
    self._finished_subjobs = None
    self._failed_atoms = None

    # Every status starts with no timestamp; QUEUED is stamped immediately.
    self._state_timestamps = dict.fromkeys(BuildStatus)
    self._record_state_timestamp(BuildStatus.QUEUED)
def __init__(self, build_request):
    """
    Initialize the build's bookkeeping attributes and record the QUEUED timestamp.

    :type build_request: BuildRequest
    """
    self._logger = get_logger(__name__)
    self.build_request = build_request
    self._build_id = self._build_id_counter.increment()

    # Results and error reporting.
    self._artifacts_archive_file = None
    self._build_artifact = None  # type: BuildArtifact
    self._error_message = None
    self._timing_file_path = None

    # Preparation / lifecycle flags.
    self.is_prepared = False
    self._setup_is_started = False
    self._is_canceled = False
    self._postbuild_tasks_are_finished = False
    self._project_type = None

    # Guards against separate threads calling prepare() more than once.
    self._preparation_coin = SingleUseCoin()
    # Guards against more than one thread detecting the build's finish.
    self._build_completion_lock = Lock()

    # Subjob tracking. WIP: Move subjob queues to BuildScheduler class.
    self._all_subjobs_by_id = {}
    self._unstarted_subjobs = None
    self._finished_subjobs = None
    self._failed_atoms = None

    # Every status starts with no timestamp; QUEUED is stamped immediately.
    self._state_timestamps = dict.fromkeys(BuildStatus)
    self._record_state_timestamp(BuildStatus.QUEUED)
def __init__(self):
    """
    Set up the master's in-memory state, reset the results directory, and start the two worker threads.

    The worker threads are started last, after every attribute has been assigned and the results directory has
    been recreated, so that neither loop can run against a partially initialized instance.
    """
    self._logger = get_logger(__name__)
    self._all_slaves_by_url = {}
    self._all_builds_by_id = OrderedDict()  # This is an OrderedDict so we can more easily implement get_queue()
    self._builds_waiting_for_slaves = Queue()
    self._request_queue = Queue()
    self._request_handler = SerialRequestHandler()
    self._master_results_path = Configuration['results_directory']

    # It's important that idle slaves are only in the queue once so we use OrderedSet
    self._idle_slaves = OrderedSetQueue()

    # Delete all old builds when master starts. Remove this if/when build numbers are unique across master
    # starts/stops
    if os.path.exists(self._master_results_path):
        shutil.rmtree(self._master_results_path)
    fs.create_dir(self._master_results_path)

    # Create and start the background loops only now that all state they may touch exists.
    self._request_queue_worker_thread = SafeThread(
        target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
    self._slave_allocation_worker_thread = SafeThread(
        target=self._slave_allocation_loop, name='SlaveAllocationLoop', daemon=True)
    self._request_queue_worker_thread.start()
    self._slave_allocation_worker_thread.start()
def log_app_debug_info_and_force_kill_after_delay():
    """
    Sleep for `seconds`, log the application debug info, then terminate the process with os._exit(1).

    `seconds` is a free variable provided by an enclosing scope that is not part of this block.
    """
    time.sleep(seconds)
    hang_logger = log.get_logger(__name__)
    debug_info = app_info.get_app_info_string()
    hang_logger.error('ClusterRunner did not exit within {} seconds. App debug info:\n\n{}.', seconds, debug_info)
    hang_logger.critical('ClusterRunner seems to be hanging unexpectedly. Hard killing the process. Farewell!')
    os._exit(1)
def log_app_debug_info_and_force_kill_after_delay():
    """
    Sleep for `seconds`, log the application debug info, then send SIGKILL to the current process.

    `seconds` is a free variable provided by an enclosing scope that is not part of this block.
    """
    time.sleep(seconds)
    hang_logger = log.get_logger(__name__)
    debug_info = app_info.get_app_info_string()
    hang_logger.error('ClusterRunner did not exit within {} seconds. App debug info:\n\n{}.', seconds, debug_info)
    hang_logger.critical('ClusterRunner seems to be hanging unexpectedly. Sending SIGKILL to self. Farewell!')
    own_pid = os.getpid()
    os.kill(own_pid, signal.SIGKILL)
def __init__(self, port, host, num_executors=10):
    """
    Initialize the slave's connection settings and create its pool of idle executors.

    :param port: The port number the slave service is running on
    :type port: int
    :param host: The hostname at which the slave is reachable
    :type host: str
    :param num_executors: The number of executors this slave should operate with -- this determines how many
        concurrent subjobs the slave can execute.
    :type num_executors: int
    """
    self.port = port
    self.host = host
    self.is_alive = True
    self._slave_id = None
    self._num_executors = num_executors
    self._logger = log.get_logger(__name__)

    # Create one executor per slot; each is indexed by id and also placed in the idle queue.
    self._idle_executors = Queue(maxsize=num_executors)
    all_executors = [SubjobExecutor(index) for index in range(num_executors)]
    self.executors_by_id = dict(enumerate(all_executors))
    for idle_executor in all_executors:
        self._idle_executors.put(idle_executor)

    self._master_url = None
    self._network = Network(min_connection_poolsize=num_executors)
    self._master_api = None  # wait until we connect to a master first
    self._project_type = None  # this will be instantiated during build setup
    self._current_build_id = None
    self._build_teardown_coin = None
def __init__(self, port, host, num_executors=10):
    """
    Initialize the slave's connection settings and create its pool of idle executors.

    :param port: The port number the slave service is running on
    :type port: int
    :param host: The hostname at which the slave is reachable
    :type host: str
    :param num_executors: The number of executors this slave should operate with -- this determines how many
        concurrent subjobs the slave can execute.
    :type num_executors: int
    """
    self.port = port
    self.host = host
    self.is_alive = True
    self._slave_id = None
    self._num_executors = num_executors
    self._logger = log.get_logger(__name__)

    # Create one executor per slot; each is indexed by id and also placed in the idle queue.
    self._idle_executors = Queue(maxsize=num_executors)
    all_executors = [SubjobExecutor(index) for index in range(num_executors)]
    self.executors_by_id = dict(enumerate(all_executors))
    for idle_executor in all_executors:
        self._idle_executors.put(idle_executor)

    self._master_url = None
    self._network = Network(min_connection_poolsize=num_executors)
    self._master_api = None  # wait until we connect to a master first
    self._project_type = None  # this will be instantiated during build setup
    self._current_build_id = None
    self._build_teardown_coin = None
    self._base_executor_index = None
def __init__(self, atomizer_dicts):
    """
    Store the atomizer definitions and obtain a module logger.

    :param atomizer_dicts: A list of dicts mapping atomizer env var names to atomizer commands
    :type atomizer_dicts: list[dict[str, str]]
    """
    self._atomizer_dicts = atomizer_dicts
    self._logger = log.get_logger(__name__)
def __init__(self):
    """Create the request queues, the (not yet started) request-handling thread, and the preparation lock map."""
    self._logger = get_logger(__name__)
    self._project_preparation_locks = {}
    self._request_queue = Queue()
    self._builds_waiting_for_slaves = Queue()
    # The thread is only constructed here; nothing in this method starts it.
    worker_thread = SafeThread(target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
    self._request_queue_worker_thread = worker_thread
def __init__(self):
    """
    Wire up the master's collaborators, start the request handler and slave allocator, reset the results
    directory, and configure heartbeat tracking and slave metrics collection.
    """
    self._logger = get_logger(__name__)
    self._master_results_path = Configuration['results_directory']
    self._slave_registry = SlaveRegistry.singleton()

    scheduler_pool = BuildSchedulerPool()
    self._scheduler_pool = scheduler_pool
    self._build_request_handler = BuildRequestHandler(scheduler_pool)
    self._build_request_handler.start()
    self._slave_allocator = SlaveAllocator(scheduler_pool)
    self._slave_allocator.start()

    # Rule of thumb for sizing a thread pool: threads per core multiplied by the number of physical cores
    # (e.g., 10 cores, 2 sockets and 2 threads per core gives a max of 40).
    #
    # These threads are currently used for incrementing/decrementing slave executor counts (lock acquisition)
    # and for tearing down slaves (network IO), so 32 should be plenty. Under heavy load the bottleneck will be
    # the number of executors rather than lock/unlock time or the number of teardown requests. Tweak this number
    # if you find that is not the case.
    self._thread_pool_executor = ThreadPoolExecutor(max_workers=32)

    # Asynchronously delete (but immediately rename) all old builds when master starts.
    # Remove this if/when build numbers are unique across master starts/stops
    results_path = self._master_results_path
    if os.path.exists(results_path):
        fs.async_delete(results_path)
    fs.create_dir(results_path)

    # Configure heartbeat tracking
    self._unresponsive_slaves_cleanup_interval = Configuration['unresponsive_slaves_cleanup_interval']
    self._hb_scheduler = sched.scheduler()

    # The collector is handed a callable so the registry is queried each time the callable is invoked.
    SlavesCollector.register_slaves_metrics_collector(lambda: self._slave_registry.get_all_slaves_by_id().values())
def __init__(self, port, host, num_executors=10):
    """
    Initialize the slave's connection settings, create its pool of idle executors, and register a teardown
    callback with the unhandled exception handler.

    :param port: The port number the slave service is running on
    :type port: int
    :param host: The hostname at which the slave is reachable
    :type host: str
    :param num_executors: The number of executors this slave should operate with -- this determines how many
        concurrent subjobs the slave can execute.
    :type num_executors: int
    """
    self.port = port
    self.host = host
    self._slave_id = None
    self._num_executors = num_executors
    self._logger = log.get_logger(__name__)

    # Create one executor per slot; each is indexed by id and also placed in the idle queue.
    self._idle_executors = Queue(maxsize=num_executors)
    all_executors = [SubjobExecutor(index) for index in range(num_executors)]
    self.executors = dict(enumerate(all_executors))
    for idle_executor in all_executors:
        self._idle_executors.put(idle_executor)

    self._setup_complete_event = Event()
    self._master_url = None
    self._network = Network(min_connection_poolsize=num_executors)
    self._master_api = None  # wait until we connect to a master first
    self._project_type = None  # this will be instantiated during build setup
    self._current_build_id = None

    exception_handler = UnhandledExceptionHandler.singleton()
    exception_handler.add_teardown_callback(self._async_teardown_build, should_disconnect_from_master=True)
def __init__(self, build_request):
    """
    Initialize the build's bookkeeping attributes and its state machine.

    :type build_request: BuildRequest
    """
    self._logger = get_logger(__name__)
    self._build_request = build_request
    self._build_id = self._build_id_counter.increment()

    # Results and error reporting.
    self._artifacts_archive_file = None
    self._build_artifact = None
    self._error_message = None
    self._timing_file_path = None
    self._project_type = None

    # Guards against separate threads calling prepare() more than once.
    self._preparation_coin = SingleUseCoin()
    # Guards against more than one thread detecting the build's finish.
    self._build_completion_lock = Lock()

    # Subjob tracking. WIP(joey): Move subjob queues to BuildScheduler class.
    self._all_subjobs_by_id = {}
    self._unstarted_subjobs = None
    self._finished_subjobs = None
    self._failed_atoms = None
    self._postbuild_tasks_are_finished = False  # WIP(joey): Remove and use build state.

    # Callbacks the state machine is given for the ERROR and CANCELED states.
    callbacks_by_state = {
        BuildState.ERROR: self._on_enter_error_state,
        BuildState.CANCELED: self._on_enter_canceled_state,
    }
    self._state_machine = BuildFsm(build_id=self._build_id, enter_state_callbacks=callbacks_by_state)
def __init__(self, build_request):
    """
    Initialize the build's bookkeeping attributes.

    :type build_request: BuildRequest
    """
    self._logger = get_logger(__name__)
    self._build_id = self._build_id_counter.increment()
    self.build_request = build_request
    self._artifacts_archive_file = None
    self._build_artifact = None  # type: BuildArtifact
    self._error_message = None
    self.is_prepared = False
    self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once
    self._project_type = None

    # Slave / executor allocation counters. No executor limit applies until _max_executors is lowered.
    self._num_slaves_in_use = 0
    self._num_allocated_executors = 0
    self._max_executors = float('inf')

    # Created exactly once (a second, redundant Lock() assignment used to follow the executor counters).
    self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish

    self._all_subjobs_by_id = {}
    self._unstarted_subjobs = None
    self._finished_subjobs = None
    self._postbuild_tasks_are_finished = False
    self._teardowns_finished = False
def __init__(self, build_id, subjob_id, project_type, job_config, atoms):
    """
    Initialize the subjob, tag its atoms with the subjob id, and mark them NOT_STARTED.

    :param build_id:
    :type build_id: int
    :param subjob_id:
    :type subjob_id: int
    :param project_type:
    :type project_type: ProjectType
    :param job_config: the job's configuration from clusterrunner.yaml
    :type job_config: JobConfig
    :param atoms: the atom project_type strings
    :type atoms: list[app.master.atom.Atom]
    :return:
    """
    self._logger = get_logger(__name__)

    # Identity and configuration.
    self._build_id = build_id
    self._subjob_id = subjob_id
    self.job_config = job_config
    self._project_type = project_type  # todo: Unused; remove.

    # Atoms owned by this subjob.
    self._atoms = atoms
    self._set_atoms_subjob_id(atoms, subjob_id)
    self._set_atom_state(AtomState.NOT_STARTED)

    # a dict, atom_ids are the keys and seconds are the values
    self.timings = {}
    # The slave that had been assigned this subjob. Is None if not started.
    self.slave = None
def _grouped_atoms(atoms, max_executors, timing_file_path, project_directory):
    """
    Group the atoms into subjob-sized lists.

    When the timing file exists, parses as JSON, and is non-empty, the grouping is delegated to
    TimeBasedAtomGrouper; in every other case the default AtomGrouper is used.

    :param atoms: all of the atoms to be run this time
    :type atoms: list[app.master.atom.Atom]
    :param max_executors: the maximum number of executors for this build
    :type max_executors: int
    :param timing_file_path: path to where the timing data file would be stored (if it exists) for this job
    :type timing_file_path: str
    :type project_directory: str
    :return: the grouped atoms (in the form of list of lists of strings)
    :rtype: list[list[app.master.atom.Atom]]
    """
    timing_data = None
    if os.path.isfile(timing_file_path):
        with open(timing_file_path, 'r') as timing_file:
            try:
                timing_data = json.load(timing_file)
            except ValueError:
                # An unparseable file is treated the same as a missing one.
                log.get_logger(__name__).warning(
                    'Failed to load timing data from file that exists {}', timing_file_path)

    if timing_data is None or len(timing_data) == 0:
        return AtomGrouper(atoms, max_executors).groupings()

    time_based_grouper = TimeBasedAtomGrouper(atoms, max_executors, timing_data, project_directory)
    return time_based_grouper.groupings()
def __init__(self, url, build_project_directory='', project_directory='', remote='origin', branch='master',
             config=None, job_name=None, remote_files=None, atoms_override=None):
    """
    Note: the first line of each parameter docstring will be exposed as command line argument documentation for the
    clusterrunner build client.

    :param url: url to the git repo (ie: https, ssh)
    :type url: str
    :param build_project_directory: the symlinked directory of where PROJECT_DIR should end up being set to
    :type build_project_directory: str
    :param project_directory: path within the repo that contains clusterrunner.yaml
    :type project_directory: str
    :param remote: The git remote name to fetch from
    :type remote: str
    :param branch: The git branch name on the remote to fetch
    :type branch: str
    :param config: a yaml string representing the project_type's config
    :type config: str|None
    :param job_name: a list of job names we intend to run
    :type job_name: list [str] | None
    :param remote_files: dictionary mapping of output file to URL
    :type remote_files: dict[str, str] | None
    :param atoms_override: The list of overridden atoms (if specified, will not run atomizer).
    :type atoms_override: list[str] | None
    """
    super().__init__(config, job_name, remote_files, atoms_override)
    self._url = url
    self._remote = remote
    self._branch = branch
    self._repo_directory = self.get_full_repo_directory(self._url)
    self._timing_file_directory = self.get_timing_file_directory(self._url)
    self._local_ref = None
    self._logger = log.get_logger(__name__)

    # The repo and timing directories are created with restricted permissions (700) so that the repo is not
    # inadvertently exposed to other users.
    for restricted_directory in (self._repo_directory, self._timing_file_directory):
        fs.create_dir(restricted_directory, self.DIRECTORY_PERMISSIONS)
    fs.create_dir(os.path.dirname(build_project_directory))

    # Point the generated build project directory at the actual project directory via a symlink. This lets us
    # switch between the master's and the slave's copies of the repo without doing something hacky in order to
    # use the master's generated atoms on the slaves.
    symlink_target = os.path.join(self._repo_directory, project_directory)
    try:
        os.unlink(build_project_directory)
    except FileNotFoundError:
        pass  # no previous link to replace
    os.symlink(symlink_target, build_project_directory)
    self.project_directory = build_project_directory
def __init__(self, base_api_url):
    """
    Build the URL helper, network client, and logger used by this API client.

    :param base_api_url: The base API url of the service (e.g., 'http://localhost:43000')
    :type base_api_url: str
    """
    url_builder = UrlBuilder(base_api_url)
    self._api = url_builder
    self._network = Network()
    self._logger = log.get_logger(__name__)
def initialize(self, route_node=None, cluster_master=None):
    """
    Store the cluster master reference and a logger, then delegate to the parent class's initialize().

    :type route_node: RouteNode
    :type cluster_master: app.master.cluster_master.ClusterMaster
    """
    self._cluster_master = cluster_master
    self._logger = log.get_logger(__name__)
    super().initialize(route_node)
def __init__(self, raw_yaml_contents):
    """
    Keep the raw yaml text; the job configs attribute starts out as None.

    :param raw_yaml_contents: Raw string contents of project clusterrunner.yaml file
    :type raw_yaml_contents: string
    """
    self._raw_yaml_contents = raw_yaml_contents
    self._logger = log.get_logger(__name__)
    self._job_configs = None  # nothing in this method populates it
def __init__(self, raw_yaml_contents):
    """
    Keep the raw yaml text; the job configs attribute starts out as None.

    :param raw_yaml_contents: Raw string contents of project clusterrunner.yaml file
    :type raw_yaml_contents: string
    """
    self._raw_yaml_contents = raw_yaml_contents
    self._logger = log.get_logger(__name__)
    self._job_configs = None  # nothing in this method populates it
def __init__(self, build_artifact_dir):
    """
    Remember the artifact directory; the failed-commands attribute starts out as None.

    :param build_artifact_dir: absolute path to the build artifact (IE: '/var/clusterrunner/artifacts/20')
    :type build_artifact_dir: str
    """
    self.build_artifact_dir = build_artifact_dir
    self._failed_commands = None
    self._logger = get_logger(__name__)
def rsa_key(host):
    """
    Retrieve a host's RSA key by running `ssh-keyscan -t rsa <host>`.

    :param host: The RSA key for host that we want to retrieve
    :type host: str
    :return: the rsa key string, without the 'ssh-rsa' prefix. Returns None if ssh-keyscan could not be run or
        exited with a non-zero return code.
    :rtype: str|None
    """
    # Pass the command as an argument list with no shell so that a host value containing shell metacharacters
    # can never be interpreted as shell syntax.
    command = ['ssh-keyscan', '-t', 'rsa', host]
    try:
        proc = subprocess.Popen(command, stdout=PIPE, stderr=PIPE)
    except OSError as ex:
        # Without a shell, a missing/unrunnable executable raises instead of yielding a non-zero return code.
        # Keep the "return None on failure" contract for that case.
        log.get_logger(__name__).error('Failed to run ssh-keyscan: {}'.format(ex))
        return None

    output, error = proc.communicate()
    if proc.returncode != 0:
        log.get_logger(__name__).error('Failed to get rsa string with output: {}, error: {}'.format(output, error))
        return None

    line = output.decode("utf-8")
    # We want the string to the right of, and not including, the 'ssh-rsa' string.
    return line.split('ssh-rsa', 1)[-1].strip()
def __init__(self, base_api_url):
    """
    Build the URL helper, network client, and logger used by this API client.

    The base url is passed through self._ensure_url_has_scheme() before the URL helper is built.

    :param base_api_url: The base API url of the service (e.g., 'http://localhost:43000')
    :type base_api_url: str
    """
    url_with_scheme = self._ensure_url_has_scheme(base_api_url)
    self._api = UrlBuilder(url_with_scheme)
    self._network = Network()
    self._logger = log.get_logger(__name__)
def __init__(self, executor_id):
    """
    Record the executor id; the project type and current build/subjob ids all start out as None.

    :type executor_id: int
    """
    self.id = executor_id
    self._logger = log.get_logger(__name__)
    self._project_type = None
    self._current_subjob_id = None
    self._current_build_id = None
def __init__(self, session_dir, docker_process):
    """
    Keep references to the session directory and docker process, and log the process id at debug level.

    :type session_dir: TemporaryDirectory
    :type docker_process: Popen
    """
    self._logger = log.get_logger(__name__)
    self._session_dir = session_dir
    self._docker_process = docker_process
    docker_pid = self._docker_process.pid
    self._logger.debug('Started docker session, pid: {}', docker_pid)
def __init__(self, executor_id):
    """
    Record the executor id; the project type and current build/subjob ids all start out as None.

    :type executor_id: int
    """
    self.id = executor_id
    self._logger = log.get_logger(__name__)
    self._project_type = None
    self._current_subjob_id = None
    self._current_build_id = None
def __init__(self):
    """Create the request queues, the (not yet started) request-handling thread, and the preparation lock map."""
    self._logger = get_logger(__name__)
    self._project_preparation_locks = {}
    self._request_queue = Queue()
    self._builds_waiting_for_slaves = Queue()
    # The thread is only constructed here; nothing in this method starts it.
    worker_thread = SafeThread(target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
    self._request_queue_worker_thread = worker_thread
def __init__(self, build_request_handler):
    """
    Store the request handler and create the idle-slave queue and the (not yet started) allocation thread.

    :type build_request_handler: BuildRequestHandler
    """
    self._build_request_handler = build_request_handler
    self._logger = get_logger(__name__)
    self._idle_slaves = OrderedSetQueue()
    # The thread is only constructed here; nothing in this method starts it.
    allocation_thread = SafeThread(target=self._slave_allocation_loop, name='SlaveAllocationLoop', daemon=True)
    self._allocation_thread = allocation_thread
def __init__(self, session_dir, docker_process):
    """
    Keep references to the session directory and docker process, and log the process id at debug level.

    :type session_dir: TemporaryDirectory
    :type docker_process: Popen
    """
    self._logger = log.get_logger(__name__)
    self._session_dir = session_dir
    self._docker_process = docker_process
    docker_pid = self._docker_process.pid
    self._logger.debug('Started docker session, pid: {}', docker_pid)
def __init__(self, min_connection_poolsize=DEFAULT_POOLSIZE):
    """
    Create a requests session whose 'http://' adapter pool holds at least DEFAULT_POOLSIZE connections.

    :param min_connection_poolsize: The minimum connection pool size for this instance
    :type min_connection_poolsize: int
    """
    self._session = requests.Session()
    self._logger = get_logger(__name__)
    # Never go below the default size, even if a smaller minimum was requested.
    effective_poolsize = max(min_connection_poolsize, DEFAULT_POOLSIZE)
    http_adapter = HTTPAdapter(pool_connections=effective_poolsize, pool_maxsize=effective_poolsize)
    self._session.mount('http://', http_adapter)
def __init__(self, min_connection_poolsize=DEFAULT_POOLSIZE):
    """
    Compute the pool size (never below DEFAULT_POOLSIZE) and call reset_session().

    :param min_connection_poolsize: The minimum connection pool size for this instance
    :type min_connection_poolsize: int
    """
    self._logger = get_logger(__name__)
    self._poolsize = max(min_connection_poolsize, DEFAULT_POOLSIZE)
    self._session = None  # placeholder until reset_session() runs below
    self.reset_session()
def log_app_debug_info_and_force_kill_after_delay():
    """
    Sleep for `seconds`, log the application debug info, then send SIGKILL to the current process.

    `seconds` is a free variable provided by an enclosing scope that is not part of this block.
    """
    time.sleep(seconds)
    hang_logger = log.get_logger(__name__)
    debug_info = app_info.get_app_info_string()
    hang_logger.error('ClusterRunner did not exit within {} seconds. App debug info:\n\n{}.', seconds, debug_info)
    hang_logger.critical('ClusterRunner seems to be hanging unexpectedly. Sending SIGKILL to self. Farewell!')
    own_pid = os.getpid()
    os.kill(own_pid, signal.SIGKILL)
def __init__(self, min_connection_poolsize=DEFAULT_POOLSIZE):
    """
    Compute the pool size (never below DEFAULT_POOLSIZE) and call reset_session().

    :param min_connection_poolsize: The minimum connection pool size for this instance
    :type min_connection_poolsize: int
    """
    self._logger = get_logger(__name__)
    self._poolsize = max(min_connection_poolsize, DEFAULT_POOLSIZE)
    self._session = None  # placeholder until reset_session() runs below
    self.reset_session()
def log_app_debug_info_and_force_kill_after_delay():
    """
    Sleep for `seconds`, log the application debug info, then terminate the process with os._exit(1).

    `seconds` is a free variable provided by an enclosing scope that is not part of this block.
    """
    time.sleep(seconds)
    hang_logger = log.get_logger(__name__)
    debug_info = app_info.get_app_info_string()
    hang_logger.error('ClusterRunner did not exit within {} seconds. App debug info:\n\n{}.', seconds, debug_info)
    hang_logger.critical('ClusterRunner seems to be hanging unexpectedly. Hard killing the process. Farewell!')
    os._exit(1)
def __init__(self, scheduler_pool):
    """
    Store the scheduler pool and create the request queue, the (not yet started) request-handling thread, and
    the preparation lock map.

    :type scheduler_pool: app.master.build_scheduler_pool.BuildSchedulerPool
    """
    self._scheduler_pool = scheduler_pool
    self._logger = get_logger(__name__)
    self._project_preparation_locks = {}
    self._request_queue = Queue()
    # The thread is only constructed here; nothing in this method starts it.
    worker_thread = SafeThread(target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
    self._request_queue_worker_thread = worker_thread
def __init__(self, filename=None):
    """
    Initialize the event log; logging is flagged as disabled when no filename is given.

    :param filename: The name of the logfile
    :type filename: str | None
    """
    self._logger = log.get_logger(__name__)
    self.filename = filename
    self.logging_disabled = filename is None
    self._analytics_logger = None
    self._log_cache = collections.deque()
    self._event_id_generator = Counter()
def __init__(self, filename=None):
    """
    Initialize the event log; logging is flagged as disabled when no filename is given.

    :param filename: The name of the logfile
    :type filename: str | None
    """
    self._logger = log.get_logger(__name__)
    self.filename = filename
    self.logging_disabled = filename is None
    self._analytics_logger = None
    self._log_cache = collections.deque()
    self._event_id_generator = Counter()
def __init__(self, build_request_handler):
    """
    Store the request handler and create the idle-slave queue and the (not yet started) allocation thread.

    :type build_request_handler: BuildRequestHandler
    """
    self._build_request_handler = build_request_handler
    self._logger = get_logger(__name__)
    self._idle_slaves = OrderedSetQueue()
    # The thread is only constructed here; nothing in this method starts it.
    allocation_thread = SafeThread(target=self._slave_allocation_loop, name='SlaveAllocationLoop', daemon=True)
    self._allocation_thread = allocation_thread
def __init__(self, scheduler_pool):
    """
    Store the scheduler pool and create the idle-slave queue and the (not yet started) allocation thread.

    :type scheduler_pool: app.master.build_scheduler_pool.BuildSchedulerPool
    """
    self._scheduler_pool = scheduler_pool
    self._logger = get_logger(__name__)
    self._idle_slaves = OrderedSetQueue()
    # The thread is only constructed here; nothing in this method starts it.
    allocation_thread = SafeThread(target=self._slave_allocation_loop, name='SlaveAllocationLoop', daemon=True)
    self._allocation_thread = allocation_thread
def __init__(self, scheduler_pool):
    """
    Store the scheduler pool and create the queues, the (not yet started) request-handling thread, the
    preparation lock map, and the subjob calculator.

    :type scheduler_pool: BuildSchedulerPool
    """
    self._scheduler_pool = scheduler_pool
    self._logger = get_logger(__name__)
    self._builds_waiting_for_slaves = Queue()
    self._request_queue = Queue()
    # The thread is only constructed here; nothing in this method starts it.
    worker_thread = SafeThread(target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
    self._request_queue_worker_thread = worker_thread
    self._project_preparation_locks = {}
    self._subjob_calculator = SubjobCalculator()
def __init__(self, min_connection_poolsize=DEFAULT_POOLSIZE):
    """
    Create a requests session whose 'http://' adapter pool holds at least DEFAULT_POOLSIZE connections.

    :param min_connection_poolsize: The minimum connection pool size for this instance
    :type min_connection_poolsize: int
    """
    self._session = requests.Session()
    self._logger = get_logger(__name__)
    # Never go below the default size, even if a smaller minimum was requested.
    effective_poolsize = max(min_connection_poolsize, DEFAULT_POOLSIZE)
    http_adapter = HTTPAdapter(pool_connections=effective_poolsize, pool_maxsize=effective_poolsize)
    self._session.mount('http://', http_adapter)
def __init__(self, scheduler_pool):
    """
    Store the scheduler pool and create the request queue, the (not yet started) request-handling thread, and
    the preparation lock map.

    :type scheduler_pool: app.master.build_scheduler_pool.BuildSchedulerPool
    """
    self._scheduler_pool = scheduler_pool
    self._logger = get_logger(__name__)
    self._project_preparation_locks = {}
    self._request_queue = Queue()
    # The thread is only constructed here; nothing in this method starts it.
    worker_thread = SafeThread(target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
    self._request_queue_worker_thread = worker_thread
def __init__(self, url, build_project_directory='', project_directory='', remote='origin', branch='master',
             hash='FETCH_HEAD', config=None, job_name=None, remote_files=None, atoms_override=None):
    """
    Note: the first line of each parameter docstring will be exposed as command line argument documentation for the
    clusterrunner build client.

    :param url: url to the git repo (ie: https, ssh)
    :type url: str
    :param build_project_directory: the symlinked directory of where PROJECT_DIR should end up being set to
    :type build_project_directory: str
    :param project_directory: path within the repo that contains clusterrunner.yaml
    :type project_directory: str
    :param remote: The git remote name to fetch from
    :type remote: str
    :param branch: The git branch name on the remote to fetch
    :type branch: str
    :param hash: The hash to reset hard on. If hash is not set, we use the FETCH_HEAD of <branch>.
    :type hash: str
    :param config: a yaml string representing the project_type's config
    :type config: str|None
    :param job_name: a list of job names we intend to run
    :type job_name: list [str] | None
    :param remote_files: dictionary mapping of output file to URL
    :type remote_files: dict[str, str] | None
    :param atoms_override: The list of overridden atoms (if specified, will not run atomizer).
    :type atoms_override: list[str] | None
    """
    super().__init__(config, job_name, remote_files, atoms_override)
    self._url = url
    self._remote = remote
    self._branch = branch
    self._hash = hash
    self._repo_directory = self.get_full_repo_directory(self._url)
    self._timing_file_directory = self.get_timing_file_directory(self._url)
    self._local_ref = None
    self._logger = log.get_logger(__name__)

    # The repo and timing directories are created with restricted permissions (700) so that the repo is not
    # inadvertently exposed to other users.
    for restricted_directory in (self._repo_directory, self._timing_file_directory):
        fs.create_dir(restricted_directory, self.DIRECTORY_PERMISSIONS)
    fs.create_dir(os.path.dirname(build_project_directory))

    # Point the generated build project directory at the actual project directory via a symlink. This lets us
    # switch between the master's and the slave's copies of the repo without doing something hacky in order to
    # use the master's generated atoms on the slaves.
    symlink_target = os.path.join(self._repo_directory, project_directory)
    try:
        os.unlink(build_project_directory)
    except FileNotFoundError:
        pass  # no previous link to replace
    os.symlink(symlink_target, build_project_directory)
    self.project_directory = build_project_directory
def __init__(self, build_id, enter_state_callbacks):
    """
    Create the state machine for a build and register the supplied enter-state callbacks on it.

    :type build_id: int
    :type enter_state_callbacks: dict[BuildState, callable]
    """
    self._logger = log.get_logger(__name__)
    self._build_id = build_id
    # Every state starts with no transition timestamp.
    self._transition_timestamps = dict.fromkeys(BuildState)
    self._fsm = self._create_state_machine()

    for state_to_watch in enter_state_callbacks:
        self._register_enter_state_callback(state_to_watch, enter_state_callbacks[state_to_watch])
def __init__(self):
    """Initialize handler state and install the teardown handler for SIGTERM and SIGINT."""
    super().__init__()
    self._handling_lock = Lock()
    # LIFO, so callbacks are executed in the reverse order that they were added.
    self._teardown_callback_stack = LifoQueue()
    self._logger = log.get_logger(__name__)
    self._handled_exceptions = Queue()

    # Install the teardown handler for the signals below.
    # Note: this will raise if called on a non-main thread, but we should NOT work around that here. (That could
    # prevent the teardown handler from ever being registered!) Calling code should be organized so that this
    # singleton is only ever initialized on the main thread.
    for teardown_signal in (signal.SIGTERM, signal.SIGINT):
        signal.signal(teardown_signal, self._application_teardown_signal_handler)
def __init__(self, config=None, job_name=None, remote_files=None):
    """
    Store the project configuration inputs; a falsy remote_files is replaced by an empty dict.

    :param config: A yaml string representing a cluster_runner.yaml file
    :type config: str | None
    :type job_name: str | None
    :param remote_files: key-value pairs of where the key is the output_file and the value is the url
    :type remote_files: dict[str, str] | None
    """
    self._logger = get_logger(__name__)
    self._config = config
    self._job_name = job_name
    self._remote_files = remote_files or {}
    self.project_directory = ''
def rsa_key(host):
    """
    Retrieve a host's RSA key by running `ssh-keyscan -t rsa <host>`.

    :param host: The RSA key for host that we want to retrieve
    :type host: str
    :return: the rsa key string, without the 'ssh-rsa' prefix. Returns None if ssh-keyscan could not be run or
        exited with a non-zero return code.
    :rtype: str|None
    """
    # Pass the command as an argument list with no shell so that a host value containing shell metacharacters
    # can never be interpreted as shell syntax.
    command = ['ssh-keyscan', '-t', 'rsa', host]
    try:
        proc = subprocess.Popen(command, stdout=PIPE, stderr=PIPE)
    except OSError as ex:
        # Without a shell, a missing/unrunnable executable raises instead of yielding a non-zero return code.
        # Keep the "return None on failure" contract for that case.
        log.get_logger(__name__).error('Failed to run ssh-keyscan: {}'.format(ex))
        return None

    output, error = proc.communicate()
    if proc.returncode != 0:
        log.get_logger(__name__).error('Failed to get rsa string with output: {}, error: {}'.format(output, error))
        return None

    line = output.decode("utf-8")
    # We want the string to the right of, and not including, the 'ssh-rsa' string.
    return line.split('ssh-rsa', 1)[-1].strip()
def __init__(self, host, username, executable_path):
    """
    Store the deployment target details and create a shell client for the host and user.

    :param host: the fully qualified hostname of the host to deploy to
    :type host: str
    :param username: the user who is executing this process and whose ssh credentials will be used
    :type username: str
    :param executable_path: the path to the clusterrunner executable on the remote host
    :type executable_path: str
    """
    self._logger = get_logger(__name__)
    self.host = host
    self._username = username
    self._executable_path = executable_path
    shell_client = ShellClientFactory.create(host, username)
    self._shell_client = shell_client
def __init__(self, slave_url, num_executors):
    """
    Initialize the record for a slave: its url, executor count, a newly incremented id, and API helpers.

    :type slave_url: str
    :type num_executors: int
    """
    # Identity.
    self.url = slave_url
    self.num_executors = num_executors
    self.id = self._slave_id_counter.increment()

    # Runtime state: no executors in use, no current build, flagged alive.
    self._num_executors_in_use = Counter()
    self._network = Network(min_connection_poolsize=num_executors)
    self.current_build_id = None
    self._is_alive = True

    slave_api = UrlBuilder(slave_url, self.API_VERSION)
    self._slave_api = slave_api
    self._logger = log.get_logger(__name__)
def __init__(self, slave_url, num_executors):
    """
    Initialize the record for a slave: its url, executor count, a newly incremented id, and API helpers.

    :type slave_url: str
    :type num_executors: int
    """
    # Identity.
    self.url = slave_url
    self.num_executors = num_executors
    self.id = self._slave_id_counter.increment()

    # Runtime state: no executors in use, no current build, flagged alive.
    self._num_executors_in_use = Counter()
    self._network = Network(min_connection_poolsize=num_executors)
    self.current_build_id = None
    self.is_alive = True

    slave_api = UrlBuilder(slave_url, self.API_VERSION)
    self._slave_api = slave_api
    self._logger = log.get_logger(__name__)
def __init__(self, build_id, enter_state_callbacks):
    """
    Create the state machine for a build and register the supplied enter-state callbacks on it.

    :type build_id: int
    :type enter_state_callbacks: dict[BuildState, callable]
    """
    self._logger = log.get_logger(__name__)
    self._build_id = build_id
    # Every state starts with no transition timestamp.
    self._transition_timestamps = dict.fromkeys(BuildState)
    self._fsm = self._create_state_machine()

    for state_to_watch in enter_state_callbacks:
        self._register_enter_state_callback(state_to_watch, enter_state_callbacks[state_to_watch])
def __init__(self, scheduler_pool):
    """
    Store the scheduler pool and create the queues, the (not yet started) request-handling thread, the
    preparation lock map, and the subjob calculator.

    :type scheduler_pool: BuildSchedulerPool
    """
    self._scheduler_pool = scheduler_pool
    self._logger = get_logger(__name__)
    self._builds_waiting_for_slaves = Queue()
    self._request_queue = Queue()
    # The thread is only constructed here; nothing in this method starts it.
    worker_thread = SafeThread(target=self._build_preparation_loop, name='RequestHandlerLoop', daemon=True)
    self._request_queue_worker_thread = worker_thread
    self._project_preparation_locks = {}
    self._subjob_calculator = SubjobCalculator()
def __init__(self, config=None, job_name=None, remote_files=None):
    """
    Store the project configuration inputs and create the kill event; a falsy remote_files is replaced by an
    empty dict.

    :param config: A yaml string representing a clusterrunner.yaml file
    :type config: str | None
    :type job_name: str | None
    :param remote_files: key-value pairs of where the key is the output_file and the value is the url
    :type remote_files: dict[str, str] | None
    """
    self._config = config
    self._job_name = job_name
    self._remote_files = remote_files or {}
    self.project_directory = ''
    self._logger = log.get_logger(__name__)
    self._kill_event = Event()
def __init__(self, build):
    """
    Read the executor limits from the build's job config and reset the allocation counters.

    :type build: Build
    """
    self._logger = get_logger(__name__)
    self._build = build

    # Executor limits come from the job config of the build's project type.
    build_job_config = build.project_type.job_config()
    self._max_executors = build_job_config.max_executors
    self._max_executors_per_slave = build_job_config.max_executors_per_slave

    # Nothing is allocated or in use yet.
    self._slaves_allocated = []
    self._num_executors_allocated = 0
    self._num_executors_in_use = 0

    # prevents subjobs from being skipped
    self._subjob_assignment_lock = Lock()
def __init__(self):
    """
    Set up the master's in-memory state, start the request handler and slave allocator, and reset the results
    directory.
    """
    self._logger = get_logger(__name__)
    self._master_results_path = Configuration['results_directory']
    self._all_slaves_by_url = {}
    self._all_builds_by_id = OrderedDict()

    request_handler = BuildRequestHandler()
    self._build_request_handler = request_handler
    self._build_request_handler.start()
    self._slave_allocator = SlaveAllocator(request_handler)
    self._slave_allocator.start()

    # Asynchronously delete (but immediately rename) all old builds when master starts.
    # Remove this if/when build numbers are unique across master starts/stops
    results_path = self._master_results_path
    if os.path.exists(results_path):
        fs.async_delete(results_path)
    fs.create_dir(results_path)
def __init__(self):
    """
    Initialize handler state and install signal handlers: the teardown handler for SIGTERM and SIGINT, and the
    info-dump handler for self.SIGINFO.
    """
    super().__init__()
    self._handling_lock = Lock()
    # LIFO, so callbacks are executed in the reverse order that they were added.
    self._teardown_callback_stack = LifoQueue()
    self._logger = log.get_logger(__name__)
    self._handled_exceptions = Queue()
    self._teardown_callback_raised_exception = False

    # Set up handlers to be called when the application process receives certain signals.
    # Note: this will raise if called on a non-main thread, but we should NOT work around that here. (That could
    # prevent the teardown handler from ever being registered!) Calling code should be organized so that this
    # singleton is only ever initialized on the main thread.
    for teardown_signal in (signal.SIGTERM, signal.SIGINT):
        signal.signal(teardown_signal, self._application_teardown_signal_handler)
    signal.signal(self.SIGINFO, self._application_info_dump_signal_handler)
def __init__(self, master_url, request_params, secret):
    """
    Store the build request details and create the API helpers used to talk to the master.

    The url returned by self._ensure_url_has_scheme() is used for self._master_url AND for both API helpers, so
    all three agree. (Previously the helpers were built from the raw, un-normalized master_url argument.)

    :param master_url: The url of the master which the build will be executed on
    :type master_url: str
    :param request_params: A dict of request params that will be json-encoded and sent in the build request
    :type request_params: dict
    :type secret: str
    """
    self._master_url = self._ensure_url_has_scheme(master_url)
    self._request_params = request_params
    self._secret = secret
    self._build_id = None
    self._network = Network()
    self._logger = get_logger(__name__)
    self._last_build_status_details = None
    # Build both helpers from the normalized url rather than the caller-supplied string.
    self._master_api = UrlBuilder(self._master_url, self.API_VERSION)
    self._cluster_master_api_client = ClusterMasterAPIClient(self._master_url)