def create(self, parent):
    """
    Build a new environment under ``parent`` and populate it with every
    resource held in ``self._resources``.

    :param parent: Object passed to ``Environment`` as its ``parent``.
    :return: The newly created, populated environment.
    """
    environment = Environment(parent=parent)
    # Resources are added one by one, in the iteration order of
    # ``self._resources``.
    for resource in self._resources:
        environment.add(resource)
    return environment
def __init__(self, **options):
    """
    Initialise the pool's bookkeeping state and its collaborators.

    :param options: Keyword options forwarded unchanged to the parent
        initialiser.
    """
    super(Pool, self).__init__(**options)

    # Plain bookkeeping containers.
    # Uids of tasks that are not assigned yet.
    self.unassigned = []
    # Mapping of task uid -> number of times it has been assigned.
    self.task_assign_cnt = {}
    self._metadata = {}

    # Reschedule predicate used until a custom one is set.
    self.should_reschedule = default_check_reschedule
    self._pool_lock = threading.Lock()

    # Collaborators: worker container and connection manager.
    self._workers = Environment(parent=self)
    self._conn = self.CONN_MANAGER(self._cfg)
class Pool(Executor):
    """
    Pool task executor object that initializes workers and dispatches tasks.
    """

    CONFIG = PoolConfig
    CONN_MANAGER = ConnectionManager

    def __init__(self, **options):
        """
        Initialise the pool's bookkeeping state and its collaborators.

        :param options: Keyword options forwarded unchanged to the parent
            initialiser.
        """
        super(Pool, self).__init__(**options)
        self.unassigned = []  # unassigned tasks
        self.task_assign_cnt = {}  # uid: times_assigned
        self.should_reschedule = default_check_reschedule
        self._workers = Environment(parent=self)
        self._workers_last_result = {}  # worker: time of recorded result
        # worker: time of the last restart attempt (or of first monitoring).
        # Kept per pool instead of in a shared mutable default argument.
        self._workers_last_killed = {}
        self._conn = self.CONN_MANAGER(self._cfg)
        self._pool_lock = threading.Lock()
        self._request_handlers = {}  # request cmd: callable(worker, response)
        self._metadata = {}

    def uid(self):
        """Pool name."""
        return self.cfg.name

    def add(self, task, uid):
        """
        Add a task for execution.

        :param task: Task to be scheduled to workers.
        :type task: :py:class:`~testplan.runners.pools.tasks.base.Task`
        :param uid: Task uid.
        :type uid: ``str``
        :raises ValueError: If ``task`` is not a ``Task`` instance.
        """
        if not isinstance(task, Task):
            raise ValueError('Task was expected, got {} instead.'.format(
                type(task)))
        super(Pool, self).add(task, uid)
        self.unassigned.append(uid)

    def set_reschedule_check(self, check_reschedule):
        """
        Sets callable with custom rules to determine if a task should be
        rescheduled. It must accept the pool object and the task result,
        and based on these it returns if the task should be rescheduled
        (i.e due to a known rare system error).

        :param check_reschedule: Custom callable for task reschedule.
        :type check_reschedule: ``callable`` that takes
            ``pool``, ``task_result`` arguments.
        :return: True if Task should be rescheduled else False.
        :rtype: ``bool``
        """
        validate_func('pool', 'task_result')(check_reschedule)
        self.should_reschedule = check_reschedule

    def _loop(self):
        """
        Main pool loop: start the worker monitoring thread, then drive the
        status transitions and serve worker requests while the pool is
        active.
        """
        worker_monitor = threading.Thread(target=self._workers_monitoring)
        worker_monitor.daemon = True
        worker_monitor.start()

        while self.active:
            if self.status.tag == self.status.STARTING:
                self.status.change(self.status.STARTED)
            elif self.status.tag == self.status.STOPPING:
                self.status.change(self.status.STOPPED)
                break
            else:
                msg = self._conn.accept()
                if msg:
                    try:
                        # Requests are handled under the pool lock, which
                        # the monitoring thread also takes.
                        with self._pool_lock:
                            self.handle_request(msg)
                    except Exception as exc:
                        # A failing request is logged and must not stop
                        # the loop.
                        self.logger.error(format_trace(inspect.trace(), exc))
            time.sleep(self.cfg.active_loop_sleep)

    def handle_request(self, request):
        """
        Handles a worker request. I.e TaskPull, TaskResults,
        Heartbeat etc.

        :param request: Worker request.
        :type request: :py:class:`~testplan.runners.pools.communication.Message`
        """
        sender_index = request.sender_metadata['index']
        worker = self._workers[sender_index]
        if not worker.active:
            self.logger.critical(
                'Ignoring message {} - {} from inactive worker {}'.format(
                    request.cmd, request.data, worker))
            # TODO check whether should we respond.
            worker.respond(Message(**self._metadata).make(Message.Ack))
            return
        else:
            # Any request from an active worker counts as a sign of life.
            worker.last_heartbeat = time.time()

        self.logger.debug('Pool {} request received by {} - {}, {}'.format(
            self.cfg.name, worker, request.cmd, request.data))
        response = Message(**self._metadata)  # Pool metadata

        if not self.active or self.status.tag == self.STATUS.STOPPING:
            worker.respond(response.make(Message.Stop))
        elif request.cmd == Message.ConfigRequest:
            # Collect the denormalized config of this pool and of every
            # parent config up the chain.
            options = []
            cfg = self.cfg
            while cfg:
                try:
                    options.append(cfg.denormalize())
                except Exception as exc:
                    self.logger.error('Could not denormalize: {} - {}'.format(
                        cfg, exc))
                cfg = cfg.parent
            worker.respond(response.make(Message.ConfigSending, data=options))
        elif request.cmd == Message.TaskPullRequest:
            # ``request.data`` is used as the number of tasks requested.
            tasks = []
            if self.status.tag == self.status.STARTED:
                for _ in range(request.data):
                    try:
                        uid = self.unassigned.pop(0)
                    except IndexError:
                        # Nothing left to hand out.
                        break
                    if uid not in self.task_assign_cnt:
                        self.task_assign_cnt[uid] = 0
                    if self.task_assign_cnt[uid] >= \
                            self.cfg.task_retries_limit:
                        self._discard_task(
                            uid, '{} already reached max retries: {}'.format(
                                self._input[uid], self.cfg.task_retries_limit))
                        continue
                    else:
                        self.task_assign_cnt[uid] += 1
                        task = self._input[uid]
                        self.logger.test_info('Scheduling {} to {}'.format(
                            task, worker))
                        worker.assigned.add(uid)
                        tasks.append(task)
                if tasks:
                    worker.respond(
                        response.make(Message.TaskSending, data=tasks))
                    # Remember how many requested tasks could not be served.
                    worker.requesting = request.data - len(tasks)
                    return
            worker.requesting = request.data
            worker.respond(response.make(Message.Ack))
        elif request.cmd == Message.TaskResults:
            for task_result in request.data:
                uid = task_result.task.uid()
                worker.assigned.remove(uid)
                # NOTE(review): the timestamp is only stored for the first
                # result received from a worker and never refreshed -
                # confirm this is intended given the attribute name.
                if worker not in self._workers_last_result:
                    self._workers_last_result[worker] = time.time()
                self.logger.test_info('De-assign {} from {}'.format(
                    task_result.task, worker))

                if self.should_reschedule(self, task_result):
                    if self.task_assign_cnt[uid] >= \
                            self.cfg.task_retries_limit:
                        # Retry budget exhausted: fall through and record
                        # the result as final.
                        self.logger.test_info(
                            'Will not reschedule {} again as it '
                            'reached max retries: {}'.format(
                                self._input[uid],
                                self.cfg.task_retries_limit))
                    else:
                        self.logger.test_info(
                            'Rescheduling {} due to '
                            'should_reschedule() cfg option of {}'.format(
                                task_result.task, self))
                        self.unassigned.append(uid)
                        continue

                self._print_test_result(task_result)
                self._results[uid] = task_result
                self.ongoing.remove(uid)
            worker.respond(response.make(Message.Ack))
        elif request.cmd == Message.Heartbeat:
            worker.last_heartbeat = time.time()
            self.logger.debug(
                'Received heartbeat from {} at {} after {}s.'.format(
                    worker, request.data, time.time() - request.data))
            worker.respond(
                response.make(Message.Ack, data=worker.last_heartbeat))
        elif request.cmd == Message.SetupFailed:
            self.logger.test_info('Worker {} setup failed:{}{}'.format(
                worker, os.linesep, request.data))
            worker.respond(response.make(Message.Ack))
            self._deco_worker(worker, 'Aborting {}, setup failed.')
        elif request.cmd in self._request_handlers:
            # Custom handlers registered per request command.
            self._request_handlers[request.cmd](worker, response)
        else:
            self.logger.error('Unknown request: {} {} {} {}'.format(
                request, dir(request), request.cmd, request.data))
            worker.respond(response.make(Message.Ack))

    def _deco_worker(self, worker, message):
        """
        Decommission a worker: log the reason, put its assigned tasks back
        to the unassigned list and abort it.

        :param worker: Worker to abort.
        :param message: Format string with one placeholder that receives
            the worker.
        """
        self.logger.critical(message.format(worker))
        if os.path.exists(worker.outfile):
            self.logger.critical('\tlogfile: {}'.format(worker.outfile))
        while worker.assigned:
            uid = worker.assigned.pop()
            self.logger.test_info('Re-assigning {} from {} to {}.'.format(
                self._input[uid], worker, self))
            self.unassigned.append(uid)
        worker.abort()

    def _workers_handler_monitoring(self, worker, workers_last_killed=None):
        """
        Restart a worker whose handler process appears defunct.

        Nothing is done when the worker has no assigned tasks or when less
        than ``cfg.worker_inactivity_threshold`` seconds have passed since
        its last recorded restart attempt. Otherwise, if every child
        process of the worker handler reports the ``'zombie'`` status and
        the recorded result time is older than the threshold, the worker's
        tasks are re-assigned and the worker is stopped and started again.
        If the restart raises, the worker is decommissioned.

        :param worker: Worker to inspect; must expose ``handler.pid``.
        :param workers_last_killed: Optional mapping of worker to the time
            of its last restart attempt. Defaults to a per-pool mapping.
        :type workers_last_killed: ``dict`` or ``NoneType``
        """
        if workers_last_killed is None:
            workers_last_killed = self._workers_last_killed

        inactivity_threshold = self.cfg.worker_inactivity_threshold
        if worker not in workers_last_killed:
            workers_last_killed[worker] = time.time()
        worker_last_killed = workers_last_killed[worker]
        if not worker.assigned or\
                time.time() - worker_last_killed < inactivity_threshold:
            return

        try:
            proc = psutil.Process(worker.handler.pid)
            children = list(proc.children(recursive=True))
            worker_last_result = self._workers_last_result.get(worker, 0)
            # Note: ``all`` is also True when there are no children at all.
            if all(item.status() == 'zombie' for item in children) and\
                    time.time() - worker_last_result > inactivity_threshold:
                workers_last_killed[worker] = time.time()
                try:
                    while worker.assigned:
                        uid = worker.assigned.pop()
                        self.logger.test_info(
                            'Re-assigning {} from {} to {}.'.format(
                                self._input[uid], worker, self))
                        self.unassigned.append(uid)
                    self.logger.test_info(
                        'Restarting worker: {}'.format(worker))
                    worker.stop()
                    worker.start()
                except Exception as exc:
                    self.logger.critical(
                        'Worker {} failed to restart: {}'.format(worker, exc))
                    self._deco_worker(
                        worker, 'Aborting {}, due to defunct child process.')
        except psutil.NoSuchProcess:
            # The handler process is already gone; nothing to inspect.
            pass

    def _workers_monitoring(self):
        """
        Worker monitoring thread target.

        While the loop handler is alive, classify every worker by its
        active flag and last heartbeat, decommission workers that did not
        initialize in time or stopped sending heartbeats, and abort the
        pool when all workers are inactive. Does nothing when
        ``cfg.worker_heartbeat`` is falsy.
        """
        if not self.cfg.worker_heartbeat:
            # No heartbeat means no fault tolerance for worker.
            return

        monitor_started = time.time()
        loop_sleep = self.cfg.worker_heartbeat * self.cfg.heartbeats_miss_limit
        while self._loop_handler.is_alive():
            w_total = set()
            w_uninitialized = set()
            w_active = set()
            w_inactive = set()

            monitor_alive = time.time() - monitor_started
            # Workers get ``heartbeat_init_window`` seconds to send their
            # first heartbeat.
            init_window = monitor_alive <= self.cfg.heartbeat_init_window
            with self._pool_lock:
                for worker in self._workers:
                    if getattr(worker, 'handler', None):
                        self._workers_handler_monitoring(worker)
                    w_total.add(worker)
                    if not worker.active:
                        w_inactive.add(worker)
                    elif worker.last_heartbeat is None:
                        w_uninitialized.add(worker)
                        if not init_window:
                            self._deco_worker(
                                worker, 'Aborting {}, could not initialize.')
                    elif time.time() - worker.last_heartbeat > loop_sleep:
                        w_inactive.add(worker)
                        self._deco_worker(
                            worker, 'Aborting {}, failed to send heartbeats.')
                    else:
                        w_active.add(worker)
                if w_total:
                    if len(w_inactive) == len(w_total):
                        self.logger.critical(
                            'All workers of {} are inactive.'.format(self))
                        self.abort()
                        break

            try:
                # For early finish of worker monitoring thread.
                wait_until_predicate(lambda: not self._loop_handler.is_alive(),
                                     timeout=loop_sleep, interval=0.05)
            except RuntimeError:
                # NOTE(review): wait_until_predicate is defined elsewhere;
                # if it raises RuntimeError on timeout, monitoring ends
                # after the first interval - confirm this is intended.
                return

    def _discard_task(self, uid, reason):
        """
        Record a failed result for a task and remove it from the ongoing
        list.

        :param uid: Uid of the task to discard.
        :param reason: Text explaining why the task is discarded.
        """
        self.logger.critical('Discard task {} of {} - {}.'.format(
            self._input[uid], self, reason))
        self._results[uid] = TaskResult(
            task=self._input[uid], status=False,
            reason='Task discarded by {} - {}.'.format(self, reason))
        self.ongoing.remove(uid)

    def _discard_pending_tasks(self):
        """Record a failed result for every task still in ``ongoing``."""
        self.logger.critical('Discard pending tasks of {}.'.format(self))
        while self.ongoing:
            uid = self.ongoing[0]
            self._results[uid] = TaskResult(
                task=self._input[uid], status=False,
                reason='Task discarding due to pool {} abort.'.format(self))
            self.ongoing.pop(0)

    def _print_test_result(self, task_result):
        """
        Log a one-line Pass/Fail summary for a task result.

        Results that are not a ``RunnableResult`` or have no ``report``
        attribute are silently skipped.

        :param task_result: Task result whose ``result.report`` is printed.
        """
        if not isinstance(task_result.result, RunnableResult) or\
                not hasattr(task_result.result, 'report'):
            return

        # Currently prints report top level result and not details.
        name = task_result.result.report.name
        if task_result.result.report.passed is True:
            self.logger.test_info('{} -> {}'.format(name, Color.green('Pass')))
        else:
            self.logger.test_info('{} -> {}'.format(name, Color.red('Fail')))

    def _add_workers(self):
        """
        Create ``cfg.size`` workers of ``cfg.worker_type``, attach them to
        this pool and register them with the connection manager.

        Worker indexes are the strings ``'0'`` .. ``str(cfg.size - 1)`` and
        double as the worker uids in the workers environment.
        """
        for idx in (str(i) for i in range(self.cfg.size)):
            worker = self.cfg.worker_type(index=idx)
            self.logger.debug('Initialized %s', worker)
            worker.parent = self
            worker.cfg.parent = self.cfg
            # ``idx`` is a string, so it must be formatted with %s (a %d
            # placeholder makes the logging call fail to format).
            self.logger.debug('Worker %(index)s outfile = %(outfile)s', {
                'index': idx,
                'outfile': worker.outfile
            })
            self._workers.add(worker, uid=idx)
            self._conn.register(worker)
            self.logger.debug('Added {}.'.format(worker))

    def starting(self):
        """
        Starting the pool and workers.

        :raises RuntimeError: If any worker start exception was recorded;
            the exceptions are logged and the workers stopped first.
        """
        super(Pool, self).starting()
        self.make_runpath_dirs()
        self._metadata['runpath'] = self.runpath
        self._add_workers()
        self._workers.start()

        if self._workers.start_exceptions:
            for msg in self._workers.start_exceptions.values():
                self.logger.error(msg)
            self._workers.stop()
            raise RuntimeError(
                'All workers of {} failed to start.'.format(self))

    def workers_requests(self):
        """Count how many tasks workers are requesting."""
        return sum(worker.requesting for worker in self._workers)

    def stopping(self):
        """Stop connections and workers."""
        self._conn.close()
        self._workers.stop()
        super(Pool, self).stopping()

    def abort_dependencies(self):
        """Empty generator to override parent implementation."""
        return
        yield

    def aborting(self):
        """Aborting logic."""
        self.logger.debug('Aborting pool {}'.format(self))
        self._conn.close()
        for worker in self._workers:
            worker.abort()
        self._discard_pending_tasks()
        self.logger.debug('Aborted pool {}'.format(self))