def test_typical_usage(  # pylint: disable=redefined-outer-name
    capsys,
    typical_usage_config,
    typical_usage_stderr,
):
    """End-to-end check: contextlog output must match the expected stderr template."""
    try:
        logging.config.dictConfig(typical_usage_config)
        log = contextlog.get_logger(ctx="test")
        log.info("Message #1")
        saved_logger = None  # Only for test!

        def method():
            # Inner scope gets its own context; the logger is captured so the
            # stderr template can be formatted with its repr below.
            log = contextlog.get_logger(ctx_internal="method")
            nonlocal saved_logger
            saved_logger = log
            log.debug("Message #2")
            try:
                raise RuntimeError
            except Exception:
                log.exception("Exception")

        method()
        log = contextlog.get_logger()
        log.info("Message #3")
        captured_stderr = capsys.readouterr()[1]
        typical_usage_stderr = typical_usage_stderr.format(
            module_path=__file__,
            logger=saved_logger,
        )
        assert captured_stderr == typical_usage_stderr
    finally:
        # Reset the global logger registry so other tests start clean.
        logging.Logger.manager.loggerDict = {}
def add_jobs(self, head, jobs):
    # Register a batch of jobs under one write request; returns the new job ids.
    request_number = self._request_counter.increment()
    now = make_isotime()  # Single timestamp shared by the whole batch
    added_ids = []
    with self._client.make_write_request("add_jobs()") as request:
        for job in jobs:
            job_id = make_job_id()
            get_logger().info("Registering job", job_id=job_id, request_number=request_number,
                              head=head, method=job.method_name, kwargs=job.kwargs)
            # Static job description node.
            request.create(_get_path_job(job_id), {
                "head": head,
                "method": job.method_name,
                "kwargs": job.kwargs,
                "created": now,
                "request": request_number,
            })
            # Mutable runtime state node, created empty.
            request.create(_get_path_job_state(job_id), {
                "state": job.state,
                "stack": None,
                "finished": None,
                "retval": None,
                "exc": None,
            })
            self._input_queue.put(request, job_id)
            added_ids.append(job_id)
    return added_ids
def __exit__(self, exc_type, exc_value, traceback):
    # Commit the buffered operations when the "with" body succeeded;
    # re-raise the body's exception otherwise (nothing is written then).
    if exc_value is not None:
        raise exc_value
    assert len(self._ops) > 0, "_WriteRequest() does not contain operations"
    if len(self._ops) == 1:
        # Single operation: call the client method directly, no transaction.
        (op_name, kwargs) = self._ops[0]
        getattr(self._client.zk, op_name)(**kwargs)
    else:
        trans = self._client.zk.transaction()
        for (op_name, kwargs) in self._ops:
            if op_name == "set":
                op_name = "set_data"  # kazoo transactions use set_data() instead of set()
            getattr(trans, op_name)(**kwargs)
        results = trans.commit()
        # Scan from the end so a real failure is raised in preference to the
        # RuntimeInconsistency placeholders kazoo reports for sibling ops.
        need_err = False
        for result in reversed(results):
            if isinstance(result, kazoo.exceptions.RuntimeInconsistency):
                need_err = True
                continue
            elif isinstance(result, Exception):
                raise result
        assert not need_err, "No other exceptions, but runtime is inconsistent: {}".format(results)
    get_logger().debug("Completed write-request", comment=self._comment)
def test_patch_threading():
    # Verifies that contextlog.patch_threading() propagates the logger
    # context into threads started after the patch is applied.
    try:
        # Save the original Thread entry points so they can be restored.
        orig_thread_start = threading.Thread.start
        orig_thread_bootstrap = threading.Thread._bootstrap  # pylint: disable=protected-access
        contextlog.get_logger(foo="bar")

        class TestThread(threading.Thread):
            def __init__(self):
                threading.Thread.__init__(self)
                self.context = None

            def run(self):
                # Capture the context as seen from inside the thread.
                self.context = contextlog.get_logger().get_context()

        contextlog.patch_threading()
        thread = TestThread()
        thread.start()
        thread.join()
        assert thread.context == {"foo": "bar"}
    finally:
        # Undo the monkey-patching and reset the global logger registry.
        threading.Thread.start = orig_thread_start
        threading.Thread._bootstrap = orig_thread_bootstrap  # pylint: disable=protected-access
        logging.Logger.manager.loggerDict = {}
def __enter__(self):
    # Block until the lock is acquired.  Between attempts, wait on a ZK
    # watch that fires when the lock node changes (e.g. gets deleted).
    get_logger().debug("Acquiring lock", comment=self._comment)
    while not self._try_acquire():
        wait = threading.Event()
        # If the node disappeared between _try_acquire() and exists(),
        # retry immediately instead of waiting on a watch that may never fire.
        if self._client.zk.exists(self._path, watch=lambda _: wait.set()) is not None:
            wait.wait()
def handler(self, **kwargs):
    # Top-level API request boundary: converts results and failures into
    # JSON-able envelopes with "status"/"message"/"result" keys.
    # NOTE(review): the success path returns a bare dict while both error
    # paths return (dict, http_code) tuples -- presumably the framework
    # accepts either form; confirm against the route dispatcher.
    try:
        (result, message) = self.process_request(**kwargs)
        return {
            "status": "ok",
            "message": message,
            "result": result
        }
    except ApiError as err:
        # Expected, typed API failure: pass through its message/result/code.
        result = {
            "status": "error",
            "message": err.message,
            "result": err.result,
        }
        return (result, err.code)
    except Exception as err:
        # Unexpected failure: log the traceback and answer 500 with the
        # fully-qualified exception name for easier client-side debugging.
        get_logger().exception("Unhandled API exception")
        message = "{}: {}".format(type(err).__name__, err)
        if hasattr(err, "__module__"):
            message = "{}.{}".format(err.__module__, message)
        result = {
            "status": "error",
            "message": message,
            "result": None,
        }
        return (result, 500)
def test_typical_usage(capsys, typical_usage_config, typical_usage_stderr):
    """End-to-end check: contextlog output must match the expected stderr template.

    Fixes: the unused local ``bar`` is removed, and the bare ``except:`` is
    narrowed to ``except Exception:`` so SystemExit/KeyboardInterrupt are not
    silently swallowed inside the helper.
    """
    logging.config.dictConfig(typical_usage_config)
    log = contextlog.get_logger(ctx="test")
    log.info("Message #1")
    saved_logger = None  # Only for test!

    def method():
        # Inner scope gets its own context; the logger is captured so the
        # stderr template can be formatted with its repr below.
        log = contextlog.get_logger(ctx_internal="method")
        nonlocal saved_logger
        saved_logger = log
        log.debug("Message #2")
        try:
            raise RuntimeError
        except Exception:
            log.exception("Exception")

    method()
    log = contextlog.get_logger()
    log.info("Message #3")
    logging.getLogger(__name__).info("Message #4")
    captured_stderr = capsys.readouterr()[1]
    typical_usage_stderr = typical_usage_stderr.format(
        module_path=__file__,
        logger=saved_logger,
    )
    assert captured_stderr == typical_usage_stderr
def add(client, event_root, handler_type, parents_list=None):
    # Register a new event-handling job: enqueue the pickled input and create
    # its control nodes inside one transaction.  Returns the new job id.
    assert isinstance(event_root, rules.EventRoot), "Invalid event type"
    if parents_list is not None:
        assert isinstance(parents_list, (tuple, list))
        for item in parents_list:
            assert len(item) == 2  # Each parent entry must be a pair
        parents_list = list(parents_list)  # Defensive copy
    else:
        parents_list = []
    job_id = str(uuid.uuid4())
    job_number = client.IncrementalCounter(zoo.JOBS_COUNTER_PATH).increment()
    # Work on a copy so the caller's event object stays untouched.
    event_root = event_root.copy()
    event_root.get_extra()[rules.EXTRA.HANDLER] = handler_type
    event_root.get_extra()[rules.EXTRA.JOB_ID] = job_id
    event_root.get_extra()[rules.EXTRA.COUNTER] = job_number
    input_dict = {
        zoo.INPUT_JOB_ID: job_id,
        zoo.INPUT_EVENT: event_root,
    }
    control_job_path = zoo.join(zoo.CONTROL_JOBS_PATH, job_id)
    with client.transaction("add_event") as trans:
        client.TransactionalQueue(zoo.INPUT_PATH).put(trans, pickle.dumps(input_dict))
        trans.create(control_job_path)
        trans.pcreate(zoo.join(control_job_path, zoo.CONTROL_PARENTS), parents_list)
        trans.pcreate(zoo.join(control_job_path, zoo.CONTROL_ADDED), time.time())
    contextlog.get_logger(job_id=job_id).info("Registered job with number %d", job_number)
    return job_id
def run_in_context(method, kwargs=None, job_id=None, extra=None, fatal=True):
    """Run *method* (a callable or an already-dumped state blob) in a JobThread.

    Blocks until the job finishes and returns a _Result with the recorded
    steps and end state.  If the job failed and *fatal* is true, the failure
    is re-raised as RuntimeError; otherwise it is only logged.
    """
    if callable(method):
        state = context.dump_call(method, (kwargs or {}))
    else:
        assert isinstance(method, bytes)
        state = method
    backend = _Backend()
    job_thread = context.JobThread(
        backend=backend,
        job_id=(job_id or str(uuid.uuid4())),
        state=state,
        extra=extra,
        fatal_internal=True,
    )
    job_thread.start()
    job_thread.join()
    failure = backend.end.exc
    if failure is not None:
        if fatal:
            raise RuntimeError(failure)
        get_logger().error(failure)
    return _Result(job_id, backend.steps, backend.end)
def increment(self):
    """Atomically bump the stored counter and return its PREVIOUS value.

    The update is serialized through a ZooKeeper lock node placed next to
    the counter path, so concurrent increments cannot lose updates.
    """
    with self._client.zk.Lock(join(self._path, "__lock__")):
        previous = self.get()
        bumped = previous + 1
        self._client.zk.set(self._path, _encode_value(bumped))
        get_logger().debug("Value changed: %d -> %d", previous, bumped, comment=self._path)
    return previous
def _poll_running(self):
    # Sweep RUNNING_PATH for orphaned or stale tasks: tasks whose control
    # nodes are gone are removed, sufficiently old unfinished ones are
    # pushed back to the ready-queue.
    for task_id in self._client.get_children(zoo.RUNNING_PATH):
        if self._stop_flag:
            break
        task_lock_path = zoo.join(zoo.RUNNING_PATH, task_id, zoo.LOCK)
        try:  # XXX: There is no need to control lock
            running_dict = self._client.pget(zoo.join(zoo.RUNNING_PATH, task_id))
            contextlog.get_logger(job_id=running_dict[zoo.RUNNING_JOB_ID], task_id=task_id)
            control_task_path = zoo.join(zoo.CONTROL_JOBS_PATH, running_dict[zoo.RUNNING_JOB_ID],
                                         zoo.CONTROL_TASKS, task_id)
            created = self._client.pget(zoo.join(control_task_path, zoo.CONTROL_TASK_CREATED))
            recycled = self._client.pget(zoo.join(control_task_path, zoo.CONTROL_TASK_RECYCLED))
        except zoo.NoNodeError:  # XXX: Tasks without jobs
            # The job's control nodes are gone -> drop the running entry,
            # but only if we can grab its lock.
            lock = self._client.SingleLock(task_lock_path)
            with lock.try_context() as lock:
                if lock is not None:
                    self._remove_running(lock, task_id)
            continue
        if max(created or 0, recycled or 0) + self._delay <= time.time():  # Grab only old tasks
            lock = self._client.SingleLock(task_lock_path)
            with lock.try_context() as lock:
                if lock is not None:
                    if self._client.pget(zoo.join(control_task_path, zoo.CONTROL_TASK_FINISHED)) is None:
                        self._push_back_running(lock, task_id)
                    else:
                        self._remove_running(lock, task_id)
def set_value(self, path, value, version=None):
    """Best-effort CAS write: True on success, False on a version conflict.

    Version conflicts are logged (with traceback) instead of propagating.
    """
    try:
        self.replace_value(path, value=value, version=version, default=None)
    except CasVersionError:
        get_logger().exception("Can't set '%s' value with version %s", path, version)
        return False
    return True
def _find_job_and_run(backend, scripts_dir, conns):
    """Subprocess entry point: take at most one job from the backend and run it.

    Sends the started job id (or None when there is nothing to run) through
    the pipe's write end so the parent can track the process.
    """
    _unlock_logging()
    (read_conn, write_conn) = conns
    read_conn.close()  # The child only writes; the parent owns the read end.
    sys.dont_write_bytecode = True
    # Bind the logger up-front.  Previously it was assigned only after a job
    # was found, so any failure before that point raised NameError inside
    # the "except" clause instead of logging the real exception.
    logger = get_logger()
    try:
        with backend.connected():
            job = backend.jobs_process.get_job()
            if job is None:
                get_logger().info("Nothing to run; exit")
                write_conn.send(None)
                write_conn.close()
                return
            scripts_path = os.path.join(scripts_dir, job.head)
            sys.path.insert(0, scripts_path)
            logger = get_logger(job_id=job.job_id)
            logger.info("Starting the job")
            _rename_process(job)
            thread = context.JobThread(
                backend=backend,
                job_id=job.job_id,
                state=job.state,
                extra={"head": job.head},
                fatal_internal=(not job.respawn),
            )
            thread.start()
            write_conn.send(job.job_id)
            write_conn.close()
            thread.join()
    except Exception:
        logger.exception("Unhandled exception in subprocess")
        raise
def restore_call(state):
    """Unpickle a previously dumped continulet state so it can be resumed."""
    get_logger().debug("Restoring the continulet state...")
    import _continuation  # Lazy import: the module exists only under PyPy
    restored = pickle.loads(state)
    assert isinstance(restored, _continuation.continulet), "The unpickled state is a garbage!"
    return restored
def load_file(file_path):
    """Load and parse a YAML config file.

    Raises ValueError (naming the offending file) on any parse failure.
    """
    with open(file_path) as yaml_file:
        get_logger().debug("Loading config from '%s'...", file_path)
        try:
            return yaml.load(yaml_file, _YamlLoader)
        except Exception as err:
            # Reraise internal exception as standard ValueError and show the
            # incorrect file.  Chain with "from err" so the original parser
            # error is reported as the explicit cause.
            raise ValueError("Incorrect YAML syntax in file '{}'".format(file_path)) from err
def restore_call(state):
    """Unpickle a dumped continulet state for resumption."""
    get_logger().debug("Restoring the continulet state...")
    import _continuation  # PyPy-only module, hence the lazy import
    cont = pickle.loads(state)
    assert isinstance(cont, _continuation.continulet), "The unpickled state is a garbage!"
    return cont
def replace_value(self, path, value=CasNoValue, version=None, default=CasNoValue, fatal_write=True):
    """
    replace_value() - implementation of the CAS: stores the new value only if
    it supersedes the existing version.  The standard kazoo set() requires a
    strict comparison against the incremented version of the data itself.
    If:
        value == CasNoValue -- read operation
        value == ...        -- write the new value and return the old one
        version is None     -- unconditional write
        version is not None -- write only if version > old_version
    Returns (old CasData, write_ok) where write_ok is True/False for writes
    and None for pure reads.
    """
    lock_path = _get_path_cas_storage_lock(path)
    path = _get_path_cas_storage(path)
    # Make sure the storage node exists; losing the race to another creator is fine.
    try:
        with self._client.make_write_request("cas_ensure_path()") as request:
            request.create(path, recursive=True)
    except zoo.NodeExistsError:
        pass
    with self._client.get_lock(lock_path):
        old = self._client.get(path)
        if old is zoo.EmptyValue:
            if default is CasNoValue:
                raise CasNoValueError()
            old = CasData(value=default, version=None, stored=None)
        else:
            old = CasData(
                value=old["value"],
                version=old["version"],
                stored=from_isotime(old["stored"]),
            )
        if value is not CasNoValue:
            if version is not None and old.version is not None and version <= old.version:
                # Stale writer: either raise or just log, depending on fatal_write.
                write_ok = False
                msg = "Can't rewrite '{}' with version {} (old version: {})".format(path, version, old.version)
                if fatal_write:
                    raise CasVersionError(msg)
                else:
                    get_logger().debug(msg)
            else:
                with self._client.make_write_request("cas_save()") as request:
                    request.set(path, {
                        "value": value,
                        "version": version,
                        "stored": make_isotime(),
                    })
                write_ok = True
        else:
            write_ok = None  # Pure read
    return (old, write_ok)
def _set_jobs_limit(self, backend):
    """Recompute the jobs limit for this worker and apply it if it changed."""
    if self._app_config.max_jobs is not None:
        jobs_limit = self._app_config.max_jobs
    else:
        # No explicit limit: share all known jobs evenly between live workers.
        all_jobs = backend.jobs_control.get_jobs_count()
        workers = max(len(backend.system_apps_state.get_full_state().get("worker", {})), 1)
        jobs_limit = int(all_jobs / workers + 1)
    if self._jobs_limit != jobs_limit:
        get_logger().info("Set new jobs limit: %d", jobs_limit)
        self._jobs_limit = jobs_limit
def get_exposed(backend, loader):
    """Return (head, exposed, errors, exc) for the current scripts HEAD.

    When the loader fails, the exception is logged and summarized into *exc*;
    the remaining slots stay None.  With no HEAD set, everything is None.
    """
    head = backend.scripts.get_head()
    (exposed, errors, exc) = (None, None, None)
    if head is not None:
        try:
            (exposed, errors) = loader.get_exposed(head)
        except Exception as err:
            exc = "{}: {}".format(type(err).__name__, err)
            get_logger().exception("Can't load HEAD '%s'", head)
    return (head, exposed, errors, exc)
def get_exposed(backend, loader):
    """Return (head, exposed, errors, exc) for the current rules HEAD.

    When the loader fails, the exception is logged and summarized into *exc*;
    the remaining slots stay None.  With no HEAD set, everything is None.
    """
    head = backend.rules.get_head()
    (exposed, errors, exc) = (None, None, None)
    if head is not None:
        try:
            (exposed, errors) = loader.get_exposed(head)
        except Exception as err:
            exc = "{}: {}".format(type(err).__name__, err)
            get_logger().exception("Can't load HEAD '%s'", head)
    return (head, exposed, errors, exc)
def get_backend(self):
    # Context-style generator: lease a backend from the pool, hand it to the
    # caller via yield, and always return it to the pool afterwards.
    backend = self._free_backends.get()
    if not backend.is_connected():
        try:
            backend.open()
        except Exception:
            # Return the broken backend to the pool before propagating, so
            # the pool's size stays constant.
            get_logger().error("Can't open backend %s", backend)
            self._free_backends.put(backend)
            raise
    try:
        yield backend
    finally:
        self._free_backends.put(backend)
def get_backend(self):
    # Lease a backend from the pool, reviving dead connections on the way out.
    backend = self._free_backends.get()
    if not backend.is_alive():
        if backend.is_opened():
            # Best-effort close of the stale connection before reopening.
            try:
                backend.close()
            except Exception:
                get_logger().error("Can't close backend %s", backend)
        try:
            backend.open()
        except Exception:
            # Return the broken backend so the pool size stays constant.
            get_logger().error("Can't open backend %s", backend)
            self._free_backends.put(backend)
            raise
    return backend
def add_job(self, job):
    # Register a single job; returns False (and logs) when the id already exists.
    logger = get_logger(
        job_id=job.job_id,
        head=job.head,
        method=job.method_name,
        kwargs=job.kwargs,
    )
    try:
        with self._client.make_write_request("add_job()") as request:
            logger.info("Registering job")
            # Static job description node...
            request.create(_get_path_job(job.job_id), {
                "head": job.head,
                "method": job.method_name,
                "kwargs": job.kwargs,
                "created": make_isotime(),
                "respawn": job.respawn,
            })
            # ...plus the mutable state node, created empty.
            request.create(_get_path_job_state(job.job_id), {
                "state": job.state,
                "stack": None,
                "finished": None,
                "retval": None,
                "exc": None,
            })
        # The write request commits on "with"-exit, so NodeExistsError is
        # raised there and caught below.
        return True
    except zoo.NodeExistsError:
        logger.error("The job already exists")
        return False
def add_job(self, job):
    # Register a single job; returns False (and logs) when the id already exists.
    logger = get_logger(
        job_id=job.job_id,
        head=job.head,
        method=job.method_name,
        kwargs=job.kwargs,
    )
    try:
        with self._client.make_write_request("add_job()") as request:
            logger.info("Registering job")
            # Static job description node...
            request.create(
                _get_path_job(job.job_id), {
                    "head": job.head,
                    "method": job.method_name,
                    "kwargs": job.kwargs,
                    "created": make_isotime(),
                    "respawn": job.respawn,
                })
            # ...plus the mutable state node, created empty.
            request.create(
                _get_path_job_state(job.job_id), {
                    "state": job.state,
                    "stack": None,
                    "finished": None,
                    "retval": None,
                    "exc": None,
                })
        # The write request commits on "with"-exit, so NodeExistsError is
        # raised there and caught below.
        return True
    except zoo.NodeExistsError:
        logger.error("The job already exists")
        return False
def open(self):
    # Create and start the kazoo client, retrying the initial connection up
    # to self._start_retries times (None means retry forever).
    if self._chroot is not None:
        self._ensure_chroot()
    self.zk = kazoo.client.KazooClient(
        hosts=self._hosts,
        timeout=self._timeout,
        randomize_hosts=self._randomize_hosts,
        command_retry={"max_delay": 60},
    )
    if self._chroot is not None:
        # Apply the chroot only after _ensure_chroot() created it above.
        self.zk.chroot = self._chroot
    logger = get_logger()
    start_retries = self._start_retries
    while True:
        remaining = ("inf" if start_retries is None else start_retries)
        logger.debug("Trying to connect to ZK, attempts remaining: %s (timeout: %d)",
                     remaining, self._start_timeout, hosts=self._hosts)
        try:
            self.zk.start(timeout=self._start_timeout)
            break
        except Exception:
            logger.exception("Can't connect to ZK in this time")
            if start_retries is not None:
                if start_retries > 0:
                    start_retries -= 1
                else:
                    raise
    logger.debug("Started ZK client", hosts=self._hosts)
def _get_exposed_unsafe(path):
    # Import every module found under *path* and collect the callables marked
    # as exposed via the _ATTR_EXPOSED attribute.  "Unsafe" because it
    # mutates sys.path without any locking.
    assert os.access(path, os.F_OK), "Can't find module path: {}".format(path)
    logger = get_logger()
    logger.debug("Loading scripts from path: %s", path)
    sys.path.insert(0, path)
    try:
        modules = {}
        errors = {}  # Import failures as tracebacks, keyed by module name
        for name in _get_all_modules(path):
            try:
                modules[name] = importlib.import_module(name)
            except Exception:
                errors[name] = traceback.format_exc()
                logger.exception("Can't import module '%s' from path '%s'", name, path)
        logger.debug("Found %d modules in path '%s'", len(modules), path)
        methods = {}
        for (module_name, module) in modules.items():
            for obj_name in dir(module):
                if obj_name.startswith("__"):
                    continue
                obj = getattr(module, obj_name)
                if callable(obj) and getattr(obj, _ATTR_EXPOSED, False):
                    methods["{}.{}".format(module_name, obj_name)] = obj
        logger.debug("Loaded %d exposed methods from path '%s'", len(methods), path)
        return (methods, errors)
    finally:
        # NOTE: removes the FIRST occurrence of path, which may differ from
        # the entry inserted above if path was already on sys.path.
        sys.path.remove(path)
def _push_back_running(self, lock, task_id):
    # Return a stale RUNNING task to the ready-queue so another worker can
    # pick it up; the lock release, deletion and re-queue happen in a single
    # transaction.
    running_dict = self._client.pget(zoo.join(zoo.RUNNING_PATH, task_id))
    job_id = running_dict[zoo.RUNNING_JOB_ID]
    logger = contextlog.get_logger()
    try:
        with self._client.transaction("push_back_running") as trans:
            lock.release(trans)
            trans.delete(zoo.join(zoo.RUNNING_PATH, task_id))
            self._ready_queue.put(trans, pickle.dumps({
                zoo.READY_JOB_ID: job_id,
                zoo.READY_TASK_ID: task_id,
                zoo.READY_HANDLER: running_dict[zoo.RUNNING_HANDLER],
                zoo.READY_STATE: running_dict[zoo.RUNNING_STATE],
            }))
            # Stamp the recycle time so the poller's staleness check restarts.
            trans.pset(zoo.join(zoo.CONTROL_JOBS_PATH, job_id, zoo.CONTROL_TASKS, task_id,
                                zoo.CONTROL_TASK_RECYCLED), time.time())
            logger.info("Pushed back")
    except zoo.TransactionError:
        logger.exception("Cannot push-back running")
def run(self):
    # Task thread main loop: initialize the continulet, then keep stepping
    # it until it finishes, gets cancelled, or the thread is asked to stop.
    logger = contextlog.get_logger(
        job_id=self._task.get_job_id(),
        task_id=self._task.get_task_id(),
    )
    try:
        self._task.init_cont()
    except Exception:
        logger.exception("Cont-init error")
        self._saver(self._task, None, traceback.format_exc(), None)
        # NOTE(review): no early return here -- control falls through to the
        # loop below after an init failure; presumably is_pending() is false
        # in that case, but confirm against the task implementation.
    while not self._stop_flag and self._task.is_pending():
        if not self._controller(self._task):
            # The controller vetoed further execution (e.g. job deleted).
            self._saver(self._task, None, None, None)
            logger.info("Task is cancelled")
            return
        (stack_list, exc, state) = self._task.step()
        if exc is not None:
            logger.error("Unhandled step() error")
            self._saver(self._task, None, exc, None)
            return
        self._saver(self._task, stack_list, None, state)
    if not self._task.is_pending():
        self._saver(self._task, None, None, None)
        logger.info("Task is finished")
    else:
        logger.info("Task is stopped")
def _saver_unsafe(self, task, stack_list, exc, state):
    # Persist one step of a task: update the RUNNING node, the task status,
    # the (filtered) stack and the exception in a single transaction.
    job_id = task.get_job_id()
    task_id = task.get_task_id()
    logger = contextlog.get_logger(job_id=job_id, task_id=task_id)
    try:
        with self._client.transaction("saver") as trans:
            trans.pset(zoo.join(zoo.RUNNING_PATH, task_id), {
                zoo.RUNNING_JOB_ID: job_id,
                # NOTE(review): the handler is always cleared here --
                # presumably it is consumed on the first run; confirm.
                zoo.RUNNING_HANDLER: None,
                zoo.RUNNING_STATE: state,
            })
            control_task_path = zoo.join(zoo.CONTROL_JOBS_PATH, job_id, zoo.CONTROL_TASKS, task_id)
            if state is None:
                # No continuation state left -> the task has finished.
                trans.pset(zoo.join(control_task_path, zoo.CONTROL_TASK_FINISHED), time.time())
                status = zoo.TASK_STATUS.FINISHED
            else:
                status = zoo.TASK_STATUS.READY
            trans.pset(zoo.join(control_task_path, zoo.CONTROL_TASK_STATUS), status)
            # Keep only the stack frames whose file lives under the rules tree.
            trans.pset(zoo.join(control_task_path, zoo.CONTROL_TASK_STACK), (
                stack_list and [
                    item for item in stack_list
                    if item[0].startswith(self._rules_path)
                ]
            ))
            trans.pset(zoo.join(control_task_path, zoo.CONTROL_TASK_EXC), exc)
    except zoo.TransactionError:
        logger.exception("saver error, current task has been dropped")
        raise
    logger.debug("Saved; status: %s", status)
def process(self):
    # Worker main loop: pull ready jobs from the backend generator and run
    # them, throttling on the concurrency limit and sleeping when idle.
    logger = get_logger()
    sleep_mode = False  # True after we have already logged "no jobs"
    with self.get_backend_object().connected() as backend:
        while not self._stop_event.is_set():
            gen_jobs = backend.jobs_process.get_ready_jobs()
            while not self._stop_event.is_set():
                self._manager.manage(backend)
                self._write_worker_state(backend)
                if self._manager.get_current() >= self._app_config.max_jobs:
                    # At capacity: back off before trying again.
                    logger.debug("Have reached the maximum concurrent jobs %(maxjobs)d,"
                                 " sleeping %(delay)f seconds...",
                                 {"maxjobs": self._app_config.max_jobs,
                                  "delay": self._app_config.max_jobs_sleep})
                    time.sleep(self._app_config.max_jobs_sleep)
                else:
                    try:
                        job = next(gen_jobs)
                    except StopIteration:
                        # Generator exhausted: log once, sleep, then break out
                        # to the outer loop to request a fresh generator.
                        if not sleep_mode:
                            logger.debug("No jobs in queue, sleeping for %(delay)f seconds...",
                                         {"delay": self._app_config.empty_sleep})
                            sleep_mode = True
                        time.sleep(self._app_config.empty_sleep)
                        break
                    else:
                        sleep_mode = False
                        if not self._manager.run_job(job, self.get_backend_object()):
                            # Could not start it -> release back to the queue.
                            backend.jobs_process.release_job(job.job_id)
                            self._not_started += 1
def _try_start_job(self):
    # Spawn a candidate subprocess that will pick up at most one job.
    # Returns True when a job was found and started; a falsy value otherwise.
    logger = get_logger()
    logger.info("Starting candidate process")
    (read_conn, write_conn) = multiprocessing.Pipe(True)
    proc = multiprocessing.Process(
        target=_find_job_and_run,
        kwargs={
            "backend": self.get_backend_object(),
            "scripts_dir": self._config.core.scripts_dir,
            "conns": (read_conn, write_conn),
        },
    )
    proc.start()
    write_conn.close()  # The parent only reads; the child owns the write end.
    if read_conn.poll(self._app_config.wait_slowpokes):
        job_id = read_conn.recv()
        if job_id is not None:  # A job was found and started
            self._procs[job_id] = proc
            return True
        else:  # No jobs available; reap the child
            proc.join()
    else:  # Slowpoke: the child did not answer in time -- kill it
        logger.error("Detected slowpoke process %d", proc.pid)
        killed = _send_signal(proc, signal.SIGKILL)
        proc.join()
        if killed:
            logger.info("Killed slowpoke job process %d with retcode %d", proc.pid, proc.exitcode)
        else:
            logger.info("Found dead slowpoke job process %d with retcode %d", proc.pid, proc.exitcode)
        self._not_started += 1
    # NOTE(review): the "return True" path above skips this close() --
    # presumably harmless (GC closes the fd), but confirm it is intended.
    read_conn.close()
def __init__(self, backend, job_id, state, extra, __unpickle=False):  # pylint: disable=unused-argument
    # __unpickle is accepted but unused here -- NOTE(review): presumably it
    # exists so the thread can be re-created during unpickling; confirm
    # against the pickling support code.
    threading.Thread.__init__(self, name="JobThread::" + job_id)
    self._backend = backend
    self._job_id = job_id
    self._state = state
    self._extra = extra
    self._cont = None  # Created lazily when the thread runs
    self._log_context = get_logger().get_context()  # Proxy context into the continulet
def _remove_running(self, lock, task_id):
    """Drop a task's RUNNING node, releasing its lock in the same transaction."""
    logger = contextlog.get_logger()
    try:
        with self._client.transaction("remove_running") as txn:
            lock.release(txn)
            txn.delete(zoo.join(zoo.RUNNING_PATH, task_id))
            logger.info("Running removed")
    except zoo.TransactionError:
        logger.exception("Cannot remove running")
def method(): log = contextlog.get_logger(ctx_internal="method") nonlocal saved_logger saved_logger = log log.debug("Message #2") try: raise RuntimeError except Exception: log.exception("Exception")
def manage(self, backend):
    # Reap finished job processes and kill the ones whose job was deleted.
    # Iterates over a copy -- presumably _finish()/_kill() mutate
    # self._procs; confirm against those helpers.
    for (job_id, (method_name, proc)) in self._procs.copy().items():
        logger = get_logger(job_id=job_id, method=method_name)
        if not proc.is_alive():
            logger.info("Finished job process %(pid)d with retcode %(retcode)d",
                        {"pid": proc.pid, "retcode": proc.exitcode})
            self._finish(job_id)
        elif backend.jobs_process.is_deleted_job(job_id):
            self._kill(proc)
            self._finish(job_id)
def process(self):
    # Collector main loop: run one GC pass per iteration and fall into
    # sleep mode when there is nothing to do.
    logger = get_logger()
    with self.get_backend_object().connected() as backend:
        sleep_mode = False
        while not self._stop_event.is_set():
            sleep_mode = (not self._gc_jobs(backend))  # Separate function for a different log context
            # NOTE(review): this condition looks inverted -- it logs
            # "entering sleep mode" and sleeps right after a PRODUCTIVE pass
            # (sleep_mode is False) and never after an idle one; confirm the
            # intended logic against the other process() loops in this file.
            if not sleep_mode:
                logger.debug("No jobs in list, entering to sleep mode with interval %f seconds...",
                             self._app_config.empty_sleep)
                sleep_mode = True
                time.sleep(self._app_config.empty_sleep)
def _kill(self, proc):
    # Terminate a job process and wait for it; failures are logged and
    # swallowed (best-effort cleanup).
    logger = get_logger()
    logger.info("Killing job process %(pid)d...", {"pid": proc.pid})
    try:
        proc.terminate()
        proc.join()
    except Exception:
        logger.exception("Can't kill process %(pid)d; ignored", {"pid": proc.pid})
        return
    logger.info("Killed job process %(pid)d with retcode %(exitcode)d",
                {"pid": proc.pid, "exitcode": proc.exitcode})
def run(self):
    # Thread body: restore the continulet and drive it to completion,
    # persisting intermediate state to the backend after every switch.
    logger = get_logger(**self._log_context)
    logger.debug("Initializing context...")
    try:
        self._cont = restore_call(self._state)
    except Exception:
        logger.exception("Context initialization has failed")
        self._backend.jobs_process.done_job(
            job_id=self._job_id,
            retval=None,
            exc=traceback.format_exc(),
        )
        raise
    logger.debug("Activation...")
    while self._cont.is_pending():
        try:
            logger.debug("Entering continulet...")
            stack_or_retval = self._cont.switch()
            logger.debug("Exited from continulet")
            if self._cont.is_pending():  # In progress
                self._backend.jobs_process.save_job_state(
                    job_id=self._job_id,
                    state=pickle.dumps(self._cont),
                    stack=stack_or_retval,
                )
            else:  # Done
                self._backend.jobs_process.done_job(
                    job_id=self._job_id,
                    retval=stack_or_retval,
                    exc=None,
                )
        except Exception as err:
            if isinstance(err, SuicideError):
                logger.warning("Suicide; fatal_internal=%s", self._fatal_internal)
            else:
                logger.exception("Unhandled step error; fatal_internal=%s", self._fatal_internal)
            # self._cont.switch() switches the stack, so we will see a valid exception, up to this place
            # in the script. sys.exc_info() return a raw exception data. Some of them can't be pickled, for
            # example, traceback-object. For those who use the API, easier to read the text messages.
            # traceback.format_exc() simply converts data from sys.exc_info() into a string.
            if self._fatal_internal:
                self._backend.jobs_process.done_job(
                    job_id=self._job_id,
                    retval=None,
                    exc=traceback.format_exc(),
                )
                logger.debug("Context failed")
                return
    logger.debug("Context finished")
def _remove_modules_unsafe(path):
    """Evict from sys.modules every module that was loaded from *path*."""
    logger = get_logger()
    logger.debug("Removed modules with path: %s", path)
    prefix = os.path.normpath(path) + os.path.sep  # Hoisted loop invariant
    for name in list(sys.modules):
        module_path = getattr(sys.modules[name], "__file__", None)
        if module_path is None:  # FIXME: We don't support the namespaces yet
            logger.debug("Ignored module/package without __file__ attribute: %s", name)
        elif module_path.startswith(prefix):
            logger.debug("Removed old module: %s", name)
            del sys.modules[name]
def _poll_running(self):
    # Sweep RUNNING_PATH for orphaned or stale tasks: tasks whose control
    # nodes are gone are removed, sufficiently old unfinished ones are
    # pushed back to the ready-queue.
    for task_id in self._client.get_children(zoo.RUNNING_PATH):
        if self._stop_flag:
            break
        task_lock_path = zoo.join(zoo.RUNNING_PATH, task_id, zoo.LOCK)
        try:  # XXX: There is no need to control lock
            running_dict = self._client.pget(
                zoo.join(zoo.RUNNING_PATH, task_id))
            contextlog.get_logger(job_id=running_dict[zoo.RUNNING_JOB_ID], task_id=task_id)
            control_task_path = zoo.join(zoo.CONTROL_JOBS_PATH, running_dict[zoo.RUNNING_JOB_ID],
                                         zoo.CONTROL_TASKS, task_id)
            created = self._client.pget(
                zoo.join(control_task_path, zoo.CONTROL_TASK_CREATED))
            recycled = self._client.pget(
                zoo.join(control_task_path, zoo.CONTROL_TASK_RECYCLED))
        except zoo.NoNodeError:  # XXX: Tasks without jobs
            # The job's control nodes are gone -> drop the running entry,
            # but only if we can grab its lock.
            lock = self._client.SingleLock(task_lock_path)
            with lock.try_context() as lock:
                if lock is not None:
                    self._remove_running(lock, task_id)
            continue
        if max(created or 0, recycled or 0) + self._delay <= time.time():  # Grab only old tasks
            lock = self._client.SingleLock(task_lock_path)
            with lock.try_context() as lock:
                if lock is not None:
                    if self._client.pget(
                            zoo.join(control_task_path, zoo.CONTROL_TASK_FINISHED)) is None:
                        self._push_back_running(lock, task_id)
                    else:
                        self._remove_running(lock, task_id)
def process(self):
    # Collector main loop: run one GC pass per iteration and fall into
    # sleep mode when there is nothing to do.
    logger = get_logger()
    with self.get_backend_object().connected() as backend:
        sleep_mode = False
        while not self._stop_event.is_set():
            sleep_mode = (
                not self._gc_jobs(backend)
            )  # Separate function for a different log context
            # NOTE(review): this condition looks inverted -- it logs
            # "entering sleep mode" and sleeps right after a PRODUCTIVE pass
            # (sleep_mode is False) and never after an idle one; confirm the
            # intended logic against the other process() loops in this file.
            if not sleep_mode:
                logger.debug(
                    "No jobs in list, entering to sleep mode with interval %f seconds...",
                    self._app_config.empty_sleep)
                sleep_mode = True
                time.sleep(self._app_config.empty_sleep)
def _run_task(self, ready_dict):
    # Turn one ready-queue item into a running task: create the RUNNING
    # node, refresh the control records, grab the task lock and spawn the
    # worker thread -- all node changes inside a single transaction.
    job_id = ready_dict[zoo.READY_JOB_ID]
    task_id = ready_dict[zoo.READY_TASK_ID]
    state = ready_dict[zoo.READY_STATE]
    # A fresh task carries a handler; a resumed one carries a pickled state.
    handler = (
        ready_dict[zoo.READY_HANDLER]
        if state is None else None
    )
    assert task_id not in self._threads_dict, "Duplicating tasks"
    logger = contextlog.get_logger(job_id=job_id, task_id=task_id)
    lock_path = zoo.join(zoo.RUNNING_PATH, task_id, zoo.LOCK)
    try:
        parents_list = self._client.pget(zoo.join(zoo.CONTROL_JOBS_PATH, job_id, zoo.CONTROL_PARENTS))
        created = self._client.pget(zoo.join(zoo.CONTROL_JOBS_PATH, job_id, zoo.CONTROL_TASKS,
                                             task_id, zoo.CONTROL_TASK_CREATED))
    except zoo.NoNodeError:
        # The job was removed while its task waited in the queue.
        logger.exception("Missing the necessary control nodes for the ready job")
        return
    with self._client.transaction("init_task") as trans:
        trans.pcreate(zoo.join(zoo.RUNNING_PATH, task_id), {
            zoo.RUNNING_JOB_ID: job_id,
            zoo.RUNNING_HANDLER: handler,
            zoo.RUNNING_STATE: state,
        })
        for (node, value) in (
            (zoo.CONTROL_TASK_STATUS, (
                zoo.TASK_STATUS.NEW if state is None else zoo.TASK_STATUS.READY
            )),
            (zoo.CONTROL_TASK_CREATED, (created or time.time())),
            (zoo.CONTROL_TASK_RECYCLED, time.time()),
        ):
            trans.pset(zoo.join(zoo.CONTROL_JOBS_PATH, job_id, zoo.CONTROL_TASKS, task_id, node), value)
        trans.create(lock_path, ephemeral=True)  # XXX: Acquired SingleLock()
        self._ready_queue.consume(trans)
    task_thread = _TaskThread(
        controller=self._controller,
        saver=self._saver,
        parents_list=parents_list,
        job_id=job_id,
        task_id=task_id,
        handler=handler,
        state=state,
    )
    self._threads_dict[task_id] = {
        _TASK_THREAD: task_thread,
        _TASK_LOCK: self._client.SingleLock(lock_path),
    }
    message = (
        "Spawned the new job"
        if state is None else "Respawned the old job"
    )
    logger.info("%s (parents: %s)", message, parents_list)
    task_thread.start()
def init_cont(self):
    # Build the continulet exactly once: either wrap a freshly unpickled
    # handler, or restore a previously saved continuation state.
    assert self._cont is None, "Continulet is already constructed"
    logger = contextlog.get_logger(job_id=self._job_id, task_id=self._task_id)
    if self._handler is not None:
        logger.debug("Creating a new continulet...")
        handler = pickle.loads(self._handler)
        cont = _continuation.continulet(lambda _: handler())
    elif self._state is not None:
        logger.debug("Restoring the old state...")
        cont = pickle.loads(self._state)
        assert isinstance(cont, _continuation.continulet), "The unpickled state is a garbage!"
    else:
        raise RuntimeError("Required handler OR state")
    logger.debug("... continulet is ready")
    self._cont = cont
def step(self):
    # Advance the continulet by one switch.  Returns a
    # (stack_list, exc_text, pickled_state) triple where exactly one of
    # stack_list/exc_text is set.
    assert self._cont is not None, "Run init_cont() first"
    assert self._cont.is_pending(), "Attempt to step() on a finished task"
    logger = contextlog.get_logger(job_id=self._job_id, task_id=self._task_id)
    logger.debug("Activating...")
    try:
        stack_list = self._cont.switch()
        logger.debug("... stack --> %s", str(stack_list))
        return (stack_list, None, pickle.dumps(self._cont))
    except Exception:
        logger.exception("Step error")
        # self._cont.switch() switches the stack, so we will see a valid exception, up to this place in the rule.
        # sys.exc_info() return a raw exception data. Some of them can't be pickled, for example, traceback-object.
        # For those who use the API, easier to read the text messages. traceback.format_exc() simply converts data
        # from sys.exc_info() into a string.
        return (None, traceback.format_exc(), None)
def _gc_jobs(self, backend):
    # One garbage-collection pass: remove finished jobs, push unfinished
    # ones back.  Returns True when at least one job was processed.
    processed = 0
    self._dump_collector_state(backend)
    for (job_id, done) in backend.jobs_gc.get_jobs():
        logger = get_logger(job_id=job_id)
        logger.debug("Processing: done=%s", done)
        if done:
            backend.jobs_gc.remove_job_data(job_id)
            logger.info("Removed done job")
        else:
            backend.jobs_gc.push_back_job(job_id)
            logger.info("Pushed-back unfinished job")
        processed += 1
        self._processed += 1
        # NOTE(review): the state dump is placed inside the loop here so the
        # published counters stay fresh per job -- confirm this placement
        # against the original (flattened) source.
        self._dump_collector_state(backend)
    return bool(processed)
def process(self):
    # Worker main loop: keep the number of running job processes under the
    # (possibly dynamic) jobs limit and start new jobs while any are waiting.
    logger = get_logger()
    with self.get_backend_object().connected() as backend:
        while not self._stop_event.is_set():
            self._set_jobs_limit(backend)
            self._manage_processes(backend)
            self._dump_worker_state(backend)
            if len(self._procs) >= self._jobs_limit:
                # At capacity: back off before checking again.
                logger.debug("Have reached the maximum concurrent jobs %d, sleeping %f seconds...",
                             self._jobs_limit, self._app_config.max_jobs_sleep)
                time.sleep(self._app_config.max_jobs_sleep)
            elif backend.jobs_process.has_awaiting_jobs():
                if not self._try_start_job():
                    logger.debug("No jobs to start, sleeping for %f seconds...",
                                 self._app_config.empty_sleep)
                    time.sleep(self._app_config.empty_sleep)