def wait_dns_records(self):
    if not self.wait_dns:
        return
    left = self.wait_dns
    timeout = time.time() + left
    self.svc.print_status_data_eval()
    self.log.info("wait address propagation to peers")
    path = ".monitor.nodes.'%s'.services.status.'%s'.resources.'%s'.info.ipaddr~[0-9]" % (
        rcEnv.nodename, self.svc.path, self.rid)
    try:
        result = self.svc.node._wait(path=path, duration=left)
    except KeyboardInterrupt:
        raise ex.excError("dns resolution not ready after %s (ip not in local dataset)" %
                          print_duration(self.wait_dns))
    # remaining time before the deadline
    left = timeout - time.time()
    while left:
        result = self.svc.node.daemon_get({"action": "sync"}, timeout=left)
        if result["status"] == 0:
            break
        left = timeout - time.time()
        if left <= 0:
            raise ex.excError("dns resolution not ready after %s (cluster sync timeout)" %
                              print_duration(self.wait_dns))
def queue_action(self, action, delay=0, path=None, rid=None, now=None):
    sig = (action, path, rid)
    if sig in self.running:
        self.log.debug("skip already running action '%s'", sig)
        return
    if sig in self.delayed:
        self.promote_queued_action(sig, delay, now)
        return
    exp = now + delay
    self.delayed[sig] = {
        "queued": self.now,
        "expire": exp,
        "delay": delay,
    }
    if not delay:
        self.log.debug("queued action '%s' for run in %s", sig,
                       print_duration(exp - self.now))
    else:
        self.log.debug("queued action '%s' for run in %s + %s delay", sig,
                       print_duration(exp - self.now), print_duration(delay))
    return
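# Usage illustration (hypothetical values, not taken from the scheduler docs):
# queue a "status" action for service "mysvc", resource "container#0", to run
# 60 seconds from now. A signature already queued is promoted instead of being
# queued twice; a signature already running is skipped.
#
#   self.queue_action("status", delay=60, path="mysvc", rid="container#0",
#                     now=time.time())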
def fmt_cpu_time(get, stats_data):
    if stats_data is None:
        return ""
    total = 0
    for _data in stats_data.values():
        try:
            total += get(_data)
        except (KeyError, TypeError):
            pass
    try:
        return print_duration(total)
    except Exception:
        return ""
def dequeue_actions(self):
    """
    Get merged tasks to run from get_todo(), execute them and purge
    the delayed hash.
    """
    dequeued = []
    for task in self.get_todo():
        cmd = self.format_cmd(task["action"], task["path"], task["rids"])
        log_cmd = self.format_log_cmd(task["action"], task["path"], task["rids"])
        self.log.info("run '%s' queued %s ago", " ".join(log_cmd),
                      print_duration(self.now - task["queued"]))
        self.exec_action(task["sigs"], cmd)
        dequeued += task["sigs"]
    self.delete_queued(dequeued)
def _status(self, verbose=False):
    if self.last is None:
        return rcStatus.DOWN
    if len(self.active_pairs) not in (len(self.pairs), 0):
        self.status_log("cloneset has %d/%d active devs" %
                        (len(self.active_pairs), len(self.pairs)))
        return rcStatus.WARN
    elif self.last < datetime.datetime.now() - datetime.timedelta(seconds=self.sync_max_delay):
        self.status_log("Last sync on %s older than %s" %
                        (self.last, print_duration(self.sync_max_delay)))
        return rcStatus.WARN
    else:
        self.status_log("Last sync on %s" % self.last, "info")
        return rcStatus.UP
def fmt_svc_uptime(key, stats_data):
    if stats_data is None:
        return ""
    now = time.time()
    top = 0
    for node, _data in stats_data.items():
        try:
            uptime = now - _data["services"][key]["created"]
            if uptime > top:
                top = uptime
        except (TypeError, KeyError):
            pass
    try:
        return print_duration(top)
    except Exception:
        return ""
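# Data shape illustration for the fmt_* helpers above (assumed, simplified):
# stats_data maps node names to per-node daemon stats; fmt_svc_uptime() reads
# _data["services"][key]["created"] (an epoch timestamp) and reports the
# largest uptime across nodes, while fmt_cpu_time() sums whatever its "get"
# callable extracts from each node entry. The "cpu"/"time" keys below are
# purely illustrative.
#
#   stats_data = {
#       "node1": {"services": {"mysvc": {"created": 1700000000.0,
#                                        "cpu": {"time": 12.3}}}},
#       "node2": {"services": {"mysvc": {"created": 1700003600.0}}},
#   }
#   fmt_svc_uptime("mysvc", stats_data)
#   fmt_cpu_time(lambda _data: _data["services"]["mysvc"]["cpu"]["time"], stats_data)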
def can_sync(self, target=None):
    try:
        ls = self.get_local_state()
        ts = datetime.datetime.strptime(ls['date'], "%Y-%m-%d %H:%M:%S.%f")
    except IOError:
        self.log.error("btrfs state file not found")
        return True
    except:
        import sys
        import traceback
        e = sys.exc_info()
        print(e[0], e[1], traceback.print_tb(e[2]))
        return False
    if self.skip_sync(ts):
        self.status_log("Last sync on %s older than %s" %
                        (ts, print_duration(self.sync_max_delay)))
        return False
    return True
def janitor_certificates(self):
    if self.now < self.last_janitor_certs + JANITOR_CERTS_INTERVAL:
        return
    if self.first_available_node() != rcEnv.nodename:
        return
    self.last_janitor_certs = time.time()
    for path in [p for p in shared.SERVICES]:
        try:
            obj = shared.SERVICES[path]
        except KeyError:
            continue
        if obj.kind not in ("sec", "usr"):
            continue
        try:
            ca = obj.oget("DEFAULT", "ca")
        except Exception:
            continue
        if ca != self.cluster_ca:
            continue
        cf_mtime = shared.CLUSTER_DATA.get(rcEnv.nodename, {}).get(
            "services", {}).get("config", {}).get(obj.path, {}).get("updated")
        if cf_mtime is None:
            continue
        if obj.path not in self.certificates or self.certificates[obj.path]["mtime"] < cf_mtime:
            try:
                expire = obj.get_cert_expire()
            except ex.excError:
                # usr in creation
                expire = None
            self.certificates[obj.path] = {
                "mtime": cf_mtime,
                "expire": expire,
            }
        expire = self.certificates[obj.path]["expire"]
        if not expire:
            continue
        expire_delay = expire - self.now
        #print(obj.path, "expire in:", print_duration(expire_delay))
        if expire_delay < 3600:
            # renew when less than one hour of validity remains
            self.log.info("renew %s certificate, expiring in %s",
                          obj.path, print_duration(expire_delay))
            obj.gen_cert()
def sync_status(self, verbose=False):
    self.init_src_btrfs()
    try:
        ls = self.get_local_state()
        now = datetime.datetime.now()
        last = datetime.datetime.strptime(ls['date'], "%Y-%m-%d %H:%M:%S.%f")
        delay = datetime.timedelta(seconds=self.sync_max_delay)
    except IOError:
        self.status_log("btrfs state file not found")
        return rcStatus.WARN
    except:
        import sys
        import traceback
        e = sys.exc_info()
        print(e[0], e[1], traceback.print_tb(e[2]))
        return rcStatus.WARN
    if last < now - delay:
        self.status_log("Last sync on %s older than %s" %
                        (last, print_duration(self.sync_max_delay)))
        return rcStatus.WARN
    return rcStatus.UP
def docker(self, action):
    """
    Wrap docker commands to honor <action>.
    """
    if self.lib.docker_cmd is None:
        raise ex.excError("docker executable not found")
    sec_env = {}
    cfg_env = {}
    cmd = self.lib.docker_cmd + []
    if action == "start":
        if not self.detach:
            signal.signal(signal.SIGALRM, alarm_handler)
            signal.alarm(self.start_timeout)
        if self.rm:
            self.container_rm()
        if self.container_id is None:
            self.is_up_clear_cache()
        if self.container_id is None:
            try:
                image_id = self.lib.get_image_id(self.image)
            except ValueError as exc:
                raise ex.excError(str(exc))
            if image_id is None:
                self.lib.docker_login(self.image)
            sec_env = self.kind_environment_env("sec", self.secrets_environment)
            cfg_env = self.kind_environment_env("cfg", self.configs_environment)
            cmd += ["run"]
            cmd += self._add_run_args()
            for var in sec_env:
                cmd += ["-e", var]
            for var in cfg_env:
                cmd += ["-e", var]
            cmd += [self.image]
            if self.run_command:
                cmd += self.run_command
        else:
            cmd += ["start", self.container_id]
    elif action == "stop":
        cmd += ["stop", self.container_id]
    elif action == "kill":
        cmd += ["kill", self.container_id]
    else:
        self.log.error("unsupported docker action: %s", action)
        return 1
    env = {}
    env.update(os.environ)
    env.update(sec_env)
    env.update(cfg_env)
    try:
        ret = self.vcall(cmd, warn_to_info=True, env=env)[0]
    except KeyboardInterrupt:
        self.log.error("%s timeout exceeded", print_duration(self.start_timeout))
        if action == "start":
            cmd = self.lib.docker_cmd + ["kill", self.container_name]
            self.vcall(cmd, warn_to_info=True, env=env)
        ret = 1
    if not self.detach:
        signal.alarm(0)
    if ret != 0:
        raise ex.excError
    if action == "start":
        self.is_up_clear_cache()
    elif action in ("stop", "kill"):
        if self.rm:
            self.container_rm()
        self.is_up_clear_cache()
        self.lib.docker_stop()
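# Note on the start timeout handling in docker() above: signal.alarm() delivers
# SIGALRM after start_timeout seconds, and the "except KeyboardInterrupt" branch
# assumes the installed alarm_handler raises that exception. A minimal handler
# along those lines (a sketch under that assumption; the module's actual
# alarm_handler may differ):
#
#   def alarm_handler(signum, frame):
#       raise KeyboardInterrupt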
def _status(self, verbose=False):
    try:
        data = self.lsflash()
        self.get_last(data)
    except ex.excError as e:
        self.status_log(str(e))
        return rcStatus.WARN
    r = rcStatus.UP
    record_disabled = []
    persist_disabled = []
    record_enabled = []
    state_invalid = []
    for _data in data:
        if _data['Recording'] == "Disabled":
            record_disabled.append(_data['ID'])
        elif _data['Recording'] == "Enabled":
            record_enabled.append(_data['ID'])
        if _data['State'] != "Valid":
            state_invalid.append(_data['ID'])
        if _data['Persistent'] == "Disabled":
            persist_disabled.append(_data['ID'])
    if self.recording and len(record_disabled) > 0:
        self.status_log("Recording disabled on %s" % ','.join(record_disabled))
        r = rcStatus.WARN
    elif not self.recording and len(record_enabled) > 0:
        self.status_log("Recording enabled on %s" % ','.join(record_enabled))
        r = rcStatus.WARN
    if len(state_invalid) > 0:
        self.status_log("State not valid on %s" % ','.join(state_invalid))
        r = rcStatus.WARN
    if len(persist_disabled) > 0:
        self.status_log("Persistent disabled on %s" % ','.join(persist_disabled))
        r = rcStatus.WARN
    pairs = []
    for d in data:
        if 'ID' not in d:
            continue
        pairs.append(d['ID'])
    missing = set(self.pairs) - set(pairs)
    missing = sorted(list(missing))
    if len(missing) > 0:
        self.status_log("Missing flashcopy on %s" % ','.join(missing))
        r = rcStatus.WARN
    if self.last is None:
        return rcStatus.WARN
    elif self.last < datetime.datetime.now() - datetime.timedelta(seconds=self.sync_max_delay):
        self.status_log("Last sync on %s older than %s" %
                        (self.last, print_duration(self.sync_max_delay)))
        return rcStatus.WARN
    elif r == rcStatus.WARN:
        return rcStatus.WARN
    self.status_log("Last sync on %s" % self.last)
    return rcStatus.UP