Example #1
 def lock_release(self,
                  name,
                  lock_id,
                  timeout=None,
                  silent=False,
                  thr=None):
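     """
     Release the cluster lock <name> matching <lock_id> locally, then wait
     up to <timeout> seconds (5 by default) for peers to confirm the release,
     logging a warning on timeout.
     """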
     released = False
     if timeout is None:
         timeout = 5
     deadline = time.time() + timeout
     with shared.LOCKS_LOCK:
         if not lock_id or shared.LOCKS.get(name, {}).get("id") != lock_id:
             return
         del shared.LOCKS[name]
     shared.wake_monitor(reason="unlock", immediate=True)
     if not silent:
         thr.log.info("released locally %s", name)
     while time.time() < deadline:
         if self._lock_released(name, lock_id):
             released = True
             break
         time.sleep(0.5)
     if released is False:
         thr.log.warning('timeout waiting for lock %s %s release on peers',
                         name, lock_id)
Example #2
 def lock_release(self, name, lock_id, silent=False, thr=None):
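     """
     Drop the cluster lock <name> from the shared LOCKS dict if <lock_id>
     matches the current holder, and wake the monitor so the change is
     advertised to peers.
     """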
     with shared.LOCKS_LOCK:
         if not lock_id or shared.LOCKS.get(name, {}).get("id") != lock_id:
             return
         del shared.LOCKS[name]
     shared.wake_monitor(reason="unlock", immediate=True)
     if not silent:
         thr.log.info("released %s", name)
Example #3
 def _lock_acquire(self, nodename, name):
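     """
     Register the cluster lock <name> on behalf of <nodename> and return the
     generated lock id, or None if the lock is already held.
     """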
     with shared.LOCKS_LOCK:
         if name in shared.LOCKS:
             return
         lock_id = str(uuid.uuid4())
         shared.LOCKS[name] = {
             "requested": time.time(),
             "requester": nodename,
             "id": lock_id,
         }
     shared.wake_monitor(reason="lock", immediate=True)
     return lock_id
Example #4
    def action(self, nodename, thr=None, **kwargs):
        """
        Care with locks
        """
        thr.log_request("shutdown daemon", nodename, **kwargs)
        with shared.THREADS_LOCK:
            shared.THREADS["scheduler"].stop()
            mon = shared.THREADS["monitor"]
        if thr.stopped() or shared.NMON_DATA.status == "shutting":
            thr.log.info("already shutting")
            # wait for service shutdown to finish before releasing the dup client
            while True:
                if mon._shutdown:
                    break
                time.sleep(0.3)
            return {"status": 0}
        try:
            thr.set_nmon("shutting")
            mon.kill_procs()
            for path in shared.SMON_DATA:
                _, _, kind = split_path(path)
                if kind not in ("svc", "vol"):
                    continue
                thr.set_smon(path, local_expect="shutdown")
            self.wait_shutdown()

            # send a last status to peers so they can takeover asap
            mon.update_hb_data()

            mon._shutdown = True
            shared.wake_monitor("services shutdown done")
        except Exception as exc:
            thr.log.exception(exc)

        thr.log.info("services are now shutdown")
        while True:
            with shared.THREADS_LOCK:
                if not shared.THREADS["monitor"].is_alive():
                    break
            time.sleep(0.3)
        shared.DAEMON_STOP.set()
        return {"status": 0}
Example #5
    def _read_config(self):
        """
        Reload the node configuration file and notify the threads to do the
        same, if the file's mtime has changed since the last load.
        """
        mtime = self.get_config_mtime()
        if mtime is None:
            return
        if self.last_config_mtime is not None and \
           self.last_config_mtime >= mtime:
            return
        try:
            with shared.NODE_LOCK:
                if shared.NODE:
                    shared.NODE.close()
                shared.NODE = Node()
                shared.NODE.set_rlimit()
                shared.NODE.network_setup()
            unset_lazy(self, "config_hbs")
            if self.last_config_mtime:
                self.log.info("node config reloaded (changed)")
            else:
                self.log.info("node config loaded")
            self.last_config_mtime = mtime

            # signal the node config change to threads
            for thr in self.threads.values():
                if thr.stopped():
                    thr.unstop()
                else:
                    thr.notify_config_change()
            shared.wake_monitor(reason="config change", immediate=True)

            # signal the caller the config has changed
            return True
        except Exception as exc:
            self.log.warning("failed to load config: %s", str(exc))
Example #6
 def stop_threads(self):
     """
     Send a stop notification to all threads, and wait for them to
     complete their shutdown.
     Stop dns last, so the service is available as long as possible.
     """
     self.log.info("signal stop to all threads")
     for thr_id, thr in self.threads.items():
         if thr_id == "dns":
             continue
         thr.stop()
     shared.wake_collector()
     shared.wake_scheduler()
     shared.wake_monitor(reason="stop threads", immediate=True)
     shared.wake_heartbeat_tx()
     for thr_id, thr in self.threads.items():
         if thr_id == "dns":
             continue
         self.log.info("waiting for %s to stop", thr_id)
         thr.join()
     if "dns" in self.threads:
         self.threads["dns"].stop()
         self.log.info("waiting for dns to stop")
         self.threads["dns"].join()
Example #7
 def action(self, nodename, thr=None, **kwargs):
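     """
     Stop the whole daemon when no thr_id option is given, otherwise stop
     the designated thread(s), waking them so they notice the stop request.
     """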
     options = self.parse_options(kwargs)
     if not options.thr_id:
         thr.log_request("stop daemon", nodename, **kwargs)
         if options.get("upgrade"):
             thr.set_nmon(status="upgrade")
             thr.log.info("announce upgrade state")
         else:
             thr.set_nmon(status="maintenance")
             thr.log.info("announce maintenance state")
         time.sleep(5)
         shared.DAEMON_STOP.set()
         return {"status": 0}
     elif options.thr_id == "tx":
         thr_ids = [thr_id for thr_id in shared.THREADS.keys() if thr_id.endswith("tx")]
     else:
         thr_ids = [options.thr_id]
     for thr_id in thr_ids:
         with shared.THREADS_LOCK:
             has_thr = thr_id in shared.THREADS
         if not has_thr:
             thr.log_request("stop thread requested on non-existing thread", nodename, **kwargs)
             return {"error": "thread does not exist"*50, "status": 1}
         thr.log_request("stop thread %s" % thr_id, nodename, **kwargs)
         with shared.THREADS_LOCK:
             shared.THREADS[thr_id].stop()
         if thr_id == "scheduler":
             shared.wake_scheduler()
         elif thr_id == "monitor":
             shared.wake_monitor("shutdown")
         elif thr_id.endswith("tx"):
             shared.wake_heartbeat_tx()
         if options.get("wait", False):
             with shared.THREADS_LOCK:
                 shared.THREADS[thr_id].join()
     return {"status": 0}
Example #8
 def _store_rx_data(self, data, nodename):
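     """
     Merge a dataset received from <nodename>: apply patch deltas, handle
     ping messages, or install a full dataset, then wake the monitor if the
     local view of the cluster changed.
     """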
     current_gen = shared.REMOTE_GEN.get(nodename, 0)
     our_gen_on_peer = data.get("gen", {}).get(Env.nodename, 0)
     kind = data.get("kind", "full")
     change = False
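     # patch: apply the json deltas in generation order on top of the
     # peer dataset we already have installed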
     if kind == "patch":
         if current_gen == 0:
             # waiting for a full: ignore patches
             return
         if nodename not in shared.CLUSTER_DATA:
             # happens during init. ignore the patch, and ask for a full
             shared.REMOTE_GEN[nodename] = 0
             shared.LOCAL_GEN[nodename] = our_gen_on_peer
             return
         deltas = data.get("deltas", [])
         gens = sorted([int(gen) for gen in deltas])
         gens = [gen for gen in gens if gen > current_gen]
         if len(gens) == 0:
             #self.log.info("no more recent gen in received deltas")
             if our_gen_on_peer > shared.LOCAL_GEN[nodename]:
                 shared.LOCAL_GEN[nodename] = our_gen_on_peer
                 shared.CLUSTER_DATA[nodename]["gen"][
                     Env.nodename] = our_gen_on_peer
             return
         with shared.CLUSTER_DATA_LOCK:
             for gen in gens:
                 #self.log.debug("merge node %s gen %d (%d diffs)", nodename, gen, len(deltas[str(gen)]))
                 if gen - 1 != current_gen:
                     self.log.warning(
                         "unsynchronized node %s dataset. local gen %d, received %d. "
                         "ask for a full.", nodename, current_gen, gen)
                     shared.REMOTE_GEN[nodename] = 0
                     shared.LOCAL_GEN[nodename] = our_gen_on_peer
                     shared.CLUSTER_DATA[nodename]["gen"] = {
                         nodename: gen,
                         Env.nodename: our_gen_on_peer,
                     }
                     break
                 try:
                     json_delta.patch(shared.CLUSTER_DATA[nodename],
                                      deltas[str(gen)])
                     current_gen = gen
                     shared.REMOTE_GEN[nodename] = gen
                     shared.LOCAL_GEN[nodename] = our_gen_on_peer
                     shared.CLUSTER_DATA[nodename]["gen"] = {
                         nodename: gen,
                         Env.nodename: our_gen_on_peer,
                     }
                     self.log.debug(
                         "patch node %s dataset to gen %d, peer has gen %d of our dataset",
                         nodename, shared.REMOTE_GEN[nodename],
                         shared.LOCAL_GEN[nodename])
                     if self.patch_has_nodes_info_change(deltas[str(gen)]):
                         self.on_nodes_info_change()
                     change = True
                 except Exception as exc:
                     self.log.warning(
                         "failed to apply node %s dataset gen %d patch: %s. "
                         "ask for a full: %s", nodename, gen,
                         deltas[str(gen)], exc)
                     shared.REMOTE_GEN[nodename] = 0
                     shared.LOCAL_GEN[nodename] = our_gen_on_peer
                     shared.CLUSTER_DATA[nodename]["gen"] = {
                         nodename: gen,
                         Env.nodename: our_gen_on_peer,
                     }
                     return
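     # ping: reset the generation bookkeeping for this peer, which makes us
     # ask for a full dataset on the next exchange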
     elif kind == "ping":
         with shared.CLUSTER_DATA_LOCK:
             shared.REMOTE_GEN[nodename] = 0
             shared.LOCAL_GEN[nodename] = our_gen_on_peer
             if nodename not in shared.CLUSTER_DATA:
                 shared.CLUSTER_DATA[nodename] = {}
             shared.CLUSTER_DATA[nodename]["gen"] = {
                 nodename: 0,
                 Env.nodename: our_gen_on_peer,
             }
             shared.CLUSTER_DATA[nodename]["monitor"] = data["monitor"]
             self.log.debug(
                 "reset node %s dataset gen, peer has gen %d of our dataset",
                 nodename, shared.LOCAL_GEN[nodename])
             change = True
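     # full: install the peer dataset, carrying over statuses of services
     # absent from the new data while the peer is in init/maintenance/upgrade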
     else:
         data_gen = data.get("gen", {}).get(nodename)
         if data_gen is None:
             self.log.debug("no 'gen' in full dataset from %s: drop",
                            nodename)
             return
         last_gen = shared.REMOTE_GEN.get(nodename)
         if last_gen is not None and last_gen >= data_gen:
             self.log.debug(
                 "already installed or beyond %s gen %d dataset: drop",
                 nodename, data_gen)
             return
         node_status = data.get("monitor", {}).get("status")
         if node_status in ("init", "maintenance",
                            "upgrade") and nodename in shared.CLUSTER_DATA:
             for path, idata in shared.CLUSTER_DATA[nodename].get(
                     "services", {}).get("status", {}).items():
                 if path in data["services"]["status"]:
                     continue
                 idata["preserved"] = True
                 data["services"]["status"][path] = idata
         with shared.CLUSTER_DATA_LOCK:
             shared.CLUSTER_DATA[nodename] = data
             new_gen = data.get("gen", {}).get(nodename, 0)
             shared.LOCAL_GEN[nodename] = our_gen_on_peer
             self.on_nodes_info_change()
             shared.REMOTE_GEN[nodename] = new_gen
             shared.CLUSTER_DATA[nodename]["gen"] = {
                 nodename: new_gen,
                 Env.nodename: our_gen_on_peer,
             }
             self.log.debug(
                 "install node %s dataset gen %d, peer has gen %d of our dataset",
                 nodename, shared.REMOTE_GEN[nodename],
                 shared.LOCAL_GEN[nodename])
             change = True
     if change:
         shared.wake_monitor(
             "node %s %s dataset gen %d received through %s" %
             (nodename, kind, shared.REMOTE_GEN[nodename], self.name))