def test_invalid_znode(self):
    """A master znode holding invalid data is delivered to the master callback as None."""
    endpoint = Endpoint("host1", 10000)
    local_instance = ServiceInstance(endpoint)
    callbacks = CallbackHandler()

    # This listener registers no termination callback.
    listener = ClusterListener(
        self.client,
        "/home/my_cluster",
        local_instance,
        callbacks.promotion_callback,
        callbacks.demotion_callback,
        callbacks.master_callback)
    listener.start()

    # Create a sequential member under the master group whose data is just a plain string.
    self.client.ensure_path("/home/my_cluster/master")
    self.client.create("/home/my_cluster/master/member_", "Invalid Data", sequence=True)

    # Invalid ZNode data translates into a 'None' return.
    detected_master = callbacks.detected.get(True, 1)
    assert detected_master is None
def test_invalid_znode(self):
    """A master znode holding invalid data is delivered to the master callback as None."""
    endpoint = Endpoint("host1", 10000)
    local_instance = ServiceInstance(endpoint)
    callbacks = CallbackHandler()

    # This listener registers no termination callback.
    listener = ClusterListener(
        self.client,
        "/home/my_cluster",
        local_instance,
        callbacks.promotion_callback,
        callbacks.demotion_callback,
        callbacks.master_callback)
    listener.start()

    # Create a sequential member under the master group whose data is just a plain string.
    self.client.ensure_path("/home/my_cluster/master")
    self.client.create("/home/my_cluster/master/member_", "Invalid Data", sequence=True)

    # Invalid ZNode data translates into a 'None' return.
    detected_master = callbacks.detected.get(True, 1)
    assert detected_master is None
def __init__(self, self_instance, kazoo, cluster_root, installer, task_control, state_manager):
    """
    Store the collaborators and create the synchronization primitives used by the runner.

    The ClusterListener is only constructed here; it is started by start().

    :param self_instance: The local ServiceInstance associated with this task runner.
    :param kazoo: Kazoo client, it should be started before being passed in.
    :param cluster_root: The ZooKeeper root path for *this cluster*.
    :param installer: The PackageInstaller for MySQL.
    :param task_control: The TaskControl that interacts with the task process.
    :param state_manager: The StateManager for managing the executor state.
    """
    # Collaborators handed in by the caller.
    self._kazoo = kazoo
    self._installer = installer
    self._task_control = task_control
    self._state_manager = state_manager

    # The environment variables for the 'task_control' commands. Set by the installer.
    self._env = None

    # Internal state.
    self._lock = threading.Lock()
    # The singleton task process started by '_task_control'.
    self._popen = None
    # Indicates whether start() has already been called.
    self._started = False
    # Indicates whether stop() has already been called.
    self._stopping = False
    # Set when the task process has exited.
    self._exited = threading.Event()
    # The returncode returned by the task process or an exception.
    self._result = Queue.Queue()

    # Public events and queue.
    self.promoted = threading.Event()
    self.demoted = threading.Event()
    # Set when a master change is detected.
    self.master = Queue.Queue()

    # Listener started by start().
    self._listener = ClusterListener(
        kazoo,
        cluster_root,
        self_instance,
        promotion_callback=self._on_promote,
        demotion_callback=self._on_demote,
        master_callback=self._on_master_change)
def test_callbacks(self):
    """Drive two listeners through promotion, demotion and cluster removal and check callbacks."""
    cluster = ClusterManager(self.client, "/home/my_cluster")

    # Set up 2 listeners. Only the first one registers a termination callback.
    first_instance = ServiceInstance(Endpoint("host1", 10000))
    first_handler = CallbackHandler()
    first_listener = ClusterListener(
        self.client,
        "/home/my_cluster",
        first_instance,
        first_handler.promotion_callback,
        first_handler.demotion_callback,
        first_handler.master_callback,
        first_handler.termination_callback)
    first_listener.start()
    first_member = cluster.add_member(first_instance)

    second_instance = ServiceInstance(Endpoint("host2", 10000))
    second_handler = CallbackHandler()
    second_listener = ClusterListener(
        self.client,
        "/home/my_cluster",
        second_instance,
        second_handler.promotion_callback,
        second_handler.demotion_callback,
        second_handler.master_callback)
    second_listener.start()
    second_member = cluster.add_member(second_instance)

    # Test promotion.
    cluster.promote_member(first_member)
    assert first_handler.promoted.wait(1)
    assert second_handler.detected.get(True, 1) == first_instance

    master_data = self.storage.paths["/home/my_cluster/master/member_0000000000"]["data"]
    assert master_data == ServiceInstance.pack(first_instance)
    slave_data = self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"]
    assert slave_data == ServiceInstance.pack(second_instance)

    # Promoting the second member demotes the first one and moves the second under 'master'.
    cluster.promote_member(second_member)
    assert first_handler.demoted.wait(1)
    assert second_handler.promoted.wait(1)

    new_master_data = self.storage.paths["/home/my_cluster/master/member_0000000001"]["data"]
    assert new_master_data == ServiceInstance.pack(second_instance)
    assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths

    cluster.remove_member(second_member)
    assert second_handler.demoted.wait(1)

    # Test removing cluster.
    cluster.remove_member(first_member)
    cluster.delete_cluster()
    assert first_handler.terminated.wait(1)
def __init__(self, self_instance, kazoo, cluster_root, installer, task_control, state_manager):
    """
    Store the collaborators and create the synchronization primitives used by the runner.

    The ClusterListener is only constructed here; it is started by start().

    :param self_instance: The local ServiceInstance associated with this task runner.
    :param kazoo: Kazoo client, it should be started before being passed in.
    :param cluster_root: The ZooKeeper root path for *this cluster*.
    :param installer: The PackageInstaller for MySQL.
    :param task_control: The TaskControl that interacts with the task process.
    :param state_manager: The StateManager for managing the executor state.
    """
    # Collaborators handed in by the caller.
    self._kazoo = kazoo
    self._installer = installer
    self._task_control = task_control
    self._state_manager = state_manager

    # The environment variables for the 'task_control' commands. Set by the installer.
    self._env = None

    # Internal state.
    self._lock = threading.Lock()
    # The singleton task process started by '_task_control'.
    self._popen = None
    # Indicates whether start() has already been called.
    self._started = False
    # Indicates whether stop() has already been called.
    self._stopping = False
    # Set when the task process has exited.
    self._exited = threading.Event()
    # The returncode returned by the task process or an exception.
    self._result = Queue.Queue()

    # Public events and queue.
    self.promoted = threading.Event()
    self.demoted = threading.Event()
    # Set when a master change is detected.
    self.master = Queue.Queue()

    # Listener started by start().
    self._listener = ClusterListener(
        kazoo,
        cluster_root,
        self_instance,
        promotion_callback=self._on_promote,
        demotion_callback=self._on_demote,
        master_callback=self._on_master_change)
class MysosTaskRunner(TaskRunner):
    """
    A runner that manages the lifecycle of a MySQL task (through the provided 'task_control').

    The task is executed as a long-running process and its return code can be obtained using
    'join()'.

    Thread-safety:
      This class is accessed from the MysosExecutor thread (not the ExecutorDriver thread because
      MysosExecutor invokes operations asynchronously) and the ClusterListener thread and is
      thread-safe.

    TODO(jyx): Push the knowledge of the underlying subprocess down to the task control and stop
               the subprocess using the task control.
    """

    def __init__(self, self_instance, kazoo, cluster_root, installer, task_control, state_manager):
        """
        :param self_instance: The local ServiceInstance associated with this task runner.
        :param kazoo: Kazoo client, it should be started before being passed in.
        :param cluster_root: The ZooKeeper root path for *this cluster*.
        :param installer: The PackageInstaller for MySQL.
        :param task_control: The TaskControl that interacts with the task process.
        :param state_manager: The StateManager for managing the executor state.
        """
        self._installer = installer
        # The environment variables for the 'task_control' commands. Set by the installer.
        self._env = None
        self._task_control = task_control
        self._state_manager = state_manager

        self._lock = threading.Lock()
        self._popen = None  # The singleton task process started by '_task_control'.
        self._started = False  # Indicates whether start() has already been called.
        self._stopping = False  # Indicates whether stop() has already been called.
        self._exited = threading.Event()  # Set when the task process has exited.
        # The returncode returned by the task process or an exception.
        self._result = Queue.Queue()

        # Public events and queue.
        self.promoted = threading.Event()
        self.demoted = threading.Event()
        self.master = Queue.Queue()  # Set when a master change is detected.

        self._kazoo = kazoo
        self._listener = ClusterListener(
            kazoo,
            cluster_root,
            self_instance,
            promotion_callback=self._on_promote,
            demotion_callback=self._on_demote,
            master_callback=self._on_master_change)  # Listener started by start().

    # --- Public interface. ---
    def start(self):
        """
        Start the runner in a separate thread and wait for the task process to be forked.

        :raise TaskError: If the runner has already been started, or if installing the package,
                          bootstrapping the state or starting the task process fails.
        """
        with self._lock:
            if self._started:
                raise TaskError("Runner already started")
            self._started = True

            # Can potentially hold the lock for a long time but it's OK since the runner is not
            # accessed by multiple threads until after it's started; can be a noop as well,
            # depending on the installer implementation.
            try:
                # 1. Install the application.
                self._env = self._installer.install()
                log.info("Package installation completed. Resulting environment variables: %s" %
                         self._env)

                # 2. Restore/initialize the application state.
                self._state_manager.bootstrap(self._task_control, self._env)
                log.info("Executor state fully bootstrapped")

                # 3. Start the task subprocess.
                # Store the process so we can kill it if necessary.
                self._popen = self._task_control.start(env=self._env)
                log.info("Task started in subprocess %s" % self._popen.pid)
                defer(self._wait)

                # 4. Start monitoring.
                # Only start listening to ZK events after the task subprocess has been
                # successfully started.
                self._listener.start()
            except (PackageInstaller.Error, StateManager.Error, CalledProcessError) as e:
                raise TaskError("Failed to start MySQL task: %s" % e)

    def _wait(self):
        """Block until the task process exits, publish its return code and flag the exit."""
        # Block until the subprocess exits and delivers the return code.
        self._result.put(self._popen.wait())

        # Notify stop() if it is waiting.
        self._exited.set()

    def stop(self, timeout=10):
        """
        Stop the runner once and then stop the Kazoo client.

        :param timeout: Seconds to wait after SIGTERM (and again after SIGKILL), see '_stop()'.
        :return: The result of '_stop()', or False if stop() has already been called.
        :raise TaskError: Propagated from '_stop()' if the process group cannot be killed.
        """
        with self._lock:
            # stop() could be called by multiple threads. Locking so we only stop the runner once.
            if self._stopping:
                log.warn("The runner is already stopping/stopped")
                return False
            else:
                log.info("Stopping runner")
                self._stopping = True

        # '_stop()' acquires the (non-reentrant) lock itself so it must run outside of it.
        try:
            return self._stop(timeout)
        finally:
            self._kazoo.stop()
            log.info("Runner cleaned up")

    def _stop(self, timeout):
        """
        Stop the runner and wait for its thread (and the sub-processes) to exit.

        :param timeout: The timeout that the process should die before a hard SIGKILL is issued
                        (SIGTERM is used initially).
        :return: True if an active runner is stopped, False if the runner is not started or already
                 stopping/stopped.
        :raise TaskError: If the process has not exited 'timeout' seconds after SIGKILL.
        """
        with self._lock:
            if not self._started:
                log.warn("Cannot stop the runner because it's not started")
                return False

            if not self._popen:
                log.info("The runner task did not start successfully so no need to kill it")
                return False

            try:
                # NOTE(review): signalling the group by the child's pid assumes the task process
                # leads its own process group -- confirm against the TaskControl implementation.
                log.info("Terminating process group: %s" % self._popen.pid)
                os.killpg(self._popen.pid, signal.SIGTERM)
            except OSError as e:
                log.info("The sub-processes are already terminated: %s" % e)
                return False

        log.info("Waiting for process to terminate due to SIGTERM")

        # Escalate to SIGKILL if SIGTERM is not sufficient.
        if not self._exited.wait(timeout=timeout):
            with self._lock:
                try:
                    log.warn(
                        "Killing process group %s which failed to terminate cleanly within %s secs" %
                        (self._popen.pid, timeout))
                    os.killpg(self._popen.pid, signal.SIGKILL)
                except OSError as e:
                    log.info("The sub-processes are already terminated: %s" % e)
                    return False
        else:
            return True

        log.info("Waiting for process to terminate due to SIGKILL")
        if not self._exited.wait(timeout=timeout):
            raise TaskError("Failed to kill process group %s" % self._popen.pid)

        return True

    def get_log_position(self):
        """
        Get the log position of the MySQL slave.

        :return: The value produced by the task control's 'get_log_position()'. The original
                 contract notes that this is None if the position cannot be obtained.
        :raise TaskError: If the underlying task control command fails.
        """
        try:
            return self._task_control.get_log_position(env=self._env)
        except CalledProcessError as e:
            raise TaskError("Unable to get the slave's log position: %s" % e)

    def join(self):
        """
        Wait for the runner to terminate.

        :return: The return code of the subprocess. NOTE: A negative value -N indicates that the
                 child was terminated by signal N (on Unix).
        :exception: The TaskError exception due to an error in task control operations.
        """
        # Using 'sys.maxint' makes this forever wait interruptible.
        result = self._result.get(True, sys.maxint)
        if isinstance(result, Exception):
            raise result
        else:
            return result

    # --- ClusterListener handlers. ---
    def _on_promote(self):
        """Flag the promotion and, unless the task has exited, promote the task asynchronously."""
        self.promoted.set()
        if not self._exited.is_set():
            defer(self._promote)

    def _promote(self):
        """Promote the task; on failure report a TaskError through 'join()' and stop the runner."""
        try:
            self._task_control.promote(env=self._env)
        except CalledProcessError as e:
            self._result.put(TaskError("Failed to promote the slave: %s" % e))
            self.stop()

    def _on_demote(self):
        """
        Executor shuts itself down when demoted.
        """
        self.demoted.set()

        # Stop the runner asynchronously.
        if not self._exited.is_set():
            log.info("Shutting down runner because it is demoted.")
            # Call stop() asynchronously because this callback is invoked from the Kazoo thread
            # which we don't want to block.
            defer(self.stop)

    def _on_master_change(self, master):
        """
        Publish the detected master and, unless the task has exited, reparent to it asynchronously.

        :param master: The ServiceInstance of the detected master. May be None (the listener
                       reports invalid master ZNode data this way), in which case the value is
                       still published on 'self.master' but no reparenting is attempted.
        """
        self.master.put(master)

        if master is None:
            # Without an endpoint '_reparent()' would fail with an AttributeError in the deferred
            # thread, where nothing handles it.
            log.warn("Ignoring master change because no valid master instance was detected")
            return

        if not self._exited.is_set():
            defer(lambda: self._reparent(master))

    def _reparent(self, master):
        """Point the task at 'master'; on failure report a TaskError and stop the runner."""
        try:
            self._task_control.reparent(
                master.service_endpoint.host,
                master.service_endpoint.port,
                env=self._env)
        except CalledProcessError as e:
            self._result.put(TaskError("Failed to reparent the slave: %s" % e))
            self.stop()
class MysosTaskRunner(TaskRunner):
    """
    A runner that manages the lifecycle of a MySQL task (through the provided 'task_control').

    The task is executed as a long-running process and its return code can be obtained using
    'join()'.

    Thread-safety:
      This class is accessed from the MysosExecutor thread (not the ExecutorDriver thread because
      MysosExecutor invokes operations asynchronously) and the ClusterListener thread and is
      thread-safe.

    TODO(jyx): Push the knowledge of the underlying subprocess down to the task control and stop
               the subprocess using the task control.
    """

    def __init__(self, self_instance, kazoo, cluster_root, installer, task_control, state_manager):
        """
        :param self_instance: The local ServiceInstance associated with this task runner.
        :param kazoo: Kazoo client, it should be started before being passed in.
        :param cluster_root: The ZooKeeper root path for *this cluster*.
        :param installer: The PackageInstaller for MySQL.
        :param task_control: The TaskControl that interacts with the task process.
        :param state_manager: The StateManager for managing the executor state.
        """
        self._installer = installer
        # The environment variables for the 'task_control' commands. Set by the installer.
        self._env = None
        self._task_control = task_control
        self._state_manager = state_manager

        self._lock = threading.Lock()
        self._popen = None  # The singleton task process started by '_task_control'.
        self._started = False  # Indicates whether start() has already been called.
        self._stopping = False  # Indicates whether stop() has already been called.
        self._exited = threading.Event()  # Set when the task process has exited.
        # The returncode returned by the task process or an exception.
        self._result = Queue.Queue()

        # Public events and queue.
        self.promoted = threading.Event()
        self.demoted = threading.Event()
        self.master = Queue.Queue()  # Set when a master change is detected.

        self._kazoo = kazoo
        self._listener = ClusterListener(
            kazoo,
            cluster_root,
            self_instance,
            promotion_callback=self._on_promote,
            demotion_callback=self._on_demote,
            master_callback=self._on_master_change)  # Listener started by start().

    # --- Public interface. ---
    def start(self):
        """
        Start the runner in a separate thread and wait for the task process to be forked.

        :raise TaskError: If the runner has already been started, or if installing the package,
                          bootstrapping the state or starting the task process fails.
        """
        with self._lock:
            if self._started:
                raise TaskError("Runner already started")
            self._started = True

            # Can potentially hold the lock for a long time but it's OK since the runner is not
            # accessed by multiple threads until after it's started; can be a noop as well,
            # depending on the installer implementation.
            try:
                # 1. Install the application.
                self._env = self._installer.install()
                log.info("Package installation completed. Resulting environment variables: %s" %
                         self._env)

                # 2. Restore/initialize the application state.
                self._state_manager.bootstrap(self._task_control, self._env)
                log.info("Executor state fully bootstrapped")

                # 3. Start the task subprocess.
                # Store the process so we can kill it if necessary.
                self._popen = self._task_control.start(env=self._env)
                log.info("Task started in subprocess %s" % self._popen.pid)
                defer(self._wait)

                # 4. Start monitoring.
                # Only start listening to ZK events after the task subprocess has been
                # successfully started.
                self._listener.start()
            except (PackageInstaller.Error, StateManager.Error, CalledProcessError) as e:
                raise TaskError("Failed to start MySQL task: %s" % e)

    def _wait(self):
        """Block until the task process exits, publish its return code and flag the exit."""
        # Block until the subprocess exits and delivers the return code.
        self._result.put(self._popen.wait())

        # Notify stop() if it is waiting.
        self._exited.set()

    def stop(self, timeout=10):
        """
        Stop the runner once and then stop the Kazoo client.

        :param timeout: Seconds to wait after SIGTERM (and again after SIGKILL), see '_stop()'.
        :return: The result of '_stop()', or False if stop() has already been called.
        :raise TaskError: Propagated from '_stop()' if the process group cannot be killed.
        """
        with self._lock:
            # stop() could be called by multiple threads. Locking so we only stop the runner once.
            if self._stopping:
                log.warn("The runner is already stopping/stopped")
                return False
            else:
                log.info("Stopping runner")
                self._stopping = True

        # '_stop()' acquires the (non-reentrant) lock itself so it must run outside of it.
        try:
            return self._stop(timeout)
        finally:
            self._kazoo.stop()
            log.info("Runner cleaned up")

    def _stop(self, timeout):
        """
        Stop the runner and wait for its thread (and the sub-processes) to exit.

        :param timeout: The timeout that the process should die before a hard SIGKILL is issued
                        (SIGTERM is used initially).
        :return: True if an active runner is stopped, False if the runner is not started or already
                 stopping/stopped.
        :raise TaskError: If the process has not exited 'timeout' seconds after SIGKILL.
        """
        with self._lock:
            if not self._started:
                log.warn("Cannot stop the runner because it's not started")
                return False

            if not self._popen:
                log.info("The runner task did not start successfully so no need to kill it")
                return False

            try:
                # NOTE(review): signalling the group by the child's pid assumes the task process
                # leads its own process group -- confirm against the TaskControl implementation.
                log.info("Terminating process group: %s" % self._popen.pid)
                os.killpg(self._popen.pid, signal.SIGTERM)
            except OSError as e:
                log.info("The sub-processes are already terminated: %s" % e)
                return False

        log.info("Waiting for process to terminate due to SIGTERM")

        # Escalate to SIGKILL if SIGTERM is not sufficient.
        if not self._exited.wait(timeout=timeout):
            with self._lock:
                try:
                    log.warn(
                        "Killing process group %s which failed to terminate cleanly within %s secs" %
                        (self._popen.pid, timeout))
                    os.killpg(self._popen.pid, signal.SIGKILL)
                except OSError as e:
                    log.info("The sub-processes are already terminated: %s" % e)
                    return False
        else:
            return True

        log.info("Waiting for process to terminate due to SIGKILL")
        if not self._exited.wait(timeout=timeout):
            raise TaskError("Failed to kill process group %s" % self._popen.pid)

        return True

    def get_log_position(self):
        """
        Get the log position of the MySQL slave.

        :return: The value produced by the task control's 'get_log_position()'. The original
                 contract notes that this is None if the position cannot be obtained.
        :raise TaskError: If the underlying task control command fails.
        """
        try:
            return self._task_control.get_log_position(env=self._env)
        except CalledProcessError as e:
            raise TaskError("Unable to get the slave's log position: %s" % e)

    def join(self):
        """
        Wait for the runner to terminate.

        :return: The return code of the subprocess. NOTE: A negative value -N indicates that the
                 child was terminated by signal N (on Unix).
        :exception: The TaskError exception due to an error in task control operations.
        """
        # Using 'sys.maxint' makes this forever wait interruptible.
        result = self._result.get(True, sys.maxint)
        if isinstance(result, Exception):
            raise result
        else:
            return result

    # --- ClusterListener handlers. ---
    def _on_promote(self):
        """Flag the promotion and, unless the task has exited, promote the task asynchronously."""
        self.promoted.set()
        if not self._exited.is_set():
            defer(self._promote)

    def _promote(self):
        """Promote the task; on failure report a TaskError through 'join()' and stop the runner."""
        try:
            self._task_control.promote(env=self._env)
        except CalledProcessError as e:
            self._result.put(TaskError("Failed to promote the slave: %s" % e))
            self.stop()

    def _on_demote(self):
        """
        Executor shuts itself down when demoted.
        """
        self.demoted.set()

        # Stop the runner asynchronously.
        if not self._exited.is_set():
            log.info("Shutting down runner because it is demoted.")
            # Call stop() asynchronously because this callback is invoked from the Kazoo thread
            # which we don't want to block.
            defer(self.stop)

    def _on_master_change(self, master):
        """
        Publish the detected master and, unless the task has exited, reparent to it asynchronously.

        :param master: The ServiceInstance of the detected master. May be None (the listener
                       reports invalid master ZNode data this way), in which case the value is
                       still published on 'self.master' but no reparenting is attempted.
        """
        self.master.put(master)

        if master is None:
            # Without an endpoint '_reparent()' would fail with an AttributeError in the deferred
            # thread, where nothing handles it.
            log.warn("Ignoring master change because no valid master instance was detected")
            return

        if not self._exited.is_set():
            defer(lambda: self._reparent(master))

    def _reparent(self, master):
        """Point the task at 'master'; on failure report a TaskError and stop the runner."""
        try:
            self._task_control.reparent(
                master.service_endpoint.host,
                master.service_endpoint.port,
                env=self._env)
        except CalledProcessError as e:
            self._result.put(TaskError("Failed to reparent the slave: %s" % e))
            self.stop()
def test_callbacks(self):
    """Drive two listeners through promotion, demotion and cluster removal and check callbacks."""
    cluster = ClusterManager(self.client, "/home/my_cluster")

    # Set up 2 listeners. Only the first one registers a termination callback.
    first_instance = ServiceInstance(Endpoint("host1", 10000))
    first_handler = CallbackHandler()
    first_listener = ClusterListener(
        self.client,
        "/home/my_cluster",
        first_instance,
        first_handler.promotion_callback,
        first_handler.demotion_callback,
        first_handler.master_callback,
        first_handler.termination_callback)
    first_listener.start()
    first_member = cluster.add_member(first_instance)

    second_instance = ServiceInstance(Endpoint("host2", 10000))
    second_handler = CallbackHandler()
    second_listener = ClusterListener(
        self.client,
        "/home/my_cluster",
        second_instance,
        second_handler.promotion_callback,
        second_handler.demotion_callback,
        second_handler.master_callback)
    second_listener.start()
    second_member = cluster.add_member(second_instance)

    # Test promotion.
    cluster.promote_member(first_member)
    assert first_handler.promoted.wait(1)
    assert second_handler.detected.get(True, 1) == first_instance

    master_data = self.storage.paths["/home/my_cluster/master/member_0000000000"]["data"]
    assert master_data == ServiceInstance.pack(first_instance)
    slave_data = self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"]
    assert slave_data == ServiceInstance.pack(second_instance)

    # Promoting the second member demotes the first one and moves the second under 'master'.
    cluster.promote_member(second_member)
    assert first_handler.demoted.wait(1)
    assert second_handler.promoted.wait(1)

    new_master_data = self.storage.paths["/home/my_cluster/master/member_0000000001"]["data"]
    assert new_master_data == ServiceInstance.pack(second_instance)
    assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths

    cluster.remove_member(second_member)
    assert second_handler.demoted.wait(1)

    # Test removing cluster.
    cluster.remove_member(first_member)
    cluster.delete_cluster()
    assert first_handler.terminated.wait(1)