def test_leaving(self, client):
    """Create a plugin not in the cluster and try to leave the cluster.
    Nothing should be written to etcd."""
    e = EtcdSynchronizer(self.plugin, self.watcher_ip)
    e.start_thread()
    e.leave_cluster()
    e._client.write.assert_not_called()
    e.terminate()
def test_mark_failed(self, client):
    """Create a plugin not in the cluster and try to mark it as failed.
    Nothing should be written to etcd."""
    e = EtcdSynchronizer(self.plugin, self.watcher_ip)
    e.start_thread()
    e.mark_node_failed()
    e._client.write.assert_not_called()
    e.terminate()
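# The two tests above accept a `client` argument, which suggests that
# etcd.Client is patched out for the whole test class so that e._client ends
# up as a Mock and write.assert_not_called() is available.  A minimal sketch
# of one plausible arrangement, assuming the mock library; TestWatcherPlugin
# and DummyWatcherPlugin are hypothetical names, not taken from this extract:
import unittest
from mock import patch

@patch("etcd.Client")
class TestWatcherPlugin(unittest.TestCase):
    def setUp(self):
        self.watcher_ip = "10.1.1.1"
        # Hypothetical stub plugin representing a node outside the cluster
        self.plugin = DummyWatcherPlugin(self.watcher_ip)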
def test_scale_up(self):
    # Create an existing cluster of two nodes, and a third new node
    sync1 = EtcdSynchronizer(DummyPlugin(None), '10.0.0.1')
    sync2 = EtcdSynchronizer(DummyPlugin(None), '10.0.0.2')
    sync3 = EtcdSynchronizer(DummyPlugin(None), '10.0.0.3')
    mock_client = sync1._client
    mock_client.write("/test", json.dumps({"10.0.0.1": "normal",
                                            "10.0.0.2": "normal"}))
    for s in [sync1, sync2, sync3]:
        s.start_thread()

    # Check that the third node joins the cluster
    self.wait_for_all_normal(mock_client, required_number=3)
    end = json.loads(mock_client.read("/test").value)
    self.assertEqual("normal", end.get("10.0.0.3"))
    for s in [sync1, sync2, sync3]:
        s.terminate()
def test_watcher(self):
    """Create a new 3-node cluster with one plugin not in the cluster and
    check that the main three all end up in NORMAL state"""
    e = EtcdSynchronizer(self.plugin, self.watcher_ip)
    e.start_thread()
    self.make_and_start_synchronizers(3)
    mock_client = self.syncs[0]._client
    self.wait_for_all_normal(mock_client, required_number=3)

    # Pause for one second - the watcher plugin might be called just after
    # all other nodes enter 'normal' state
    sleep(1)
    self.assertTrue(self.plugin.on_stable_cluster_called)

    end = json.loads(mock_client.read("/test").value)
    self.assertEqual("normal", end.get("10.0.0.0"))
    self.assertEqual("normal", end.get("10.0.0.1"))
    self.assertEqual("normal", end.get("10.0.0.2"))
    self.assertEqual(None, end.get("10.1.1.1"))
    e.terminate()
def test_failure(self):
    # Create synchronisers, using a FailPlugin for one which will crash and
    # not complete (simulating a failed node)
    sync1 = EtcdSynchronizer(DummyPlugin(None), '10.0.0.1')
    sync2 = EtcdSynchronizer(FailPlugin(None), '10.0.0.2')
    sync3 = EtcdSynchronizer(DummyPlugin(None), '10.0.0.3')
    mock_client = sync1._client
    for s in [sync1, sync2, sync3]:
        s.start_thread()

    # After a few seconds, the scale-up will still not have completed
    sleep(3)
    end = json.loads(mock_client.read("/test").value)
    self.assertNotEqual("normal", end.get("10.0.0.1"))
    self.assertNotEqual("normal", end.get("10.0.0.2"))
    self.assertNotEqual("normal", end.get("10.0.0.3"))

    # Start a synchroniser to take 10.0.0.2's place
    sync2.terminate()
    error_syncer = EtcdSynchronizer(NullPlugin('/test'),
                                    '10.0.0.2',
                                    force_leave=True)
    error_syncer.mark_node_failed()
    error_syncer.leave_cluster()
    error_syncer.start_thread()

    # 10.0.0.2 will be removed from the cluster, and the cluster will
    # stabilise
    self.wait_for_all_normal(mock_client, required_number=2, tries=50)
    end = json.loads(mock_client.read("/test").value)
    self.assertEqual("normal", end.get("10.0.0.1"))
    self.assertEqual("normal", end.get("10.0.0.3"))
    self.assertEqual(None, end.get("10.0.0.2"))
    for s in [sync1, sync3, error_syncer]:
        s.terminate()
def test_scale_down(self):
    # Start with a stable cluster of two nodes
    sync1 = EtcdSynchronizer(DummyPlugin(None), '10.0.1.1')
    sync2 = EtcdSynchronizer(DummyPlugin(None), '10.0.1.2')
    mock_client = sync1._client
    mock_client.write("/test", json.dumps({"10.0.1.1": "normal",
                                           "10.0.1.2": "normal"}))
    for s in [sync1, sync2]:
        s.start_thread()

    # Make the second node leave
    sync2.leave_cluster()
    sync2.thread.join(20)
    sync2.terminate()
    self.wait_for_all_normal(mock_client, required_number=1)

    # Check that it's left and the cluster is stable
    end = json.loads(mock_client.read("/test").value)
    self.assertEqual(None, end.get("10.0.1.2"))
    self.assertEqual("normal", end.get("10.0.1.1"))
    sync1.terminate()
def test_scale_down(self):
    # Start with a stable cluster of four nodes
    syncs = [EtcdSynchronizer(DummyPlugin(None), ip)
             for ip in ['10.0.1.1', '10.0.1.2', '10.0.1.3', '10.0.1.4']]
    mock_client = syncs[0]._client
    mock_client.write("/test", json.dumps({"10.0.1.1": "normal",
                                           "10.0.1.2": "normal",
                                           "10.0.1.3": "normal",
                                           "10.0.1.4": "normal"}))
    for s in syncs:
        s.start_thread()

    # Allow the cluster to stabilise, then make the second and fourth nodes
    # leave
    sleep(1)
    syncs[1].leave_cluster()
    syncs[3].leave_cluster()
    self.wait_for_all_normal(mock_client, required_number=2, tries=50)

    # Check that they've left and the cluster is stable
    end = json.loads(mock_client.read("/test").value)
    self.assertEqual("normal", end.get("10.0.1.1"))
    self.assertEqual("normal", end.get("10.0.1.3"))
    self.assertEqual(None, end.get("10.0.1.2"))
    self.assertEqual(None, end.get("10.0.1.4"))
    for s in syncs:
        s.terminate()
def main(args):
    syslog.openlog("cluster-manager", syslog.LOG_PID)
    pdlogs.STARTUP.log()
    try:
        arguments = docopt(__doc__, argv=args)
    except DocoptExit:
        pdlogs.EXITING_BAD_CONFIG.log()
        raise

    mgmt_ip = arguments['--mgmt-local-ip']
    sig_ip = arguments['--sig-local-ip']
    local_site_name = arguments['--local-site']
    remote_site_name = arguments['--remote-site']
    signaling_namespace = arguments.get('--signaling-namespace')
    log_dir = arguments['--log-directory']
    log_level = LOG_LEVELS.get(arguments['--log-level'], logging.DEBUG)

    stdout_err_log = os.path.join(log_dir, "cluster-manager.output.log")

    if not arguments['--foreground']:
        utils.daemonize(stdout_err_log)

    logging_config.configure_logging(log_level, log_dir, "cluster-manager",
                                     show_thread=True)

    # urllib3 logs a WARNING log whenever it recreates a connection, but our
    # etcd usage does this frequently (to allow watch timeouts), so
    # deliberately ignore this log
    urllib_logger = logging.getLogger('urllib3')
    urllib_logger.setLevel(logging.ERROR)

    utils.install_sigusr1_handler("cluster-manager")

    # Drop a pidfile.
    pid = os.getpid()
    with open(arguments['--pidfile'], "w") as pidfile:
        pidfile.write(str(pid) + "\n")

    plugins_dir = "/usr/share/clearwater/clearwater-cluster-manager/plugins/"
    plugins = load_plugins_in_dir(plugins_dir,
                                  PluginParams(ip=sig_ip,
                                               mgmt_ip=mgmt_ip,
                                               local_site=local_site_name,
                                               remote_site=remote_site_name,
                                               signaling_namespace=signaling_namespace))
    plugins.sort(key=lambda x: x.key())
    plugins_to_use = []
    files = []
    skip = False
    for plugin in plugins:
        for plugin_file in plugin.files():
            if plugin_file in files:
                _log.info("Skipping plugin {} because {} "
                          "is already managed by another plugin"
                          .format(plugin, plugin_file))
                skip = True

        if not skip:
            plugins_to_use.append(plugin)
            files.extend(plugin.files())

    synchronizers = []
    threads = []

    for plugin in plugins_to_use:
        syncer = EtcdSynchronizer(plugin, sig_ip, etcd_ip=mgmt_ip)
        syncer.start_thread()

        synchronizers.append(syncer)
        threads.append(syncer.thread)
        _log.info("Loaded plugin %s" % plugin)

    install_sigquit_handler(synchronizers)
    install_sigterm_handler(synchronizers)

    while any([thread.isAlive() for thread in threads]):
        for thread in threads:
            if thread.isAlive():
                thread.join(1)

    _log.info("No plugin threads running, waiting for a SIGTERM or SIGQUIT")

    while not should_quit:
        sleep(1)

    _log.info("Quitting")
    _log.debug("%d threads outstanding at exit" % activeCount())
    pdlogs.EXITING.log()
    syslog.closelog()

    # Use os._exit to skip exit handlers - otherwise the concurrent.futures
    # exit handler waits for an infinite wait to end
    os._exit(0)
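# main() gets its configuration from docopt(__doc__), so the module docstring
# must declare every option read above.  A cut-down sketch of such a usage
# string, covering only the options this variant of main() reads (the option
# names are taken from the arguments[...] lookups; everything else, including
# the script name, is illustrative):
#
#   Usage:
#     cluster_manager.py --mgmt-local-ip=IP --sig-local-ip=IP
#                        --local-site=SITE --remote-site=SITE
#                        [--signaling-namespace=NS] [--foreground]
#                        --log-level=LEVEL --log-directory=DIR --pidfile=FILE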
node_type = sys.argv[3]
datastore = sys.argv[4]
dead_node_ip = sys.argv[5]
etcd_key = sys.argv[6]

key = make_key(site, node_type, datastore, etcd_key)
_log.info("Using etcd key %s" % (key))

if datastore == "cassandra":
    try:
        sys.path.append(
            "/usr/share/clearwater/clearwater-cluster-manager/failed_plugins")
        from cassandra_failed_plugin import CassandraFailedPlugin
        error_syncer = EtcdSynchronizer(CassandraFailedPlugin(key, dead_node_ip),
                                        dead_node_ip,
                                        etcd_ip=etcd_ip,
                                        force_leave=True)
    except ImportError:
        print "You must run mark_node_failed on a node that has Cassandra installed to remove a node from a Cassandra cluster"
        sys.exit(1)
else:
    error_syncer = EtcdSynchronizer(NullPlugin(key),
                                    dead_node_ip,
                                    etcd_ip=etcd_ip,
                                    force_leave=True)

# Check that the dead node is even a member of the cluster
etcd_result, idx = error_syncer.read_from_etcd(wait=False, timeout=10)
if etcd_result is None:
def main(args):
    syslog.openlog("cluster-manager", syslog.LOG_PID)
    pdlogs.STARTUP.log()
    try:
        arguments = docopt(__doc__, argv=args)
    except DocoptExit:
        pdlogs.EXITING_BAD_CONFIG.log()
        raise

    mgmt_ip = arguments['--mgmt-local-ip']
    sig_ip = arguments['--sig-local-ip']
    local_site_name = arguments['--local-site']
    remote_site_name = arguments['--remote-site']
    remote_cassandra_seeds = arguments['--remote-cassandra-seeds']
    if remote_cassandra_seeds:
        remote_cassandra_seeds = remote_cassandra_seeds.split(',')
    else:
        remote_cassandra_seeds = []
    signaling_namespace = arguments.get('--signaling-namespace')
    local_uuid = UUID(arguments['--uuid'])
    etcd_key = arguments.get('--etcd-key')
    etcd_cluster_key = arguments.get('--etcd-cluster-key')
    cluster_manager_enabled = arguments['--cluster-manager-enabled']
    log_dir = arguments['--log-directory']
    log_level = LOG_LEVELS.get(arguments['--log-level'], logging.DEBUG)

    stdout_err_log = os.path.join(log_dir, "cluster-manager.output.log")

    # Check that there's an etcd_cluster_key value passed to the cluster
    # manager
    if etcd_cluster_key == "":
        # The etcd_cluster_key isn't valid, and we might get weird entries in
        # the etcd database if we allow the cluster_manager to start
        pdlogs.EXITING_MISSING_ETCD_CLUSTER_KEY.log()
        exit(1)

    if not arguments['--foreground']:
        utils.daemonize(stdout_err_log)

    # Process names are limited to 15 characters, so abbreviate
    prctl.prctl(prctl.NAME, "cw-cluster-mgr")

    logging_config.configure_logging(log_level, log_dir, "cluster-manager",
                                     show_thread=True)

    # urllib3 logs a WARNING log whenever it recreates a connection, but our
    # etcd usage does this frequently (to allow watch timeouts), so
    # deliberately ignore this log
    urllib_logger = logging.getLogger('urllib3')
    urllib_logger.setLevel(logging.ERROR)

    utils.install_sigusr1_handler("cluster-manager")

    # Drop a pidfile. We must keep a reference to the file object here, as
    # this keeps the file locked and provides extra protection against two
    # processes running at once.
    pidfile_lock = None
    try:
        pidfile_lock = utils.lock_and_write_pid_file(arguments['--pidfile'])  # noqa
    except IOError:
        # We failed to take the lock - another process is already running
        exit(1)

    plugins_dir = "/usr/share/clearwater/clearwater-cluster-manager/plugins/"
    plugins = load_plugins_in_dir(plugins_dir,
                                  PluginParams(ip=sig_ip,
                                               mgmt_ip=mgmt_ip,
                                               local_site=local_site_name,
                                               remote_site=remote_site_name,
                                               remote_cassandra_seeds=remote_cassandra_seeds,
                                               signaling_namespace=signaling_namespace,
                                               uuid=local_uuid,
                                               etcd_key=etcd_key,
                                               etcd_cluster_key=etcd_cluster_key))
    plugins.sort(key=lambda x: x.key())
    plugins_to_use = []
    files = []
    skip = False
    for plugin in plugins:
        for plugin_file in plugin.files():
            if plugin_file in files:
                _log.info("Skipping plugin {} because {} "
                          "is already managed by another plugin"
                          .format(plugin, plugin_file))
                skip = True

        if not skip:
            plugins_to_use.append(plugin)
            files.extend(plugin.files())

    synchronizers = []
    threads = []

    if cluster_manager_enabled == "N":
        # Don't start any threads as we don't want the cluster manager to run
        pdlogs.DO_NOT_START.log()
    elif etcd_cluster_key == "DO_NOT_CLUSTER":
        # Don't start any threads as we don't want this box to cluster
        pdlogs.DO_NOT_CLUSTER.log()
    else:
        for plugin in plugins_to_use:
            syncer = EtcdSynchronizer(plugin, sig_ip, etcd_ip=mgmt_ip)
            syncer.start_thread()

            synchronizers.append(syncer)
            threads.append(syncer.thread)
            _log.info("Loaded plugin %s" % plugin)

    install_sigquit_handler(synchronizers)
    utils.install_sigterm_handler(synchronizers)

    while any([thread.isAlive() for thread in threads]):
        for thread in threads:
            if thread.isAlive():
                thread.join(1)

    _log.info("No plugin threads running, waiting for a SIGTERM or SIGQUIT")

    while not utils.should_quit and not should_quit:
        sleep(1)

    _log.info("Quitting")
    _log.debug("%d threads outstanding at exit" % activeCount())
    pdlogs.EXITING.log()
    syslog.closelog()
logfile = "/var/log/clearwater-etcd/mark_node_failed.log"
print "Detailed output being sent to %s" % logfile
logging.basicConfig(filename=logfile,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    level=logging.DEBUG)

local_ip = sys.argv[1]
site = sys.argv[2]
node_type = sys.argv[3]
datastore = sys.argv[4]
dead_node_ip = sys.argv[5]

key = make_key(site, node_type, datastore)
logging.info("Using etcd key %s" % (key))

error_syncer = EtcdSynchronizer(NullPlugin(key),
                                dead_node_ip,
                                etcd_ip=local_ip,
                                force_leave=True)

print "Marking node as failed and removing it from the cluster - will take at least 30 seconds"

# Move the dead node into ERROR state to allow in-progress operations to
# complete
error_syncer.mark_node_failed()

# Move the dead node out of the cluster
error_syncer.start_thread()
error_syncer.leave_cluster()

# Wait for it to leave
error_syncer.thread.join()

print "Process complete - %s has left the cluster" % dead_node_ip

c = etcd.Client(local_ip, 4000)
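# Based on the argument order read above (local_ip, site, node_type,
# datastore, dead_node_ip), a typical invocation of this script would look
# something like the following; the script name matches the logfile above and
# the concrete values are illustrative only:
#
#   python mark_node_failed.py 10.0.0.1 mysite sprout memcached 10.0.0.2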
def main(args):
    syslog.openlog("cluster-manager", syslog.LOG_PID)
    pdlogs.STARTUP.log()
    try:
        arguments = docopt(__doc__, argv=args)
    except DocoptExit:
        pdlogs.EXITING_BAD_CONFIG.log()
        raise

    mgmt_ip = arguments['--mgmt-local-ip']
    sig_ip = arguments['--sig-local-ip']
    local_site_name = arguments['--local-site']
    remote_site_name = arguments['--remote-site']
    remote_cassandra_seeds = arguments['--remote-cassandra-seeds']
    if remote_cassandra_seeds:
        remote_cassandra_seeds = remote_cassandra_seeds.split(',')
    else:
        remote_cassandra_seeds = []
    signaling_namespace = arguments.get('--signaling-namespace')
    local_uuid = UUID(arguments['--uuid'])
    etcd_key = arguments.get('--etcd-key')
    etcd_cluster_key = arguments.get('--etcd-cluster-key')
    cluster_manager_enabled = arguments['--cluster-manager-enabled']
    log_dir = arguments['--log-directory']
    log_level = LOG_LEVELS.get(arguments['--log-level'], logging.DEBUG)

    stdout_err_log = os.path.join(log_dir, "cluster-manager.output.log")

    # Check that there's an etcd_cluster_key value passed to the cluster
    # manager
    if etcd_cluster_key == "":
        # The etcd_cluster_key isn't valid, and we might get weird entries in
        # the etcd database if we allow the cluster_manager to start
        pdlogs.EXITING_MISSING_ETCD_CLUSTER_KEY.log()
        exit(1)

    if not arguments['--foreground']:
        utils.daemonize(stdout_err_log)

    # Process names are limited to 15 characters, so abbreviate
    prctl.prctl(prctl.NAME, "cw-cluster-mgr")

    logging_config.configure_logging(log_level, log_dir, "cluster-manager",
                                     show_thread=True)

    # urllib3 logs a WARNING log whenever it recreates a connection, but our
    # etcd usage does this frequently (to allow watch timeouts), so
    # deliberately ignore this log
    urllib_logger = logging.getLogger('urllib3')
    urllib_logger.setLevel(logging.ERROR)

    utils.install_sigusr1_handler("cluster-manager")

    # Drop a pidfile. We must keep a reference to the file object here, as
    # this keeps the file locked and provides extra protection against two
    # processes running at once.
    pidfile_lock = None
    try:
        pidfile_lock = utils.lock_and_write_pid_file(arguments['--pidfile'])  # noqa
    except IOError:
        # We failed to take the lock - another process is already running
        exit(1)

    plugins_dir = "/usr/share/clearwater/clearwater-cluster-manager/plugins/"
    plugins = load_plugins_in_dir(plugins_dir,
                                  PluginParams(ip=sig_ip,
                                               mgmt_ip=mgmt_ip,
                                               local_site=local_site_name,
                                               remote_site=remote_site_name,
                                               remote_cassandra_seeds=remote_cassandra_seeds,
                                               signaling_namespace=signaling_namespace,
                                               uuid=local_uuid,
                                               etcd_key=etcd_key,
                                               etcd_cluster_key=etcd_cluster_key))
    plugins.sort(key=lambda x: x.key())
    plugins_to_use = []
    files = []
    skip = False
    for plugin in plugins:
        for plugin_file in plugin.files():
            if plugin_file in files:
                _log.info("Skipping plugin {} because {} "
                          "is already managed by another plugin"
                          .format(plugin, plugin_file))
                skip = True

        if not skip:
            plugins_to_use.append(plugin)
            files.extend(plugin.files())

    synchronizers = []
    threads = []

    if cluster_manager_enabled == "N":
        # Don't start any threads as we don't want the cluster manager to run
        pdlogs.DO_NOT_START.log()
    elif etcd_cluster_key == "DO_NOT_CLUSTER":
        # Don't start any threads as we don't want this box to cluster
        pdlogs.DO_NOT_CLUSTER.log()
    else:
        for plugin in plugins_to_use:
            syncer = EtcdSynchronizer(plugin, sig_ip, etcd_ip=mgmt_ip)
            syncer.start_thread()

            synchronizers.append(syncer)
            threads.append(syncer.thread)
            _log.info("Loaded plugin %s" % plugin)

    install_sigquit_handler(synchronizers)
    install_sigterm_handler(synchronizers)

    while any([thread.isAlive() for thread in threads]):
        for thread in threads:
            if thread.isAlive():
                thread.join(1)

    _log.info("No plugin threads running, waiting for a SIGTERM or SIGQUIT")

    while not should_quit:
        sleep(1)

    _log.info("Quitting")
    _log.debug("%d threads outstanding at exit" % activeCount())
    pdlogs.EXITING.log()
    syslog.closelog()
node_type = sys.argv[3]
datastore = sys.argv[4]
dead_node_ip = sys.argv[5]
etcd_key = sys.argv[6]

key = make_key(site, node_type, datastore, etcd_key)
logging.info("Using etcd key %s" % (key))

if datastore == "cassandra":
    try:
        sys.path.append(
            "/usr/share/clearwater/clearwater-cluster-manager/failed_plugins")
        from cassandra_failed_plugin import CassandraFailedPlugin
        error_syncer = EtcdSynchronizer(CassandraFailedPlugin(key, dead_node_ip),
                                        dead_node_ip,
                                        etcd_ip=local_ip,
                                        force_leave=True)
    except ImportError:
        print "You must run mark_node_failed on a node that has Cassandra installed to remove a node from a Cassandra cluster"
        sys.exit(1)
else:
    error_syncer = EtcdSynchronizer(NullPlugin(key),
                                    dead_node_ip,
                                    etcd_ip=local_ip,
                                    force_leave=True)

print "Marking node as failed and removing it from the cluster - will take at least 30 seconds"

# Move the dead node into ERROR state to allow in-progress operations to
# complete
error_syncer.mark_node_failed()
def make_and_start_synchronizers(self, num, klass=DummyPlugin):
    ips = ["10.0.0.%s" % d for d in range(num)]
    self.syncs = [EtcdSynchronizer(klass(ip), ip) for ip in ips]
    for s in self.syncs:
        s.start_thread()
    sleep(1)  # Allow cluster to stabilise
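# wait_for_all_normal is used throughout the tests above but is not shown in
# this extract. A minimal sketch of what such a helper could look like,
# assuming the cluster state lives at "/test" as a JSON map of IP -> state
# (the real helper may poll differently):
def wait_for_all_normal(self, client, required_number=-1, tries=20):
    # Poll the mock etcd key until every node reports "normal" (and, if
    # required_number is given, until exactly that many nodes are present),
    # or until we run out of tries.
    for _ in range(tries):
        value = json.loads(client.read("/test").value)
        if (all(state == "normal" for state in value.values()) and
                (required_number == -1 or len(value) == required_number)):
            return
        sleep(1)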
etcd_ip = sys.argv[1]
site = sys.argv[2]
node_type = sys.argv[3]
datastore = sys.argv[4]
dead_node_ip = sys.argv[5]
etcd_key = sys.argv[6]

key = make_key(site, node_type, datastore, etcd_key)
_log.info("Using etcd key %s" % (key))

if datastore == "cassandra":
    try:
        sys.path.append(
            "/usr/share/clearwater/clearwater-cluster-manager/failed_plugins")
        from cassandra_failed_plugin import CassandraFailedPlugin
        error_syncer = EtcdSynchronizer(CassandraFailedPlugin(key, dead_node_ip),
                                        dead_node_ip,
                                        etcd_ip=etcd_ip,
                                        force_leave=True)
    except ImportError:
        print "You must run mark_node_failed on a node that has Cassandra installed to remove a node from a Cassandra cluster"
        sys.exit(1)
else:
    error_syncer = EtcdSynchronizer(NullPlugin(key),
                                    dead_node_ip,
                                    etcd_ip=etcd_ip,
                                    force_leave=True)

# Check that the dead node is even a member of the cluster
etcd_result, idx = error_syncer.read_from_etcd(wait=False, timeout=10)
if etcd_result is None:
    print "Failed to contact etcd cluster on '{}' - node not removed".format(etcd_ip)
    sys.exit(1)

cluster_info = ClusterInfo(etcd_result)
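# This later variant of the script takes six positional arguments (etcd_ip,
# site, node_type, datastore, dead_node_ip, etcd_key) rather than five.  An
# illustrative invocation, with placeholder values for everything except the
# argument order, might be:
#
#   python mark_node_failed.py 10.0.0.1 mysite vellum cassandra 10.0.0.2 clearwater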