def test_leaving(self, client):
        """Create a plugin not in the cluster and try to leave the cluster.
        Nothing should be written to etcd."""
        e = EtcdSynchronizer(self.plugin, self.watcher_ip)
        e.start_thread()

        e.leave_cluster()
        e._client.write.assert_not_called()

        e.terminate()
    def test_mark_failed(self, client):
        """Create a plugin not in the cluster and try to mark it as failed.
        Nothing should be written to etcd."""
        e = EtcdSynchronizer(self.plugin, self.watcher_ip)
        e.start_thread()

        e.mark_node_failed()
        e._client.write.assert_not_called()

        e.terminate()
예제 #3
0
    def test_scale_up(self):
        # Create an existing cluster of two nodes, and a third new node
        sync1 = EtcdSynchronizer(DummyPlugin(None), '10.0.0.1')
        sync2 = EtcdSynchronizer(DummyPlugin(None), '10.0.0.2')
        sync3 = EtcdSynchronizer(DummyPlugin(None), '10.0.0.3')
        mock_client = sync1._client
        mock_client.write(
            "/test", json.dumps({
                "10.0.0.1": "normal",
                "10.0.0.2": "normal"
            }))
        for s in [sync1, sync2, sync3]:
            s.start_thread()

        # Check that the third node joins the cluster
        self.wait_for_all_normal(mock_client, required_number=3)
        end = json.loads(mock_client.read("/test").value)
        self.assertEqual("normal", end.get("10.0.0.3"))
        for s in [sync1, sync2, sync3]:
            s.terminate()
예제 #4
0
    def test_failure(self):

        # Create synchronisers, using a FailPlugin for one which will crash and
        # not complete (simulating a failed node)
        sync1 = EtcdSynchronizer(DummyPlugin(None), '10.0.0.1')
        sync2 = EtcdSynchronizer(FailPlugin(None), '10.0.0.2')
        sync3 = EtcdSynchronizer(DummyPlugin(None), '10.0.0.3')
        mock_client = sync1._client
        for s in [sync1, sync2, sync3]:
            s.start_thread()

        # After a few seconds, the scale-up will still not have completed
        sleep(3)
        end = json.loads(mock_client.read("/test").value)
        self.assertNotEqual("normal", end.get("10.0.0.1"))
        self.assertNotEqual("normal", end.get("10.0.0.2"))
        self.assertNotEqual("normal", end.get("10.0.0.3"))

        # Start a synchroniser to take 10.0.0.2's place
        sync2.terminate()
        error_syncer = EtcdSynchronizer(NullPlugin('/test'),
                                        '10.0.0.2',
                                        force_leave=True)
        error_syncer.mark_node_failed()
        error_syncer.leave_cluster()
        error_syncer.start_thread()

        # 10.0.0.2 will be removed from the cluster, and the cluster will
        # stabilise
        self.wait_for_all_normal(mock_client, required_number=2, tries=50)
        end = json.loads(mock_client.read("/test").value)
        self.assertEqual("normal", end.get("10.0.0.1"))
        self.assertEqual("normal", end.get("10.0.0.3"))
        self.assertEqual(None, end.get("10.0.0.2"))
        for s in [sync1, sync3, error_syncer]:
            s.terminate()
예제 #5
0
    def test_scale_down(self):
        # Start with a stable cluster of four nodes
        syncs = [
            EtcdSynchronizer(DummyPlugin(None), ip) for ip in [
                '10.0.1.1',
                '10.0.1.2',
                '10.0.1.3',
                '10.0.1.4',
            ]
        ]
        mock_client = syncs[0]._client
        mock_client.write(
            "/test",
            json.dumps({
                "10.0.1.1": "normal",
                "10.0.1.2": "normal",
                "10.0.1.3": "normal",
                "10.0.1.4": "normal",
            }))
        for s in syncs:
            s.start_thread()

        # Allow the cluster to stabilise, then make the second and fourth nodes leave
        sleep(1)
        syncs[1].leave_cluster()
        syncs[3].leave_cluster()

        self.wait_for_all_normal(mock_client, required_number=2, tries=50)

        # Check that it's left and the cluster is stable
        end = json.loads(mock_client.read("/test").value)
        self.assertEqual("normal", end.get("10.0.1.1"))
        self.assertEqual("normal", end.get("10.0.1.3"))
        self.assertEqual(None, end.get("10.0.1.2"))
        self.assertEqual(None, end.get("10.0.1.4"))

        for s in syncs:
            s.terminate()
    def test_watcher(self):
        """Create a new 3-node cluster with one plugin not in the cluster and
        check that the main three all end up in NORMAL state"""

        e = EtcdSynchronizer(self.plugin, self.watcher_ip)
        e.start_thread()

        self.make_and_start_synchronizers(3)
        mock_client = self.syncs[0]._client
        self.wait_for_all_normal(mock_client, required_number=3)

        # Pause for one second - the watcher plugin might be called just after
        # all other nodes enter 'normal' state
        sleep(1)
        self.assertTrue(self.plugin.on_stable_cluster_called)

        end = json.loads(mock_client.read("/test").value)
        self.assertEqual("normal", end.get("10.0.0.0"))
        self.assertEqual("normal", end.get("10.0.0.1"))
        self.assertEqual("normal", end.get("10.0.0.2"))
        self.assertEqual(None, end.get("10.1.1.1"))

        e.terminate()
예제 #7
0
 def make_and_start_synchronizers(self, num, klass=DummyPlugin):
     ips = ["10.0.0.%s" % d for d in range(num)]
     self.syncs = [EtcdSynchronizer(klass(ip), ip) for ip in ips]
     for s in self.syncs:
         s.start_thread()
     sleep(1)  # Allow cluster to stabilise
예제 #8
0
def main(args):
    syslog.openlog("cluster-manager", syslog.LOG_PID)
    pdlogs.STARTUP.log()
    try:
        arguments = docopt(__doc__, argv=args)
    except DocoptExit:
        pdlogs.EXITING_BAD_CONFIG.log()
        raise

    mgmt_ip = arguments['--mgmt-local-ip']
    sig_ip = arguments['--sig-local-ip']
    local_site_name = arguments['--local-site']
    remote_site_name = arguments['--remote-site']
    remote_cassandra_seeds = arguments['--remote-cassandra-seeds']
    if remote_cassandra_seeds:
        remote_cassandra_seeds = remote_cassandra_seeds.split(',')
    else:
        remote_cassandra_seeds = []
    signaling_namespace = arguments.get('--signaling-namespace')
    local_uuid = UUID(arguments['--uuid'])
    etcd_key = arguments.get('--etcd-key')
    etcd_cluster_key = arguments.get('--etcd-cluster-key')
    cluster_manager_enabled = arguments['--cluster-manager-enabled']
    log_dir = arguments['--log-directory']
    log_level = LOG_LEVELS.get(arguments['--log-level'], logging.DEBUG)

    stdout_err_log = os.path.join(log_dir, "cluster-manager.output.log")

    # Check that there's an etcd_cluster_key value passed to the cluster
    # manager
    if etcd_cluster_key == "":
        # The etcd_cluster_key isn't valid, and possibly get weird entries in
        # the etcd database if we allow the cluster_manager to start
        pdlogs.EXITING_MISSING_ETCD_CLUSTER_KEY.log()
        exit(1)

    if not arguments['--foreground']:
        utils.daemonize(stdout_err_log)

    # Process names are limited to 15 characters, so abbreviate
    prctl.prctl(prctl.NAME, "cw-cluster-mgr")

    logging_config.configure_logging(log_level,
                                     log_dir,
                                     "cluster-manager",
                                     show_thread=True)

    # urllib3 logs a WARNING log whenever it recreates a connection, but our
    # etcd usage does this frequently (to allow watch timeouts), so deliberately
    # ignore this log
    urllib_logger = logging.getLogger('urllib3')
    urllib_logger.setLevel(logging.ERROR)

    utils.install_sigusr1_handler("cluster-manager")

    # Drop a pidfile. We must keep a reference to the file object here, as this keeps
    # the file locked and provides extra protection against two processes running at
    # once.
    pidfile_lock = None
    try:
        pidfile_lock = utils.lock_and_write_pid_file(
            arguments['--pidfile'])  # noqa
    except IOError:
        # We failed to take the lock - another process is already running
        exit(1)

    plugins_dir = "/usr/share/clearwater/clearwater-cluster-manager/plugins/"
    plugins = load_plugins_in_dir(
        plugins_dir,
        PluginParams(ip=sig_ip,
                     mgmt_ip=mgmt_ip,
                     local_site=local_site_name,
                     remote_site=remote_site_name,
                     remote_cassandra_seeds=remote_cassandra_seeds,
                     signaling_namespace=signaling_namespace,
                     uuid=local_uuid,
                     etcd_key=etcd_key,
                     etcd_cluster_key=etcd_cluster_key))
    plugins.sort(key=lambda x: x.key())
    plugins_to_use = []
    files = []
    skip = False
    for plugin in plugins:
        for plugin_file in plugin.files():
            if plugin_file in files:
                _log.info("Skipping plugin {} because {} "
                          "is already managed by another plugin".format(
                              plugin, plugin_file))
                skip = True

        if not skip:
            plugins_to_use.append(plugin)
            files.extend(plugin.files())

    synchronizers = []
    threads = []

    if cluster_manager_enabled == "N":
        # Don't start any threads as we don't want the cluster manager to run
        pdlogs.DO_NOT_START.log()
    elif etcd_cluster_key == "DO_NOT_CLUSTER":
        # Don't start any threads as we don't want this box to cluster
        pdlogs.DO_NOT_CLUSTER.log()
    else:
        for plugin in plugins_to_use:
            syncer = EtcdSynchronizer(plugin, sig_ip, etcd_ip=mgmt_ip)
            syncer.start_thread()

            synchronizers.append(syncer)
            threads.append(syncer.thread)
            _log.info("Loaded plugin %s" % plugin)

    install_sigquit_handler(synchronizers)
    install_sigterm_handler(synchronizers)

    while any([thread.isAlive() for thread in threads]):
        for thread in threads:
            if thread.isAlive():
                thread.join(1)

    _log.info("No plugin threads running, waiting for a SIGTERM or SIGQUIT")
    while not should_quit:
        sleep(1)
    _log.info("Quitting")
    _log.debug("%d threads outstanding at exit" % activeCount())
    pdlogs.EXITING.log()
    syslog.closelog()
예제 #9
0
node_type = sys.argv[3]
datastore = sys.argv[4]
dead_node_ip = sys.argv[5]
etcd_key = sys.argv[6]

key = make_key(site, node_type, datastore, etcd_key)
logging.info("Using etcd key %s" % (key))

if datastore == "cassandra":
    try:
        sys.path.append(
            "/usr/share/clearwater/clearwater-cluster-manager/failed_plugins")
        from cassandra_failed_plugin import CassandraFailedPlugin
        error_syncer = EtcdSynchronizer(CassandraFailedPlugin(
            key, dead_node_ip),
                                        dead_node_ip,
                                        etcd_ip=local_ip,
                                        force_leave=True)
    except ImportError:
        print "You must run mark_node_failed on a node that has Cassandra installed to remove a node from a Cassandra cluster"
        sys.exit(1)
else:
    error_syncer = EtcdSynchronizer(NullPlugin(key),
                                    dead_node_ip,
                                    etcd_ip=local_ip,
                                    force_leave=True)

print "Marking node as failed and removing it from the cluster - will take at least 30 seconds"
# Move the dead node into ERROR state to allow in-progress operations to
# complete
error_syncer.mark_node_failed()