Example No. 1
    def restart(self):
        # Stop every supervisord-managed process group, start them all again,
        # then wait for the managed services to report themselves up.
        groups = [ps['group'] for ps in self._rpc.supervisor.getAllProcessInfo()]
        for group in groups:
            self._rpc.supervisor.stopProcessGroup(group)
        for group in groups:
            self._rpc.supervisor.startProcessGroup(group)
        wait_until_true(self._services_up)
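
Every example on this page blocks on wait_until_true, whose definition is not shown here. A minimal sketch of such a helper, assuming a WaitTimeout exception and a fixed polling interval (the project's real helper may differ):

import time


class WaitTimeout(Exception):
    """Raised when the condition does not become true before the timeout."""
    pass


def wait_until_true(condition, timeout=30, period=1):
    # Call `condition` every `period` seconds until it returns a truthy value;
    # raise WaitTimeout once `timeout` seconds have elapsed without success.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if condition():
            return
        time.sleep(period)
    raise WaitTimeout("condition not met within %s seconds" % timeout)
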
Example No. 2
    def test_cluster_down(self):
        """
        Check Calamari's reaction to total loss of contact with
        a Ceph cluster being monitored.

        - The cluster update time should stop getting incremented
        - The system should recover promptly when the cluster comes
          back online.
        """
        cluster_id = self._wait_for_cluster()

        def update_time():
            return self.api.get("cluster/%s" %
                                cluster_id).json()['update_time']

        # Lose contact with the cluster
        self.ceph_ctl.go_dark(cluster_id)
        initial_update_time = update_time()
        log.debug("Sleeping for %s seconds, don't panic!" %
                  LOSE_CONTACT_TIMEOUT)
        time.sleep(LOSE_CONTACT_TIMEOUT)
        # The update time should not have been incremented
        self.assertEqual(initial_update_time, update_time())

        # Regain contact with the cluster
        self.ceph_ctl.go_dark(cluster_id, dark=False)
        # The update time should start incrementing again
        wait_until_true(lambda: update_time() != initial_update_time,
                        timeout=NEXT_HEARTBEAT_TIMEOUT)
        self.assertNotEqual(initial_update_time, update_time())
Example No. 3
    def authorize_keys(self, minion_ids):
        def _fqdns_present():
            found_ids = [m['id'] for m in self.api.get("key").json()]
            all_present = len(set(minion_ids) & set(found_ids)) == len(minion_ids)

            log.debug("checking keys, looking for %s found %s (%s)" % (minion_ids, found_ids, all_present))

            return all_present

        wait_until_true(_fqdns_present, timeout=KEY_WAIT_PERIOD * len(minion_ids))

        for minion_id in minion_ids:
            log.debug("Authorising key for %s" % minion_id)
            r = self.api.patch("key/%s" % minion_id, {'status': 'accepted'})
            r.raise_for_status()
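
The self.api object used in these examples is a thin REST client for the Calamari API and is also not shown on this page. A hypothetical sketch of such a wrapper built on requests, assuming a base URL, optional basic auth, and that patch() sends JSON; the class name and signatures are illustrative, not the project's actual implementation:

import requests


class ApiClient(object):
    # Minimal REST helper: joins paths onto a base URL and returns
    # requests.Response objects, matching the self.api calls above.
    def __init__(self, base_url, username=None, password=None):
        self._base = base_url.rstrip("/") + "/"
        self._session = requests.Session()
        if username is not None:
            self._session.auth = (username, password)

    def get(self, path):
        return self._session.get(self._base + path)

    def patch(self, path, payload):
        return self._session.patch(self._base + path, json=payload)
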
Example No. 4
    def test_osd_out(self):
        """
        Check Calamari's reaction to an OSD being marked out:

        - The OSD map should be updated
        - The health information should be updated and indicate a warning
        """

        cluster_id = self._wait_for_cluster()

        # Pick an OSD and check its initial status
        osd_id = 0
        osd_url = "cluster/{0}/osd/{1}".format(cluster_id, osd_id)

        # Check it's initially up and in
        initial_osd_status = self.api.get(osd_url).json()
        self.assertEqual(initial_osd_status['up'], True)
        self.assertEqual(initial_osd_status['in'], True)

        # Cause it to 'spontaneously' (as far as calamari is concerned)
        # be marked out
        self.ceph_ctl.mark_osd_in(cluster_id, osd_id, False)

        # Wait for the status to filter up to the REST API
        wait_until_true(lambda: self.api.get(osd_url).json()['in'] is False,
                        timeout=NEXT_HEARTBEAT_TIMEOUT)

        # Wait for the health status to reflect the degradation
        # NB this is actually a bit racy, because we assume the PGs remain degraded long enough
        # to affect the health state: in theory they could all get remapped instantaneously, in
        # which case the cluster would never appear unhealthy and this would be an invalid check.
        health_url = "cluster/{0}/sync_object/health".format(cluster_id)

        def check():
            # Fetch the health sync object once and check both the HTTP status
            # and the reported overall_status.
            response = self.api.get(health_url)
            status = response.json()['overall_status']
            log.debug("health status: %s" % status)
            return response.status_code == 200 and status == "HEALTH_WARN"

        wait_until_true(check, timeout=NEXT_HEARTBEAT_TIMEOUT)

        # Bring the OSD back into the cluster
        self.ceph_ctl.mark_osd_in(cluster_id, osd_id, True)

        # Wait for the status
        wait_until_true(lambda: self.api.get(osd_url).json()['in'] is True,
                        timeout=NEXT_HEARTBEAT_TIMEOUT)

        # Wait for the health
        # This can take a long time, because it has to wait for PGs to fully recover
        wait_until_true(
            lambda: self.api.get(health_url).json()['overall_status'] == "HEALTH_OK",
            timeout=OSD_RECOVERY_PERIOD * 2)
Example No. 5
    def authorize_keys(self, minion_ids):
        def _fqdns_present():
            found_ids = [m['id'] for m in self.api.get("key").json()]
            all_present = len(set(minion_ids) & set(found_ids)) == len(minion_ids)

            log.debug("checking keys, looking for %s found %s (%s)" % (minion_ids, found_ids, all_present))

            return all_present

        wait_until_true(_fqdns_present, timeout=KEY_WAIT_PERIOD * len(minion_ids))

        for minion_id in minion_ids:
            if self.api.get("key/%s" % minion_id).json()['status'] == 'accepted':
                # Skip already-accepted minions (this happens when running
                # against an external calamari instance)
                log.debug("Key for %s is already authorised" % minion_id)
                continue
            log.debug("Authorising key for %s" % minion_id)
            r = self.api.patch("key/%s" % minion_id, {'status': 'accepted'})
            r.raise_for_status()
Example No. 6
    def start(self):
        def _is_stale(ps):
            names = [
                "bin/salt-master", "bin/supervisord", "bin/cthulhu-manager",
                "calamari/manage.py", "bin/carbon-cache.py"
            ]

            try:
                cmdline = ps.cmdline()
            except (psutil.AccessDenied, psutil.NoSuchProcess):
                return False

            if not cmdline:
                return False

            if "bin/python" not in cmdline[0]:
                return False

            for c in cmdline:
                for name in names:
                    if c.endswith(name):
                        log.error("Stale {0} process: {1}".format(
                            name, ps.pid))
                        return True

            return False

        if self._first:
            log.info("EmbeddedCalamariControl.start: clearing down salt")
            self._first = False
            # Clean out the salt master's caches to mitigate any confusion from continually removing
            # and adding servers with the same FQDNs.
            erase_paths = [
                "dev/var/cache/salt/master/*", "dev/var/run/salt/master/*",
                "dev/etc/salt/pki/*"
            ]
            for path in erase_paths:
                for f in glob.glob(os.path.join(TREE_ROOT, path)):
                    if os.path.isdir(f):
                        shutil.rmtree(f)
                    else:
                        os.unlink(f)

            # psutil.get_process_list() is deprecated in newer psutil releases;
            # process_iter() is the equivalent iteration API.
            lingering_salt = [
                p for p in psutil.process_iter() if _is_stale(p)
            ]
            for p in lingering_salt:
                log.warning("Killing stale process: %s" % p.pid)
                p.kill()

        config_path = os.path.join(TREE_ROOT, "dev/supervisord.conf")
        assert os.path.exists(config_path)
        self._ps = subprocess.Popen(["supervisord", "-n", "-c", config_path],
                                    cwd=os.path.abspath(TREE_ROOT),
                                    stdout=open("supervisord.out.log", 'w'),
                                    stderr=open("supervisord.err.log", 'w'))
        if not self._ps:
            raise RuntimeError("Failed to launch supervisor")

        config = ConfigParser()
        config.read(config_path)
        xmlrpc_addr = config.get('inet_http_server', 'port')
        self._rpc = xmlrpclib.ServerProxy("http://%s/RPC2" % xmlrpc_addr)

        try:
            # Wait for supervisor to start responding to RPC
            wait_until_true(self._available)

            # Wait for all supervisor's children to start
            wait_until_true(self._services_up)
        except:
            # Ensure that failures during startup do not leave a
            # zombie supervisor process
            log.error("Exception during setup, killing supervisor")
            self._ps.send_signal(signal.SIGINT)
            try:
                wait_until_true(lambda: self._ps.poll() is not None)
            except WaitTimeout:
                log.error("Supervisor isn't dying, sending it KILL")
                self._ps.send_signal(signal.SIGKILL)
            self._ps.wait()
            raise

        # The calamari REST API goes through a brief period between process
        # startup and servicing connections
        wait_until_true(self._api_connectable)

        # Calamari REST API will return 503s until the backend is fully up
        # and responding to ZeroRPC requests.
        wait_until_true(lambda: self.api.get("cluster").status_code != 503,
                        timeout=30)

        # Because we are embedded, we should act like a fresh instance
        # and not let any old keys linger
        self.clear_keys()
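
Both restart() (Example No. 1) and start() above wait on self._available and self._services_up, which are not shown on this page. A plausible sketch of those predicates against supervisord's XML-RPC interface; getState() and getAllProcessInfo() are real supervisord methods, while the RUNNING check, the socket import, and the error handling here are assumptions:

    def _available(self):
        # True once supervisord is answering XML-RPC requests at all.
        try:
            self._rpc.supervisor.getState()
            return True
        except socket.error:
            return False

    def _services_up(self):
        # True once every supervisord-managed process reports RUNNING.
        try:
            infos = self._rpc.supervisor.getAllProcessInfo()
        except socket.error:
            return False
        states = [p['statename'] for p in infos]
        log.debug("Service states: %s" % states)
        return bool(states) and all(s == "RUNNING" for s in states)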