Exemplo n.º 1
0
    def lustre_check(self):
        """
        Check Router health at Lustre level.

        Read the first line of /proc/sys/lnet/routes and update self.state:

        - file missing                  -> OFFLINE (returns normally)
        - "routing enabled"             -> MOUNTED (returns normally)
        - "routing disabled"            -> TARGET_ERROR, raises
        - anything else                 -> RUNTIME_ERROR, raises
        - file cannot be opened or read -> RUNTIME_ERROR, raises

        Raises ComponentError for every error case listed above.
        """
        routes_path = "/proc/sys/lnet/routes"

        # LNET is not loaded
        if not os.path.isfile(routes_path):
            self.state = OFFLINE
            return

        # Read routing information. Only the first line is needed; the
        # context manager guarantees the handle is closed even when the
        # read fails.
        try:
            with open(routes_path) as routes:
                state = routes.readline().strip().lower()
        except (IOError, OSError):
            # Only I/O failures are translated; KeyboardInterrupt and
            # SystemExit are no longer trapped here.
            self.state = RUNTIME_ERROR
            raise ComponentError(self, "Could not read routing information")

        # routing info tells this is ok?
        if state == "routing enabled":
            self.state = MOUNTED
        elif state == "routing disabled":
            self.state = TARGET_ERROR
            raise ComponentError(self, "Misconfigured router")
        else:
            self.state = RUNTIME_ERROR
            raise ComponentError(self, "Bad routing status")
Exemplo n.º 2
0
    def full_check(self, mountdata=True):
        """
        Verify that self.dev exists and is a block device.

        The mountdata argument is not used by this check.

        Raises ComponentError if the device cannot be stat'ed (the OSError
        text is used as the message) or if it is not a block device.
        """
        try:
            dev_stat = os.stat(self.dev)
        except OSError as err:
            raise ComponentError(self, str(err))

        mode = dev_stat[stat.ST_MODE]
        if stat.S_ISBLK(mode):
            return

        raise ComponentError(self, "bad journal device")
Exemplo n.º 3
0
    def lustre_check(self):
        """
        Check Client health at Lustre level.

        Looks for a "<fs_name>-clilov-*" entry under /proc/fs/lustre/lov and
        for a matching lustre mount of self.mount_path in /proc/mounts, then
        updates self.state (OFFLINE, MOUNTED or CLIENT_ERROR) and, when
        mounted, self.mtpt.

        Raises ComponentError when the /proc and /proc/mounts views disagree.
        Finally delegates to self._lustre_check_proc_state().
        """

        self.state = None  # Undefined

        proc_lov_match = glob("/proc/fs/lustre/lov/%s-clilov-*" %
                              self.fs.fs_name)

        # No clilov entry for this filesystem: client is considered offline.
        if not proc_lov_match:
            self.state = OFFLINE
            return

        #
        # There is at least one clilov declared. Check for coherence.
        #
        # Only the first glob match is examined.
        loaded = os.path.isdir(proc_lov_match[0])

        # check for presence in /proc/mounts
        f_proc_mounts = open("/proc/mounts", 'r')
        try:
            curr_lnetdev = None
            for line in f_proc_mounts:
                # "> 0" (not ">= 0"): the pattern starts with a space, so a
                # match must be preceded by at least the device field.
                if line.find(" %s lustre " % self.mount_path) > 0:
                    # First two space-separated fields: device, mount point.
                    lnetdev, mntp = line.split(' ', 2)[0:2]
                    if loaded:
                        curr_lnetdev = lnetdev
                        self.state = MOUNTED
                        self.mtpt = mntp
                    else:
                        # NOTE(review): `loaded` never changes inside this
                        # loop, so curr_lnetdev is still None whenever this
                        # branch runs; the "multiple mounts" case below
                        # looks unreachable -- confirm the intended logic.
                        self.state = CLIENT_ERROR
                        if lnetdev != curr_lnetdev:
                            raise ComponentError(
                                self, "conflicting mounts "
                                "detected for %s and %s on %s" %
                                (lnetdev, curr_lnetdev, self.mount_path))
                        else:
                            raise ComponentError(
                                self, "multiple mounts "
                                "detected for %s (%s)" %
                                (lnetdev, self.mount_path))
        finally:
            # Always release the /proc/mounts handle, even when raising.
            f_proc_mounts.close()

        if loaded and self.state != MOUNTED:
            # up but not mounted = incoherent state
            self.state = CLIENT_ERROR
            raise ComponentError(
                self, "incoherent client state for FS '%s'"
                " (not mounted but loaded. Mount in "
                "progress?)" % self.fs.fs_name)

        # Look for some evictions
        self._lustre_check_proc_state()
Exemplo n.º 4
0
    def full_check(self, mountdata=True):
        """
        Device check: verify that self.dev can be stat'ed.

        The mountdata argument is not used by this check.

        Raises ComponentError, carrying the OSError text, when os.stat()
        fails on self.dev.
        """

        try:
            os.stat(self.dev)
        # "except ... as ..." is valid on Python 2.6+ and Python 3; the old
        # comma form is a SyntaxError on Python 3.
        except OSError as exp:
            raise ComponentError(self, str(exp))
Exemplo n.º 5
0
class Journal(Component):
    """
    Manage a target external journal device.

    A Journal is built from the target it belongs to and takes its
    filesystem, server, action_enabled flag and mode from that target.
    """

    TYPE = 'journal'

    def __init__(self, target, device):
        """
        Create the journal component of `target`, stored on `device`.

        `device` is kept as self.dev and is the path passed to os.stat()
        by full_check().
        """
        Component.__init__(self, target.fs, target.server,
                           target.action_enabled, target._mode)
        self.target = target
        self.dev = device

    @property
    def label(self):
        """Journal label; same value as uniqueid()."""
        return self.uniqueid()

    def uniqueid(self):
        """Return the owning target's unique id suffixed with "_jdev"."""
        return "%s_jdev" % self.target.uniqueid()

    def longtext(self):
        """Return a description built from the target id and device path."""
        return "%s journal (%s)" % (self.target.get_id(), self.dev)

    def full_check(self, mountdata=True):
        """
        Device type check: self.dev must exist and be a block device.

        The mountdata argument is not used by this check.

        Raises ComponentError if the device cannot be stat'ed (the OSError
        text is used as the message) or if it is not a block device.
        """

        try:
            info = os.stat(self.dev)
        # "except ... as ..." is valid on Python 2.6+ and Python 3; the old
        # comma form is a SyntaxError on Python 3.
        except OSError as exp:
            raise ComponentError(self, str(exp))

        if not stat.S_ISBLK(info[stat.ST_MODE]):
            raise ComponentError(self, "bad journal device")
Exemplo n.º 6
0
    def _lustre_check_proc_state(self):
        """
        Check current target status in /proc/fs/lustre/*/*/state

        Rebuilds self.proc_states as a mapping of state name to the number
        of matching "/proc/fs/lustre/??c/<fs_name>-*/state" files whose
        "current_state:" line reports that state. A file whose state is not
        'FULL' is left out of the count when its target is found in
        self.fs.components and reports is_active() as false.

        Raises ComponentError, after setting self.state to CLIENT_ERROR,
        when at least one 'EVICTED' state was counted.
        """

        self.proc_states = {}
        for entry in glob("/proc/fs/lustre/??c/%s-*/state" % self.fs.fs_name):
            # The context manager closes the file even if parsing raises.
            with open(entry, 'r') as f_state:
                for line in f_state:
                    if not line.startswith('current_state:'):
                        continue

                    state_name = line.split(None, 1)[1].strip()

                    # Ignore inactive targets
                    if state_name != 'FULL':
                        mo = re.search(
                            r'/(%s-\w{3}[0-9a-fA-F]{4})-' % self.fs.fs_name,
                            entry)
                        try:
                            if not self.fs.components[mo.group(1)].is_active():
                                break
                        except (AttributeError, KeyError):
                            # No regex match (mo is None) or unknown
                            # component: count the state anyway.
                            pass

                    self.proc_states[state_name] = \
                        self.proc_states.get(state_name, 0) + 1
                    # Stop reading other file lines
                    break

        if 'EVICTED' in self.proc_states:
            self.state = CLIENT_ERROR
            raise ComponentError(
                self, 'client connection error (%d evictions)' %
                self.proc_states['EVICTED'])
Exemplo n.º 7
0
 def raise_if_started(self, message):
     """
     Raise a ComponentError unless the target local state is OFFLINE.

     The error text says "is started" when self.is_started() is true and
     "is busy" otherwise; `message` is used as its prefix. The local state
     is switched to TARGET_ERROR before raising.
     """
     if self.local_state == OFFLINE:
         return

     template = ("%s: target %s (%s) is started" if self.is_started()
                 else "%s: target %s (%s) is busy")
     self.local_state = TARGET_ERROR
     raise ComponentError(self, template % (message, self.label, self.dev))
Exemplo n.º 8
0
    def full_check(self, mountdata=True):
        """
        Sanity checks for device files and Lustre status.
        If mountdata is set to False, target content will not be analyzed.

        Runs self._device_check(), then self._mountdata_check(self.label)
        when mountdata is true, then self.journal.full_check() when a
        journal is set.

        Raises ComponentError, after setting self.local_state to
        TARGET_ERROR, when any of those steps raises ComponentError or
        DiskDeviceError.
        """

        # check for disk level status
        try:
            self._device_check()
            if mountdata:
                self._mountdata_check(self.label)

            if self.journal:
                self.journal.full_check()

        # "except ... as ..." is valid on Python 2.6+ and Python 3; the old
        # comma form is a SyntaxError on Python 3.
        except (ComponentError, DiskDeviceError) as error:
            self.local_state = TARGET_ERROR
            raise ComponentError(self, str(error))
Exemplo n.º 9
0
    def failover(self, candidates):
        """
        Switch this Target's current server according to a candidate list.

        The candidates are intersected with this target's failover servers:

        - exactly one match: it becomes the current server, returns True;
        - no match: nothing changes, returns False;
        - several matches: the choice is ambiguous, ComponentError is raised.
        """
        matches = self.failservers.select(candidates)
        count = len(matches)

        # More than one possible failover node is ambiguous
        if count > 1:
            raise ComponentError(self, "More than one failover server matches.")

        if count != 1:
            return False

        self.server = matches[0]
        return True
Exemplo n.º 10
0
    def lustre_check(self):
        """
        Check target health at Lustre level.

        Compares /proc/fs/lustre/*/<label>/{mntdev,recovery_status} with
        /proc/mounts and updates self.local_state:

        - neither procfs entry found        -> OFFLINE
        - mntdev found and device mounted   -> MOUNTED
        - mounted and status is RECOVERING  -> RECOVERING

        self.mntdev is set from the mntdev file; self.recov_info is set
        while recovering.

        Raises ComponentError, after setting local_state to TARGET_ERROR,
        when the procfs and /proc/mounts views are incoherent or when the
        recovery status file cannot be opened for a non-MGT target.
        """

        self.local_state = None   # Unknown

        # find pathnames matching wanted lustre procfs
        # (Since Lustre 2.4. More than one path could be returned.
        #  The first one is fine.)
        mntdev_path = glob('/proc/fs/lustre/*/%s/mntdev' % self.label)

        recov_path = glob('/proc/fs/lustre/*/%s/recovery_status' % self.label)
        assert len(recov_path) <= 1

        # check for label presence in /proc : is this lustre target started?
        if len(mntdev_path) == 0 and len(recov_path) == 0:
            self.local_state = OFFLINE
        elif len(mntdev_path) == 0:
            self.local_state = TARGET_ERROR
            raise ComponentError(self, "incoherent state in "
                                       "/proc/fs/lustre for %s" % self.label)
        else:
            # get target's real device
            fproc = open(mntdev_path[0])
            try:
                self.mntdev = fproc.readline().rstrip('\n')
            finally:
                fproc.close()

            # NOTE(review): `loaded` is always True in this branch, so the
            # "multiple mounts" and "mounted but not started" errors below
            # cannot fire as written -- confirm the intended logic.
            loaded = True

            # check for presence in /proc/mounts
            f_proc_mounts = open("/proc/mounts", 'r')
            try:
                for line in f_proc_mounts:
                    # the line must start with "<mntdev> "
                    if line.find("%s " % self.mntdev) == 0:
                        # third field of a /proc/mounts line is the fs type
                        if line.split(' ', 3)[2] == "lustre":
                            if loaded:
                                self.local_state = MOUNTED
                            else:
                                self.local_state = TARGET_ERROR
                                raise ComponentError(self, "multiple "
                                        " mounts detected for %s" % self.label)
            finally:
                f_proc_mounts.close()

            if self.local_state != MOUNTED and loaded:
                self.local_state = TARGET_ERROR
                # up but not mounted = incoherent state
                # check for loaded state: ST, UP...
                raise ComponentError(self, "incoherent state for %s "
                                     "(started but not mounted?)" % self.label)

            if self.local_state == MOUNTED and not loaded:
                self.local_state = TARGET_ERROR
                # mounted but not up = incoherent state
                # /etc/fstab was not correctly cleaned
                raise ComponentError(self, "incoherent state for %s "
                                     "(mounted but not started?)" % self.label)

            if self.local_state == MOUNTED and self.TYPE != MGT.TYPE:
                # check for MDT or OST recovery (MGS doesn't make any recovery)
                try:
                    fproc = open(recov_path[0], 'r')
                except (IOError, IndexError):
                    # IndexError: glob found no recovery_status file at all
                    self.local_state = TARGET_ERROR
                    raise ComponentError(self, "recovery_state file not "
                                                  "found for %s" % self.label)

                try:
                    # Default when the file has no "status:" line, so the
                    # comparison below cannot hit an unbound local.
                    status = None
                    for line in fproc:
                        if line.startswith("status:"):
                            status = line.rstrip().split(' ', 2)[1]
                            break

                    #
                    # Recovering information depends on Lustre version.
                    #
                    # VERSION:                2.0            1.8                     1.6
                    #
                    # connected_clients:  connect/TOTAL   connect/TOTAL            connect/TOTAL
                    # req_replay:         req_replay      ---                      ---
                    # lock_replay:        lock_replay     ---                      ---
                    # delayed_client:     ---             delay/TOTAL              ---
                    # completed_clients:  connect-replay  TOTAL-recov-delay/TOTAL  TOTAL-recov/TOTAL
                    # evicted_clients:    stale           ---                      ---
                    #
                    if status == "RECOVERING":
                        time_remaining = "??"
                        completed = -1
                        evicted = 0
                        total = 0
                        # continue reading after the "status:" line
                        for line in fproc:
                            line = line.strip()
                            if line.startswith("time_remaining:"):
                                time_remaining = line.split(' ', 1)[1]
                            elif line.startswith("connected_clients:"):
                                total = int(line.split('/', 1)[1])
                            elif line.startswith("evicted_clients:"):
                                evicted = int(line.split(' ', 1)[1])
                            elif line.startswith("completed_clients:"):
                                completed = line.split(' ', 1)[1]
                                completed = int(completed.split('/', 1)[0])
                        self.local_state = RECOVERING
                        self.recov_info = "%ss (%s/%s)" % (time_remaining,
                                                    completed + evicted, total)
                finally:
                    fproc.close()