Пример #1
0
    def determine_array_status(self):
        """Return the array status string for this md device.

        Possible results are 'stopped', 'online', 'degraded' and
        'rebuilding':

        * 'stopped'    - /dev/<dev_name> cannot be stat'ed, or one of the md
                         sysfs attributes cannot be read (IOError).
        * 'online'     - the array is linear, or failed_disks reads as 0.
        * 'degraded'   - failed_disks is non-zero and sync_action is 'idle'.
        * 'rebuilding' - failed_disks is non-zero and sync_action is
                         anything other than 'idle'.
        """
        # MD can leave an array device node present, so the node existing is
        # only the first check; the sysfs state is consulted as well below.
        try:
            stat('/dev/%s' % self.dev_name)
        except OSError:
            return 'stopped'

        if self.level == 'linear':
            # linear raid levels do not support sync/failed disks and do not
            # maintain raid drive element state, so if the array device
            # exists we simply treat it as online.
            return 'online'

        try:
            # NOTE(review): array_state is read but its value is never
            # inspected; the read only matters because a failure here is
            # reported as 'stopped'.
            array_state = get_sysfs_param('/sys/block/%s/md/array_state' % self.dev_name)
            sync_action = get_sysfs_param('/sys/block/%s/md/sync_action' % self.dev_name)
            failed_count = int(
                get_sysfs_param('/sys/block/%s/md/failed_disks' % self.dev_name), 10)
        except IOError:
            return 'stopped'

        # no failed disks means healthy; otherwise see if we're syncing
        if failed_count == 0:
            return 'online'
        if sync_action == 'idle':
            return 'degraded'
        return 'rebuilding'
Пример #2
0
    def purge_faulty_drives(self):
        """Remove stale 'faulty' member devices from this array via sysfs.

        Walks the entries under /sys/block/<array>/md/ whose names start
        with 'dev' and, for each one whose state file reads exactly
        "faulty", writes 'remove' to that state file.  A notice is logged
        when set_sysfs_param reports that the removal did not succeed.

        This is best effort: an entry whose state cannot be read is
        skipped, and an IOError/OSError anywhere else in the walk ends the
        walk silently.  Nothing is returned.
        """
        array_name = self.get_devname()
        md_rootdir = '/sys/block/%s/md/' % array_name

        try:
            for entry in listdir(md_rootdir):
                # only the dev-sdX style member entries are of interest
                if not entry.startswith('dev'):
                    continue

                state_entry = '%s%s/state' % (md_rootdir, entry)
                try:
                    state = '%s' % get_sysfs_param(state_entry)
                except (IOError, OSError):
                    # ignore and continue with the next entry
                    continue

                if state != "faulty":
                    continue

                # we found a disk that should have been removed but wasn't
                rlog_debug('Cleaning up stale device [%s] reference in array [%s]' % (
                           entry, array_name))
                if not set_sysfs_param(state_entry, 'remove'):
                    # arguments are (device, array) to match the message
                    rlog_notice('Unable to remove faulty device [%s] from array [%s]' % (
                                entry, array_name))
        except (IOError, OSError):
            # make sure we keep on going if we have a problem, we want to
            # try to fix any inconsistencies found
            pass
Пример #3
0
    def collect_rebuild_info(self):
        """Record rebuild progress from the array's sync_completed attribute.

        Reads /sys/block/<dev_name>/md/sync_completed, which is expected to
        look like "<completed>/<total>", and stores the two integers in
        self.sync_completed_kb and self.sync_total_kb.

        If the attribute cannot be read (rrdm_error), does not contain two
        '/'-separated fields (IndexError), or a field is not an integer
        (ValueError, e.g. the value "none"), progress is recorded as
        unknown: sync_total_kb = -1 and sync_completed_kb = 0.

        NOTE(review): the attribute names say kb; confirm the unit the
        kernel reports for sync_completed.
        """
        try:
            sync_state = get_sysfs_param('/sys/block/%s/md/sync_completed' % self.dev_name)
            values = sync_state.strip().split("/")
            # parse both fields before touching self so a bad second field
            # cannot leave a half-updated pair behind
            completed = int(values[0])
            total = int(values[1])
        except (rrdm_error, IndexError, ValueError):
            self.sync_total_kb = -1
            self.sync_completed_kb = 0
            return

        self.sync_completed_kb = completed
        self.sync_total_kb = total
        rlog_debug("Rebuild info : [%d:%d]" % (self.sync_completed_kb, self.sync_total_kb))
Пример #4
0
    def fill_from_system_info(self):
        """Refresh raid_port / raid_status from the drive and the md sysfs tree.

        When the backing drive reports status 'missing', the partition is
        first marked as missing (raid_port -1, raid_status 'missing').
        The slot number is then read from
        /sys/block/<array>/md/dev-<dev_name>/slot and stored in raid_port;
        if that read or the integer conversion fails for any reason the
        partition is marked as missing.

        NOTE(review): the slot lookup runs even for a missing drive, so a
        successful read overwrites raid_port while raid_status stays
        'missing' - confirm this is intended.
        """
        drive_absent = self.hd.status == 'missing'
        if drive_absent:
            self.raid_port, self.raid_status = -1, 'missing'

        slot_path = '/sys/block/%s/md/dev-%s/slot' % (self.raid_array.dev_name,
                                                      self.dev_name)
        try:
            self.raid_port = int(get_sysfs_param(slot_path), 10)
        except Exception:
            self.raid_port, self.raid_status = -1, 'missing'
Пример #5
0
    def fill_from_system_info(self,
                             device_list,
                             name,
                             dev_name,
                             fstype,
                             type,
                             layout,
                             level,
                             cfg_size_mb,
                             sysfscfg_list=None):
        """Populate this array from its configuration and the live md state.

        Stores the configuration values on the instance, determines the
        array status (collecting rebuild progress when rebuilding), reads
        the array uuid from sysfs unless the array is stopped, and appends
        one RaidPartition to self.part_list for every device in
        device_list, incrementing self.found_devices for each.

        Parameters:
            device_list   -- object providing get_expected_drives() and
                             get_devices(); each device provides part_id,
                             hd, get_devname() and get_logical_device().
            name, dev_name, fstype, type, layout, level, cfg_size_mb
                          -- stored unchanged on the instance.  (`type`
                             shadows the builtin; the name is kept because
                             it is part of the existing signature.)
            sysfscfg_list -- optional list stored on the instance; a new
                             empty list is used when omitted.

        self.part_list and self.found_devices are not initialised here and
        must already exist on the instance.
        """
        self.dev_name = dev_name
        self.name = name
        self.fstype = fstype
        self.type = type
        self.layout = layout
        self.level = level
        self.cfg_size_mb = cfg_size_mb

        self.__device_list = device_list
        # use a fresh list per call: a literal [] default would be one
        # object shared by every array that omitted the argument
        if sysfscfg_list is None:
            sysfscfg_list = []
        self.__sysfscfg_list = sysfscfg_list

        # currently we expect each raid to go across all drives
        # in the system
        self.num_drives = self.__device_list.get_expected_drives()

        self.status = self.determine_array_status()
        if self.is_rebuilding():
            self.collect_rebuild_info()

        if not self.is_stopped():
            try:
                self.uuid = get_sysfs_param('/sys/block/%s/md/uuid' % self.dev_name)
            except rrdm_error:
                self.uuid = ''

        rlog_debug('raid status for [%s] is [%s]' % (self.dev_name, self.status))
        for diskpart in self.__device_list.get_devices():
            part_num = diskpart.part_id
            disk = diskpart.hd
            rlog_debug('adding disk device for raid array [%s] part [%s]' %
                       (self.dev_name, part_num))

            rpart = diskpart.get_devname()

            # This is an assumption that should hold true even on old boxes,
            # the raid port should equal the logical port carried by the
            # device in the drive list.
            # originally we simply encoded the raid port in the rvbd SB as
            # the drive number.  This would be an issue if we supported
            # moving around disks, but as we don't support that today, we
            # should be ok.
            # the problem with moving drives around would be that each drive
            # physically could now be a different rdev in a number of arrays,
            # and the SB doesn't store this well today.
            # (An earlier superblock/mdadm based lookup of the raid port
            # used to live here and has been removed.)
            rdev = diskpart.get_logical_device()
            if rdev == 'unknown':
                continue

            rlog_debug('disk [%s] is [%s] raid drive [%s]' %
                       (rpart, self.dev_name, rdev))

            base_dev = hwtool_disk_map.find_devname_by_port(disk.portnum)
            base_devname = '%s%s' % (base_dev, part_num)

            # a member whose sysfs state cannot be read is reported failed
            path = '/sys/block/%s/md/dev-%s/state' % (self.dev_name, base_devname)
            try:
                disk_state = get_sysfs_param(path)
                disk_status = convert_md_status_to_rrdm(disk_state)
            except IOError:
                disk_status = 'failed'

            newpart = RaidPartition()
            newpart.make_partition(part_num, disk, self, rdev, disk_status)
            newpart.device_name = '%s' % rpart

            self.part_list.append(newpart)
            self.found_devices = self.found_devices + 1
Пример #6
0
    def fail(self):
        """Fail this partition out of its raid array through sysfs.

        Failing is a 2 stage process: the member's md state file is
        written with "faulty" and then with "remove".  Once the disk has
        been removed its entry disappears from sysfs, so the md device
        name is read first and the disappearance of the state file is
        used as the success check.

        The raid port is self.raid_port, or, when that is 'unknown', the
        port SystemConfig maps to this drive's portnum.  If the system
        config is not valid in that case, a notice is logged and the
        method returns without doing anything.

        At most three attempts are made; if the state file still exists
        afterwards a debug message is logged.

        Raises:
            rrdm_error -- if /sys/block/<array>/md/rd<port>/device cannot
                          be read.
        """
        array_name = self.raid_array.get_devname()

        # XXX currently assumes that the disk in port X is raid X
        if self.raid_port == 'unknown':
            # if this drive isn't in the system assume it's on the hard drive.
            rlog_debug('drive has been removed using drive-raid map')
            sysconfig = SystemConfig()
            if not sysconfig.is_config_valid():
                # if we don't know which raid port to fail, don't just
                # continue on. skip out and log a msg.  The raid port is
                # not known here, so report the drive's own port number.
                rlog_notice('Unable to determie rport when failing disk [%s]' %
                            self.hd.portnum)
                return
            portnum = sysconfig.get_disk_rport(self.hd.portnum)
        else:
            portnum = self.raid_port

        state_cmd = "faulty"
        remove_cmd = "remove"
        max_attempts = 3

        md_devname_path = '/sys/block/%s/md/rd%s/device' % (array_name, portnum)

        try:
            md_dev_name = get_sysfs_param(md_devname_path)
        except IOError:
            raise rrdm_error('unable to read raid device : %s' % md_devname_path)

        # use the device name indicated by RAID, since if the drive is
        # missing, md might still have a reference to the device, but we
        # don't have a scsi device to use to figure out what the name of
        # the device that used to be in the array is
        md_state_path = '/sys/block/%s/md/dev-%s/state' % (array_name, md_dev_name)

        rlog_notice('Failing array [%s] device [%s:%s]' % (array_name,
                    portnum, md_dev_name))

        # every pass counts as an attempt, so a device that stays in the
        # array even though the writes succeed cannot keep us here forever
        for _attempt in range(max_attempts):
            try:
                if exists(md_state_path):
                    for cmd in (state_cmd, remove_cmd):
                        sys_file = open(md_state_path, "w")
                        try:
                            sys_file.write(cmd)
                        finally:
                            sys_file.close()

                        # grace period to allow the request to complete
                        sleep(0.5)

                # bail out if it's failed already or we succeeded;
                # if the drive is not really gone, retry
                if not exists(md_state_path):
                    break
            except IOError:
                # the write failed; fall through to the next attempt
                pass

        if exists(md_state_path):
            rlog_debug('Unable to fail %s on %s with cmd [%s:%s]' % (
                       self.raid_port, array_name, md_state_path, remove_cmd))