def label_drives(blank_drives, ip_label, fs, disk_label_list, cp_input_list):
    """
    Apply the swift label to partition 1 of each drive and to its xfs
    filesystem.

    Drives that confirm_config() reports as not selected are skipped with a
    message. A labelling failure prints an error and exits the process with
    status 1; when only the filesystem label fails, the partition name is
    reset to "primary" before exiting.

    :param blank_drives: drives that have not been labelled yet
    :param ip_label: hex representation of the ip address that is used for
                     the swift device labels
    :param fs: filesystem type (not used by this function)
    :param disk_label_list: list of devices that are labelled and their
                            swift device number ("v" is with the swift
                            number if the device is an lvm)
    :param cp_input_list: list of tuples that show the devices selected by
                          the configuration (drive/part/lvm name, swift
                          device name)
    :return disk_label_list: the list passed in, extended with one
                             [device number, first partition] entry per
                             labelled drive
    """
    for drive in blank_drives:
        selected, config_value = confirm_config(drive, cp_input_list)
        if not selected:
            print("%s has not been selected for swift use" % (drive))
            continue

        label, number = get_full_label(config_value, disk_label_list,
                                       cp_input_list, ip_label)

        # Name the partition first ...
        name_cmd = '/sbin/parted -s ' + drive + ' name 1 "' + label + '"'
        name_rc, name_out = run_cmd(name_cmd)
        if name_rc != 0:
            print("Error labelling partition %s - %s" % (drive, name_out))
            sys.exit(1)

        time.sleep(1)

        # ... then give the filesystem on it the same label
        partition = drive + '1'
        xfs_cmd = '/usr/sbin/xfs_admin -L "' + label + '" ' + partition
        xfs_rc, xfs_out = run_cmd(xfs_cmd)
        if xfs_rc != 0:
            print("Error labelling xfs partition %s - %s" % (drive, xfs_out))
            # Revert the partition label back
            os.system('/sbin/parted -s ' + drive + ' name 1 "primary"')
            sys.exit(1)

        disk_label_list.append([str(number), partition])

    return disk_label_list
def check_details():
    """
    Run `ntpq -pcrv` and check the stratum and offset facts it reports.

    The command output being parsed has the form:

         remote           refid      st t when poll reach   delay   offset  jitter
    ===========================================================================
     bindcat.fhsu.ed .INIT.          16 u    - 1024    0    0.000    0.000   0.000
     origin.towfowi. .INIT.          16 u    - 1024    0    0.000    0.000   0.000
     time-b.nist.gov .INIT.          16 u    - 1024    0    0.000    0.000   0.000
     services.quadra .INIT.          16 u    - 1024    0    0.000    0.000   0.000
    associd=0 status=c011 leap_alarm, sync_unspec, 1 event, freq_not_set,
    version="ntpd [email protected] Fri Apr 10 19:04:04 UTC 2015 (1)",
    processor="x86_64", system="Linux/3.14.44-1-amd64-hlinux", leap=11,
    stratum=16, precision=-23, rootdelay=0.000, rootdisp=26.340, refid=INIT,
    reftime=00000000.00000000  Mon, Jan  1 1900  0:00:00.000,
    clock=d94f932a.13f33874  Tue, Jul 14 2015 13:54:50.077, peer=0, tc=3,
    mintc=3, offset=0.000, frequency=0.000, sys_jitter=0.000,
    clk_jitter=0.000, clk_wander=0.000

    :return: a one-element list holding a failed CheckFailure result when
             the command exits non-zero, otherwise the results of
             check_ntpq_fact() for 'stratum' and 'offset', in that order
    """
    ntpq = run_cmd('ntpq -pcrv')

    if ntpq.exitcode != 0:
        failure = CheckFailure.child(dimensions={
            'check': BASE_RESULT.name,
            'error': ntpq.output,
        })
        failure.value = Severity.fail
        return [failure]

    return [check_ntpq_fact(ntpq, fact) for fact in ('stratum', 'offset')]
def ping_check(hp):
    """
    Send a single ping to hp.host and report whether it succeeded.

    :param hp: object whose ``host`` attribute is the address to ping
    :return: (True,) when the ping command exits with 0, otherwise
             (False, "ping_check failed"). Any exception raised while
             building or running the command is reported as a failure too.
    """
    failure = (False, "ping_check failed")
    try:
        ping_succeeded = run_cmd('ping -c 1 -A %s' % hp.host).exitcode == 0
    except Exception:
        return failure
    if ping_succeeded:
        return (True,)
    return failure
def ping_check(hp):
    """
    Ping hp.host once and return the outcome as a tuple.

    :param hp: object providing the target address as ``hp.host``
    :return: (True,) if the ping command's exit code is 0;
             (False, "ping_check failed") if it is non-zero or if anything
             raises while the command is being run
    """
    try:
        exit_code_is_zero = (
            run_cmd('ping -c 1 -A %s' % hp.host).exitcode == 0)
    except Exception:
        exit_code_is_zero = False

    return (True,) if exit_code_is_zero else (False, "ping_check failed")
def is_valid_label(d, r):
    """
    Check that the xfs label reported for a device contains the expected
    label.

    :param d: object providing ``label`` (the expected label) and ``mount``
              (the target handed to ``xfs_admin -l``)
    :param r: unused
    :return: True when d.label equals LABEL_CHECK_DISABLED, or when grep
             finds d.label in the ``xfs_admin -l`` output; False otherwise
    """
    if d.label == LABEL_CHECK_DISABLED:
        return True

    label_check = run_cmd(
        'xfs_admin -l %s | grep -q %s' % (d.mount, d.label))
    return label_check.exitcode == 0
def check_status():
    """
    Report whether `systemctl status ntp` succeeds.

    :return: a one-element list with a child of BASE_RESULT whose value is
             Severity.ok on exit code 0; otherwise the value is
             Severity.fail and the command output is stored under 'error'
    """
    status_cmd = run_cmd('systemctl status ntp')
    result = BASE_RESULT.child()

    if status_cmd.exitcode == 0:
        result.value = Severity.ok
    else:
        result['error'] = status_cmd.output
        result.value = Severity.fail

    return [result]
def get_product_name():
    '''
    Return the type of node (product) as reported by
    `dmidecode -s system-product-name`.

    Prints an error and exits the process with status 1 when the command
    fails.
    '''
    status, product = run_cmd("dmidecode -s system-product-name")
    if status == 0:
        return product

    print("Error getting node product name - %s" % (product))
    sys.exit(1)
def handler(signum, frame):
    """
    Signal handler that aborts a hanging parted command.

    It is installed for SIGALRM by find_parted_drives() and runs when the
    alarm armed there expires. Every process whose `ps -A` line contains
    "parted" is sent SIGKILL, then TimeoutError is raised so the
    interrupted caller can skip the device.

    :param signum: signal number (unused)
    :param frame: interrupted stack frame (unused)
    :raises TimeoutError: always, once the kill attempts are done
    """
    import errno

    pid_out = run_cmd('ps -A')
    for line in pid_out.output.splitlines():
        if 'parted' not in line:
            continue
        # The pid is the first whitespace-separated field of the line
        pid = int(line.split(None, 1)[0])
        try:
            os.kill(pid, signal.SIGKILL)
        except OSError as exc:
            # The process can exit between the ps snapshot and the kill.
            # That must not replace the TimeoutError the caller expects;
            # any other failure (e.g. permissions) is still reported.
            if exc.errno != errno.ESRCH:
                raise
    raise TimeoutError("Parted command is hanging for device")
def label_volumes(blank_volumes, ip_label, disk_label_list, cp_input_list):
    """
    Apply the swift label to the xfs filesystem of each logical volume.

    A volume with no entry in cp_input_list is skipped with a message. A
    failure to set the label prints an error and exits the process with
    status 1.

    :param blank_volumes: volumes that have not been labelled yet
    :param ip_label: hex representation of the ip address that is used for
                     the swift device labels
    :param disk_label_list: list of devices that are labelled and their
                            swift device number ("v" is with the swift
                            number if the device is an lvm)
    :param cp_input_list: list of tuples that show the devices selected by
                          the configuration (drive/part/lvm name, swift
                          device name)
    :return disk_label_list: the list passed in, extended with one
                             [volume number, volume] entry per labelled
                             volume
    """
    for volume in blank_volumes:
        # No need to confirm that volume is marked for swift use as only
        # volumes in the config processor are included from the start.
        # The first three characters ("lvm") of the swift name are dropped.
        config_values = [swift_name[3:]
                         for full_vol, swift_name in cp_input_list
                         if volume == full_vol]
        if not config_values:
            print("%s has not been selected for swift use" % (volume))
            continue

        # When several entries match, the last one is used
        label, number = get_full_vol_label(config_values[-1],
                                           disk_label_list, cp_input_list,
                                           ip_label)

        label_cmd = '/usr/sbin/xfs_admin -L "' + label + '" ' + volume
        label_rc, label_out = run_cmd(label_cmd)
        if label_rc != 0:
            print("Error labelling xfs volume %s - %s" % (volume, label_out))
            sys.exit(1)

        disk_label_list.append([str(number), volume])

    return disk_label_list
def find_parted_drives(drives):
    """
    Find partitioned devices

    This goes through each device on the node. If there is no partition
    present it will be added to a list for later partitioning. Output is
    divided into partitioned and unpartitioned drives.

    `parted <drive> print` is run under a 20 second SIGALRM alarm; handler()
    is installed for that signal and raises TimeoutError when it fires. A
    drive whose command times out is reported and left out of both lists.

    :param drives: list of all drives specified by the swift input model,
                   as (drive, swift name) pairs
    :return partitioned_drives: list of partitioned drives
    :return unpartitioned_drives: list of unpartitioned drives
    """
    partitioned_drives = []
    unpartitioned_drives = []

    signal.signal(signal.SIGALRM, handler)

    for drive, swift_name in drives:
        # Ignore lvms
        if is_lvm(swift_name):
            continue

        # -1 marks "no result", i.e. the command was interrupted
        status = -1
        output = ""
        signal.alarm(20)
        try:
            status, output = run_cmd('/sbin/parted ' + drive + ' print')
        except TimeoutError as exc:
            print("%s %s - killing command" % (exc, drive))
        finally:
            # Disable the alarm as soon as the command is done, whatever
            # the outcome, so it cannot fire outside the try block or stay
            # armed after an unexpected exception.
            signal.alarm(0)

        if status == -1:
            continue
        if status == 0 and "Error" not in output:
            partitioned_drives.append(drive)
        else:
            unpartitioned_drives.append(drive)

    return partitioned_drives, unpartitioned_drives
def get_drive_partitions(parted_d):
    """
    Return the partition lines that `parted <drive> print` lists for a
    drive.

    :param parted_d: drive to inspect
    :return: [] when parted reports an "unrecognised disk label";
             otherwise the non-empty lines that follow the second
             occurrence of "Flags" in the output

    Prints an error and exits the process with status 1 when parted fails.
    """
    status, output = run_cmd('/sbin/parted ' + parted_d + ' print')

    if status != 0:
        print("Error reading %s - %s" % (parted_d, output))
        sys.exit(1)

    if "unrecognised disk label" in output:
        return []

    partitions = output.split("Flags")[2].split("\n")

    # Remove blank entries. A new list is built because removing items from
    # a list while iterating over it skips the element after each removal,
    # which left consecutive blank lines behind.
    return [p for p in partitions if p]
def check_details():
    """
    Check the NTP stratum and offset reported by `ntpq -pcrv`.

    Parses ntp data in the form:

         remote           refid      st t when poll reach   delay   offset  jitter
    ===========================================================================
     bindcat.fhsu.ed .INIT.          16 u    - 1024    0    0.000    0.000   0.000
     origin.towfowi. .INIT.          16 u    - 1024    0    0.000    0.000   0.000
     time-b.nist.gov .INIT.          16 u    - 1024    0    0.000    0.000   0.000
     services.quadra .INIT.          16 u    - 1024    0    0.000    0.000   0.000
    associd=0 status=c011 leap_alarm, sync_unspec, 1 event, freq_not_set,
    version="ntpd [email protected] Fri Apr 10 19:04:04 UTC 2015 (1)",
    processor="x86_64", system="Linux/3.14.44-1-amd64-hlinux", leap=11,
    stratum=16, precision=-23, rootdelay=0.000, rootdisp=26.340, refid=INIT,
    reftime=00000000.00000000  Mon, Jan  1 1900  0:00:00.000,
    clock=d94f932a.13f33874  Tue, Jul 14 2015 13:54:50.077, peer=0, tc=3,
    mintc=3, offset=0.000, frequency=0.000, sys_jitter=0.000,
    clk_jitter=0.000, clk_wander=0.000

    :return: [failed CheckFailure result] if ntpq exits non-zero, else the
             check_ntpq_fact() results for 'stratum' then 'offset'
    """
    cmd_result = run_cmd('ntpq -pcrv')

    if cmd_result.exitcode == 0:
        results = []
        for fact_name in ('stratum', 'offset'):
            results.append(check_ntpq_fact(cmd_result, fact_name))
        return results

    # The command itself failed: report that instead of any facts
    dimensions = {
        'check': BASE_RESULT.name,
        'error': cmd_result.output,
    }
    failed = CheckFailure.child(dimensions=dimensions)
    failed.value = Severity.fail
    return [failed]
def is_xfs(d, r):
    """
    Tell whether `mount` lists d.mount on a line that also mentions xfs.

    :param d: object providing ``mount``
    :param r: unused
    :return: True when the grep over the `mount` output exits with 0,
             False otherwise
    """
    mount_check = run_cmd('mount | grep -qE "%s.*xfs"' % d.mount)
    return mount_check.exitcode == 0
def _split_partition_name(partition):
    """Split a partition device name into (disk device, partition number)."""
    disk = partition.rstrip('0123456789')
    number = partition[len(disk):]
    if not number:
        # No trailing digits: keep the historical "last character" split
        return partition[:-1], partition[-1]
    # Disks whose own name ends in a digit (nvme0n1, mmcblk0, c0d0) use a
    # "p" separator in front of the partition number.
    if disk.endswith('p') and disk[-2:-1].isdigit():
        disk = disk[:-1]
    return disk, number


def label_partitions(blank_partitions, ip_label, fs, disk_label_list,
                     cp_input_list):
    """
    Label particular partitions and filesystems in a drive

    Partitions that confirm_config() reports as not selected are skipped
    with a message. If `fs` is "xfs" and does not appear in the parted
    print output for the partition, an xfs filesystem is created first.
    Any command failure prints an error and exits the process with status
    1; when only the filesystem label fails, the partition name is reset to
    "primary" before exiting.

    The disk and partition number are taken from the partition name, so
    multi-digit numbers (/dev/sda10) and "p"-separated names
    (/dev/nvme0n1p1) are handled.

    :param blank_partitions: partitions that have not been labelled yet
    :param ip_label: hex representation of the ip address that is used for
                     the swift device labels
    :param fs: filesystem type expected on the partition
    :param disk_label_list: list of devices that are labelled and their
                            swift device number ("v" is with the swift
                            number if the device is an lvm)
    :param cp_input_list: list of tuples that show the devices selected by
                          the configuration (drive/part/lvm name, swift
                          device name)
    :return disk_label_list: the list passed in, extended with one
                             [device number, partition] entry per labelled
                             partition
    """
    for blank_p in blank_partitions:
        config_match, config_disk_no = confirm_config(blank_p, cp_input_list)
        if not config_match:
            print("%s has not been selected for swift use" % (blank_p))
            continue

        full_label, disk_number = get_full_label(config_disk_no,
                                                 disk_label_list,
                                                 cp_input_list, ip_label)

        disk_dev, part_no = _split_partition_name(blank_p)

        p_lab_status, p_lab_output = run_cmd(
            '/sbin/parted -s ' + disk_dev + ' name ' + part_no +
            ' "' + full_label + '"')
        if p_lab_status != 0:
            print("Error labelling partition %s - %s" %
                  (blank_p, p_lab_output))
            sys.exit(1)

        time.sleep(1)

        # Make sure that there is a filesystem to label.
        # NOTE(review): the exit status of this command is not checked, so
        # a failed print also leads to the mkfs below - confirm intended.
        part_status, part_output = run_cmd('/sbin/parted -s ' + blank_p +
                                           ' p')

        # Add the fs to the partition if it isn't there already
        if fs not in part_output and fs == "xfs":
            create_fs_stat, create_fs_out = run_cmd(
                '/sbin/mkfs.xfs -f -i size=256 ' + blank_p)
            if create_fs_stat != 0:
                print("Error: Failed to create filesystem for partition %s "
                      "- %s" % (blank_p, create_fs_out))
                sys.exit(1)

        fs_label_status, fs_label_output = run_cmd(
            '/usr/sbin/xfs_admin -L "' + full_label + '" ' + blank_p)
        if fs_label_status != 0:
            print("Error labelling xfs partition %s - %s" %
                  (blank_p, fs_label_output))
            # Revert the partition label back
            os.system('/sbin/parted -s ' + disk_dev + ' name ' + part_no +
                      ' "primary"')
            sys.exit(1)

        disk_label_list.append([str(disk_number), blank_p])

    return disk_label_list
def mount_devices(ip_label, disk_label_list, mount_dir, cp_input_list):
    """
    Function to mount all labelled devices.

    For every labelled device a mount point named "lvm<n>" or "disk<n>" is
    created below mount_dir when missing, and devices that are not mounted
    yet are mounted by label, each in its own thread running
    mount_by_label(). When cp_input_list is non-empty, devices whose swift
    name is not in it are skipped with a message. After waiting up to
    thread_timeout for each thread, the new mount points are chowned to
    swift:swift.

    Both kinds of mount point are built with os.path.join(), so mount_dir
    works with or without a trailing slash.

    Prints an error and exits the process with status 1 when a mkdir or
    chown command fails. Nothing is returned.

    :param ip_label: hex representation of the ip address that is used for
                     the swift device labels
    :param disk_label_list: list of devices that are labelled and their
                            swift device number ("v" is with the swift
                            number if the device is an lvm)
    :param mount_dir: directory where the devices will be mounted to
    :param cp_input_list: list of tuples that show the devices selected by
                          the configuration (drive/part/lvm name, swift
                          device name)
    """
    mounts_to_check = []
    return_results = []
    mount_threads = []

    if not os.path.isdir(mount_dir):
        cmd_status, cmd_output = run_cmd("/bin/mkdir " + mount_dir)
        if cmd_status != 0:
            print("Error creating mount directory %s" % (mount_dir))
            sys.exit(1)

    for label_no in disk_label_list:
        # Logical volumes carry a "v" in front of their number; it is not
        # part of the swift device name
        is_volume = "v" in label_no[0]
        number = label_no[0][1:] if is_volume else label_no[0]
        swift_dev = ("lvm" if is_volume else "disk") + number

        if cp_input_list:
            selected = any(swift_dev == swift_name
                           for _, swift_name in cp_input_list)
            if not selected:
                print("Not mounting %s - it isn't marked for swift use" %
                      (swift_dev))
                continue

        mount_point = os.path.join(mount_dir, swift_dev)

        # Create mount dir if it doesn't exist
        if not os.path.isdir(mount_point):
            cmp_status, cmp_output = run_cmd("/bin/mkdir " + mount_point)
            if cmp_status != 0:
                print("Error creating mount point %s - %s" %
                      (mount_point, cmp_output))
                sys.exit(1)

            # Make sure that the directories are owned by root:root
            chown_status, chown_out = run_cmd("/bin/chown root:root " +
                                              mount_point)
            if chown_status != 0:
                print("Error changing ownership of %s - %s" %
                      (mount_point, chown_out))
                sys.exit(1)

        mount_label = (ip_label + ("v" if is_volume else "h") +
                       three_dig_format(number))

        # Check if already mounted
        if os.path.ismount(mount_point):
            print("%s is already mounted" % (mount_point))
        else:
            t = threading.Thread(target=mount_by_label,
                                 args=(mount_point, mount_label,
                                       return_results))
            t.start()
            mount_threads.append(t)
            mounts_to_check.append([mount_label, mount_point])

    for mt in mount_threads:
        mt.join(thread_timeout)

    # TODO - this will soon be logged instead of printed out
    if return_results:
        print("%s" % ("Running: ".join(return_results)))

    # Confirm that the drives are mounted
    for drives in mounts_to_check:
        if os.path.isdir(drives[1]):
            # Double-check the directories are owned by swift:swift
            # NOTE - there is a corner-case where the thread_timeout value
            # expires and the ownership will not be changed here (the mount
            # thread may complete at a later time). In this case, it is
            # necessary that the diags pick up bad ownership asap
            chwn_status, chwn_out = run_cmd("/bin/chown swift:swift " +
                                            drives[1])
            if chwn_status != 0:
                print("Error changing ownership of %s - %s" %
                      (drives[1], chwn_out))
                sys.exit(1)

            print("Mounted to %s with label %s" % (drives[1], drives[0]))
def get_controller_info():
    """
    Collect controller metrics from `hpssacli ctrl all show detail`.

    The output is split into blocks with TextScanner. Each top-level block
    starting with "Smart Array" or "Smart HBA" starts a new controller
    record holding its 'model'; its sub-blocks are handed to
    parse_cont_attribute(), which also receives the list of controller
    slots. Sub-blocks of a "CACHE STATUS" block that contain ": " are
    added to the most recent controller. Other top-level blocks are
    ignored. Every controller record is then passed to check_controller().

    Example of the output being parsed:

    Smart Array P410 in Slot 1
       Bus Interface: PCI
       Slot: 1
       Serial Number: PACCR0M9VZ41S4Q
       Cache Serial Number: PACCQID12061TTQ
       RAID 6 (ADG) Status: Disabled
       Controller Status: OK
       Hardware Revision: C
       Firmware Version: 6.60
       Rebuild Priority: Medium
       Expand Priority: Medium
       Surface Scan Delay: 15 secs
       Surface Scan Mode: Idle
       Queue Depth: Automatic
       Monitor and Performance Delay: 60 min
       Elevator Sort: Enabled
       Degraded Performance Optimization: Disabled
       Inconsistency Repair Policy: Disabled
       Wait for Cache Room: Disabled
       Surface Analysis Inconsistency Notification: Disabled
       Post Prompt Timeout: 15 secs
       Cache Board Present: True
       Cache Status: OK
       Cache Ratio: 25% Read / 75% Write
       Drive Write Cache: Disabled
       Total Cache Size: 256 MB
       Total Cache Memory Available: 144 MB
       No-Battery Write Cache: Disabled
       Cache Backup Power Source: Batteries
       Battery/Capacitor Count: 1
       Battery/Capacitor Status: OK
       SATA NCQ Supported: True
       Number of Ports: 2 Internal only
       Encryption Supported: False
       Driver Name: hpsa
       Driver Version: 3.4.0
       Driver Supports HP SSD Smart Path: False

    Smart Array P440ar in Slot 0 (Embedded) (HBA Mode)
       Bus Interface: PCI
       Slot: 0
       Serial Number: PDNLH0BRH7V7GC
       Cache Serial Number: PDNLH0BRH7V7GC
       Controller Status: OK
       Hardware Revision: B
       Firmware Version: 2.14
       Controller Temperature (C): 50
       Number of Ports: 2 Internal only
       Driver Name: hpsa
       Driver Version: 3.4.4
       HBA Mode Enabled: True
       PCI Address (Domain:Bus:Device.Function): 0000:03:00.0
       Negotiated PCIe Data Rate: PCIe 3.0 x8 (7880 MB/s)
       Controller Mode: HBA
       Controller Mode Reboot: Not Required
       Current Power Mode: MaxPerformance
       Host Serial Number: MXQ51906YF

    :return: (results, controller_slots); [[], []] when hpssacli reports
             "Error: No controllers detected."
    :raises Exception: when the command exits non-zero for any other
                       reason, or succeeds without output. The message
                       carries the output, cut to its last 1844 characters
                       behind '...' when a failing command printed more
                       than 1847 characters.
    """
    results = []
    controller_slots = []
    controller_result = BASE_RESULT.child()
    controller_result.name += '.' + 'smart_array'

    rc = run_cmd(LOCK_FILE_COMMAND + 'hpssacli ctrl all show detail')
    failure_text = ('{0}: hpssacli ctrl all show detail '
                    'failed with exit code: {1}')

    if rc.exitcode != 0:
        if 'Error: No controllers detected.' in str(rc.output):
            return [[], []]
        if len(rc.output) > 1847:
            rc = rc._replace(exitcode=rc.exitcode,
                             output='...' + rc.output[-1844:])
        raise Exception(failure_text.format(rc.output, rc.exitcode))

    if not rc.output:
        raise Exception(failure_text.format(rc.output, rc.exitcode))

    root = TextScanner(rc.output.split('\n')).get_root_block()

    # Extract controller information
    controllers = []
    current = None
    for block in root.subblocks:
        heading = block.text
        if heading.startswith(('Smart Array', 'Smart HBA')):
            model, _ = parse_controller_name(heading)
            current = {'model': model}
            controllers.append(current)
            for attribute in block.subblocks:
                parse_cont_attribute(attribute, current, controller_slots)
        elif heading.startswith('CACHE STATUS'):
            # These attributes belong to the controller seen last
            for attribute in block.subblocks:
                if ': ' in attribute.text and current:
                    parse_cont_attribute(attribute, current,
                                         controller_slots)
        # Anything else is an unknown controller type and is skipped

    # Walk the controller records to gather controller metrics
    for entry in controllers:
        results.extend(check_controller(entry, controller_result))

    return results, controller_slots
def get_smart_array_info():
    """
    Collect controller metrics from `hpssacli ctrl all show detail`.

    The output is read line by line. A line recognised by
    is_cont_heading() starts a new controller record whose 'model' is the
    text in front of "in Slot". Every other non-blank line is split at its
    first colon into a lower-cased key and a value (values are not
    changed). Each record is passed to check_controller(), and its 'slot'
    value, when present, is collected.

    Example of the output being parsed:

    Smart Array P410 in Slot 1
       Bus Interface: PCI
       Slot: 1
       Serial Number: PACCR0M9VZ41S4Q
       Cache Serial Number: PACCQID12061TTQ
       RAID 6 (ADG) Status: Disabled
       Controller Status: OK
       Hardware Revision: C
       Firmware Version: 6.60
       Rebuild Priority: Medium
       Expand Priority: Medium
       Surface Scan Delay: 15 secs
       Surface Scan Mode: Idle
       Queue Depth: Automatic
       Monitor and Performance Delay: 60 min
       Elevator Sort: Enabled
       Degraded Performance Optimization: Disabled
       Inconsistency Repair Policy: Disabled
       Wait for Cache Room: Disabled
       Surface Analysis Inconsistency Notification: Disabled
       Post Prompt Timeout: 15 secs
       Cache Board Present: True
       Cache Status: OK
       Cache Ratio: 25% Read / 75% Write
       Drive Write Cache: Disabled
       Total Cache Size: 256 MB
       Total Cache Memory Available: 144 MB
       No-Battery Write Cache: Disabled
       Cache Backup Power Source: Batteries
       Battery/Capacitor Count: 1
       Battery/Capacitor Status: OK
       SATA NCQ Supported: True
       Number of Ports: 2 Internal only
       Encryption Supported: False
       Driver Name: hpsa
       Driver Version: 3.4.0
       Driver Supports HP SSD Smart Path: False

    Smart Array P440ar in Slot 0 (Embedded) (HBA Mode)
       Bus Interface: PCI
       Slot: 0
       Serial Number: PDNLH0BRH7V7GC
       Cache Serial Number: PDNLH0BRH7V7GC
       Controller Status: OK
       Hardware Revision: B
       Firmware Version: 2.14
       Controller Temperature (C): 50
       Number of Ports: 2 Internal only
       Driver Name: hpsa
       Driver Version: 3.4.4
       HBA Mode Enabled: True
       PCI Address (Domain:Bus:Device.Function): 0000:03:00.0
       Negotiated PCIe Data Rate: PCIe 3.0 x8 (7880 MB/s)
       Controller Mode: HBA
       Controller Mode Reboot: Not Required
       Current Power Mode: MaxPerformance
       Host Serial Number: MXQ51906YF

    :return: (results, controller_slots) on success. The early exits
             return a plain list instead: [] when hpssacli reports
             "Error: No controllers detected.", or a one-element list with
             a 'check.failure' metric when the command fails or prints
             nothing.
    """
    controller_result = BASE_RESULT.child()
    controller_result.name += '.' + 'smart_array'

    def failure(error):
        # Single failed metric describing why no data could be gathered
        return MetricData.single(
            'check.failure',
            Severity.fail,
            '{check} failed with: {error}',
            {
                'check': controller_result.name,
                'error': error,
                'component': 'swiftlm-scan'
            })

    rc = run_cmd(LOCK_FILE_COMMAND + 'hpssacli ctrl all show detail')

    if rc.exitcode != 0:
        if 'Error: No controllers detected.' in str(rc.output):
            return []
        return [failure(str(rc.output))]

    if not rc.output:
        return [failure('No usable output from hpssacli')]

    controllers = []
    current = {}
    for text in rc.output.split('\n'):
        # Ignore blank lines
        if not text or text.isspace():
            continue

        if is_cont_heading(text):
            if current:
                controllers.append(current)
            # To get controller model, assume that the line is in the form:
            # <model> in Slot <slot>
            current = {'model': text.strip().split("in Slot")[0].strip()}
            continue

        key, value = text.split(':', 1)
        current[key.strip().lower()] = value.strip()

    # The last controller is still pending at this point
    if current:
        controllers.append(current)

    results = []
    controller_slots = []
    for controller in controllers:
        results.extend(check_controller(controller, controller_result))
        slot = controller.get('slot')
        if slot:
            controller_slots.append(slot)

    return results, controller_slots
def create_volume_fs(cp_input_list, disk_label_list, iplabel):
    """
    Create xfs filesystems for any volumes that don't have one. Also, add
    labelled volumes to list

    As well as adding the labelled volumes to the disk label list this will
    return a list of unlabelled volumes. Furthermore, any unlabelled volume
    that doesn't have an xfs filesystem will be given one.

    Before a filesystem is created the volume must pass two checks:
    `e2label` must fail on it (no ext filesystem present) and its name must
    not appear in the `df` output (not mounted). If either check fails, or
    mkfs fails, an error is printed and the process exits with status 1.

    :param cp_input_list: list of tuples that show the devices selected by
                          the configuration (drive/part/lvm name, swift
                          device name)
    :param disk_label_list: list of devices that are labelled and their
                            swift device number ("v" is with the swift
                            number if the device is an lvm)
    :param iplabel: hex representation of the ip address that is used for
                    the swift device labels
    :return blank_volumes: volumes that have yet to be labelled
    :return disk_label_list: list of devices that have been labelled and
                             their device number
    """
    blank_volumes = []

    for full_vol, swift_name in cp_input_list:
        # Only interested in logical volumes
        if "lvm" not in swift_name:
            continue

        lvm_l_stat, lvm_l_out = run_cmd("/usr/sbin/xfs_admin -l " + full_vol)

        if lvm_l_stat == 0:
            # An xfs filesystem exists - check if labelled correctly
            if (iplabel + "v") in lvm_l_out:
                nop, number = lvm_l_out.strip('"').split("v")
                disk_label_list.append(["v" + str(int(number)), full_vol])
            else:
                blank_volumes.append(full_vol)
            continue

        # No xfs filesystem present. Make sure that the volume is not being
        # used already before creating one.
        # 1) Check if it has another filesystem. The space after the
        #    command name is required; without it the command never ran and
        #    this guard could not trigger.
        ext_status, _ = run_cmd("/sbin/e2label " + full_vol)
        if ext_status == 0:
            print("Cannot proceed - %s has a non-compatible swift fs" %
                  (full_vol))
            sys.exit(1)

        # 2) Check if already mounted.
        # * Note - os.path.ismount() can't be used as cannot be certain of
        # mount point at this stage
        _, mount_status = run_cmd("df")
        logical_vol = full_vol.split("/")[-1]
        if logical_vol in mount_status:
            print("Cannot proceed - %s is already mounted" % (full_vol))
            sys.exit(1)

        lvm_xfs_stat, lvm_xfs_out = run_cmd(
            "/sbin/mkfs.xfs -f -i size=256" + " " + full_vol)
        if lvm_xfs_stat != 0:
            print("Error: Failed to create filesystem for volume %s"
                  " - %s" % (full_vol, lvm_xfs_out))
            sys.exit(1)

        blank_volumes.append(full_vol)

    return blank_volumes, disk_label_list
def format_drives(blank_drives, unparted_drives, fs, cp_input_list):
    """
    Partition all raw drives

    As well as partitioning and adding a fs to a drive, the function will
    also add them to the unlabelled drives list.

    Each drive gets a gpt partition table and one partition, then
    partprobe is run (up to 5 attempts, with a growing pause between them)
    so the kernel rereads the table. When cp_input_list is non-empty,
    drives not listed in it are skipped with a message. Any command
    failure prints an error and exits the process with status 1.

    :param blank_drives: list of partitioned drives that do not have a
                         swift label
    :param unparted_drives: list of drives that need to be partitioned
    :param fs: type of filesystem to create
    :param cp_input_list: list of tuples that show the devices selected by
                          the configuration (drive/part/lvm name, swift
                          device name)
    :return blank_drives: see above
    """
    blockdev_retry_max = 5

    for unparted in unparted_drives:
        if cp_input_list:
            # Make sure that the drive is marked for swift use by the
            # config processor
            config_match = any(unparted == full_vol
                               for full_vol, _ in cp_input_list)
            if not config_match:
                print("%s has not been selected for swift use" % (unparted))
                continue

        # Partition the drive:
        create_status, create_output = run_cmd("/sbin/parted -s " +
                                               unparted + " mklabel gpt")
        if create_status != 0:
            print("Error: Failed to create partition table for disk %s - %s"
                  % (unparted, create_output))
            sys.exit(1)

        partition_status, partition_output = run_cmd(
            "/sbin/parted -s -- " + unparted + " mkpart primary 1 -1")
        if partition_status != 0:
            # This is the mkpart step, not the partition table
            print("Error: Failed to create partition for disk %s - %s"
                  % (unparted, partition_output))
            sys.exit(1)

        for attempt in range(1, blockdev_retry_max + 1):
            blockdev_status, blockdev_output = run_cmd(
                "/sbin/partprobe %s" % unparted)
            if blockdev_status == 0:
                break
            # Back off a little longer after each failure; there is no
            # point in waiting once the last attempt has failed
            if attempt < blockdev_retry_max:
                time.sleep(attempt)
        else:
            print("Error: Failed to reread the disks partition table %s - %s"
                  % (unparted, blockdev_output))
            sys.exit(1)

        # Need to introduce a delay between creating the partition and the
        # file system
        time.sleep(1)

        # Only supports xfs for now
        if fs == "xfs":
            filesys_status, filesys_output = run_cmd(
                "/sbin/mkfs.xfs -f -i size=256 " + unparted + "1")
            if filesys_status != 0:
                print("Error: Failed to create filesystem for disk %s - %s"
                      % (unparted, filesys_output))
                sys.exit(1)

        blank_drives.append(unparted)

    return blank_drives
def get_logical_drive_info(slot, cache_check=True):
    """
    Collect logical drive metrics from
    `hpssacli ctrl slot=<slot> ld all show detail`.

    Blank lines are dropped and the remaining lines are stripped. A line
    starting with "array " begins a new drive record; other lines are
    split at the first colon into a lower-cased key and a value. Each
    record is passed to check_logical_drive().

    Example of the output being parsed:

    array L

      Logical Drive: 12
         Size: 1.8 TB
         Fault Tolerance: 0
         Heads: 255
         Sectors Per Track: 32
         Cylinders: 65535
         Strip Size: 256 KB
         Full Stripe Size: 256 KB
         Status: OK
         Caching:  Enabled
         Unique Identifier: 600508B1001CEA938043498011A76404
         Disk Name: /dev/sdl   Mount Points: /srv/node/disk11 1.8 TB Partition Number 2
         OS Status: LOCKED
         Logical Drive Label: AF3C73D8PACCR0M9VZ41S4QEB69
         Drive Type: Data
         LD Acceleration Method: Controller Cache

    BUG: It appears that the current build of hpssacli has a bug and
    outputs Disk Name and Mount Points on the same line. We work around
    this by checking for these specifically but that could fail if they
    change

    :param slot: controller slot to query
    :param cache_check: passed through to check_logical_drive()
    :return: list of results from check_logical_drive(), or a one-element
             list with a 'check.failure' metric when the command fails or
             prints nothing usable
    """
    drive_result = BASE_RESULT.child()
    drive_result.name += '.' + 'logical_drive'

    def failure(error):
        # Single failed metric describing why no data could be gathered
        return MetricData.single(
            'check.failure',
            Severity.fail,
            '{check} slot: {slot} failed with: {error}',
            {'check': drive_result.name,
             'slot': slot,
             'error': error,
             'component': 'swiftlm-scan'})

    rc = run_cmd(
        LOCK_FILE_COMMAND + 'hpssacli ctrl slot=%s ld all show detail' % slot)
    if rc.exitcode != 0:
        return [failure(str(rc.output))]

    # Remove blank lines and strip trailing/leading spaces for each line
    lines = [raw.strip() for raw in rc.output.split('\n') if raw.strip()]
    if not lines:
        return [failure('No usable output from hpssacli')]

    # First line should be the controller model and slot number.
    # We already have this so remove it if it exists
    if is_cont_heading(lines[0]):
        lines = lines[1:]

    drives = []
    current = {}
    for text in lines:
        if text.count(':') == 2:
            # If we see two colons we have to assume that it is a bugged
            # version of hpssacli and split them accordingly.
            _, disk_name, mount_points = text.split(':')
            current['disk name'] = disk_name.strip().split()[0]
            current['mount points'] = mount_points.strip()
        elif text.startswith('array '):
            # The Array # line may be useful in the future but does not
            # follow the format of colon separated information.
            # It is also the only delimiter between drives, so a new
            # record starts here.
            if current:
                drives.append(current)
                current = {}
            current['array'] = text.split()[1]
        else:
            key, value = text.split(':', 1)
            current[key.strip().lower()] = value.strip()

    # Have to add the last drive.
    if current:
        drives.append(current)

    results = []
    for drive in drives:
        results.extend(check_logical_drive(drive, drive_result, cache_check))
    return results
def get_physical_drive_info(slot):
    """
    Collect physical drive metrics from
    `hpssacli ctrl slot=<slot> pd all show detail`.

    There are multiple drives in the output. Blank lines are dropped and
    the remaining lines are stripped. A line starting with "array " begins
    a new drive record, lines with more than one colon are skipped, and
    other lines are split at the first colon into a lower-cased key and a
    value. Each record is passed to check_physical_drive().

    Example of the output being parsed:

    array A

      physicaldrive 2C:1:1
         Port: 2C
         Box: 1
         Bay: 1
         Status: OK
         Drive Type: Data Drive
         Interface Type: SAS
         Size: 2 TB
         Native Block Size: 512
         Rotational Speed: 7200
         Firmware Revision: HPD3
         Serial Number: YFJMHTZD
         Model: HP      MB2000FBUCL
         Current Temperature (C): 27
         Maximum Temperature (C): 38
         PHY Count: 2
         PHY Transfer Rate: 6.0Gbps, Unknown

    :param slot: controller slot to query; also stored as the
                 'controller_slot' dimension of the results
    :return: list of results from check_physical_drive(), or a one-element
             list with a 'check.failure' metric when the command fails or
             prints nothing usable
    """
    drive_result = BASE_RESULT.child(dimensions={
        'controller_slot': str(slot),
    })
    drive_result.name += '.physical_drive'

    def failure(error):
        # Single failed metric describing why no data could be gathered
        return MetricData.single(
            'check.failure',
            Severity.fail,
            '{check} slot: {slot} failed with: {error}',
            {'check': drive_result.name,
             'slot': slot,
             'error': error,
             'component': 'swiftlm-scan'})

    rc = run_cmd(
        LOCK_FILE_COMMAND + 'hpssacli ctrl slot=%s pd all show detail' % slot)
    if rc.exitcode != 0:
        return [failure(str(rc.output))]

    # Remove blank lines and strip trailing/leading spaces for each line
    lines = [raw.strip() for raw in rc.output.split('\n') if raw.strip()]
    if not lines:
        return [failure('No usable output from hpssacli')]

    if is_cont_heading(lines[0]):
        lines = lines[1:]

    drives = []
    current = {}
    for text in lines:
        # The first two lines for each drive are special.
        # The physicaldrive line will contain 2 colons and duplicates
        # information so we drop it.
        if text.count(':') > 1:
            continue

        if text.startswith('array '):
            # The Array # line may be useful in the future but does not
            # follow the format of colon separated information.
            # It is also the only delimiter between drives, so a new
            # record starts here.
            if current:
                drives.append(current)
                current = {}
            current['array'] = text.split()[1]
            continue

        key, value = text.split(':', 1)
        current[key.strip().lower()] = value.strip()

    # Have to add the last drive.
    if current:
        drives.append(current)

    results = []
    for drive in drives:
        results.extend(check_physical_drive(drive, drive_result))
    return results
def is_valid_xfs(d, r):
    """
    Tell whether `xfs_info` succeeds for d.mount.

    :param d: object providing ``mount``
    :param r: unused
    :return: True when `xfs_info <d.mount>` exits with 0, False otherwise
    """
    info_cmd = run_cmd('xfs_info %s' % d.mount)
    return info_cmd.exitcode == 0
def get_logical_drive_info(controller_slot, cache_check=True):
    """
    Collect logical drive metrics from
    `hpssacli ctrl slot=<controller_slot> ld all show detail`.

    The output is split into nested blocks with TextScanner and walked as
    controller -> array -> logical drive -> attribute. Only top-level
    blocks starting with "Smart Array" are examined. For every
    "Logical Drive:" block a record with 'array' and 'logical_drive' is
    built and extended with the attributes whose key is found in one of
    the METRIC_KEYS entries. Each record is passed to
    check_logical_drive().

    Example of the output being parsed:

    array L

      Logical Drive: 12
         Size: 1.8 TB
         Fault Tolerance: 0
         Heads: 255
         Sectors Per Track: 32
         Cylinders: 65535
         Strip Size: 256 KB
         Full Stripe Size: 256 KB
         Status: OK
         Caching:  Enabled
         Unique Identifier: 600508B1001CEA938043498011A76404
         Disk Name: /dev/sdl   Mount Points: /srv/node/disk11 1.8 TB Partition Number 2
         OS Status: LOCKED
         Logical Drive Label: AF3C73D8PACCR0M9VZ41S4QEB69
         Drive Type: Data
         LD Acceleration Method: Controller Cache

    :param controller_slot: controller slot to query; also stored as the
                            'controller_slot' dimension of the results
    :param cache_check: passed through to check_logical_drive()
    :return: list of results from check_logical_drive()
    :raises Exception: when the command exits non-zero or prints nothing.
                       The message carries the output, cut to its last
                       1844 characters behind '...' when a failing command
                       printed more than 1847 characters.
    """
    results = []
    drive_result = BASE_RESULT.child(
        dimensions={'controller_slot': controller_slot})
    drive_result.name += '.' + 'logical_drive'
    rc = run_cmd(LOCK_FILE_COMMAND +
                 'hpssacli ctrl slot=%s ld all show detail' % controller_slot)

    if rc.exitcode != 0:
        # Keep only the tail of very long output in the exception message
        if len(rc.output) > 1847:
            rc = rc._replace(exitcode=rc.exitcode,
                             output='...' + rc.output[-1844:])
        raise Exception('{0}: hpssacli ctrl slot={1} ld all show detail '
                        'failed with exit code: {2}'.format(
                            rc.output, controller_slot, rc.exitcode))

    lines = rc.output.split('\n')
    # Splitting empty output gives [''], i.e. nothing to parse
    if lines == ['']:
        raise Exception('{0}: hpssacli ctrl slot={1} ld all show detail '
                        'failed with exit code: {2}'.format(
                            rc.output, controller_slot, rc.exitcode))

    drive_info = []
    text_scanner = TextScanner(lines)
    root = text_scanner.get_root_block()
    # Extract logical drive information
    for controller in root.subblocks:
        line = controller.text
        if line.startswith("Smart Array"):
            for array in controller.subblocks:
                line = array.text
                if "array" in line:
                    # Only the array letter is used below
                    _, array_letter, array_name = parse_array_name(line)
                    for lun in array.subblocks:
                        line = lun.text
                        if "Logical Drive:" in line:
                            # NOTE(review): logical_drive is assigned but
                            # not read anywhere in this function
                            logical_drive = line.strip()
                            try:
                                _, ld_num = parse_ld_name(line)
                                ld_data = {
                                    'array': array_letter,
                                    'logical_drive': ld_num
                                }
                            except ValueError:
                                # Heading could not be parsed: skip the LUN
                                continue
                            for attribute in lun.subblocks:
                                line = attribute.text
                                try:
                                    k, v = parse_attribute(
                                        line, underscoring=False)
                                except ValueError:
                                    # Not a parsable attribute line
                                    continue
                                # Keep the attribute only if its key is
                                # contained in a METRIC_KEYS entry
                                if any(k in s for s in METRIC_KEYS):
                                    ld_data.update({k: v})
                            drive_info.append(ld_data)

    # Now walk the LUNs and check them
    for ld_data in drive_info:
        results.extend(check_logical_drive(ld_data, drive_result,
                                           cache_check))
    return results
def generate_drive_info(my_ip, mount_dir, disk_label_list):
    '''
    Gather the data of all partitioned and labelled drives on the node
    into a single dictionary.

    The size of every device is read with `blockdev --getsize64` and
    divided by 1024 three times. A device counts as mounted when its
    expected mount point below mount_dir is a mount point. Prints an error
    and exits the process with status 1 when a size cannot be read.

    :param my_ip: ip address of the node
    :param mount_dir: directory where the swift devices were mounted to
    :param disk_label_list: list of devices that are labelled and their
                            swift device number ("v" is with the swift
                            number if the device is an lvm)
    :return node_info: dictionary with "hostname", "ip_addr", "model" and
                       "devices"; each device entry holds "name",
                       "swift_drive_name", "size_gb" and "mounted" (the
                       latter as the string "True" or "False")
    '''
    server_type = get_product_name()
    node_info = {
        "hostname": socket.getfqdn(),
        "ip_addr": my_ip,
        "model": server_type
    }

    devices = []
    for entry in disk_label_list:
        label_no, device = entry[0], entry[1]

        # Get the size of the drive
        size_status, size_output = run_cmd(
            "/sbin/blockdev --getsize64 " + device)
        if size_status != 0:
            print("Error getting size of %s - %s" % (device, size_output))
            sys.exit(1)
        size = ((float(size_output) / 1024) / 1024) / 1024

        # Work out the mount point and swift name for this kind of device;
        # volume numbers carry a leading "v" that is dropped
        if "v" in label_no:
            mount_to_check = os.path.join(mount_dir,
                                          LVM_MOUNT + label_no[1:])
            swift_drive_name = "lvm" + label_no[1:]
        else:
            mount_to_check = os.path.join(mount_dir, DISK_MOUNT + label_no)
            swift_drive_name = "disk" + label_no

        # Check if drive is mounted
        is_mounted = os.path.ismount(mount_to_check)

        devices.append({
            "name": device,
            "swift_drive_name": swift_drive_name,
            "size_gb": size,
            "mounted": str(is_mounted)
        })

    node_info["devices"] = devices
    return node_info
def get_logical_drive_info(slot, cache_check=True):
    """
    Check every logical drive that hpssacli reports for one controller slot.

    Runs ``hpssacli ctrl slot=<slot> ld all show detail`` and parses output
    of the following form line by line:

    array L

       Logical Drive: 12
          Size: 1.8 TB
          Fault Tolerance: 0
          Heads: 255
          Sectors Per Track: 32
          Cylinders: 65535
          Strip Size: 256 KB
          Full Stripe Size: 256 KB
          Status: OK
          Caching: Enabled
          Unique Identifier: 600508B1001CEA938043498011A76404
          Disk Name: /dev/sdl Mount Points: /srv/node/disk11 1.8 TB Partition Number 2
          OS Status: LOCKED
          Logical Drive Label: AF3C73D8PACCR0M9VZ41S4QEB69
          Drive Type: Data
          LD Acceleration Method: Controller Cache

    BUG: It appears that the current build of hpssacli has a bug and
    outputs Disk Name and Mount Points on the same line. We work around
    this by treating any line with exactly two colons as that combined
    line, but that could fail if the output format changes.

    Each drive becomes a dict whose keys are the lower-cased attribute
    names (plus 'array' for the array letter); values are the stripped
    attribute text.

    :param slot: controller slot passed to hpssacli
    :param cache_check: passed through unchanged to check_logical_drive
    :return: list accumulated from check_logical_drive, one call per
             drive; or a single-element list holding a 'check.failure'
             MetricData when hpssacli fails or prints nothing usable
    """
    results = []
    drive_result = BASE_RESULT.child()
    drive_result.name += '.' + 'logical_drive'
    rc = run_cmd(LOCK_FILE_COMMAND +
                 'hpssacli ctrl slot=%s ld all show detail' % slot)

    if rc.exitcode != 0:
        r = MetricData.single(
            'check.failure',
            Severity.fail,
            '{check} slot: {slot} failed with: {error}',
            {
                'check': drive_result.name,
                'slot': slot,
                'error': str(rc.output),
                'component': 'swiftlm-scan'
            })
        return [r]

    # Remove blank lines and strip trailing/leading spaces for each line
    lines = [l.strip() for l in rc.output.split('\n') if l.strip()]
    if not lines:
        r = MetricData.single(
            'check.failure',
            Severity.fail,
            '{check} slot: {slot} failed with: {error}',
            {
                'check': drive_result.name,
                'slot': slot,
                'error': 'No usable output from hpssacli',
                'component': 'swiftlm-scan'
            })
        return [r]

    # First line should be the controller model and slot number.
    # We already have this so remove it if it exists
    if is_cont_heading(lines[0]):
        lines = lines[1:]

    drives = []
    drive_info = {}
    for line in lines:
        # If we see two colons we have to assume that it is a bugged version
        # of hpssacli and split them accordingly.
        if line.count(':') == 2:
            _, dn, mp = line.split(':')
            # dn is "<device> Mount Points"; keep only the device.
            drive_info['disk name'] = dn.strip().split()[0]
            drive_info['mount points'] = mp.strip()
            continue

        # The "array #" line may be useful in the future but does not follow
        # the format of colon separated information.
        # It is also the only delimiter between drives. We create a new
        # drive_info dict when we see it.
        if line.startswith('array '):
            if drive_info:
                drives.append(drive_info)
                drive_info = {}
            drive_info['array'] = line.split()[1]
            continue

        key, separator, value = line.partition(':')
        if not separator:
            # No "key: value" separator on this line, so there is nothing
            # to record. Skip it instead of letting one unexpected line
            # abort the whole check with a ValueError.
            continue
        drive_info[key.strip().lower()] = value.strip()

    # Have to add the last drive.
    if drive_info:
        drives.append(drive_info)

    for d in drives:
        results.extend(check_logical_drive(d, drive_result, cache_check))
    return results
def get_smart_array_info():
    """
    Check every Smart Array controller reported by hpssacli.

    Runs ``hpssacli ctrl all show detail`` and parses controller data of
    the form shown below into one dict per controller. Keys are lowercased
    versions of the key name on each line, including special characters.
    Values are stripped but otherwise not changed. The key 'model' is
    parsed from the heading line (the text before "in Slot"); every other
    key, including 'slot', comes from a "key: value" line.

    Smart Array P410 in Slot 1
       Bus Interface: PCI
       Slot: 1
       Serial Number: PACCR0M9VZ41S4Q
       Cache Serial Number: PACCQID12061TTQ
       RAID 6 (ADG) Status: Disabled
       Controller Status: OK
       Hardware Revision: C
       Firmware Version: 6.60
       Rebuild Priority: Medium
       Expand Priority: Medium
       Surface Scan Delay: 15 secs
       Surface Scan Mode: Idle
       Queue Depth: Automatic
       Monitor and Performance Delay: 60 min
       Elevator Sort: Enabled
       Degraded Performance Optimization: Disabled
       Inconsistency Repair Policy: Disabled
       Wait for Cache Room: Disabled
       Surface Analysis Inconsistency Notification: Disabled
       Post Prompt Timeout: 15 secs
       Cache Board Present: True
       Cache Status: OK
       Cache Ratio: 25% Read / 75% Write
       Drive Write Cache: Disabled
       Total Cache Size: 256 MB
       Total Cache Memory Available: 144 MB
       No-Battery Write Cache: Disabled
       Cache Backup Power Source: Batteries
       Battery/Capacitor Count: 1
       Battery/Capacitor Status: OK
       SATA NCQ Supported: True
       Number of Ports: 2 Internal only
       Encryption Supported: False
       Driver Name: hpsa
       Driver Version: 3.4.0
       Driver Supports HP SSD Smart Path: False

    Smart Array P440ar in Slot 0 (Embedded) (HBA Mode)
       Bus Interface: PCI
       Slot: 0
       Serial Number: PDNLH0BRH7V7GC
       Cache Serial Number: PDNLH0BRH7V7GC
       Controller Status: OK
       Hardware Revision: B
       Firmware Version: 2.14
       Controller Temperature (C): 50
       Number of Ports: 2 Internal only
       Driver Name: hpsa
       Driver Version: 3.4.4
       HBA Mode Enabled: True
       PCI Address (Domain:Bus:Device.Function): 0000:03:00.0
       Negotiated PCIe Data Rate: PCIe 3.0 x8 (7880 MB/s)
       Controller Mode: HBA
       Controller Mode Reboot: Not Required
       Current Power Mode: MaxPerformance
       Host Serial Number: MXQ51906YF

    :return: always a 2-tuple ``(results, controller_slots)``.
             ``results`` is the list accumulated from check_controller,
             or a single 'check.failure' MetricData when hpssacli fails
             or prints nothing, or empty when hpssacli reports that no
             controllers were detected. ``controller_slots`` lists the
             'slot' value of every parsed controller that has one and is
             empty on every failure path.
    """
    results = []
    controller_result = BASE_RESULT.child()
    controller_result.name += '.' + 'smart_array'
    rc = run_cmd(LOCK_FILE_COMMAND + 'hpssacli ctrl all show detail')

    if rc.exitcode != 0:
        if 'Error: No controllers detected.' in str(rc.output):
            # A host without controllers is not a failure; there is
            # simply nothing to check. Same shape as the success path so
            # that callers can always unpack two values.
            return [], []
        r = MetricData.single('check.failure',
                              Severity.fail,
                              '{check} failed with: {error}',
                              {'check': controller_result.name,
                               'error': str(rc.output),
                               'component': 'swiftlm-scan'})
        return [r], []

    if not rc.output:
        r = MetricData.single('check.failure',
                              Severity.fail,
                              '{check} failed with: {error}',
                              {'check': controller_result.name,
                               'error': 'No usable output from hpssacli',
                               'component': 'swiftlm-scan'})
        return [r], []
    lines = rc.output.split('\n')

    controllers = []
    info = {}
    for line in lines:
        # Ignore blank lines
        if not line.strip():
            continue

        if is_cont_heading(line):
            # A heading starts a new controller; flush the previous one.
            if info:
                controllers.append(info)

            # To get controller model, assume that the line is in the form:
            # <model> in Slot <slot>
            model = line.strip().split("in Slot")[0].strip()
            info = {'model': model}
            continue

        key, separator, value = line.partition(':')
        if not separator:
            # No "key: value" separator on this line, so there is nothing
            # to record. Skip it instead of letting one unexpected line
            # abort the whole check with a ValueError.
            continue
        info[key.strip().lower()] = value.strip()

    # Have to add the last controller.
    if info:
        controllers.append(info)

    controller_slots = []
    for c in controllers:
        results.extend(check_controller(c, controller_result))
        if c.get('slot'):
            controller_slots.append(c.get('slot'))

    return results, controller_slots
def get_physical_drive_info(slot):
    """
    Check every physical drive that hpssacli reports for one controller
    slot.

    Runs ``hpssacli ctrl slot=<slot> pd all show detail`` and parses drive
    data of the following form line by line. There are multiple drives in
    the output.

    array A

       physicaldrive 2C:1:1
          Port: 2C
          Box: 1
          Bay: 1
          Status: OK
          Drive Type: Data Drive
          Interface Type: SAS
          Size: 2 TB
          Native Block Size: 512
          Rotational Speed: 7200
          Firmware Revision: HPD3
          Serial Number: YFJMHTZD
          Model: HP MB2000FBUCL
          Current Temperature (C): 27
          Maximum Temperature (C): 38
          PHY Count: 2
          PHY Transfer Rate: 6.0Gbps, Unknown

    Each drive becomes a dict whose keys are the lower-cased attribute
    names (plus 'array' for the array letter); values are the stripped
    attribute text.

    :param slot: controller slot passed to hpssacli; its string form is
                 recorded as the 'controller_slot' dimension of the results
    :return: list accumulated from check_physical_drive, one call per
             drive; or a single-element list holding a 'check.failure'
             MetricData when hpssacli fails or prints nothing usable
    """
    results = []
    drive_result = BASE_RESULT.child(dimensions={
        'controller_slot': str(slot),
    })
    drive_result.name += '.physical_drive'
    rc = run_cmd(LOCK_FILE_COMMAND +
                 'hpssacli ctrl slot=%s pd all show detail' % slot)

    if rc.exitcode != 0:
        r = MetricData.single(
            'check.failure',
            Severity.fail,
            '{check} slot: {slot} failed with: {error}',
            {
                'check': drive_result.name,
                'slot': slot,
                'error': str(rc.output),
                'component': 'swiftlm-scan'
            })
        return [r]

    # Remove blank lines and strip trailing/leading spaces for each line
    lines = [l.strip() for l in rc.output.split('\n') if l.strip()]
    if not lines:
        r = MetricData.single(
            'check.failure',
            Severity.fail,
            '{check} slot: {slot} failed with: {error}',
            {
                'check': drive_result.name,
                'slot': slot,
                'error': 'No usable output from hpssacli',
                'component': 'swiftlm-scan'
            })
        return [r]

    # Drop the controller heading if it is present.
    if is_cont_heading(lines[0]):
        lines = lines[1:]

    drives = []
    drive_info = {}
    for line in lines:
        # The first two lines for each drive are special.
        # The physicaldrive line will contain 2 colons and duplicates
        # information so we drop it.
        if line.count(':') > 1:
            continue

        # The "array #" line may be useful in the future but does not follow
        # the format of colon separated information.
        # It is also the only delimiter between drives. We create a new
        # drive_info dict when we see it.
        if line.startswith('array '):
            if drive_info:
                drives.append(drive_info)
                drive_info = {}
            drive_info['array'] = line.split()[1]
            continue

        key, separator, value = line.partition(':')
        if not separator:
            # No "key: value" separator on this line (for example a
            # section heading other than "array <X>"), so there is
            # nothing to record. Skip it instead of letting one
            # unexpected line abort the whole check with a ValueError.
            # NOTE(review): attributes that follow such a heading are
            # still merged into the current drive_info - confirm whether
            # the heading should start a new drive instead.
            continue
        drive_info[key.strip().lower()] = value.strip()

    # Have to add the last drive.
    if drive_info:
        drives.append(drive_info)

    for d in drives:
        results.extend(check_physical_drive(d, drive_result))
    return results
def get_physical_drive_info(controller_slot):
    """
    Check every assigned physical drive that hpssacli reports for one
    controller slot.

    Runs ``hpssacli ctrl slot=<controller_slot> pd all show detail`` and
    parses drive data of the following form with TextScanner. There are
    multiple drives in the output.

    Smart Array P410 in Slot 1

       array A

          physicaldrive 2C:1:1
             Port: 2C
             Box: 1
             Bay: 1
             Status: OK
             Drive Type: Data Drive
             Interface Type: SAS
             Size: 2 TB
             Native Block Size: 512
             Rotational Speed: 7200
             Firmware Revision: HPD3
             Serial Number: YFJMHTZD
             Model: HP MB2000FBUCL
             Current Temperature (C): 27
             Maximum Temperature (C): 38
             PHY Count: 2
             PHY Transfer Rate: 6.0Gbps, Unknown

    :param controller_slot: slot of the controller to query; its string
                            form is recorded as the 'controller_slot'
                            dimension of the results
    :return: list accumulated from check_physical_drive, one call per
             drive found
    :raises Exception: if hpssacli exits non-zero or produces no output
    """
    failure_message = ('{0}: hpssacli ctrl slot={1} pd all show detail '
                       'failed with exit code: {2}')

    drive_result = BASE_RESULT.child(
        dimensions={'controller_slot': str(controller_slot)})
    drive_result.name += '.physical_drive'

    rc = run_cmd(
        LOCK_FILE_COMMAND +
        'hpssacli ctrl slot=%s pd all show detail' % controller_slot)

    if rc.exitcode != 0:
        # Keep only the tail of very long output: '...' plus the last
        # 1844 characters, i.e. 1847 characters in total.
        if len(rc.output) > 1847:
            rc = rc._replace(exitcode=rc.exitcode,
                             output='...' + rc.output[-1844:])
        raise Exception(failure_message.format(
            rc.output, controller_slot, rc.exitcode))

    lines = rc.output.split('\n')
    if lines == ['']:
        # The command succeeded but printed nothing at all.
        raise Exception(failure_message.format(
            rc.output, controller_slot, rc.exitcode))

    root = TextScanner(lines).get_root_block()

    # Extract drive information: controller -> assignment -> drive.
    drive_info = []
    for controller in root.subblocks:
        controller_text = controller.text
        if not controller_text.startswith("Smart Array"):
            continue
        _, controller_key = parse_controller_name(controller_text)

        for assignment in controller.subblocks:
            assignment_text = assignment.text
            # Only drives assigned to a LUN ("array") or owned by a
            # controller in HBA mode ("hba drives") are examined.
            # Unassigned drives are probably unassigned for a reason
            # (such as failed) so they are ignored, as is any
            # unrecognised assignment.
            wanted = ("array" in assignment_text or
                      "hba drives" in assignment_text.lower())
            if not wanted:
                continue

            for pd in assignment.subblocks:
                # Parse drive attributes
                pd_data = {}
                for attribute in pd.subblocks:
                    parse_cont_attribute(attribute, pd_data)
                drive_info.append(pd_data)

    # Now walk drive_info to get metrics' data from the controller(s),
    # array(s), physical drive(s), and logical drive(s)
    results = []
    for pd_data in drive_info:
        results.extend(check_physical_drive(pd_data, drive_result))
    return results