def _shutdown(): console_log.info("Initiating server shutdown per manager request") # This will initiate a "nice" shutdown with a wall from root, etc. AgentShell.try_run(["shutdown", "-H" if halt else "-h", at_time]) console_log.info("Terminating") os._exit(0)
def _reboot(): console_log.info("Initiating server reboot per manager request") # reboot(8) just calls shutdown anyhow. AgentShell.try_run(["shutdown", "-r", at_time]) console_log.info("Terminating") os._exit(0)
def kernel_status():
    """
    :return: {'running': {'kernel-X.Y.Z'}, 'required': <'kernel-A.B.C' or None>}
    """
    running_kernel = "kernel-%s" % AgentShell.try_run(["uname", "-r"]).strip()

    # Ask the lustre module package what kernel it requires; try the server
    # package first, then the client package, and give up if neither is
    # installed.
    required_kernel_stdout = None
    for module_package in ["lustre-modules", "lustre-client-modules"]:
        try:
            required_kernel_stdout = AgentShell.try_run(
                ["rpm", "-qR", module_package])
        except AgentShell.CommandExecutionError:
            continue
        break

    required_kernel = None
    if required_kernel_stdout:
        for requirement in required_kernel_stdout.split("\n"):
            if requirement.startswith('kernel'):
                required_kernel = "kernel-%s.%s" % (
                    requirement.split(" = ")[1], platform.machine())

    # Every installed kernel package (one per line, minus blanks).
    available_kernels = [
        k for k in AgentShell.try_run(["rpm", "-q", "kernel"]).split("\n") if k
    ]

    return {
        'running': running_kernel,
        'required': required_kernel,
        'available': available_kernels
    }
def set_address(self, ipv4_address, prefix):
    """
    Bring this interface up with the given IPv4 address.

    :param ipv4_address: dotted-quad address to assign
    :param prefix: network prefix length (the "/NN" part)
    :raises RuntimeError: if the address has not appeared on the interface
        after 30 seconds of polling.
    """
    ifaddr = "%s/%s" % (ipv4_address, prefix)

    console_log.info("Set %s (%s) up" % (self.name, ifaddr))

    if self.ipv4_address != ipv4_address:
        node_admin.unmanage_network(self.device, self.mac_address)

        AgentShell.try_run(
            ['/sbin/ip', 'link', 'set', 'dev', self.name, 'up'])
        AgentShell.try_run(
            ['/sbin/ip', 'addr', 'add', ifaddr, 'dev', self.name])

        # The link address change is asynchronous, so we need to wait for the
        # address to stick or we have a race condition.
        timeout = 30
        while self.ipv4_address != ipv4_address and timeout != 0:
            self.refresh()
            time.sleep(1)
            timeout -= 1

        if self.ipv4_address != ipv4_address:
            # Fix: report the address we tried to set (ipv4_address), not the
            # stale self.ipv4_address the interface still carries — the old
            # message showed the current (wrong) address, which was misleading.
            raise RuntimeError(
                'Unable to set the address %s for interface %s' %
                (ipv4_address, self.name))

        node_admin.write_ifcfg(self.device, self.mac_address,
                               self.ipv4_address, self.ipv4_netmask)
    else:
        console_log.info("Nothing to do as %s already has address %s" %
                         (self.name, ifaddr))
def private_key_file(self):
    """Return a path to a PEM file"""
    # Lazily generate the RSA key on first use; subsequent calls just
    # return the existing path.
    if not os.path.exists(self.PRIVATE_KEY_FILE):
        console_log.info("Generating private key")
        genrsa_cmd = ['openssl', 'genrsa', '-out', self.PRIVATE_KEY_FILE,
                      '2048', '-sha256']
        AgentShell.try_run(genrsa_cmd)
    return self.PRIVATE_KEY_FILE
def action_two(arg1):
    """An action which invokes subprocess_one and subprocess_two"""
    assert arg1 == "arg2_test"

    # First subprocess's stdout is checked; the second is fire-and-forget.
    first_stdout = AgentShell.try_run(['subprocess_one', 'subprocess_one_arg'])
    assert first_stdout == 'subprocess_one_stdout'

    AgentShell.try_run(['subprocess_two', 'subprocess_two_arg'])

    return ACTION_TWO_RETVAL
def mount_lustre_filesystem(mountspec, mountpoint):
    """Create the mount point, record it in fstab, and mount it."""
    # An already-existing mount point directory is fine; any other
    # makedirs failure is propagated.
    try:
        os.makedirs(mountpoint, 0o755)
    except OSError as error:
        if error.errno != errno.EEXIST:
            raise

    create_fstab_entry(mountspec, mountpoint)
    AgentShell.try_run(["/bin/mount", mountpoint])
def private_key_file(self):
    """Return a path to a PEM file"""
    # Fast path: key already generated.
    if os.path.exists(self.PRIVATE_KEY_FILE):
        return self.PRIVATE_KEY_FILE

    console_log.info("Generating private key")
    AgentShell.try_run([
        "openssl", "genrsa", "-out", self.PRIVATE_KEY_FILE, "2048", "-sha256"
    ])
    return self.PRIVATE_KEY_FILE
def delete_node(nodename):
    """
    Remove a node from the pacemaker cluster: force-remove it via crm_node
    and delete its <node> and <node_state> entries from the CIB.

    :param nodename: pacemaker uname of the node to remove
    :raises RuntimeError: if the node does not appear in `crm_node -l`
    """
    rc, stdout, stderr = AgentShell.run_old(["crm_node", "-l"])

    node_id = None
    for line in stdout.split("\n"):
        # Skip blank lines (e.g. the trailing newline in crm_node output),
        # which would otherwise break the 3-way unpack below.
        if not line.strip():
            continue
        this_id, name, status = line.split(" ")
        if name == nodename:
            node_id = this_id
            break

    # Previously, a missing node left node_id holding the id of the LAST
    # listed node (or None), force-removing the wrong node; fail loudly
    # instead.
    if node_id is None:
        raise RuntimeError("Unable to find node '%s' in crm_node list" % nodename)

    AgentShell.try_run(["crm_node", "--force", "-R", node_id])
    cibxpath("delete", '//nodes/node[@uname="{}"]'.format(nodename))
    cibxpath("delete", '//status/node_state[@uname="{}"]'.format(nodename))
def disable_standby(self):
    # Clear the standby flag on this node for good (lifetime=forever).
    crm_args = [
        "crm_attribute",
        "-N", self.name,
        "-n", "standby",
        "-v", "off",
        "--lifetime=forever",
    ]
    AgentShell.try_run(crm_args)
def set_attribute(self, key, value):
    # Persist a node attribute (stringified) in the pacemaker "nodes" table.
    crm_args = [
        "crm_attribute",
        "-t", "nodes",
        "-U", self.name,
        "-n", key,
        "-v", str(value),
    ]
    AgentShell.try_run(crm_args)
def _move_target(target_label, dest_node):
    """
    common plumbing for failover/failback. Move the target with label to the destination node.

    :param target_label: The label of the node to move
    :param dest_node: The target to move it to.
    :return: None if successful or an error message if an error occurred.
    """
    # For on going debug purposes, lets get the resource locations at the
    # beginning. This provides useful log output in the case where things
    # don't work.
    AgentShell.run(['crm_mon', '-1'])

    # Before we start, clean up anything that has gone on before. HA is a
    # fickle old thing and this will make sure that everything is clean
    # before we start.
    AgentShell.try_run(
        ['crm_resource', '--resource', target_label, '--cleanup'])

    # Issue the command to Pacemaker to move the target.
    move_cmd = [
        'crm_resource', '--resource', target_label, '--move', '--node',
        dest_node
    ]
    move_result = AgentShell.run(move_cmd)
    if move_result.rc != 0:
        return "Error (%s) running '%s': '%s' '%s'" % (
            move_result.rc, " ".join(move_cmd), move_result.stdout,
            move_result.stderr)

    # Now wait for it to complete its move; this will succeed quickly if it
    # was already there.
    remaining = 100
    while remaining > 0:
        if get_resource_location(target_label) == dest_node:
            break
        time.sleep(1)
        remaining -= 1

    # now delete the constraint that crm_resource --move created
    AgentShell.try_run([
        'crm_resource', '--resource', target_label, '--un-move', '--node',
        dest_node
    ])

    if remaining == 0:
        return "Failed to move target %s to node %s" % (target_label,
                                                        dest_node)

    return None
def unmanage_network(device, mac_address):
    """Rewrite the network configuration file to set NM_CONTROLLED="no"

    TODO: This is destructive and overwrites the file loosing all settings.
    This needs to be fixed up.
    """
    ifcfg_path = write_ifcfg(device, mac_address, None, None)

    # nmcli reload is only attempted on EL7+.
    if platform_info.distro_version >= 7.0:
        try:
            AgentShell.try_run(['nmcli', 'con', 'load', ifcfg_path])
        except AgentShell.CommandExecutionError as cee:
            # network manager may be uninstalled (127) stopped (8)
            if cee.result.rc not in (127, 2, 8):
                raise
def delete_node(nodename):
    """
    Remove a node from the pacemaker cluster: force-remove it via crm_node
    and delete its <node> and <node_state> entries via cibadmin.

    :param nodename: pacemaker uname of the node to remove
    :raises RuntimeError: if the node does not appear in `crm_node -l`
    """
    rc, stdout, stderr = AgentShell.run_old(['crm_node', '-l'])

    node_id = None
    for line in stdout.split('\n'):
        # Skip blank lines (e.g. the trailing newline in crm_node output),
        # which would otherwise break the 3-way unpack below.
        if not line.strip():
            continue
        this_id, name, status = line.split(" ")
        if name == nodename:
            node_id = this_id
            break

    # Previously, a missing node left node_id holding the id of the LAST
    # listed node (or None), force-removing the wrong node; fail loudly
    # instead.
    if node_id is None:
        raise RuntimeError("Unable to find node '%s' in crm_node list" % nodename)

    AgentShell.try_run(['crm_node', '--force', '-R', node_id])
    cibadmin(
        ["--delete", "-o", "nodes", "-X", "<node uname=\"%s\"/>" % nodename])
    # NOTE(review): this targets section "nodes" but deletes a <node_state>
    # element, which normally lives in the "status" section — confirm the
    # intended cibadmin section before changing it.
    cibadmin([
        "--delete", "-o", "nodes", "--crm_xml",
        "<node_state uname=\"%s\"/>" % nodename
    ])
def generate_csr(self, common_name):
    """Return a CSR as a string"""
    # Only the CN field of the subject is populated.
    subject = "/C=/ST=/L=/O=/CN=%s" % common_name
    csr = AgentShell.try_run([
        "openssl", "req", "-new", "-sha256", "-subj", subject, "-key",
        self.private_key_file
    ])
    return csr.strip()
def _process_zpool(self, pool, block_devices):
    """
    Either read pool info from store if unavailable or inspect by importing

    :param pool: dict of pool info
    :return: None
    """
    pool_name = pool['pool']

    with ZfsDevice(pool_name, True) as zfs_device:
        if zfs_device.available:
            listing = AgentShell.try_run(
                ["zpool", "list", "-H", "-o", "name,size,guid", pool['pool']])
            self._add_zfs_pool(listing, block_devices)
            return

        # zpool probably imported elsewhere, attempt to read from store,
        # this should return previously seen zpool state either with or
        # without datasets
        pool_id = pool.get('id', None)
        try:
            data = (find_name_in_store(pool_name) if pool_id is None
                    else read_from_store(pool_id))
        except KeyError as e:
            daemon_log.error("ZfsPool unavailable and could not be retrieved from store: %s (pool info: %s)" % (e, pool))
        else:
            # populate self._pools/datasets/zvols info from saved data read
            # from store
            self._update_pool_or_datasets(block_devices, data['pool'],
                                          data['datasets'], data['zvols'])
def _get_zpool_datasets(pool_name, drives):
    """Retrieve datasets belonging to a zpool"""
    out = AgentShell.try_run(['zfs', 'list', '-H', '-o', 'name,avail,guid'])

    zpool_datasets = {}

    # `zfs list` prints this literal string when nothing exists.
    if out.strip() == "no datasets available":
        return zpool_datasets

    prefix = "%s/" % pool_name
    for line in filter(None, out.split('\n')):
        name, size_str, uuid = line.split()
        size = util.human_to_bytes(size_str)

        if not name.startswith(prefix):
            continue

        # This will need discussion, but for now fabricate a major:minor.
        # Do we ever use them as numbers?
        major_minor = "zfsset:%s" % uuid

        zpool_datasets[uuid] = {
            "name": name,
            "path": name,
            "block_device": major_minor,
            "uuid": uuid,
            "size": size,
            "drives": drives
        }

        daemon_log.debug("zfs mount '%s'" % name)

    return zpool_datasets
def _check_HYD4050():
    """
    HYD-4050 means that kernels are not installed with a default kernel
    or the initramfs isn't present. This function checks for these
    cases and returns an error message if a problem exists.

    return: None if everything is OK, error message if not.
    """
    # Make sure that there is an initramfs for the booting kernel
    try:
        default_kernel = AgentShell.try_run(["grubby",
                                             "--default-kernel"]).strip()
    except AgentShell.CommandExecutionError:
        return ("Unable to determine your default kernel. "
                "This node may not boot successfully until grub "
                "is fixed to have a default kernel to boot.")

    # Everything after the first '-' of e.g. "vmlinuz-3.10.0-..." is the
    # version string used to name the initramfs image.
    default_kernel_version = default_kernel[default_kernel.find("-") + 1:]
    initramfs = "/boot/initramfs-%s.img" % default_kernel_version

    if os.path.isfile(initramfs):
        return None

    return ("There is no initramfs (%s) for the default kernel (%s). "
            "This node may not boot successfully until an initramfs "
            "is created." % (initramfs, default_kernel_version))
def action_one_no_context(arg1):
    """An action which invokes subprocess_one"""
    assert arg1 == "arg1_test"

    captured = AgentShell.try_run(['subprocess_one', 'subprocess_one_arg'])
    assert captured == 'subprocess_one_stdout'

    return ACTION_ONE_NO_CONTEXT_RETVAL
def configured(self):
    """
    configured returns True if this node has a pacemaker configuration
    set by IML.

    :return: True if configuration present else False
    """
    # IML's fencing agent appearing in the CIB resource section is the
    # marker for an IML-managed configuration.
    cib_resources = AgentShell.try_run(
        ["cibadmin", "--query", "-o", "resource"])
    return "fence_chroma" in cib_resources
def __init__(self, block_devices):
    # Scan LVM state: volume groups, their logical volumes, and the
    # device-mapper table.
    self.block_devices = block_devices

    self.mpaths = {}
    self.vgs = {}
    self.lvs = {}

    for vg_name, vg_uuid, vg_size in self._get_vgs():
        self.vgs[vg_name] = {
            'name': vg_name,
            'uuid': vg_uuid,
            'size': vg_size,
            'pvs_major_minor': []
        }
        self.lvs[vg_name] = {}

        for lv_name, lv_uuid, lv_size, lv_path in self._get_lvs(vg_name):
            # Do this to cache the device, type see blockdevice and
            # filesystem for info.
            BlockDevice('lvm_volume',
                        '/dev/mapper/%s-%s' % (vg_name, lv_name))

            self.lvs[vg_name][lv_name] = {
                'name': lv_name,
                'uuid': lv_uuid,
                'size': lv_size
            }

    dm_table = AgentShell.try_run(['dmsetup', 'table'])
    self._parse_dm_table(dm_table)
def kernel_status():
    """
    :return: {'running': {'kernel-X.Y.Z'}, 'required': <'kernel-A.B.C' or None>}
    """

    def _required_for(provides):
        # Shared helper (the two branches below were near-identical copies):
        # list the .ko module files shipped by the packages providing
        # `provides`, and pick the newest installed kernel able to host
        # them; None if that can't be determined.
        try:
            modlist = [
                os.path.splitext(os.path.basename(k))[0]
                for k in AgentShell.try_run(
                    ["rpm", "-ql", "--whatprovides"] + provides
                ).split("\n")
                if k.endswith(".ko")
            ]
            return latest_kernel(available_kernels, modlist)
        except (AgentShell.CommandExecutionError, StopIteration):
            return None

    running_kernel = "kernel-%s" % AgentShell.try_run(["uname", "-r"]).strip()

    available_kernels = [
        k for k in AgentShell.try_run(["rpm", "-q", "kernel"]).split("\n") if k
    ]

    if AgentShell.run(["rpm", "-q", "--whatprovides", "kmod-lustre"]).rc == 0:
        # on a server, the required kernel is whichever installed kernel
        # can host the lustre-osd / kmod-lustre modules
        required_kernel = _required_for(["lustre-osd", "kmod-lustre"])
    elif AgentShell.run(["rpm", "-q", "kmod-lustre-client"]).rc == 0:
        # but on a worker, we can ask kmod-lustre-client what the required
        # kernel is
        required_kernel = _required_for(["kmod-lustre-client"])
    else:
        required_kernel = None

    return {
        "running": running_kernel,
        "required": required_kernel,
        "available": available_kernels,
    }
def _res_set_started(ha_label, running):
    # RAISES AgentShell.CommandExecutionError on error
    # Set the resource's target-role meta attribute so pacemaker starts or
    # stops it.
    role = "Started" if running else "Stopped"
    AgentShell.try_run([
        "crm_resource",
        "--resource", ha_label,
        "--set-parameter", "target-role",
        "--meta",
        "--parameter-value", role,
    ])
def action_one_with_context(agent_daemon_context, arg1):
    """An action which invokes subprocess_one"""
    assert isinstance(agent_daemon_context, AgentDaemonContext)
    assert arg1 == "arg1_test"

    captured = AgentShell.try_run(['subprocess_one', 'subprocess_one_arg'])
    assert captured == 'subprocess_one_stdout'

    return ACTION_ONE_WITH_CONTEXT_RETVAL
def has_link(self): old_link_state_up = self.is_up # HYD-2003: Some NICs require the interface to be in an UP state # before link detection will work. time_left = 0 if not self.is_up: AgentShell.try_run( ["/sbin/ip", "link", "set", "dev", self.name, "up"]) time_left = 10 def _get_device_state(name): try: filepath = operstate.format(name) if os.path.exists(filepath): with open(filepath, "r") as f: return f.read().strip() else: return "unknown" except IOError: print( "Could not read state of ethernet device {}".format(name)) return "unknown" def _has_link(): return _get_device_state(self.name) == "up" try: while time_left: # Poll for link status on newly-up interfaces if _has_link(): return True else: time.sleep(1) time_left -= 1 return _has_link() except IOError: # If the ioctl fails, then for the purposes of this test, the # interface is not usable. HYD-2679 return False finally: if not old_link_state_up: AgentShell.try_run( ["/sbin/ip", "link", "set", "dev", self.name, "down"])
def unmanage_network(device, mac_address):
    """
    Rewrite the network configuration file to set NM_CONTROLLED="no"

    TODO: This is destructive and overwrites the file clearing previously
    configured settings, needs fixing.
    """
    ifcfg_path = write_ifcfg(device, mac_address, None, None)
    if not ifcfg_path:
        return

    try:
        AgentShell.try_run(['nmcli', 'con', 'load', ifcfg_path])
    except OSError as e:
        # A missing nmcli binary is tolerated.
        if e.errno != errno.ENOENT:
            raise e
    except AgentShell.CommandExecutionError as cee:
        # A stopped NetworkManager is tolerated.
        if cee.result.rc != NM_STOPPED_RC:
            raise cee
def kernel_status():
    """
    :return: {'running': {'kernel-X.Y.Z'}, 'required': <'kernel-A.B.C' or None>}
    """
    running_kernel = "kernel-%s" % AgentShell.try_run(["uname", "-r"]).strip()

    required_kernel = None
    if AgentShell.run(["rpm", "-q", "--whatprovides", "kmod-lustre"]).rc == 0:
        # on a server, a required kernel is a lustre patched kernel since we
        # are building storage servers that can support both ldiskfs and zfs
        try:
            installed = AgentShell.try_run(["rpm", "-q",
                                            "kernel"]).split('\n')
            required_kernel = next(k for k in sorted(installed, reverse=True)
                                   if "_lustre" in k)
        except (AgentShell.CommandExecutionError, StopIteration):
            required_kernel = None
    elif AgentShell.run(["rpm", "-q", "kmod-lustre-client"]).rc == 0:
        # but on a worker, we can ask kmod-lustre-client what the required
        # kernel is
        try:
            requirements = AgentShell.try_run(
                ["rpm", "-q", "--requires", "kmod-lustre-client"]).split('\n')
            required_kernel_prefix = next(
                k for k in requirements if "kernel >=" in k).split(" >= ")[1]
            required_kernel = AgentShell.try_run(
                ["rpm", "-q",
                 "kernel-%s*" % required_kernel_prefix]).split('\n')[0]
        except (AgentShell.CommandExecutionError, StopIteration):
            required_kernel = None

    available_kernels = [
        k for k in AgentShell.try_run(["rpm", "-q", "kernel"]).split("\n") if k
    ]

    return {
        'running': running_kernel,
        'required': required_kernel,
        'available': available_kernels
    }
def has_link(self):
    """Return True if this network interface currently has carrier/link,
    as reported by the ETHTOOL_GLINK ioctl.

    If the interface was down, it is brought up for the duration of the
    check (with up to 10s of polling for link to appear) and brought back
    down afterwards.
    """
    import array
    import struct
    import fcntl

    old_link_state_up = self.is_up

    # HYD-2003: Some NICs require the interface to be in an UP state
    # before link detection will work.
    time_left = 0
    if not self.is_up:
        AgentShell.try_run(
            ['/sbin/ip', 'link', 'set', 'dev', self.name, 'up'])
        time_left = 10

    def _has_link():
        # Ask the kernel for link state via the ethtool ioctl: pack an
        # ethtool_value{cmd=ETHTOOL_GLINK, data=0} buffer, point an ifreq
        # at it, and read back the data word.
        SIOCETHTOOL = 0x8946
        ETHTOOL_GLINK = 0x0000000a
        # Any AF_INET socket works as an ioctl handle here; no traffic is
        # sent.
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ecmd = array.array('B', struct.pack('2I', ETHTOOL_GLINK, 0))
        # '16sP': interface name plus a pointer to the mutable ecmd buffer
        # the kernel writes the result into.
        ifreq = struct.pack('16sP', self.name, ecmd.buffer_info()[0])
        fcntl.ioctl(sock.fileno(), SIOCETHTOOL, ifreq)
        sock.close()
        # Skip the 4-byte cmd field; nonzero data means link is up.
        # NOTE(review): array.tostring() was removed in Python 3.9
        # (use tobytes()) — confirm which interpreter this runs under.
        return bool(struct.unpack('4xI', ecmd.tostring())[0])

    try:
        while time_left:
            # Poll for link status on newly-up interfaces
            if _has_link():
                return True
            else:
                time.sleep(1)
                time_left -= 1
        # Final check after polling (or immediately if already up).
        return _has_link()
    except IOError:
        # If the ioctl fails, then for the purposes of this test, the
        # interface is not usable. HYD-2679
        return False
    finally:
        # Restore the interface's original down state if we brought it up.
        if not old_link_state_up:
            AgentShell.try_run(
                ['/sbin/ip', 'link', 'set', 'dev', self.name, 'down'])
def get_cluster_node_name():
    """Return this host's pacemaker node name, falling back to its FQDN."""
    try:
        node_name = AgentShell.try_run(["crm_node", "-n"])
    except Exception as e:
        # Best-effort: pacemaker may be absent or unconfigured, so any
        # failure degrades to the FQDN.
        console_log.info(
            "Could not get cluster node name {}. Falling back to socket.getfqdn()".format(
                e
            )
        )
        return socket.getfqdn()
    return node_name.strip()
def _find_resource_constraint(ha_label, location):
    """
    Scan `crm_resource -r <ha_label> -a` output for the constraint line
    whose id is "<ha_label>-<location>" and return the node name it names,
    or None if no such line is found.
    """
    stdout = AgentShell.try_run(["crm_resource", "-r", ha_label, "-a"])

    # Fix: use a raw string — "\s" in a plain literal is an invalid escape
    # sequence (DeprecationWarning today, a SyntaxError in future Pythons).
    # Compile once instead of re-matching the pattern text per line.
    # NOTE(review): ha_label/location are interpolated unescaped, so any
    # regex metacharacters in them alter the match — preserved as-is.
    pattern = re.compile(
        r"\s+:\s+Node\s+([^\s]+)\s+\(score=[^\s]+ id=%s-%s\)" %
        (ha_label, location))

    for line in stdout.rstrip().split("\n"):
        match = pattern.match(line)
        if match:
            return match.group(1)

    return None