def lsof(pid=None, file=None):
    """Run ``lsof -F pan0`` and parse its NUL-delimited field output.

    :param pid: optional pid to restrict the listing to
    :param file: optional path to restrict the listing to
    :return: dict mapping pid -> {path: {'mode': mode}}
    """
    args = ['lsof', '-F', 'pan0']
    if pid:
        args.extend(["-p", str(pid)])
    if file:
        args.append(file)

    results = defaultdict(dict)
    current = None

    rc, stdout, stderr = AgentShell.run_old(args)
    if rc != 0:
        if stderr:
            raise RuntimeError(stderr)
        # lsof exits non-zero if there's nothing holding the file open
        return results

    for record in stdout.split("\n"):
        pid_match = re.match(r'^p(\d+)\x00', record)
        if pid_match:
            current = pid_match.group(1)
            continue

        entry_match = re.match(r'^a(\w)\x00n(.*)\x00', record)
        if entry_match:
            # 'a' field is the access mode, 'n' field is the file name
            results[current][entry_match.group(2)] = {
                'mode': entry_match.group(1)
            }

    return results
def _do_configure_pacemaker(pc):
    """Apply baseline pacemaker configuration to the cluster.

    Creates the st-fencing stonith primitive, sets cluster bootstrap
    properties (quorum policy, stonith) and resource defaults.

    :param pc: cluster handle exposing .nodes and create_update_properyset()
    :return: None on success, otherwise a canned error message string
    """
    # ignoring quorum should only be done on clusters of 2
    if len(pc.nodes) > 2:
        no_quorum_policy = "stop"
    else:
        no_quorum_policy = "ignore"

    error = _unconfigure_fencing()

    if error:
        return error

    # this could race with other cluster members to make sure
    # any errors are only due to it already existing
    try:
        cibadmin([
            "--create",
            "-o",
            "resources",
            "-X",
            '<primitive class="stonith" id="st-fencing" type="fence_chroma"/>',
        ])
    except Exception as e:
        # check whether the failure was "already exists" by locating the
        # resource another member may have created concurrently
        rc, stdout, stderr = AgentShell.run_old(
            ["crm_resource", "--locate", "--resource", "st-fencing"])
        if rc == 0:
            # no need to do the rest if another member is already doing it
            return None
        else:
            # NOTE(review): e.message is Python-2-only exception attribute
            return e.message

    pc.create_update_properyset(
        "cib-bootstrap-options",
        {
            "no-quorum-policy": no_quorum_policy,
            "symmetric-cluster": "true",
            "cluster-infrastructure": "openais",
            "stonith-enabled": "true",
        },
    )

    def set_rsc_default(name, value):
        """Set a pacemaker rsc_defaults attribute.

        :param name: attribute to set
        :param value: value to set
        :return: None on success else a canned error message
        """
        return AgentShell.run_canned_error_message([
            "crm_attribute",
            "--type",
            "rsc_defaults",
            "--attr-name",
            name,
            "--attr-value",
            value,
        ])

    # `or` chaining returns the first error message, or None if all succeed
    return (set_rsc_default("resource-stickiness", "1000")
            or set_rsc_default("failure-timeout", RSRC_FAIL_WINDOW)
            or set_rsc_default("migration-threshold",
                               RSRC_FAIL_MIGRATION_COUNT))
def delete_node(nodename):
    """Remove a node from the pacemaker cluster and purge it from the CIB.

    :param nodename: uname of the node to remove
    :raises RuntimeError: if the node is not listed by ``crm_node -l``

    Fixes two defects in the previous version: the trailing blank line of
    ``crm_node -l`` output crashed the 3-way unpack with ValueError, and a
    lookup miss fell through with a stale (or None) node id which was then
    passed to ``crm_node -R``.
    """
    rc, stdout, stderr = AgentShell.run_old(["crm_node", "-l"])

    node_id = None
    for line in stdout.split("\n"):
        # crm_node -l emits "<id> <uname> <status>"; skip blank/odd lines
        fields = line.split(" ")
        if len(fields) != 3:
            continue
        if fields[1] == nodename:
            node_id = fields[0]
            break

    if node_id is None:
        raise RuntimeError("Node %s not found in crm_node -l output" %
                           nodename)

    AgentShell.try_run(["crm_node", "--force", "-R", node_id])
    # purge both the configuration entry and the status record for the node
    cibxpath("delete", '//nodes/node[@uname="{}"]'.format(nodename))
    cibxpath("delete", '//status/node_state[@uname="{}"]'.format(nodename))
def get_resource_locations():
    # FIXME: this may break on non-english systems or new versions of pacemaker
    """Parse `crm_mon -1` to identify where (if anywhere)
    resources (i.e. targets) are running."""
    rc, lines_text, stderr = AgentShell.run_old(["crm_mon", "-1", "-r"])
    if rc != 0:
        # Pacemaker not running, or no resources configured yet
        return {
            "crm_mon_error": {
                "rc": rc,
                "stdout": lines_text,
                "stderr": stderr
            }
        }

    locations = {}
    in_resource_section = False
    for line in lines_text.split("\n"):
        # if we don't have a DC for this cluster yet, we can't really
        # believe anything it says
        if line == "Current DC: NONE":
            return {}

        # skip everything above the resources section (and the header itself)
        if not in_resource_section:
            in_resource_section = line.startswith("Full list of resources:")
            continue

        # only interested in Target resources
        if "(ocf::chroma:Target)" not in line:
            continue

        # The line can have 3 - 5 whitespace-separated fields; pad to at
        # least 5 and drop any extras (credit to Aric Coady for this trick)
        fields = (line.lstrip().split() + [None, None])[:5]

        # In later pacemakers a new entry is added for stopped servers
        #   MGS_424f74 (ocf::chroma:Target): (target-role:Stopped) Stopped
        # and for started servers:
        #   MGS_424f74 (ocf::chroma:Target): (target-role:Stopped) Started lotus-13vm6
        # (target-role:Stopped) is new.
        if "target-role" in fields[2]:
            del fields[2]

        # and even newer pacemakers add a "(disabled)" to the end of the line:
        #   MGS_e1321a (ocf::chroma:Target): Stopped (disabled)
        if fields[3] == "(disabled)":
            fields[3] = None

        locations[fields[0]] = fields[3]

    return locations
def get_resource_locations():
    # FIXME: this may break on non-english systems or new versions of pacemaker
    """Parse `crm_mon -1` to identify where (if anywhere)
    resources (i.e. targets) are running.

    Returns None (implicitly) when crm_mon is absent (ENOENT) — pacemaker
    might not be installed yet; re-raises any other OSError.
    """
    try:
        rc, lines_text, stderr = AgentShell.run_old(["crm_mon", "-1", "-r"])
    # fixed: was Python-2-only `except OSError, e` syntax; `as` matches the
    # sibling _read_crm_mon_as_xml implementation and works on 2.6+
    except OSError as e:
        # ENOENT is fine here. Pacemaker might not be installed yet.
        if e.errno != errno.ENOENT:
            raise
def start_target(ha_label): ''' Start the high availability target Return: Value using simple return protocol ''' # HYD-1989: brute force, try up to 3 times to start the target i = 0 while True: i += 1 error = AgentShell.run_canned_error_message([ 'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v', 'Started' ]) if error: return agent_error(error) # now wait for it to start _wait_target(ha_label, True) # and make sure it didn't start but (the RA) fail(ed) rc, stdout, stderr = AgentShell.run_old(['crm_mon', '-1']) failed = True for line in stdout.split("\n"): if line.lstrip().startswith(ha_label): if line.find("FAILED") < 0: failed = False if failed: # try to leave things in a sane state for a failed mount error = AgentShell.run_canned_error_message([ 'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v', 'Stopped' ]) if error: return agent_error(error) if i < 4: console_log.info("failed to start target %s" % ha_label) else: return agent_error("Failed to start target %s" % ha_label) else: location = get_resource_location(ha_label) if not location: return agent_error("Started %s but now can't locate it!" % ha_label) return agent_result(location)
def _read_crm_mon_as_xml(self):
    """Run crm_mon --one-shot --as-xml, return raw output or None

    For expected return values (0, 10), return the stdout from output.
    If the return value is unexpected, log a warning, and return None
    """
    crm_command = ['crm_mon', '--one-shot', '--as-xml']
    try:
        rc, stdout, stderr = AgentShell.run_old(crm_command)
    # fixed: was Python-2-only `except OSError, e` syntax
    except OSError as e:
        # ENOENT is fine here. Pacemaker might not be installed yet.
        if e.errno != errno.ENOENT:
            raise
        return None

    # fixed: previously fell through and always returned None, contradicting
    # the docstring; completed to match the sibling implementation
    if rc not in [0, 10]:  # 10: Corosync is not running on this node
        daemon_log.warning("rc=%s running '%s': '%s' '%s'" %
                           (rc, crm_command, stdout, stderr))
        stdout = None

    return stdout
def delete_node(nodename):
    """Remove a node from the cluster and delete its CIB records.

    :param nodename: uname of the node to remove
    :raises RuntimeError: if the node is not listed by ``crm_node -l``

    Fixes: the trailing blank line of ``crm_node -l`` output crashed the
    3-way unpack with ValueError, and a lookup miss fell through with a
    stale (or None) node id which was then passed to ``crm_node -R``.
    """
    rc, stdout, stderr = AgentShell.run_old(['crm_node', '-l'])

    node_id = None
    for line in stdout.split('\n'):
        # crm_node -l emits "<id> <uname> <status>"; skip blank/odd lines
        fields = line.split(" ")
        if len(fields) != 3:
            continue
        if fields[1] == nodename:
            node_id = fields[0]
            break

    if node_id is None:
        raise RuntimeError("Node %s not found in crm_node -l output" %
                           nodename)

    AgentShell.try_run(['crm_node', '--force', '-R', node_id])
    cibadmin(
        ["--delete", "-o", "nodes", "-X", "<node uname=\"%s\"/>" % nodename])
    # NOTE(review): node_state records live under //status, not "-o nodes",
    # and "--crm_xml" looks like a legacy alias for -X — confirm against
    # cibadmin(8) for the pacemaker version in use
    cibadmin([
        "--delete", "-o", "nodes", "--crm_xml",
        "<node_state uname=\"%s\"/>" % nodename
    ])
def _read_crm_mon_as_xml(self):
    """Run crm_mon --one-shot --as-xml, return raw output or None

    For expected return values (0, 10), return the stdout from output.
    If the return value is unexpected, log a warning, and return None
    """
    command = ['crm_mon', '--one-shot', '--as-xml']
    rc, stdout, stderr = AgentShell.run_old(command)

    # rc 10 means Corosync is not running on this node; treat as expected
    if rc in (0, 10):
        return stdout

    daemon_log.warning("rc=%s running '%s': '%s' '%s'" %
                       (rc, command, stdout, stderr))
    return None
def _get_cluster_size():
    """Count cluster members via ``crm_node -l``."""
    # you'd think there'd be a way to query the value of a property
    # such as "expected-quorum-votes" but there does not seem to be, so
    # just count nodes instead
    rc, stdout, stderr = AgentShell.run_old(["crm_node", "-l"])
    if not stdout:
        return 0

    count = 0
    for entry in stdout.rstrip().split('\n'):
        _entry_id, _name, state = entry.split(" ")
        if state in ("member", "lost"):
            count += 1
    return count
def _query_ha_targets():
    """Build a dict of configured HA target resources keyed by resource id."""
    targets = {}

    rc, stdout, stderr = AgentShell.run_old(['crm_resource', '-l'])
    if rc == 234:
        # crm_resource uses 234 when no resources are configured
        return targets
    if rc != 0:
        raise RuntimeError("Error %s running crm_resource -l: %s %s" %
                           (rc, stdout, stderr))

    for resource_id in stdout.split("\n"):
        if not resource_id:
            continue
        # drop the two header lines of crm_resource -q output, keep the XML
        raw_xml = "\n".join(
            AgentShell.try_run(['crm_resource', '-r', resource_id,
                                '-q']).split("\n")[2:])
        targets[resource_id] = {
            'ha_label': resource_id,
            'uuid': _get_nvpairid_from_xml(raw_xml),
        }
    return targets
def _read_crm_mon_as_xml(self):
    """Run crm_mon --one-shot --as-xml, return raw output or None

    For expected return values (0, 10), return the stdout from output.
    If the return value is unexpected, log a warning, and return None
    """
    crm_command = ["crm_mon", "--one-shot", "--as-xml"]
    try:
        rc, stdout, stderr = AgentShell.run_old(crm_command)
    except OSError as e:
        # ENOENT is fine here. Pacemaker might not be installed yet.
        if e.errno != errno.ENOENT:
            # fixed: bare raise preserves the original traceback
            # (`raise e` resets it under Python 2)
            raise
        return None

    if rc not in [0, 10]:  # 10: Corosync is not running on this node
        daemon_log.warning("rc=%s running '%s': '%s' '%s'" %
                           (rc, crm_command, stdout, stderr))
        stdout = None

    return stdout
def properties(self):
    """Returns less volatile node data suitable for host validation.

    If the fetched property is expensive to compute, it should be
    cached / updated less frequently.
    """
    # `which` exits non-zero when zfs is absent from PATH
    zfs_not_installed, stdout, stderr = AgentShell.run_old(['which', 'zfs'])

    distro_name, distro_release = platform.linux_distribution()[:2]
    py_major, py_minor, py_patch = platform.python_version_tuple()

    return {
        'zfs_installed': not zfs_not_installed,
        'distro': distro_name,
        # e.g. "7.4.1708" -> 7.4
        'distro_version': float('.'.join(distro_release.split('.')[:2])),
        'python_version_major_minor': float("%s.%s" % (py_major, py_minor)),
        'python_patchlevel': int(py_patch),
        'kernel_version': platform.release()
    }
def properties(self):
    """Returns less volatile node data suitable for host validation.

    If the fetched property is expensive to compute, it should be
    cached / updated less frequently.
    """
    # non-zero `which` exit code means zfs is not on PATH
    which_rc, stdout, stderr = AgentShell.run_old(["which", "zfs"])

    version_tuple = platform.python_version_tuple()
    release = platform.linux_distribution()[1]

    return {
        "zfs_installed": not which_rc,
        "distro": platform.linux_distribution()[0],
        # keep only major.minor, e.g. "7.4.1708" -> 7.4
        "distro_version": float(".".join(release.split(".")[:2])),
        "python_version_major_minor": float(
            "%s.%s" % (version_tuple[0], version_tuple[1])
        ),
        "python_patchlevel": int(version_tuple[2]),
        "kernel_version": platform.release(),
    }
def configure_target_ha(primary, device, ha_label, uuid, mount_point):
    '''
    Configure the target high availability

    :param primary: True when this node is the primary location for the target
    :param device: block device path backing the target
    :param ha_label: name of the pacemaker resource to create
    :param uuid: identifier stored in the resource's instance attributes
    :param mount_point: mount point directory for the target
    Return: Value using simple return protocol
    '''
    if primary:
        # If the target already exists with the same params, skip.
        # If it already exists with different params, that is an error
        rc, stdout, stderr = AgentShell.run_old(
            ["crm_resource", "-r", ha_label, "-g", "target"])
        if rc == 0:
            info = _get_target_config(stdout.rstrip("\n"))
            if info['bdev'] == device and info['mntpt'] == mount_point:
                return agent_result_ok
            else:
                return agent_error(
                    "A resource with the name %s already exists" % ha_label)

        tmp_f, tmp_name = tempfile.mkstemp()
        # fixed: the temp file (and, on error, the fd) was previously leaked
        # on every call; clean both up no matter what cibadmin does
        try:
            os.write(
                tmp_f,
                "<primitive class=\"ocf\" provider=\"chroma\" type=\"Target\" id=\"%s\">\
<meta_attributes id=\"%s-meta_attributes\">\
<nvpair name=\"target-role\" id=\"%s-meta_attributes-target-role\" value=\"Stopped\"/>\
</meta_attributes>\
<operations id=\"%s-operations\">\
<op id=\"%s-monitor-5\" interval=\"5\" name=\"monitor\" timeout=\"60\"/>\
<op id=\"%s-start-0\" interval=\"0\" name=\"start\" timeout=\"300\"/>\
<op id=\"%s-stop-0\" interval=\"0\" name=\"stop\" timeout=\"300\"/>\
</operations>\
<instance_attributes id=\"%s-instance_attributes\">\
<nvpair id=\"%s-instance_attributes-target\" name=\"target\" value=\"%s\"/>\
</instance_attributes>\
</primitive>" % (ha_label, ha_label, ha_label, ha_label, ha_label, ha_label,
                 ha_label, ha_label, ha_label, uuid))
        finally:
            os.close(tmp_f)

        try:
            cibadmin(["-o", "resources", "-C", "-x", "%s" % tmp_name])
        finally:
            os.unlink(tmp_name)

        score = 20
        preference = "primary"
    else:
        score = 10
        preference = "secondary"

    # Hostname. This is a short-term point fix that will allow us to make the
    # HP2 release more functional. Between el6 and el7 (truthfully we should
    # probably be looking at Pacemaker or Corosync versions) Pacemaker started
    # to use fully qualified domain names rather than just the nodename, e.g.
    # lotus-33vm15.lotus.hpdd.lab.intel.com vs lotus-33vm15. To keep
    # compatibility easily we make the constraints follow the same fqdn vs node.
    if platform_info.distro_version >= 7.0:
        node = socket.getfqdn()
    else:
        node = os.uname()[1]

    result = cibadmin([
        "-o", "constraints", "-C", "-X",
        "<rsc_location id=\"%s-%s\" node=\"%s\" rsc=\"%s\" score=\"%s\"/>" %
        (ha_label, preference, node, ha_label, score)
    ])

    # cibadmin exits 76 when the object already exists
    if result.rc == 76:
        return agent_error("A constraint with the name %s-%s already exists" %
                           (ha_label, preference))

    _mkdir_p_concurrent(mount_point)

    return agent_result_ok
def scsi_id_command(cmd):
    """Execute cmd; return its stripped stdout, or None on non-zero exit."""
    rc, out, err = AgentShell.run_old(cmd)
    return None if rc else out.strip()
def corosync_running():
    """True when `service corosync status` exits 0 (i.e. corosync is up)."""
    exit_code, _out, _err = AgentShell.run_old(['service', 'corosync',
                                                'status'])
    return exit_code == 0
def yum_util(action,
             packages=(),
             fromrepo=None,
             enablerepo=None,
             narrow_updates=False):
    '''
    A wrapper to perform yum actions in encapsulated way.

    :param action: clean, install, remove, update, requires etc
    :param packages: Packages to install or remove
    :param fromrepo: The repo the action should be carried out from, others are disabled.
    :param enablerepo: The repo to enable for the action, others are not disabled or enabled
    :param narrow_updates: when True with action == 'query', restrict repoquery to available updates
    :return: stdout of the successful command; raises CommandExecutionError on error.
    '''
    # fixed: `packages=[]` was a mutable default argument; an immutable
    # tuple default is interface-compatible (only ever passed to list())
    if fromrepo and enablerepo:
        raise ValueError(
            "Cannot provide fromrepo and enablerepo simultaneously")

    repo_arg = []
    valid_rc_values = [0]  # Some return values other than 0 are valid.
    if fromrepo:
        repo_arg = ['--disablerepo=*', '--enablerepo=%s' % ','.join(fromrepo)]
    elif enablerepo:
        repo_arg = ['--enablerepo=%s' % ','.join(enablerepo)]
    if narrow_updates and action == 'query':
        repo_arg.extend(['--pkgnarrow=updates', '-a'])

    if action == 'clean':
        cmd = ['yum', 'clean', 'all'
               ] + (repo_arg if repo_arg else ["--enablerepo=*"])
    elif action == 'install':
        cmd = ['yum', 'install', '-y'] + repo_arg + list(packages)
    elif action == 'remove':
        cmd = ['yum', 'remove', '-y'] + repo_arg + list(packages)
    elif action == 'update':
        cmd = ['yum', 'update', '-y'] + repo_arg + list(packages)
    elif action == 'requires':
        cmd = ['repoquery', '--requires'] + repo_arg + list(packages)
    elif action == 'query':
        cmd = ['repoquery'] + repo_arg + list(packages)
    elif action == 'repoquery':
        cmd = ['repoquery'] + repo_arg + [
            '-a', '--qf=%{EPOCH} %{NAME} %{VERSION} %{RELEASE} %{ARCH}'
        ]
    elif action == 'check-update':
        cmd = ['yum', 'check-update', '-q'] + repo_arg + list(packages)
        # check-update returns 100 if updates are available.
        valid_rc_values = [0, 100]
    else:
        raise RuntimeError('Unknown yum util action %s' % action)

    # This is a poor solution for HYD-3855 but not one that carries any known
    # cost. We sometimes see intermittent failures in test, and possibly out
    # of test, that occur 1 in 50 (estimate) times. yum commands are
    # idempotent and so trying the command three times has no downside and
    # changes the estimated chance of fail to 1 in 12500.
    for attempts_left in range(2, -1, -1):
        rc, stdout, stderr = AgentShell.run_old(cmd)
        if rc in valid_rc_values:
            return stdout

        daemon_log.info("HYD-3885 Retrying yum command '%s'" % " ".join(cmd))
        if attempts_left == 0:
            daemon_log.info("HYD-3885 Retry yum command failed '%s'" %
                            " ".join(cmd))
            # Out of retries so raise for the caller..
            raise AgentShell.CommandExecutionError(
                AgentShell.RunResult(rc, stdout, stderr, False), cmd)