def unconfigure_target_ha(primary, ha_label, uuid):
    '''
    Unconfigure the target high availability

    :param primary: Boolean if localhost is primary
    :param ha_label: String that identifies resource
    :param uuid: UUID that identifies config
    :return: Value using simple return protocol
    '''
    with PreservePacemakerCorosyncState():
        info = _get_target_config(uuid)

        # Refuse to unconfigure a resource that is still active anywhere.
        if get_resource_location(ha_label):
            return agent_error(
                "cannot unconfigure-ha: {} is still running ".format(ha_label))

        # Location constraints are removed on both primary and secondary.
        _unconfigure_target_priority(primary, ha_label)

        if primary:
            # Only the primary node deletes the resource definition itself.
            result = _unconfigure_target_ha(ha_label, info)

            # NOTE(review): rc 234 is tolerated alongside 0 -- presumably a
            # benign "not found" status from the helper; confirm.
            if result.rc != 0 and result.rc != 234:
                return agent_error(
                    "Error {} trying to cleanup resource {}".format(
                        result.rc, ha_label))

        return agent_result_ok
def stop_target(ha_label): ''' Stop the high availability target Return: Value using simple return protocol ''' # HYD-7230: brute force, try up to 3 times to stop the target i = 0 while True: i += 1 # Issue the command to Pacemaker to stop the target if _resource_exists(_zfs_name(ha_label)): # Group disable will disable all members of group regardless of current status error = AgentShell.run_canned_error_message( ['pcs', 'resource', 'disable', _group_name(ha_label)]) else: error = AgentShell.run_canned_error_message( ['pcs', 'resource', 'disable', ha_label]) if error: return agent_error(error) if _wait_target(ha_label, False): return agent_result_ok if i < 4: console_log.info("failed to stop target %s", ha_label) else: return agent_error("Failed to stop target {}".format(ha_label))
def get_corosync_autoconfig():
    """
    Automatically detect the configuration for corosync.

    :return: dictionary containing 'result' or 'error'.
    """
    primary_ring = get_shared_ring()
    if not primary_ring:
        return agent_error("Failed to detect ring0 interface")

    candidate_addr, candidate_prefix = generate_ring1_network(primary_ring)

    try:
        secondary_ring = detect_ring1(primary_ring, candidate_addr,
                                      candidate_prefix)
    except RingDetectionError as e:
        return agent_error(e.message)

    # Describe both rings in the shape expected by the caller.
    interfaces = {}
    for ring, dedicated in ((primary_ring, False), (secondary_ring, True)):
        interfaces[ring.name] = {
            "dedicated": dedicated,
            "ipaddr": ring.ipv4_address,
            "prefix": ring.ipv4_prefixlen,
        }

    return agent_result({
        "interfaces": interfaces,
        "mcast_port": secondary_ring.mcastport,
    })
def get_corosync_autoconfig():
    """
    Automatically detect the configuration for corosync.

    :return: dictionary containing 'result' or 'error'.
    """
    ring0_iface = get_ring0()
    if not ring0_iface:
        return agent_error('Failed to detect ring0 interface')

    ring1_addr, ring1_prefixlen = generate_ring1_network(ring0_iface)

    try:
        ring1_iface = detect_ring1(ring0_iface, ring1_addr, ring1_prefixlen)
    except RingDetectionError as e:
        return agent_error(e.message)

    def _describe(iface, dedicated):
        # One per-interface entry in the result payload.
        return {
            'dedicated': dedicated,
            'ipaddr': iface.ipv4_address,
            'prefix': iface.ipv4_prefixlen
        }

    return agent_result({
        'interfaces': {
            ring0_iface.name: _describe(ring0_iface, False),
            ring1_iface.name: _describe(ring1_iface, True)
        },
        'mcast_port': ring1_iface.mcastport
    })
def unconfigure_target_ha(primary, ha_label, uuid):
    '''
    Unconfigure the target high availability

    Return: Value using simple return protocol
    '''
    # NOTE(review): `uuid` is accepted but unused here -- presumably kept
    # for call-signature compatibility; confirm.
    with PreservePacemakerCorosyncState():
        # Refuse to unconfigure a resource that is still active anywhere.
        if get_resource_location(ha_label):
            return agent_error("cannot unconfigure-ha: %s is still running " %
                               ha_label)

        if primary:
            # Delete the primary location constraint, then the primitive.
            # NOTE(review): the first cibadmin result is immediately
            # overwritten; only the primitive deletion's rc is checked.
            result = cibadmin(
                ["-D", "-X", "<rsc_location id=\"%s-primary\">" % ha_label])
            result = cibadmin(["-D", "-X", "<primitive id=\"%s\">" % ha_label])

            # rc 234 is tolerated alongside 0 (presumably "not found").
            if result.rc != 0 and result.rc != 234:
                return agent_error("Error %s trying to cleanup resource %s" %
                                   (result.rc, ha_label))
        else:
            result = cibadmin(
                ["-D", "-X", "<rsc_location id=\"%s-secondary\">" % ha_label])

        return agent_result_ok
def stop_target(ha_label):
    '''
    Stop the high availability target

    Return: Value using simple return protocol
    '''
    # Fix: docstring previously said "Start" although this function stops
    # the target.
    # HYD-7230: brute force, try up to 3 times to stop the target
    i = 0
    while True:
        i += 1

        # Issue the command to Pacemaker to stop the target by setting
        # its target-role meta attribute.
        error = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
            'Stopped'
        ])

        if error:
            return agent_error(error)

        # Wait for pacemaker to report the resource as stopped.
        if _wait_target(ha_label, False):
            return agent_result_ok

        if i < 4:
            # Lazy %-args: formatting only happens if this level is enabled.
            console_log.info("failed to stop target %s", ha_label)
        else:
            return agent_error("failed to stop target %s" % ha_label)
def unconfigure_corosync2(host_fqdn, mcast_port):
    """
    Unconfigure the corosync application.

    For corosync2 don't disable pcsd, just remove host node from cluster and disable
    corosync from auto starting (service should already be stopped in state transition)

    Note that pcs cluster commands handle editing and removal of the corosync.conf file

    :param host_fqdn: FQDN of this node as known to the cluster
    :param mcast_port: corosync multicast port whose firewall rule is removed
    Return: Value using simple return protocol
    """
    error = corosync_service.disable()
    if error:
        return agent_error(error)

    # Detect if we are the only node in the cluster, we want to do this before next command removes conf file
    cluster_nodes = _nodes_in_cluster()

    result = AgentShell.run(
        ["pcs", "--force", "cluster", "node", "remove", host_fqdn])

    if result.rc != 0:
        if "No such file or directory" in result.stderr:
            # we want to return successful if the configuration file does not exist
            console_log.warning(result.stderr)
        elif "Error: Unable to update any nodes" in result.stderr:
            # this error is expected when this is the last node in the cluster
            if len(cluster_nodes) != 1:
                return agent_error(result.stderr)
        else:
            return agent_error(result.stderr)

    # Remove both firewall holes; agent_ok_or_error maps a falsy value to ok.
    return agent_ok_or_error(
        firewall_control.remove_rule(PCS_TCP_PORT, "tcp", "pcs", persist=True)
        or firewall_control.remove_rule(
            mcast_port, "udp", "corosync", persist=True))
def start_target(ha_label): ''' Start the high availability target Return: Value using simple return protocol ''' # HYD-1989: brute force, try up to 3 times to start the target i = 0 while True: i += 1 error = AgentShell.run_canned_error_message([ 'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v', 'Started' ]) if error: return agent_error(error) # now wait for it to start _wait_target(ha_label, True) # and make sure it didn't start but (the RA) fail(ed) rc, stdout, stderr = AgentShell.run_old(['crm_mon', '-1']) failed = True for line in stdout.split("\n"): if line.lstrip().startswith(ha_label): if line.find("FAILED") < 0: failed = False if failed: # try to leave things in a sane state for a failed mount error = AgentShell.run_canned_error_message([ 'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v', 'Stopped' ]) if error: return agent_error(error) if i < 4: console_log.info("failed to start target %s" % ha_label) else: return agent_error("Failed to start target %s" % ha_label) else: location = get_resource_location(ha_label) if not location: return agent_error("Started %s but now can't locate it!" % ha_label) return agent_result(location)
def start_target(ha_label):
    """
    Start the high availability target

    Return: Value using simple return protocol
    """
    if not _resource_exists(ha_label):
        return agent_error("Target {} does not exist".format(ha_label))

    # if resource already started but not on primary, move it
    location = get_resource_location(ha_label)
    primary = _find_resource_constraint(ha_label, True)
    if location:
        if location != primary:
            console_log.info(
                "Resource %s already started, moving to primary node %s",
                ha_label,
                primary,
            )
            error = _move_target(ha_label, primary)
            if error:
                return agent_error(error)
            location = primary

        return agent_result(location)

    try:
        _res_set_started(ha_label, True)
        if _resource_exists(_zfs_name(ha_label)):
            # zfs-backed target: start the zpool resource as well
            _res_set_started(_zfs_name(ha_label), True)
            # enable group also, in case group was disabled
            _res_set_started(_group_name(ha_label), True)

        # now wait for it to start
        if not _wait_target(ha_label, True):
            # try to leave things in a sane state for a failed mount
            _res_set_started(ha_label, False)

            return agent_error("Failed to start target {}".format(ha_label))

        location = get_resource_location(ha_label)
        if not location:
            return agent_error(
                "Started {} but now can't locate it!".format(ha_label))

        return agent_result(location)

    except AgentShell.CommandExecutionError as err:
        # Surface the full shell failure in the error payload.
        return agent_error(
            "Error (%s) running '%s': '%s' '%s'" %
            (err.result.rc, err.command, err.result.stdout,
             err.result.stderr))
def test_set_profile_fail(self):
    """Switching to a managed profile fails when every yum attempt fails."""
    # Three times because yum will try three times.
    # Each failed install is followed by a metadata clean before retrying.
    self.add_commands(
        CommandCaptureCommand(
            ('yum', 'install', '-y', '--exclude', 'kernel-debug',
             'python2-iml-agent-management'),
            rc=1,
            stdout="Bad command stdout",
            stderr="Bad command stderr"),
        CommandCaptureCommand(('yum', 'clean', 'metadata')),
        CommandCaptureCommand(
            ('yum', 'install', '-y', '--exclude', 'kernel-debug',
             'python2-iml-agent-management'),
            rc=1,
            stdout="Bad command stdout",
            stderr="Bad command stderr"),
        CommandCaptureCommand(('yum', 'clean', 'metadata')),
        CommandCaptureCommand(
            ('yum', 'install', '-y', '--exclude', 'kernel-debug',
             'python2-iml-agent-management'),
            rc=1,
            stdout="Bad command stdout",
            stderr="Bad command stderr"),
        CommandCaptureCommand(('yum', 'clean', 'metadata')))

    config.update('settings', 'profile', {'managed': False})

    # Go from managed = False to managed = True, but it will fail.
    self.assertEqual(
        agent_updates.update_profile({'managed': True}),
        agent_error(
            'Unable to set profile because yum returned Bad command stdout'
        ))

    self.assertRanAllCommandsInOrder()
def start_monitored_copytool(id):
    """Write init files for and start the copytool services for `id`."""
    # Start the monitor first so that we have a reader on the FIFO when
    # the copytool begins emitting events. Then start the copytool.
    ct_vars = _copytool_vars(id)

    for unit in ["chroma-copytool-monitor", "chroma-copytool"]:
        _write_service_init(
            unit,
            ct_vars["id"],
            ct_vars["ct_path"],
            ct_vars["ct_arguments"],
        )

        service = ServiceControl.create("%s-%s" % (unit, id))
        service.daemon_reload()

        # A service that is already up is restarted so it picks up the
        # freshly written init file; otherwise start it.
        error = service.restart() if service.running else service.start()
        if error:
            return agent_error(error)

    return agent_result_ok
def configure_pacemaker():
    """
    Configure pacemaker

    :return: Error string on failure, None on success
    """
    # Corosync needs to be running for pacemaker -- if it's not, make
    # an attempt to get it going.
    if not corosync_service.running:
        restart_error = corosync_service.restart()
        if restart_error:
            return agent_error(restart_error)

    steps = (
        enable_pacemaker,
        stop_pacemaker,
        start_pacemaker,
        _configure_pacemaker,
    )

    for step in steps:
        outcome = step()
        if outcome != agent_result_ok:
            return outcome
        time.sleep(1)

    return agent_result_ok
def _failoverback_target(ha_label, primary):
    """Fail a target over to the destination node

    Return: Value using simple return protocol
    """
    role = 'primary' if primary else 'secondary'

    destination = _find_resource_constraint(ha_label, primary)
    if not destination:
        return agent_error(
            "Unable to find the {} server for '{}'".format(role, ha_label))

    move_error = _move_target(ha_label, destination)
    if move_error:
        return agent_error(move_error)

    return agent_result_ok
def _configure_pacemaker():
    '''
    Configure pacemaker if this node is the dc.

    :return: agent_ok if no error else returns an agent_error
    '''
    pc = PacemakerConfig()

    timeout_time = time.time() + PACEMAKER_CONFIGURE_TIMEOUT
    error = None

    # Poll until the (global) configuration appears or we time out; only
    # the DC node writes the configuration, every other node just waits
    # for the DC's changes to become visible.
    while (pc.configured is False) and (time.time() < timeout_time):
        if pc.is_dc:
            daemon_log.info(
                'Configuring (global) pacemaker configuration because I am the DC'
            )
            error = _do_configure_pacemaker(pc)
            if error:
                return agent_error(error)
        else:
            daemon_log.info(
                'Not configuring (global) pacemaker configuration because I am not the DC'
            )

        # Pause before re-checking pc.configured.
        time.sleep(10)

    if pc.configured is False:
        error = 'Failed to configure (global) pacemaker configuration dc=%s' % pc.dc

    return agent_ok_or_error(error)
def _failoverback_target(ha_label, destination):
    """Fail a target over to the destination node

    :param ha_label: resource name to move
    :param destination: selector passed to _find_resource_constraint --
        NOTE(review): it is also interpolated into the error message as if
        it were a role name ('primary'/'secondary'); confirm intent.
    Return: Value using simple return protocol
    """
    node = _find_resource_constraint(ha_label, destination)
    if not node:
        return agent_error("Unable to find the %s server for '%s'" %
                           (destination, ha_label))

    error = _move_target(ha_label, node)
    if error:
        return agent_error(error)

    return agent_result_ok
def install_packages(repos, packages):
    """
    Explicitly evaluate and install or update any specific-version
    dependencies and satisfy even if that involves installing an older
    package than is already installed.
    Primary use case is installing lustre-modules, which depends on a
    specific kernel package.

    :param repos: List of strings, yum repo names
    :param packages: List of strings, yum package names
    :return: package report of the format given by the lustre device plugin
    """
    if packages:
        # Work on a copy so the caller's list is not mutated by the
        # dependency expansion below.
        to_install = list(packages)

        yum_util("clean")

        out = yum_util("requires", enablerepo=repos, packages=to_install)
        for requirement in [line.strip() for line in out.strip().split("\n")]:
            # e.g. "kernel = 3.10.0-957.el7" -> also install
            # "kernel-3.10.0-957.el7" (raw string fixes the previously
            # invalid "\)" escape sequence).
            match = re.match(r"([^\)/]*) = (.*)", requirement)
            if match:
                require_package, require_version = match.groups()
                to_install.append("%s-%s" % (require_package,
                                             require_version))

        yum_util("install", enablerepo=repos, packages=to_install)

        # Check for the HYD-4050 pathological kernel state before
        # declaring success.
        error = _check_HYD4050()
        if error:
            return agent_error(error)

    ServiceControl.create("iml-update-check").start(0)

    return agent_result_ok
def test_set_profile_fail(self):
    """Switching to a managed profile fails when every yum attempt fails."""
    # Three times because yum will try three times.
    # Each failed install is followed by a metadata clean before retrying.
    self.add_commands(
        CommandCaptureCommand(
            (
                "yum",
                "install",
                "-y",
                "--exclude",
                "kernel-debug",
                "python2-iml-agent-management",
            ),
            rc=1,
            stdout="Bad command stdout",
            stderr="Bad command stderr",
        ),
        CommandCaptureCommand(("yum", "clean", "metadata")),
        CommandCaptureCommand(
            (
                "yum",
                "install",
                "-y",
                "--exclude",
                "kernel-debug",
                "python2-iml-agent-management",
            ),
            rc=1,
            stdout="Bad command stdout",
            stderr="Bad command stderr",
        ),
        CommandCaptureCommand(("yum", "clean", "metadata")),
        CommandCaptureCommand(
            (
                "yum",
                "install",
                "-y",
                "--exclude",
                "kernel-debug",
                "python2-iml-agent-management",
            ),
            rc=1,
            stdout="Bad command stdout",
            stderr="Bad command stderr",
        ),
        CommandCaptureCommand(("yum", "clean", "metadata")),
    )

    config.update("settings", "profile", {"managed": False})

    # Go from managed = False to managed = True, but it will fail.
    self.assertEqual(
        agent_updates.update_profile({"managed": True}),
        agent_error(
            "Unable to set profile because yum returned Bad command stdout"
        ),
    )

    self.assertRanAllCommandsInOrder()
def initialise_block_device_drivers():
    """Initialise the driver of every BlockDevice subclass; first failure wins."""
    console_log.info("Initialising drivers for block device types")

    for device_class in util.all_subclasses(BlockDevice):
        failure = device_class.initialise_driver(config.profile_managed)
        if failure:
            return agent_error(failure)

    return agent_result_ok
def terminate_block_device_drivers():
    """Terminate the driver of every BlockDevice subclass; first failure wins."""
    console_log.info("Terminating drivers for block device types")

    for device_class in util.all_subclasses(BlockDevice):
        failure = device_class.terminate_driver()
        if failure:
            return agent_error(failure)

    return agent_result_ok
def unconfigure_repo(filename):
    """Remove the repo file; a missing file counts as already unconfigured."""
    repo_file = os.path.join(REPO_PATH, filename)

    try:
        os.remove(repo_file)
    except OSError as err:
        # ENOENT means the repo was never configured (or already removed),
        # which is success for our purposes.
        if err.errno != errno.ENOENT:
            return agent_error(str(err))

    return agent_result_ok
def configure_target_ha(primary, device, ha_label, uuid, mount_point):
    """
    Configure the target high availability

    :param primary: Boolean, True if localhost is the primary node
    :param device: block device path backing the target
    :param ha_label: resource name to create
    :param uuid: UUID identifying the target config
    :param mount_point: filesystem mount point for the target
    :return: Value using simple return protocol
    """
    _mkdir_p_concurrent(mount_point)

    if primary:
        info = _get_target_config(uuid)

        # If the target already exists with the same params, skip.
        # If it already exists with different params, that is an error
        if _resource_exists(ha_label):
            if info["bdev"] == device and info["mntpt"] == mount_point:
                return agent_result_ok
            return agent_error(
                "A resource with the name {} already exists".format(ha_label))

        # NOTE(review): a bdev/mntpt mismatch is only logged here and
        # configuration proceeds anyway -- confirm this is intentional.
        if info["bdev"] != device or info["mntpt"] != mount_point:
            console_log.error(
                "Mismatch for %s do not match configured (%s on %s) != (%s on %s)",
                ha_label,
                device,
                mount_point,
                info["bdev"],
                info["mntpt"],
            )

        result = _configure_target_ha(ha_label, info, False)
        if result.rc != 0:
            return agent_error("Failed to create {}: {}".format(
                ha_label, result.rc))

    # Both primary and secondary add their node-local location constraint.
    result = _configure_target_priority(primary, ha_label, _this_node())
    if result.rc != 0:
        return agent_error(
            "Failed to create location constraint on {}: {}".format(
                ha_label, result.rc))

    return agent_result_ok
def stop_target(ha_label):
    """
    Stop the high availability target

    Return: Value using simple return protocol
    """
    try:
        # Issue the command to Pacemaker to stop the target
        if _resource_exists(_zfs_name(ha_label)):
            # zfs-backed target: disabling the group stops all members.
            _res_set_started(_group_name(ha_label), False)
        else:
            _res_set_started(ha_label, False)
    except AgentShell.CommandExecutionError as err:
        # Surface the full shell failure in the error payload.
        return agent_error(
            "Error (%s) running '%s': '%s' '%s'" %
            (err.result.rc, err.command, err.result.stdout,
             err.result.stderr))

    # Confirm pacemaker actually reports the resource as stopped.
    if not _wait_target(ha_label, False):
        return agent_error("Failed to stop target {}".format(ha_label))

    return agent_result_ok
def terminate_block_device_drivers():
    """
    When the agent is stopped we want to allow block devices to do any
    termination that they might need, this function may also be called by
    the manager.
    """
    console_log.info("Terminating drivers for block device types")

    for driver_cls in util.all_subclasses(BlockDevice):
        error_msg = driver_cls.terminate_driver()
        if error_msg:
            return agent_error(error_msg)

    return agent_result_ok
def initialise_block_device_drivers():
    """
    When the agent is run we want to allow block devices to do any
    initialization that they might need, this function may also be called
    by the manager.
    """
    console_log.info("Initialising drivers for block device types")

    for driver_cls in util.all_subclasses(BlockDevice):
        error_msg = driver_cls.initialise_driver(config.profile_managed)
        if error_msg:
            return agent_error(error_msg)

    return agent_result_ok
def unconfigure_corosync():
    """
    Unconfigure the corosync application.

    Stops and disables the service, discovers the multicast port from
    corosync.conf, removes the config file and closes the firewall hole.

    :return: Value using simple return protocol
    """
    corosync_service.stop()
    corosync_service.disable()

    mcast_port = None

    # Scrape the multicast port out of the existing config so the right
    # firewall rule can be removed below.
    with open("/etc/corosync/corosync.conf") as f:
        for line in f:
            # Raw string fixes the previously invalid "\s"/"\d" escapes.
            match = re.match(r"\s*mcastport:\s*(\d+)", line)
            if match:
                mcast_port = match.group(1)
                break

    if mcast_port is None:
        return agent_error("Failed to find mcastport in corosync.conf")

    try:
        remove("/etc/corosync/corosync.conf")
    except OSError as e:
        # A missing file is fine -- already unconfigured.
        if e.errno != errno.ENOENT:
            return agent_error("Failed to remove corosync.conf")
    except Exception:
        # Narrowed from a bare `except:`, which would also have swallowed
        # SystemExit/KeyboardInterrupt; behaviour is otherwise unchanged.
        return agent_error("Failed to remove corosync.conf")

    error = firewall_control.remove_rule(mcast_port, "udp", "corosync",
                                         persist=True)
    if error:
        return agent_error(error)

    return agent_result_ok
def configure_repo(filename, file_contents):
    """
    Install a repo file under REPO_PATH, substituting the agent's crypto
    material paths (CA file, private key, certificate) into the template.

    :param filename: basename of the repo file to write
    :param file_contents: template text with three positional {} slots
    :return: Value using simple return protocol
    """
    crypto = Crypto(config.path)
    full_filename = os.path.join(REPO_PATH, filename)
    temp_full_filename = full_filename + '.tmp'

    file_contents = file_contents.format(crypto.AUTHORITY_FILE,
                                         crypto.PRIVATE_KEY_FILE,
                                         crypto.CERTIFICATE_FILE)

    try:
        # Write to a temp file then rename so readers never observe a
        # partially written repo file. 0o644 replaces the Python-2-only
        # literal 0644 (a syntax error on Python 3, same value on both).
        file_handle = os.fdopen(
            os.open(temp_full_filename, os.O_WRONLY | os.O_CREAT, 0o644),
            'w')
        file_handle.write(file_contents)
        file_handle.close()
        os.rename(temp_full_filename, full_filename)
    except EnvironmentError as error:
        # EnvironmentError covers both OSError and IOError on Python 2
        # (write failures raise IOError, which `except OSError` missed);
        # on Python 3 it is an alias of OSError.
        return agent_error(str(error))

    return agent_result_ok
def run(self, cmd, agent_daemon_context, args):
    """
    Dispatch `cmd` to the matching registered action-plugin function.

    :param cmd: command name looked up in self.commands
    :param agent_daemon_context: daemon context, forwarded only when the
        target function declares a parameter of that name
    :param args: dict of keyword arguments for the command
    :return: whatever the command returns (simple return protocol), or an
        agent_error for an unknown command
    """
    # FIXME: provide a log object to action plugins that we capture
    # and send back to the caller
    try:
        fn = self.commands[cmd]
    except KeyError:
        return agent_error(
            "Requested command %s was unknown to the agent" % cmd)

    # Only pass in the agent_daemon_context if the agent_daemon_context is expected by the function.
    # This feature was added just prior to 3.1 and whilst it would be better to always pass the context the
    # scope of the change was prohibitive at that time.
    # Not a fixme because it is of little value to make the additional changes at this time.
    # NOTE(review): co_varnames also includes plain locals, so a local
    # variable named agent_daemon_context would trigger this path too.
    if 'agent_daemon_context' in fn.__code__.co_varnames:
        return fn(agent_daemon_context, **args)
    else:
        return fn(**args)
def configure_corosync(ring0_name, ring1_name, old_mcast_port, new_mcast_port):
    """
    Process configuration including negotiated multicast port, no IP address
    information required

    :param ring0_name:
    :param ring1_name:
    :param old_mcast_port: None if we are configuring corosync for the
        first-time, present if changing mcast port
    :param new_mcast_port: desired corosync multicast port as configured by user
    :return: Value using simple return protocol
    """
    rings = (
        CorosyncRingInterface(name=ring0_name,
                              ringnumber=0,
                              mcastport=new_mcast_port),
        CorosyncRingInterface(name=ring1_name,
                              ringnumber=1,
                              mcastport=new_mcast_port),
    )
    interfaces = [InterfaceInfo(ring, None, None) for ring in rings]

    # Render and persist the new corosync configuration.
    # (local renamed from `config` to avoid shadowing the agent's config
    # module name used elsewhere in this file)
    corosync_conf = render_config(
        [iface.corosync_iface for iface in interfaces])
    write_config_to_file("/etc/corosync/corosync.conf", corosync_conf)

    # When changing ports, close the old firewall hole first.
    if old_mcast_port is not None:
        error = firewall_control.remove_rule(old_mcast_port,
                                             "udp",
                                             "corosync",
                                             persist=True)
        if error:
            return agent_error(error)

    return agent_ok_or_error(
        firewall_control.add_rule(
            new_mcast_port, "udp", "corosync", persist=True)
        or corosync_service.enable())
def stop_monitored_copytool(id):
    """Stop the copytool services for `id` and clean up their init files."""
    # Stop the monitor after the copytool so that we can relay the
    # unconfigure event.
    for service_name in ['chroma-copytool-monitor', 'chroma-copytool']:
        service = ServiceControl.create('%s-%s' % (service_name, id))

        # Only services that were actually configured (init file present)
        # and are running need stopping and cleanup.
        if os.path.exists(_init_file_name(service_name, id)) and service.running:
            error = service.stop()
            if error:
                return agent_error(error)

            os.remove(_init_file_name(service_name, id))

            # Finally cause the system agents to see our changes.
            service.daemon_reload()

    return agent_result_ok
def _fake_invoke_agent(self, host, invoke, args=None):
    """Record the invocation and replay the canned result registered for it."""
    if args is None:
        args = {}
    assert type(args) is dict, "args list must be dict :%s" % type(args)

    # Keep a history entry, then look up the matching canned behaviour.
    # (separate name instead of rebinding `args` to a different type)
    invocation = InvokeAgentInvoke(host.fqdn, invoke, args, None, None)
    self._invokes_history.append(invocation)

    canned = self._get_executable_invoke(invocation)
    canned.executions_remaining -= 1

    if canned.error:
        return agent_error(canned.error)
    if canned.result:
        return agent_result(canned.result)
    return agent_result_ok