def enable_sbd(
    lib_env,
    default_watchdog,
    watchdog_dict,
    sbd_options,
    default_device_list=None,
    node_device_dict=None,
    allow_unknown_opts=False,
    ignore_offline_nodes=False,
    no_watchdog_validation=False,
    allow_invalid_option_values=False,
):
    # pylint: disable=too-many-arguments, too-many-locals
    """
    Enable SBD on all nodes in the cluster.

    lib_env -- LibraryEnvironment
    default_watchdog -- watchdog used for nodes missing in watchdog_dict;
        a falsy value means the default from settings is used
    watchdog_dict -- dictionary: node name -> watchdog path
    sbd_options -- dictionary: <SBD config option> -> <value>; option names
        are upper-cased before use
    default_device_list -- list of devices used for nodes missing in
        node_device_dict
    node_device_dict -- dictionary: node name -> list of devices
    allow_unknown_opts -- if True, unknown options are accepted as well
    ignore_offline_nodes -- if True, offline nodes are omitted
    no_watchdog_validation -- if True, the existence of a watchdog is not
        validated on the nodes
    allow_invalid_option_values -- if True, invalid values of some options
        are reported as warnings instead of errors

    Raises LibraryError when the validation reports contain an error.
    """
    # Devices are in play as soon as the caller passed either device argument,
    # even an empty one.
    using_devices = (
        default_device_list is not None or node_device_dict is not None
    )
    if default_device_list is None:
        default_device_list = []
    if node_device_dict is None:
        node_device_dict = {}
    default_watchdog = default_watchdog or settings.sbd_watchdog_default
    sbd_options = {
        name.upper(): value for name, value in sbd_options.items()
    }

    corosync_conf = lib_env.get_corosync_conf()
    node_list, nodes_reports = get_existing_nodes_names(corosync_conf)
    if not node_list:
        nodes_reports.append(reports.corosync_config_no_nodes_defined())
    target_list = lib_env.get_node_target_factory().get_target_list(
        node_list,
        skip_non_existing=ignore_offline_nodes,
    )

    full_watchdog_dict = _get_full_target_dict(
        target_list, watchdog_dict, default_watchdog
    )
    full_device_dict = _get_full_target_dict(
        target_list, node_device_dict, default_device_list
    )

    # Collect all validation reports and hand them over in one go.
    nodes_not_in_cluster = set(
        list(watchdog_dict.keys()) + list(node_device_dict.keys())
    ) - set(node_list)
    unknown_node_reports = [
        reports.node_not_found(node_name)
        for node_name in nodes_not_in_cluster
    ]
    watchdog_reports = _validate_watchdog_dict(full_watchdog_dict)
    if using_devices:
        device_reports = sbd.validate_nodes_devices(full_device_dict)
    else:
        device_reports = []
    option_reports = _validate_sbd_options(
        sbd_options, allow_unknown_opts, allow_invalid_option_values
    )
    validation_result = lib_env.report_processor.report_list(
        nodes_reports
        + unknown_node_reports
        + watchdog_reports
        + device_reports
        + option_reports
    )
    if validation_result.has_errors:
        raise LibraryError()

    online_cmd = GetOnlineTargets(
        lib_env.report_processor,
        ignore_offline_targets=ignore_offline_nodes,
    )
    online_cmd.set_targets(target_list)
    online_targets = run_and_raise(
        lib_env.get_node_communicator(), online_cmd
    )

    # check if SBD can be enabled
    if no_watchdog_validation:
        lib_env.report_processor.report(
            reports.sbd_watchdog_validation_inactive()
        )
    check_cmd = CheckSbd(lib_env.report_processor)
    for target in online_targets:
        if no_watchdog_validation:
            # Do not send watchdog if validation is turned off. Listing of
            # available watchdogs in pcsd may restart the machine in some
            # corner cases.
            watchdog_to_check = ""
        else:
            watchdog_to_check = full_watchdog_dict[target.label]
        if using_devices:
            devices_to_check = full_device_dict[target.label]
        else:
            devices_to_check = []
        check_cmd.add_request(target, watchdog_to_check, devices_to_check)
    run_and_raise(lib_env.get_node_communicator(), check_cmd)

    # enable ATB if needed
    if not using_devices and sbd.atb_has_to_be_enabled_pre_enable_check(
        corosync_conf
    ):
        lib_env.report_processor.report(
            reports.corosync_quorum_atb_will_be_enabled_due_to_sbd()
        )
        corosync_conf.set_quorum_options({"auto_tie_breaker": "1"})
        lib_env.push_corosync_conf(corosync_conf, ignore_offline_nodes)

    # distribute SBD configuration
    config = sbd.get_default_sbd_config()
    config.update(sbd_options)
    config_cmd = SetSbdConfig(lib_env.report_processor)
    for target in online_targets:
        node_config = sbd.create_sbd_config(
            config,
            target.label,
            full_watchdog_dict[target.label],
            full_device_dict[target.label],
        )
        config_cmd.add_request(target, node_config)
    run_and_raise(lib_env.get_node_communicator(), config_cmd)

    # remove cluster prop 'stonith_watchdog_timeout'
    remove_timeout_cmd = RemoveStonithWatchdogTimeout(
        lib_env.report_processor
    )
    remove_timeout_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), remove_timeout_cmd)

    # enable SBD service on all nodes
    enable_service_cmd = EnableSbdService(lib_env.report_processor)
    enable_service_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), enable_service_cmd)

    lib_env.report_processor.report(
        reports.cluster_restart_required_to_apply_changes()
    )
def enable_sbd(
    lib_env, default_watchdog, watchdog_dict, sbd_options,
    default_device_list=None, node_device_dict=None, allow_unknown_opts=False,
    ignore_offline_nodes=False,
):
    """
    Enable SBD on all nodes in cluster.

    lib_env -- LibraryEnvironment
    default_watchdog -- watchdog for nodes which are not specified in
        watchdog_dict. Uses default value from settings if None.
    watchdog_dict -- dictionary with node names as keys and watchdog path
        as value
    sbd_options -- dictionary in format: <SBD config option>: <value>;
        option names are converted to upper case before use
    default_device_list -- list of devices for all nodes
    node_device_dict -- dictionary with node names as keys and list of devices
        as value
    allow_unknown_opts -- if True, accept also unknown options.
    ignore_offline_nodes -- if True, omit offline nodes
    """
    node_list = _get_cluster_nodes(lib_env)
    target_list = lib_env.get_node_target_factory().get_target_list(node_list)

    # Devices are considered to be in use as soon as the caller supplied
    # either device argument, even if it is empty.
    using_devices = not (
        default_device_list is None and node_device_dict is None
    )
    if default_device_list is None:
        default_device_list = []
    if node_device_dict is None:
        node_device_dict = {}
    if not default_watchdog:
        default_watchdog = settings.sbd_watchdog_default
    # A dict comprehension avoids building a throwaway list of tuples.
    sbd_options = {opt.upper(): val for opt, val in sbd_options.items()}

    full_watchdog_dict = _get_full_target_dict(
        target_list, watchdog_dict, default_watchdog
    )
    full_device_dict = _get_full_target_dict(
        target_list, node_device_dict, default_device_list
    )

    # Validate all inputs and process the resulting reports in one call.
    lib_env.report_processor.process_list(
        _check_node_names_in_cluster(
            node_list,
            list(watchdog_dict.keys()) + list(node_device_dict.keys())
        )
        +
        _validate_watchdog_dict(full_watchdog_dict)
        +
        (_validate_device_dict(full_device_dict) if using_devices else [])
        +
        _validate_sbd_options(sbd_options, allow_unknown_opts)
    )

    com_cmd = GetOnlineTargets(
        lib_env.report_processor, ignore_offline_targets=ignore_offline_nodes,
    )
    com_cmd.set_targets(target_list)
    online_targets = run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # check if SBD can be enabled
    com_cmd = CheckSbd(lib_env.report_processor)
    for target in online_targets:
        com_cmd.add_request(
            target,
            full_watchdog_dict[target.label],
            full_device_dict[target.label] if using_devices else [],
        )
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # enable ATB if needed
    if not lib_env.is_cman_cluster and not using_devices:
        corosync_conf = lib_env.get_corosync_conf()
        if sbd.atb_has_to_be_enabled_pre_enable_check(corosync_conf):
            lib_env.report_processor.process(reports.sbd_requires_atb())
            corosync_conf.set_quorum_options(
                lib_env.report_processor, {"auto_tie_breaker": "1"}
            )
            lib_env.push_corosync_conf(corosync_conf, ignore_offline_nodes)

    # distribute SBD configuration
    config = sbd.get_default_sbd_config()
    config.update(sbd_options)
    com_cmd = SetSbdConfig(lib_env.report_processor)
    for target in online_targets:
        com_cmd.add_request(
            target,
            sbd.create_sbd_config(
                config,
                target.label,
                full_watchdog_dict[target.label],
                full_device_dict[target.label]
            )
        )
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # remove cluster prop 'stonith_watchdog_timeout'
    com_cmd = RemoveStonithWatchdogTimeout(lib_env.report_processor)
    com_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # enable SBD service on all nodes
    com_cmd = EnableSbdService(lib_env.report_processor)
    com_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    lib_env.report_processor.process(
        reports.cluster_restart_required_to_apply_changes()
    )
def add_nodes(
    env, nodes, wait=False, start=False, enable=False, force=False,
    force_unresolvable=False, skip_offline_nodes=False,
    no_watchdog_validation=False,
):
    # pylint: disable=too-many-locals
    """
    Add specified nodes to the local cluster
    Raise LibraryError on any error.

    env LibraryEnvironment
    nodes list -- list of dicts which represents node.
        Supported keys are: name (required), addrs (list), devices (list),
        watchdog
    wait -- specifies if command should try to wait for cluster to start up.
        Has no effect if start is False. If set to False command will not wait
        for cluster to start. If None command will wait for some default
        timeout. If int wait set timeout to int value of seconds.
    start bool -- if True start cluster when it is set up
    enable bool -- if True enable cluster when it is set up
    force bool -- if True some validations errors are treated as warnings
    force_unresolvable bool -- if True not resolvable addresses of nodes are
        treated as warnings
    skip_offline_nodes bool -- if True non fatal connection failures to other
        hosts are treated as warnings
    no_watchdog_validation bool -- if True do not validate specified watchdogs
        on remote hosts
    """
    _ensure_live_env(env)  # raises if env is not live

    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()
    is_sbd_enabled = sbd.is_sbd_enabled(env.cmd_runner())
    corosync_conf = env.get_corosync_conf()
    cluster_nodes_names = corosync_conf.get_nodes_names()
    corosync_node_options = {"name", "addrs"}
    sbd_node_options = {"devices", "watchdog"}

    keys_to_normalize = {"addrs"}
    if is_sbd_enabled:
        keys_to_normalize |= sbd_node_options
    new_nodes = [_normalize_dict(node, keys_to_normalize) for node in nodes]

    # get targets for existing nodes
    target_report_list, cluster_nodes_target_list = (
        target_factory.get_target_list_with_reports(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        )
    )
    report_processor.report_list(target_report_list)

    # get a target for qnetd if needed
    qdevice_model, qdevice_model_options, _, _ = (
        corosync_conf.get_quorum_device_settings()
    )
    if qdevice_model == "net":
        try:
            qnetd_target = target_factory.get_target(
                qdevice_model_options["host"]
            )
        except HostNotFound:
            report_processor.report(
                reports.host_not_found([qdevice_model_options["host"]])
            )

    # Get targets for new nodes and report unknown (== not-authorized) nodes.
    # If a node doesn't contain the 'name' key, validation of inputs reports it.
    # That means we don't report missing names but cannot rely on them being
    # present either.
    target_report_list, new_nodes_target_list = (
        target_factory.get_target_list_with_reports(
            [node["name"] for node in new_nodes if "name" in node],
            allow_skip=False,
        )
    )
    report_processor.report_list(target_report_list)

    # Set default values for not-specified node options.
    # Use an address defined in known-hosts for each node with no addresses
    # specified. This allows users not to specify node addresses at all which
    # simplifies the whole node add command / form significantly.
    new_nodes_target_dict = {
        target.label: target for target in new_nodes_target_list
    }
    addrs_defaulter = _get_addrs_defaulter(
        report_processor, new_nodes_target_dict
    )
    new_nodes_defaulters = {"addrs": addrs_defaulter}
    if is_sbd_enabled:
        watchdog_defaulter = _get_watchdog_defaulter(
            report_processor, new_nodes_target_dict
        )
        new_nodes_defaulters["devices"] = lambda _: []
        new_nodes_defaulters["watchdog"] = watchdog_defaulter
    new_nodes = [
        _set_defaults_in_dict(node, new_nodes_defaulters)
        for node in new_nodes
    ]
    new_nodes_dict = {
        node["name"]: node for node in new_nodes if "name" in node
    }

    # Validate inputs - node options names
    # We do not want to make corosync validators know about SBD options and
    # vice versa. Therefore corosync and SBD validators get only valid corosync
    # and SBD options respectively, and we need to check for any surplus
    # options here.
    report_processor.report_list(
        validate_names_in(
            corosync_node_options | sbd_node_options,
            {option for node in new_nodes for option in node.keys()},
            option_type="node"
        )
    )

    # Validate inputs - corosync part
    try:
        cib = env.get_cib()
        cib_nodes = get_remote_nodes(cib) + get_guest_nodes(cib)
    except LibraryError:
        # The CIB could not be loaded; validate without CIB nodes and report
        # the problem as forceable.
        cib_nodes = []
        report_processor.report(
            reports.get_problem_creator(
                report_codes.FORCE_LOAD_NODES_FROM_CIB, force
            )(
                reports.cib_load_error_get_nodes_for_validation
            )
        )
    # corosync validator rejects non-corosync keys
    new_nodes_corosync = [
        {key: node[key] for key in corosync_node_options if key in node}
        for node in new_nodes
    ]
    report_processor.report_list(
        config_validators.add_nodes(
            new_nodes_corosync,
            corosync_conf.get_nodes(),
            cib_nodes,
            force_unresolvable=force_unresolvable
        )
    )

    # Validate inputs - SBD part
    if is_sbd_enabled:
        report_processor.report_list(
            sbd.validate_new_nodes_devices({
                node["name"]: node["devices"]
                for node in new_nodes if "name" in node
            })
        )
    else:
        for node in new_nodes:
            sbd_options = sbd_node_options.intersection(node.keys())
            if sbd_options and "name" in node:
                report_processor.report(
                    reports.sbd_not_used_cannot_set_sbd_options(
                        sbd_options, node["name"]
                    )
                )

    # Validate inputs - flags part
    wait_timeout = _get_validated_wait_timeout(report_processor, wait, start)

    # Get online cluster nodes
    # This is the only call in which we accept skip_offline_nodes option for the
    # cluster nodes. In all the other actions we communicate only with the
    # online nodes. This allows us to simplify code as any communication issue
    # is considered an error, ends the command processing and is not possible
    # to skip it by skip_offline_nodes. We do not have to care about a situation
    # when a communication command cannot connect to some nodes and then the
    # next command can connect but fails due to the previous one did not
    # succeed.
    online_cluster_target_list = []
    if cluster_nodes_target_list:
        com_cmd = GetOnlineTargets(
            report_processor, ignore_offline_targets=skip_offline_nodes,
        )
        com_cmd.set_targets(cluster_nodes_target_list)
        online_cluster_target_list = run_com(
            env.get_node_communicator(), com_cmd
        )
        offline_cluster_target_list = [
            target for target in cluster_nodes_target_list
            if target not in online_cluster_target_list
        ]
        if not online_cluster_target_list:
            report_processor.report(
                reports.unable_to_perform_operation_on_any_node()
            )
        elif offline_cluster_target_list and skip_offline_nodes:
            # TODO: report (warn) how to fix offline nodes when they come online
            # report_processor.report(None)
            pass

    # Validate existing cluster nodes status
    atb_has_to_be_enabled = sbd.atb_has_to_be_enabled(
        env.cmd_runner(), corosync_conf, len(new_nodes)
    )
    if atb_has_to_be_enabled:
        report_processor.report(
            reports.corosync_quorum_atb_will_be_enabled_due_to_sbd()
        )
        if online_cluster_target_list:
            com_cmd = CheckCorosyncOffline(
                report_processor, allow_skip_offline=False,
            )
            com_cmd.set_targets(online_cluster_target_list)
            run_com(env.get_node_communicator(), com_cmd)

    # Validate new nodes. All new nodes have to be online.
    com_cmd = GetHostInfo(report_processor)
    com_cmd.set_targets(new_nodes_target_list)
    report_processor.report_list(
        _host_check_cluster_setup(
            run_com(env.get_node_communicator(), com_cmd),
            force,
            # version of services may not be the same across the existing
            # cluster nodes, so it's not easy to make this check properly
            check_services_versions=False,
        )
    )

    # Validate SBD on new nodes
    if is_sbd_enabled:
        if no_watchdog_validation:
            report_processor.report(
                reports.sbd_watchdog_validation_inactive()
            )
        com_cmd = CheckSbd(report_processor)
        for new_node_target in new_nodes_target_list:
            new_node = new_nodes_dict[new_node_target.label]
            # Do not send watchdog if validation is turned off. Listing of
            # available watchdogs in pcsd may restart the machine in some
            # corner cases.
            com_cmd.add_request(
                new_node_target,
                watchdog="" if no_watchdog_validation else new_node["watchdog"],
                device_list=new_node["devices"],
            )
        run_com(env.get_node_communicator(), com_cmd)

    if report_processor.has_errors:
        raise LibraryError()

    # Validation done. If errors occurred, an exception has been raised and we
    # don't get below this line.

    # First set up everything else than corosync. Once the new nodes are present
    # in corosync.conf, they're considered part of a cluster and the node add
    # command cannot be run again. So we need to minimize the amount of actions
    # (and therefore possible failures) after adding the nodes to corosync.

    # distribute auth tokens of all cluster nodes (including the new ones) to
    # all new nodes
    com_cmd = UpdateKnownHosts(
        env.report_processor,
        known_hosts_to_add=env.get_known_hosts(
            cluster_nodes_names + list(new_nodes_dict.keys())
        ),
        known_hosts_to_remove=[],
    )
    com_cmd.set_targets(new_nodes_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # qdevice setup
    if qdevice_model == "net":
        qdevice_net.set_up_client_certificates(
            env.cmd_runner(),
            env.report_processor,
            env.communicator_factory,
            qnetd_target,
            corosync_conf.get_cluster_name(),
            new_nodes_target_list,
            # we don't want to allow skipping offline nodes which are being
            # added, otherwise qdevice will not work properly
            skip_offline_nodes=False,
            allow_skip_offline=False
        )

    # sbd setup
    if is_sbd_enabled:
        sbd_cfg = environment_file_to_dict(sbd.get_local_sbd_config())

        com_cmd = SetSbdConfig(env.report_processor)
        for new_node_target in new_nodes_target_list:
            new_node = new_nodes_dict[new_node_target.label]
            com_cmd.add_request(
                new_node_target,
                sbd.create_sbd_config(
                    sbd_cfg,
                    new_node["name"],
                    watchdog=new_node["watchdog"],
                    device_list=new_node["devices"],
                )
            )
        run_and_raise(env.get_node_communicator(), com_cmd)

        com_cmd = EnableSbdService(env.report_processor)
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)
    else:
        com_cmd = DisableSbdService(env.report_processor)
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)

    # booth setup
    booth_sync.send_all_config_to_node(
        env.get_node_communicator(),
        env.report_processor,
        new_nodes_target_list,
        rewrite_existing=force,
        skip_wrong_config=force,
    )

    # distribute corosync and pacemaker authkeys
    # Files are read inside "with" blocks so that their handles are closed
    # even when reading fails.
    files_action = {}
    forceable_io_error_creator = reports.get_problem_creator(
        report_codes.SKIP_FILE_DISTRIBUTION_ERRORS, force
    )
    if os.path.isfile(settings.corosync_authkey_file):
        try:
            with open(settings.corosync_authkey_file, "rb") as authkey_file:
                corosync_authkey = authkey_file.read()
            files_action.update(
                node_communication_format.corosync_authkey_file(
                    corosync_authkey
                )
            )
        except EnvironmentError as e:
            report_processor.report(forceable_io_error_creator(
                reports.file_io_error,
                env_file_role_codes.COROSYNC_AUTHKEY,
                file_path=settings.corosync_authkey_file,
                operation="read",
                reason=format_environment_error(e)
            ))

    if os.path.isfile(settings.pacemaker_authkey_file):
        try:
            with open(settings.pacemaker_authkey_file, "rb") as authkey_file:
                pacemaker_authkey = authkey_file.read()
            files_action.update(
                node_communication_format.pcmk_authkey_file(
                    pacemaker_authkey
                )
            )
        except EnvironmentError as e:
            report_processor.report(forceable_io_error_creator(
                reports.file_io_error,
                env_file_role_codes.PACEMAKER_AUTHKEY,
                file_path=settings.pacemaker_authkey_file,
                operation="read",
                reason=format_environment_error(e)
            ))

    # pcs_settings.conf was previously synced using pcsdcli send_local_configs.
    # This has been changed temporarily until new system for distribution and
    # synchronization of configs will be introduced.
    if os.path.isfile(settings.pcsd_settings_conf_location):
        try:
            with open(settings.pcsd_settings_conf_location, "r") as conf_file:
                pcs_settings_conf = conf_file.read()
            files_action.update(
                node_communication_format.pcs_settings_conf_file(
                    pcs_settings_conf
                )
            )
        except EnvironmentError as e:
            report_processor.report(forceable_io_error_creator(
                reports.file_io_error,
                env_file_role_codes.PCS_SETTINGS_CONF,
                file_path=settings.pcsd_settings_conf_location,
                operation="read",
                reason=format_environment_error(e)
            ))

    # stop here if one of the files could not be loaded and it was not forced
    if report_processor.has_errors:
        raise LibraryError()

    if files_action:
        com_cmd = DistributeFilesWithoutForces(
            env.report_processor, files_action
        )
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)

    # Distribute and reload pcsd SSL certificate
    report_processor.report(
        reports.pcsd_ssl_cert_and_key_distribution_started(
            [target.label for target in new_nodes_target_list]
        )
    )

    try:
        with open(settings.pcsd_cert_location, "r") as f:
            ssl_cert = f.read()
    except EnvironmentError as e:
        report_processor.report(
            reports.file_io_error(
                env_file_role_codes.PCSD_SSL_CERT,
                file_path=settings.pcsd_cert_location,
                reason=format_environment_error(e),
                operation="read",
            )
        )
    try:
        with open(settings.pcsd_key_location, "r") as f:
            ssl_key = f.read()
    except EnvironmentError as e:
        report_processor.report(
            reports.file_io_error(
                env_file_role_codes.PCSD_SSL_KEY,
                file_path=settings.pcsd_key_location,
                reason=format_environment_error(e),
                operation="read",
            )
        )
    # Both reads are attempted before bailing out so that a problem with
    # either file gets reported.
    if report_processor.has_errors:
        raise LibraryError()

    com_cmd = SendPcsdSslCertAndKey(env.report_processor, ssl_cert, ssl_key)
    com_cmd.set_targets(new_nodes_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # When corosync >= 2 is in use, the procedure for adding a node is:
    # 1. add the new node to corosync.conf on all existing nodes
    # 2. reload corosync.conf before the new node is started
    # 3. start the new node
    # If done otherwise, membership gets broken and qdevice hangs. Cluster
    # will recover after a minute or so but still it's a wrong way.

    corosync_conf.add_nodes(new_nodes_corosync)
    if atb_has_to_be_enabled:
        corosync_conf.set_quorum_options({"auto_tie_breaker": "1"})

    com_cmd = DistributeCorosyncConf(
        env.report_processor,
        corosync_conf.config.export(),
        allow_skip_offline=False,
    )
    com_cmd.set_targets(online_cluster_target_list + new_nodes_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    com_cmd = ReloadCorosyncConf(env.report_processor)
    com_cmd.set_targets(online_cluster_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # Optionally enable and start cluster services.
    if enable:
        com_cmd = EnableCluster(env.report_processor)
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)
    if start:
        _start_cluster(
            env.communicator_factory,
            env.report_processor,
            new_nodes_target_list,
            wait_timeout=wait_timeout,
        )